diff --git "a/sft/Full_competesmoev30/checkpoint-16632/trainer_state.json" "b/sft/Full_competesmoev30/checkpoint-16632/trainer_state.json" new file mode 100644--- /dev/null +++ "b/sft/Full_competesmoev30/checkpoint-16632/trainer_state.json" @@ -0,0 +1,249513 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999969938373666, + "eval_steps": 500, + "global_step": 16632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.05028445, + "auxiliary_loss_mlp": 0.02215396, + "balance_loss_clip": 2.43573999, + "balance_loss_mlp": 1.76983953, + "epoch": 6.012325266796934e-05, + "flos": 24456507091200.0, + "grad_norm": 55.00561300220404, + "language_loss": 2.85272503, + "learning_rate": 0.0, + "loss": 1.94613922, + "num_input_tokens_seen": 19155, + "step": 1, + "time_per_iteration": 18.059409618377686 + }, + { + "auxiliary_loss_clip": 0.03380539, + "auxiliary_loss_mlp": 0.01459449, + "balance_loss_clip": 1.62786555, + "balance_loss_mlp": 1.18936849, + "epoch": 0.00012024650533593868, + "flos": 20225931246720.0, + "grad_norm": 34.93149751452764, + "language_loss": 1.82606053, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.87446034, + "num_input_tokens_seen": 36175, + "step": 2, + "time_per_iteration": 2.6318798065185547 + }, + { + "auxiliary_loss_clip": 0.03320229, + "auxiliary_loss_mlp": 0.01440978, + "balance_loss_clip": 1.62577581, + "balance_loss_mlp": 1.18882656, + "epoch": 0.000180369758003908, + "flos": 22309935454080.0, + "grad_norm": 32.71870482280511, + "language_loss": 1.57573509, + "learning_rate": 7.073439208833112e-07, + "loss": 1.62334716, + "num_input_tokens_seen": 54870, + "step": 3, + "time_per_iteration": 2.6362481117248535 + }, + { + "auxiliary_loss_clip": 0.03361497, + "auxiliary_loss_mlp": 0.01451404, + "balance_loss_clip": 1.62418985, + "balance_loss_mlp": 1.15500188, + "epoch": 0.00024049301067187735, + "flos": 22414650577920.0, + "grad_norm": 51.2387172839747, + "language_loss": 1.67362881, + "learning_rate": 8.925686513863519e-07, + "loss": 1.72175777, + "num_input_tokens_seen": 74575, + "step": 4, + "time_per_iteration": 2.7070822715759277 + }, + { + "auxiliary_loss_clip": 0.03402497, + "auxiliary_loss_mlp": 0.01505358, + "balance_loss_clip": 1.62493396, + "balance_loss_mlp": 1.21715808, + "epoch": 0.0003006162633398467, + "flos": 21396978449280.0, + "grad_norm": 56.088721215944275, + "language_loss": 1.91627169, + "learning_rate": 1.0362401141348472e-06, + "loss": 1.96535027, + "num_input_tokens_seen": 92580, + "step": 5, + "time_per_iteration": 2.91436767578125 + }, + { + "auxiliary_loss_clip": 0.03370454, + "auxiliary_loss_mlp": 0.01515599, + "balance_loss_clip": 1.61556244, + "balance_loss_mlp": 1.22110426, + "epoch": 0.000360739516007816, + "flos": 21652375127040.0, + "grad_norm": 33.397169652317885, + "language_loss": 1.60591149, + "learning_rate": 1.153628246576487e-06, + "loss": 1.65477204, + "num_input_tokens_seen": 109705, + "step": 6, + "time_per_iteration": 2.994969367980957 + }, + { + "auxiliary_loss_clip": 0.03354239, + "auxiliary_loss_mlp": 0.01486417, + "balance_loss_clip": 1.61577415, + "balance_loss_mlp": 1.20336628, + "epoch": 0.0004208627686757854, + "flos": 27159742897920.0, + "grad_norm": 24.6270766983672, + "language_loss": 1.53276002, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.58116663, + "num_input_tokens_seen": 129425, + "step": 7, + "time_per_iteration": 3.0675876140594482 + }, + { + "auxiliary_loss_clip": 0.03321216, + "auxiliary_loss_mlp": 0.0144328, + "balance_loss_clip": 1.61205018, + "balance_loss_mlp": 1.16499734, + "epoch": 0.0004809860213437547, + "flos": 31319096135040.0, + "grad_norm": 31.71613063643349, + "language_loss": 1.43881059, + "learning_rate": 1.338852977079528e-06, + "loss": 1.48645568, + "num_input_tokens_seen": 149210, + "step": 8, + "time_per_iteration": 3.172358751296997 + }, + { + "auxiliary_loss_clip": 0.03368839, + "auxiliary_loss_mlp": 0.01496105, + "balance_loss_clip": 1.6120348, + "balance_loss_mlp": 1.21229148, + "epoch": 0.000541109274011724, + "flos": 32160411463680.0, + "grad_norm": 28.204490849684397, + "language_loss": 1.4969244, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.54557395, + "num_input_tokens_seen": 169055, + "step": 9, + "time_per_iteration": 3.112215280532837 + }, + { + "auxiliary_loss_clip": 0.03308365, + "auxiliary_loss_mlp": 0.01475035, + "balance_loss_clip": 1.61541438, + "balance_loss_mlp": 1.20647991, + "epoch": 0.0006012325266796934, + "flos": 18916808163840.0, + "grad_norm": 23.420774723604698, + "language_loss": 1.44714785, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.49498188, + "num_input_tokens_seen": 188045, + "step": 10, + "time_per_iteration": 2.9495606422424316 + }, + { + "auxiliary_loss_clip": 0.03364194, + "auxiliary_loss_mlp": 0.01494262, + "balance_loss_clip": 1.62042511, + "balance_loss_mlp": 1.22036684, + "epoch": 0.0006613557793476627, + "flos": 20774861867520.0, + "grad_norm": 18.353281468858004, + "language_loss": 1.4520936, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.50067806, + "num_input_tokens_seen": 207035, + "step": 11, + "time_per_iteration": 3.0797431468963623 + }, + { + "auxiliary_loss_clip": 0.03292683, + "auxiliary_loss_mlp": 0.0145154, + "balance_loss_clip": 1.60771322, + "balance_loss_mlp": 1.17554641, + "epoch": 0.000721479032015632, + "flos": 16581680997120.0, + "grad_norm": 16.61869254675767, + "language_loss": 1.45121813, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.49866033, + "num_input_tokens_seen": 223225, + "step": 12, + "time_per_iteration": 2.9887659549713135 + }, + { + "auxiliary_loss_clip": 0.03321669, + "auxiliary_loss_mlp": 0.01405912, + "balance_loss_clip": 1.61740756, + "balance_loss_mlp": 1.14765704, + "epoch": 0.0007816022846836014, + "flos": 23805471144960.0, + "grad_norm": 14.02187318243825, + "language_loss": 1.23759985, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.28487587, + "num_input_tokens_seen": 242570, + "step": 13, + "time_per_iteration": 3.032742977142334 + }, + { + "auxiliary_loss_clip": 0.03287474, + "auxiliary_loss_mlp": 0.01470749, + "balance_loss_clip": 1.61299658, + "balance_loss_mlp": 1.20257616, + "epoch": 0.0008417255373515708, + "flos": 19172204841600.0, + "grad_norm": 5.790568956401358, + "language_loss": 1.20684385, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.254426, + "num_input_tokens_seen": 261215, + "step": 14, + "time_per_iteration": 3.002887487411499 + }, + { + "auxiliary_loss_clip": 0.03272826, + "auxiliary_loss_mlp": 0.01431255, + "balance_loss_clip": 1.6181426, + "balance_loss_mlp": 1.16804111, + "epoch": 0.00090184879001954, + "flos": 26395564026240.0, + "grad_norm": 6.353887091300461, + "language_loss": 1.12925518, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.176296, + "num_input_tokens_seen": 280035, + "step": 15, + "time_per_iteration": 3.0238780975341797 + }, + { + "auxiliary_loss_clip": 0.03238489, + "auxiliary_loss_mlp": 0.01411651, + "balance_loss_clip": 1.60288334, + "balance_loss_mlp": 1.16197944, + "epoch": 0.0009619720426875094, + "flos": 24679500785280.0, + "grad_norm": 4.670310144758637, + "language_loss": 1.11125767, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.15775907, + "num_input_tokens_seen": 300265, + "step": 16, + "time_per_iteration": 4.605847120285034 + }, + { + "auxiliary_loss_clip": 0.03223993, + "auxiliary_loss_mlp": 0.01417304, + "balance_loss_clip": 1.60910368, + "balance_loss_mlp": 1.17774093, + "epoch": 0.0010220952953554788, + "flos": 18624531196800.0, + "grad_norm": 8.838429022323517, + "language_loss": 1.12645221, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.17286515, + "num_input_tokens_seen": 317375, + "step": 17, + "time_per_iteration": 4.579033851623535 + }, + { + "auxiliary_loss_clip": 0.03161492, + "auxiliary_loss_mlp": 0.01379312, + "balance_loss_clip": 1.60685277, + "balance_loss_mlp": 1.1475693, + "epoch": 0.001082218548023448, + "flos": 26142537646080.0, + "grad_norm": 3.823557061532633, + "language_loss": 1.08069181, + "learning_rate": 1.860972167459798e-06, + "loss": 1.12609982, + "num_input_tokens_seen": 337975, + "step": 18, + "time_per_iteration": 3.0132579803466797 + }, + { + "auxiliary_loss_clip": 0.0318761, + "auxiliary_loss_mlp": 0.01403306, + "balance_loss_clip": 1.60585093, + "balance_loss_mlp": 1.13799417, + "epoch": 0.0011423418006914173, + "flos": 19609776322560.0, + "grad_norm": 4.403621106373983, + "language_loss": 1.02445412, + "learning_rate": 1.89578346593066e-06, + "loss": 1.07036328, + "num_input_tokens_seen": 356635, + "step": 19, + "time_per_iteration": 3.016176462173462 + }, + { + "auxiliary_loss_clip": 0.0313029, + "auxiliary_loss_mlp": 0.01342049, + "balance_loss_clip": 1.60759044, + "balance_loss_mlp": 1.12155962, + "epoch": 0.0012024650533593868, + "flos": 17895365107200.0, + "grad_norm": 3.958333686933058, + "language_loss": 1.16706228, + "learning_rate": 1.928808765521199e-06, + "loss": 1.21178555, + "num_input_tokens_seen": 375625, + "step": 20, + "time_per_iteration": 3.0274486541748047 + }, + { + "auxiliary_loss_clip": 0.03118109, + "auxiliary_loss_mlp": 0.01378536, + "balance_loss_clip": 1.58886433, + "balance_loss_mlp": 1.1298182, + "epoch": 0.001262588306027356, + "flos": 21252043071360.0, + "grad_norm": 4.333519066420982, + "language_loss": 1.06129968, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.10626626, + "num_input_tokens_seen": 394350, + "step": 21, + "time_per_iteration": 2.9418578147888184 + }, + { + "auxiliary_loss_clip": 0.03013912, + "auxiliary_loss_mlp": 0.0137937, + "balance_loss_clip": 1.57028937, + "balance_loss_mlp": 1.14552903, + "epoch": 0.0013227115586953253, + "flos": 26104077158400.0, + "grad_norm": 3.63841390311849, + "language_loss": 1.05861485, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.10254765, + "num_input_tokens_seen": 413255, + "step": 22, + "time_per_iteration": 2.9651288986206055 + }, + { + "auxiliary_loss_clip": 0.02966296, + "auxiliary_loss_mlp": 0.01334065, + "balance_loss_clip": 1.57175612, + "balance_loss_mlp": 1.12377954, + "epoch": 0.0013828348113632948, + "flos": 23951376190080.0, + "grad_norm": 2.8746130742538347, + "language_loss": 0.9177655, + "learning_rate": 2.018794797290208e-06, + "loss": 0.96076906, + "num_input_tokens_seen": 433065, + "step": 23, + "time_per_iteration": 3.049853563308716 + }, + { + "auxiliary_loss_clip": 0.02932793, + "auxiliary_loss_mlp": 0.01362183, + "balance_loss_clip": 1.56404662, + "balance_loss_mlp": 1.14236116, + "epoch": 0.001442958064031264, + "flos": 15959851724160.0, + "grad_norm": 3.0897201135857735, + "language_loss": 1.08192635, + "learning_rate": 2.046196897962839e-06, + "loss": 1.12487614, + "num_input_tokens_seen": 451175, + "step": 24, + "time_per_iteration": 3.0543172359466553 + }, + { + "auxiliary_loss_clip": 0.02823838, + "auxiliary_loss_mlp": 0.01329007, + "balance_loss_clip": 1.55692792, + "balance_loss_mlp": 1.11853111, + "epoch": 0.0015030813166992333, + "flos": 18108350801280.0, + "grad_norm": 4.111246686692462, + "language_loss": 1.01367807, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.05520654, + "num_input_tokens_seen": 468775, + "step": 25, + "time_per_iteration": 3.0059614181518555 + }, + { + "auxiliary_loss_clip": 0.02818207, + "auxiliary_loss_mlp": 0.01309454, + "balance_loss_clip": 1.55974329, + "balance_loss_mlp": 1.10012197, + "epoch": 0.0015632045693672028, + "flos": 22234558763520.0, + "grad_norm": 2.7163042439620018, + "language_loss": 1.0669204, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.10819697, + "num_input_tokens_seen": 488530, + "step": 26, + "time_per_iteration": 3.1159534454345703 + }, + { + "auxiliary_loss_clip": 0.0276047, + "auxiliary_loss_mlp": 0.01325034, + "balance_loss_clip": 1.54973662, + "balance_loss_mlp": 1.12533486, + "epoch": 0.001623327822035172, + "flos": 23991955580160.0, + "grad_norm": 2.562596284241794, + "language_loss": 0.95537072, + "learning_rate": 2.122031762649933e-06, + "loss": 0.99622583, + "num_input_tokens_seen": 510495, + "step": 27, + "time_per_iteration": 3.018643617630005 + }, + { + "auxiliary_loss_clip": 0.02736222, + "auxiliary_loss_mlp": 0.01311707, + "balance_loss_clip": 1.55399776, + "balance_loss_mlp": 1.13089037, + "epoch": 0.0016834510747031415, + "flos": 19677647070720.0, + "grad_norm": 2.42975125432869, + "language_loss": 1.06393945, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.10441875, + "num_input_tokens_seen": 528605, + "step": 28, + "time_per_iteration": 2.9263083934783936 + }, + { + "auxiliary_loss_clip": 0.0270011, + "auxiliary_loss_mlp": 0.0131913, + "balance_loss_clip": 1.53841436, + "balance_loss_mlp": 1.13297284, + "epoch": 0.0017435743273711108, + "flos": 20923819568640.0, + "grad_norm": 4.42090805909513, + "language_loss": 1.02493238, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.06512475, + "num_input_tokens_seen": 548515, + "step": 29, + "time_per_iteration": 3.0062997341156006 + }, + { + "auxiliary_loss_clip": 0.0269246, + "auxiliary_loss_mlp": 0.01312758, + "balance_loss_clip": 1.53459728, + "balance_loss_mlp": 1.12631428, + "epoch": 0.00180369758003908, + "flos": 19528976678400.0, + "grad_norm": 3.1534114534186446, + "language_loss": 1.19265521, + "learning_rate": 2.189868360711334e-06, + "loss": 1.23270726, + "num_input_tokens_seen": 564025, + "step": 30, + "time_per_iteration": 2.931145429611206 + }, + { + "auxiliary_loss_clip": 0.02610377, + "auxiliary_loss_mlp": 0.01337183, + "balance_loss_clip": 1.52116311, + "balance_loss_mlp": 1.15665221, + "epoch": 0.0018638208327070496, + "flos": 27453169100160.0, + "grad_norm": 2.735994596991484, + "language_loss": 1.02616811, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.06564379, + "num_input_tokens_seen": 583345, + "step": 31, + "time_per_iteration": 2.993251085281372 + }, + { + "auxiliary_loss_clip": 0.02582044, + "auxiliary_loss_mlp": 0.01331305, + "balance_loss_clip": 1.522609, + "balance_loss_mlp": 1.15163302, + "epoch": 0.0019239440853750188, + "flos": 13589460380160.0, + "grad_norm": 3.9056907796043654, + "language_loss": 0.95266509, + "learning_rate": 2.2314216284658796e-06, + "loss": 0.99179864, + "num_input_tokens_seen": 600010, + "step": 32, + "time_per_iteration": 2.9459571838378906 + }, + { + "auxiliary_loss_clip": 0.02564836, + "auxiliary_loss_mlp": 0.01302659, + "balance_loss_clip": 1.51811624, + "balance_loss_mlp": 1.13586164, + "epoch": 0.001984067338042988, + "flos": 11253866336640.0, + "grad_norm": 3.226486022987097, + "language_loss": 0.95143497, + "learning_rate": 2.2512340280885094e-06, + "loss": 0.99010992, + "num_input_tokens_seen": 616295, + "step": 33, + "time_per_iteration": 2.9855570793151855 + }, + { + "auxiliary_loss_clip": 0.02421202, + "auxiliary_loss_mlp": 0.01304214, + "balance_loss_clip": 1.48474145, + "balance_loss_mlp": 1.14676213, + "epoch": 0.0020441905907109576, + "flos": 22386245898240.0, + "grad_norm": 2.1714659525821247, + "language_loss": 0.91547924, + "learning_rate": 2.270454923596497e-06, + "loss": 0.9527334, + "num_input_tokens_seen": 637640, + "step": 34, + "time_per_iteration": 2.981541872024536 + }, + { + "auxiliary_loss_clip": 0.02375249, + "auxiliary_loss_mlp": 0.01271963, + "balance_loss_clip": 1.45095515, + "balance_loss_mlp": 1.11689591, + "epoch": 0.0021043138433789266, + "flos": 49778580337920.0, + "grad_norm": 2.2635429103650386, + "language_loss": 0.76603377, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.80250585, + "num_input_tokens_seen": 659710, + "step": 35, + "time_per_iteration": 3.2267208099365234 + }, + { + "auxiliary_loss_clip": 0.02347187, + "auxiliary_loss_mlp": 0.01276388, + "balance_loss_clip": 1.46356034, + "balance_loss_mlp": 1.13238275, + "epoch": 0.002164437096046896, + "flos": 20557961591040.0, + "grad_norm": 2.3605884715298506, + "language_loss": 0.88713098, + "learning_rate": 2.307256493152974e-06, + "loss": 0.92336679, + "num_input_tokens_seen": 679670, + "step": 36, + "time_per_iteration": 2.948162078857422 + }, + { + "auxiliary_loss_clip": 0.02289192, + "auxiliary_loss_mlp": 0.01338204, + "balance_loss_clip": 1.45043015, + "balance_loss_mlp": 1.19105196, + "epoch": 0.0022245603487148656, + "flos": 26542295084160.0, + "grad_norm": 2.4929063351918166, + "language_loss": 0.93038809, + "learning_rate": 2.3248973825097614e-06, + "loss": 0.96666199, + "num_input_tokens_seen": 700170, + "step": 37, + "time_per_iteration": 2.9556422233581543 + }, + { + "auxiliary_loss_clip": 0.02249098, + "auxiliary_loss_mlp": 0.01276785, + "balance_loss_clip": 1.44485605, + "balance_loss_mlp": 1.15500069, + "epoch": 0.0022846836013828346, + "flos": 20338188226560.0, + "grad_norm": 2.177909778954084, + "language_loss": 1.03952074, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.07477951, + "num_input_tokens_seen": 718545, + "step": 38, + "time_per_iteration": 2.9959065914154053 + }, + { + "auxiliary_loss_clip": 0.02216028, + "auxiliary_loss_mlp": 0.01260768, + "balance_loss_clip": 1.43807542, + "balance_loss_mlp": 1.13726676, + "epoch": 0.002344806854050804, + "flos": 26247575992320.0, + "grad_norm": 2.22652515093943, + "language_loss": 0.85297108, + "learning_rate": 2.358792165262154e-06, + "loss": 0.887739, + "num_input_tokens_seen": 739865, + "step": 39, + "time_per_iteration": 3.035399913787842 + }, + { + "auxiliary_loss_clip": 0.02192275, + "auxiliary_loss_mlp": 0.01250434, + "balance_loss_clip": 1.4289664, + "balance_loss_mlp": 1.12216496, + "epoch": 0.0024049301067187736, + "flos": 11801539981440.0, + "grad_norm": 3.258228308703562, + "language_loss": 0.90279335, + "learning_rate": 2.3750930912143747e-06, + "loss": 0.93722045, + "num_input_tokens_seen": 755770, + "step": 40, + "time_per_iteration": 3.060368299484253 + }, + { + "auxiliary_loss_clip": 0.02142113, + "auxiliary_loss_mlp": 0.01273783, + "balance_loss_clip": 1.41895449, + "balance_loss_mlp": 1.16086745, + "epoch": 0.0024650533593867426, + "flos": 20631506688000.0, + "grad_norm": 3.245861029799582, + "language_loss": 0.93271625, + "learning_rate": 2.3909914837471044e-06, + "loss": 0.9668752, + "num_input_tokens_seen": 773440, + "step": 41, + "time_per_iteration": 2.9518353939056396 + }, + { + "auxiliary_loss_clip": 0.02105753, + "auxiliary_loss_mlp": 0.01254821, + "balance_loss_clip": 1.41097844, + "balance_loss_mlp": 1.15168142, + "epoch": 0.002525176612054712, + "flos": 18406122549120.0, + "grad_norm": 3.3039479788253536, + "language_loss": 0.97533798, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.0089438, + "num_input_tokens_seen": 790455, + "step": 42, + "time_per_iteration": 2.933177947998047 + }, + { + "auxiliary_loss_clip": 0.020675, + "auxiliary_loss_mlp": 0.01298422, + "balance_loss_clip": 1.41198874, + "balance_loss_mlp": 1.19189644, + "epoch": 0.0025852998647226816, + "flos": 28184023128960.0, + "grad_norm": 3.15071165872949, + "language_loss": 0.97562659, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.00928593, + "num_input_tokens_seen": 810645, + "step": 43, + "time_per_iteration": 2.9760589599609375 + }, + { + "auxiliary_loss_clip": 0.02086351, + "auxiliary_loss_mlp": 0.01314601, + "balance_loss_clip": 1.41042757, + "balance_loss_mlp": 1.20283043, + "epoch": 0.0026454231173906506, + "flos": 14283110897280.0, + "grad_norm": 2.3612650137146574, + "language_loss": 0.93435001, + "learning_rate": 2.4364587585915504e-06, + "loss": 0.96835947, + "num_input_tokens_seen": 827470, + "step": 44, + "time_per_iteration": 2.9239895343780518 + }, + { + "auxiliary_loss_clip": 0.02043996, + "auxiliary_loss_mlp": 0.01272131, + "balance_loss_clip": 1.40557313, + "balance_loss_mlp": 1.17399764, + "epoch": 0.00270554637005862, + "flos": 22419211605120.0, + "grad_norm": 2.1476860292916644, + "language_loss": 0.98677421, + "learning_rate": 2.450927955901469e-06, + "loss": 1.01993537, + "num_input_tokens_seen": 847285, + "step": 45, + "time_per_iteration": 2.9626305103302 + }, + { + "auxiliary_loss_clip": 0.02018804, + "auxiliary_loss_mlp": 0.01228873, + "balance_loss_clip": 1.39126372, + "balance_loss_mlp": 1.14208817, + "epoch": 0.0027656696227265896, + "flos": 23985778440960.0, + "grad_norm": 1.8862192248435494, + "language_loss": 1.02800822, + "learning_rate": 2.465079122983384e-06, + "loss": 1.06048501, + "num_input_tokens_seen": 867545, + "step": 46, + "time_per_iteration": 2.9913573265075684 + }, + { + "auxiliary_loss_clip": 0.0198766, + "auxiliary_loss_mlp": 0.01272862, + "balance_loss_clip": 1.38388658, + "balance_loss_mlp": 1.182549, + "epoch": 0.0028257928753945586, + "flos": 37669503087360.0, + "grad_norm": 2.1076645953887696, + "language_loss": 0.87839413, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.9109993, + "num_input_tokens_seen": 889915, + "step": 47, + "time_per_iteration": 3.0189881324768066 + }, + { + "auxiliary_loss_clip": 0.01949271, + "auxiliary_loss_mlp": 0.01255947, + "balance_loss_clip": 1.37360096, + "balance_loss_mlp": 1.16963911, + "epoch": 0.002885916128062528, + "flos": 22454547609600.0, + "grad_norm": 4.4561049138068, + "language_loss": 0.87809587, + "learning_rate": 2.492481223656015e-06, + "loss": 0.91014802, + "num_input_tokens_seen": 908975, + "step": 48, + "time_per_iteration": 2.863565444946289 + }, + { + "auxiliary_loss_clip": 0.01949016, + "auxiliary_loss_mlp": 0.0124182, + "balance_loss_clip": 1.36337733, + "balance_loss_mlp": 1.15069616, + "epoch": 0.0029460393807304976, + "flos": 27012796358400.0, + "grad_norm": 2.9451035624229855, + "language_loss": 0.89691317, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.9288215, + "num_input_tokens_seen": 929810, + "step": 49, + "time_per_iteration": 2.9967453479766846 + }, + { + "auxiliary_loss_clip": 0.0194038, + "auxiliary_loss_mlp": 0.01234077, + "balance_loss_clip": 1.35742152, + "balance_loss_mlp": 1.14996314, + "epoch": 0.0030061626333984666, + "flos": 15851832549120.0, + "grad_norm": 3.162716210197168, + "language_loss": 0.90914285, + "learning_rate": 2.51876455396287e-06, + "loss": 0.94088745, + "num_input_tokens_seen": 948650, + "step": 50, + "time_per_iteration": 2.8832523822784424 + }, + { + "auxiliary_loss_clip": 0.01938537, + "auxiliary_loss_mlp": 0.01199505, + "balance_loss_clip": 1.36240602, + "balance_loss_mlp": 1.11844242, + "epoch": 0.003066285886066436, + "flos": 31827052316160.0, + "grad_norm": 6.098010360158733, + "language_loss": 0.86977792, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.90115827, + "num_input_tokens_seen": 966455, + "step": 51, + "time_per_iteration": 2.9061717987060547 + }, + { + "auxiliary_loss_clip": 0.01895637, + "auxiliary_loss_mlp": 0.01206588, + "balance_loss_clip": 1.35252357, + "balance_loss_mlp": 1.12829173, + "epoch": 0.0031264091387344056, + "flos": 41427482774400.0, + "grad_norm": 2.043292881862276, + "language_loss": 0.95171362, + "learning_rate": 2.5440168957651953e-06, + "loss": 0.98273587, + "num_input_tokens_seen": 988110, + "step": 52, + "time_per_iteration": 3.0266616344451904 + }, + { + "auxiliary_loss_clip": 0.01893195, + "auxiliary_loss_mlp": 0.01241159, + "balance_loss_clip": 1.34894896, + "balance_loss_mlp": 1.16162264, + "epoch": 0.0031865323914023747, + "flos": 23440941970560.0, + "grad_norm": 4.2358840345824635, + "language_loss": 0.92323011, + "learning_rate": 2.5562811176888872e-06, + "loss": 0.95457363, + "num_input_tokens_seen": 1008550, + "step": 53, + "time_per_iteration": 2.8850226402282715 + }, + { + "auxiliary_loss_clip": 0.01882736, + "auxiliary_loss_mlp": 0.01197045, + "balance_loss_clip": 1.35264134, + "balance_loss_mlp": 1.11669779, + "epoch": 0.003246655644070344, + "flos": 14429195510400.0, + "grad_norm": 2.290226623360683, + "language_loss": 0.8260113, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.85680908, + "num_input_tokens_seen": 1026840, + "step": 54, + "time_per_iteration": 2.9433553218841553 + }, + { + "auxiliary_loss_clip": 0.01880073, + "auxiliary_loss_mlp": 0.01210775, + "balance_loss_clip": 1.34162152, + "balance_loss_mlp": 1.13233542, + "epoch": 0.0033067788967383136, + "flos": 35918247496320.0, + "grad_norm": 2.911577423572303, + "language_loss": 0.81303245, + "learning_rate": 2.580130221340046e-06, + "loss": 0.84394085, + "num_input_tokens_seen": 1048875, + "step": 55, + "time_per_iteration": 3.0040643215179443 + }, + { + "auxiliary_loss_clip": 0.01870075, + "auxiliary_loss_mlp": 0.0120375, + "balance_loss_clip": 1.33644819, + "balance_loss_mlp": 1.12521541, + "epoch": 0.003366902149406283, + "flos": 22958732862720.0, + "grad_norm": 2.639118679342801, + "language_loss": 0.87089968, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.90163803, + "num_input_tokens_seen": 1066435, + "step": 56, + "time_per_iteration": 2.830453395843506 + }, + { + "auxiliary_loss_clip": 0.01869912, + "auxiliary_loss_mlp": 0.01161425, + "balance_loss_clip": 1.32921791, + "balance_loss_mlp": 1.08851671, + "epoch": 0.003427025402074252, + "flos": 26582838560640.0, + "grad_norm": 2.101574700040827, + "language_loss": 0.92890096, + "learning_rate": 2.6031273868139713e-06, + "loss": 0.95921433, + "num_input_tokens_seen": 1090330, + "step": 57, + "time_per_iteration": 7.0071024894714355 + }, + { + "auxiliary_loss_clip": 0.01833802, + "auxiliary_loss_mlp": 0.0121675, + "balance_loss_clip": 1.33333457, + "balance_loss_mlp": 1.14493799, + "epoch": 0.0034871486547422216, + "flos": 23951196622080.0, + "grad_norm": 14.610065921505914, + "language_loss": 0.9972856, + "learning_rate": 2.614325098333948e-06, + "loss": 1.02779114, + "num_input_tokens_seen": 1109840, + "step": 58, + "time_per_iteration": 2.830960273742676 + }, + { + "auxiliary_loss_clip": 0.0181804, + "auxiliary_loss_mlp": 0.01199311, + "balance_loss_clip": 1.32073379, + "balance_loss_mlp": 1.12835753, + "epoch": 0.003547271907410191, + "flos": 21214983214080.0, + "grad_norm": 2.120622270947527, + "language_loss": 0.88172519, + "learning_rate": 2.625331386578098e-06, + "loss": 0.91189873, + "num_input_tokens_seen": 1128415, + "step": 59, + "time_per_iteration": 2.8507089614868164 + }, + { + "auxiliary_loss_clip": 0.01839573, + "auxiliary_loss_mlp": 0.01163328, + "balance_loss_clip": 1.32924581, + "balance_loss_mlp": 1.09075332, + "epoch": 0.00360739516007816, + "flos": 16504903676160.0, + "grad_norm": 2.021991994360373, + "language_loss": 0.93542433, + "learning_rate": 2.63615268640451e-06, + "loss": 0.96545339, + "num_input_tokens_seen": 1146515, + "step": 60, + "time_per_iteration": 2.8517534732818604 + }, + { + "auxiliary_loss_clip": 0.0181893, + "auxiliary_loss_mlp": 0.01176948, + "balance_loss_clip": 1.31414318, + "balance_loss_mlp": 1.10923755, + "epoch": 0.0036675184127461296, + "flos": 19464805031040.0, + "grad_norm": 2.908283338489548, + "language_loss": 0.90021706, + "learning_rate": 2.6467951135575943e-06, + "loss": 0.9301759, + "num_input_tokens_seen": 1166330, + "step": 61, + "time_per_iteration": 2.8853390216827393 + }, + { + "auxiliary_loss_clip": 0.01803943, + "auxiliary_loss_mlp": 0.01142904, + "balance_loss_clip": 1.31131864, + "balance_loss_mlp": 1.07581341, + "epoch": 0.003727641665414099, + "flos": 20957323979520.0, + "grad_norm": 1.8428161811646855, + "language_loss": 0.88479733, + "learning_rate": 2.657264485425803e-06, + "loss": 0.91426575, + "num_input_tokens_seen": 1186010, + "step": 62, + "time_per_iteration": 2.8860812187194824 + }, + { + "auxiliary_loss_clip": 0.01785338, + "auxiliary_loss_mlp": 0.0116457, + "balance_loss_clip": 1.30233741, + "balance_loss_mlp": 1.09504724, + "epoch": 0.003787764918082068, + "flos": 18406050721920.0, + "grad_norm": 2.4385306002926512, + "language_loss": 0.96280968, + "learning_rate": 2.6675663401385186e-06, + "loss": 0.99230874, + "num_input_tokens_seen": 1204985, + "step": 63, + "time_per_iteration": 2.9081404209136963 + }, + { + "auxiliary_loss_clip": 0.01795068, + "auxiliary_loss_mlp": 0.01171321, + "balance_loss_clip": 1.31071985, + "balance_loss_mlp": 1.10499322, + "epoch": 0.0038478881707500376, + "flos": 12459243962880.0, + "grad_norm": 3.0781639748926697, + "language_loss": 0.98840165, + "learning_rate": 2.677705954159056e-06, + "loss": 1.01806557, + "num_input_tokens_seen": 1223545, + "step": 64, + "time_per_iteration": 2.893603801727295 + }, + { + "auxiliary_loss_clip": 0.01801311, + "auxiliary_loss_mlp": 0.01151112, + "balance_loss_clip": 1.30960393, + "balance_loss_mlp": 1.08368695, + "epoch": 0.003908011423418007, + "flos": 13553334276480.0, + "grad_norm": 2.4813676281781554, + "language_loss": 0.85397774, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.88350195, + "num_input_tokens_seen": 1241175, + "step": 65, + "time_per_iteration": 2.8768796920776367 + }, + { + "auxiliary_loss_clip": 0.01777474, + "auxiliary_loss_mlp": 0.01155217, + "balance_loss_clip": 1.29563761, + "balance_loss_mlp": 1.087888, + "epoch": 0.003968134676085976, + "flos": 18333475292160.0, + "grad_norm": 1.8550079005121831, + "language_loss": 0.85281348, + "learning_rate": 2.697518353781685e-06, + "loss": 0.88214046, + "num_input_tokens_seen": 1259315, + "step": 66, + "time_per_iteration": 2.769274950027466 + }, + { + "auxiliary_loss_clip": 0.01779987, + "auxiliary_loss_mlp": 0.01151372, + "balance_loss_clip": 1.29312515, + "balance_loss_mlp": 1.07650828, + "epoch": 0.004028257928753946, + "flos": 20485242506880.0, + "grad_norm": 2.74895944689593, + "language_loss": 0.96567476, + "learning_rate": 2.7072005239581103e-06, + "loss": 0.99498826, + "num_input_tokens_seen": 1277055, + "step": 67, + "time_per_iteration": 2.889369249343872 + }, + { + "auxiliary_loss_clip": 0.01752442, + "auxiliary_loss_mlp": 0.01152779, + "balance_loss_clip": 1.28765118, + "balance_loss_mlp": 1.08120584, + "epoch": 0.004088381181421915, + "flos": 18843837684480.0, + "grad_norm": 2.109359538419204, + "language_loss": 0.94516367, + "learning_rate": 2.7167392492896727e-06, + "loss": 0.97421581, + "num_input_tokens_seen": 1294355, + "step": 68, + "time_per_iteration": 2.8107409477233887 + }, + { + "auxiliary_loss_clip": 0.01747204, + "auxiliary_loss_mlp": 0.0115424, + "balance_loss_clip": 1.28511512, + "balance_loss_mlp": 1.08476448, + "epoch": 0.004148504434089885, + "flos": 19427817000960.0, + "grad_norm": 2.2931216646069092, + "language_loss": 0.96014255, + "learning_rate": 2.7261387181735195e-06, + "loss": 0.98915702, + "num_input_tokens_seen": 1313525, + "step": 69, + "time_per_iteration": 2.8138387203216553 + }, + { + "auxiliary_loss_clip": 0.01741342, + "auxiliary_loss_mlp": 0.01160375, + "balance_loss_clip": 1.28807163, + "balance_loss_mlp": 1.09581161, + "epoch": 0.004208627686757853, + "flos": 20811023884800.0, + "grad_norm": 2.1764096137707494, + "language_loss": 0.98070192, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.00971913, + "num_input_tokens_seen": 1330505, + "step": 70, + "time_per_iteration": 2.8319084644317627 + }, + { + "auxiliary_loss_clip": 0.0174721, + "auxiliary_loss_mlp": 0.01145619, + "balance_loss_clip": 1.27791202, + "balance_loss_mlp": 1.07685876, + "epoch": 0.004268750939425823, + "flos": 19098623831040.0, + "grad_norm": 2.9300158782571324, + "language_loss": 0.94016141, + "learning_rate": 2.7445357464116983e-06, + "loss": 0.96908975, + "num_input_tokens_seen": 1349615, + "step": 71, + "time_per_iteration": 2.8469433784484863 + }, + { + "auxiliary_loss_clip": 0.01815227, + "auxiliary_loss_mlp": 0.01294388, + "balance_loss_clip": 1.43495834, + "balance_loss_mlp": 1.25490558, + "epoch": 0.004328874192093792, + "flos": 52439635514880.0, + "grad_norm": 2.409331683106634, + "language_loss": 0.65682542, + "learning_rate": 2.75354081884615e-06, + "loss": 0.68792164, + "num_input_tokens_seen": 1410275, + "step": 72, + "time_per_iteration": 3.2019593715667725 + }, + { + "auxiliary_loss_clip": 0.01799527, + "auxiliary_loss_mlp": 0.01271558, + "balance_loss_clip": 1.43197393, + "balance_loss_mlp": 1.2316941, + "epoch": 0.004388997444761762, + "flos": 66473239564800.0, + "grad_norm": 2.25068040880696, + "language_loss": 0.63694263, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.66765356, + "num_input_tokens_seen": 1473020, + "step": 73, + "time_per_iteration": 3.3545596599578857 + }, + { + "auxiliary_loss_clip": 0.01720805, + "auxiliary_loss_mlp": 0.01140553, + "balance_loss_clip": 1.26912856, + "balance_loss_mlp": 1.07279444, + "epoch": 0.004449120697429731, + "flos": 18952970181120.0, + "grad_norm": 2.554977860093902, + "language_loss": 0.86212188, + "learning_rate": 2.771181708202938e-06, + "loss": 0.89073551, + "num_input_tokens_seen": 1490385, + "step": 74, + "time_per_iteration": 2.823498487472534 + }, + { + "auxiliary_loss_clip": 0.0172287, + "auxiliary_loss_mlp": 0.01162493, + "balance_loss_clip": 1.26811171, + "balance_loss_mlp": 1.09344697, + "epoch": 0.004509243950097701, + "flos": 21105491581440.0, + "grad_norm": 3.0087618017840105, + "language_loss": 0.97196102, + "learning_rate": 2.779824149153005e-06, + "loss": 1.00081468, + "num_input_tokens_seen": 1509725, + "step": 75, + "time_per_iteration": 2.888415575027466 + }, + { + "auxiliary_loss_clip": 0.0170198, + "auxiliary_loss_mlp": 0.01142315, + "balance_loss_clip": 1.26420689, + "balance_loss_mlp": 1.07608271, + "epoch": 0.004569367202765669, + "flos": 20698730991360.0, + "grad_norm": 2.6610382542709043, + "language_loss": 0.87740695, + "learning_rate": 2.788352117317012e-06, + "loss": 0.90584993, + "num_input_tokens_seen": 1527245, + "step": 76, + "time_per_iteration": 2.9226863384246826 + }, + { + "auxiliary_loss_clip": 0.01702512, + "auxiliary_loss_mlp": 0.01145374, + "balance_loss_clip": 1.26239479, + "balance_loss_mlp": 1.07656646, + "epoch": 0.004629490455433639, + "flos": 28658474899200.0, + "grad_norm": 2.4272090643104574, + "language_loss": 0.91791159, + "learning_rate": 2.796768605577095e-06, + "loss": 0.94639051, + "num_input_tokens_seen": 1548930, + "step": 77, + "time_per_iteration": 2.8720929622650146 + }, + { + "auxiliary_loss_clip": 0.01693018, + "auxiliary_loss_mlp": 0.01165978, + "balance_loss_clip": 1.26398146, + "balance_loss_mlp": 1.09569168, + "epoch": 0.004689613708101608, + "flos": 11072409805440.0, + "grad_norm": 2.2822185142383034, + "language_loss": 0.9211635, + "learning_rate": 2.80507649095533e-06, + "loss": 0.94975346, + "num_input_tokens_seen": 1565695, + "step": 78, + "time_per_iteration": 2.7832391262054443 + }, + { + "auxiliary_loss_clip": 0.01689271, + "auxiliary_loss_mlp": 0.01153255, + "balance_loss_clip": 1.25836253, + "balance_loss_mlp": 1.08482933, + "epoch": 0.004749736960769578, + "flos": 21799106184960.0, + "grad_norm": 2.263191265943929, + "language_loss": 0.82771945, + "learning_rate": 2.813278540517843e-06, + "loss": 0.85614467, + "num_input_tokens_seen": 1582625, + "step": 79, + "time_per_iteration": 2.7723355293273926 + }, + { + "auxiliary_loss_clip": 0.01702468, + "auxiliary_loss_mlp": 0.01130708, + "balance_loss_clip": 1.26147008, + "balance_loss_mlp": 1.0609467, + "epoch": 0.004809860213437547, + "flos": 19792597570560.0, + "grad_norm": 1.9992491725405546, + "language_loss": 0.91272199, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.94105375, + "num_input_tokens_seen": 1601725, + "step": 80, + "time_per_iteration": 2.742046356201172 + }, + { + "auxiliary_loss_clip": 0.01671156, + "auxiliary_loss_mlp": 0.01144048, + "balance_loss_clip": 1.25365841, + "balance_loss_mlp": 1.07371473, + "epoch": 0.004869983466105517, + "flos": 26574327037440.0, + "grad_norm": 2.0371265012476742, + "language_loss": 0.95241439, + "learning_rate": 2.829375683533245e-06, + "loss": 0.9805665, + "num_input_tokens_seen": 1622420, + "step": 81, + "time_per_iteration": 2.8996386528015137 + }, + { + "auxiliary_loss_clip": 0.01686092, + "auxiliary_loss_mlp": 0.01147828, + "balance_loss_clip": 1.25779653, + "balance_loss_mlp": 1.08149946, + "epoch": 0.004930106718773485, + "flos": 12823378087680.0, + "grad_norm": 2.9441337112970296, + "language_loss": 0.96288472, + "learning_rate": 2.8372758094402803e-06, + "loss": 0.99122393, + "num_input_tokens_seen": 1640715, + "step": 82, + "time_per_iteration": 2.819120407104492 + }, + { + "auxiliary_loss_clip": 0.01668255, + "auxiliary_loss_mlp": 0.01156428, + "balance_loss_clip": 1.2461338, + "balance_loss_mlp": 1.08709574, + "epoch": 0.004990229971441455, + "flos": 25774919902080.0, + "grad_norm": 2.6601797838877856, + "language_loss": 0.86762071, + "learning_rate": 2.84508017388607e-06, + "loss": 0.89586747, + "num_input_tokens_seen": 1662210, + "step": 83, + "time_per_iteration": 2.7959344387054443 + }, + { + "auxiliary_loss_clip": 0.01662665, + "auxiliary_loss_mlp": 0.01154043, + "balance_loss_clip": 1.24844718, + "balance_loss_mlp": 1.084234, + "epoch": 0.005050353224109424, + "flos": 17457254922240.0, + "grad_norm": 2.5416281292503986, + "language_loss": 0.92081314, + "learning_rate": 2.852791070641559e-06, + "loss": 0.94898021, + "num_input_tokens_seen": 1681070, + "step": 84, + "time_per_iteration": 2.7176246643066406 + }, + { + "auxiliary_loss_clip": 0.01647627, + "auxiliary_loss_mlp": 0.01154949, + "balance_loss_clip": 1.36429358, + "balance_loss_mlp": 1.11527622, + "epoch": 0.005110476476777394, + "flos": 69805460367360.0, + "grad_norm": 1.4023430227621099, + "language_loss": 0.6252538, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.65327954, + "num_input_tokens_seen": 1747140, + "step": 85, + "time_per_iteration": 3.296835422515869 + }, + { + "auxiliary_loss_clip": 0.01649469, + "auxiliary_loss_mlp": 0.0112642, + "balance_loss_clip": 1.23797417, + "balance_loss_mlp": 1.05642033, + "epoch": 0.005170599729445363, + "flos": 24790105739520.0, + "grad_norm": 1.805253124779358, + "language_loss": 0.90709531, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.93485421, + "num_input_tokens_seen": 1767475, + "step": 86, + "time_per_iteration": 2.761484146118164 + }, + { + "auxiliary_loss_clip": 0.01653351, + "auxiliary_loss_mlp": 0.01158608, + "balance_loss_clip": 1.24437881, + "balance_loss_mlp": 1.08741617, + "epoch": 0.005230722982113333, + "flos": 23258048895360.0, + "grad_norm": 2.3398213465495776, + "language_loss": 0.81961077, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.8477304, + "num_input_tokens_seen": 1784980, + "step": 87, + "time_per_iteration": 2.763185739517212 + }, + { + "auxiliary_loss_clip": 0.01641581, + "auxiliary_loss_mlp": 0.01152623, + "balance_loss_clip": 1.24129367, + "balance_loss_mlp": 1.08457828, + "epoch": 0.005290846234781301, + "flos": 16727909264640.0, + "grad_norm": 3.1951080427559857, + "language_loss": 0.95790672, + "learning_rate": 2.8827430842847267e-06, + "loss": 0.98584872, + "num_input_tokens_seen": 1803030, + "step": 88, + "time_per_iteration": 2.7855517864227295 + }, + { + "auxiliary_loss_clip": 0.01658657, + "auxiliary_loss_mlp": 0.01147064, + "balance_loss_clip": 1.24130976, + "balance_loss_mlp": 1.07978201, + "epoch": 0.005350969487449271, + "flos": 20886077352960.0, + "grad_norm": 3.405407923072192, + "language_loss": 0.86023164, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.88828892, + "num_input_tokens_seen": 1822865, + "step": 89, + "time_per_iteration": 2.7517924308776855 + }, + { + "auxiliary_loss_clip": 0.01647446, + "auxiliary_loss_mlp": 0.01133456, + "balance_loss_clip": 1.23541856, + "balance_loss_mlp": 1.06727123, + "epoch": 0.00541109274011724, + "flos": 26209977431040.0, + "grad_norm": 2.130771496386599, + "language_loss": 0.9150058, + "learning_rate": 2.8972122815946455e-06, + "loss": 0.94281483, + "num_input_tokens_seen": 1842435, + "step": 90, + "time_per_iteration": 2.7526872158050537 + }, + { + "auxiliary_loss_clip": 0.01629409, + "auxiliary_loss_mlp": 0.01133822, + "balance_loss_clip": 1.23219132, + "balance_loss_mlp": 1.06582534, + "epoch": 0.00547121599278521, + "flos": 21178569801600.0, + "grad_norm": 2.6928798867856796, + "language_loss": 0.86073506, + "learning_rate": 2.90432674275074e-06, + "loss": 0.88836741, + "num_input_tokens_seen": 1860065, + "step": 91, + "time_per_iteration": 2.7995588779449463 + }, + { + "auxiliary_loss_clip": 0.01628638, + "auxiliary_loss_mlp": 0.01138916, + "balance_loss_clip": 1.22774827, + "balance_loss_mlp": 1.07335091, + "epoch": 0.005531339245453179, + "flos": 19718801078400.0, + "grad_norm": 5.062847798051961, + "language_loss": 0.87041199, + "learning_rate": 2.91136344867656e-06, + "loss": 0.8980875, + "num_input_tokens_seen": 1878135, + "step": 92, + "time_per_iteration": 2.7813079357147217 + }, + { + "auxiliary_loss_clip": 0.01620799, + "auxiliary_loss_mlp": 0.01174163, + "balance_loss_clip": 1.21933174, + "balance_loss_mlp": 1.10650027, + "epoch": 0.005591462498121149, + "flos": 17636089760640.0, + "grad_norm": 4.340668874696889, + "language_loss": 0.9210887, + "learning_rate": 2.918324080615938e-06, + "loss": 0.94903833, + "num_input_tokens_seen": 1894895, + "step": 93, + "time_per_iteration": 2.7582218647003174 + }, + { + "auxiliary_loss_clip": 0.0163427, + "auxiliary_loss_mlp": 0.01153574, + "balance_loss_clip": 1.22659743, + "balance_loss_mlp": 1.08238208, + "epoch": 0.005651585750789117, + "flos": 20011221699840.0, + "grad_norm": 4.327341326162078, + "language_loss": 0.87578797, + "learning_rate": 2.925210265866963e-06, + "loss": 0.90366644, + "num_input_tokens_seen": 1913220, + "step": 94, + "time_per_iteration": 2.783581256866455 + }, + { + "auxiliary_loss_clip": 0.01570285, + "auxiliary_loss_mlp": 0.01051726, + "balance_loss_clip": 1.31970167, + "balance_loss_mlp": 1.01376939, + "epoch": 0.005711709003457087, + "flos": 59812957981440.0, + "grad_norm": 1.3608185384271176, + "language_loss": 0.68098927, + "learning_rate": 2.932023580065507e-06, + "loss": 0.70720935, + "num_input_tokens_seen": 1970970, + "step": 95, + "time_per_iteration": 3.1328847408294678 + }, + { + "auxiliary_loss_clip": 0.01612519, + "auxiliary_loss_mlp": 0.01150182, + "balance_loss_clip": 1.21488237, + "balance_loss_mlp": 1.08318627, + "epoch": 0.005771832256125056, + "flos": 15559591495680.0, + "grad_norm": 6.736145376327001, + "language_loss": 0.90221369, + "learning_rate": 2.9387655493491906e-06, + "loss": 0.92984068, + "num_input_tokens_seen": 1988930, + "step": 96, + "time_per_iteration": 2.8015241622924805 + }, + { + "auxiliary_loss_clip": 0.01605814, + "auxiliary_loss_mlp": 0.01142022, + "balance_loss_clip": 1.21851277, + "balance_loss_mlp": 1.08003318, + "epoch": 0.005831955508793026, + "flos": 22528380015360.0, + "grad_norm": 3.8307865500968044, + "language_loss": 0.89869905, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.92617744, + "num_input_tokens_seen": 2006285, + "step": 97, + "time_per_iteration": 4.387299060821533 + }, + { + "auxiliary_loss_clip": 0.01593214, + "auxiliary_loss_mlp": 0.01140673, + "balance_loss_clip": 1.2102325, + "balance_loss_mlp": 1.07200789, + "epoch": 0.005892078761460995, + "flos": 22049834094720.0, + "grad_norm": 2.291581893082518, + "language_loss": 0.76274347, + "learning_rate": 2.952041322436969e-06, + "loss": 0.79008234, + "num_input_tokens_seen": 2024905, + "step": 98, + "time_per_iteration": 2.751507043838501 + }, + { + "auxiliary_loss_clip": 0.01533926, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.29271698, + "balance_loss_mlp": 1.00129879, + "epoch": 0.005952202014128965, + "flos": 68539143317760.0, + "grad_norm": 1.0388395506080574, + "language_loss": 0.65518898, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.68089598, + "num_input_tokens_seen": 2086220, + "step": 99, + "time_per_iteration": 3.3125040531158447 + }, + { + "auxiliary_loss_clip": 0.01595694, + "auxiliary_loss_mlp": 0.01142556, + "balance_loss_clip": 1.21028757, + "balance_loss_mlp": 1.07217503, + "epoch": 0.006012325266796933, + "flos": 22960887678720.0, + "grad_norm": 2.051483688350497, + "language_loss": 0.90885437, + "learning_rate": 2.9650488796560464e-06, + "loss": 0.93623686, + "num_input_tokens_seen": 2103365, + "step": 100, + "time_per_iteration": 2.7632548809051514 + }, + { + "auxiliary_loss_clip": 0.01607235, + "auxiliary_loss_mlp": 0.01150276, + "balance_loss_clip": 1.21294045, + "balance_loss_mlp": 1.08394814, + "epoch": 0.006072448519464903, + "flos": 17347942857600.0, + "grad_norm": 2.0181737234491566, + "language_loss": 0.91081136, + "learning_rate": 2.971455421902446e-06, + "loss": 0.9383865, + "num_input_tokens_seen": 2121995, + "step": 101, + "time_per_iteration": 2.7214279174804688 + }, + { + "auxiliary_loss_clip": 0.015938, + "auxiliary_loss_mlp": 0.01152009, + "balance_loss_clip": 1.21248627, + "balance_loss_mlp": 1.08124638, + "epoch": 0.006132571772132872, + "flos": 24681116897280.0, + "grad_norm": 2.076276442041171, + "language_loss": 0.90774924, + "learning_rate": 2.9777988444798075e-06, + "loss": 0.93520737, + "num_input_tokens_seen": 2141815, + "step": 102, + "time_per_iteration": 2.8389108180999756 + }, + { + "auxiliary_loss_clip": 0.01588155, + "auxiliary_loss_mlp": 0.01133785, + "balance_loss_clip": 1.20914173, + "balance_loss_mlp": 1.06912589, + "epoch": 0.006192695024800842, + "flos": 21465675210240.0, + "grad_norm": 2.3272829989328456, + "language_loss": 0.88006896, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.90728837, + "num_input_tokens_seen": 2161125, + "step": 103, + "time_per_iteration": 2.768784761428833 + }, + { + "auxiliary_loss_clip": 0.01588751, + "auxiliary_loss_mlp": 0.01136216, + "balance_loss_clip": 1.21138883, + "balance_loss_mlp": 1.06998372, + "epoch": 0.006252818277468811, + "flos": 17420410546560.0, + "grad_norm": 1.9182889224259552, + "language_loss": 0.93644351, + "learning_rate": 2.990301221458371e-06, + "loss": 0.96369314, + "num_input_tokens_seen": 2179510, + "step": 104, + "time_per_iteration": 2.7109038829803467 + }, + { + "auxiliary_loss_clip": 0.01579421, + "auxiliary_loss_mlp": 0.01146524, + "balance_loss_clip": 1.20086741, + "balance_loss_mlp": 1.08258009, + "epoch": 0.006312941530136781, + "flos": 19099557584640.0, + "grad_norm": 3.0437899698059367, + "language_loss": 0.96655375, + "learning_rate": 2.9964625333900544e-06, + "loss": 0.99381316, + "num_input_tokens_seen": 2197870, + "step": 105, + "time_per_iteration": 2.7254133224487305 + }, + { + "auxiliary_loss_clip": 0.01578331, + "auxiliary_loss_mlp": 0.01158544, + "balance_loss_clip": 1.20144236, + "balance_loss_mlp": 1.08768642, + "epoch": 0.006373064782804749, + "flos": 24060831909120.0, + "grad_norm": 3.1837681777002302, + "language_loss": 0.87119448, + "learning_rate": 3.002565443382063e-06, + "loss": 0.89856327, + "num_input_tokens_seen": 2217495, + "step": 106, + "time_per_iteration": 2.7705447673797607 + }, + { + "auxiliary_loss_clip": 0.01561845, + "auxiliary_loss_mlp": 0.01143018, + "balance_loss_clip": 1.18746924, + "balance_loss_mlp": 1.0751636, + "epoch": 0.006433188035472719, + "flos": 18332433797760.0, + "grad_norm": 2.228856706842439, + "language_loss": 0.83398581, + "learning_rate": 3.008611048208843e-06, + "loss": 0.86103439, + "num_input_tokens_seen": 2236520, + "step": 107, + "time_per_iteration": 2.6885263919830322 + }, + { + "auxiliary_loss_clip": 0.01469631, + "auxiliary_loss_mlp": 0.0103327, + "balance_loss_clip": 1.25210869, + "balance_loss_mlp": 1.00179863, + "epoch": 0.006493311288140688, + "flos": 62562387594240.0, + "grad_norm": 0.9900995959758047, + "language_loss": 0.64796811, + "learning_rate": 3.014600414036285e-06, + "loss": 0.67299712, + "num_input_tokens_seen": 2300140, + "step": 108, + "time_per_iteration": 3.278621196746826 + }, + { + "auxiliary_loss_clip": 0.01552898, + "auxiliary_loss_mlp": 0.01132858, + "balance_loss_clip": 1.18960094, + "balance_loss_mlp": 1.06424141, + "epoch": 0.006553434540808658, + "flos": 19500141035520.0, + "grad_norm": 2.019247660217844, + "language_loss": 0.97709465, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.00395215, + "num_input_tokens_seen": 2317320, + "step": 109, + "time_per_iteration": 2.750502347946167 + }, + { + "auxiliary_loss_clip": 0.01550996, + "auxiliary_loss_mlp": 0.01140204, + "balance_loss_clip": 1.19136214, + "balance_loss_mlp": 1.07430482, + "epoch": 0.006613557793476627, + "flos": 21105132445440.0, + "grad_norm": 1.9540987754213832, + "language_loss": 0.84243041, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.86934245, + "num_input_tokens_seen": 2337820, + "step": 110, + "time_per_iteration": 2.82443904876709 + }, + { + "auxiliary_loss_clip": 0.01544634, + "auxiliary_loss_mlp": 0.01151549, + "balance_loss_clip": 1.18396342, + "balance_loss_mlp": 1.08493507, + "epoch": 0.006673681046144597, + "flos": 26030747543040.0, + "grad_norm": 2.4580319150483563, + "language_loss": 0.82940048, + "learning_rate": 3.032241303393073e-06, + "loss": 0.85636234, + "num_input_tokens_seen": 2358560, + "step": 111, + "time_per_iteration": 2.8308968544006348 + }, + { + "auxiliary_loss_clip": 0.0154596, + "auxiliary_loss_mlp": 0.01133366, + "balance_loss_clip": 1.18776846, + "balance_loss_mlp": 1.06970847, + "epoch": 0.006733804298812566, + "flos": 23147767163520.0, + "grad_norm": 2.356589096997363, + "language_loss": 0.93989801, + "learning_rate": 3.0380158011446e-06, + "loss": 0.9666912, + "num_input_tokens_seen": 2379005, + "step": 112, + "time_per_iteration": 2.8007922172546387 + }, + { + "auxiliary_loss_clip": 0.01549647, + "auxiliary_loss_mlp": 0.01136979, + "balance_loss_clip": 1.18394601, + "balance_loss_mlp": 1.07322621, + "epoch": 0.006793927551480535, + "flos": 11764444210560.0, + "grad_norm": 2.521639841990545, + "language_loss": 0.79509294, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.82195914, + "num_input_tokens_seen": 2395610, + "step": 113, + "time_per_iteration": 2.7599966526031494 + }, + { + "auxiliary_loss_clip": 0.0153736, + "auxiliary_loss_mlp": 0.01131524, + "balance_loss_clip": 1.18028498, + "balance_loss_mlp": 1.06562555, + "epoch": 0.006854050804148504, + "flos": 19171953446400.0, + "grad_norm": 2.343117351168218, + "language_loss": 0.93439317, + "learning_rate": 3.0494117125071475e-06, + "loss": 0.96108204, + "num_input_tokens_seen": 2415005, + "step": 114, + "time_per_iteration": 2.723540782928467 + }, + { + "auxiliary_loss_clip": 0.01544971, + "auxiliary_loss_mlp": 0.01138932, + "balance_loss_clip": 1.17997146, + "balance_loss_mlp": 1.07918465, + "epoch": 0.006914174056816474, + "flos": 21981891519360.0, + "grad_norm": 1.9509019191057126, + "language_loss": 0.9463321, + "learning_rate": 3.055034911425055e-06, + "loss": 0.97317111, + "num_input_tokens_seen": 2433965, + "step": 115, + "time_per_iteration": 2.7077698707580566 + }, + { + "auxiliary_loss_clip": 0.01537699, + "auxiliary_loss_mlp": 0.01118178, + "balance_loss_clip": 1.17675614, + "balance_loss_mlp": 1.05151677, + "epoch": 0.006974297309484443, + "flos": 16289152634880.0, + "grad_norm": 10.363795807176915, + "language_loss": 0.82148951, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.84804827, + "num_input_tokens_seen": 2451605, + "step": 116, + "time_per_iteration": 2.681190013885498 + }, + { + "auxiliary_loss_clip": 0.01528803, + "auxiliary_loss_mlp": 0.01126189, + "balance_loss_clip": 1.17677391, + "balance_loss_mlp": 1.06219721, + "epoch": 0.007034420562152413, + "flos": 26104005331200.0, + "grad_norm": 2.4150591879391627, + "language_loss": 0.88368428, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.91023421, + "num_input_tokens_seen": 2472035, + "step": 117, + "time_per_iteration": 2.776143789291382 + }, + { + "auxiliary_loss_clip": 0.01527909, + "auxiliary_loss_mlp": 0.01146127, + "balance_loss_clip": 1.17495561, + "balance_loss_mlp": 1.08041906, + "epoch": 0.007094543814820382, + "flos": 14204609723520.0, + "grad_norm": 2.3639764059040265, + "language_loss": 0.8454417, + "learning_rate": 3.071615712271274e-06, + "loss": 0.87218207, + "num_input_tokens_seen": 2489285, + "step": 118, + "time_per_iteration": 2.7110469341278076 + }, + { + "auxiliary_loss_clip": 0.01538161, + "auxiliary_loss_mlp": 0.01163868, + "balance_loss_clip": 1.1759789, + "balance_loss_mlp": 1.0984937, + "epoch": 0.007154667067488351, + "flos": 14976007228800.0, + "grad_norm": 2.231843342078736, + "language_loss": 0.99319011, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.02021039, + "num_input_tokens_seen": 2506460, + "step": 119, + "time_per_iteration": 2.674121856689453 + }, + { + "auxiliary_loss_clip": 0.01540018, + "auxiliary_loss_mlp": 0.01120611, + "balance_loss_clip": 1.17242217, + "balance_loss_mlp": 1.05738258, + "epoch": 0.00721479032015632, + "flos": 20193288762240.0, + "grad_norm": 2.7981733983226764, + "language_loss": 0.8963809, + "learning_rate": 3.082437012097686e-06, + "loss": 0.92298722, + "num_input_tokens_seen": 2525565, + "step": 120, + "time_per_iteration": 2.745962381362915 + }, + { + "auxiliary_loss_clip": 0.01524916, + "auxiliary_loss_mlp": 0.01129465, + "balance_loss_clip": 1.1734432, + "balance_loss_mlp": 1.06513989, + "epoch": 0.00727491357282429, + "flos": 23147228459520.0, + "grad_norm": 1.797716104424251, + "language_loss": 0.93491542, + "learning_rate": 3.0877802144103967e-06, + "loss": 0.96145928, + "num_input_tokens_seen": 2546605, + "step": 121, + "time_per_iteration": 2.7924466133117676 + }, + { + "auxiliary_loss_clip": 0.01526294, + "auxiliary_loss_mlp": 0.0114832, + "balance_loss_clip": 1.17395604, + "balance_loss_mlp": 1.08490098, + "epoch": 0.007335036825492259, + "flos": 15521669712000.0, + "grad_norm": 2.3704869501778285, + "language_loss": 0.90462255, + "learning_rate": 3.09307943925077e-06, + "loss": 0.93136871, + "num_input_tokens_seen": 2560730, + "step": 122, + "time_per_iteration": 2.930413246154785 + }, + { + "auxiliary_loss_clip": 0.01521826, + "auxiliary_loss_mlp": 0.01146566, + "balance_loss_clip": 1.1681807, + "balance_loss_mlp": 1.07861674, + "epoch": 0.007395160078160229, + "flos": 24243365848320.0, + "grad_norm": 2.4867163179710037, + "language_loss": 0.92660481, + "learning_rate": 3.0983354046304154e-06, + "loss": 0.95328873, + "num_input_tokens_seen": 2579550, + "step": 123, + "time_per_iteration": 2.7484309673309326 + }, + { + "auxiliary_loss_clip": 0.01519363, + "auxiliary_loss_mlp": 0.01127611, + "balance_loss_clip": 1.16324139, + "balance_loss_mlp": 1.0651449, + "epoch": 0.007455283330828198, + "flos": 31759792099200.0, + "grad_norm": 2.366639004459226, + "language_loss": 0.71187961, + "learning_rate": 3.103548811118979e-06, + "loss": 0.73834932, + "num_input_tokens_seen": 2600390, + "step": 124, + "time_per_iteration": 2.8419976234436035 + }, + { + "auxiliary_loss_clip": 0.01506936, + "auxiliary_loss_mlp": 0.01125571, + "balance_loss_clip": 1.16464007, + "balance_loss_mlp": 1.06167519, + "epoch": 0.007515406583496167, + "flos": 26615157822720.0, + "grad_norm": 2.1632751269766106, + "language_loss": 0.88450015, + "learning_rate": 3.108720342404542e-06, + "loss": 0.91082525, + "num_input_tokens_seen": 2620770, + "step": 125, + "time_per_iteration": 2.823296308517456 + }, + { + "auxiliary_loss_clip": 0.01522239, + "auxiliary_loss_mlp": 0.01142214, + "balance_loss_clip": 1.16456664, + "balance_loss_mlp": 1.07912827, + "epoch": 0.007575529836164136, + "flos": 18223696350720.0, + "grad_norm": 2.6632616920164067, + "language_loss": 0.82381976, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.85046428, + "num_input_tokens_seen": 2639900, + "step": 126, + "time_per_iteration": 2.7325809001922607 + }, + { + "auxiliary_loss_clip": 0.015153, + "auxiliary_loss_mlp": 0.01142869, + "balance_loss_clip": 1.16330886, + "balance_loss_mlp": 1.08088017, + "epoch": 0.007635653088832106, + "flos": 21580410228480.0, + "grad_norm": 3.925284628341409, + "language_loss": 0.6743899, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.7009716, + "num_input_tokens_seen": 2657450, + "step": 127, + "time_per_iteration": 2.709821939468384 + }, + { + "auxiliary_loss_clip": 0.01503057, + "auxiliary_loss_mlp": 0.01132416, + "balance_loss_clip": 1.165169, + "balance_loss_mlp": 1.06861567, + "epoch": 0.007695776341500075, + "flos": 25375054723200.0, + "grad_norm": 2.0535131533503734, + "language_loss": 0.8819322, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.90828693, + "num_input_tokens_seen": 2678150, + "step": 128, + "time_per_iteration": 2.764707565307617 + }, + { + "auxiliary_loss_clip": 0.01505955, + "auxiliary_loss_mlp": 0.01144223, + "balance_loss_clip": 1.16043079, + "balance_loss_mlp": 1.08042252, + "epoch": 0.007755899594168045, + "flos": 22343906741760.0, + "grad_norm": 2.6427711693827005, + "language_loss": 0.84719259, + "learning_rate": 3.129000827968184e-06, + "loss": 0.87369436, + "num_input_tokens_seen": 2698290, + "step": 129, + "time_per_iteration": 2.7472774982452393 + }, + { + "auxiliary_loss_clip": 0.01497871, + "auxiliary_loss_mlp": 0.01130211, + "balance_loss_clip": 1.15871263, + "balance_loss_mlp": 1.06655347, + "epoch": 0.007816022846836013, + "flos": 22638230784000.0, + "grad_norm": 2.366492959329914, + "language_loss": 0.97564614, + "learning_rate": 3.133972684206866e-06, + "loss": 1.00192702, + "num_input_tokens_seen": 2717630, + "step": 130, + "time_per_iteration": 2.6955018043518066 + }, + { + "auxiliary_loss_clip": 0.01492272, + "auxiliary_loss_mlp": 0.01134965, + "balance_loss_clip": 1.15630865, + "balance_loss_mlp": 1.06987715, + "epoch": 0.007876146099503984, + "flos": 18182901479040.0, + "grad_norm": 2.2164470079204572, + "language_loss": 0.82658112, + "learning_rate": 3.138906441556014e-06, + "loss": 0.85285342, + "num_input_tokens_seen": 2735835, + "step": 131, + "time_per_iteration": 2.722247362136841 + }, + { + "auxiliary_loss_clip": 0.01500937, + "auxiliary_loss_mlp": 0.01128359, + "balance_loss_clip": 1.15885806, + "balance_loss_mlp": 1.06694245, + "epoch": 0.007936269352171952, + "flos": 27119486730240.0, + "grad_norm": 2.7663180664822193, + "language_loss": 0.82781422, + "learning_rate": 3.143802679474861e-06, + "loss": 0.85410714, + "num_input_tokens_seen": 2756335, + "step": 132, + "time_per_iteration": 2.7937612533569336 + }, + { + "auxiliary_loss_clip": 0.01491919, + "auxiliary_loss_mlp": 0.01128624, + "balance_loss_clip": 1.15346444, + "balance_loss_mlp": 1.0664922, + "epoch": 0.007996392604839923, + "flos": 19026335710080.0, + "grad_norm": 2.182366740159355, + "language_loss": 0.95499313, + "learning_rate": 3.1486619643025565e-06, + "loss": 0.98119843, + "num_input_tokens_seen": 2775090, + "step": 133, + "time_per_iteration": 2.7380354404449463 + }, + { + "auxiliary_loss_clip": 0.01487746, + "auxiliary_loss_mlp": 0.0112871, + "balance_loss_clip": 1.16170454, + "balance_loss_mlp": 1.06843781, + "epoch": 0.008056515857507891, + "flos": 25484151306240.0, + "grad_norm": 1.8164116645967854, + "language_loss": 0.73478442, + "learning_rate": 3.153484849651286e-06, + "loss": 0.76094896, + "num_input_tokens_seen": 2795320, + "step": 134, + "time_per_iteration": 2.7483408451080322 + }, + { + "auxiliary_loss_clip": 0.01484621, + "auxiliary_loss_mlp": 0.01132134, + "balance_loss_clip": 1.15115011, + "balance_loss_mlp": 1.06695068, + "epoch": 0.00811663911017586, + "flos": 20557566541440.0, + "grad_norm": 5.027018494085059, + "language_loss": 0.88792509, + "learning_rate": 3.1582718767847806e-06, + "loss": 0.91409266, + "num_input_tokens_seen": 2812815, + "step": 135, + "time_per_iteration": 2.6838128566741943 + }, + { + "auxiliary_loss_clip": 0.01487119, + "auxiliary_loss_mlp": 0.0113257, + "balance_loss_clip": 1.15490174, + "balance_loss_mlp": 1.06714821, + "epoch": 0.00817676236284383, + "flos": 18799738761600.0, + "grad_norm": 1.9282722528396903, + "language_loss": 0.89138198, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.91757882, + "num_input_tokens_seen": 2830445, + "step": 136, + "time_per_iteration": 2.726475238800049 + }, + { + "auxiliary_loss_clip": 0.01483417, + "auxiliary_loss_mlp": 0.01110724, + "balance_loss_clip": 1.1494019, + "balance_loss_mlp": 1.05078554, + "epoch": 0.008236885615511799, + "flos": 23873593288320.0, + "grad_norm": 2.2984339413846078, + "language_loss": 0.84091324, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.86685467, + "num_input_tokens_seen": 2846965, + "step": 137, + "time_per_iteration": 7.4708640575408936 + }, + { + "auxiliary_loss_clip": 0.01481848, + "auxiliary_loss_mlp": 0.01118837, + "balance_loss_clip": 1.1500535, + "balance_loss_mlp": 1.05894589, + "epoch": 0.00829700886817977, + "flos": 24643626076800.0, + "grad_norm": 1.69378413504035, + "language_loss": 0.9018681, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.92787492, + "num_input_tokens_seen": 2867520, + "step": 138, + "time_per_iteration": 4.311830520629883 + }, + { + "auxiliary_loss_clip": 0.01469655, + "auxiliary_loss_mlp": 0.01123604, + "balance_loss_clip": 1.14824438, + "balance_loss_mlp": 1.05904007, + "epoch": 0.008357132120847738, + "flos": 25262007644160.0, + "grad_norm": 2.1515203004813785, + "language_loss": 0.91478992, + "learning_rate": 3.177071816289865e-06, + "loss": 0.94072247, + "num_input_tokens_seen": 2885675, + "step": 139, + "time_per_iteration": 2.7678122520446777 + }, + { + "auxiliary_loss_clip": 0.01486799, + "auxiliary_loss_mlp": 0.01124947, + "balance_loss_clip": 1.15521085, + "balance_loss_mlp": 1.06195688, + "epoch": 0.008417255373515706, + "flos": 27344898529920.0, + "grad_norm": 2.305315677890536, + "language_loss": 0.85667789, + "learning_rate": 3.181687263893095e-06, + "loss": 0.88279533, + "num_input_tokens_seen": 2905960, + "step": 140, + "time_per_iteration": 2.8557639122009277 + }, + { + "auxiliary_loss_clip": 0.01473538, + "auxiliary_loss_mlp": 0.01122701, + "balance_loss_clip": 1.14923954, + "balance_loss_mlp": 1.06166625, + "epoch": 0.008477378626183677, + "flos": 17639070589440.0, + "grad_norm": 2.3443620963590455, + "language_loss": 0.84346074, + "learning_rate": 3.186269861057098e-06, + "loss": 0.86942315, + "num_input_tokens_seen": 2922780, + "step": 141, + "time_per_iteration": 2.7656807899475098 + }, + { + "auxiliary_loss_clip": 0.01477141, + "auxiliary_loss_mlp": 0.01135217, + "balance_loss_clip": 1.14718878, + "balance_loss_mlp": 1.07360983, + "epoch": 0.008537501878851645, + "flos": 13881342297600.0, + "grad_norm": 2.29020652115343, + "language_loss": 0.8105557, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.83667928, + "num_input_tokens_seen": 2938765, + "step": 142, + "time_per_iteration": 2.747598171234131 + }, + { + "auxiliary_loss_clip": 0.01378886, + "auxiliary_loss_mlp": 0.01060004, + "balance_loss_clip": 1.19240355, + "balance_loss_mlp": 1.03406358, + "epoch": 0.008597625131519616, + "flos": 71248101281280.0, + "grad_norm": 1.056887207538052, + "language_loss": 0.66899812, + "learning_rate": 3.195338351584042e-06, + "loss": 0.69338703, + "num_input_tokens_seen": 3006665, + "step": 143, + "time_per_iteration": 3.346982002258301 + }, + { + "auxiliary_loss_clip": 0.01467707, + "auxiliary_loss_mlp": 0.01123721, + "balance_loss_clip": 1.14666772, + "balance_loss_mlp": 1.06273365, + "epoch": 0.008657748384187584, + "flos": 17602836744960.0, + "grad_norm": 2.6467048454978523, + "language_loss": 0.84356761, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.86948192, + "num_input_tokens_seen": 3024335, + "step": 144, + "time_per_iteration": 2.762087345123291 + }, + { + "auxiliary_loss_clip": 0.01455701, + "auxiliary_loss_mlp": 0.01114511, + "balance_loss_clip": 1.14058816, + "balance_loss_mlp": 1.05085373, + "epoch": 0.008717871636855555, + "flos": 19715317459200.0, + "grad_norm": 1.8692883316747366, + "language_loss": 0.88353741, + "learning_rate": 3.204280886775619e-06, + "loss": 0.90923953, + "num_input_tokens_seen": 3043300, + "step": 145, + "time_per_iteration": 2.7050039768218994 + }, + { + "auxiliary_loss_clip": 0.01470385, + "auxiliary_loss_mlp": 0.01121817, + "balance_loss_clip": 1.14247775, + "balance_loss_mlp": 1.05873132, + "epoch": 0.008777994889523523, + "flos": 24717422568960.0, + "grad_norm": 1.860830881508538, + "language_loss": 0.86182559, + "learning_rate": 3.208706005112005e-06, + "loss": 0.88774765, + "num_input_tokens_seen": 3064610, + "step": 146, + "time_per_iteration": 2.741013288497925 + }, + { + "auxiliary_loss_clip": 0.01356998, + "auxiliary_loss_mlp": 0.01029681, + "balance_loss_clip": 1.18072379, + "balance_loss_mlp": 1.00431335, + "epoch": 0.008838118142191492, + "flos": 70132067758080.0, + "grad_norm": 0.8598047517885464, + "language_loss": 0.60122073, + "learning_rate": 3.213100917627104e-06, + "loss": 0.6250875, + "num_input_tokens_seen": 3130385, + "step": 147, + "time_per_iteration": 3.27382230758667 + }, + { + "auxiliary_loss_clip": 0.01463009, + "auxiliary_loss_mlp": 0.01123472, + "balance_loss_clip": 1.14658976, + "balance_loss_mlp": 1.06548882, + "epoch": 0.008898241394859462, + "flos": 20044797937920.0, + "grad_norm": 1.8116070485228748, + "language_loss": 0.84620225, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.87206709, + "num_input_tokens_seen": 3149760, + "step": 148, + "time_per_iteration": 2.72910475730896 + }, + { + "auxiliary_loss_clip": 0.01466623, + "auxiliary_loss_mlp": 0.01144944, + "balance_loss_clip": 1.14777792, + "balance_loss_mlp": 1.07985532, + "epoch": 0.008958364647527431, + "flos": 10743611685120.0, + "grad_norm": 2.5530775415688205, + "language_loss": 0.88680327, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.91291893, + "num_input_tokens_seen": 3164500, + "step": 149, + "time_per_iteration": 2.688528537750244 + }, + { + "auxiliary_loss_clip": 0.01463954, + "auxiliary_loss_mlp": 0.01114885, + "balance_loss_clip": 1.14290714, + "balance_loss_mlp": 1.05728304, + "epoch": 0.009018487900195401, + "flos": 29127467802240.0, + "grad_norm": 2.1996557200804823, + "language_loss": 0.93269086, + "learning_rate": 3.226108474846181e-06, + "loss": 0.95847929, + "num_input_tokens_seen": 3182455, + "step": 150, + "time_per_iteration": 2.7901580333709717 + }, + { + "auxiliary_loss_clip": 0.01450819, + "auxiliary_loss_mlp": 0.01114571, + "balance_loss_clip": 1.13812149, + "balance_loss_mlp": 1.05839944, + "epoch": 0.00907861115286337, + "flos": 32963661354240.0, + "grad_norm": 4.690239135210318, + "language_loss": 0.7421813, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.7678352, + "num_input_tokens_seen": 3203995, + "step": 151, + "time_per_iteration": 2.79590106010437 + }, + { + "auxiliary_loss_clip": 0.01463077, + "auxiliary_loss_mlp": 0.01128244, + "balance_loss_clip": 1.14311624, + "balance_loss_mlp": 1.06954527, + "epoch": 0.009138734405531338, + "flos": 21762441377280.0, + "grad_norm": 4.291097242497492, + "language_loss": 0.88460332, + "learning_rate": 3.234636443010188e-06, + "loss": 0.9105165, + "num_input_tokens_seen": 3222575, + "step": 152, + "time_per_iteration": 2.701775550842285 + }, + { + "auxiliary_loss_clip": 0.01462099, + "auxiliary_loss_mlp": 0.01122264, + "balance_loss_clip": 1.14743185, + "balance_loss_mlp": 1.06275451, + "epoch": 0.009198857658199309, + "flos": 20842517134080.0, + "grad_norm": 3.861411936226758, + "language_loss": 0.83918798, + "learning_rate": 3.238858439669943e-06, + "loss": 0.8650316, + "num_input_tokens_seen": 3240180, + "step": 153, + "time_per_iteration": 2.730654716491699 + }, + { + "auxiliary_loss_clip": 0.01453756, + "auxiliary_loss_mlp": 0.01136244, + "balance_loss_clip": 1.14024806, + "balance_loss_mlp": 1.07554269, + "epoch": 0.009258980910867277, + "flos": 24827381078400.0, + "grad_norm": 1.8788427995178905, + "language_loss": 0.89924759, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.92514759, + "num_input_tokens_seen": 3259800, + "step": 154, + "time_per_iteration": 2.8150386810302734 + }, + { + "auxiliary_loss_clip": 0.01457041, + "auxiliary_loss_mlp": 0.01148182, + "balance_loss_clip": 1.1422174, + "balance_loss_mlp": 1.08934021, + "epoch": 0.009319104163535248, + "flos": 28767786963840.0, + "grad_norm": 2.155148564981828, + "language_loss": 0.89730597, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.9233582, + "num_input_tokens_seen": 3280400, + "step": 155, + "time_per_iteration": 2.7780215740203857 + }, + { + "auxiliary_loss_clip": 0.01462257, + "auxiliary_loss_mlp": 0.01115972, + "balance_loss_clip": 1.14140153, + "balance_loss_mlp": 1.0580368, + "epoch": 0.009379227416203216, + "flos": 16582004219520.0, + "grad_norm": 2.6722626388977986, + "language_loss": 0.86758631, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.8933686, + "num_input_tokens_seen": 3297600, + "step": 156, + "time_per_iteration": 2.7195818424224854 + }, + { + "auxiliary_loss_clip": 0.01460326, + "auxiliary_loss_mlp": 0.01116019, + "balance_loss_clip": 1.14530039, + "balance_loss_mlp": 1.05770147, + "epoch": 0.009439350668871187, + "flos": 18329919845760.0, + "grad_norm": 2.3212743339319926, + "language_loss": 0.99652225, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.0222857, + "num_input_tokens_seen": 3313635, + "step": 157, + "time_per_iteration": 2.7530624866485596 + }, + { + "auxiliary_loss_clip": 0.01445494, + "auxiliary_loss_mlp": 0.01139991, + "balance_loss_clip": 1.14011836, + "balance_loss_mlp": 1.08162606, + "epoch": 0.009499473921539155, + "flos": 24349912565760.0, + "grad_norm": 2.2650385025378834, + "language_loss": 0.88388717, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.90974212, + "num_input_tokens_seen": 3333735, + "step": 158, + "time_per_iteration": 2.744640588760376 + }, + { + "auxiliary_loss_clip": 0.01451838, + "auxiliary_loss_mlp": 0.01122147, + "balance_loss_clip": 1.13977575, + "balance_loss_mlp": 1.0630666, + "epoch": 0.009559597174207124, + "flos": 16399326625920.0, + "grad_norm": 2.1807440045696165, + "language_loss": 0.86407602, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.88981581, + "num_input_tokens_seen": 3348800, + "step": 159, + "time_per_iteration": 2.7330005168914795 + }, + { + "auxiliary_loss_clip": 0.01441743, + "auxiliary_loss_mlp": 0.01137796, + "balance_loss_clip": 1.13474953, + "balance_loss_mlp": 1.07752383, + "epoch": 0.009619720426875094, + "flos": 22856890826880.0, + "grad_norm": 1.7296815250329798, + "language_loss": 0.86756837, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.89336377, + "num_input_tokens_seen": 3368595, + "step": 160, + "time_per_iteration": 2.844817876815796 + }, + { + "auxiliary_loss_clip": 0.01447614, + "auxiliary_loss_mlp": 0.0112266, + "balance_loss_clip": 1.13978457, + "balance_loss_mlp": 1.06725168, + "epoch": 0.009679843679543063, + "flos": 19135001329920.0, + "grad_norm": 2.462408333273543, + "language_loss": 0.91543746, + "learning_rate": 3.2716732956621042e-06, + "loss": 0.94114017, + "num_input_tokens_seen": 3384975, + "step": 161, + "time_per_iteration": 2.667666435241699 + }, + { + "auxiliary_loss_clip": 0.01453392, + "auxiliary_loss_mlp": 0.01111804, + "balance_loss_clip": 1.14104879, + "balance_loss_mlp": 1.05610919, + "epoch": 0.009739966932211033, + "flos": 20302995876480.0, + "grad_norm": 1.7914334411859298, + "language_loss": 0.91582954, + "learning_rate": 3.2756600092264203e-06, + "loss": 0.94148147, + "num_input_tokens_seen": 3404755, + "step": 162, + "time_per_iteration": 2.6779961585998535 + }, + { + "auxiliary_loss_clip": 0.0131522, + "auxiliary_loss_mlp": 0.01056953, + "balance_loss_clip": 1.15019548, + "balance_loss_mlp": 1.03358769, + "epoch": 0.009800090184879002, + "flos": 67034234177280.0, + "grad_norm": 1.183297200633083, + "language_loss": 0.72292268, + "learning_rate": 3.279622189013474e-06, + "loss": 0.74664438, + "num_input_tokens_seen": 3467210, + "step": 163, + "time_per_iteration": 3.226755142211914 + }, + { + "auxiliary_loss_clip": 0.01439788, + "auxiliary_loss_mlp": 0.01116102, + "balance_loss_clip": 1.13873029, + "balance_loss_mlp": 1.05921507, + "epoch": 0.00986021343754697, + "flos": 17164690646400.0, + "grad_norm": 3.3372881081540937, + "language_loss": 0.84684807, + "learning_rate": 3.283560135133457e-06, + "loss": 0.87240696, + "num_input_tokens_seen": 3483220, + "step": 164, + "time_per_iteration": 2.768935203552246 + }, + { + "auxiliary_loss_clip": 0.01430933, + "auxiliary_loss_mlp": 0.0110117, + "balance_loss_clip": 1.13048434, + "balance_loss_mlp": 1.04533219, + "epoch": 0.00992033669021494, + "flos": 17749424148480.0, + "grad_norm": 4.079659732294038, + "language_loss": 0.89080763, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.91612864, + "num_input_tokens_seen": 3501465, + "step": 165, + "time_per_iteration": 2.673292875289917 + }, + { + "auxiliary_loss_clip": 0.01433192, + "auxiliary_loss_mlp": 0.01128138, + "balance_loss_clip": 1.13111067, + "balance_loss_mlp": 1.06819916, + "epoch": 0.00998045994288291, + "flos": 25297164080640.0, + "grad_norm": 1.7359539169577796, + "language_loss": 0.79931343, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.82492673, + "num_input_tokens_seen": 3520480, + "step": 166, + "time_per_iteration": 2.762742757797241 + }, + { + "auxiliary_loss_clip": 0.01438026, + "auxiliary_loss_mlp": 0.01129718, + "balance_loss_clip": 1.13488948, + "balance_loss_mlp": 1.07066131, + "epoch": 0.01004058319555088, + "flos": 32298954220800.0, + "grad_norm": 2.3252666324684585, + "language_loss": 0.92125285, + "learning_rate": 3.2952314912845914e-06, + "loss": 0.94693023, + "num_input_tokens_seen": 3539570, + "step": 167, + "time_per_iteration": 2.970964193344116 + }, + { + "auxiliary_loss_clip": 0.01429698, + "auxiliary_loss_mlp": 0.01133324, + "balance_loss_clip": 1.13294363, + "balance_loss_mlp": 1.07734346, + "epoch": 0.010100706448218848, + "flos": 11319941404800.0, + "grad_norm": 13.512238716069085, + "language_loss": 0.90781063, + "learning_rate": 3.299075396334735e-06, + "loss": 0.93344086, + "num_input_tokens_seen": 3555465, + "step": 168, + "time_per_iteration": 2.8039841651916504 + }, + { + "auxiliary_loss_clip": 0.01424367, + "auxiliary_loss_mlp": 0.01104795, + "balance_loss_clip": 1.12848639, + "balance_loss_mlp": 1.04700291, + "epoch": 0.010160829700886819, + "flos": 29719491765120.0, + "grad_norm": 1.6705351130563955, + "language_loss": 0.87173021, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.89702177, + "num_input_tokens_seen": 3578970, + "step": 169, + "time_per_iteration": 2.8215444087982178 + }, + { + "auxiliary_loss_clip": 0.01425902, + "auxiliary_loss_mlp": 0.01110538, + "balance_loss_clip": 1.13139379, + "balance_loss_mlp": 1.05317438, + "epoch": 0.010220952953554787, + "flos": 20412343854720.0, + "grad_norm": 1.7404257397879006, + "language_loss": 0.84622329, + "learning_rate": 3.306695037731344e-06, + "loss": 0.87158769, + "num_input_tokens_seen": 3597275, + "step": 170, + "time_per_iteration": 2.6759181022644043 + }, + { + "auxiliary_loss_clip": 0.0143612, + "auxiliary_loss_mlp": 0.01137162, + "balance_loss_clip": 1.13149834, + "balance_loss_mlp": 1.07874942, + "epoch": 0.010281076206222756, + "flos": 31285124847360.0, + "grad_norm": 2.174517661608974, + "language_loss": 0.89936447, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.92509729, + "num_input_tokens_seen": 3618905, + "step": 171, + "time_per_iteration": 2.800394058227539 + }, + { + "auxiliary_loss_clip": 0.01430673, + "auxiliary_loss_mlp": 0.01108779, + "balance_loss_clip": 1.1347487, + "balance_loss_mlp": 1.05382347, + "epoch": 0.010341199458890726, + "flos": 21982286568960.0, + "grad_norm": 1.938241860949196, + "language_loss": 0.88895655, + "learning_rate": 3.314225558471224e-06, + "loss": 0.91435111, + "num_input_tokens_seen": 3639610, + "step": 172, + "time_per_iteration": 2.755190849304199 + }, + { + "auxiliary_loss_clip": 0.01418638, + "auxiliary_loss_mlp": 0.01118471, + "balance_loss_clip": 1.12744904, + "balance_loss_mlp": 1.06270456, + "epoch": 0.010401322711558695, + "flos": 30810529422720.0, + "grad_norm": 1.7925778946034159, + "language_loss": 0.80943549, + "learning_rate": 3.317958045350308e-06, + "loss": 0.83480656, + "num_input_tokens_seen": 3664030, + "step": 173, + "time_per_iteration": 2.751945734024048 + }, + { + "auxiliary_loss_clip": 0.01429615, + "auxiliary_loss_mlp": 0.01107965, + "balance_loss_clip": 1.13108575, + "balance_loss_mlp": 1.05534625, + "epoch": 0.010461445964226665, + "flos": 24715124098560.0, + "grad_norm": 2.1644843911099216, + "language_loss": 0.82763064, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.85300648, + "num_input_tokens_seen": 3683615, + "step": 174, + "time_per_iteration": 2.676630735397339 + }, + { + "auxiliary_loss_clip": 0.01423443, + "auxiliary_loss_mlp": 0.01120976, + "balance_loss_clip": 1.12816644, + "balance_loss_mlp": 1.06523335, + "epoch": 0.010521569216894634, + "flos": 27710361457920.0, + "grad_norm": 2.331494685324117, + "language_loss": 0.72837007, + "learning_rate": 3.325358726641591e-06, + "loss": 0.75381434, + "num_input_tokens_seen": 3704540, + "step": 175, + "time_per_iteration": 2.6876866817474365 + }, + { + "auxiliary_loss_clip": 0.01425333, + "auxiliary_loss_mlp": 0.01127215, + "balance_loss_clip": 1.12866652, + "balance_loss_mlp": 1.06980324, + "epoch": 0.010581692469562603, + "flos": 12458346122880.0, + "grad_norm": 4.811985773634618, + "language_loss": 0.97983754, + "learning_rate": 3.329027409977902e-06, + "loss": 1.00536299, + "num_input_tokens_seen": 3721320, + "step": 176, + "time_per_iteration": 2.8159937858581543 + }, + { + "auxiliary_loss_clip": 0.0141033, + "auxiliary_loss_mlp": 0.01130651, + "balance_loss_clip": 1.12546706, + "balance_loss_mlp": 1.07738805, + "epoch": 0.010641815722230573, + "flos": 19427601519360.0, + "grad_norm": 2.8326118759658585, + "language_loss": 0.76926064, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.7946704, + "num_input_tokens_seen": 3739385, + "step": 177, + "time_per_iteration": 5.7707555294036865 + }, + { + "auxiliary_loss_clip": 0.01421858, + "auxiliary_loss_mlp": 0.01104718, + "balance_loss_clip": 1.12455702, + "balance_loss_mlp": 1.05002475, + "epoch": 0.010701938974898541, + "flos": 18332577452160.0, + "grad_norm": 2.6517911185675014, + "language_loss": 0.76942402, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.79468977, + "num_input_tokens_seen": 3756360, + "step": 178, + "time_per_iteration": 4.337082386016846 + }, + { + "auxiliary_loss_clip": 0.01430293, + "auxiliary_loss_mlp": 0.01109414, + "balance_loss_clip": 1.1303575, + "balance_loss_mlp": 1.05252683, + "epoch": 0.010762062227566512, + "flos": 19203985399680.0, + "grad_norm": 2.6843360372821925, + "language_loss": 0.84022826, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.86562538, + "num_input_tokens_seen": 3773930, + "step": 179, + "time_per_iteration": 2.6826629638671875 + }, + { + "auxiliary_loss_clip": 0.01418094, + "auxiliary_loss_mlp": 0.01108667, + "balance_loss_clip": 1.12202275, + "balance_loss_mlp": 1.05158973, + "epoch": 0.01082218548023448, + "flos": 31425427370880.0, + "grad_norm": 2.0256655839140083, + "language_loss": 0.83674574, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.86201334, + "num_input_tokens_seen": 3793630, + "step": 180, + "time_per_iteration": 2.7483785152435303 + }, + { + "auxiliary_loss_clip": 0.01421326, + "auxiliary_loss_mlp": 0.01120347, + "balance_loss_clip": 1.12740374, + "balance_loss_mlp": 1.0646286, + "epoch": 0.01088230873290245, + "flos": 25046436170880.0, + "grad_norm": 3.253139118534122, + "language_loss": 0.77958715, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.80500388, + "num_input_tokens_seen": 3813610, + "step": 181, + "time_per_iteration": 2.698941469192505 + }, + { + "auxiliary_loss_clip": 0.01414948, + "auxiliary_loss_mlp": 0.01130231, + "balance_loss_clip": 1.12188053, + "balance_loss_mlp": 1.07577634, + "epoch": 0.01094243198557042, + "flos": 22893411980160.0, + "grad_norm": 2.56637338396407, + "language_loss": 0.76438594, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.78983772, + "num_input_tokens_seen": 3831390, + "step": 182, + "time_per_iteration": 2.6951375007629395 + }, + { + "auxiliary_loss_clip": 0.01412526, + "auxiliary_loss_mlp": 0.01126665, + "balance_loss_clip": 1.12167537, + "balance_loss_mlp": 1.0702554, + "epoch": 0.011002555238238388, + "flos": 17165049782400.0, + "grad_norm": 2.083158831639218, + "language_loss": 0.87484097, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.90023291, + "num_input_tokens_seen": 3849705, + "step": 183, + "time_per_iteration": 2.733753204345703 + }, + { + "auxiliary_loss_clip": 0.01415922, + "auxiliary_loss_mlp": 0.01110585, + "balance_loss_clip": 1.12529624, + "balance_loss_mlp": 1.05922985, + "epoch": 0.011062678490906358, + "flos": 22310150935680.0, + "grad_norm": 3.105080129831269, + "language_loss": 0.86911464, + "learning_rate": 3.357647774369736e-06, + "loss": 0.89437973, + "num_input_tokens_seen": 3869230, + "step": 184, + "time_per_iteration": 2.6783828735351562 + }, + { + "auxiliary_loss_clip": 0.01410648, + "auxiliary_loss_mlp": 0.01108321, + "balance_loss_clip": 1.12499499, + "balance_loss_mlp": 1.05203021, + "epoch": 0.011122801743574327, + "flos": 24388373053440.0, + "grad_norm": 1.8650514063709744, + "language_loss": 0.83885491, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.86404455, + "num_input_tokens_seen": 3889735, + "step": 185, + "time_per_iteration": 2.6863327026367188 + }, + { + "auxiliary_loss_clip": 0.01419384, + "auxiliary_loss_mlp": 0.01107812, + "balance_loss_clip": 1.12355363, + "balance_loss_mlp": 1.04999495, + "epoch": 0.011182924996242297, + "flos": 18150258994560.0, + "grad_norm": 2.8933407749520743, + "language_loss": 0.71027243, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.73554444, + "num_input_tokens_seen": 3908855, + "step": 186, + "time_per_iteration": 2.819805383682251 + }, + { + "auxiliary_loss_clip": 0.01415699, + "auxiliary_loss_mlp": 0.01108312, + "balance_loss_clip": 1.12262082, + "balance_loss_mlp": 1.05574071, + "epoch": 0.011243048248910266, + "flos": 15486800584320.0, + "grad_norm": 2.4244794785226733, + "language_loss": 1.01999915, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.04523933, + "num_input_tokens_seen": 3923865, + "step": 187, + "time_per_iteration": 2.65875506401062 + }, + { + "auxiliary_loss_clip": 0.01404987, + "auxiliary_loss_mlp": 0.01107995, + "balance_loss_clip": 1.12269068, + "balance_loss_mlp": 1.05253887, + "epoch": 0.011303171501578235, + "flos": 40916868986880.0, + "grad_norm": 2.0089158406542524, + "language_loss": 0.74998611, + "learning_rate": 3.371494591560139e-06, + "loss": 0.77511597, + "num_input_tokens_seen": 3946870, + "step": 188, + "time_per_iteration": 2.8631174564361572 + }, + { + "auxiliary_loss_clip": 0.01298557, + "auxiliary_loss_mlp": 0.01067058, + "balance_loss_clip": 1.14124644, + "balance_loss_mlp": 1.04474187, + "epoch": 0.011363294754246205, + "flos": 66302697790080.0, + "grad_norm": 0.7620731385906954, + "language_loss": 0.56192517, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.5855813, + "num_input_tokens_seen": 4010005, + "step": 189, + "time_per_iteration": 3.2704074382781982 + }, + { + "auxiliary_loss_clip": 0.01402206, + "auxiliary_loss_mlp": 0.011217, + "balance_loss_clip": 1.11730003, + "balance_loss_mlp": 1.06662548, + "epoch": 0.011423418006914174, + "flos": 24900279730560.0, + "grad_norm": 2.640219984380571, + "language_loss": 0.95085573, + "learning_rate": 3.3783079057586833e-06, + "loss": 0.97609472, + "num_input_tokens_seen": 4029035, + "step": 190, + "time_per_iteration": 2.6898255348205566 + }, + { + "auxiliary_loss_clip": 0.01405088, + "auxiliary_loss_mlp": 0.01103893, + "balance_loss_clip": 1.11979234, + "balance_loss_mlp": 1.05167961, + "epoch": 0.011483541259582144, + "flos": 19791879298560.0, + "grad_norm": 4.133813113517846, + "language_loss": 0.8463847, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.8714745, + "num_input_tokens_seen": 4046995, + "step": 191, + "time_per_iteration": 2.71589994430542 + }, + { + "auxiliary_loss_clip": 0.01403196, + "auxiliary_loss_mlp": 0.01118385, + "balance_loss_clip": 1.11570346, + "balance_loss_mlp": 1.06624269, + "epoch": 0.011543664512250112, + "flos": 26176939896960.0, + "grad_norm": 2.0065119945705887, + "language_loss": 0.91894913, + "learning_rate": 3.385049875042367e-06, + "loss": 0.94416493, + "num_input_tokens_seen": 4065865, + "step": 192, + "time_per_iteration": 2.775974988937378 + }, + { + "auxiliary_loss_clip": 0.01398496, + "auxiliary_loss_mlp": 0.01118924, + "balance_loss_clip": 1.11665678, + "balance_loss_mlp": 1.06117916, + "epoch": 0.011603787764918083, + "flos": 23768985905280.0, + "grad_norm": 2.10033302347605, + "language_loss": 0.86923265, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.89440691, + "num_input_tokens_seen": 4085305, + "step": 193, + "time_per_iteration": 2.792947292327881 + }, + { + "auxiliary_loss_clip": 0.01402535, + "auxiliary_loss_mlp": 0.01102276, + "balance_loss_clip": 1.11514282, + "balance_loss_mlp": 1.05061066, + "epoch": 0.011663911017586051, + "flos": 25954688494080.0, + "grad_norm": 2.2253165290939076, + "language_loss": 0.92296255, + "learning_rate": 3.3917219781023906e-06, + "loss": 0.94801068, + "num_input_tokens_seen": 4105185, + "step": 194, + "time_per_iteration": 2.6886558532714844 + }, + { + "auxiliary_loss_clip": 0.01407209, + "auxiliary_loss_mlp": 0.01108641, + "balance_loss_clip": 1.11930478, + "balance_loss_mlp": 1.05630851, + "epoch": 0.01172403427025402, + "flos": 17895149625600.0, + "grad_norm": 2.4241235245311503, + "language_loss": 0.89768875, + "learning_rate": 3.3950322793970014e-06, + "loss": 0.92284721, + "num_input_tokens_seen": 4123160, + "step": 195, + "time_per_iteration": 2.654517889022827 + }, + { + "auxiliary_loss_clip": 0.01400339, + "auxiliary_loss_mlp": 0.01114485, + "balance_loss_clip": 1.11779022, + "balance_loss_mlp": 1.05981565, + "epoch": 0.01178415752292199, + "flos": 17894539094400.0, + "grad_norm": 3.1130999341447385, + "language_loss": 0.86019921, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.88534749, + "num_input_tokens_seen": 4140425, + "step": 196, + "time_per_iteration": 2.643598794937134 + }, + { + "auxiliary_loss_clip": 0.01398067, + "auxiliary_loss_mlp": 0.01107082, + "balance_loss_clip": 1.11464977, + "balance_loss_mlp": 1.05308056, + "epoch": 0.011844280775589959, + "flos": 22893555634560.0, + "grad_norm": 3.666533247373141, + "language_loss": 0.93052697, + "learning_rate": 3.4016022566445335e-06, + "loss": 0.95557845, + "num_input_tokens_seen": 4159555, + "step": 197, + "time_per_iteration": 2.7120354175567627 + }, + { + "auxiliary_loss_clip": 0.01396424, + "auxiliary_loss_mlp": 0.01112388, + "balance_loss_clip": 1.11625624, + "balance_loss_mlp": 1.05943501, + "epoch": 0.01190440402825793, + "flos": 26980333441920.0, + "grad_norm": 1.9614954763997827, + "language_loss": 0.79043806, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.81552619, + "num_input_tokens_seen": 4180480, + "step": 198, + "time_per_iteration": 2.774059772491455 + }, + { + "auxiliary_loss_clip": 0.0139305, + "auxiliary_loss_mlp": 0.01120527, + "balance_loss_clip": 1.11708748, + "balance_loss_mlp": 1.06821764, + "epoch": 0.011964527280925898, + "flos": 20521584092160.0, + "grad_norm": 1.8823459083646328, + "language_loss": 0.88239717, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.90753293, + "num_input_tokens_seen": 4198835, + "step": 199, + "time_per_iteration": 2.6808881759643555 + }, + { + "auxiliary_loss_clip": 0.01403709, + "auxiliary_loss_mlp": 0.0112899, + "balance_loss_clip": 1.11951399, + "balance_loss_mlp": 1.07200766, + "epoch": 0.012024650533593867, + "flos": 27745984771200.0, + "grad_norm": 2.0663906916258497, + "language_loss": 0.81151628, + "learning_rate": 3.411333205349222e-06, + "loss": 0.83684325, + "num_input_tokens_seen": 4219335, + "step": 200, + "time_per_iteration": 2.625380516052246 + }, + { + "auxiliary_loss_clip": 0.0140201, + "auxiliary_loss_mlp": 0.01104413, + "balance_loss_clip": 1.11633158, + "balance_loss_mlp": 1.05048287, + "epoch": 0.012084773786261837, + "flos": 10452017076480.0, + "grad_norm": 2.253120238884594, + "language_loss": 0.87696433, + "learning_rate": 3.4145444448414217e-06, + "loss": 0.90202856, + "num_input_tokens_seen": 4236940, + "step": 201, + "time_per_iteration": 2.6062326431274414 + }, + { + "auxiliary_loss_clip": 0.01399494, + "auxiliary_loss_mlp": 0.01115643, + "balance_loss_clip": 1.11764228, + "balance_loss_mlp": 1.0614028, + "epoch": 0.012144897038929806, + "flos": 23105751229440.0, + "grad_norm": 2.088192664231089, + "language_loss": 0.84052485, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.86567622, + "num_input_tokens_seen": 4256755, + "step": 202, + "time_per_iteration": 2.6981592178344727 + }, + { + "auxiliary_loss_clip": 0.01388741, + "auxiliary_loss_mlp": 0.0111019, + "balance_loss_clip": 1.11006808, + "balance_loss_mlp": 1.05771446, + "epoch": 0.012205020291597776, + "flos": 21033203460480.0, + "grad_norm": 1.7861279575653157, + "language_loss": 0.89964712, + "learning_rate": 3.4209192710126685e-06, + "loss": 0.92463642, + "num_input_tokens_seen": 4276505, + "step": 203, + "time_per_iteration": 2.668757438659668 + }, + { + "auxiliary_loss_clip": 0.01276289, + "auxiliary_loss_mlp": 0.01095021, + "balance_loss_clip": 1.12578154, + "balance_loss_mlp": 1.07470798, + "epoch": 0.012265143544265745, + "flos": 68447785075200.0, + "grad_norm": 1.0265297625980543, + "language_loss": 0.61255801, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.63627112, + "num_input_tokens_seen": 4330965, + "step": 204, + "time_per_iteration": 3.161599636077881 + }, + { + "auxiliary_loss_clip": 0.01396271, + "auxiliary_loss_mlp": 0.01111806, + "balance_loss_clip": 1.11291122, + "balance_loss_mlp": 1.05930579, + "epoch": 0.012325266796933715, + "flos": 17019252478080.0, + "grad_norm": 2.3248674300118184, + "language_loss": 0.91324663, + "learning_rate": 3.4272315978819516e-06, + "loss": 0.93832743, + "num_input_tokens_seen": 4348200, + "step": 205, + "time_per_iteration": 2.6764047145843506 + }, + { + "auxiliary_loss_clip": 0.01404558, + "auxiliary_loss_mlp": 0.0112167, + "balance_loss_clip": 1.11773109, + "balance_loss_mlp": 1.06773925, + "epoch": 0.012385390049601683, + "flos": 20190056538240.0, + "grad_norm": 2.1088315130515207, + "language_loss": 0.89305568, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.91831797, + "num_input_tokens_seen": 4365460, + "step": 206, + "time_per_iteration": 2.7157227993011475 + }, + { + "auxiliary_loss_clip": 0.0139534, + "auxiliary_loss_mlp": 0.01100957, + "balance_loss_clip": 1.11176991, + "balance_loss_mlp": 1.04888678, + "epoch": 0.012445513302269652, + "flos": 16253134272000.0, + "grad_norm": 2.399816031687551, + "language_loss": 0.95542914, + "learning_rate": 3.43348263905683e-06, + "loss": 0.9803921, + "num_input_tokens_seen": 4383650, + "step": 207, + "time_per_iteration": 2.611348867416382 + }, + { + "auxiliary_loss_clip": 0.01393005, + "auxiliary_loss_mlp": 0.01117764, + "balance_loss_clip": 1.11658561, + "balance_loss_mlp": 1.06497812, + "epoch": 0.012505636554937622, + "flos": 23769380954880.0, + "grad_norm": 1.8144323603981871, + "language_loss": 0.75985783, + "learning_rate": 3.436585547151547e-06, + "loss": 0.78496552, + "num_input_tokens_seen": 4403765, + "step": 208, + "time_per_iteration": 2.7184154987335205 + }, + { + "auxiliary_loss_clip": 0.0138146, + "auxiliary_loss_mlp": 0.01108623, + "balance_loss_clip": 1.11071992, + "balance_loss_mlp": 1.05576587, + "epoch": 0.012565759807605591, + "flos": 30591546157440.0, + "grad_norm": 2.2326965650696855, + "language_loss": 0.98386943, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.00877023, + "num_input_tokens_seen": 4421935, + "step": 209, + "time_per_iteration": 2.7354249954223633 + }, + { + "auxiliary_loss_clip": 0.01387012, + "auxiliary_loss_mlp": 0.0111836, + "balance_loss_clip": 1.11136842, + "balance_loss_mlp": 1.06490695, + "epoch": 0.012625883060273561, + "flos": 40113511355520.0, + "grad_norm": 9.084733304650118, + "language_loss": 0.85514843, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.88020217, + "num_input_tokens_seen": 4441470, + "step": 210, + "time_per_iteration": 2.888749122619629 + }, + { + "auxiliary_loss_clip": 0.01384384, + "auxiliary_loss_mlp": 0.01121559, + "balance_loss_clip": 1.11018038, + "balance_loss_mlp": 1.07115781, + "epoch": 0.01268600631294153, + "flos": 27089178629760.0, + "grad_norm": 3.431917100192063, + "language_loss": 0.97194636, + "learning_rate": 3.445805545042314e-06, + "loss": 0.99700582, + "num_input_tokens_seen": 4459950, + "step": 211, + "time_per_iteration": 2.7465193271636963 + }, + { + "auxiliary_loss_clip": 0.01393556, + "auxiliary_loss_mlp": 0.01123542, + "balance_loss_clip": 1.11511767, + "balance_loss_mlp": 1.06999326, + "epoch": 0.012746129565609499, + "flos": 16982767238400.0, + "grad_norm": 2.3992368053115163, + "language_loss": 0.9508543, + "learning_rate": 3.448849769075239e-06, + "loss": 0.97602528, + "num_input_tokens_seen": 4478390, + "step": 212, + "time_per_iteration": 2.6340651512145996 + }, + { + "auxiliary_loss_clip": 0.01381697, + "auxiliary_loss_mlp": 0.01116386, + "balance_loss_clip": 1.112149, + "balance_loss_mlp": 1.06381512, + "epoch": 0.012806252818277469, + "flos": 46533476995200.0, + "grad_norm": 1.701444843398511, + "language_loss": 0.76078421, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.78576505, + "num_input_tokens_seen": 4501665, + "step": 213, + "time_per_iteration": 2.9250640869140625 + }, + { + "auxiliary_loss_clip": 0.01385821, + "auxiliary_loss_mlp": 0.01111776, + "balance_loss_clip": 1.11002433, + "balance_loss_mlp": 1.06056333, + "epoch": 0.012866376070945438, + "flos": 14388616120320.0, + "grad_norm": 3.5300370267625922, + "language_loss": 0.86698866, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.89196461, + "num_input_tokens_seen": 4519055, + "step": 214, + "time_per_iteration": 2.645289659500122 + }, + { + "auxiliary_loss_clip": 0.01383455, + "auxiliary_loss_mlp": 0.01128262, + "balance_loss_clip": 1.1159339, + "balance_loss_mlp": 1.07359219, + "epoch": 0.012926499323613408, + "flos": 26140813793280.0, + "grad_norm": 2.14433888305053, + "language_loss": 0.77582061, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.80093777, + "num_input_tokens_seen": 4540870, + "step": 215, + "time_per_iteration": 2.7315175533294678 + }, + { + "auxiliary_loss_clip": 0.01391951, + "auxiliary_loss_mlp": 0.01115104, + "balance_loss_clip": 1.11440635, + "balance_loss_mlp": 1.0638206, + "epoch": 0.012986622576281377, + "flos": 30117202128000.0, + "grad_norm": 2.2964706747038233, + "language_loss": 0.90423942, + "learning_rate": 3.460884739729461e-06, + "loss": 0.92930996, + "num_input_tokens_seen": 4560395, + "step": 216, + "time_per_iteration": 2.724698781967163 + }, + { + "auxiliary_loss_clip": 0.01384729, + "auxiliary_loss_mlp": 0.01113374, + "balance_loss_clip": 1.10847259, + "balance_loss_mlp": 1.06096959, + "epoch": 0.013046745828949347, + "flos": 13954025468160.0, + "grad_norm": 3.60062834696173, + "language_loss": 0.93473232, + "learning_rate": 3.463858658104523e-06, + "loss": 0.95971346, + "num_input_tokens_seen": 4575785, + "step": 217, + "time_per_iteration": 5.762276649475098 + }, + { + "auxiliary_loss_clip": 0.01377712, + "auxiliary_loss_mlp": 0.0110874, + "balance_loss_clip": 1.10726643, + "balance_loss_mlp": 1.05433273, + "epoch": 0.013106869081617315, + "flos": 17347835116800.0, + "grad_norm": 1.943339896357513, + "language_loss": 0.93811166, + "learning_rate": 3.4668189032433696e-06, + "loss": 0.96297616, + "num_input_tokens_seen": 4594985, + "step": 218, + "time_per_iteration": 5.832701206207275 + }, + { + "auxiliary_loss_clip": 0.01372884, + "auxiliary_loss_mlp": 0.01106717, + "balance_loss_clip": 1.10647273, + "balance_loss_mlp": 1.05552888, + "epoch": 0.013166992334285284, + "flos": 25884914325120.0, + "grad_norm": 2.252873600345955, + "language_loss": 0.86196327, + "learning_rate": 3.46976560030214e-06, + "loss": 0.88675928, + "num_input_tokens_seen": 4616125, + "step": 219, + "time_per_iteration": 2.794581651687622 + }, + { + "auxiliary_loss_clip": 0.0137885, + "auxiliary_loss_mlp": 0.01102953, + "balance_loss_clip": 1.10957599, + "balance_loss_mlp": 1.05188394, + "epoch": 0.013227115586953254, + "flos": 31175956437120.0, + "grad_norm": 1.897987121161891, + "language_loss": 0.8748548, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.89967287, + "num_input_tokens_seen": 4637795, + "step": 220, + "time_per_iteration": 2.799927234649658 + }, + { + "auxiliary_loss_clip": 0.01370688, + "auxiliary_loss_mlp": 0.01115596, + "balance_loss_clip": 1.10440111, + "balance_loss_mlp": 1.0679127, + "epoch": 0.013287238839621223, + "flos": 20409470766720.0, + "grad_norm": 3.2557072980071795, + "language_loss": 0.86437249, + "learning_rate": 3.475618842282164e-06, + "loss": 0.88923532, + "num_input_tokens_seen": 4656835, + "step": 221, + "time_per_iteration": 2.7040672302246094 + }, + { + "auxiliary_loss_clip": 0.01376134, + "auxiliary_loss_mlp": 0.01116397, + "balance_loss_clip": 1.10384834, + "balance_loss_mlp": 1.0637064, + "epoch": 0.013347362092289193, + "flos": 14137134024960.0, + "grad_norm": 2.585706849100757, + "language_loss": 0.92369294, + "learning_rate": 3.4785256290862486e-06, + "loss": 0.94861829, + "num_input_tokens_seen": 4673015, + "step": 222, + "time_per_iteration": 2.6648194789886475 + }, + { + "auxiliary_loss_clip": 0.01373283, + "auxiliary_loss_mlp": 0.01106423, + "balance_loss_clip": 1.10636806, + "balance_loss_mlp": 1.05156267, + "epoch": 0.013407485344957162, + "flos": 21797705554560.0, + "grad_norm": 7.739608779999776, + "language_loss": 0.95708215, + "learning_rate": 3.481419351635897e-06, + "loss": 0.98187923, + "num_input_tokens_seen": 4692355, + "step": 223, + "time_per_iteration": 2.7261807918548584 + }, + { + "auxiliary_loss_clip": 0.01374555, + "auxiliary_loss_mlp": 0.0110963, + "balance_loss_clip": 1.10768425, + "balance_loss_mlp": 1.05870414, + "epoch": 0.013467608597625132, + "flos": 18621622195200.0, + "grad_norm": 2.673591615227502, + "language_loss": 0.88031876, + "learning_rate": 3.484300126837776e-06, + "loss": 0.90516055, + "num_input_tokens_seen": 4710080, + "step": 224, + "time_per_iteration": 2.601686477661133 + }, + { + "auxiliary_loss_clip": 0.01374533, + "auxiliary_loss_mlp": 0.01103, + "balance_loss_clip": 1.10679817, + "balance_loss_mlp": 1.04804444, + "epoch": 0.013527731850293101, + "flos": 18552314903040.0, + "grad_norm": 3.0722216996453535, + "language_loss": 0.89625597, + "learning_rate": 3.487168070036317e-06, + "loss": 0.9210313, + "num_input_tokens_seen": 4728980, + "step": 225, + "time_per_iteration": 2.6677513122558594 + }, + { + "auxiliary_loss_clip": 0.01369955, + "auxiliary_loss_mlp": 0.0112021, + "balance_loss_clip": 1.10561275, + "balance_loss_mlp": 1.06675696, + "epoch": 0.01358785510296107, + "flos": 19165381257600.0, + "grad_norm": 1.9576206039109396, + "language_loss": 0.98980033, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.01470196, + "num_input_tokens_seen": 4747020, + "step": 226, + "time_per_iteration": 2.8320930004119873 + }, + { + "auxiliary_loss_clip": 0.01375268, + "auxiliary_loss_mlp": 0.01110039, + "balance_loss_clip": 1.10837173, + "balance_loss_mlp": 1.05572701, + "epoch": 0.01364797835562904, + "flos": 23329941966720.0, + "grad_norm": 2.3303410550109245, + "language_loss": 0.90965348, + "learning_rate": 3.4928659141555727e-06, + "loss": 0.93450654, + "num_input_tokens_seen": 4765000, + "step": 227, + "time_per_iteration": 2.648606061935425 + }, + { + "auxiliary_loss_clip": 0.01255161, + "auxiliary_loss_mlp": 0.01079249, + "balance_loss_clip": 1.11229861, + "balance_loss_mlp": 1.06017554, + "epoch": 0.013708101608297009, + "flos": 70993746097920.0, + "grad_norm": 0.9472069433514878, + "language_loss": 0.57650995, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.59985405, + "num_input_tokens_seen": 4833210, + "step": 228, + "time_per_iteration": 3.246328592300415 + }, + { + "auxiliary_loss_clip": 0.01366835, + "auxiliary_loss_mlp": 0.01117377, + "balance_loss_clip": 1.10507822, + "balance_loss_mlp": 1.06711841, + "epoch": 0.013768224860964979, + "flos": 16325170997760.0, + "grad_norm": 2.957038430634678, + "language_loss": 0.87773621, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.90257835, + "num_input_tokens_seen": 4850120, + "step": 229, + "time_per_iteration": 2.6319024562835693 + }, + { + "auxiliary_loss_clip": 0.01375278, + "auxiliary_loss_mlp": 0.01098609, + "balance_loss_clip": 1.10567176, + "balance_loss_mlp": 1.04873204, + "epoch": 0.013828348113632948, + "flos": 20193037367040.0, + "grad_norm": 4.72663824849547, + "language_loss": 0.83937395, + "learning_rate": 3.501319237118231e-06, + "loss": 0.86411285, + "num_input_tokens_seen": 4866215, + "step": 230, + "time_per_iteration": 2.7026398181915283 + }, + { + "auxiliary_loss_clip": 0.01373544, + "auxiliary_loss_mlp": 0.01113683, + "balance_loss_clip": 1.10701275, + "balance_loss_mlp": 1.06361556, + "epoch": 0.013888471366300916, + "flos": 20741070147840.0, + "grad_norm": 2.2562202151287867, + "language_loss": 0.904212, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.9290843, + "num_input_tokens_seen": 4885630, + "step": 231, + "time_per_iteration": 2.6424474716186523 + }, + { + "auxiliary_loss_clip": 0.01377759, + "auxiliary_loss_mlp": 0.01110232, + "balance_loss_clip": 1.11118639, + "balance_loss_mlp": 1.06030726, + "epoch": 0.013948594618968886, + "flos": 22090628966400.0, + "grad_norm": 2.0229562700819215, + "language_loss": 0.83624899, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.86112887, + "num_input_tokens_seen": 4905570, + "step": 232, + "time_per_iteration": 2.621704339981079 + }, + { + "auxiliary_loss_clip": 0.01377798, + "auxiliary_loss_mlp": 0.01094369, + "balance_loss_clip": 1.10229027, + "balance_loss_mlp": 1.04253721, + "epoch": 0.014008717871636855, + "flos": 19063108258560.0, + "grad_norm": 5.516695444379509, + "language_loss": 0.74727643, + "learning_rate": 3.509663010692652e-06, + "loss": 0.77199805, + "num_input_tokens_seen": 4923535, + "step": 233, + "time_per_iteration": 2.659188747406006 + }, + { + "auxiliary_loss_clip": 0.01382744, + "auxiliary_loss_mlp": 0.01125121, + "balance_loss_clip": 1.1099937, + "balance_loss_mlp": 1.0723356, + "epoch": 0.014068841124304825, + "flos": 14530822064640.0, + "grad_norm": 2.5763093382937483, + "language_loss": 0.85633421, + "learning_rate": 3.512420411838642e-06, + "loss": 0.88141286, + "num_input_tokens_seen": 4939200, + "step": 234, + "time_per_iteration": 2.610635757446289 + }, + { + "auxiliary_loss_clip": 0.01374562, + "auxiliary_loss_mlp": 0.01114672, + "balance_loss_clip": 1.10890436, + "balance_loss_mlp": 1.06467605, + "epoch": 0.014128964376972794, + "flos": 18077396256000.0, + "grad_norm": 2.467487286445388, + "language_loss": 0.89192498, + "learning_rate": 3.515166054308634e-06, + "loss": 0.91681731, + "num_input_tokens_seen": 4956620, + "step": 235, + "time_per_iteration": 2.668769359588623 + }, + { + "auxiliary_loss_clip": 0.01373018, + "auxiliary_loss_mlp": 0.01131641, + "balance_loss_clip": 1.11011076, + "balance_loss_mlp": 1.08073914, + "epoch": 0.014189087629640764, + "flos": 25334331678720.0, + "grad_norm": 2.143165146200321, + "language_loss": 0.85535377, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.88040036, + "num_input_tokens_seen": 4975650, + "step": 236, + "time_per_iteration": 2.7570323944091797 + }, + { + "auxiliary_loss_clip": 0.01369632, + "auxiliary_loss_mlp": 0.01100269, + "balance_loss_clip": 1.10296702, + "balance_loss_mlp": 1.04905629, + "epoch": 0.014249210882308733, + "flos": 36139744713600.0, + "grad_norm": 2.1351980688483136, + "language_loss": 0.82550979, + "learning_rate": 3.520622461401154e-06, + "loss": 0.85020876, + "num_input_tokens_seen": 4997415, + "step": 237, + "time_per_iteration": 2.811617374420166 + }, + { + "auxiliary_loss_clip": 0.01369728, + "auxiliary_loss_mlp": 0.01124352, + "balance_loss_clip": 1.10659075, + "balance_loss_mlp": 1.07085085, + "epoch": 0.014309334134976702, + "flos": 12932977461120.0, + "grad_norm": 2.0241581748099313, + "language_loss": 0.77096599, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.79590684, + "num_input_tokens_seen": 5013905, + "step": 238, + "time_per_iteration": 2.8044662475585938 + }, + { + "auxiliary_loss_clip": 0.01367496, + "auxiliary_loss_mlp": 0.01111406, + "balance_loss_clip": 1.10897434, + "balance_loss_mlp": 1.06343579, + "epoch": 0.014369457387644672, + "flos": 20777519473920.0, + "grad_norm": 1.8300428555870456, + "language_loss": 0.8707583, + "learning_rate": 3.526033015791284e-06, + "loss": 0.89554727, + "num_input_tokens_seen": 5033645, + "step": 239, + "time_per_iteration": 2.681452751159668 + }, + { + "auxiliary_loss_clip": 0.01353036, + "auxiliary_loss_mlp": 0.01103184, + "balance_loss_clip": 1.10036874, + "balance_loss_mlp": 1.05516672, + "epoch": 0.01442958064031264, + "flos": 25848536826240.0, + "grad_norm": 2.109315431148974, + "language_loss": 0.93055749, + "learning_rate": 3.528721337790862e-06, + "loss": 0.95511973, + "num_input_tokens_seen": 5052875, + "step": 240, + "time_per_iteration": 2.679826021194458 + }, + { + "auxiliary_loss_clip": 0.01360794, + "auxiliary_loss_mlp": 0.01103084, + "balance_loss_clip": 1.10475957, + "balance_loss_mlp": 1.05611515, + "epoch": 0.014489703892980611, + "flos": 28219718269440.0, + "grad_norm": 3.7136133710916575, + "language_loss": 0.8482846, + "learning_rate": 3.531398481704111e-06, + "loss": 0.87292337, + "num_input_tokens_seen": 5075005, + "step": 241, + "time_per_iteration": 2.679126262664795 + }, + { + "auxiliary_loss_clip": 0.01359518, + "auxiliary_loss_mlp": 0.01119602, + "balance_loss_clip": 1.11010456, + "balance_loss_mlp": 1.06931913, + "epoch": 0.01454982714564858, + "flos": 22490925108480.0, + "grad_norm": 1.8502491938168453, + "language_loss": 0.88590866, + "learning_rate": 3.534064540103573e-06, + "loss": 0.9106999, + "num_input_tokens_seen": 5091875, + "step": 242, + "time_per_iteration": 2.7366583347320557 + }, + { + "auxiliary_loss_clip": 0.01359534, + "auxiliary_loss_mlp": 0.01104713, + "balance_loss_clip": 1.10356677, + "balance_loss_mlp": 1.05342889, + "epoch": 0.014609950398316548, + "flos": 21653201139840.0, + "grad_norm": 2.261458758817042, + "language_loss": 0.86688942, + "learning_rate": 3.536719604416555e-06, + "loss": 0.89153194, + "num_input_tokens_seen": 5111290, + "step": 243, + "time_per_iteration": 2.764378070831299 + }, + { + "auxiliary_loss_clip": 0.01364897, + "auxiliary_loss_mlp": 0.01106776, + "balance_loss_clip": 1.10636568, + "balance_loss_mlp": 1.05656552, + "epoch": 0.014670073650984519, + "flos": 21869993675520.0, + "grad_norm": 1.6964959858678799, + "language_loss": 0.84256208, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.86727887, + "num_input_tokens_seen": 5132265, + "step": 244, + "time_per_iteration": 2.630441188812256 + }, + { + "auxiliary_loss_clip": 0.01372266, + "auxiliary_loss_mlp": 0.01115072, + "balance_loss_clip": 1.10771632, + "balance_loss_mlp": 1.06328762, + "epoch": 0.014730196903652487, + "flos": 23183713699200.0, + "grad_norm": 8.49550264430495, + "language_loss": 0.78613877, + "learning_rate": 3.54199711087864e-06, + "loss": 0.81101215, + "num_input_tokens_seen": 5148575, + "step": 245, + "time_per_iteration": 2.6991443634033203 + }, + { + "auxiliary_loss_clip": 0.01371598, + "auxiliary_loss_mlp": 0.0110404, + "balance_loss_clip": 1.10405719, + "balance_loss_mlp": 1.05008554, + "epoch": 0.014790320156320457, + "flos": 23222605150080.0, + "grad_norm": 2.2582939339926305, + "language_loss": 0.84165329, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.86640966, + "num_input_tokens_seen": 5170415, + "step": 246, + "time_per_iteration": 2.726743221282959 + }, + { + "auxiliary_loss_clip": 0.01365538, + "auxiliary_loss_mlp": 0.01101456, + "balance_loss_clip": 1.10242295, + "balance_loss_mlp": 1.05062532, + "epoch": 0.014850443408988426, + "flos": 15815490963840.0, + "grad_norm": 1.9870849133800452, + "language_loss": 0.89958012, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.92425001, + "num_input_tokens_seen": 5188565, + "step": 247, + "time_per_iteration": 2.5998406410217285 + }, + { + "auxiliary_loss_clip": 0.01364581, + "auxiliary_loss_mlp": 0.01098108, + "balance_loss_clip": 1.09896278, + "balance_loss_mlp": 1.0489223, + "epoch": 0.014910566661656396, + "flos": 22781657790720.0, + "grad_norm": 2.0527635487774343, + "language_loss": 0.783005, + "learning_rate": 3.549833136812155e-06, + "loss": 0.80763197, + "num_input_tokens_seen": 5207810, + "step": 248, + "time_per_iteration": 2.689784049987793 + }, + { + "auxiliary_loss_clip": 0.01365896, + "auxiliary_loss_mlp": 0.01110511, + "balance_loss_clip": 1.10732806, + "balance_loss_mlp": 1.06044269, + "epoch": 0.014970689914324365, + "flos": 26865023806080.0, + "grad_norm": 1.9405946352322343, + "language_loss": 0.83855766, + "learning_rate": 3.552424094769381e-06, + "loss": 0.86332172, + "num_input_tokens_seen": 5226210, + "step": 249, + "time_per_iteration": 2.8210339546203613 + }, + { + "auxiliary_loss_clip": 0.01358179, + "auxiliary_loss_mlp": 0.01106801, + "balance_loss_clip": 1.10089588, + "balance_loss_mlp": 1.05802023, + "epoch": 0.015030813166992334, + "flos": 13985662371840.0, + "grad_norm": 2.0689026358419786, + "language_loss": 0.93631709, + "learning_rate": 3.5550046680977174e-06, + "loss": 0.96096689, + "num_input_tokens_seen": 5241660, + "step": 250, + "time_per_iteration": 2.7074570655822754 + }, + { + "auxiliary_loss_clip": 0.01368183, + "auxiliary_loss_mlp": 0.01115393, + "balance_loss_clip": 1.1065619, + "balance_loss_mlp": 1.06415713, + "epoch": 0.015090936419660304, + "flos": 24717817618560.0, + "grad_norm": 2.6509740932573127, + "language_loss": 0.9678722, + "learning_rate": 3.5575749397087034e-06, + "loss": 0.99270797, + "num_input_tokens_seen": 5261090, + "step": 251, + "time_per_iteration": 2.6740176677703857 + }, + { + "auxiliary_loss_clip": 0.01361249, + "auxiliary_loss_mlp": 0.01108489, + "balance_loss_clip": 1.10063529, + "balance_loss_mlp": 1.0597558, + "epoch": 0.015151059672328273, + "flos": 25738793798400.0, + "grad_norm": 1.996044018630987, + "language_loss": 0.84516245, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.86985981, + "num_input_tokens_seen": 5279175, + "step": 252, + "time_per_iteration": 2.7198123931884766 + }, + { + "auxiliary_loss_clip": 0.01356789, + "auxiliary_loss_mlp": 0.0111346, + "balance_loss_clip": 1.1023767, + "balance_loss_mlp": 1.06346345, + "epoch": 0.015211182924996243, + "flos": 21871214737920.0, + "grad_norm": 2.3132428526475275, + "language_loss": 0.98516917, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.0098716, + "num_input_tokens_seen": 5296975, + "step": 253, + "time_per_iteration": 2.6751561164855957 + }, + { + "auxiliary_loss_clip": 0.01244193, + "auxiliary_loss_mlp": 0.01100072, + "balance_loss_clip": 1.1058414, + "balance_loss_mlp": 1.08338308, + "epoch": 0.015271306177664212, + "flos": 66895080888960.0, + "grad_norm": 0.8719135194962525, + "language_loss": 0.55628473, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.57972741, + "num_input_tokens_seen": 5358375, + "step": 254, + "time_per_iteration": 3.2305996417999268 + }, + { + "auxiliary_loss_clip": 0.0136146, + "auxiliary_loss_mlp": 0.01119692, + "balance_loss_clip": 1.0985806, + "balance_loss_mlp": 1.06952846, + "epoch": 0.01533142943033218, + "flos": 26834069260800.0, + "grad_norm": 2.113472843461701, + "language_loss": 0.90234184, + "learning_rate": 3.567754632921479e-06, + "loss": 0.92715329, + "num_input_tokens_seen": 5377255, + "step": 255, + "time_per_iteration": 2.7138473987579346 + }, + { + "auxiliary_loss_clip": 0.01357311, + "auxiliary_loss_mlp": 0.01137867, + "balance_loss_clip": 1.1001389, + "balance_loss_mlp": 1.08803785, + "epoch": 0.01539155268300015, + "flos": 20813753318400.0, + "grad_norm": 2.320838285045027, + "language_loss": 0.85392761, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.87887937, + "num_input_tokens_seen": 5395320, + "step": 256, + "time_per_iteration": 2.7135775089263916 + }, + { + "auxiliary_loss_clip": 0.01363873, + "auxiliary_loss_mlp": 0.0112257, + "balance_loss_clip": 1.10053098, + "balance_loss_mlp": 1.07281172, + "epoch": 0.01545167593566812, + "flos": 15961862885760.0, + "grad_norm": 4.480294478847577, + "language_loss": 0.71472508, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.73958945, + "num_input_tokens_seen": 5411970, + "step": 257, + "time_per_iteration": 6.340675592422485 + }, + { + "auxiliary_loss_clip": 0.01355912, + "auxiliary_loss_mlp": 0.01112611, + "balance_loss_clip": 1.10014856, + "balance_loss_mlp": 1.06280565, + "epoch": 0.01551179918833609, + "flos": 22601745544320.0, + "grad_norm": 2.0292888191897673, + "language_loss": 0.94713151, + "learning_rate": 3.5752851536613596e-06, + "loss": 0.97181678, + "num_input_tokens_seen": 5430245, + "step": 258, + "time_per_iteration": 5.674164772033691 + }, + { + "auxiliary_loss_clip": 0.01356656, + "auxiliary_loss_mlp": 0.01113313, + "balance_loss_clip": 1.09867072, + "balance_loss_mlp": 1.0645566, + "epoch": 0.015571922441004058, + "flos": 22816706486400.0, + "grad_norm": 2.3215886633849236, + "language_loss": 0.93037683, + "learning_rate": 3.577775880881658e-06, + "loss": 0.95507646, + "num_input_tokens_seen": 5448905, + "step": 259, + "time_per_iteration": 2.6286497116088867 + }, + { + "auxiliary_loss_clip": 0.01348977, + "auxiliary_loss_mlp": 0.01102171, + "balance_loss_clip": 1.10076857, + "balance_loss_mlp": 1.05625176, + "epoch": 0.015632045693672027, + "flos": 18947439486720.0, + "grad_norm": 1.9575053933526474, + "language_loss": 0.97368109, + "learning_rate": 3.5802570099000424e-06, + "loss": 0.99819261, + "num_input_tokens_seen": 5466405, + "step": 260, + "time_per_iteration": 2.625072717666626 + }, + { + "auxiliary_loss_clip": 0.01362999, + "auxiliary_loss_mlp": 0.01127943, + "balance_loss_clip": 1.1010474, + "balance_loss_mlp": 1.07940137, + "epoch": 0.015692168946339995, + "flos": 29971728046080.0, + "grad_norm": 2.2828802632863305, + "language_loss": 0.87807435, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.90298378, + "num_input_tokens_seen": 5487055, + "step": 261, + "time_per_iteration": 2.6737279891967773 + }, + { + "auxiliary_loss_clip": 0.01357008, + "auxiliary_loss_mlp": 0.01125312, + "balance_loss_clip": 1.09822345, + "balance_loss_mlp": 1.07665133, + "epoch": 0.015752292199007967, + "flos": 19392085946880.0, + "grad_norm": 5.057676675675106, + "language_loss": 0.67100549, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.69582868, + "num_input_tokens_seen": 5506600, + "step": 262, + "time_per_iteration": 2.651690721511841 + }, + { + "auxiliary_loss_clip": 0.01353953, + "auxiliary_loss_mlp": 0.01135541, + "balance_loss_clip": 1.09924924, + "balance_loss_mlp": 1.08499634, + "epoch": 0.015812415451675936, + "flos": 20339804338560.0, + "grad_norm": 3.0820356667611337, + "language_loss": 0.68077701, + "learning_rate": 3.587643540438383e-06, + "loss": 0.70567191, + "num_input_tokens_seen": 5524350, + "step": 263, + "time_per_iteration": 2.6885130405426025 + }, + { + "auxiliary_loss_clip": 0.01355592, + "auxiliary_loss_mlp": 0.01116799, + "balance_loss_clip": 1.09620881, + "balance_loss_mlp": 1.06766081, + "epoch": 0.015872538704343905, + "flos": 17525412979200.0, + "grad_norm": 3.9089218881424674, + "language_loss": 0.85002583, + "learning_rate": 3.590087005168037e-06, + "loss": 0.87474978, + "num_input_tokens_seen": 5542145, + "step": 264, + "time_per_iteration": 2.6557912826538086 + }, + { + "auxiliary_loss_clip": 0.01360388, + "auxiliary_loss_mlp": 0.01102763, + "balance_loss_clip": 1.10088885, + "balance_loss_mlp": 1.056319, + "epoch": 0.015932661957011873, + "flos": 15260490944640.0, + "grad_norm": 2.7020928553211476, + "language_loss": 1.04234743, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.06697881, + "num_input_tokens_seen": 5557920, + "step": 265, + "time_per_iteration": 2.6262216567993164 + }, + { + "auxiliary_loss_clip": 0.01364512, + "auxiliary_loss_mlp": 0.01120309, + "balance_loss_clip": 1.1033864, + "balance_loss_mlp": 1.06835794, + "epoch": 0.015992785209679845, + "flos": 20302528999680.0, + "grad_norm": 3.1220748516520134, + "language_loss": 0.74914098, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.7739892, + "num_input_tokens_seen": 5576290, + "step": 266, + "time_per_iteration": 2.6244583129882812 + }, + { + "auxiliary_loss_clip": 0.01349738, + "auxiliary_loss_mlp": 0.0111189, + "balance_loss_clip": 1.1000762, + "balance_loss_mlp": 1.06206095, + "epoch": 0.016052908462347814, + "flos": 23362368969600.0, + "grad_norm": 1.8166776194063956, + "language_loss": 0.90909529, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.93371153, + "num_input_tokens_seen": 5595205, + "step": 267, + "time_per_iteration": 2.6753580570220947 + }, + { + "auxiliary_loss_clip": 0.01359091, + "auxiliary_loss_mlp": 0.01115968, + "balance_loss_clip": 1.10122573, + "balance_loss_mlp": 1.06797481, + "epoch": 0.016113031715015783, + "flos": 21286588976640.0, + "grad_norm": 2.450608875877181, + "language_loss": 0.85636413, + "learning_rate": 3.599769175344462e-06, + "loss": 0.88111478, + "num_input_tokens_seen": 5612645, + "step": 268, + "time_per_iteration": 2.7161567211151123 + }, + { + "auxiliary_loss_clip": 0.01351132, + "auxiliary_loss_mlp": 0.01102276, + "balance_loss_clip": 1.10226274, + "balance_loss_mlp": 1.05475891, + "epoch": 0.01617315496768375, + "flos": 18914689261440.0, + "grad_norm": 2.1714201716772457, + "language_loss": 0.88080788, + "learning_rate": 3.602167137831432e-06, + "loss": 0.90534198, + "num_input_tokens_seen": 5628345, + "step": 269, + "time_per_iteration": 2.6403756141662598 + }, + { + "auxiliary_loss_clip": 0.01357907, + "auxiliary_loss_mlp": 0.01111574, + "balance_loss_clip": 1.10001528, + "balance_loss_mlp": 1.06021833, + "epoch": 0.01623327822035172, + "flos": 16546488647040.0, + "grad_norm": 2.5848702107942803, + "language_loss": 0.97077739, + "learning_rate": 3.6045562024779565e-06, + "loss": 0.99547219, + "num_input_tokens_seen": 5645940, + "step": 270, + "time_per_iteration": 2.635546922683716 + }, + { + "auxiliary_loss_clip": 0.01356007, + "auxiliary_loss_mlp": 0.01118132, + "balance_loss_clip": 1.10402, + "balance_loss_mlp": 1.06918478, + "epoch": 0.016293401473019692, + "flos": 23513481486720.0, + "grad_norm": 2.1115750591463223, + "language_loss": 0.86112005, + "learning_rate": 3.606936435072361e-06, + "loss": 0.8858614, + "num_input_tokens_seen": 5665690, + "step": 271, + "time_per_iteration": 2.6877286434173584 + }, + { + "auxiliary_loss_clip": 0.013537, + "auxiliary_loss_mlp": 0.01105687, + "balance_loss_clip": 1.0962286, + "balance_loss_mlp": 1.057693, + "epoch": 0.01635352472568766, + "flos": 29016072748800.0, + "grad_norm": 2.5391912683658413, + "language_loss": 0.81550127, + "learning_rate": 3.609307900676025e-06, + "loss": 0.84009504, + "num_input_tokens_seen": 5683190, + "step": 272, + "time_per_iteration": 2.6728365421295166 + }, + { + "auxiliary_loss_clip": 0.01348527, + "auxiliary_loss_mlp": 0.01120864, + "balance_loss_clip": 1.09806561, + "balance_loss_mlp": 1.07368064, + "epoch": 0.01641364797835563, + "flos": 13370513028480.0, + "grad_norm": 2.3613573538590487, + "language_loss": 0.81075382, + "learning_rate": 3.611670663634051e-06, + "loss": 0.83544779, + "num_input_tokens_seen": 5699780, + "step": 273, + "time_per_iteration": 2.595008134841919 + }, + { + "auxiliary_loss_clip": 0.01346135, + "auxiliary_loss_mlp": 0.01105539, + "balance_loss_clip": 1.09398317, + "balance_loss_mlp": 1.05749762, + "epoch": 0.016473771231023598, + "flos": 18878239935360.0, + "grad_norm": 2.1979313648400547, + "language_loss": 0.9131726, + "learning_rate": 3.614024787585744e-06, + "loss": 0.9376893, + "num_input_tokens_seen": 5716980, + "step": 274, + "time_per_iteration": 2.684718132019043 + }, + { + "auxiliary_loss_clip": 0.013432, + "auxiliary_loss_mlp": 0.01108715, + "balance_loss_clip": 1.09515727, + "balance_loss_mlp": 1.06062579, + "epoch": 0.016533894483691566, + "flos": 22601637803520.0, + "grad_norm": 1.9719932168994616, + "language_loss": 0.88054645, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.90506566, + "num_input_tokens_seen": 5737780, + "step": 275, + "time_per_iteration": 2.7204532623291016 + }, + { + "auxiliary_loss_clip": 0.01346726, + "auxiliary_loss_mlp": 0.01102856, + "balance_loss_clip": 1.09623361, + "balance_loss_mlp": 1.05312169, + "epoch": 0.01659401773635954, + "flos": 21507188353920.0, + "grad_norm": 1.7930545784536995, + "language_loss": 0.80726624, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.83176208, + "num_input_tokens_seen": 5758330, + "step": 276, + "time_per_iteration": 3.04716157913208 + }, + { + "auxiliary_loss_clip": 0.0133817, + "auxiliary_loss_mlp": 0.01096103, + "balance_loss_clip": 1.09588337, + "balance_loss_mlp": 1.05220985, + "epoch": 0.016654140989027507, + "flos": 32850973411200.0, + "grad_norm": 1.9196343116615175, + "language_loss": 0.80707026, + "learning_rate": 3.621035951423551e-06, + "loss": 0.83141291, + "num_input_tokens_seen": 5778340, + "step": 277, + "time_per_iteration": 2.809645652770996 + }, + { + "auxiliary_loss_clip": 0.01337061, + "auxiliary_loss_mlp": 0.0109637, + "balance_loss_clip": 1.08979487, + "balance_loss_mlp": 1.04923487, + "epoch": 0.016714264241695476, + "flos": 12306228024960.0, + "grad_norm": 2.3224792061881185, + "language_loss": 0.80508065, + "learning_rate": 3.623356141983041e-06, + "loss": 0.82941496, + "num_input_tokens_seen": 5794295, + "step": 278, + "time_per_iteration": 2.604830741882324 + }, + { + "auxiliary_loss_clip": 0.01341116, + "auxiliary_loss_mlp": 0.01101968, + "balance_loss_clip": 1.09395671, + "balance_loss_mlp": 1.05585837, + "epoch": 0.016774387494363444, + "flos": 27123796362240.0, + "grad_norm": 2.0021377353660057, + "language_loss": 0.90582991, + "learning_rate": 3.6256680014992486e-06, + "loss": 0.93026078, + "num_input_tokens_seen": 5814405, + "step": 279, + "time_per_iteration": 2.7193243503570557 + }, + { + "auxiliary_loss_clip": 0.01346095, + "auxiliary_loss_mlp": 0.01112065, + "balance_loss_clip": 1.09383631, + "balance_loss_mlp": 1.06450009, + "epoch": 0.016834510747031413, + "flos": 20191493082240.0, + "grad_norm": 2.9314445951013988, + "language_loss": 0.94049025, + "learning_rate": 3.6279715895862713e-06, + "loss": 0.96507192, + "num_input_tokens_seen": 5832795, + "step": 280, + "time_per_iteration": 2.680924654006958 + }, + { + "auxiliary_loss_clip": 0.01346658, + "auxiliary_loss_mlp": 0.01109166, + "balance_loss_clip": 1.09285879, + "balance_loss_mlp": 1.06060064, + "epoch": 0.016894633999699385, + "flos": 27274262434560.0, + "grad_norm": 2.6758913403282483, + "language_loss": 0.74425459, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.76881289, + "num_input_tokens_seen": 5855750, + "step": 281, + "time_per_iteration": 2.691152811050415 + }, + { + "auxiliary_loss_clip": 0.01343371, + "auxiliary_loss_mlp": 0.01117708, + "balance_loss_clip": 1.09609079, + "balance_loss_mlp": 1.0724318, + "epoch": 0.016954757252367354, + "flos": 14902964922240.0, + "grad_norm": 3.4878028680462005, + "language_loss": 0.80255079, + "learning_rate": 3.632554186750274e-06, + "loss": 0.82716167, + "num_input_tokens_seen": 5872610, + "step": 282, + "time_per_iteration": 2.592664957046509 + }, + { + "auxiliary_loss_clip": 0.01348082, + "auxiliary_loss_mlp": 0.01118449, + "balance_loss_clip": 1.09700727, + "balance_loss_mlp": 1.07114697, + "epoch": 0.017014880505035322, + "flos": 21358805270400.0, + "grad_norm": 2.296781711700251, + "language_loss": 0.77719986, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.80186516, + "num_input_tokens_seen": 5892985, + "step": 283, + "time_per_iteration": 2.6502227783203125 + }, + { + "auxiliary_loss_clip": 0.01347311, + "auxiliary_loss_mlp": 0.01092934, + "balance_loss_clip": 1.0977478, + "balance_loss_mlp": 1.04804015, + "epoch": 0.01707500375770329, + "flos": 35333154858240.0, + "grad_norm": 2.3467060832193414, + "language_loss": 0.84246969, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.86687213, + "num_input_tokens_seen": 5914060, + "step": 284, + "time_per_iteration": 2.8534958362579346 + }, + { + "auxiliary_loss_clip": 0.01337962, + "auxiliary_loss_mlp": 0.01100399, + "balance_loss_clip": 1.09212708, + "balance_loss_mlp": 1.05297756, + "epoch": 0.01713512701037126, + "flos": 23582070506880.0, + "grad_norm": 2.7335752956200388, + "language_loss": 0.96998906, + "learning_rate": 3.639367500948819e-06, + "loss": 0.99437273, + "num_input_tokens_seen": 5932860, + "step": 285, + "time_per_iteration": 2.6338655948638916 + }, + { + "auxiliary_loss_clip": 0.01341319, + "auxiliary_loss_mlp": 0.01095606, + "balance_loss_clip": 1.09538078, + "balance_loss_mlp": 1.05123687, + "epoch": 0.01719525026303923, + "flos": 27634661544960.0, + "grad_norm": 2.294843469150046, + "language_loss": 0.94079655, + "learning_rate": 3.6416226772772178e-06, + "loss": 0.96516573, + "num_input_tokens_seen": 5952725, + "step": 286, + "time_per_iteration": 2.711087942123413 + }, + { + "auxiliary_loss_clip": 0.01332862, + "auxiliary_loss_mlp": 0.0109035, + "balance_loss_clip": 1.08986938, + "balance_loss_mlp": 1.04409683, + "epoch": 0.0172553735157072, + "flos": 26979722910720.0, + "grad_norm": 1.9277896882465477, + "language_loss": 0.92464817, + "learning_rate": 3.643869982119001e-06, + "loss": 0.94888031, + "num_input_tokens_seen": 5970560, + "step": 287, + "time_per_iteration": 2.640267848968506 + }, + { + "auxiliary_loss_clip": 0.01338192, + "auxiliary_loss_mlp": 0.01092315, + "balance_loss_clip": 1.09039164, + "balance_loss_mlp": 1.04651475, + "epoch": 0.01731549676837517, + "flos": 14056621689600.0, + "grad_norm": 2.7883535936791035, + "language_loss": 1.01873291, + "learning_rate": 3.646109470232502e-06, + "loss": 1.04303789, + "num_input_tokens_seen": 5982980, + "step": 288, + "time_per_iteration": 2.558312177658081 + }, + { + "auxiliary_loss_clip": 0.01225082, + "auxiliary_loss_mlp": 0.01188305, + "balance_loss_clip": 1.09194219, + "balance_loss_mlp": 1.17228377, + "epoch": 0.017375620021043137, + "flos": 66510694471680.0, + "grad_norm": 0.9289960013542303, + "language_loss": 0.63867617, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.66281009, + "num_input_tokens_seen": 6049445, + "step": 289, + "time_per_iteration": 3.386254072189331 + }, + { + "auxiliary_loss_clip": 0.01341215, + "auxiliary_loss_mlp": 0.01107788, + "balance_loss_clip": 1.09622383, + "balance_loss_mlp": 1.06482446, + "epoch": 0.01743574327371111, + "flos": 15225154940160.0, + "grad_norm": 2.368974734045724, + "language_loss": 0.88156199, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.90605205, + "num_input_tokens_seen": 6064150, + "step": 290, + "time_per_iteration": 2.5670948028564453 + }, + { + "auxiliary_loss_clip": 0.0133848, + "auxiliary_loss_mlp": 0.010946, + "balance_loss_clip": 1.09388971, + "balance_loss_mlp": 1.04965782, + "epoch": 0.017495866526379078, + "flos": 25373869574400.0, + "grad_norm": 2.2011772664145504, + "language_loss": 0.84472585, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.8690567, + "num_input_tokens_seen": 6083920, + "step": 291, + "time_per_iteration": 2.648452043533325 + }, + { + "auxiliary_loss_clip": 0.01343563, + "auxiliary_loss_mlp": 0.01115116, + "balance_loss_clip": 1.10129941, + "balance_loss_mlp": 1.06607366, + "epoch": 0.017555989779047047, + "flos": 26359473836160.0, + "grad_norm": 1.7675259544479762, + "language_loss": 0.72679955, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.75138628, + "num_input_tokens_seen": 6105460, + "step": 292, + "time_per_iteration": 2.7239537239074707 + }, + { + "auxiliary_loss_clip": 0.01334066, + "auxiliary_loss_mlp": 0.01107289, + "balance_loss_clip": 1.09397244, + "balance_loss_mlp": 1.06170392, + "epoch": 0.017616113031715015, + "flos": 22338807010560.0, + "grad_norm": 2.419616990787406, + "language_loss": 0.86866581, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.89307928, + "num_input_tokens_seen": 6122890, + "step": 293, + "time_per_iteration": 2.642854690551758 + }, + { + "auxiliary_loss_clip": 0.01333726, + "auxiliary_loss_mlp": 0.0110557, + "balance_loss_clip": 1.09271646, + "balance_loss_mlp": 1.06086659, + "epoch": 0.017676236284382984, + "flos": 20156911263360.0, + "grad_norm": 2.112624444766753, + "language_loss": 0.80896151, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.83335447, + "num_input_tokens_seen": 6142890, + "step": 294, + "time_per_iteration": 2.598176956176758 + }, + { + "auxiliary_loss_clip": 0.01334179, + "auxiliary_loss_mlp": 0.01113433, + "balance_loss_clip": 1.09030747, + "balance_loss_mlp": 1.06892014, + "epoch": 0.017736359537050956, + "flos": 25223331674880.0, + "grad_norm": 2.8289841764142416, + "language_loss": 0.83806521, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.86254132, + "num_input_tokens_seen": 6162030, + "step": 295, + "time_per_iteration": 2.750103712081909 + }, + { + "auxiliary_loss_clip": 0.01339845, + "auxiliary_loss_mlp": 0.01121984, + "balance_loss_clip": 1.09978509, + "balance_loss_mlp": 1.0772326, + "epoch": 0.017796482789718925, + "flos": 20338798757760.0, + "grad_norm": 1.8804378237246864, + "language_loss": 0.84576106, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.87037927, + "num_input_tokens_seen": 6180540, + "step": 296, + "time_per_iteration": 4.154251337051392 + }, + { + "auxiliary_loss_clip": 0.01337678, + "auxiliary_loss_mlp": 0.01105295, + "balance_loss_clip": 1.09463406, + "balance_loss_mlp": 1.06154561, + "epoch": 0.017856606042386893, + "flos": 22379206832640.0, + "grad_norm": 2.055710812588959, + "language_loss": 0.87810111, + "learning_rate": 3.665921869855132e-06, + "loss": 0.90253091, + "num_input_tokens_seen": 6199425, + "step": 297, + "time_per_iteration": 4.379676103591919 + }, + { + "auxiliary_loss_clip": 0.0133717, + "auxiliary_loss_mlp": 0.01103766, + "balance_loss_clip": 1.09343684, + "balance_loss_mlp": 1.06004047, + "epoch": 0.017916729295054862, + "flos": 20230061310720.0, + "grad_norm": 2.689351030321763, + "language_loss": 0.88947791, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.91388726, + "num_input_tokens_seen": 6219170, + "step": 298, + "time_per_iteration": 4.1055779457092285 + }, + { + "auxiliary_loss_clip": 0.01333843, + "auxiliary_loss_mlp": 0.01121179, + "balance_loss_clip": 1.09470236, + "balance_loss_mlp": 1.07499719, + "epoch": 0.01797685254772283, + "flos": 19390972625280.0, + "grad_norm": 1.8935027270905305, + "language_loss": 0.88550889, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.91005915, + "num_input_tokens_seen": 6237930, + "step": 299, + "time_per_iteration": 2.611168622970581 + }, + { + "auxiliary_loss_clip": 0.0133938, + "auxiliary_loss_mlp": 0.0110718, + "balance_loss_clip": 1.09468794, + "balance_loss_mlp": 1.06130886, + "epoch": 0.018036975800390802, + "flos": 24426007528320.0, + "grad_norm": 4.075580609786654, + "language_loss": 0.64664406, + "learning_rate": 3.672392800539357e-06, + "loss": 0.67110968, + "num_input_tokens_seen": 6257170, + "step": 300, + "time_per_iteration": 2.645603656768799 + }, + { + "auxiliary_loss_clip": 0.01338559, + "auxiliary_loss_mlp": 0.01111665, + "balance_loss_clip": 1.09775913, + "balance_loss_mlp": 1.06636548, + "epoch": 0.01809709905305877, + "flos": 15778933896960.0, + "grad_norm": 2.5071418214687515, + "language_loss": 0.87940675, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.90390897, + "num_input_tokens_seen": 6274780, + "step": 301, + "time_per_iteration": 2.6035923957824707 + }, + { + "auxiliary_loss_clip": 0.01238361, + "auxiliary_loss_mlp": 0.01073699, + "balance_loss_clip": 1.1100142, + "balance_loss_mlp": 1.05901265, + "epoch": 0.01815722230572674, + "flos": 67348382526720.0, + "grad_norm": 0.8350739260664176, + "language_loss": 0.62219667, + "learning_rate": 3.676670903877158e-06, + "loss": 0.64531732, + "num_input_tokens_seen": 6340435, + "step": 302, + "time_per_iteration": 3.3307297229766846 + }, + { + "auxiliary_loss_clip": 0.0132981, + "auxiliary_loss_mlp": 0.01110918, + "balance_loss_clip": 1.0910126, + "balance_loss_mlp": 1.06507051, + "epoch": 0.01821734555839471, + "flos": 15485615435520.0, + "grad_norm": 2.115144575016314, + "language_loss": 0.89737153, + "learning_rate": 3.6787993427857567e-06, + "loss": 0.9217788, + "num_input_tokens_seen": 6358160, + "step": 303, + "time_per_iteration": 2.6773293018341064 + }, + { + "auxiliary_loss_clip": 0.01335628, + "auxiliary_loss_mlp": 0.01118481, + "balance_loss_clip": 1.09579217, + "balance_loss_mlp": 1.07237101, + "epoch": 0.018277468811062677, + "flos": 24097424889600.0, + "grad_norm": 1.8670669350935472, + "language_loss": 0.80417514, + "learning_rate": 3.680920768703364e-06, + "loss": 0.82871628, + "num_input_tokens_seen": 6378485, + "step": 304, + "time_per_iteration": 2.691347360610962 + }, + { + "auxiliary_loss_clip": 0.01330802, + "auxiliary_loss_mlp": 0.01091671, + "balance_loss_clip": 1.09832263, + "balance_loss_mlp": 1.04858923, + "epoch": 0.01833759206373065, + "flos": 20959335141120.0, + "grad_norm": 1.6863564291935742, + "language_loss": 0.82761526, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.85184002, + "num_input_tokens_seen": 6397845, + "step": 305, + "time_per_iteration": 2.6883981227874756 + }, + { + "auxiliary_loss_clip": 0.01330759, + "auxiliary_loss_mlp": 0.01093908, + "balance_loss_clip": 1.09012437, + "balance_loss_mlp": 1.05115986, + "epoch": 0.018397715316398618, + "flos": 19390757143680.0, + "grad_norm": 2.1780708917523297, + "language_loss": 0.91148543, + "learning_rate": 3.685142765363119e-06, + "loss": 0.93573213, + "num_input_tokens_seen": 6416475, + "step": 306, + "time_per_iteration": 2.6465187072753906 + }, + { + "auxiliary_loss_clip": 0.01324743, + "auxiliary_loss_mlp": 0.01091696, + "balance_loss_clip": 1.08900762, + "balance_loss_mlp": 1.04882836, + "epoch": 0.018457838569066586, + "flos": 29132531619840.0, + "grad_norm": 3.4680205003751072, + "language_loss": 0.86581063, + "learning_rate": 3.687243426879095e-06, + "loss": 0.88997507, + "num_input_tokens_seen": 6437520, + "step": 307, + "time_per_iteration": 2.7787318229675293 + }, + { + "auxiliary_loss_clip": 0.01326572, + "auxiliary_loss_mlp": 0.01110018, + "balance_loss_clip": 1.09346747, + "balance_loss_mlp": 1.06247783, + "epoch": 0.018517961821734555, + "flos": 19208654167680.0, + "grad_norm": 2.413130156754219, + "language_loss": 0.71650648, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.74087244, + "num_input_tokens_seen": 6455680, + "step": 308, + "time_per_iteration": 2.652973175048828 + }, + { + "auxiliary_loss_clip": 0.01331912, + "auxiliary_loss_mlp": 0.01102766, + "balance_loss_clip": 1.09061241, + "balance_loss_mlp": 1.05911207, + "epoch": 0.018578085074402523, + "flos": 19863018184320.0, + "grad_norm": 2.1869498369051077, + "language_loss": 0.91841364, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.94276047, + "num_input_tokens_seen": 6474880, + "step": 309, + "time_per_iteration": 2.6613030433654785 + }, + { + "auxiliary_loss_clip": 0.01339178, + "auxiliary_loss_mlp": 0.01096668, + "balance_loss_clip": 1.09145641, + "balance_loss_mlp": 1.05084395, + "epoch": 0.018638208327070496, + "flos": 29606947476480.0, + "grad_norm": 2.0400456475786353, + "language_loss": 0.72784412, + "learning_rate": 3.69350459956065e-06, + "loss": 0.75220263, + "num_input_tokens_seen": 6495945, + "step": 310, + "time_per_iteration": 2.705345392227173 + }, + { + "auxiliary_loss_clip": 0.01331019, + "auxiliary_loss_mlp": 0.01113021, + "balance_loss_clip": 1.09560525, + "balance_loss_mlp": 1.06922317, + "epoch": 0.018698331579738464, + "flos": 45731555907840.0, + "grad_norm": 2.1345597100799645, + "language_loss": 0.74162471, + "learning_rate": 3.695578199367497e-06, + "loss": 0.76606506, + "num_input_tokens_seen": 6519930, + "step": 311, + "time_per_iteration": 2.846503496170044 + }, + { + "auxiliary_loss_clip": 0.01338389, + "auxiliary_loss_mlp": 0.01104203, + "balance_loss_clip": 1.09206033, + "balance_loss_mlp": 1.0609777, + "epoch": 0.018758454832406433, + "flos": 20483662308480.0, + "grad_norm": 3.713635021153945, + "language_loss": 0.91668129, + "learning_rate": 3.6976451423416825e-06, + "loss": 0.94110715, + "num_input_tokens_seen": 6535070, + "step": 312, + "time_per_iteration": 2.598400592803955 + }, + { + "auxiliary_loss_clip": 0.01339145, + "auxiliary_loss_mlp": 0.01116197, + "balance_loss_clip": 1.09512305, + "balance_loss_mlp": 1.07034922, + "epoch": 0.0188185780850744, + "flos": 15777784661760.0, + "grad_norm": 4.5530066286460045, + "language_loss": 0.89634913, + "learning_rate": 3.699705471087043e-06, + "loss": 0.92090249, + "num_input_tokens_seen": 6554135, + "step": 313, + "time_per_iteration": 2.6944596767425537 + }, + { + "auxiliary_loss_clip": 0.01340962, + "auxiliary_loss_mlp": 0.0109941, + "balance_loss_clip": 1.09381938, + "balance_loss_mlp": 1.05430174, + "epoch": 0.018878701337742373, + "flos": 22455732758400.0, + "grad_norm": 2.3990870717118455, + "language_loss": 0.7335974, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.75800109, + "num_input_tokens_seen": 6572275, + "step": 314, + "time_per_iteration": 2.6550133228302 + }, + { + "auxiliary_loss_clip": 0.01329658, + "auxiliary_loss_mlp": 0.01105546, + "balance_loss_clip": 1.09075165, + "balance_loss_mlp": 1.06246412, + "epoch": 0.018938824590410342, + "flos": 30993530238720.0, + "grad_norm": 5.81191681220521, + "language_loss": 0.89890182, + "learning_rate": 3.7038064542733654e-06, + "loss": 0.92325383, + "num_input_tokens_seen": 6594520, + "step": 315, + "time_per_iteration": 2.7121222019195557 + }, + { + "auxiliary_loss_clip": 0.0133262, + "auxiliary_loss_mlp": 0.01096177, + "balance_loss_clip": 1.09287357, + "balance_loss_mlp": 1.05209303, + "epoch": 0.01899894784307831, + "flos": 23258910821760.0, + "grad_norm": 2.446494284682687, + "language_loss": 0.80517328, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.82946122, + "num_input_tokens_seen": 6614245, + "step": 316, + "time_per_iteration": 2.640573501586914 + }, + { + "auxiliary_loss_clip": 0.01326654, + "auxiliary_loss_mlp": 0.01094904, + "balance_loss_clip": 1.09036672, + "balance_loss_mlp": 1.05046248, + "epoch": 0.01905907109574628, + "flos": 17457901367040.0, + "grad_norm": 2.3705495670370524, + "language_loss": 0.90161496, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.92583054, + "num_input_tokens_seen": 6632015, + "step": 317, + "time_per_iteration": 2.594388246536255 + }, + { + "auxiliary_loss_clip": 0.01324014, + "auxiliary_loss_mlp": 0.01097498, + "balance_loss_clip": 1.08944559, + "balance_loss_mlp": 1.05281842, + "epoch": 0.019119194348414248, + "flos": 14970225139200.0, + "grad_norm": 7.443622240044352, + "language_loss": 0.90836811, + "learning_rate": 3.709909364265374e-06, + "loss": 0.93258321, + "num_input_tokens_seen": 6649015, + "step": 318, + "time_per_iteration": 2.6647114753723145 + }, + { + "auxiliary_loss_clip": 0.01326579, + "auxiliary_loss_mlp": 0.01092817, + "balance_loss_clip": 1.0886786, + "balance_loss_mlp": 1.05102181, + "epoch": 0.01917931760108222, + "flos": 25482822503040.0, + "grad_norm": 2.232217614618188, + "language_loss": 0.93955356, + "learning_rate": 3.7119308798459706e-06, + "loss": 0.9637475, + "num_input_tokens_seen": 6669225, + "step": 319, + "time_per_iteration": 2.6901800632476807 + }, + { + "auxiliary_loss_clip": 0.01209258, + "auxiliary_loss_mlp": 0.01057567, + "balance_loss_clip": 1.08611965, + "balance_loss_mlp": 1.04288089, + "epoch": 0.01923944085375019, + "flos": 71556967353600.0, + "grad_norm": 1.0009907084180605, + "language_loss": 0.59817195, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.62084019, + "num_input_tokens_seen": 6725775, + "step": 320, + "time_per_iteration": 3.1044812202453613 + }, + { + "auxiliary_loss_clip": 0.01323701, + "auxiliary_loss_mlp": 0.01105882, + "balance_loss_clip": 1.08827436, + "balance_loss_mlp": 1.06291938, + "epoch": 0.019299564106418157, + "flos": 19682495406720.0, + "grad_norm": 3.6735645336458163, + "language_loss": 0.89620435, + "learning_rate": 3.715954969092154e-06, + "loss": 0.92050016, + "num_input_tokens_seen": 6744170, + "step": 321, + "time_per_iteration": 2.650325298309326 + }, + { + "auxiliary_loss_clip": 0.01333523, + "auxiliary_loss_mlp": 0.01118534, + "balance_loss_clip": 1.09200621, + "balance_loss_mlp": 1.07440257, + "epoch": 0.019359687359086126, + "flos": 24387151991040.0, + "grad_norm": 2.289334718991835, + "language_loss": 0.82897186, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.85349244, + "num_input_tokens_seen": 6764565, + "step": 322, + "time_per_iteration": 2.65793514251709 + }, + { + "auxiliary_loss_clip": 0.01332983, + "auxiliary_loss_mlp": 0.01092262, + "balance_loss_clip": 1.09035325, + "balance_loss_mlp": 1.05061018, + "epoch": 0.019419810611754094, + "flos": 23951376190080.0, + "grad_norm": 2.3678949255052912, + "language_loss": 0.72983897, + "learning_rate": 3.719954063833981e-06, + "loss": 0.75409144, + "num_input_tokens_seen": 6785310, + "step": 323, + "time_per_iteration": 2.6827828884124756 + }, + { + "auxiliary_loss_clip": 0.01321298, + "auxiliary_loss_mlp": 0.01092254, + "balance_loss_clip": 1.08474624, + "balance_loss_mlp": 1.04974401, + "epoch": 0.019479933864422067, + "flos": 22160223567360.0, + "grad_norm": 1.9971507164977458, + "language_loss": 0.92358303, + "learning_rate": 3.721944334919596e-06, + "loss": 0.9477185, + "num_input_tokens_seen": 6803290, + "step": 324, + "time_per_iteration": 2.667363405227661 + }, + { + "auxiliary_loss_clip": 0.0133014, + "auxiliary_loss_mlp": 0.01089098, + "balance_loss_clip": 1.09217644, + "balance_loss_mlp": 1.04878139, + "epoch": 0.019540057117090035, + "flos": 22236821320320.0, + "grad_norm": 6.407507213214319, + "language_loss": 0.65127969, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.67547202, + "num_input_tokens_seen": 6822570, + "step": 325, + "time_per_iteration": 2.658700466156006 + }, + { + "auxiliary_loss_clip": 0.01328385, + "auxiliary_loss_mlp": 0.01109788, + "balance_loss_clip": 1.09598839, + "balance_loss_mlp": 1.06675363, + "epoch": 0.019600180369758004, + "flos": 23076771932160.0, + "grad_norm": 1.7177375017641943, + "language_loss": 0.76394802, + "learning_rate": 3.72590651470665e-06, + "loss": 0.78832972, + "num_input_tokens_seen": 6841910, + "step": 326, + "time_per_iteration": 2.6326630115509033 + }, + { + "auxiliary_loss_clip": 0.01322824, + "auxiliary_loss_mlp": 0.01103487, + "balance_loss_clip": 1.09083152, + "balance_loss_mlp": 1.06040514, + "epoch": 0.019660303622425972, + "flos": 25410857604480.0, + "grad_norm": 2.041100065316132, + "language_loss": 0.79262185, + "learning_rate": 3.727878498433505e-06, + "loss": 0.81688493, + "num_input_tokens_seen": 6862480, + "step": 327, + "time_per_iteration": 2.7195518016815186 + }, + { + "auxiliary_loss_clip": 0.0132945, + "auxiliary_loss_mlp": 0.01099712, + "balance_loss_clip": 1.09292865, + "balance_loss_mlp": 1.05832207, + "epoch": 0.01972042687509394, + "flos": 23657519024640.0, + "grad_norm": 2.852301933148325, + "language_loss": 0.80569315, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.82998472, + "num_input_tokens_seen": 6882015, + "step": 328, + "time_per_iteration": 2.6789369583129883 + }, + { + "auxiliary_loss_clip": 0.01327544, + "auxiliary_loss_mlp": 0.01094059, + "balance_loss_clip": 1.08719349, + "balance_loss_mlp": 1.05045235, + "epoch": 0.019780550127761913, + "flos": 18223480869120.0, + "grad_norm": 2.280823996815513, + "language_loss": 0.93599927, + "learning_rate": 3.731804438545683e-06, + "loss": 0.96021533, + "num_input_tokens_seen": 6899785, + "step": 329, + "time_per_iteration": 2.6043548583984375 + }, + { + "auxiliary_loss_clip": 0.0133329, + "auxiliary_loss_mlp": 0.0110952, + "balance_loss_clip": 1.09211767, + "balance_loss_mlp": 1.06629419, + "epoch": 0.01984067338042988, + "flos": 22418780641920.0, + "grad_norm": 2.788704520584699, + "language_loss": 0.7476396, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.77206767, + "num_input_tokens_seen": 6918575, + "step": 330, + "time_per_iteration": 2.706001043319702 + }, + { + "auxiliary_loss_clip": 0.0133006, + "auxiliary_loss_mlp": 0.01115344, + "balance_loss_clip": 1.09077096, + "balance_loss_mlp": 1.07280993, + "epoch": 0.01990079663309785, + "flos": 17055199013760.0, + "grad_norm": 4.201650057157668, + "language_loss": 0.93435889, + "learning_rate": 3.7357065849353186e-06, + "loss": 0.95881295, + "num_input_tokens_seen": 6936965, + "step": 331, + "time_per_iteration": 2.6499180793762207 + }, + { + "auxiliary_loss_clip": 0.01316843, + "auxiliary_loss_mlp": 0.01085812, + "balance_loss_clip": 1.08825564, + "balance_loss_mlp": 1.04563856, + "epoch": 0.01996091988576582, + "flos": 15961791058560.0, + "grad_norm": 2.5475056489813968, + "language_loss": 0.9293468, + "learning_rate": 3.737648825272422e-06, + "loss": 0.95337331, + "num_input_tokens_seen": 6953475, + "step": 332, + "time_per_iteration": 2.5990231037139893 + }, + { + "auxiliary_loss_clip": 0.01325701, + "auxiliary_loss_mlp": 0.01091941, + "balance_loss_clip": 1.09376514, + "balance_loss_mlp": 1.04902601, + "epoch": 0.02002104313843379, + "flos": 23586451966080.0, + "grad_norm": 2.7319388202061106, + "language_loss": 0.75380504, + "learning_rate": 3.739585224276384e-06, + "loss": 0.77798152, + "num_input_tokens_seen": 6971630, + "step": 333, + "time_per_iteration": 2.6225569248199463 + }, + { + "auxiliary_loss_clip": 0.01323488, + "auxiliary_loss_mlp": 0.01083816, + "balance_loss_clip": 1.08822608, + "balance_loss_mlp": 1.04249835, + "epoch": 0.02008116639110176, + "flos": 34094883352320.0, + "grad_norm": 3.3732742696494924, + "language_loss": 0.78797042, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.81204355, + "num_input_tokens_seen": 6992775, + "step": 334, + "time_per_iteration": 2.725562572479248 + }, + { + "auxiliary_loss_clip": 0.01325152, + "auxiliary_loss_mlp": 0.01093257, + "balance_loss_clip": 1.08535278, + "balance_loss_mlp": 1.04867256, + "epoch": 0.020141289643769728, + "flos": 19683716469120.0, + "grad_norm": 1.945115565921162, + "language_loss": 0.83465719, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.8588413, + "num_input_tokens_seen": 7011425, + "step": 335, + "time_per_iteration": 2.638871192932129 + }, + { + "auxiliary_loss_clip": 0.01322365, + "auxiliary_loss_mlp": 0.01085854, + "balance_loss_clip": 1.08842373, + "balance_loss_mlp": 1.04405963, + "epoch": 0.020201412896437697, + "flos": 20740567357440.0, + "grad_norm": 2.3527147371949058, + "language_loss": 0.92432821, + "learning_rate": 3.745359722027911e-06, + "loss": 0.94841033, + "num_input_tokens_seen": 7029450, + "step": 336, + "time_per_iteration": 2.6654980182647705 + }, + { + "auxiliary_loss_clip": 0.01321531, + "auxiliary_loss_mlp": 0.01079695, + "balance_loss_clip": 1.08577883, + "balance_loss_mlp": 1.03818631, + "epoch": 0.020261536149105665, + "flos": 20266510636800.0, + "grad_norm": 1.7223490941555537, + "language_loss": 0.88663971, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.91065204, + "num_input_tokens_seen": 7047555, + "step": 337, + "time_per_iteration": 4.246743440628052 + }, + { + "auxiliary_loss_clip": 0.01312441, + "auxiliary_loss_mlp": 0.01102336, + "balance_loss_clip": 1.08320296, + "balance_loss_mlp": 1.05841899, + "epoch": 0.020321659401773638, + "flos": 25848752307840.0, + "grad_norm": 1.6493597356962735, + "language_loss": 0.89869279, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.92284054, + "num_input_tokens_seen": 7068185, + "step": 338, + "time_per_iteration": 5.869866609573364 + }, + { + "auxiliary_loss_clip": 0.01321566, + "auxiliary_loss_mlp": 0.0109858, + "balance_loss_clip": 1.08546185, + "balance_loss_mlp": 1.05554605, + "epoch": 0.020381782654441606, + "flos": 17495033051520.0, + "grad_norm": 2.1603069065052694, + "language_loss": 0.85168982, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.87589133, + "num_input_tokens_seen": 7085955, + "step": 339, + "time_per_iteration": 2.603130340576172 + }, + { + "auxiliary_loss_clip": 0.01328225, + "auxiliary_loss_mlp": 0.01099064, + "balance_loss_clip": 1.0902226, + "balance_loss_mlp": 1.05524242, + "epoch": 0.020441905907109575, + "flos": 24243940465920.0, + "grad_norm": 2.1746002196087817, + "language_loss": 0.88821882, + "learning_rate": 3.75297936342452e-06, + "loss": 0.91249174, + "num_input_tokens_seen": 7106345, + "step": 340, + "time_per_iteration": 2.7247626781463623 + }, + { + "auxiliary_loss_clip": 0.01322505, + "auxiliary_loss_mlp": 0.01085559, + "balance_loss_clip": 1.08594203, + "balance_loss_mlp": 1.04004502, + "epoch": 0.020502029159777543, + "flos": 22233301787520.0, + "grad_norm": 2.004763613818719, + "language_loss": 0.88489276, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.9089734, + "num_input_tokens_seen": 7125070, + "step": 341, + "time_per_iteration": 2.731411933898926 + }, + { + "auxiliary_loss_clip": 0.01324734, + "auxiliary_loss_mlp": 0.01098572, + "balance_loss_clip": 1.08451748, + "balance_loss_mlp": 1.05479813, + "epoch": 0.020562152412445512, + "flos": 23987861429760.0, + "grad_norm": 2.3638593093640736, + "language_loss": 0.80611861, + "learning_rate": 3.756755633390458e-06, + "loss": 0.83035159, + "num_input_tokens_seen": 7144675, + "step": 342, + "time_per_iteration": 2.6085095405578613 + }, + { + "auxiliary_loss_clip": 0.01313805, + "auxiliary_loss_mlp": 0.01098164, + "balance_loss_clip": 1.08411694, + "balance_loss_mlp": 1.05138612, + "epoch": 0.020622275665113484, + "flos": 26975305537920.0, + "grad_norm": 1.727276092160433, + "language_loss": 0.89612651, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.92024612, + "num_input_tokens_seen": 7165505, + "step": 343, + "time_per_iteration": 2.739912509918213 + }, + { + "auxiliary_loss_clip": 0.01324722, + "auxiliary_loss_mlp": 0.01096954, + "balance_loss_clip": 1.09109879, + "balance_loss_mlp": 1.05518293, + "epoch": 0.020682398917781453, + "flos": 22600704049920.0, + "grad_norm": 2.6902665590614663, + "language_loss": 0.78381217, + "learning_rate": 3.7605098841644e-06, + "loss": 0.80802888, + "num_input_tokens_seen": 7184605, + "step": 344, + "time_per_iteration": 2.638439655303955 + }, + { + "auxiliary_loss_clip": 0.01310552, + "auxiliary_loss_mlp": 0.01103983, + "balance_loss_clip": 1.08375537, + "balance_loss_mlp": 1.05982804, + "epoch": 0.02074252217044942, + "flos": 15013605790080.0, + "grad_norm": 2.2675296623639114, + "language_loss": 0.75051636, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.77466166, + "num_input_tokens_seen": 7203065, + "step": 345, + "time_per_iteration": 2.581258773803711 + }, + { + "auxiliary_loss_clip": 0.01316305, + "auxiliary_loss_mlp": 0.01107937, + "balance_loss_clip": 1.08855689, + "balance_loss_mlp": 1.06447339, + "epoch": 0.02080264542311739, + "flos": 25337958952320.0, + "grad_norm": 2.2144688897761395, + "language_loss": 0.90414572, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.92838824, + "num_input_tokens_seen": 7222995, + "step": 346, + "time_per_iteration": 2.6281676292419434 + }, + { + "auxiliary_loss_clip": 0.01312286, + "auxiliary_loss_mlp": 0.01096576, + "balance_loss_clip": 1.08357453, + "balance_loss_mlp": 1.05621195, + "epoch": 0.02086276867578536, + "flos": 24388804016640.0, + "grad_norm": 3.1106741063140366, + "language_loss": 0.79133296, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.81542158, + "num_input_tokens_seen": 7244625, + "step": 347, + "time_per_iteration": 2.6477038860321045 + }, + { + "auxiliary_loss_clip": 0.01317665, + "auxiliary_loss_mlp": 0.01097416, + "balance_loss_clip": 1.08921003, + "balance_loss_mlp": 1.05328524, + "epoch": 0.02092289192845333, + "flos": 24462205459200.0, + "grad_norm": 3.7065871267995893, + "language_loss": 0.71211165, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.73626244, + "num_input_tokens_seen": 7263255, + "step": 348, + "time_per_iteration": 2.6215686798095703 + }, + { + "auxiliary_loss_clip": 0.01319168, + "auxiliary_loss_mlp": 0.01104109, + "balance_loss_clip": 1.0859139, + "balance_loss_mlp": 1.06066906, + "epoch": 0.0209830151811213, + "flos": 17451185523840.0, + "grad_norm": 2.3976328225512495, + "language_loss": 0.77118891, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.79542166, + "num_input_tokens_seen": 7279275, + "step": 349, + "time_per_iteration": 2.60102915763855 + }, + { + "auxiliary_loss_clip": 0.01304146, + "auxiliary_loss_mlp": 0.01101496, + "balance_loss_clip": 1.08412242, + "balance_loss_mlp": 1.06017756, + "epoch": 0.021043138433789268, + "flos": 24573995562240.0, + "grad_norm": 1.7599420553547571, + "language_loss": 0.85191035, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.87596673, + "num_input_tokens_seen": 7300180, + "step": 350, + "time_per_iteration": 2.7636313438415527 + }, + { + "auxiliary_loss_clip": 0.01310639, + "auxiliary_loss_mlp": 0.01090182, + "balance_loss_clip": 1.08742464, + "balance_loss_mlp": 1.05015147, + "epoch": 0.021103261686457236, + "flos": 24454053072000.0, + "grad_norm": 2.2188224040826956, + "language_loss": 0.7998929, + "learning_rate": 3.773480007028776e-06, + "loss": 0.82390112, + "num_input_tokens_seen": 7317430, + "step": 351, + "time_per_iteration": 2.651803493499756 + }, + { + "auxiliary_loss_clip": 0.01318922, + "auxiliary_loss_mlp": 0.01104903, + "balance_loss_clip": 1.08851838, + "balance_loss_mlp": 1.06093884, + "epoch": 0.021163384939125205, + "flos": 14683083816960.0, + "grad_norm": 2.30399977815629, + "language_loss": 0.8746841, + "learning_rate": 3.775311735671078e-06, + "loss": 0.89892232, + "num_input_tokens_seen": 7334875, + "step": 352, + "time_per_iteration": 2.687080144882202 + }, + { + "auxiliary_loss_clip": 0.01311303, + "auxiliary_loss_mlp": 0.01101912, + "balance_loss_clip": 1.0859803, + "balance_loss_mlp": 1.05861485, + "epoch": 0.021223508191793177, + "flos": 24493195918080.0, + "grad_norm": 2.574621592267882, + "language_loss": 0.8247534, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.84888554, + "num_input_tokens_seen": 7355185, + "step": 353, + "time_per_iteration": 2.7096078395843506 + }, + { + "auxiliary_loss_clip": 0.01308698, + "auxiliary_loss_mlp": 0.01092448, + "balance_loss_clip": 1.08573294, + "balance_loss_mlp": 1.05160654, + "epoch": 0.021283631444461146, + "flos": 24126978804480.0, + "grad_norm": 1.9591973719581535, + "language_loss": 0.8089481, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.83295953, + "num_input_tokens_seen": 7374425, + "step": 354, + "time_per_iteration": 2.658649444580078 + }, + { + "auxiliary_loss_clip": 0.01314249, + "auxiliary_loss_mlp": 0.01095812, + "balance_loss_clip": 1.08369493, + "balance_loss_mlp": 1.05218124, + "epoch": 0.021343754697129114, + "flos": 25192233475200.0, + "grad_norm": 2.22170783568627, + "language_loss": 0.81311834, + "learning_rate": 3.780775860546545e-06, + "loss": 0.837219, + "num_input_tokens_seen": 7394175, + "step": 355, + "time_per_iteration": 2.619551420211792 + }, + { + "auxiliary_loss_clip": 0.01310207, + "auxiliary_loss_mlp": 0.01090401, + "balance_loss_clip": 1.08222032, + "balance_loss_mlp": 1.04851055, + "epoch": 0.021403877949797083, + "flos": 17274182279040.0, + "grad_norm": 2.212340256471132, + "language_loss": 0.89746779, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.92147392, + "num_input_tokens_seen": 7412645, + "step": 356, + "time_per_iteration": 2.5877137184143066 + }, + { + "auxiliary_loss_clip": 0.01308298, + "auxiliary_loss_mlp": 0.0108474, + "balance_loss_clip": 1.08573771, + "balance_loss_mlp": 1.04191971, + "epoch": 0.021464001202465055, + "flos": 30917435276160.0, + "grad_norm": 1.9878508054592678, + "language_loss": 0.79956681, + "learning_rate": 3.784393017158528e-06, + "loss": 0.82349718, + "num_input_tokens_seen": 7432275, + "step": 357, + "time_per_iteration": 2.781755208969116 + }, + { + "auxiliary_loss_clip": 0.0130988, + "auxiliary_loss_mlp": 0.01083565, + "balance_loss_clip": 1.08250284, + "balance_loss_mlp": 1.04417801, + "epoch": 0.021524124455133024, + "flos": 18186385098240.0, + "grad_norm": 2.6679617624252137, + "language_loss": 0.76516652, + "learning_rate": 3.786194003461506e-06, + "loss": 0.78910094, + "num_input_tokens_seen": 7450245, + "step": 358, + "time_per_iteration": 2.63144850730896 + }, + { + "auxiliary_loss_clip": 0.01307251, + "auxiliary_loss_mlp": 0.01092013, + "balance_loss_clip": 1.08083165, + "balance_loss_mlp": 1.04842997, + "epoch": 0.021584247707800992, + "flos": 13805786039040.0, + "grad_norm": 2.344744226979962, + "language_loss": 0.88770491, + "learning_rate": 3.787989966086264e-06, + "loss": 0.91169769, + "num_input_tokens_seen": 7466845, + "step": 359, + "time_per_iteration": 2.641932964324951 + }, + { + "auxiliary_loss_clip": 0.01315087, + "auxiliary_loss_mlp": 0.01090441, + "balance_loss_clip": 1.08486438, + "balance_loss_mlp": 1.05088758, + "epoch": 0.02164437096046896, + "flos": 23294713703040.0, + "grad_norm": 3.6505103877164804, + "language_loss": 0.75853801, + "learning_rate": 3.789780932980997e-06, + "loss": 0.78259325, + "num_input_tokens_seen": 7485450, + "step": 360, + "time_per_iteration": 2.5901477336883545 + }, + { + "auxiliary_loss_clip": 0.01203506, + "auxiliary_loss_mlp": 0.0103078, + "balance_loss_clip": 1.07682121, + "balance_loss_mlp": 1.01781011, + "epoch": 0.02170449421313693, + "flos": 68899578341760.0, + "grad_norm": 0.8439708743577624, + "language_loss": 0.64861441, + "learning_rate": 3.79156693186132e-06, + "loss": 0.67095727, + "num_input_tokens_seen": 7553780, + "step": 361, + "time_per_iteration": 3.278409957885742 + }, + { + "auxiliary_loss_clip": 0.01306068, + "auxiliary_loss_mlp": 0.01086116, + "balance_loss_clip": 1.0792098, + "balance_loss_mlp": 1.04501224, + "epoch": 0.0217646174658049, + "flos": 25228539146880.0, + "grad_norm": 3.144635825096315, + "language_loss": 0.78844237, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.81236422, + "num_input_tokens_seen": 7574155, + "step": 362, + "time_per_iteration": 2.6302051544189453 + }, + { + "auxiliary_loss_clip": 0.01309585, + "auxiliary_loss_mlp": 0.01093258, + "balance_loss_clip": 1.08188891, + "balance_loss_mlp": 1.05244076, + "epoch": 0.02182474071847287, + "flos": 22893124671360.0, + "grad_norm": 2.019833715135914, + "language_loss": 0.92474592, + "learning_rate": 3.7951241352937077e-06, + "loss": 0.94877434, + "num_input_tokens_seen": 7592320, + "step": 363, + "time_per_iteration": 2.6566081047058105 + }, + { + "auxiliary_loss_clip": 0.01305173, + "auxiliary_loss_mlp": 0.01096467, + "balance_loss_clip": 1.0816617, + "balance_loss_mlp": 1.05693769, + "epoch": 0.02188486397114084, + "flos": 23658991482240.0, + "grad_norm": 2.282586403147275, + "language_loss": 0.89844346, + "learning_rate": 3.7968953941370915e-06, + "loss": 0.92245984, + "num_input_tokens_seen": 7611185, + "step": 364, + "time_per_iteration": 2.711911201477051 + }, + { + "auxiliary_loss_clip": 0.01311963, + "auxiliary_loss_mlp": 0.0109247, + "balance_loss_clip": 1.08607888, + "balance_loss_mlp": 1.04955506, + "epoch": 0.021944987223808807, + "flos": 21543637680000.0, + "grad_norm": 1.948927065488749, + "language_loss": 0.79460645, + "learning_rate": 3.798661793553676e-06, + "loss": 0.81865084, + "num_input_tokens_seen": 7631970, + "step": 365, + "time_per_iteration": 2.6396052837371826 + }, + { + "auxiliary_loss_clip": 0.01306043, + "auxiliary_loss_mlp": 0.01100405, + "balance_loss_clip": 1.08267248, + "balance_loss_mlp": 1.05658317, + "epoch": 0.022005110476476776, + "flos": 16070887641600.0, + "grad_norm": 1.85181498507666, + "language_loss": 0.84341359, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.86747801, + "num_input_tokens_seen": 7649745, + "step": 366, + "time_per_iteration": 2.6278867721557617 + }, + { + "auxiliary_loss_clip": 0.01312113, + "auxiliary_loss_mlp": 0.01087574, + "balance_loss_clip": 1.08304918, + "balance_loss_mlp": 1.04859269, + "epoch": 0.022065233729144748, + "flos": 21433715084160.0, + "grad_norm": 1.9326288300300676, + "language_loss": 0.87040466, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.89440155, + "num_input_tokens_seen": 7668830, + "step": 367, + "time_per_iteration": 2.6410560607910156 + }, + { + "auxiliary_loss_clip": 0.01312217, + "auxiliary_loss_mlp": 0.01096053, + "balance_loss_clip": 1.08074582, + "balance_loss_mlp": 1.05335259, + "epoch": 0.022125356981812717, + "flos": 21543709507200.0, + "grad_norm": 2.7247329926128976, + "language_loss": 0.8487373, + "learning_rate": 3.803932100062912e-06, + "loss": 0.87282002, + "num_input_tokens_seen": 7687240, + "step": 368, + "time_per_iteration": 2.652012825012207 + }, + { + "auxiliary_loss_clip": 0.01312089, + "auxiliary_loss_mlp": 0.01079926, + "balance_loss_clip": 1.0801568, + "balance_loss_mlp": 1.04027653, + "epoch": 0.022185480234480685, + "flos": 20704153944960.0, + "grad_norm": 2.4839328990540794, + "language_loss": 0.75997221, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.78389233, + "num_input_tokens_seen": 7704440, + "step": 369, + "time_per_iteration": 2.601384401321411 + }, + { + "auxiliary_loss_clip": 0.01306737, + "auxiliary_loss_mlp": 0.01099274, + "balance_loss_clip": 1.08232927, + "balance_loss_mlp": 1.05836105, + "epoch": 0.022245603487148654, + "flos": 25193203142400.0, + "grad_norm": 2.189428421230448, + "language_loss": 0.82977992, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.85383999, + "num_input_tokens_seen": 7727160, + "step": 370, + "time_per_iteration": 2.6538548469543457 + }, + { + "auxiliary_loss_clip": 0.01306327, + "auxiliary_loss_mlp": 0.01099594, + "balance_loss_clip": 1.08127654, + "balance_loss_mlp": 1.05713177, + "epoch": 0.022305726739816623, + "flos": 21395936954880.0, + "grad_norm": 1.8569755368340455, + "language_loss": 0.81588483, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.83994406, + "num_input_tokens_seen": 7747730, + "step": 371, + "time_per_iteration": 2.6779489517211914 + }, + { + "auxiliary_loss_clip": 0.01311283, + "auxiliary_loss_mlp": 0.01093653, + "balance_loss_clip": 1.08593988, + "balance_loss_mlp": 1.05169153, + "epoch": 0.022365849992484595, + "flos": 22492146170880.0, + "grad_norm": 2.0622769904034817, + "language_loss": 0.83493644, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.85898578, + "num_input_tokens_seen": 7766765, + "step": 372, + "time_per_iteration": 2.676797866821289 + }, + { + "auxiliary_loss_clip": 0.01303906, + "auxiliary_loss_mlp": 0.01091688, + "balance_loss_clip": 1.08125615, + "balance_loss_mlp": 1.05022752, + "epoch": 0.022425973245152563, + "flos": 17856581397120.0, + "grad_norm": 2.8569846697004424, + "language_loss": 0.79004842, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.81400436, + "num_input_tokens_seen": 7784010, + "step": 373, + "time_per_iteration": 2.593186616897583 + }, + { + "auxiliary_loss_clip": 0.01309731, + "auxiliary_loss_mlp": 0.01087409, + "balance_loss_clip": 1.08431911, + "balance_loss_mlp": 1.0448271, + "epoch": 0.022486096497820532, + "flos": 15483029656320.0, + "grad_norm": 2.5442660874947385, + "language_loss": 0.77622557, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.80019701, + "num_input_tokens_seen": 7801305, + "step": 374, + "time_per_iteration": 2.628392457962036 + }, + { + "auxiliary_loss_clip": 0.0129871, + "auxiliary_loss_mlp": 0.01076131, + "balance_loss_clip": 1.07404125, + "balance_loss_mlp": 1.03395462, + "epoch": 0.0225462197504885, + "flos": 27784157950080.0, + "grad_norm": 1.574507922341891, + "language_loss": 0.86032569, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.88407415, + "num_input_tokens_seen": 7823965, + "step": 375, + "time_per_iteration": 2.6783435344696045 + }, + { + "auxiliary_loss_clip": 0.01307026, + "auxiliary_loss_mlp": 0.01102393, + "balance_loss_clip": 1.08340597, + "balance_loss_mlp": 1.0590483, + "epoch": 0.02260634300315647, + "flos": 19975490645760.0, + "grad_norm": 2.1279260859120286, + "language_loss": 0.8901403, + "learning_rate": 3.817778917253314e-06, + "loss": 0.91423446, + "num_input_tokens_seen": 7842115, + "step": 376, + "time_per_iteration": 2.621629476547241 + }, + { + "auxiliary_loss_clip": 0.01306872, + "auxiliary_loss_mlp": 0.01087647, + "balance_loss_clip": 1.07870364, + "balance_loss_mlp": 1.04868913, + "epoch": 0.02266646625582444, + "flos": 16028189349120.0, + "grad_norm": 3.0367767906095917, + "language_loss": 0.75437558, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.77832079, + "num_input_tokens_seen": 7857830, + "step": 377, + "time_per_iteration": 2.5465245246887207 + }, + { + "auxiliary_loss_clip": 0.01298987, + "auxiliary_loss_mlp": 0.01093623, + "balance_loss_clip": 1.08128345, + "balance_loss_mlp": 1.0517087, + "epoch": 0.02272658950849241, + "flos": 20404622430720.0, + "grad_norm": 2.1955644054597374, + "language_loss": 0.99231368, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.01623976, + "num_input_tokens_seen": 7875840, + "step": 378, + "time_per_iteration": 7.184643983840942 + }, + { + "auxiliary_loss_clip": 0.01202133, + "auxiliary_loss_mlp": 0.01040839, + "balance_loss_clip": 1.0828104, + "balance_loss_mlp": 1.0283463, + "epoch": 0.02278671276116038, + "flos": 69847332647040.0, + "grad_norm": 0.9608118941287621, + "language_loss": 0.75395739, + "learning_rate": 3.822895650276492e-06, + "loss": 0.7763871, + "num_input_tokens_seen": 7940190, + "step": 379, + "time_per_iteration": 4.961140394210815 + }, + { + "auxiliary_loss_clip": 0.01308523, + "auxiliary_loss_mlp": 0.01087195, + "balance_loss_clip": 1.07820678, + "balance_loss_mlp": 1.04792738, + "epoch": 0.022846836013828347, + "flos": 38508771340800.0, + "grad_norm": 3.7276648293904375, + "language_loss": 0.78197825, + "learning_rate": 3.824592231451859e-06, + "loss": 0.8059355, + "num_input_tokens_seen": 7960840, + "step": 380, + "time_per_iteration": 2.7892863750457764 + }, + { + "auxiliary_loss_clip": 0.01301718, + "auxiliary_loss_mlp": 0.01088822, + "balance_loss_clip": 1.07955217, + "balance_loss_mlp": 1.04945946, + "epoch": 0.02290695926649632, + "flos": 20959478795520.0, + "grad_norm": 2.0941800643649855, + "language_loss": 0.96743369, + "learning_rate": 3.826284353801652e-06, + "loss": 0.99133915, + "num_input_tokens_seen": 7975500, + "step": 381, + "time_per_iteration": 2.619854688644409 + }, + { + "auxiliary_loss_clip": 0.01311313, + "auxiliary_loss_mlp": 0.01093973, + "balance_loss_clip": 1.08192921, + "balance_loss_mlp": 1.0539186, + "epoch": 0.022967082519164288, + "flos": 24022407335040.0, + "grad_norm": 2.122042453210184, + "language_loss": 0.87664795, + "learning_rate": 3.827972040701142e-06, + "loss": 0.90070075, + "num_input_tokens_seen": 7993880, + "step": 382, + "time_per_iteration": 2.617398500442505 + }, + { + "auxiliary_loss_clip": 0.01304042, + "auxiliary_loss_mlp": 0.01096828, + "balance_loss_clip": 1.0821979, + "balance_loss_mlp": 1.05760849, + "epoch": 0.023027205771832256, + "flos": 20997149184000.0, + "grad_norm": 1.978420170714987, + "language_loss": 0.84990942, + "learning_rate": 3.829655315342268e-06, + "loss": 0.87391812, + "num_input_tokens_seen": 8012730, + "step": 383, + "time_per_iteration": 2.6345314979553223 + }, + { + "auxiliary_loss_clip": 0.01300873, + "auxiliary_loss_mlp": 0.0111136, + "balance_loss_clip": 1.08199024, + "balance_loss_mlp": 1.0716393, + "epoch": 0.023087329024500225, + "flos": 21360816432000.0, + "grad_norm": 2.0575071112917778, + "language_loss": 0.83349717, + "learning_rate": 3.831334200735543e-06, + "loss": 0.8576194, + "num_input_tokens_seen": 8031275, + "step": 384, + "time_per_iteration": 2.6339902877807617 + }, + { + "auxiliary_loss_clip": 0.0129979, + "auxiliary_loss_mlp": 0.010893, + "balance_loss_clip": 1.08362782, + "balance_loss_mlp": 1.05255938, + "epoch": 0.023147452277168194, + "flos": 21872435800320.0, + "grad_norm": 1.7828777740185773, + "language_loss": 0.89289594, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.91678685, + "num_input_tokens_seen": 8051600, + "step": 385, + "time_per_iteration": 2.690460205078125 + }, + { + "auxiliary_loss_clip": 0.01305297, + "auxiliary_loss_mlp": 0.01118129, + "balance_loss_clip": 1.08288455, + "balance_loss_mlp": 1.07926655, + "epoch": 0.023207575529836166, + "flos": 18916700423040.0, + "grad_norm": 1.9487706588237765, + "language_loss": 0.70157433, + "learning_rate": 3.83467889492477e-06, + "loss": 0.72580856, + "num_input_tokens_seen": 8070600, + "step": 386, + "time_per_iteration": 2.681957721710205 + }, + { + "auxiliary_loss_clip": 0.01305989, + "auxiliary_loss_mlp": 0.0109088, + "balance_loss_clip": 1.08441973, + "balance_loss_mlp": 1.05309081, + "epoch": 0.023267698782504134, + "flos": 25046005207680.0, + "grad_norm": 2.354342660334866, + "language_loss": 0.87840039, + "learning_rate": 3.836344748851495e-06, + "loss": 0.90236908, + "num_input_tokens_seen": 8090680, + "step": 387, + "time_per_iteration": 2.6511123180389404 + }, + { + "auxiliary_loss_clip": 0.01304298, + "auxiliary_loss_mlp": 0.01075541, + "balance_loss_clip": 1.08178413, + "balance_loss_mlp": 1.03658366, + "epoch": 0.023327822035172103, + "flos": 28879217930880.0, + "grad_norm": 2.2068948332198643, + "language_loss": 0.8341614, + "learning_rate": 3.838006303795566e-06, + "loss": 0.85795981, + "num_input_tokens_seen": 8114610, + "step": 388, + "time_per_iteration": 2.7062034606933594 + }, + { + "auxiliary_loss_clip": 0.01301997, + "auxiliary_loss_mlp": 0.01089724, + "balance_loss_clip": 1.08110905, + "balance_loss_mlp": 1.05284107, + "epoch": 0.02338794528784007, + "flos": 27121533805440.0, + "grad_norm": 2.1887236217853863, + "language_loss": 0.93710232, + "learning_rate": 3.839663581888206e-06, + "loss": 0.96101958, + "num_input_tokens_seen": 8133975, + "step": 389, + "time_per_iteration": 2.680280923843384 + }, + { + "auxiliary_loss_clip": 0.01296082, + "auxiliary_loss_mlp": 0.01083127, + "balance_loss_clip": 1.0818491, + "balance_loss_mlp": 1.04397893, + "epoch": 0.02344806854050804, + "flos": 21322355944320.0, + "grad_norm": 1.981860280002506, + "language_loss": 0.87747037, + "learning_rate": 3.841316605090178e-06, + "loss": 0.9012624, + "num_input_tokens_seen": 8153570, + "step": 390, + "time_per_iteration": 2.65970516204834 + }, + { + "auxiliary_loss_clip": 0.01301203, + "auxiliary_loss_mlp": 0.01092853, + "balance_loss_clip": 1.08357048, + "balance_loss_mlp": 1.0568521, + "epoch": 0.023508191793176012, + "flos": 24789997998720.0, + "grad_norm": 2.134782100250632, + "language_loss": 0.89370871, + "learning_rate": 3.842965395193529e-06, + "loss": 0.91764927, + "num_input_tokens_seen": 8170075, + "step": 391, + "time_per_iteration": 2.620009660720825 + }, + { + "auxiliary_loss_clip": 0.01296395, + "auxiliary_loss_mlp": 0.01072264, + "balance_loss_clip": 1.07956719, + "balance_loss_mlp": 1.03521371, + "epoch": 0.02356831504584398, + "flos": 25995375624960.0, + "grad_norm": 2.366558958564603, + "language_loss": 0.86076117, + "learning_rate": 3.84460997382332e-06, + "loss": 0.88444775, + "num_input_tokens_seen": 8190420, + "step": 392, + "time_per_iteration": 2.7171695232391357 + }, + { + "auxiliary_loss_clip": 0.01293283, + "auxiliary_loss_mlp": 0.01084283, + "balance_loss_clip": 1.07891107, + "balance_loss_mlp": 1.04763794, + "epoch": 0.02362843829851195, + "flos": 19062461813760.0, + "grad_norm": 2.038818686720474, + "language_loss": 0.89096916, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.91474473, + "num_input_tokens_seen": 8208790, + "step": 393, + "time_per_iteration": 2.632129669189453 + }, + { + "auxiliary_loss_clip": 0.01304158, + "auxiliary_loss_mlp": 0.01102255, + "balance_loss_clip": 1.08471596, + "balance_loss_mlp": 1.06279635, + "epoch": 0.023688561551179918, + "flos": 16071031296000.0, + "grad_norm": 1.7920692319020195, + "language_loss": 0.8156364, + "learning_rate": 3.84788658233771e-06, + "loss": 0.83970058, + "num_input_tokens_seen": 8226885, + "step": 394, + "time_per_iteration": 2.5932936668395996 + }, + { + "auxiliary_loss_clip": 0.01296851, + "auxiliary_loss_mlp": 0.01088191, + "balance_loss_clip": 1.07939875, + "balance_loss_mlp": 1.04920936, + "epoch": 0.023748684803847887, + "flos": 21724375939200.0, + "grad_norm": 4.539737106404062, + "language_loss": 0.85808635, + "learning_rate": 3.84951865465269e-06, + "loss": 0.88193679, + "num_input_tokens_seen": 8246825, + "step": 395, + "time_per_iteration": 2.6112868785858154 + }, + { + "auxiliary_loss_clip": 0.01194704, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.07210529, + "balance_loss_mlp": 1.02319229, + "epoch": 0.02380880805651586, + "flos": 61926192881280.0, + "grad_norm": 0.9258089920958834, + "language_loss": 0.6380353, + "learning_rate": 3.851146600358172e-06, + "loss": 0.66032922, + "num_input_tokens_seen": 8302835, + "step": 396, + "time_per_iteration": 3.031489133834839 + }, + { + "auxiliary_loss_clip": 0.0129188, + "auxiliary_loss_mlp": 0.01071022, + "balance_loss_clip": 1.07806754, + "balance_loss_mlp": 1.03447223, + "epoch": 0.023868931309183827, + "flos": 20266331068800.0, + "grad_norm": 2.3741099598177624, + "language_loss": 0.83878696, + "learning_rate": 3.852770440269372e-06, + "loss": 0.86241591, + "num_input_tokens_seen": 8320745, + "step": 397, + "time_per_iteration": 2.6049532890319824 + }, + { + "auxiliary_loss_clip": 0.01297108, + "auxiliary_loss_mlp": 0.01087341, + "balance_loss_clip": 1.08104038, + "balance_loss_mlp": 1.04890823, + "epoch": 0.023929054561851796, + "flos": 21139103733120.0, + "grad_norm": 4.6847154905409205, + "language_loss": 0.84066498, + "learning_rate": 3.854390195044404e-06, + "loss": 0.86450952, + "num_input_tokens_seen": 8339540, + "step": 398, + "time_per_iteration": 2.6516692638397217 + }, + { + "auxiliary_loss_clip": 0.01295876, + "auxiliary_loss_mlp": 0.01078722, + "balance_loss_clip": 1.07671928, + "balance_loss_mlp": 1.04007471, + "epoch": 0.023989177814519765, + "flos": 13698521049600.0, + "grad_norm": 2.80358563189936, + "language_loss": 0.86029691, + "learning_rate": 3.856005885185868e-06, + "loss": 0.88404286, + "num_input_tokens_seen": 8354890, + "step": 399, + "time_per_iteration": 2.5452589988708496 + }, + { + "auxiliary_loss_clip": 0.01292698, + "auxiliary_loss_mlp": 0.01090822, + "balance_loss_clip": 1.08074594, + "balance_loss_mlp": 1.05308056, + "epoch": 0.024049301067187733, + "flos": 26322018929280.0, + "grad_norm": 2.021318687641168, + "language_loss": 0.86254489, + "learning_rate": 3.857617531042398e-06, + "loss": 0.88638014, + "num_input_tokens_seen": 8375845, + "step": 400, + "time_per_iteration": 2.6626927852630615 + }, + { + "auxiliary_loss_clip": 0.01299822, + "auxiliary_loss_mlp": 0.01083301, + "balance_loss_clip": 1.08346462, + "balance_loss_mlp": 1.04687035, + "epoch": 0.024109424319855705, + "flos": 24425432910720.0, + "grad_norm": 1.735822397657743, + "language_loss": 0.79276752, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.81659877, + "num_input_tokens_seen": 8395240, + "step": 401, + "time_per_iteration": 2.68418025970459 + }, + { + "auxiliary_loss_clip": 0.0129275, + "auxiliary_loss_mlp": 0.01091389, + "balance_loss_clip": 1.07852793, + "balance_loss_mlp": 1.05493474, + "epoch": 0.024169547572523674, + "flos": 29604397610880.0, + "grad_norm": 3.889755427752258, + "language_loss": 0.78890866, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.81274998, + "num_input_tokens_seen": 8416950, + "step": 402, + "time_per_iteration": 2.7509379386901855 + }, + { + "auxiliary_loss_clip": 0.01296434, + "auxiliary_loss_mlp": 0.01082712, + "balance_loss_clip": 1.07797897, + "balance_loss_mlp": 1.04399323, + "epoch": 0.024229670825191642, + "flos": 22601458235520.0, + "grad_norm": 2.49356632429363, + "language_loss": 0.94936156, + "learning_rate": 3.86242840411147e-06, + "loss": 0.97315305, + "num_input_tokens_seen": 8433660, + "step": 403, + "time_per_iteration": 2.5760560035705566 + }, + { + "auxiliary_loss_clip": 0.0129994, + "auxiliary_loss_mlp": 0.01091893, + "balance_loss_clip": 1.07754242, + "balance_loss_mlp": 1.05315053, + "epoch": 0.02428979407785961, + "flos": 18150258994560.0, + "grad_norm": 2.361656575803209, + "language_loss": 0.99877387, + "learning_rate": 3.864024073288798e-06, + "loss": 1.0226922, + "num_input_tokens_seen": 8450180, + "step": 404, + "time_per_iteration": 2.5966458320617676 + }, + { + "auxiliary_loss_clip": 0.01298911, + "auxiliary_loss_mlp": 0.01100127, + "balance_loss_clip": 1.08096266, + "balance_loss_mlp": 1.06312442, + "epoch": 0.024349917330527583, + "flos": 15304984917120.0, + "grad_norm": 2.3162348618509276, + "language_loss": 0.8802169, + "learning_rate": 3.865615797668091e-06, + "loss": 0.90420723, + "num_input_tokens_seen": 8467775, + "step": 405, + "time_per_iteration": 2.5728275775909424 + }, + { + "auxiliary_loss_clip": 0.01306827, + "auxiliary_loss_mlp": 0.01097881, + "balance_loss_clip": 1.084512, + "balance_loss_mlp": 1.06004393, + "epoch": 0.024410040583195552, + "flos": 20773892200320.0, + "grad_norm": 2.7399607903318275, + "language_loss": 0.93386561, + "learning_rate": 3.867203596705844e-06, + "loss": 0.95791268, + "num_input_tokens_seen": 8486765, + "step": 406, + "time_per_iteration": 2.612668991088867 + }, + { + "auxiliary_loss_clip": 0.01299426, + "auxiliary_loss_mlp": 0.01088378, + "balance_loss_clip": 1.08213782, + "balance_loss_mlp": 1.0500164, + "epoch": 0.02447016383586352, + "flos": 21798854789760.0, + "grad_norm": 2.1742012769968526, + "language_loss": 0.87128031, + "learning_rate": 3.86878748971496e-06, + "loss": 0.89515841, + "num_input_tokens_seen": 8506515, + "step": 407, + "time_per_iteration": 2.5982017517089844 + }, + { + "auxiliary_loss_clip": 0.01298266, + "auxiliary_loss_mlp": 0.01083858, + "balance_loss_clip": 1.08472157, + "balance_loss_mlp": 1.04630709, + "epoch": 0.02453028708853149, + "flos": 33948116380800.0, + "grad_norm": 2.1458430439144234, + "language_loss": 0.74102569, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.76484692, + "num_input_tokens_seen": 8528035, + "step": 408, + "time_per_iteration": 2.708670139312744 + }, + { + "auxiliary_loss_clip": 0.01300128, + "auxiliary_loss_mlp": 0.01089985, + "balance_loss_clip": 1.08222318, + "balance_loss_mlp": 1.05233896, + "epoch": 0.024590410341199458, + "flos": 21793000872960.0, + "grad_norm": 2.4878473813549675, + "language_loss": 0.92509401, + "learning_rate": 3.871943634189376e-06, + "loss": 0.94899511, + "num_input_tokens_seen": 8546455, + "step": 409, + "time_per_iteration": 2.665321111679077 + }, + { + "auxiliary_loss_clip": 0.01296394, + "auxiliary_loss_mlp": 0.01077538, + "balance_loss_clip": 1.08126342, + "balance_loss_mlp": 1.04291987, + "epoch": 0.02465053359386743, + "flos": 35114782124160.0, + "grad_norm": 2.2521095969191722, + "language_loss": 0.82792604, + "learning_rate": 3.873515923575128e-06, + "loss": 0.85166532, + "num_input_tokens_seen": 8568450, + "step": 410, + "time_per_iteration": 2.848928213119507 + }, + { + "auxiliary_loss_clip": 0.01299459, + "auxiliary_loss_mlp": 0.01089133, + "balance_loss_clip": 1.08187068, + "balance_loss_mlp": 1.05284572, + "epoch": 0.0247106568465354, + "flos": 27451409333760.0, + "grad_norm": 2.1393760271628595, + "language_loss": 0.77577484, + "learning_rate": 3.875084382775879e-06, + "loss": 0.79966074, + "num_input_tokens_seen": 8589340, + "step": 411, + "time_per_iteration": 2.6645278930664062 + }, + { + "auxiliary_loss_clip": 0.01298341, + "auxiliary_loss_mlp": 0.0110154, + "balance_loss_clip": 1.07977521, + "balance_loss_mlp": 1.06289268, + "epoch": 0.024770780099203367, + "flos": 20703794808960.0, + "grad_norm": 2.2974658872162665, + "language_loss": 0.86379063, + "learning_rate": 3.87664903040738e-06, + "loss": 0.88778943, + "num_input_tokens_seen": 8607150, + "step": 412, + "time_per_iteration": 2.6091151237487793 + }, + { + "auxiliary_loss_clip": 0.01187014, + "auxiliary_loss_mlp": 0.01031436, + "balance_loss_clip": 1.07387948, + "balance_loss_mlp": 1.02089787, + "epoch": 0.024830903351871336, + "flos": 69551859369600.0, + "grad_norm": 0.8687159185244209, + "language_loss": 0.5852263, + "learning_rate": 3.878209884949994e-06, + "loss": 0.60741079, + "num_input_tokens_seen": 8669865, + "step": 413, + "time_per_iteration": 3.2269625663757324 + }, + { + "auxiliary_loss_clip": 0.0129043, + "auxiliary_loss_mlp": 0.01091958, + "balance_loss_clip": 1.07709181, + "balance_loss_mlp": 1.05249953, + "epoch": 0.024891026604539304, + "flos": 32270477713920.0, + "grad_norm": 1.8280666153990437, + "language_loss": 0.80517173, + "learning_rate": 3.879766964750006e-06, + "loss": 0.82899559, + "num_input_tokens_seen": 8690235, + "step": 414, + "time_per_iteration": 2.720341444015503 + }, + { + "auxiliary_loss_clip": 0.01287097, + "auxiliary_loss_mlp": 0.0109242, + "balance_loss_clip": 1.0756042, + "balance_loss_mlp": 1.0556556, + "epoch": 0.024951149857207276, + "flos": 18840282238080.0, + "grad_norm": 2.1921003994701302, + "language_loss": 0.80227423, + "learning_rate": 3.881320288020917e-06, + "loss": 0.82606936, + "num_input_tokens_seen": 8706295, + "step": 415, + "time_per_iteration": 2.6473400592803955 + }, + { + "auxiliary_loss_clip": 0.01302694, + "auxiliary_loss_mlp": 0.01082455, + "balance_loss_clip": 1.08156919, + "balance_loss_mlp": 1.04497528, + "epoch": 0.025011273109875245, + "flos": 15377201210880.0, + "grad_norm": 2.9318871737289776, + "language_loss": 0.96236515, + "learning_rate": 3.882869872844723e-06, + "loss": 0.9862166, + "num_input_tokens_seen": 8724200, + "step": 416, + "time_per_iteration": 2.596189260482788 + }, + { + "auxiliary_loss_clip": 0.01291636, + "auxiliary_loss_mlp": 0.01074465, + "balance_loss_clip": 1.07628798, + "balance_loss_mlp": 1.0355792, + "epoch": 0.025071396362543213, + "flos": 18915515274240.0, + "grad_norm": 1.741746736079687, + "language_loss": 0.77381694, + "learning_rate": 3.884415737173176e-06, + "loss": 0.79747796, + "num_input_tokens_seen": 8744170, + "step": 417, + "time_per_iteration": 5.610344171524048 + }, + { + "auxiliary_loss_clip": 0.01290746, + "auxiliary_loss_mlp": 0.0109022, + "balance_loss_clip": 1.08072221, + "balance_loss_mlp": 1.05264485, + "epoch": 0.025131519615211182, + "flos": 25337958952320.0, + "grad_norm": 1.554385639735456, + "language_loss": 0.77076226, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.79457194, + "num_input_tokens_seen": 8765120, + "step": 418, + "time_per_iteration": 5.837290525436401 + }, + { + "auxiliary_loss_clip": 0.01297026, + "auxiliary_loss_mlp": 0.01071197, + "balance_loss_clip": 1.08019948, + "balance_loss_mlp": 1.03550553, + "epoch": 0.02519164286787915, + "flos": 18953149749120.0, + "grad_norm": 2.4603268634516207, + "language_loss": 0.81445098, + "learning_rate": 3.887496375507294e-06, + "loss": 0.83813322, + "num_input_tokens_seen": 8783500, + "step": 419, + "time_per_iteration": 2.582590341567993 + }, + { + "auxiliary_loss_clip": 0.01291114, + "auxiliary_loss_mlp": 0.01086736, + "balance_loss_clip": 1.07929599, + "balance_loss_mlp": 1.04708743, + "epoch": 0.025251766120547123, + "flos": 17421092904960.0, + "grad_norm": 1.8078532084212713, + "language_loss": 0.73618573, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.75996423, + "num_input_tokens_seen": 8801175, + "step": 420, + "time_per_iteration": 2.6739418506622314 + }, + { + "auxiliary_loss_clip": 0.01290485, + "auxiliary_loss_mlp": 0.01096292, + "balance_loss_clip": 1.07605243, + "balance_loss_mlp": 1.05924153, + "epoch": 0.02531188937321509, + "flos": 25045430590080.0, + "grad_norm": 1.77336014903074, + "language_loss": 0.79040134, + "learning_rate": 3.890562344079484e-06, + "loss": 0.81426907, + "num_input_tokens_seen": 8820215, + "step": 421, + "time_per_iteration": 2.6928632259368896 + }, + { + "auxiliary_loss_clip": 0.01290689, + "auxiliary_loss_mlp": 0.01088863, + "balance_loss_clip": 1.07922924, + "balance_loss_mlp": 1.04983425, + "epoch": 0.02537201262588306, + "flos": 30592228515840.0, + "grad_norm": 2.2139016136437104, + "language_loss": 0.8203755, + "learning_rate": 3.89208987073549e-06, + "loss": 0.84417105, + "num_input_tokens_seen": 8839660, + "step": 422, + "time_per_iteration": 2.714707851409912 + }, + { + "auxiliary_loss_clip": 0.01293659, + "auxiliary_loss_mlp": 0.01078975, + "balance_loss_clip": 1.07677865, + "balance_loss_mlp": 1.04430926, + "epoch": 0.02543213587855103, + "flos": 26065365275520.0, + "grad_norm": 2.1259138778576356, + "language_loss": 0.83458018, + "learning_rate": 3.893613781940409e-06, + "loss": 0.85830647, + "num_input_tokens_seen": 8859280, + "step": 423, + "time_per_iteration": 2.652757167816162 + }, + { + "auxiliary_loss_clip": 0.01287497, + "auxiliary_loss_mlp": 0.01078335, + "balance_loss_clip": 1.0742569, + "balance_loss_mlp": 1.04221487, + "epoch": 0.025492259131218997, + "flos": 36022818965760.0, + "grad_norm": 2.012741083661608, + "language_loss": 0.74129444, + "learning_rate": 3.895134094768415e-06, + "loss": 0.76495278, + "num_input_tokens_seen": 8880560, + "step": 424, + "time_per_iteration": 2.7724521160125732 + }, + { + "auxiliary_loss_clip": 0.01296446, + "auxiliary_loss_mlp": 0.01093799, + "balance_loss_clip": 1.07987142, + "balance_loss_mlp": 1.05782199, + "epoch": 0.02555238238388697, + "flos": 18588045957120.0, + "grad_norm": 4.623670538116741, + "language_loss": 0.83193713, + "learning_rate": 3.896650826173015e-06, + "loss": 0.85583955, + "num_input_tokens_seen": 8899155, + "step": 425, + "time_per_iteration": 2.608029842376709 + }, + { + "auxiliary_loss_clip": 0.01292462, + "auxiliary_loss_mlp": 0.01092376, + "balance_loss_clip": 1.07259536, + "balance_loss_mlp": 1.0544672, + "epoch": 0.025612505636554938, + "flos": 24243186280320.0, + "grad_norm": 2.5075767706443566, + "language_loss": 0.853073, + "learning_rate": 3.898163992988186e-06, + "loss": 0.87692136, + "num_input_tokens_seen": 8917890, + "step": 426, + "time_per_iteration": 2.6445271968841553 + }, + { + "auxiliary_loss_clip": 0.01175923, + "auxiliary_loss_mlp": 0.01017688, + "balance_loss_clip": 1.06532824, + "balance_loss_mlp": 1.00781715, + "epoch": 0.025672628889222907, + "flos": 60586941265920.0, + "grad_norm": 0.8949637292547264, + "language_loss": 0.57219732, + "learning_rate": 3.899673611929491e-06, + "loss": 0.5941335, + "num_input_tokens_seen": 8978260, + "step": 427, + "time_per_iteration": 3.2690517902374268 + }, + { + "auxiliary_loss_clip": 0.01291989, + "auxiliary_loss_mlp": 0.01092649, + "balance_loss_clip": 1.08155811, + "balance_loss_mlp": 1.05674267, + "epoch": 0.025732752141890875, + "flos": 19573255169280.0, + "grad_norm": 2.4869215225306673, + "language_loss": 0.88130605, + "learning_rate": 3.901179699595194e-06, + "loss": 0.90515244, + "num_input_tokens_seen": 8994460, + "step": 428, + "time_per_iteration": 2.6143813133239746 + }, + { + "auxiliary_loss_clip": 0.01283603, + "auxiliary_loss_mlp": 0.0107531, + "balance_loss_clip": 1.07418942, + "balance_loss_mlp": 1.03735399, + "epoch": 0.025792875394558847, + "flos": 31284262920960.0, + "grad_norm": 2.067247304638145, + "language_loss": 0.85790849, + "learning_rate": 3.902682272467353e-06, + "loss": 0.88149762, + "num_input_tokens_seen": 9016670, + "step": 429, + "time_per_iteration": 2.749328374862671 + }, + { + "auxiliary_loss_clip": 0.01288943, + "auxiliary_loss_mlp": 0.01083888, + "balance_loss_clip": 1.07337689, + "balance_loss_mlp": 1.04590786, + "epoch": 0.025852998647226816, + "flos": 32379610210560.0, + "grad_norm": 2.4411876712444034, + "language_loss": 0.8815223, + "learning_rate": 3.904181346912895e-06, + "loss": 0.90525061, + "num_input_tokens_seen": 9039720, + "step": 430, + "time_per_iteration": 2.7483572959899902 + }, + { + "auxiliary_loss_clip": 0.01290726, + "auxiliary_loss_mlp": 0.01080495, + "balance_loss_clip": 1.0803287, + "balance_loss_mlp": 1.04573333, + "epoch": 0.025913121899894784, + "flos": 20193288762240.0, + "grad_norm": 2.086180078538185, + "language_loss": 0.84249514, + "learning_rate": 3.905676939184698e-06, + "loss": 0.8662073, + "num_input_tokens_seen": 9059850, + "step": 431, + "time_per_iteration": 2.6531126499176025 + }, + { + "auxiliary_loss_clip": 0.01286945, + "auxiliary_loss_mlp": 0.01073345, + "balance_loss_clip": 1.07570636, + "balance_loss_mlp": 1.03951311, + "epoch": 0.025973245152562753, + "flos": 14720430983040.0, + "grad_norm": 2.681931959502968, + "language_loss": 0.86511916, + "learning_rate": 3.907169065422638e-06, + "loss": 0.88872206, + "num_input_tokens_seen": 9077590, + "step": 432, + "time_per_iteration": 2.7582762241363525 + }, + { + "auxiliary_loss_clip": 0.01287429, + "auxiliary_loss_mlp": 0.01072961, + "balance_loss_clip": 1.07632601, + "balance_loss_mlp": 1.03891492, + "epoch": 0.02603336840523072, + "flos": 30992991534720.0, + "grad_norm": 1.95596969308187, + "language_loss": 0.76036298, + "learning_rate": 3.908657741654636e-06, + "loss": 0.7839669, + "num_input_tokens_seen": 9099880, + "step": 433, + "time_per_iteration": 2.707771062850952 + }, + { + "auxiliary_loss_clip": 0.01289436, + "auxiliary_loss_mlp": 0.01088504, + "balance_loss_clip": 1.07470191, + "balance_loss_mlp": 1.04973757, + "epoch": 0.026093491657898694, + "flos": 17674262939520.0, + "grad_norm": 2.157056093147959, + "language_loss": 0.8979522, + "learning_rate": 3.910142983797699e-06, + "loss": 0.92173159, + "num_input_tokens_seen": 9118620, + "step": 434, + "time_per_iteration": 2.5665409564971924 + }, + { + "auxiliary_loss_clip": 0.01289617, + "auxiliary_loss_mlp": 0.01096405, + "balance_loss_clip": 1.07960439, + "balance_loss_mlp": 1.05904448, + "epoch": 0.026153614910566662, + "flos": 17857874286720.0, + "grad_norm": 2.306071945033866, + "language_loss": 0.80187833, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.82573849, + "num_input_tokens_seen": 9135655, + "step": 435, + "time_per_iteration": 2.614440679550171 + }, + { + "auxiliary_loss_clip": 0.01285396, + "auxiliary_loss_mlp": 0.01092207, + "balance_loss_clip": 1.07367229, + "balance_loss_mlp": 1.05503798, + "epoch": 0.02621373816323463, + "flos": 20011113959040.0, + "grad_norm": 3.0257040949539356, + "language_loss": 0.86361396, + "learning_rate": 3.913103228936546e-06, + "loss": 0.88739002, + "num_input_tokens_seen": 9153520, + "step": 436, + "time_per_iteration": 2.635033130645752 + }, + { + "auxiliary_loss_clip": 0.01289558, + "auxiliary_loss_mlp": 0.01096903, + "balance_loss_clip": 1.07716811, + "balance_loss_mlp": 1.06080687, + "epoch": 0.0262738614159026, + "flos": 19281193683840.0, + "grad_norm": 2.4233286399217993, + "language_loss": 0.74725163, + "learning_rate": 3.914578263220868e-06, + "loss": 0.77111626, + "num_input_tokens_seen": 9170750, + "step": 437, + "time_per_iteration": 2.6614880561828613 + }, + { + "auxiliary_loss_clip": 0.01286403, + "auxiliary_loss_mlp": 0.01100399, + "balance_loss_clip": 1.07628679, + "balance_loss_mlp": 1.06220388, + "epoch": 0.026333984668570568, + "flos": 18807208790400.0, + "grad_norm": 2.79370908187484, + "language_loss": 0.9131338, + "learning_rate": 3.916049925995316e-06, + "loss": 0.93700182, + "num_input_tokens_seen": 9188430, + "step": 438, + "time_per_iteration": 2.674877166748047 + }, + { + "auxiliary_loss_clip": 0.01169678, + "auxiliary_loss_mlp": 0.01072518, + "balance_loss_clip": 1.0602653, + "balance_loss_mlp": 1.06250465, + "epoch": 0.02639410792123854, + "flos": 64572020691840.0, + "grad_norm": 0.8871275810137318, + "language_loss": 0.62631273, + "learning_rate": 3.917518232637377e-06, + "loss": 0.64873469, + "num_input_tokens_seen": 9255835, + "step": 439, + "time_per_iteration": 3.2527849674224854 + }, + { + "auxiliary_loss_clip": 0.01296492, + "auxiliary_loss_mlp": 0.01095184, + "balance_loss_clip": 1.08175814, + "balance_loss_mlp": 1.05758572, + "epoch": 0.02645423117390651, + "flos": 28473462921600.0, + "grad_norm": 3.31985956061953, + "language_loss": 0.75982475, + "learning_rate": 3.918983198419573e-06, + "loss": 0.78374153, + "num_input_tokens_seen": 9276835, + "step": 440, + "time_per_iteration": 2.6770262718200684 + }, + { + "auxiliary_loss_clip": 0.01286342, + "auxiliary_loss_mlp": 0.01076505, + "balance_loss_clip": 1.07652593, + "balance_loss_mlp": 1.04048026, + "epoch": 0.026514354426574478, + "flos": 18551237495040.0, + "grad_norm": 3.0236705091068283, + "language_loss": 0.83197021, + "learning_rate": 3.920444838510415e-06, + "loss": 0.85559869, + "num_input_tokens_seen": 9295075, + "step": 441, + "time_per_iteration": 2.591306209564209 + }, + { + "auxiliary_loss_clip": 0.01291817, + "auxiliary_loss_mlp": 0.01086154, + "balance_loss_clip": 1.07703269, + "balance_loss_mlp": 1.04829359, + "epoch": 0.026574477679242446, + "flos": 20667812359680.0, + "grad_norm": 2.202684635319811, + "language_loss": 0.78490162, + "learning_rate": 3.92190316797534e-06, + "loss": 0.80868137, + "num_input_tokens_seen": 9314205, + "step": 442, + "time_per_iteration": 2.633054733276367 + }, + { + "auxiliary_loss_clip": 0.0116251, + "auxiliary_loss_mlp": 0.01015158, + "balance_loss_clip": 1.05336332, + "balance_loss_mlp": 1.0054301, + "epoch": 0.026634600931910415, + "flos": 57956125340160.0, + "grad_norm": 0.9609264438471399, + "language_loss": 0.64459753, + "learning_rate": 3.92335820177765e-06, + "loss": 0.66637421, + "num_input_tokens_seen": 9367395, + "step": 443, + "time_per_iteration": 3.1241400241851807 + }, + { + "auxiliary_loss_clip": 0.01291897, + "auxiliary_loss_mlp": 0.01085882, + "balance_loss_clip": 1.08147204, + "balance_loss_mlp": 1.04906964, + "epoch": 0.026694724184578387, + "flos": 15815131827840.0, + "grad_norm": 2.121488874389134, + "language_loss": 0.82093638, + "learning_rate": 3.924809954779425e-06, + "loss": 0.84471416, + "num_input_tokens_seen": 9385185, + "step": 444, + "time_per_iteration": 2.6202428340911865 + }, + { + "auxiliary_loss_clip": 0.0129406, + "auxiliary_loss_mlp": 0.01082041, + "balance_loss_clip": 1.07940578, + "balance_loss_mlp": 1.04263067, + "epoch": 0.026754847437246355, + "flos": 23440259612160.0, + "grad_norm": 2.2213674770888607, + "language_loss": 0.95689106, + "learning_rate": 3.9262584417424425e-06, + "loss": 0.98065209, + "num_input_tokens_seen": 9403225, + "step": 445, + "time_per_iteration": 2.6071228981018066 + }, + { + "auxiliary_loss_clip": 0.01289866, + "auxiliary_loss_mlp": 0.01094053, + "balance_loss_clip": 1.07953668, + "balance_loss_mlp": 1.05492878, + "epoch": 0.026814970689914324, + "flos": 17341801632000.0, + "grad_norm": 2.775359545549618, + "language_loss": 0.91932094, + "learning_rate": 3.9277036773290725e-06, + "loss": 0.94316012, + "num_input_tokens_seen": 9420540, + "step": 446, + "time_per_iteration": 2.5791916847229004 + }, + { + "auxiliary_loss_clip": 0.01289847, + "auxiliary_loss_mlp": 0.01088114, + "balance_loss_clip": 1.08072042, + "balance_loss_mlp": 1.05092025, + "epoch": 0.026875093942582293, + "flos": 17894718662400.0, + "grad_norm": 2.0562763127679204, + "language_loss": 0.79831308, + "learning_rate": 3.92914567610317e-06, + "loss": 0.82209271, + "num_input_tokens_seen": 9438840, + "step": 447, + "time_per_iteration": 2.6420843601226807 + }, + { + "auxiliary_loss_clip": 0.01289397, + "auxiliary_loss_mlp": 0.01079607, + "balance_loss_clip": 1.07901013, + "balance_loss_mlp": 1.04446411, + "epoch": 0.026935217195250265, + "flos": 21723980889600.0, + "grad_norm": 2.231264914467203, + "language_loss": 0.86402845, + "learning_rate": 3.930584452530952e-06, + "loss": 0.8877185, + "num_input_tokens_seen": 9457215, + "step": 448, + "time_per_iteration": 2.590277910232544 + }, + { + "auxiliary_loss_clip": 0.01282455, + "auxiliary_loss_mlp": 0.01091099, + "balance_loss_clip": 1.07706833, + "balance_loss_mlp": 1.05662322, + "epoch": 0.026995340447918233, + "flos": 23622685810560.0, + "grad_norm": 1.941778256808524, + "language_loss": 0.88581634, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.90955186, + "num_input_tokens_seen": 9475615, + "step": 449, + "time_per_iteration": 2.610065460205078 + }, + { + "auxiliary_loss_clip": 0.01293472, + "auxiliary_loss_mlp": 0.01085576, + "balance_loss_clip": 1.07856452, + "balance_loss_mlp": 1.04814398, + "epoch": 0.027055463700586202, + "flos": 17931275729280.0, + "grad_norm": 2.199007921978797, + "language_loss": 0.80395782, + "learning_rate": 3.933452395729493e-06, + "loss": 0.8277483, + "num_input_tokens_seen": 9493975, + "step": 450, + "time_per_iteration": 2.637465238571167 + }, + { + "auxiliary_loss_clip": 0.01284612, + "auxiliary_loss_mlp": 0.0108001, + "balance_loss_clip": 1.08025336, + "balance_loss_mlp": 1.04384232, + "epoch": 0.02711558695325417, + "flos": 25118903859840.0, + "grad_norm": 1.599374223212879, + "language_loss": 0.81562543, + "learning_rate": 3.934881590952304e-06, + "loss": 0.83927161, + "num_input_tokens_seen": 9514810, + "step": 451, + "time_per_iteration": 2.6506927013397217 + }, + { + "auxiliary_loss_clip": 0.0128567, + "auxiliary_loss_mlp": 0.01090719, + "balance_loss_clip": 1.08126068, + "balance_loss_mlp": 1.0533824, + "epoch": 0.02717571020592214, + "flos": 24239559006720.0, + "grad_norm": 1.9677929562692107, + "language_loss": 0.77019048, + "learning_rate": 3.936307620734599e-06, + "loss": 0.79395437, + "num_input_tokens_seen": 9533635, + "step": 452, + "time_per_iteration": 2.5751442909240723 + }, + { + "auxiliary_loss_clip": 0.01286865, + "auxiliary_loss_mlp": 0.01088287, + "balance_loss_clip": 1.08011293, + "balance_loss_mlp": 1.05135596, + "epoch": 0.02723583345859011, + "flos": 25118939773440.0, + "grad_norm": 1.7205362750177517, + "language_loss": 0.72874546, + "learning_rate": 3.937730499067294e-06, + "loss": 0.75249696, + "num_input_tokens_seen": 9555420, + "step": 453, + "time_per_iteration": 2.668083667755127 + }, + { + "auxiliary_loss_clip": 0.01281405, + "auxiliary_loss_mlp": 0.01083223, + "balance_loss_clip": 1.07714963, + "balance_loss_mlp": 1.04748416, + "epoch": 0.02729595671125808, + "flos": 42741597847680.0, + "grad_norm": 1.8353680194819204, + "language_loss": 0.82419729, + "learning_rate": 3.939150239848748e-06, + "loss": 0.84784359, + "num_input_tokens_seen": 9578950, + "step": 454, + "time_per_iteration": 2.8580126762390137 + }, + { + "auxiliary_loss_clip": 0.01285525, + "auxiliary_loss_mlp": 0.01077241, + "balance_loss_clip": 1.07935429, + "balance_loss_mlp": 1.043648, + "epoch": 0.02735607996392605, + "flos": 21430985650560.0, + "grad_norm": 1.985829769195046, + "language_loss": 0.75404847, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.77767611, + "num_input_tokens_seen": 9598160, + "step": 455, + "time_per_iteration": 2.6593477725982666 + }, + { + "auxiliary_loss_clip": 0.01282853, + "auxiliary_loss_mlp": 0.01094959, + "balance_loss_clip": 1.07477236, + "balance_loss_mlp": 1.0597918, + "epoch": 0.027416203216594017, + "flos": 20851280052480.0, + "grad_norm": 1.92483069519606, + "language_loss": 0.80670613, + "learning_rate": 3.941980363893499e-06, + "loss": 0.83048427, + "num_input_tokens_seen": 9616010, + "step": 456, + "time_per_iteration": 2.6798384189605713 + }, + { + "auxiliary_loss_clip": 0.01280135, + "auxiliary_loss_mlp": 0.01080319, + "balance_loss_clip": 1.07714963, + "balance_loss_mlp": 1.0435549, + "epoch": 0.027476326469261986, + "flos": 13224500242560.0, + "grad_norm": 2.171481572134165, + "language_loss": 0.81587321, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.83947778, + "num_input_tokens_seen": 9634000, + "step": 457, + "time_per_iteration": 5.62308406829834 + }, + { + "auxiliary_loss_clip": 0.01283922, + "auxiliary_loss_mlp": 0.01084055, + "balance_loss_clip": 1.07603848, + "balance_loss_mlp": 1.04891229, + "epoch": 0.027536449721929958, + "flos": 24024526237440.0, + "grad_norm": 2.024184269172234, + "language_loss": 0.94030929, + "learning_rate": 3.944798102235412e-06, + "loss": 0.96398914, + "num_input_tokens_seen": 9653455, + "step": 458, + "time_per_iteration": 5.694372653961182 + }, + { + "auxiliary_loss_clip": 0.01280807, + "auxiliary_loss_mlp": 0.01091426, + "balance_loss_clip": 1.07479525, + "balance_loss_mlp": 1.05666471, + "epoch": 0.027596572974597926, + "flos": 13006055681280.0, + "grad_norm": 2.356061876390436, + "language_loss": 0.79279089, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.81651318, + "num_input_tokens_seen": 9669650, + "step": 459, + "time_per_iteration": 2.626948595046997 + }, + { + "auxiliary_loss_clip": 0.01286253, + "auxiliary_loss_mlp": 0.01081623, + "balance_loss_clip": 1.08119941, + "balance_loss_mlp": 1.04278445, + "epoch": 0.027656696227265895, + "flos": 26143076350080.0, + "grad_norm": 2.0583603779546404, + "language_loss": 0.83362132, + "learning_rate": 3.947603562811407e-06, + "loss": 0.85730016, + "num_input_tokens_seen": 9691415, + "step": 460, + "time_per_iteration": 2.7191598415374756 + }, + { + "auxiliary_loss_clip": 0.01158037, + "auxiliary_loss_mlp": 0.01054463, + "balance_loss_clip": 1.05032754, + "balance_loss_mlp": 1.044402, + "epoch": 0.027716819479933864, + "flos": 60697222997760.0, + "grad_norm": 1.612511499168885, + "language_loss": 0.7351321, + "learning_rate": 3.949001722282675e-06, + "loss": 0.7572571, + "num_input_tokens_seen": 9755605, + "step": 461, + "time_per_iteration": 3.210820436477661 + }, + { + "auxiliary_loss_clip": 0.01284234, + "auxiliary_loss_mlp": 0.01079832, + "balance_loss_clip": 1.08432341, + "balance_loss_mlp": 1.04700136, + "epoch": 0.027776942732601832, + "flos": 31211938886400.0, + "grad_norm": 2.4500038571081073, + "language_loss": 0.81596625, + "learning_rate": 3.950396852153582e-06, + "loss": 0.839607, + "num_input_tokens_seen": 9776270, + "step": 462, + "time_per_iteration": 2.683197021484375 + }, + { + "auxiliary_loss_clip": 0.01280414, + "auxiliary_loss_mlp": 0.0107864, + "balance_loss_clip": 1.07752454, + "balance_loss_mlp": 1.0454762, + "epoch": 0.027837065985269804, + "flos": 22674644196480.0, + "grad_norm": 2.258526594266715, + "language_loss": 0.90062451, + "learning_rate": 3.951788965525118e-06, + "loss": 0.92421508, + "num_input_tokens_seen": 9794465, + "step": 463, + "time_per_iteration": 2.641674757003784 + }, + { + "auxiliary_loss_clip": 0.01151842, + "auxiliary_loss_mlp": 0.01010002, + "balance_loss_clip": 1.04755902, + "balance_loss_mlp": 1.00027454, + "epoch": 0.027897189237937773, + "flos": 62182487399040.0, + "grad_norm": 0.8962796480673014, + "language_loss": 0.59058654, + "learning_rate": 3.953178075413476e-06, + "loss": 0.61220491, + "num_input_tokens_seen": 9849685, + "step": 464, + "time_per_iteration": 3.1129612922668457 + }, + { + "auxiliary_loss_clip": 0.01292933, + "auxiliary_loss_mlp": 0.01100533, + "balance_loss_clip": 1.08296049, + "balance_loss_mlp": 1.06412649, + "epoch": 0.02795731249060574, + "flos": 24493160004480.0, + "grad_norm": 2.3712654859298055, + "language_loss": 0.81454253, + "learning_rate": 3.954564194750784e-06, + "loss": 0.83847719, + "num_input_tokens_seen": 9869505, + "step": 465, + "time_per_iteration": 2.723144769668579 + }, + { + "auxiliary_loss_clip": 0.01279938, + "auxiliary_loss_mlp": 0.01092668, + "balance_loss_clip": 1.07546401, + "balance_loss_mlp": 1.05630863, + "epoch": 0.02801743574327371, + "flos": 23733003456000.0, + "grad_norm": 1.9968224423519798, + "language_loss": 0.78396618, + "learning_rate": 3.955947336385828e-06, + "loss": 0.80769229, + "num_input_tokens_seen": 9890950, + "step": 466, + "time_per_iteration": 2.6278555393218994 + }, + { + "auxiliary_loss_clip": 0.0127853, + "auxiliary_loss_mlp": 0.01091802, + "balance_loss_clip": 1.07703936, + "balance_loss_mlp": 1.05661178, + "epoch": 0.02807755899594168, + "flos": 20629100476800.0, + "grad_norm": 2.010021605622182, + "language_loss": 0.87699366, + "learning_rate": 3.957327513084761e-06, + "loss": 0.90069699, + "num_input_tokens_seen": 9911265, + "step": 467, + "time_per_iteration": 2.6687490940093994 + }, + { + "auxiliary_loss_clip": 0.01285129, + "auxiliary_loss_mlp": 0.01112935, + "balance_loss_clip": 1.07874036, + "balance_loss_mlp": 1.07576585, + "epoch": 0.02813768224860965, + "flos": 19244564789760.0, + "grad_norm": 2.2302958424490416, + "language_loss": 0.86091757, + "learning_rate": 3.958704737531818e-06, + "loss": 0.88489819, + "num_input_tokens_seen": 9929025, + "step": 468, + "time_per_iteration": 2.5745644569396973 + }, + { + "auxiliary_loss_clip": 0.01281128, + "auxiliary_loss_mlp": 0.01085455, + "balance_loss_clip": 1.07529211, + "balance_loss_mlp": 1.04857147, + "epoch": 0.02819780550127762, + "flos": 20813968800000.0, + "grad_norm": 2.1866562002509875, + "language_loss": 0.91690558, + "learning_rate": 3.9600790223300065e-06, + "loss": 0.94057143, + "num_input_tokens_seen": 9945190, + "step": 469, + "time_per_iteration": 2.610821008682251 + }, + { + "auxiliary_loss_clip": 0.0127909, + "auxiliary_loss_mlp": 0.0110095, + "balance_loss_clip": 1.07675052, + "balance_loss_mlp": 1.06482995, + "epoch": 0.028257928753945588, + "flos": 19974125928960.0, + "grad_norm": 2.674428223667968, + "language_loss": 0.81758964, + "learning_rate": 3.96145038000181e-06, + "loss": 0.84139001, + "num_input_tokens_seen": 9962820, + "step": 470, + "time_per_iteration": 2.6004326343536377 + }, + { + "auxiliary_loss_clip": 0.0128074, + "auxiliary_loss_mlp": 0.01086643, + "balance_loss_clip": 1.07482624, + "balance_loss_mlp": 1.04947352, + "epoch": 0.028318052006613557, + "flos": 20484488321280.0, + "grad_norm": 1.788793606991614, + "language_loss": 0.93071401, + "learning_rate": 3.962818822989861e-06, + "loss": 0.95438784, + "num_input_tokens_seen": 9982595, + "step": 471, + "time_per_iteration": 2.556288719177246 + }, + { + "auxiliary_loss_clip": 0.01273697, + "auxiliary_loss_mlp": 0.0110454, + "balance_loss_clip": 1.07223165, + "balance_loss_mlp": 1.06884849, + "epoch": 0.02837817525928153, + "flos": 28514832410880.0, + "grad_norm": 1.8550872135639116, + "language_loss": 0.7613501, + "learning_rate": 3.964184363657625e-06, + "loss": 0.78513247, + "num_input_tokens_seen": 10004645, + "step": 472, + "time_per_iteration": 2.667804002761841 + }, + { + "auxiliary_loss_clip": 0.01280341, + "auxiliary_loss_mlp": 0.01090649, + "balance_loss_clip": 1.07279634, + "balance_loss_mlp": 1.05624473, + "epoch": 0.028438298511949497, + "flos": 18551668458240.0, + "grad_norm": 1.9914661475951314, + "language_loss": 0.93097353, + "learning_rate": 3.965547014290071e-06, + "loss": 0.95468336, + "num_input_tokens_seen": 10022555, + "step": 473, + "time_per_iteration": 2.6402342319488525 + }, + { + "auxiliary_loss_clip": 0.01287339, + "auxiliary_loss_mlp": 0.01124194, + "balance_loss_clip": 1.07773685, + "balance_loss_mlp": 1.08979011, + "epoch": 0.028498421764617466, + "flos": 16910227722240.0, + "grad_norm": 3.2560638787193237, + "language_loss": 0.88488632, + "learning_rate": 3.96690678709433e-06, + "loss": 0.90900171, + "num_input_tokens_seen": 10041025, + "step": 474, + "time_per_iteration": 2.5853888988494873 + }, + { + "auxiliary_loss_clip": 0.0127783, + "auxiliary_loss_mlp": 0.01093132, + "balance_loss_clip": 1.07535374, + "balance_loss_mlp": 1.05620146, + "epoch": 0.028558545017285435, + "flos": 27778699082880.0, + "grad_norm": 3.1427023167402006, + "language_loss": 0.78901398, + "learning_rate": 3.968263694200355e-06, + "loss": 0.81272364, + "num_input_tokens_seen": 10060775, + "step": 475, + "time_per_iteration": 2.654519557952881 + }, + { + "auxiliary_loss_clip": 0.01148107, + "auxiliary_loss_mlp": 0.01095224, + "balance_loss_clip": 1.04505777, + "balance_loss_mlp": 1.08583021, + "epoch": 0.028618668269953403, + "flos": 65654367258240.0, + "grad_norm": 0.9280065830162254, + "language_loss": 0.66926932, + "learning_rate": 3.969617747661569e-06, + "loss": 0.6917026, + "num_input_tokens_seen": 10120225, + "step": 476, + "time_per_iteration": 3.1292569637298584 + }, + { + "auxiliary_loss_clip": 0.01279748, + "auxiliary_loss_mlp": 0.01088794, + "balance_loss_clip": 1.07638311, + "balance_loss_mlp": 1.05188656, + "epoch": 0.028678791522621375, + "flos": 21937074324480.0, + "grad_norm": 2.985672001195028, + "language_loss": 0.83807188, + "learning_rate": 3.970968959455509e-06, + "loss": 0.86175728, + "num_input_tokens_seen": 10137880, + "step": 477, + "time_per_iteration": 2.651493549346924 + }, + { + "auxiliary_loss_clip": 0.01284956, + "auxiliary_loss_mlp": 0.0108711, + "balance_loss_clip": 1.07924342, + "balance_loss_mlp": 1.05089426, + "epoch": 0.028738914775289344, + "flos": 24572128055040.0, + "grad_norm": 2.1929055744411943, + "language_loss": 0.8233152, + "learning_rate": 3.97231734148446e-06, + "loss": 0.84703588, + "num_input_tokens_seen": 10156930, + "step": 478, + "time_per_iteration": 2.6986753940582275 + }, + { + "auxiliary_loss_clip": 0.01277687, + "auxiliary_loss_mlp": 0.01080644, + "balance_loss_clip": 1.07448888, + "balance_loss_mlp": 1.04500043, + "epoch": 0.028799038027957313, + "flos": 23257977068160.0, + "grad_norm": 4.057107988717453, + "language_loss": 0.81195259, + "learning_rate": 3.973662905576082e-06, + "loss": 0.83553594, + "num_input_tokens_seen": 10176295, + "step": 479, + "time_per_iteration": 2.6321041584014893 + }, + { + "auxiliary_loss_clip": 0.01273765, + "auxiliary_loss_mlp": 0.01083313, + "balance_loss_clip": 1.07335579, + "balance_loss_mlp": 1.04552341, + "epoch": 0.02885916128062528, + "flos": 22164102236160.0, + "grad_norm": 2.352225573775279, + "language_loss": 0.7335608, + "learning_rate": 3.975005663484038e-06, + "loss": 0.75713164, + "num_input_tokens_seen": 10195790, + "step": 480, + "time_per_iteration": 2.650696277618408 + }, + { + "auxiliary_loss_clip": 0.01273107, + "auxiliary_loss_mlp": 0.01075586, + "balance_loss_clip": 1.07424879, + "balance_loss_mlp": 1.04277968, + "epoch": 0.02891928453329325, + "flos": 22932842135040.0, + "grad_norm": 1.867890428108999, + "language_loss": 0.87560165, + "learning_rate": 3.976345626888605e-06, + "loss": 0.89908862, + "num_input_tokens_seen": 10218405, + "step": 481, + "time_per_iteration": 2.6585533618927 + }, + { + "auxiliary_loss_clip": 0.01142103, + "auxiliary_loss_mlp": 0.01017301, + "balance_loss_clip": 1.04286921, + "balance_loss_mlp": 1.00895679, + "epoch": 0.028979407785961222, + "flos": 57432941792640.0, + "grad_norm": 0.8486437303263991, + "language_loss": 0.66030192, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.68189597, + "num_input_tokens_seen": 10271005, + "step": 482, + "time_per_iteration": 2.9788918495178223 + }, + { + "auxiliary_loss_clip": 0.01287904, + "auxiliary_loss_mlp": 0.01082416, + "balance_loss_clip": 1.07739437, + "balance_loss_mlp": 1.04868007, + "epoch": 0.02903953103862919, + "flos": 16722737706240.0, + "grad_norm": 2.6473263724689873, + "language_loss": 0.7899214, + "learning_rate": 3.979017216545415e-06, + "loss": 0.81362462, + "num_input_tokens_seen": 10288405, + "step": 483, + "time_per_iteration": 2.5642752647399902 + }, + { + "auxiliary_loss_clip": 0.01283775, + "auxiliary_loss_mlp": 0.01097438, + "balance_loss_clip": 1.07794189, + "balance_loss_mlp": 1.06155562, + "epoch": 0.02909965429129716, + "flos": 16763640318720.0, + "grad_norm": 2.6777328906555766, + "language_loss": 0.75510043, + "learning_rate": 3.980348865796749e-06, + "loss": 0.77891254, + "num_input_tokens_seen": 10306875, + "step": 484, + "time_per_iteration": 2.608337640762329 + }, + { + "auxiliary_loss_clip": 0.0127962, + "auxiliary_loss_mlp": 0.01081582, + "balance_loss_clip": 1.07543373, + "balance_loss_mlp": 1.04760778, + "epoch": 0.029159777543965128, + "flos": 19785343023360.0, + "grad_norm": 2.3457282915841113, + "language_loss": 0.8378315, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.86144352, + "num_input_tokens_seen": 10323965, + "step": 485, + "time_per_iteration": 2.591409921646118 + }, + { + "auxiliary_loss_clip": 0.01282377, + "auxiliary_loss_mlp": 0.01084922, + "balance_loss_clip": 1.08029485, + "balance_loss_mlp": 1.04956484, + "epoch": 0.029219900796633096, + "flos": 19642670202240.0, + "grad_norm": 2.044831141886674, + "language_loss": 0.84432101, + "learning_rate": 3.983003930109732e-06, + "loss": 0.86799401, + "num_input_tokens_seen": 10342620, + "step": 486, + "time_per_iteration": 2.7101452350616455 + }, + { + "auxiliary_loss_clip": 0.01276806, + "auxiliary_loss_mlp": 0.01090739, + "balance_loss_clip": 1.07363296, + "balance_loss_mlp": 1.05476189, + "epoch": 0.02928002404930107, + "flos": 25885704424320.0, + "grad_norm": 12.432525192672303, + "language_loss": 0.88968349, + "learning_rate": 3.984327367746315e-06, + "loss": 0.91335887, + "num_input_tokens_seen": 10364610, + "step": 487, + "time_per_iteration": 2.637910842895508 + }, + { + "auxiliary_loss_clip": 0.01283084, + "auxiliary_loss_mlp": 0.01069223, + "balance_loss_clip": 1.07921362, + "balance_loss_mlp": 1.03677416, + "epoch": 0.029340147301969037, + "flos": 20660234590080.0, + "grad_norm": 2.566388301054309, + "language_loss": 0.88581878, + "learning_rate": 3.985648090637122e-06, + "loss": 0.90934181, + "num_input_tokens_seen": 10380910, + "step": 488, + "time_per_iteration": 2.6569244861602783 + }, + { + "auxiliary_loss_clip": 0.01275613, + "auxiliary_loss_mlp": 0.01081415, + "balance_loss_clip": 1.07419777, + "balance_loss_mlp": 1.04667735, + "epoch": 0.029400270554637006, + "flos": 24428018689920.0, + "grad_norm": 2.0135021623582503, + "language_loss": 0.88869834, + "learning_rate": 3.986966109896785e-06, + "loss": 0.91226858, + "num_input_tokens_seen": 10400665, + "step": 489, + "time_per_iteration": 2.805555582046509 + }, + { + "auxiliary_loss_clip": 0.01271096, + "auxiliary_loss_mlp": 0.01077182, + "balance_loss_clip": 1.0704807, + "balance_loss_mlp": 1.04168141, + "epoch": 0.029460393807304974, + "flos": 20120892900480.0, + "grad_norm": 2.807428314395572, + "language_loss": 0.88554472, + "learning_rate": 3.988281436571815e-06, + "loss": 0.90902752, + "num_input_tokens_seen": 10420150, + "step": 490, + "time_per_iteration": 2.612993001937866 + }, + { + "auxiliary_loss_clip": 0.01276687, + "auxiliary_loss_mlp": 0.01088031, + "balance_loss_clip": 1.0729506, + "balance_loss_mlp": 1.0536747, + "epoch": 0.029520517059972943, + "flos": 17675914965120.0, + "grad_norm": 2.430337539839543, + "language_loss": 0.91496718, + "learning_rate": 3.989594081641164e-06, + "loss": 0.93861437, + "num_input_tokens_seen": 10438210, + "step": 491, + "time_per_iteration": 2.6203627586364746 + }, + { + "auxiliary_loss_clip": 0.01266864, + "auxiliary_loss_mlp": 0.01072939, + "balance_loss_clip": 1.07131863, + "balance_loss_mlp": 1.03984618, + "epoch": 0.029580640312640915, + "flos": 18953185662720.0, + "grad_norm": 1.9753258841331502, + "language_loss": 0.85654163, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.87993968, + "num_input_tokens_seen": 10455125, + "step": 492, + "time_per_iteration": 2.636378288269043 + }, + { + "auxiliary_loss_clip": 0.01279009, + "auxiliary_loss_mlp": 0.01100381, + "balance_loss_clip": 1.07765996, + "balance_loss_mlp": 1.06471384, + "epoch": 0.029640763565308884, + "flos": 18726121837440.0, + "grad_norm": 4.076790847855052, + "language_loss": 0.84615922, + "learning_rate": 3.992211370544093e-06, + "loss": 0.86995316, + "num_input_tokens_seen": 10470990, + "step": 493, + "time_per_iteration": 2.6144914627075195 + }, + { + "auxiliary_loss_clip": 0.01272514, + "auxiliary_loss_mlp": 0.01074657, + "balance_loss_clip": 1.07140934, + "balance_loss_mlp": 1.04042029, + "epoch": 0.029700886817976852, + "flos": 20595308757120.0, + "grad_norm": 1.8084917907335818, + "language_loss": 0.8658669, + "learning_rate": 3.99351603600268e-06, + "loss": 0.88933873, + "num_input_tokens_seen": 10490685, + "step": 494, + "time_per_iteration": 2.7063095569610596 + }, + { + "auxiliary_loss_clip": 0.01281688, + "auxiliary_loss_mlp": 0.01084428, + "balance_loss_clip": 1.07739305, + "balance_loss_mlp": 1.05279028, + "epoch": 0.02976101007064482, + "flos": 22236857233920.0, + "grad_norm": 7.125038043922513, + "language_loss": 0.86841047, + "learning_rate": 3.994818063106668e-06, + "loss": 0.8920716, + "num_input_tokens_seen": 10509435, + "step": 495, + "time_per_iteration": 2.641700267791748 + }, + { + "auxiliary_loss_clip": 0.01268945, + "auxiliary_loss_mlp": 0.01078198, + "balance_loss_clip": 1.07384837, + "balance_loss_mlp": 1.04508162, + "epoch": 0.029821133323312793, + "flos": 23732644320000.0, + "grad_norm": 2.201071528053665, + "language_loss": 0.61988759, + "learning_rate": 3.99611746250533e-06, + "loss": 0.64335901, + "num_input_tokens_seen": 10530050, + "step": 496, + "time_per_iteration": 2.6524407863616943 + }, + { + "auxiliary_loss_clip": 0.01270994, + "auxiliary_loss_mlp": 0.01089922, + "balance_loss_clip": 1.07575428, + "balance_loss_mlp": 1.05680561, + "epoch": 0.02988125657598076, + "flos": 22419498913920.0, + "grad_norm": 1.7538974268426115, + "language_loss": 0.88820887, + "learning_rate": 3.997414244783595e-06, + "loss": 0.91181797, + "num_input_tokens_seen": 10551370, + "step": 497, + "time_per_iteration": 5.648245811462402 + }, + { + "auxiliary_loss_clip": 0.01277289, + "auxiliary_loss_mlp": 0.01079642, + "balance_loss_clip": 1.07670021, + "balance_loss_mlp": 1.04604888, + "epoch": 0.02994137982864873, + "flos": 13845108453120.0, + "grad_norm": 2.8395997319333204, + "language_loss": 0.85091698, + "learning_rate": 3.998708420462557e-06, + "loss": 0.87448633, + "num_input_tokens_seen": 10569225, + "step": 498, + "time_per_iteration": 4.362173080444336 + }, + { + "auxiliary_loss_clip": 0.0127249, + "auxiliary_loss_mlp": 0.01078673, + "balance_loss_clip": 1.07436109, + "balance_loss_mlp": 1.04691589, + "epoch": 0.0300015030813167, + "flos": 23908354675200.0, + "grad_norm": 3.2275044857926605, + "language_loss": 0.77883017, + "learning_rate": 4e-06, + "loss": 0.80234182, + "num_input_tokens_seen": 10586170, + "step": 499, + "time_per_iteration": 2.6029655933380127 + }, + { + "auxiliary_loss_clip": 0.01272525, + "auxiliary_loss_mlp": 0.01082339, + "balance_loss_clip": 1.07433248, + "balance_loss_mlp": 1.04905546, + "epoch": 0.030061626333984667, + "flos": 22016796560640.0, + "grad_norm": 2.244229511477372, + "language_loss": 0.82687509, + "learning_rate": 3.9999999620799e-06, + "loss": 0.85042375, + "num_input_tokens_seen": 10606205, + "step": 500, + "time_per_iteration": 2.6293113231658936 + }, + { + "auxiliary_loss_clip": 0.01266453, + "auxiliary_loss_mlp": 0.0108458, + "balance_loss_clip": 1.07100737, + "balance_loss_mlp": 1.04922247, + "epoch": 0.03012174958665264, + "flos": 23039747988480.0, + "grad_norm": 3.2569274145363356, + "language_loss": 0.88086087, + "learning_rate": 3.9999998483196e-06, + "loss": 0.90437114, + "num_input_tokens_seen": 10625995, + "step": 501, + "time_per_iteration": 2.601081132888794 + }, + { + "auxiliary_loss_clip": 0.01273997, + "auxiliary_loss_mlp": 0.01071746, + "balance_loss_clip": 1.07361674, + "balance_loss_mlp": 1.04025102, + "epoch": 0.030181872839320608, + "flos": 18953257489920.0, + "grad_norm": 3.3627001763511855, + "language_loss": 0.86654103, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.88999844, + "num_input_tokens_seen": 10644105, + "step": 502, + "time_per_iteration": 2.5507659912109375 + }, + { + "auxiliary_loss_clip": 0.01270542, + "auxiliary_loss_mlp": 0.01081534, + "balance_loss_clip": 1.07475543, + "balance_loss_mlp": 1.04827452, + "epoch": 0.030241996091988577, + "flos": 16728017005440.0, + "grad_norm": 2.4572357458963876, + "language_loss": 0.84281206, + "learning_rate": 3.999999393278425e-06, + "loss": 0.86633277, + "num_input_tokens_seen": 10661090, + "step": 503, + "time_per_iteration": 2.618587017059326 + }, + { + "auxiliary_loss_clip": 0.01262547, + "auxiliary_loss_mlp": 0.01091143, + "balance_loss_clip": 1.0710721, + "balance_loss_mlp": 1.05781209, + "epoch": 0.030302119344656545, + "flos": 28621271387520.0, + "grad_norm": 1.6994359255159197, + "language_loss": 0.88137805, + "learning_rate": 3.999999051997567e-06, + "loss": 0.90491492, + "num_input_tokens_seen": 10682380, + "step": 504, + "time_per_iteration": 2.6794183254241943 + }, + { + "auxiliary_loss_clip": 0.01264601, + "auxiliary_loss_mlp": 0.01086749, + "balance_loss_clip": 1.07040262, + "balance_loss_mlp": 1.0541091, + "epoch": 0.030362242597324514, + "flos": 15669334523520.0, + "grad_norm": 2.074855698516145, + "language_loss": 0.786093, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.80960649, + "num_input_tokens_seen": 10699925, + "step": 505, + "time_per_iteration": 2.564960479736328 + }, + { + "auxiliary_loss_clip": 0.01134686, + "auxiliary_loss_mlp": 0.010147, + "balance_loss_clip": 1.03763247, + "balance_loss_mlp": 1.00692737, + "epoch": 0.030422365849992486, + "flos": 72125973676800.0, + "grad_norm": 0.9565689962416369, + "language_loss": 0.54981297, + "learning_rate": 3.999998141915371e-06, + "loss": 0.57130682, + "num_input_tokens_seen": 10766525, + "step": 506, + "time_per_iteration": 3.3345654010772705 + }, + { + "auxiliary_loss_clip": 0.01266577, + "auxiliary_loss_mlp": 0.01090299, + "balance_loss_clip": 1.07119894, + "balance_loss_mlp": 1.05687308, + "epoch": 0.030482489102660455, + "flos": 19427817000960.0, + "grad_norm": 2.2738865373146684, + "language_loss": 0.83377159, + "learning_rate": 3.999997573114069e-06, + "loss": 0.8573404, + "num_input_tokens_seen": 10786725, + "step": 507, + "time_per_iteration": 2.645613670349121 + }, + { + "auxiliary_loss_clip": 0.01269938, + "auxiliary_loss_mlp": 0.01076205, + "balance_loss_clip": 1.07151937, + "balance_loss_mlp": 1.04344678, + "epoch": 0.030542612355328423, + "flos": 20375822701440.0, + "grad_norm": 2.375369924968869, + "language_loss": 0.88842839, + "learning_rate": 3.999996928472659e-06, + "loss": 0.91188985, + "num_input_tokens_seen": 10805390, + "step": 508, + "time_per_iteration": 2.617283344268799 + }, + { + "auxiliary_loss_clip": 0.01272148, + "auxiliary_loss_mlp": 0.01067206, + "balance_loss_clip": 1.07232118, + "balance_loss_mlp": 1.03394616, + "epoch": 0.030602735607996392, + "flos": 34677354297600.0, + "grad_norm": 6.964954749829821, + "language_loss": 0.71807706, + "learning_rate": 3.999996207991165e-06, + "loss": 0.74147063, + "num_input_tokens_seen": 10828030, + "step": 509, + "time_per_iteration": 2.7723498344421387 + }, + { + "auxiliary_loss_clip": 0.01264594, + "auxiliary_loss_mlp": 0.01074377, + "balance_loss_clip": 1.07241154, + "balance_loss_mlp": 1.04333544, + "epoch": 0.03066285886066436, + "flos": 23658668259840.0, + "grad_norm": 1.9285974370038053, + "language_loss": 0.82031929, + "learning_rate": 3.999995411669614e-06, + "loss": 0.84370899, + "num_input_tokens_seen": 10845240, + "step": 510, + "time_per_iteration": 2.6254217624664307 + }, + { + "auxiliary_loss_clip": 0.01268793, + "auxiliary_loss_mlp": 0.01075379, + "balance_loss_clip": 1.07532823, + "balance_loss_mlp": 1.04252458, + "epoch": 0.030722982113332332, + "flos": 23002975440000.0, + "grad_norm": 5.706057095430757, + "language_loss": 0.83572316, + "learning_rate": 3.999994539508036e-06, + "loss": 0.85916495, + "num_input_tokens_seen": 10864325, + "step": 511, + "time_per_iteration": 2.613457441329956 + }, + { + "auxiliary_loss_clip": 0.01269742, + "auxiliary_loss_mlp": 0.01081314, + "balance_loss_clip": 1.07207167, + "balance_loss_mlp": 1.0496521, + "epoch": 0.0307831053660003, + "flos": 24750855152640.0, + "grad_norm": 2.025270681093948, + "language_loss": 0.82109964, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.84461015, + "num_input_tokens_seen": 10883860, + "step": 512, + "time_per_iteration": 2.630404233932495 + }, + { + "auxiliary_loss_clip": 0.01266054, + "auxiliary_loss_mlp": 0.01084436, + "balance_loss_clip": 1.07086158, + "balance_loss_mlp": 1.05070007, + "epoch": 0.03084322861866827, + "flos": 26140885620480.0, + "grad_norm": 2.500363981205655, + "language_loss": 0.86933553, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.89284045, + "num_input_tokens_seen": 10904555, + "step": 513, + "time_per_iteration": 2.671926259994507 + }, + { + "auxiliary_loss_clip": 0.01272542, + "auxiliary_loss_mlp": 0.01080065, + "balance_loss_clip": 1.07461214, + "balance_loss_mlp": 1.04744935, + "epoch": 0.03090335187133624, + "flos": 18771298168320.0, + "grad_norm": 1.704575426690477, + "language_loss": 0.79124331, + "learning_rate": 3.999991467983491e-06, + "loss": 0.81476939, + "num_input_tokens_seen": 10923700, + "step": 514, + "time_per_iteration": 2.6158573627471924 + }, + { + "auxiliary_loss_clip": 0.01265821, + "auxiliary_loss_mlp": 0.01067844, + "balance_loss_clip": 1.07397485, + "balance_loss_mlp": 1.03711247, + "epoch": 0.030963475124004207, + "flos": 23221886878080.0, + "grad_norm": 2.729063628201222, + "language_loss": 0.77758944, + "learning_rate": 3.999990292462167e-06, + "loss": 0.80092615, + "num_input_tokens_seen": 10942730, + "step": 515, + "time_per_iteration": 2.636294364929199 + }, + { + "auxiliary_loss_clip": 0.0126398, + "auxiliary_loss_mlp": 0.01072575, + "balance_loss_clip": 1.06835747, + "balance_loss_mlp": 1.03874326, + "epoch": 0.03102359837667218, + "flos": 42525595411200.0, + "grad_norm": 2.1228851207681503, + "language_loss": 0.82452714, + "learning_rate": 3.999989041101011e-06, + "loss": 0.84789264, + "num_input_tokens_seen": 10967120, + "step": 516, + "time_per_iteration": 2.8078057765960693 + }, + { + "auxiliary_loss_clip": 0.01263726, + "auxiliary_loss_mlp": 0.01073859, + "balance_loss_clip": 1.0712111, + "balance_loss_mlp": 1.04090929, + "epoch": 0.031083721629340148, + "flos": 21176953689600.0, + "grad_norm": 1.9016724574566626, + "language_loss": 0.79088318, + "learning_rate": 3.999987713900071e-06, + "loss": 0.81425899, + "num_input_tokens_seen": 10986775, + "step": 517, + "time_per_iteration": 2.5935981273651123 + }, + { + "auxiliary_loss_clip": 0.0125895, + "auxiliary_loss_mlp": 0.0107836, + "balance_loss_clip": 1.07049131, + "balance_loss_mlp": 1.04629326, + "epoch": 0.031143844882008116, + "flos": 29716187713920.0, + "grad_norm": 1.6829619528007147, + "language_loss": 0.90798068, + "learning_rate": 3.999986310859396e-06, + "loss": 0.93135381, + "num_input_tokens_seen": 11011360, + "step": 518, + "time_per_iteration": 2.6855509281158447 + }, + { + "auxiliary_loss_clip": 0.01272237, + "auxiliary_loss_mlp": 0.01097567, + "balance_loss_clip": 1.07848859, + "balance_loss_mlp": 1.06230497, + "epoch": 0.031203968134676085, + "flos": 23112467072640.0, + "grad_norm": 1.8835331125391583, + "language_loss": 0.86759162, + "learning_rate": 3.999984831979039e-06, + "loss": 0.89128959, + "num_input_tokens_seen": 11030150, + "step": 519, + "time_per_iteration": 2.628380060195923 + }, + { + "auxiliary_loss_clip": 0.01265864, + "auxiliary_loss_mlp": 0.01086943, + "balance_loss_clip": 1.06901193, + "balance_loss_mlp": 1.05578136, + "epoch": 0.03126409138734405, + "flos": 20954379064320.0, + "grad_norm": 3.8823628482318164, + "language_loss": 0.87246573, + "learning_rate": 3.999983277259057e-06, + "loss": 0.89599377, + "num_input_tokens_seen": 11049145, + "step": 520, + "time_per_iteration": 2.5850255489349365 + }, + { + "auxiliary_loss_clip": 0.01269157, + "auxiliary_loss_mlp": 0.01086266, + "balance_loss_clip": 1.07231963, + "balance_loss_mlp": 1.0528394, + "epoch": 0.031324214640012026, + "flos": 21650112570240.0, + "grad_norm": 1.7050130216714323, + "language_loss": 0.89274424, + "learning_rate": 3.999981646699509e-06, + "loss": 0.91629851, + "num_input_tokens_seen": 11068835, + "step": 521, + "time_per_iteration": 2.6412506103515625 + }, + { + "auxiliary_loss_clip": 0.01263772, + "auxiliary_loss_mlp": 0.01082584, + "balance_loss_clip": 1.0717473, + "balance_loss_mlp": 1.04827595, + "epoch": 0.03138433789267999, + "flos": 23441337020160.0, + "grad_norm": 2.085624200373119, + "language_loss": 0.71452564, + "learning_rate": 3.999979940300456e-06, + "loss": 0.73798925, + "num_input_tokens_seen": 11088980, + "step": 522, + "time_per_iteration": 2.6561174392700195 + }, + { + "auxiliary_loss_clip": 0.01265725, + "auxiliary_loss_mlp": 0.01082552, + "balance_loss_clip": 1.06871116, + "balance_loss_mlp": 1.05079484, + "epoch": 0.03144446114534796, + "flos": 18982164960000.0, + "grad_norm": 4.223323698032832, + "language_loss": 0.84758592, + "learning_rate": 3.999978158061963e-06, + "loss": 0.87106872, + "num_input_tokens_seen": 11104300, + "step": 523, + "time_per_iteration": 2.608565330505371 + }, + { + "auxiliary_loss_clip": 0.01271589, + "auxiliary_loss_mlp": 0.01076253, + "balance_loss_clip": 1.07193565, + "balance_loss_mlp": 1.04296994, + "epoch": 0.031504584398015935, + "flos": 22637692080000.0, + "grad_norm": 2.324094801199308, + "language_loss": 0.89989722, + "learning_rate": 3.999976299984099e-06, + "loss": 0.92337573, + "num_input_tokens_seen": 11123335, + "step": 524, + "time_per_iteration": 2.68269944190979 + }, + { + "auxiliary_loss_clip": 0.01273471, + "auxiliary_loss_mlp": 0.0108318, + "balance_loss_clip": 1.07427168, + "balance_loss_mlp": 1.04944324, + "epoch": 0.0315647076506839, + "flos": 25297056339840.0, + "grad_norm": 2.4635323942475766, + "language_loss": 0.80114233, + "learning_rate": 3.999974366066933e-06, + "loss": 0.82470882, + "num_input_tokens_seen": 11140880, + "step": 525, + "time_per_iteration": 2.6396324634552 + }, + { + "auxiliary_loss_clip": 0.01264716, + "auxiliary_loss_mlp": 0.01080959, + "balance_loss_clip": 1.0681529, + "balance_loss_mlp": 1.04798603, + "epoch": 0.03162483090335187, + "flos": 16982839065600.0, + "grad_norm": 2.3553733144031948, + "language_loss": 0.81162, + "learning_rate": 3.999972356310538e-06, + "loss": 0.83507675, + "num_input_tokens_seen": 11158710, + "step": 526, + "time_per_iteration": 2.6167168617248535 + }, + { + "auxiliary_loss_clip": 0.01273987, + "auxiliary_loss_mlp": 0.01072725, + "balance_loss_clip": 1.07507181, + "balance_loss_mlp": 1.03736734, + "epoch": 0.03168495415601984, + "flos": 18734489706240.0, + "grad_norm": 1.9666844995001491, + "language_loss": 0.81491739, + "learning_rate": 3.999970270714991e-06, + "loss": 0.83838451, + "num_input_tokens_seen": 11177550, + "step": 527, + "time_per_iteration": 2.580310821533203 + }, + { + "auxiliary_loss_clip": 0.01261155, + "auxiliary_loss_mlp": 0.01080842, + "balance_loss_clip": 1.06786597, + "balance_loss_mlp": 1.04717755, + "epoch": 0.03174507740868781, + "flos": 21214875473280.0, + "grad_norm": 1.9105688869262756, + "language_loss": 0.93801636, + "learning_rate": 3.999968109280371e-06, + "loss": 0.96143627, + "num_input_tokens_seen": 11196230, + "step": 528, + "time_per_iteration": 2.5901002883911133 + }, + { + "auxiliary_loss_clip": 0.01263275, + "auxiliary_loss_mlp": 0.01071724, + "balance_loss_clip": 1.06776333, + "balance_loss_mlp": 1.0387274, + "epoch": 0.03180520066135578, + "flos": 24787663614720.0, + "grad_norm": 1.8924176613796981, + "language_loss": 0.84130204, + "learning_rate": 3.99996587200676e-06, + "loss": 0.86465204, + "num_input_tokens_seen": 11214935, + "step": 529, + "time_per_iteration": 2.593867063522339 + }, + { + "auxiliary_loss_clip": 0.01266309, + "auxiliary_loss_mlp": 0.01088988, + "balance_loss_clip": 1.07501197, + "balance_loss_mlp": 1.0563724, + "epoch": 0.03186532391402375, + "flos": 24864261367680.0, + "grad_norm": 2.316883777672742, + "language_loss": 0.90458709, + "learning_rate": 3.999963558894243e-06, + "loss": 0.92814004, + "num_input_tokens_seen": 11235310, + "step": 530, + "time_per_iteration": 2.5994982719421387 + }, + { + "auxiliary_loss_clip": 0.01261024, + "auxiliary_loss_mlp": 0.0107627, + "balance_loss_clip": 1.06481552, + "balance_loss_mlp": 1.04188991, + "epoch": 0.03192544716669172, + "flos": 21215055041280.0, + "grad_norm": 2.2744046769674324, + "language_loss": 0.76334512, + "learning_rate": 3.999961169942907e-06, + "loss": 0.78671807, + "num_input_tokens_seen": 11254425, + "step": 531, + "time_per_iteration": 2.618149757385254 + }, + { + "auxiliary_loss_clip": 0.01260981, + "auxiliary_loss_mlp": 0.01064937, + "balance_loss_clip": 1.0669558, + "balance_loss_mlp": 1.03143883, + "epoch": 0.03198557041935969, + "flos": 24353216616960.0, + "grad_norm": 2.467757262816931, + "language_loss": 0.90483695, + "learning_rate": 3.999958705152843e-06, + "loss": 0.92809618, + "num_input_tokens_seen": 11274595, + "step": 532, + "time_per_iteration": 2.647947072982788 + }, + { + "auxiliary_loss_clip": 0.01146464, + "auxiliary_loss_mlp": 0.01012028, + "balance_loss_clip": 1.04988623, + "balance_loss_mlp": 1.00325394, + "epoch": 0.032045693672027656, + "flos": 61827367587840.0, + "grad_norm": 1.9655071928838626, + "language_loss": 0.57953775, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.60112268, + "num_input_tokens_seen": 11336705, + "step": 533, + "time_per_iteration": 3.2502808570861816 + }, + { + "auxiliary_loss_clip": 0.01260941, + "auxiliary_loss_mlp": 0.01084263, + "balance_loss_clip": 1.06724441, + "balance_loss_mlp": 1.0516715, + "epoch": 0.03210581692469563, + "flos": 28401174800640.0, + "grad_norm": 1.7138682169725878, + "language_loss": 0.86666048, + "learning_rate": 3.999953548056907e-06, + "loss": 0.89011252, + "num_input_tokens_seen": 11356820, + "step": 534, + "time_per_iteration": 2.678739070892334 + }, + { + "auxiliary_loss_clip": 0.01259554, + "auxiliary_loss_mlp": 0.01066669, + "balance_loss_clip": 1.06782031, + "balance_loss_mlp": 1.03407741, + "epoch": 0.03216594017736359, + "flos": 24717709877760.0, + "grad_norm": 2.12774196295415, + "language_loss": 0.77627808, + "learning_rate": 3.999950855751232e-06, + "loss": 0.79954034, + "num_input_tokens_seen": 11376645, + "step": 535, + "time_per_iteration": 2.7128217220306396 + }, + { + "auxiliary_loss_clip": 0.01261708, + "auxiliary_loss_mlp": 0.01081378, + "balance_loss_clip": 1.06843078, + "balance_loss_mlp": 1.0485003, + "epoch": 0.032226063430031565, + "flos": 31175453646720.0, + "grad_norm": 3.9913279940153585, + "language_loss": 0.80939913, + "learning_rate": 3.999948087607219e-06, + "loss": 0.83283001, + "num_input_tokens_seen": 11397310, + "step": 536, + "time_per_iteration": 2.7490127086639404 + }, + { + "auxiliary_loss_clip": 0.01262237, + "auxiliary_loss_mlp": 0.01075987, + "balance_loss_clip": 1.06839073, + "balance_loss_mlp": 1.04167831, + "epoch": 0.03228618668269954, + "flos": 32198225506560.0, + "grad_norm": 1.6888601787189168, + "language_loss": 0.7009111, + "learning_rate": 3.999945243624975e-06, + "loss": 0.72429335, + "num_input_tokens_seen": 11418475, + "step": 537, + "time_per_iteration": 5.5609166622161865 + }, + { + "auxiliary_loss_clip": 0.0126357, + "auxiliary_loss_mlp": 0.01084205, + "balance_loss_clip": 1.07331729, + "balance_loss_mlp": 1.05161297, + "epoch": 0.0323463099353675, + "flos": 22670154996480.0, + "grad_norm": 2.146306428033486, + "language_loss": 0.82684958, + "learning_rate": 3.999942323804607e-06, + "loss": 0.85032725, + "num_input_tokens_seen": 11436630, + "step": 538, + "time_per_iteration": 2.5465030670166016 + }, + { + "auxiliary_loss_clip": 0.01269537, + "auxiliary_loss_mlp": 0.01078099, + "balance_loss_clip": 1.06987572, + "balance_loss_mlp": 1.04536414, + "epoch": 0.032406433188035474, + "flos": 26905172232960.0, + "grad_norm": 1.8709064214989917, + "language_loss": 0.79146457, + "learning_rate": 3.999939328146225e-06, + "loss": 0.81494099, + "num_input_tokens_seen": 11457275, + "step": 539, + "time_per_iteration": 4.172123432159424 + }, + { + "auxiliary_loss_clip": 0.0126143, + "auxiliary_loss_mlp": 0.01069528, + "balance_loss_clip": 1.06830835, + "balance_loss_mlp": 1.03567231, + "epoch": 0.03246655644070344, + "flos": 31503928544640.0, + "grad_norm": 35.59051030008172, + "language_loss": 0.77379727, + "learning_rate": 3.999936256649943e-06, + "loss": 0.79710686, + "num_input_tokens_seen": 11476925, + "step": 540, + "time_per_iteration": 2.5633046627044678 + }, + { + "auxiliary_loss_clip": 0.01269863, + "auxiliary_loss_mlp": 0.01073669, + "balance_loss_clip": 1.07271969, + "balance_loss_mlp": 1.04124355, + "epoch": 0.03252667969337141, + "flos": 23218331431680.0, + "grad_norm": 2.0489065110302636, + "language_loss": 0.85458571, + "learning_rate": 3.999933109315878e-06, + "loss": 0.878021, + "num_input_tokens_seen": 11496830, + "step": 541, + "time_per_iteration": 2.6079938411712646 + }, + { + "auxiliary_loss_clip": 0.01258504, + "auxiliary_loss_mlp": 0.01082451, + "balance_loss_clip": 1.06961954, + "balance_loss_mlp": 1.04835749, + "epoch": 0.032586802946039384, + "flos": 14757454926720.0, + "grad_norm": 2.674731240129174, + "language_loss": 0.89234567, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.91575521, + "num_input_tokens_seen": 11515605, + "step": 542, + "time_per_iteration": 2.597036600112915 + }, + { + "auxiliary_loss_clip": 0.0126351, + "auxiliary_loss_mlp": 0.01081041, + "balance_loss_clip": 1.06974792, + "balance_loss_mlp": 1.04792452, + "epoch": 0.03264692619870735, + "flos": 24280677100800.0, + "grad_norm": 2.2714121360014334, + "language_loss": 0.71123677, + "learning_rate": 3.999926587134879e-06, + "loss": 0.73468232, + "num_input_tokens_seen": 11536230, + "step": 543, + "time_per_iteration": 2.634601354598999 + }, + { + "auxiliary_loss_clip": 0.01259994, + "auxiliary_loss_mlp": 0.01088763, + "balance_loss_clip": 1.06379187, + "balance_loss_mlp": 1.05545604, + "epoch": 0.03270704945137532, + "flos": 22893160584960.0, + "grad_norm": 4.777521083182084, + "language_loss": 0.91540575, + "learning_rate": 3.999923212288192e-06, + "loss": 0.93889332, + "num_input_tokens_seen": 11554715, + "step": 544, + "time_per_iteration": 2.6173009872436523 + }, + { + "auxiliary_loss_clip": 0.01264485, + "auxiliary_loss_mlp": 0.01085684, + "balance_loss_clip": 1.06989884, + "balance_loss_mlp": 1.05571437, + "epoch": 0.032767172704043286, + "flos": 18041018757120.0, + "grad_norm": 2.6951315012120025, + "language_loss": 0.65799558, + "learning_rate": 3.999919761604216e-06, + "loss": 0.68149722, + "num_input_tokens_seen": 11571370, + "step": 545, + "time_per_iteration": 2.6500988006591797 + }, + { + "auxiliary_loss_clip": 0.012623, + "auxiliary_loss_mlp": 0.0107161, + "balance_loss_clip": 1.06693912, + "balance_loss_mlp": 1.0393517, + "epoch": 0.03282729595671126, + "flos": 22528739151360.0, + "grad_norm": 2.2564766449723908, + "language_loss": 0.92221987, + "learning_rate": 3.999916235083083e-06, + "loss": 0.94555902, + "num_input_tokens_seen": 11588560, + "step": 546, + "time_per_iteration": 2.673250913619995 + }, + { + "auxiliary_loss_clip": 0.01260258, + "auxiliary_loss_mlp": 0.01077296, + "balance_loss_clip": 1.06488204, + "balance_loss_mlp": 1.04313052, + "epoch": 0.03288741920937923, + "flos": 20410620001920.0, + "grad_norm": 2.1923718908590653, + "language_loss": 0.81706661, + "learning_rate": 3.999912632724925e-06, + "loss": 0.84044212, + "num_input_tokens_seen": 11605685, + "step": 547, + "time_per_iteration": 2.725198745727539 + }, + { + "auxiliary_loss_clip": 0.0126227, + "auxiliary_loss_mlp": 0.0107871, + "balance_loss_clip": 1.06794477, + "balance_loss_mlp": 1.04480648, + "epoch": 0.032947542462047195, + "flos": 20777986350720.0, + "grad_norm": 1.730652582963277, + "language_loss": 0.81227565, + "learning_rate": 3.999908954529881e-06, + "loss": 0.83568549, + "num_input_tokens_seen": 11626290, + "step": 548, + "time_per_iteration": 2.714073419570923 + }, + { + "auxiliary_loss_clip": 0.01264818, + "auxiliary_loss_mlp": 0.01084154, + "balance_loss_clip": 1.06963027, + "balance_loss_mlp": 1.04870164, + "epoch": 0.03300766571471517, + "flos": 19901263190400.0, + "grad_norm": 3.8540092911047603, + "language_loss": 0.67460287, + "learning_rate": 3.999905200498087e-06, + "loss": 0.69809258, + "num_input_tokens_seen": 11643950, + "step": 549, + "time_per_iteration": 2.6747171878814697 + }, + { + "auxiliary_loss_clip": 0.0125805, + "auxiliary_loss_mlp": 0.01076001, + "balance_loss_clip": 1.06968856, + "balance_loss_mlp": 1.04236054, + "epoch": 0.03306778896738313, + "flos": 17967760968960.0, + "grad_norm": 1.933615596136007, + "language_loss": 0.86379111, + "learning_rate": 3.999901370629689e-06, + "loss": 0.88713157, + "num_input_tokens_seen": 11662560, + "step": 550, + "time_per_iteration": 2.553386926651001 + }, + { + "auxiliary_loss_clip": 0.01264951, + "auxiliary_loss_mlp": 0.01095377, + "balance_loss_clip": 1.07279766, + "balance_loss_mlp": 1.06142652, + "epoch": 0.033127912220051105, + "flos": 21653380707840.0, + "grad_norm": 3.1958143211070977, + "language_loss": 0.8127178, + "learning_rate": 3.99989746492483e-06, + "loss": 0.83632112, + "num_input_tokens_seen": 11682265, + "step": 551, + "time_per_iteration": 2.6231682300567627 + }, + { + "auxiliary_loss_clip": 0.01271579, + "auxiliary_loss_mlp": 0.0108998, + "balance_loss_clip": 1.07285261, + "balance_loss_mlp": 1.05626702, + "epoch": 0.03318803547271908, + "flos": 30188376927360.0, + "grad_norm": 2.9473143774727606, + "language_loss": 0.86134821, + "learning_rate": 3.999893483383658e-06, + "loss": 0.88496381, + "num_input_tokens_seen": 11699300, + "step": 552, + "time_per_iteration": 2.7002694606781006 + }, + { + "auxiliary_loss_clip": 0.01267081, + "auxiliary_loss_mlp": 0.01081671, + "balance_loss_clip": 1.07191086, + "balance_loss_mlp": 1.04650474, + "epoch": 0.03324815872538704, + "flos": 20376038183040.0, + "grad_norm": 2.990469903058063, + "language_loss": 0.9301765, + "learning_rate": 3.999889426006326e-06, + "loss": 0.95366406, + "num_input_tokens_seen": 11716955, + "step": 553, + "time_per_iteration": 2.6629648208618164 + }, + { + "auxiliary_loss_clip": 0.01262345, + "auxiliary_loss_mlp": 0.01077186, + "balance_loss_clip": 1.06925786, + "balance_loss_mlp": 1.04149485, + "epoch": 0.033308281978055014, + "flos": 24494560634880.0, + "grad_norm": 2.1924330874053166, + "language_loss": 0.78881586, + "learning_rate": 3.999885292792986e-06, + "loss": 0.8122111, + "num_input_tokens_seen": 11736130, + "step": 554, + "time_per_iteration": 2.668970823287964 + }, + { + "auxiliary_loss_clip": 0.01258048, + "auxiliary_loss_mlp": 0.0108557, + "balance_loss_clip": 1.06745815, + "balance_loss_mlp": 1.05045104, + "epoch": 0.03336840523072298, + "flos": 23400326666880.0, + "grad_norm": 2.2144550089326938, + "language_loss": 0.81971425, + "learning_rate": 3.999881083743795e-06, + "loss": 0.84315038, + "num_input_tokens_seen": 11754425, + "step": 555, + "time_per_iteration": 2.610807418823242 + }, + { + "auxiliary_loss_clip": 0.01264442, + "auxiliary_loss_mlp": 0.0108339, + "balance_loss_clip": 1.06914032, + "balance_loss_mlp": 1.04805672, + "epoch": 0.03342852848339095, + "flos": 30550571717760.0, + "grad_norm": 3.7821745066525487, + "language_loss": 0.88661897, + "learning_rate": 3.999876798858914e-06, + "loss": 0.9100973, + "num_input_tokens_seen": 11772845, + "step": 556, + "time_per_iteration": 2.6288907527923584 + }, + { + "auxiliary_loss_clip": 0.01262553, + "auxiliary_loss_mlp": 0.01084158, + "balance_loss_clip": 1.06896496, + "balance_loss_mlp": 1.04863358, + "epoch": 0.03348865173605892, + "flos": 22893304239360.0, + "grad_norm": 1.974910128087634, + "language_loss": 0.83708388, + "learning_rate": 3.999872438138503e-06, + "loss": 0.860551, + "num_input_tokens_seen": 11792850, + "step": 557, + "time_per_iteration": 2.649401903152466 + }, + { + "auxiliary_loss_clip": 0.01268198, + "auxiliary_loss_mlp": 0.01069057, + "balance_loss_clip": 1.07400489, + "balance_loss_mlp": 1.03684711, + "epoch": 0.03354877498872689, + "flos": 17676022705920.0, + "grad_norm": 3.176542206824637, + "language_loss": 0.94202292, + "learning_rate": 3.999868001582729e-06, + "loss": 0.96539545, + "num_input_tokens_seen": 11809670, + "step": 558, + "time_per_iteration": 2.550515651702881 + }, + { + "auxiliary_loss_clip": 0.01258948, + "auxiliary_loss_mlp": 0.01074291, + "balance_loss_clip": 1.06591845, + "balance_loss_mlp": 1.04036427, + "epoch": 0.03360889824139486, + "flos": 21652985658240.0, + "grad_norm": 2.6619487077732384, + "language_loss": 0.77115649, + "learning_rate": 3.99986348919176e-06, + "loss": 0.79448891, + "num_input_tokens_seen": 11829665, + "step": 559, + "time_per_iteration": 2.729597330093384 + }, + { + "auxiliary_loss_clip": 0.01261947, + "auxiliary_loss_mlp": 0.01080822, + "balance_loss_clip": 1.06835234, + "balance_loss_mlp": 1.04882574, + "epoch": 0.033669021494062826, + "flos": 21795730306560.0, + "grad_norm": 1.945022837871561, + "language_loss": 0.87472397, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.89815164, + "num_input_tokens_seen": 11848190, + "step": 560, + "time_per_iteration": 2.6082279682159424 + }, + { + "auxiliary_loss_clip": 0.01257198, + "auxiliary_loss_mlp": 0.0107356, + "balance_loss_clip": 1.06704283, + "balance_loss_mlp": 1.04199314, + "epoch": 0.0337291447467308, + "flos": 21866222747520.0, + "grad_norm": 2.4061219554407502, + "language_loss": 0.81578708, + "learning_rate": 3.999854236904925e-06, + "loss": 0.83909464, + "num_input_tokens_seen": 11864795, + "step": 561, + "time_per_iteration": 2.602193832397461 + }, + { + "auxiliary_loss_clip": 0.01254722, + "auxiliary_loss_mlp": 0.01076361, + "balance_loss_clip": 1.06685936, + "balance_loss_mlp": 1.04422247, + "epoch": 0.03378926799939877, + "flos": 24245951627520.0, + "grad_norm": 1.683217504050761, + "language_loss": 0.82320511, + "learning_rate": 3.999849497009409e-06, + "loss": 0.84651601, + "num_input_tokens_seen": 11885275, + "step": 562, + "time_per_iteration": 2.675872564315796 + }, + { + "auxiliary_loss_clip": 0.01262146, + "auxiliary_loss_mlp": 0.01084212, + "balance_loss_clip": 1.06894755, + "balance_loss_mlp": 1.0508337, + "epoch": 0.033849391252066735, + "flos": 16507812677760.0, + "grad_norm": 2.262509698135982, + "language_loss": 0.84285647, + "learning_rate": 3.999844681279401e-06, + "loss": 0.86632001, + "num_input_tokens_seen": 11903595, + "step": 563, + "time_per_iteration": 2.586944103240967 + }, + { + "auxiliary_loss_clip": 0.01258135, + "auxiliary_loss_mlp": 0.01083866, + "balance_loss_clip": 1.0675565, + "balance_loss_mlp": 1.05094075, + "epoch": 0.03390951450473471, + "flos": 15669298609920.0, + "grad_norm": 2.115200912185494, + "language_loss": 0.94438875, + "learning_rate": 3.99983978971508e-06, + "loss": 0.96780878, + "num_input_tokens_seen": 11917815, + "step": 564, + "time_per_iteration": 2.5444440841674805 + }, + { + "auxiliary_loss_clip": 0.01259509, + "auxiliary_loss_mlp": 0.01073406, + "balance_loss_clip": 1.06518865, + "balance_loss_mlp": 1.03907406, + "epoch": 0.03396963775740267, + "flos": 22674787850880.0, + "grad_norm": 2.6560391741906924, + "language_loss": 0.94669235, + "learning_rate": 3.999834822316635e-06, + "loss": 0.97002149, + "num_input_tokens_seen": 11936305, + "step": 565, + "time_per_iteration": 2.5614171028137207 + }, + { + "auxiliary_loss_clip": 0.01150452, + "auxiliary_loss_mlp": 0.01081579, + "balance_loss_clip": 1.04835606, + "balance_loss_mlp": 1.07499874, + "epoch": 0.034029761010070644, + "flos": 64392683063040.0, + "grad_norm": 1.0610477485673708, + "language_loss": 0.54800498, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.57032537, + "num_input_tokens_seen": 11998940, + "step": 566, + "time_per_iteration": 3.229137659072876 + }, + { + "auxiliary_loss_clip": 0.0126129, + "auxiliary_loss_mlp": 0.01073482, + "balance_loss_clip": 1.06798041, + "balance_loss_mlp": 1.03793335, + "epoch": 0.034089884262738616, + "flos": 25004204755200.0, + "grad_norm": 3.1955261820278564, + "language_loss": 0.76836932, + "learning_rate": 3.999824660018126e-06, + "loss": 0.79171705, + "num_input_tokens_seen": 12018860, + "step": 567, + "time_per_iteration": 2.632741928100586 + }, + { + "auxiliary_loss_clip": 0.01253596, + "auxiliary_loss_mlp": 0.01083559, + "balance_loss_clip": 1.06611466, + "balance_loss_mlp": 1.05153918, + "epoch": 0.03415000751540658, + "flos": 28439096584320.0, + "grad_norm": 2.115683621050472, + "language_loss": 0.80834144, + "learning_rate": 3.999819465118447e-06, + "loss": 0.83171296, + "num_input_tokens_seen": 12039675, + "step": 568, + "time_per_iteration": 2.7206337451934814 + }, + { + "auxiliary_loss_clip": 0.01254921, + "auxiliary_loss_mlp": 0.01082401, + "balance_loss_clip": 1.06888509, + "balance_loss_mlp": 1.04940367, + "epoch": 0.034210130768074554, + "flos": 21468727866240.0, + "grad_norm": 1.891360159585894, + "language_loss": 0.86560667, + "learning_rate": 3.999814194385413e-06, + "loss": 0.88897985, + "num_input_tokens_seen": 12057680, + "step": 569, + "time_per_iteration": 2.7271673679351807 + }, + { + "auxiliary_loss_clip": 0.01255135, + "auxiliary_loss_mlp": 0.01082251, + "balance_loss_clip": 1.06644094, + "balance_loss_mlp": 1.04922962, + "epoch": 0.03427025402074252, + "flos": 18697501676160.0, + "grad_norm": 1.6888504559193653, + "language_loss": 0.95945716, + "learning_rate": 3.9998088478192255e-06, + "loss": 0.982831, + "num_input_tokens_seen": 12076135, + "step": 570, + "time_per_iteration": 2.5918867588043213 + }, + { + "auxiliary_loss_clip": 0.01255487, + "auxiliary_loss_mlp": 0.0108066, + "balance_loss_clip": 1.06228065, + "balance_loss_mlp": 1.0435617, + "epoch": 0.03433037727341049, + "flos": 20849987162880.0, + "grad_norm": 2.39132447086081, + "language_loss": 0.7964232, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.8197847, + "num_input_tokens_seen": 12094785, + "step": 571, + "time_per_iteration": 2.590184450149536 + }, + { + "auxiliary_loss_clip": 0.01256218, + "auxiliary_loss_mlp": 0.01091484, + "balance_loss_clip": 1.06740785, + "balance_loss_mlp": 1.0565083, + "epoch": 0.03439050052607846, + "flos": 25410282986880.0, + "grad_norm": 2.0738695690993, + "language_loss": 0.80214274, + "learning_rate": 3.999797927188199e-06, + "loss": 0.82561976, + "num_input_tokens_seen": 12114590, + "step": 572, + "time_per_iteration": 2.6862123012542725 + }, + { + "auxiliary_loss_clip": 0.01263024, + "auxiliary_loss_mlp": 0.01074173, + "balance_loss_clip": 1.06995344, + "balance_loss_mlp": 1.04098535, + "epoch": 0.03445062377874643, + "flos": 17640147997440.0, + "grad_norm": 2.2324763929909284, + "language_loss": 0.84548658, + "learning_rate": 3.999792353123774e-06, + "loss": 0.86885858, + "num_input_tokens_seen": 12132390, + "step": 573, + "time_per_iteration": 2.78487229347229 + }, + { + "auxiliary_loss_clip": 0.01256326, + "auxiliary_loss_mlp": 0.01068789, + "balance_loss_clip": 1.0644815, + "balance_loss_mlp": 1.03781831, + "epoch": 0.0345107470314144, + "flos": 16764502245120.0, + "grad_norm": 2.576428901855709, + "language_loss": 0.76602584, + "learning_rate": 3.999786703227023e-06, + "loss": 0.78927696, + "num_input_tokens_seen": 12149035, + "step": 574, + "time_per_iteration": 2.5697100162506104 + }, + { + "auxiliary_loss_clip": 0.01255191, + "auxiliary_loss_mlp": 0.0107671, + "balance_loss_clip": 1.06581593, + "balance_loss_mlp": 1.04502439, + "epoch": 0.03457087028408237, + "flos": 14684448533760.0, + "grad_norm": 2.156110110571344, + "language_loss": 0.83854586, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.86186486, + "num_input_tokens_seen": 12167530, + "step": 575, + "time_per_iteration": 2.596418619155884 + }, + { + "auxiliary_loss_clip": 0.01249695, + "auxiliary_loss_mlp": 0.01076053, + "balance_loss_clip": 1.06684637, + "balance_loss_mlp": 1.04334211, + "epoch": 0.03463099353675034, + "flos": 20011293527040.0, + "grad_norm": 2.350120742735315, + "language_loss": 0.83990753, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.86316502, + "num_input_tokens_seen": 12186340, + "step": 576, + "time_per_iteration": 5.821930646896362 + }, + { + "auxiliary_loss_clip": 0.01257114, + "auxiliary_loss_mlp": 0.01079503, + "balance_loss_clip": 1.07237518, + "balance_loss_mlp": 1.04817426, + "epoch": 0.03469111678941831, + "flos": 25301150490240.0, + "grad_norm": 2.138457686407641, + "language_loss": 0.85803086, + "learning_rate": 3.99976929854497e-06, + "loss": 0.88139701, + "num_input_tokens_seen": 12204090, + "step": 577, + "time_per_iteration": 4.225277423858643 + }, + { + "auxiliary_loss_clip": 0.01253845, + "auxiliary_loss_mlp": 0.01080214, + "balance_loss_clip": 1.06869018, + "balance_loss_mlp": 1.04712176, + "epoch": 0.034751240042086275, + "flos": 23259413612160.0, + "grad_norm": 4.535240156776142, + "language_loss": 0.72226608, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.74560666, + "num_input_tokens_seen": 12224850, + "step": 578, + "time_per_iteration": 4.486239433288574 + }, + { + "auxiliary_loss_clip": 0.01251871, + "auxiliary_loss_mlp": 0.01080519, + "balance_loss_clip": 1.06461096, + "balance_loss_mlp": 1.04663968, + "epoch": 0.03481136329475425, + "flos": 23769237300480.0, + "grad_norm": 1.9496379050984929, + "language_loss": 0.77785492, + "learning_rate": 3.999757316265973e-06, + "loss": 0.80117887, + "num_input_tokens_seen": 12244935, + "step": 579, + "time_per_iteration": 2.6706583499908447 + }, + { + "auxiliary_loss_clip": 0.01251647, + "auxiliary_loss_mlp": 0.01087497, + "balance_loss_clip": 1.06656826, + "balance_loss_mlp": 1.05435717, + "epoch": 0.03487148654742222, + "flos": 20157521794560.0, + "grad_norm": 2.054973215074824, + "language_loss": 0.86841297, + "learning_rate": 3.999751211379863e-06, + "loss": 0.8918044, + "num_input_tokens_seen": 12262140, + "step": 580, + "time_per_iteration": 2.639146566390991 + }, + { + "auxiliary_loss_clip": 0.01256528, + "auxiliary_loss_mlp": 0.01069029, + "balance_loss_clip": 1.06636667, + "balance_loss_mlp": 1.0398469, + "epoch": 0.034931609800090184, + "flos": 15669585918720.0, + "grad_norm": 2.205850105033732, + "language_loss": 0.82570344, + "learning_rate": 3.999745030662987e-06, + "loss": 0.84895897, + "num_input_tokens_seen": 12280930, + "step": 581, + "time_per_iteration": 2.6505649089813232 + }, + { + "auxiliary_loss_clip": 0.01252942, + "auxiliary_loss_mlp": 0.01072317, + "balance_loss_clip": 1.06823969, + "balance_loss_mlp": 1.04168022, + "epoch": 0.034991733052758156, + "flos": 16362374509440.0, + "grad_norm": 2.1922492117358146, + "language_loss": 0.7733047, + "learning_rate": 3.99973877411558e-06, + "loss": 0.79655731, + "num_input_tokens_seen": 12299125, + "step": 582, + "time_per_iteration": 2.7323596477508545 + }, + { + "auxiliary_loss_clip": 0.01250253, + "auxiliary_loss_mlp": 0.01082356, + "balance_loss_clip": 1.06794167, + "balance_loss_mlp": 1.04861939, + "epoch": 0.03505185630542612, + "flos": 19387309438080.0, + "grad_norm": 2.1536178016194327, + "language_loss": 0.87679923, + "learning_rate": 3.999732441737877e-06, + "loss": 0.90012532, + "num_input_tokens_seen": 12316905, + "step": 583, + "time_per_iteration": 2.6049294471740723 + }, + { + "auxiliary_loss_clip": 0.01255473, + "auxiliary_loss_mlp": 0.01092826, + "balance_loss_clip": 1.06699181, + "balance_loss_mlp": 1.06104505, + "epoch": 0.03511197955809409, + "flos": 21323828401920.0, + "grad_norm": 3.7027110169592015, + "language_loss": 0.81196821, + "learning_rate": 3.99972603353012e-06, + "loss": 0.83545119, + "num_input_tokens_seen": 12335070, + "step": 584, + "time_per_iteration": 2.6011815071105957 + }, + { + "auxiliary_loss_clip": 0.01251161, + "auxiliary_loss_mlp": 0.01069463, + "balance_loss_clip": 1.06472683, + "balance_loss_mlp": 1.03832567, + "epoch": 0.035172102810762065, + "flos": 14136595320960.0, + "grad_norm": 3.067717812226321, + "language_loss": 0.92399198, + "learning_rate": 3.999719549492551e-06, + "loss": 0.94719815, + "num_input_tokens_seen": 12350315, + "step": 585, + "time_per_iteration": 2.5592780113220215 + }, + { + "auxiliary_loss_clip": 0.01251271, + "auxiliary_loss_mlp": 0.01077423, + "balance_loss_clip": 1.06562734, + "balance_loss_mlp": 1.04552317, + "epoch": 0.03523222606343003, + "flos": 20296890564480.0, + "grad_norm": 2.196660024103635, + "language_loss": 0.87644351, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.89973044, + "num_input_tokens_seen": 12366030, + "step": 586, + "time_per_iteration": 2.5486221313476562 + }, + { + "auxiliary_loss_clip": 0.01256485, + "auxiliary_loss_mlp": 0.0108018, + "balance_loss_clip": 1.06803596, + "balance_loss_mlp": 1.04918551, + "epoch": 0.035292349316098, + "flos": 20375822701440.0, + "grad_norm": 2.1222089199850878, + "language_loss": 0.76079381, + "learning_rate": 3.999706353928965e-06, + "loss": 0.78416049, + "num_input_tokens_seen": 12384895, + "step": 587, + "time_per_iteration": 2.5923714637756348 + }, + { + "auxiliary_loss_clip": 0.01257125, + "auxiliary_loss_mlp": 0.01068649, + "balance_loss_clip": 1.06683922, + "balance_loss_mlp": 1.03586686, + "epoch": 0.03535247256876597, + "flos": 21468871520640.0, + "grad_norm": 2.212352192395094, + "language_loss": 0.78601038, + "learning_rate": 3.999699642403449e-06, + "loss": 0.80926806, + "num_input_tokens_seen": 12404980, + "step": 588, + "time_per_iteration": 2.579280138015747 + }, + { + "auxiliary_loss_clip": 0.0125398, + "auxiliary_loss_mlp": 0.0107827, + "balance_loss_clip": 1.06582928, + "balance_loss_mlp": 1.04367518, + "epoch": 0.03541259582143394, + "flos": 23623044946560.0, + "grad_norm": 2.153589114745919, + "language_loss": 0.94312829, + "learning_rate": 3.99969285504912e-06, + "loss": 0.96645081, + "num_input_tokens_seen": 12423835, + "step": 589, + "time_per_iteration": 2.5964701175689697 + }, + { + "auxiliary_loss_clip": 0.01256884, + "auxiliary_loss_mlp": 0.01078108, + "balance_loss_clip": 1.06697679, + "balance_loss_mlp": 1.04666042, + "epoch": 0.03547271907410191, + "flos": 33726367768320.0, + "grad_norm": 2.1162556876212695, + "language_loss": 0.84116042, + "learning_rate": 3.99968599186624e-06, + "loss": 0.8645103, + "num_input_tokens_seen": 12443135, + "step": 590, + "time_per_iteration": 2.746436357498169 + }, + { + "auxiliary_loss_clip": 0.01249398, + "auxiliary_loss_mlp": 0.01068452, + "balance_loss_clip": 1.06658125, + "balance_loss_mlp": 1.03893578, + "epoch": 0.03553284232676988, + "flos": 21142695093120.0, + "grad_norm": 1.984522351394552, + "language_loss": 0.8684091, + "learning_rate": 3.999679052855065e-06, + "loss": 0.89158762, + "num_input_tokens_seen": 12462895, + "step": 591, + "time_per_iteration": 2.692303419113159 + }, + { + "auxiliary_loss_clip": 0.01250641, + "auxiliary_loss_mlp": 0.01082122, + "balance_loss_clip": 1.06297326, + "balance_loss_mlp": 1.04883862, + "epoch": 0.03559296557943785, + "flos": 20046593617920.0, + "grad_norm": 2.0873185001780783, + "language_loss": 0.83075488, + "learning_rate": 3.999672038015861e-06, + "loss": 0.85408247, + "num_input_tokens_seen": 12481515, + "step": 592, + "time_per_iteration": 2.7822203636169434 + }, + { + "auxiliary_loss_clip": 0.01146211, + "auxiliary_loss_mlp": 0.01034159, + "balance_loss_clip": 1.05013406, + "balance_loss_mlp": 1.02676773, + "epoch": 0.035653088832105814, + "flos": 60334597244160.0, + "grad_norm": 0.8804992705477848, + "language_loss": 0.59754086, + "learning_rate": 3.999664947348893e-06, + "loss": 0.61934447, + "num_input_tokens_seen": 12548220, + "step": 593, + "time_per_iteration": 3.274080276489258 + }, + { + "auxiliary_loss_clip": 0.01249386, + "auxiliary_loss_mlp": 0.0107742, + "balance_loss_clip": 1.06737614, + "balance_loss_mlp": 1.04473329, + "epoch": 0.035713212084773786, + "flos": 20113135562880.0, + "grad_norm": 1.8086551314359374, + "language_loss": 0.87077361, + "learning_rate": 3.999657780854429e-06, + "loss": 0.89404166, + "num_input_tokens_seen": 12566105, + "step": 594, + "time_per_iteration": 2.682236671447754 + }, + { + "auxiliary_loss_clip": 0.012487, + "auxiliary_loss_mlp": 0.01082358, + "balance_loss_clip": 1.06235993, + "balance_loss_mlp": 1.05057716, + "epoch": 0.03577333533744176, + "flos": 26285785084800.0, + "grad_norm": 5.516524335860627, + "language_loss": 0.83920246, + "learning_rate": 3.999650538532742e-06, + "loss": 0.86251307, + "num_input_tokens_seen": 12586680, + "step": 595, + "time_per_iteration": 2.773669481277466 + }, + { + "auxiliary_loss_clip": 0.01248678, + "auxiliary_loss_mlp": 0.01090544, + "balance_loss_clip": 1.06579614, + "balance_loss_mlp": 1.05850017, + "epoch": 0.035833458590109724, + "flos": 10889732211840.0, + "grad_norm": 2.3448814752825204, + "language_loss": 0.96041518, + "learning_rate": 3.999643220384106e-06, + "loss": 0.98380733, + "num_input_tokens_seen": 12601605, + "step": 596, + "time_per_iteration": 2.6541590690612793 + }, + { + "auxiliary_loss_clip": 0.01252662, + "auxiliary_loss_mlp": 0.01081887, + "balance_loss_clip": 1.0675534, + "balance_loss_mlp": 1.05165553, + "epoch": 0.035893581842777696, + "flos": 22090198003200.0, + "grad_norm": 2.4353221882859004, + "language_loss": 0.82993281, + "learning_rate": 3.999635826408799e-06, + "loss": 0.85327828, + "num_input_tokens_seen": 12620365, + "step": 597, + "time_per_iteration": 2.7023818492889404 + }, + { + "auxiliary_loss_clip": 0.01247839, + "auxiliary_loss_mlp": 0.01079829, + "balance_loss_clip": 1.0668776, + "balance_loss_mlp": 1.04766583, + "epoch": 0.03595370509544566, + "flos": 23038347358080.0, + "grad_norm": 2.374757318483944, + "language_loss": 0.81364304, + "learning_rate": 3.999628356607101e-06, + "loss": 0.83691972, + "num_input_tokens_seen": 12641140, + "step": 598, + "time_per_iteration": 2.731229782104492 + }, + { + "auxiliary_loss_clip": 0.01243692, + "auxiliary_loss_mlp": 0.01077827, + "balance_loss_clip": 1.0663228, + "balance_loss_mlp": 1.04587913, + "epoch": 0.03601382834811363, + "flos": 20777734955520.0, + "grad_norm": 1.817680341814684, + "language_loss": 0.81172699, + "learning_rate": 3.999620810979295e-06, + "loss": 0.83494222, + "num_input_tokens_seen": 12661080, + "step": 599, + "time_per_iteration": 2.710191011428833 + }, + { + "auxiliary_loss_clip": 0.01250419, + "auxiliary_loss_mlp": 0.01074577, + "balance_loss_clip": 1.06356514, + "balance_loss_mlp": 1.045228, + "epoch": 0.036073951600781605, + "flos": 23951627585280.0, + "grad_norm": 2.3963649020429627, + "language_loss": 0.8651731, + "learning_rate": 3.999613189525668e-06, + "loss": 0.88842309, + "num_input_tokens_seen": 12678270, + "step": 600, + "time_per_iteration": 2.682262420654297 + }, + { + "auxiliary_loss_clip": 0.01241882, + "auxiliary_loss_mlp": 0.01084809, + "balance_loss_clip": 1.05918193, + "balance_loss_mlp": 1.05297971, + "epoch": 0.03613407485344957, + "flos": 18912283050240.0, + "grad_norm": 2.0308947613075423, + "language_loss": 0.82355881, + "learning_rate": 3.999605492246508e-06, + "loss": 0.84682572, + "num_input_tokens_seen": 12697295, + "step": 601, + "time_per_iteration": 2.6570894718170166 + }, + { + "auxiliary_loss_clip": 0.01240868, + "auxiliary_loss_mlp": 0.010708, + "balance_loss_clip": 1.06129336, + "balance_loss_mlp": 1.03920949, + "epoch": 0.03619419810611754, + "flos": 23038526926080.0, + "grad_norm": 2.3080142694085555, + "language_loss": 0.7502507, + "learning_rate": 3.999597719142107e-06, + "loss": 0.77336735, + "num_input_tokens_seen": 12716165, + "step": 602, + "time_per_iteration": 2.6434237957000732 + }, + { + "auxiliary_loss_clip": 0.01239543, + "auxiliary_loss_mlp": 0.01066859, + "balance_loss_clip": 1.0604254, + "balance_loss_mlp": 1.03562629, + "epoch": 0.03625432135878551, + "flos": 29457774293760.0, + "grad_norm": 1.9681237382646195, + "language_loss": 0.79599822, + "learning_rate": 3.999589870212761e-06, + "loss": 0.81906223, + "num_input_tokens_seen": 12735475, + "step": 603, + "time_per_iteration": 2.7201666831970215 + }, + { + "auxiliary_loss_clip": 0.01244834, + "auxiliary_loss_mlp": 0.01071177, + "balance_loss_clip": 1.06545615, + "balance_loss_mlp": 1.04130292, + "epoch": 0.03631444461145348, + "flos": 23508525409920.0, + "grad_norm": 1.8363641170913294, + "language_loss": 0.86668456, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.88984472, + "num_input_tokens_seen": 12754540, + "step": 604, + "time_per_iteration": 2.60249924659729 + }, + { + "auxiliary_loss_clip": 0.01248906, + "auxiliary_loss_mlp": 0.01072985, + "balance_loss_clip": 1.0674324, + "balance_loss_mlp": 1.04010737, + "epoch": 0.03637456786412145, + "flos": 16618130323200.0, + "grad_norm": 2.510130211393037, + "language_loss": 0.80746496, + "learning_rate": 3.999573944880424e-06, + "loss": 0.83068383, + "num_input_tokens_seen": 12773050, + "step": 605, + "time_per_iteration": 2.766684055328369 + }, + { + "auxiliary_loss_clip": 0.01244274, + "auxiliary_loss_mlp": 0.0107873, + "balance_loss_clip": 1.0630821, + "balance_loss_mlp": 1.04846251, + "epoch": 0.03643469111678942, + "flos": 15851832549120.0, + "grad_norm": 2.2216143800596835, + "language_loss": 0.85942292, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.882653, + "num_input_tokens_seen": 12791240, + "step": 606, + "time_per_iteration": 2.6133925914764404 + }, + { + "auxiliary_loss_clip": 0.01247732, + "auxiliary_loss_mlp": 0.01077404, + "balance_loss_clip": 1.06413972, + "balance_loss_mlp": 1.04588532, + "epoch": 0.03649481436945739, + "flos": 23620387340160.0, + "grad_norm": 2.0684825764003394, + "language_loss": 0.82179952, + "learning_rate": 3.999557716251912e-06, + "loss": 0.84505081, + "num_input_tokens_seen": 12812245, + "step": 607, + "time_per_iteration": 2.6805856227874756 + }, + { + "auxiliary_loss_clip": 0.01245394, + "auxiliary_loss_mlp": 0.01073743, + "balance_loss_clip": 1.06585169, + "balance_loss_mlp": 1.04317796, + "epoch": 0.036554937622125354, + "flos": 21755581879680.0, + "grad_norm": 2.3717179235904533, + "language_loss": 0.83567071, + "learning_rate": 3.999549488202358e-06, + "loss": 0.8588621, + "num_input_tokens_seen": 12831085, + "step": 608, + "time_per_iteration": 2.6593453884124756 + }, + { + "auxiliary_loss_clip": 0.01251062, + "auxiliary_loss_mlp": 0.01073705, + "balance_loss_clip": 1.06682992, + "balance_loss_mlp": 1.04006422, + "epoch": 0.036615060874793326, + "flos": 17819772935040.0, + "grad_norm": 2.4795108668903305, + "language_loss": 0.8201133, + "learning_rate": 3.999541184329688e-06, + "loss": 0.84336102, + "num_input_tokens_seen": 12849115, + "step": 609, + "time_per_iteration": 2.6299383640289307 + }, + { + "auxiliary_loss_clip": 0.01255655, + "auxiliary_loss_mlp": 0.01091893, + "balance_loss_clip": 1.07322037, + "balance_loss_mlp": 1.06158984, + "epoch": 0.0366751841274613, + "flos": 26753808320640.0, + "grad_norm": 1.992640540297191, + "language_loss": 0.79448462, + "learning_rate": 3.999532804634215e-06, + "loss": 0.81796008, + "num_input_tokens_seen": 12868005, + "step": 610, + "time_per_iteration": 2.65120530128479 + }, + { + "auxiliary_loss_clip": 0.01254423, + "auxiliary_loss_mlp": 0.01088228, + "balance_loss_clip": 1.06914616, + "balance_loss_mlp": 1.05656588, + "epoch": 0.03673530738012926, + "flos": 22196960202240.0, + "grad_norm": 1.9328503999291824, + "language_loss": 0.87282723, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.89625371, + "num_input_tokens_seen": 12886890, + "step": 611, + "time_per_iteration": 2.7398059368133545 + }, + { + "auxiliary_loss_clip": 0.01248885, + "auxiliary_loss_mlp": 0.01097673, + "balance_loss_clip": 1.06917143, + "balance_loss_mlp": 1.06651139, + "epoch": 0.036795430632797235, + "flos": 24681655601280.0, + "grad_norm": 3.7435200854847266, + "language_loss": 0.72589231, + "learning_rate": 3.999515817776136e-06, + "loss": 0.74935788, + "num_input_tokens_seen": 12906130, + "step": 612, + "time_per_iteration": 2.700406551361084 + }, + { + "auxiliary_loss_clip": 0.01249112, + "auxiliary_loss_mlp": 0.01076924, + "balance_loss_clip": 1.06581926, + "balance_loss_mlp": 1.04480934, + "epoch": 0.0368555538854652, + "flos": 17748921358080.0, + "grad_norm": 3.0863603820013434, + "language_loss": 0.79110008, + "learning_rate": 3.999507210614175e-06, + "loss": 0.81436038, + "num_input_tokens_seen": 12925260, + "step": 613, + "time_per_iteration": 2.630472183227539 + }, + { + "auxiliary_loss_clip": 0.01242581, + "auxiliary_loss_mlp": 0.01090278, + "balance_loss_clip": 1.06378841, + "balance_loss_mlp": 1.05961776, + "epoch": 0.03691567713813317, + "flos": 20594554571520.0, + "grad_norm": 2.2015687298668336, + "language_loss": 0.93885028, + "learning_rate": 3.9994985276307e-06, + "loss": 0.96217889, + "num_input_tokens_seen": 12944590, + "step": 614, + "time_per_iteration": 2.6977972984313965 + }, + { + "auxiliary_loss_clip": 0.01254503, + "auxiliary_loss_mlp": 0.01081137, + "balance_loss_clip": 1.07009673, + "balance_loss_mlp": 1.04732919, + "epoch": 0.036975800390801145, + "flos": 33650380546560.0, + "grad_norm": 3.0661216019279576, + "language_loss": 0.72932875, + "learning_rate": 3.999489768826041e-06, + "loss": 0.75268513, + "num_input_tokens_seen": 12964785, + "step": 615, + "time_per_iteration": 2.697291612625122 + }, + { + "auxiliary_loss_clip": 0.01250213, + "auxiliary_loss_mlp": 0.010716, + "balance_loss_clip": 1.06649876, + "balance_loss_mlp": 1.04015231, + "epoch": 0.03703592364346911, + "flos": 28293694329600.0, + "grad_norm": 2.9941392641088695, + "language_loss": 0.81630868, + "learning_rate": 3.999480934200528e-06, + "loss": 0.83952683, + "num_input_tokens_seen": 12986705, + "step": 616, + "time_per_iteration": 4.1762495040893555 + }, + { + "auxiliary_loss_clip": 0.0124999, + "auxiliary_loss_mlp": 0.01076541, + "balance_loss_clip": 1.06807041, + "balance_loss_mlp": 1.0467627, + "epoch": 0.03709604689613708, + "flos": 31504215853440.0, + "grad_norm": 2.320593216419041, + "language_loss": 0.68178958, + "learning_rate": 3.999472023754499e-06, + "loss": 0.70505488, + "num_input_tokens_seen": 13010560, + "step": 617, + "time_per_iteration": 4.224538564682007 + }, + { + "auxiliary_loss_clip": 0.01254259, + "auxiliary_loss_mlp": 0.010771, + "balance_loss_clip": 1.07098567, + "balance_loss_mlp": 1.04415071, + "epoch": 0.03715617014880505, + "flos": 19609381272960.0, + "grad_norm": 2.245411088847763, + "language_loss": 0.80595517, + "learning_rate": 3.99946303748829e-06, + "loss": 0.82926875, + "num_input_tokens_seen": 13028935, + "step": 618, + "time_per_iteration": 4.200341463088989 + }, + { + "auxiliary_loss_clip": 0.01257669, + "auxiliary_loss_mlp": 0.01079294, + "balance_loss_clip": 1.06808555, + "balance_loss_mlp": 1.04605901, + "epoch": 0.03721629340147302, + "flos": 15924192497280.0, + "grad_norm": 10.155035046705617, + "language_loss": 0.91591841, + "learning_rate": 3.999453975402242e-06, + "loss": 0.93928802, + "num_input_tokens_seen": 13046000, + "step": 619, + "time_per_iteration": 2.5787301063537598 + }, + { + "auxiliary_loss_clip": 0.01251145, + "auxiliary_loss_mlp": 0.01083548, + "balance_loss_clip": 1.06999123, + "balance_loss_mlp": 1.05181432, + "epoch": 0.03727641665414099, + "flos": 21104090951040.0, + "grad_norm": 2.0803022158745406, + "language_loss": 0.94071603, + "learning_rate": 3.9994448374967e-06, + "loss": 0.96406299, + "num_input_tokens_seen": 13062995, + "step": 620, + "time_per_iteration": 2.5987205505371094 + }, + { + "auxiliary_loss_clip": 0.01249568, + "auxiliary_loss_mlp": 0.0108317, + "balance_loss_clip": 1.06624317, + "balance_loss_mlp": 1.0502919, + "epoch": 0.037336539906808956, + "flos": 24131683486080.0, + "grad_norm": 1.7431896174296577, + "language_loss": 0.77319217, + "learning_rate": 3.999435623772008e-06, + "loss": 0.79651952, + "num_input_tokens_seen": 13084120, + "step": 621, + "time_per_iteration": 2.68758225440979 + }, + { + "auxiliary_loss_clip": 0.01247252, + "auxiliary_loss_mlp": 0.01071013, + "balance_loss_clip": 1.06894088, + "balance_loss_mlp": 1.03792048, + "epoch": 0.03739666315947693, + "flos": 22346384780160.0, + "grad_norm": 2.3852872810563364, + "language_loss": 0.86546707, + "learning_rate": 3.999426334228518e-06, + "loss": 0.88864976, + "num_input_tokens_seen": 13100035, + "step": 622, + "time_per_iteration": 2.607121467590332 + }, + { + "auxiliary_loss_clip": 0.012499, + "auxiliary_loss_mlp": 0.01072461, + "balance_loss_clip": 1.06715882, + "balance_loss_mlp": 1.04048872, + "epoch": 0.0374567864121449, + "flos": 20449511452800.0, + "grad_norm": 2.2621736327299766, + "language_loss": 0.90008956, + "learning_rate": 3.999416968866581e-06, + "loss": 0.92331314, + "num_input_tokens_seen": 13118070, + "step": 623, + "time_per_iteration": 2.6513512134552 + }, + { + "auxiliary_loss_clip": 0.01251762, + "auxiliary_loss_mlp": 0.01090534, + "balance_loss_clip": 1.07006013, + "balance_loss_mlp": 1.05844235, + "epoch": 0.037516909664812866, + "flos": 19208043636480.0, + "grad_norm": 2.760597076727266, + "language_loss": 0.84095174, + "learning_rate": 3.999407527686551e-06, + "loss": 0.8643747, + "num_input_tokens_seen": 13136355, + "step": 624, + "time_per_iteration": 2.66623592376709 + }, + { + "auxiliary_loss_clip": 0.01252431, + "auxiliary_loss_mlp": 0.01076353, + "balance_loss_clip": 1.06697702, + "balance_loss_mlp": 1.04423809, + "epoch": 0.03757703291748084, + "flos": 35005218664320.0, + "grad_norm": 4.259276014089895, + "language_loss": 0.66778994, + "learning_rate": 3.999398010688788e-06, + "loss": 0.69107783, + "num_input_tokens_seen": 13155435, + "step": 625, + "time_per_iteration": 2.7288877964019775 + }, + { + "auxiliary_loss_clip": 0.01244959, + "auxiliary_loss_mlp": 0.01076274, + "balance_loss_clip": 1.06605244, + "balance_loss_mlp": 1.042943, + "epoch": 0.0376371561701488, + "flos": 25483899911040.0, + "grad_norm": 3.375450269409945, + "language_loss": 0.77496696, + "learning_rate": 3.999388417873652e-06, + "loss": 0.79817927, + "num_input_tokens_seen": 13174295, + "step": 626, + "time_per_iteration": 2.648942470550537 + }, + { + "auxiliary_loss_clip": 0.01249107, + "auxiliary_loss_mlp": 0.0108376, + "balance_loss_clip": 1.06770003, + "balance_loss_mlp": 1.05200303, + "epoch": 0.037697279422816775, + "flos": 18185630912640.0, + "grad_norm": 2.0480468386724766, + "language_loss": 0.81463408, + "learning_rate": 3.999378749241506e-06, + "loss": 0.83796275, + "num_input_tokens_seen": 13192500, + "step": 627, + "time_per_iteration": 2.6209845542907715 + }, + { + "auxiliary_loss_clip": 0.01254363, + "auxiliary_loss_mlp": 0.01084942, + "balance_loss_clip": 1.07041132, + "balance_loss_mlp": 1.05215955, + "epoch": 0.03775740267548475, + "flos": 24644272521600.0, + "grad_norm": 1.6934072791943036, + "language_loss": 0.88809037, + "learning_rate": 3.999369004792719e-06, + "loss": 0.91148341, + "num_input_tokens_seen": 13213470, + "step": 628, + "time_per_iteration": 2.7221415042877197 + }, + { + "auxiliary_loss_clip": 0.01247303, + "auxiliary_loss_mlp": 0.01080197, + "balance_loss_clip": 1.0627017, + "balance_loss_mlp": 1.04765344, + "epoch": 0.03781752592815271, + "flos": 21288205088640.0, + "grad_norm": 2.536151380104699, + "language_loss": 0.79840028, + "learning_rate": 3.999359184527658e-06, + "loss": 0.82167524, + "num_input_tokens_seen": 13232365, + "step": 629, + "time_per_iteration": 2.6535024642944336 + }, + { + "auxiliary_loss_clip": 0.01249218, + "auxiliary_loss_mlp": 0.0106958, + "balance_loss_clip": 1.06675959, + "balance_loss_mlp": 1.03885961, + "epoch": 0.037877649180820684, + "flos": 22089623385600.0, + "grad_norm": 1.6861994278356789, + "language_loss": 0.76824844, + "learning_rate": 3.999349288446696e-06, + "loss": 0.79143643, + "num_input_tokens_seen": 13251920, + "step": 630, + "time_per_iteration": 2.6175966262817383 + }, + { + "auxiliary_loss_clip": 0.01254291, + "auxiliary_loss_mlp": 0.01075963, + "balance_loss_clip": 1.06833327, + "balance_loss_mlp": 1.04504025, + "epoch": 0.03793777243348865, + "flos": 14501339976960.0, + "grad_norm": 3.12435515576561, + "language_loss": 0.91593724, + "learning_rate": 3.99933931655021e-06, + "loss": 0.93923974, + "num_input_tokens_seen": 13267440, + "step": 631, + "time_per_iteration": 2.565293788909912 + }, + { + "auxiliary_loss_clip": 0.01243525, + "auxiliary_loss_mlp": 0.01087901, + "balance_loss_clip": 1.06386209, + "balance_loss_mlp": 1.05356884, + "epoch": 0.03799789568615662, + "flos": 21908418249600.0, + "grad_norm": 1.6822536287963328, + "language_loss": 0.92157543, + "learning_rate": 3.999329268838575e-06, + "loss": 0.94488978, + "num_input_tokens_seen": 13287850, + "step": 632, + "time_per_iteration": 2.6235203742980957 + }, + { + "auxiliary_loss_clip": 0.01248362, + "auxiliary_loss_mlp": 0.01067296, + "balance_loss_clip": 1.06696796, + "balance_loss_mlp": 1.03613472, + "epoch": 0.03805801893882459, + "flos": 24827021942400.0, + "grad_norm": 2.1097171792430456, + "language_loss": 0.83139223, + "learning_rate": 3.999319145312175e-06, + "loss": 0.85454881, + "num_input_tokens_seen": 13307760, + "step": 633, + "time_per_iteration": 2.6461985111236572 + }, + { + "auxiliary_loss_clip": 0.01247735, + "auxiliary_loss_mlp": 0.01079895, + "balance_loss_clip": 1.06473529, + "balance_loss_mlp": 1.04811358, + "epoch": 0.03811814219149256, + "flos": 30482952364800.0, + "grad_norm": 1.599115294194595, + "language_loss": 0.69883299, + "learning_rate": 3.999308945971392e-06, + "loss": 0.72210932, + "num_input_tokens_seen": 13331230, + "step": 634, + "time_per_iteration": 2.709033727645874 + }, + { + "auxiliary_loss_clip": 0.01133204, + "auxiliary_loss_mlp": 0.01009504, + "balance_loss_clip": 1.04124916, + "balance_loss_mlp": 1.00249422, + "epoch": 0.03817826544416053, + "flos": 66992577379200.0, + "grad_norm": 0.893126545279708, + "language_loss": 0.61645919, + "learning_rate": 3.999298670816614e-06, + "loss": 0.63788629, + "num_input_tokens_seen": 13394760, + "step": 635, + "time_per_iteration": 3.2099475860595703 + }, + { + "auxiliary_loss_clip": 0.01244276, + "auxiliary_loss_mlp": 0.01072984, + "balance_loss_clip": 1.06475401, + "balance_loss_mlp": 1.04129851, + "epoch": 0.038238388696828496, + "flos": 20485350247680.0, + "grad_norm": 2.0563589539657205, + "language_loss": 0.83629507, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.85946769, + "num_input_tokens_seen": 13412775, + "step": 636, + "time_per_iteration": 2.6278960704803467 + }, + { + "auxiliary_loss_clip": 0.01248078, + "auxiliary_loss_mlp": 0.01096471, + "balance_loss_clip": 1.06714165, + "balance_loss_mlp": 1.06530952, + "epoch": 0.03829851194949647, + "flos": 17965893461760.0, + "grad_norm": 2.346379148367956, + "language_loss": 0.79578567, + "learning_rate": 3.999277893066632e-06, + "loss": 0.81923115, + "num_input_tokens_seen": 13427835, + "step": 637, + "time_per_iteration": 2.646414279937744 + }, + { + "auxiliary_loss_clip": 0.01247939, + "auxiliary_loss_mlp": 0.01088528, + "balance_loss_clip": 1.06356907, + "balance_loss_mlp": 1.0562222, + "epoch": 0.03835863520216444, + "flos": 22456522857600.0, + "grad_norm": 1.9563283234999833, + "language_loss": 0.83989692, + "learning_rate": 3.999267390472215e-06, + "loss": 0.86326158, + "num_input_tokens_seen": 13447295, + "step": 638, + "time_per_iteration": 2.6416285037994385 + }, + { + "auxiliary_loss_clip": 0.01253172, + "auxiliary_loss_mlp": 0.01074704, + "balance_loss_clip": 1.06563985, + "balance_loss_mlp": 1.04163575, + "epoch": 0.038418758454832405, + "flos": 22164425458560.0, + "grad_norm": 2.5596504471077224, + "language_loss": 0.70109725, + "learning_rate": 3.999256812065381e-06, + "loss": 0.72437602, + "num_input_tokens_seen": 13468455, + "step": 639, + "time_per_iteration": 2.610682487487793 + }, + { + "auxiliary_loss_clip": 0.01248829, + "auxiliary_loss_mlp": 0.01081808, + "balance_loss_clip": 1.06618333, + "balance_loss_mlp": 1.04790449, + "epoch": 0.03847888170750038, + "flos": 22747435107840.0, + "grad_norm": 2.5791624605537082, + "language_loss": 0.85322344, + "learning_rate": 3.999246157846526e-06, + "loss": 0.87652987, + "num_input_tokens_seen": 13489085, + "step": 640, + "time_per_iteration": 2.700456380844116 + }, + { + "auxiliary_loss_clip": 0.01252579, + "auxiliary_loss_mlp": 0.01083722, + "balance_loss_clip": 1.06751871, + "balance_loss_mlp": 1.04934239, + "epoch": 0.03853900496016834, + "flos": 22711201263360.0, + "grad_norm": 2.331268680461456, + "language_loss": 0.82141805, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.84478104, + "num_input_tokens_seen": 13509120, + "step": 641, + "time_per_iteration": 2.6572046279907227 + }, + { + "auxiliary_loss_clip": 0.0112759, + "auxiliary_loss_mlp": 0.01008008, + "balance_loss_clip": 1.03825259, + "balance_loss_mlp": 1.00095105, + "epoch": 0.038599128212836314, + "flos": 70399136355840.0, + "grad_norm": 0.9037629700551453, + "language_loss": 0.65444964, + "learning_rate": 3.999224621974381e-06, + "loss": 0.67580563, + "num_input_tokens_seen": 13562005, + "step": 642, + "time_per_iteration": 3.199925422668457 + }, + { + "auxiliary_loss_clip": 0.01246698, + "auxiliary_loss_mlp": 0.01064563, + "balance_loss_clip": 1.0651319, + "balance_loss_mlp": 1.03453398, + "epoch": 0.03865925146550429, + "flos": 23295144666240.0, + "grad_norm": 1.9113268312481755, + "language_loss": 0.79272145, + "learning_rate": 3.999213740321906e-06, + "loss": 0.81583405, + "num_input_tokens_seen": 13582185, + "step": 643, + "time_per_iteration": 2.641437292098999 + }, + { + "auxiliary_loss_clip": 0.01244786, + "auxiliary_loss_mlp": 0.01076057, + "balance_loss_clip": 1.06219232, + "balance_loss_mlp": 1.04599261, + "epoch": 0.03871937471817225, + "flos": 21430446946560.0, + "grad_norm": 2.2104774200729262, + "language_loss": 0.8294487, + "learning_rate": 3.999202782859046e-06, + "loss": 0.85265714, + "num_input_tokens_seen": 13599555, + "step": 644, + "time_per_iteration": 2.600558280944824 + }, + { + "auxiliary_loss_clip": 0.01247273, + "auxiliary_loss_mlp": 0.01074554, + "balance_loss_clip": 1.06383467, + "balance_loss_mlp": 1.04193854, + "epoch": 0.038779497970840224, + "flos": 34277309550720.0, + "grad_norm": 1.994902925690418, + "language_loss": 0.82286513, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.8460834, + "num_input_tokens_seen": 13621160, + "step": 645, + "time_per_iteration": 2.6751983165740967 + }, + { + "auxiliary_loss_clip": 0.01248631, + "auxiliary_loss_mlp": 0.01070807, + "balance_loss_clip": 1.06525111, + "balance_loss_mlp": 1.03890657, + "epoch": 0.03883962122350819, + "flos": 22748189293440.0, + "grad_norm": 2.290384247239265, + "language_loss": 0.81889713, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.84209144, + "num_input_tokens_seen": 13641915, + "step": 646, + "time_per_iteration": 2.6987667083740234 + }, + { + "auxiliary_loss_clip": 0.01250204, + "auxiliary_loss_mlp": 0.01078836, + "balance_loss_clip": 1.06982899, + "balance_loss_mlp": 1.04791331, + "epoch": 0.03889974447617616, + "flos": 21945837242880.0, + "grad_norm": 1.9171219640425325, + "language_loss": 0.82015383, + "learning_rate": 3.999169455612323e-06, + "loss": 0.84344423, + "num_input_tokens_seen": 13661410, + "step": 647, + "time_per_iteration": 2.590102195739746 + }, + { + "auxiliary_loss_clip": 0.0124696, + "auxiliary_loss_mlp": 0.01072111, + "balance_loss_clip": 1.06628954, + "balance_loss_mlp": 1.04216528, + "epoch": 0.03895986772884413, + "flos": 31504826384640.0, + "grad_norm": 1.9398424653049293, + "language_loss": 0.84477997, + "learning_rate": 3.999158194912106e-06, + "loss": 0.86797059, + "num_input_tokens_seen": 13681705, + "step": 648, + "time_per_iteration": 2.7516121864318848 + }, + { + "auxiliary_loss_clip": 0.01244808, + "auxiliary_loss_mlp": 0.0107293, + "balance_loss_clip": 1.06524062, + "balance_loss_mlp": 1.04210222, + "epoch": 0.0390199909815121, + "flos": 19901011795200.0, + "grad_norm": 2.3870859420748136, + "language_loss": 0.84254295, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.86572027, + "num_input_tokens_seen": 13700400, + "step": 649, + "time_per_iteration": 2.6116180419921875 + }, + { + "auxiliary_loss_clip": 0.01246653, + "auxiliary_loss_mlp": 0.01073574, + "balance_loss_clip": 1.06560743, + "balance_loss_mlp": 1.0416739, + "epoch": 0.03908011423418007, + "flos": 21612478095360.0, + "grad_norm": 2.00775905451926, + "language_loss": 0.79783499, + "learning_rate": 3.999135446087263e-06, + "loss": 0.82103723, + "num_input_tokens_seen": 13720145, + "step": 650, + "time_per_iteration": 2.574939727783203 + }, + { + "auxiliary_loss_clip": 0.01242721, + "auxiliary_loss_mlp": 0.01077536, + "balance_loss_clip": 1.06209707, + "balance_loss_mlp": 1.04534984, + "epoch": 0.039140237486848035, + "flos": 18661411486080.0, + "grad_norm": 2.334811800093409, + "language_loss": 0.78698987, + "learning_rate": 3.9991239579635e-06, + "loss": 0.81019247, + "num_input_tokens_seen": 13737500, + "step": 651, + "time_per_iteration": 2.5930917263031006 + }, + { + "auxiliary_loss_clip": 0.0124425, + "auxiliary_loss_mlp": 0.010838, + "balance_loss_clip": 1.06317663, + "balance_loss_mlp": 1.05087411, + "epoch": 0.03920036073951601, + "flos": 18661124177280.0, + "grad_norm": 3.361008988618244, + "language_loss": 0.87392938, + "learning_rate": 3.999112394032757e-06, + "loss": 0.89720988, + "num_input_tokens_seen": 13754750, + "step": 652, + "time_per_iteration": 2.6072869300842285 + }, + { + "auxiliary_loss_clip": 0.01239638, + "auxiliary_loss_mlp": 0.01073938, + "balance_loss_clip": 1.06362963, + "balance_loss_mlp": 1.0434916, + "epoch": 0.03926048399218398, + "flos": 31354468053120.0, + "grad_norm": 2.6218665998754904, + "language_loss": 0.79297256, + "learning_rate": 3.999100754295471e-06, + "loss": 0.81610829, + "num_input_tokens_seen": 13771990, + "step": 653, + "time_per_iteration": 2.626145362854004 + }, + { + "auxiliary_loss_clip": 0.01250652, + "auxiliary_loss_mlp": 0.01075546, + "balance_loss_clip": 1.06496143, + "balance_loss_mlp": 1.04374111, + "epoch": 0.039320607244851945, + "flos": 29603499770880.0, + "grad_norm": 2.0720296605490094, + "language_loss": 0.85909009, + "learning_rate": 3.999089038752085e-06, + "loss": 0.88235211, + "num_input_tokens_seen": 13792750, + "step": 654, + "time_per_iteration": 2.6775124073028564 + }, + { + "auxiliary_loss_clip": 0.01126661, + "auxiliary_loss_mlp": 0.01016641, + "balance_loss_clip": 1.03977203, + "balance_loss_mlp": 1.01001298, + "epoch": 0.03938073049751992, + "flos": 66534609951360.0, + "grad_norm": 0.7366259780501333, + "language_loss": 0.4997997, + "learning_rate": 3.999077247403041e-06, + "loss": 0.52123272, + "num_input_tokens_seen": 13858570, + "step": 655, + "time_per_iteration": 3.3006510734558105 + }, + { + "auxiliary_loss_clip": 0.01241143, + "auxiliary_loss_mlp": 0.01076374, + "balance_loss_clip": 1.0658412, + "balance_loss_mlp": 1.04680991, + "epoch": 0.03944085375018788, + "flos": 23367827836800.0, + "grad_norm": 4.17474796245144, + "language_loss": 0.80903178, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.83220696, + "num_input_tokens_seen": 13876335, + "step": 656, + "time_per_iteration": 4.228931427001953 + }, + { + "auxiliary_loss_clip": 0.01251519, + "auxiliary_loss_mlp": 0.01093573, + "balance_loss_clip": 1.06740427, + "balance_loss_mlp": 1.05802524, + "epoch": 0.039500977002855854, + "flos": 18548292579840.0, + "grad_norm": 2.068956760077258, + "language_loss": 0.76289558, + "learning_rate": 3.999053437289776e-06, + "loss": 0.7863465, + "num_input_tokens_seen": 13892640, + "step": 657, + "time_per_iteration": 4.218473434448242 + }, + { + "auxiliary_loss_clip": 0.0124824, + "auxiliary_loss_mlp": 0.01076812, + "balance_loss_clip": 1.06641233, + "balance_loss_mlp": 1.04522133, + "epoch": 0.039561100255523826, + "flos": 25338174433920.0, + "grad_norm": 2.07475431213476, + "language_loss": 0.8179062, + "learning_rate": 3.999041418526457e-06, + "loss": 0.84115672, + "num_input_tokens_seen": 13910085, + "step": 658, + "time_per_iteration": 2.671675682067871 + }, + { + "auxiliary_loss_clip": 0.01242678, + "auxiliary_loss_mlp": 0.01077963, + "balance_loss_clip": 1.06347871, + "balance_loss_mlp": 1.0454669, + "epoch": 0.03962122350819179, + "flos": 18219889509120.0, + "grad_norm": 2.2444983110753625, + "language_loss": 0.90790772, + "learning_rate": 3.999029323959287e-06, + "loss": 0.93111408, + "num_input_tokens_seen": 13928800, + "step": 659, + "time_per_iteration": 4.2601988315582275 + }, + { + "auxiliary_loss_clip": 0.01247633, + "auxiliary_loss_mlp": 0.01073069, + "balance_loss_clip": 1.06654835, + "balance_loss_mlp": 1.04215825, + "epoch": 0.03968134676085976, + "flos": 20522230536960.0, + "grad_norm": 2.2083626038373656, + "language_loss": 0.79760063, + "learning_rate": 3.999017153588724e-06, + "loss": 0.82080764, + "num_input_tokens_seen": 13948325, + "step": 660, + "time_per_iteration": 2.62716007232666 + }, + { + "auxiliary_loss_clip": 0.01246027, + "auxiliary_loss_mlp": 0.01077579, + "balance_loss_clip": 1.0675652, + "balance_loss_mlp": 1.0456785, + "epoch": 0.03974147001352773, + "flos": 22422587483520.0, + "grad_norm": 1.6747851381362888, + "language_loss": 0.81757367, + "learning_rate": 3.999004907415231e-06, + "loss": 0.8408097, + "num_input_tokens_seen": 13969090, + "step": 661, + "time_per_iteration": 2.645423412322998 + }, + { + "auxiliary_loss_clip": 0.01119895, + "auxiliary_loss_mlp": 0.01007167, + "balance_loss_clip": 1.03320217, + "balance_loss_mlp": 1.00077713, + "epoch": 0.0398015932661957, + "flos": 71128769322240.0, + "grad_norm": 0.9117564509831767, + "language_loss": 0.69349593, + "learning_rate": 3.998992585439272e-06, + "loss": 0.71476656, + "num_input_tokens_seen": 14037555, + "step": 662, + "time_per_iteration": 3.3032331466674805 + }, + { + "auxiliary_loss_clip": 0.01249217, + "auxiliary_loss_mlp": 0.01074722, + "balance_loss_clip": 1.06995225, + "balance_loss_mlp": 1.04322648, + "epoch": 0.03986171651886367, + "flos": 16800951571200.0, + "grad_norm": 2.160679749799672, + "language_loss": 0.82765651, + "learning_rate": 3.998980187661314e-06, + "loss": 0.85089582, + "num_input_tokens_seen": 14055765, + "step": 663, + "time_per_iteration": 2.6217782497406006 + }, + { + "auxiliary_loss_clip": 0.01252759, + "auxiliary_loss_mlp": 0.01063705, + "balance_loss_clip": 1.06966817, + "balance_loss_mlp": 1.03254378, + "epoch": 0.03992183977153164, + "flos": 24535068197760.0, + "grad_norm": 2.19374813563436, + "language_loss": 0.87302262, + "learning_rate": 3.998967714081826e-06, + "loss": 0.89618725, + "num_input_tokens_seen": 14074195, + "step": 664, + "time_per_iteration": 2.6729183197021484 + }, + { + "auxiliary_loss_clip": 0.01241647, + "auxiliary_loss_mlp": 0.0106515, + "balance_loss_clip": 1.06656313, + "balance_loss_mlp": 1.03346384, + "epoch": 0.03998196302419961, + "flos": 15595897167360.0, + "grad_norm": 2.036983550581997, + "language_loss": 0.84821391, + "learning_rate": 3.998955164701281e-06, + "loss": 0.87128186, + "num_input_tokens_seen": 14090215, + "step": 665, + "time_per_iteration": 2.593832015991211 + }, + { + "auxiliary_loss_clip": 0.012521, + "auxiliary_loss_mlp": 0.01085682, + "balance_loss_clip": 1.06867695, + "balance_loss_mlp": 1.05223155, + "epoch": 0.04004208627686758, + "flos": 25305065072640.0, + "grad_norm": 2.172699570421913, + "language_loss": 0.81745672, + "learning_rate": 3.998942539520158e-06, + "loss": 0.8408345, + "num_input_tokens_seen": 14112150, + "step": 666, + "time_per_iteration": 2.6743290424346924 + }, + { + "auxiliary_loss_clip": 0.01241565, + "auxiliary_loss_mlp": 0.01073617, + "balance_loss_clip": 1.06443083, + "balance_loss_mlp": 1.04007161, + "epoch": 0.04010220952953555, + "flos": 23475847011840.0, + "grad_norm": 2.1003520396389828, + "language_loss": 0.87117827, + "learning_rate": 3.998929838538932e-06, + "loss": 0.89433014, + "num_input_tokens_seen": 14131475, + "step": 667, + "time_per_iteration": 2.6147067546844482 + }, + { + "auxiliary_loss_clip": 0.0124275, + "auxiliary_loss_mlp": 0.01071583, + "balance_loss_clip": 1.07009172, + "balance_loss_mlp": 1.04161382, + "epoch": 0.04016233278220352, + "flos": 18617025254400.0, + "grad_norm": 2.331266403294307, + "language_loss": 0.80641299, + "learning_rate": 3.998917061758087e-06, + "loss": 0.82955635, + "num_input_tokens_seen": 14146165, + "step": 668, + "time_per_iteration": 2.6015820503234863 + }, + { + "auxiliary_loss_clip": 0.01115034, + "auxiliary_loss_mlp": 0.01008949, + "balance_loss_clip": 1.02975297, + "balance_loss_mlp": 1.00317907, + "epoch": 0.040222456034871484, + "flos": 70906194696960.0, + "grad_norm": 0.7870483750596657, + "language_loss": 0.60066259, + "learning_rate": 3.998904209178107e-06, + "loss": 0.62190247, + "num_input_tokens_seen": 14215005, + "step": 669, + "time_per_iteration": 3.2993202209472656 + }, + { + "auxiliary_loss_clip": 0.01242272, + "auxiliary_loss_mlp": 0.01071485, + "balance_loss_clip": 1.06408751, + "balance_loss_mlp": 1.04120564, + "epoch": 0.040282579287539456, + "flos": 23764712186880.0, + "grad_norm": 1.7022357666604506, + "language_loss": 0.86290276, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.88604033, + "num_input_tokens_seen": 14235510, + "step": 670, + "time_per_iteration": 2.700657844543457 + }, + { + "auxiliary_loss_clip": 0.01242087, + "auxiliary_loss_mlp": 0.01080448, + "balance_loss_clip": 1.06647801, + "balance_loss_mlp": 1.05014467, + "epoch": 0.04034270254020743, + "flos": 18478518410880.0, + "grad_norm": 1.8224152334464152, + "language_loss": 0.75569212, + "learning_rate": 3.998878276622692e-06, + "loss": 0.77891749, + "num_input_tokens_seen": 14254565, + "step": 671, + "time_per_iteration": 2.6698572635650635 + }, + { + "auxiliary_loss_clip": 0.01248936, + "auxiliary_loss_mlp": 0.01076667, + "balance_loss_clip": 1.06943047, + "balance_loss_mlp": 1.04605412, + "epoch": 0.040402825792875394, + "flos": 17201858244480.0, + "grad_norm": 1.9730812981627939, + "language_loss": 0.92416775, + "learning_rate": 3.998865196648242e-06, + "loss": 0.94742376, + "num_input_tokens_seen": 14271885, + "step": 672, + "time_per_iteration": 2.567563533782959 + }, + { + "auxiliary_loss_clip": 0.01245231, + "auxiliary_loss_mlp": 0.010776, + "balance_loss_clip": 1.0677104, + "balance_loss_mlp": 1.04422188, + "epoch": 0.040462949045543366, + "flos": 19172168928000.0, + "grad_norm": 1.800141829654062, + "language_loss": 0.90174723, + "learning_rate": 3.998852040876622e-06, + "loss": 0.92497551, + "num_input_tokens_seen": 14289670, + "step": 673, + "time_per_iteration": 2.547154426574707 + }, + { + "auxiliary_loss_clip": 0.01239752, + "auxiliary_loss_mlp": 0.01084248, + "balance_loss_clip": 1.06466973, + "balance_loss_mlp": 1.05184698, + "epoch": 0.04052307229821133, + "flos": 24019821555840.0, + "grad_norm": 2.3989934860433486, + "language_loss": 0.75016737, + "learning_rate": 3.998838809308334e-06, + "loss": 0.7734074, + "num_input_tokens_seen": 14309285, + "step": 674, + "time_per_iteration": 2.681896924972534 + }, + { + "auxiliary_loss_clip": 0.01249861, + "auxiliary_loss_mlp": 0.01064308, + "balance_loss_clip": 1.06744063, + "balance_loss_mlp": 1.03334963, + "epoch": 0.0405831955508793, + "flos": 16436601964800.0, + "grad_norm": 2.55613513039197, + "language_loss": 0.78289407, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.80603576, + "num_input_tokens_seen": 14328300, + "step": 675, + "time_per_iteration": 2.6965043544769287 + }, + { + "auxiliary_loss_clip": 0.01241749, + "auxiliary_loss_mlp": 0.01079652, + "balance_loss_clip": 1.06532836, + "balance_loss_mlp": 1.04648817, + "epoch": 0.040643318803547275, + "flos": 24279922915200.0, + "grad_norm": 2.047384767684118, + "language_loss": 0.76844448, + "learning_rate": 3.998812118783757e-06, + "loss": 0.79165846, + "num_input_tokens_seen": 14346395, + "step": 676, + "time_per_iteration": 2.6216623783111572 + }, + { + "auxiliary_loss_clip": 0.01248147, + "auxiliary_loss_mlp": 0.01079294, + "balance_loss_clip": 1.06811619, + "balance_loss_mlp": 1.04813254, + "epoch": 0.04070344205621524, + "flos": 17712076982400.0, + "grad_norm": 2.318905665785744, + "language_loss": 0.85139382, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.8746683, + "num_input_tokens_seen": 14364605, + "step": 677, + "time_per_iteration": 2.5663015842437744 + }, + { + "auxiliary_loss_clip": 0.01240385, + "auxiliary_loss_mlp": 0.01070741, + "balance_loss_clip": 1.06558609, + "balance_loss_mlp": 1.03901923, + "epoch": 0.04076356530888321, + "flos": 26177658168960.0, + "grad_norm": 2.5041724349122645, + "language_loss": 0.76572061, + "learning_rate": 3.998785125078559e-06, + "loss": 0.78883183, + "num_input_tokens_seen": 14385265, + "step": 678, + "time_per_iteration": 2.624689817428589 + }, + { + "auxiliary_loss_clip": 0.01240972, + "auxiliary_loss_mlp": 0.01072606, + "balance_loss_clip": 1.06374967, + "balance_loss_mlp": 1.04242194, + "epoch": 0.04082368856155118, + "flos": 35773455772800.0, + "grad_norm": 1.7096242150987748, + "language_loss": 0.82139099, + "learning_rate": 3.998771514534505e-06, + "loss": 0.84452677, + "num_input_tokens_seen": 14406090, + "step": 679, + "time_per_iteration": 2.7073023319244385 + }, + { + "auxiliary_loss_clip": 0.01248879, + "auxiliary_loss_mlp": 0.01064116, + "balance_loss_clip": 1.07185793, + "balance_loss_mlp": 1.0340035, + "epoch": 0.04088381181421915, + "flos": 28146640049280.0, + "grad_norm": 1.963288262989073, + "language_loss": 0.76260424, + "learning_rate": 3.998757828196835e-06, + "loss": 0.78573418, + "num_input_tokens_seen": 14425130, + "step": 680, + "time_per_iteration": 2.6767218112945557 + }, + { + "auxiliary_loss_clip": 0.01244441, + "auxiliary_loss_mlp": 0.01071738, + "balance_loss_clip": 1.06458521, + "balance_loss_mlp": 1.03864551, + "epoch": 0.04094393506688712, + "flos": 27597673514880.0, + "grad_norm": 1.713943858995997, + "language_loss": 0.83089912, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.85406095, + "num_input_tokens_seen": 14447355, + "step": 681, + "time_per_iteration": 2.6386382579803467 + }, + { + "auxiliary_loss_clip": 0.01244279, + "auxiliary_loss_mlp": 0.01073303, + "balance_loss_clip": 1.06438065, + "balance_loss_mlp": 1.04127121, + "epoch": 0.04100405831955509, + "flos": 23112036109440.0, + "grad_norm": 1.706698119772261, + "language_loss": 0.71538687, + "learning_rate": 3.998730228142726e-06, + "loss": 0.7385627, + "num_input_tokens_seen": 14466790, + "step": 682, + "time_per_iteration": 2.618792772293091 + }, + { + "auxiliary_loss_clip": 0.01243156, + "auxiliary_loss_mlp": 0.01078429, + "balance_loss_clip": 1.06440282, + "balance_loss_mlp": 1.04781592, + "epoch": 0.04106418157222306, + "flos": 20156731695360.0, + "grad_norm": 1.6947476714586034, + "language_loss": 0.72599399, + "learning_rate": 3.998716314427333e-06, + "loss": 0.74920982, + "num_input_tokens_seen": 14485195, + "step": 683, + "time_per_iteration": 2.676133394241333 + }, + { + "auxiliary_loss_clip": 0.01241071, + "auxiliary_loss_mlp": 0.01079531, + "balance_loss_clip": 1.07077932, + "balance_loss_mlp": 1.04851258, + "epoch": 0.041124304824891024, + "flos": 17420697855360.0, + "grad_norm": 2.098652785935233, + "language_loss": 0.81419414, + "learning_rate": 3.998702324920417e-06, + "loss": 0.8374002, + "num_input_tokens_seen": 14503370, + "step": 684, + "time_per_iteration": 2.6538476943969727 + }, + { + "auxiliary_loss_clip": 0.01242791, + "auxiliary_loss_mlp": 0.0107365, + "balance_loss_clip": 1.06783867, + "balance_loss_mlp": 1.04139185, + "epoch": 0.041184428077558996, + "flos": 25780163287680.0, + "grad_norm": 1.5053911947555274, + "language_loss": 0.90680599, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.92997038, + "num_input_tokens_seen": 14526415, + "step": 685, + "time_per_iteration": 2.6541450023651123 + }, + { + "auxiliary_loss_clip": 0.01244219, + "auxiliary_loss_mlp": 0.01072481, + "balance_loss_clip": 1.06659365, + "balance_loss_mlp": 1.04093838, + "epoch": 0.04124455133022697, + "flos": 22964766347520.0, + "grad_norm": 2.2251875217653185, + "language_loss": 0.87851977, + "learning_rate": 3.998674118534141e-06, + "loss": 0.90168673, + "num_input_tokens_seen": 14546595, + "step": 686, + "time_per_iteration": 2.7298531532287598 + }, + { + "auxiliary_loss_clip": 0.01247476, + "auxiliary_loss_mlp": 0.01073385, + "balance_loss_clip": 1.06586432, + "balance_loss_mlp": 1.04224789, + "epoch": 0.04130467458289493, + "flos": 21289067015040.0, + "grad_norm": 1.8582614005091855, + "language_loss": 0.7152915, + "learning_rate": 3.998659901655851e-06, + "loss": 0.73850012, + "num_input_tokens_seen": 14566590, + "step": 687, + "time_per_iteration": 2.6284232139587402 + }, + { + "auxiliary_loss_clip": 0.01243582, + "auxiliary_loss_mlp": 0.01076448, + "balance_loss_clip": 1.06979251, + "balance_loss_mlp": 1.04756403, + "epoch": 0.041364797835562905, + "flos": 19974233669760.0, + "grad_norm": 2.596672934278983, + "language_loss": 0.86028284, + "learning_rate": 3.998645608988177e-06, + "loss": 0.88348317, + "num_input_tokens_seen": 14585965, + "step": 688, + "time_per_iteration": 2.522634506225586 + }, + { + "auxiliary_loss_clip": 0.01241593, + "auxiliary_loss_mlp": 0.01079647, + "balance_loss_clip": 1.06802177, + "balance_loss_mlp": 1.04908216, + "epoch": 0.04142492108823087, + "flos": 21906227520000.0, + "grad_norm": 2.852238187591699, + "language_loss": 0.83393514, + "learning_rate": 3.998631240531661e-06, + "loss": 0.85714757, + "num_input_tokens_seen": 14606015, + "step": 689, + "time_per_iteration": 2.6140944957733154 + }, + { + "auxiliary_loss_clip": 0.01238254, + "auxiliary_loss_mlp": 0.01085009, + "balance_loss_clip": 1.06293654, + "balance_loss_mlp": 1.05463421, + "epoch": 0.04148504434089884, + "flos": 27639617621760.0, + "grad_norm": 2.870474577544969, + "language_loss": 0.68398476, + "learning_rate": 3.998616796286848e-06, + "loss": 0.70721734, + "num_input_tokens_seen": 14629955, + "step": 690, + "time_per_iteration": 2.658987522125244 + }, + { + "auxiliary_loss_clip": 0.01235903, + "auxiliary_loss_mlp": 0.01075275, + "balance_loss_clip": 1.0625304, + "balance_loss_mlp": 1.04565191, + "epoch": 0.041545167593566815, + "flos": 20518387781760.0, + "grad_norm": 1.634289561889102, + "language_loss": 0.74927461, + "learning_rate": 3.998602276254286e-06, + "loss": 0.77238643, + "num_input_tokens_seen": 14648000, + "step": 691, + "time_per_iteration": 2.599957227706909 + }, + { + "auxiliary_loss_clip": 0.01239089, + "auxiliary_loss_mlp": 0.01081705, + "balance_loss_clip": 1.06458938, + "balance_loss_mlp": 1.04978108, + "epoch": 0.04160529084623478, + "flos": 11868907939200.0, + "grad_norm": 2.123432521314224, + "language_loss": 0.84469771, + "learning_rate": 3.998587680434526e-06, + "loss": 0.86790562, + "num_input_tokens_seen": 14662235, + "step": 692, + "time_per_iteration": 2.5748491287231445 + }, + { + "auxiliary_loss_clip": 0.01242126, + "auxiliary_loss_mlp": 0.01076613, + "balance_loss_clip": 1.06274796, + "balance_loss_mlp": 1.04409313, + "epoch": 0.04166541409890275, + "flos": 14828306503680.0, + "grad_norm": 2.3463094595874665, + "language_loss": 0.88948715, + "learning_rate": 3.99857300882812e-06, + "loss": 0.91267455, + "num_input_tokens_seen": 14676065, + "step": 693, + "time_per_iteration": 2.569277286529541 + }, + { + "auxiliary_loss_clip": 0.01245438, + "auxiliary_loss_mlp": 0.01071471, + "balance_loss_clip": 1.06845784, + "balance_loss_mlp": 1.04123962, + "epoch": 0.04172553735157072, + "flos": 25808137004160.0, + "grad_norm": 5.499777597079252, + "language_loss": 0.81987685, + "learning_rate": 3.998558261435626e-06, + "loss": 0.84304595, + "num_input_tokens_seen": 14694955, + "step": 694, + "time_per_iteration": 2.6798722743988037 + }, + { + "auxiliary_loss_clip": 0.01242101, + "auxiliary_loss_mlp": 0.01073692, + "balance_loss_clip": 1.06179321, + "balance_loss_mlp": 1.04303181, + "epoch": 0.04178566060423869, + "flos": 24279815174400.0, + "grad_norm": 2.051302362473346, + "language_loss": 0.83672506, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.85988301, + "num_input_tokens_seen": 14715510, + "step": 695, + "time_per_iteration": 2.684537649154663 + }, + { + "auxiliary_loss_clip": 0.01242205, + "auxiliary_loss_mlp": 0.01080004, + "balance_loss_clip": 1.06535804, + "balance_loss_mlp": 1.04822254, + "epoch": 0.04184578385690666, + "flos": 18222008411520.0, + "grad_norm": 2.113561459264794, + "language_loss": 0.84351176, + "learning_rate": 3.99852853929461e-06, + "loss": 0.86673379, + "num_input_tokens_seen": 14731755, + "step": 696, + "time_per_iteration": 4.1141321659088135 + }, + { + "auxiliary_loss_clip": 0.01238462, + "auxiliary_loss_mlp": 0.01083207, + "balance_loss_clip": 1.06265593, + "balance_loss_mlp": 1.05099702, + "epoch": 0.041905907109574626, + "flos": 22776342577920.0, + "grad_norm": 6.921460264787684, + "language_loss": 0.93193012, + "learning_rate": 3.998513564547216e-06, + "loss": 0.95514685, + "num_input_tokens_seen": 14750810, + "step": 697, + "time_per_iteration": 5.71666693687439 + }, + { + "auxiliary_loss_clip": 0.01235964, + "auxiliary_loss_mlp": 0.01074448, + "balance_loss_clip": 1.06324339, + "balance_loss_mlp": 1.04495573, + "epoch": 0.0419660303622426, + "flos": 20156947176960.0, + "grad_norm": 2.1002029886241904, + "language_loss": 0.83775562, + "learning_rate": 3.998498514015987e-06, + "loss": 0.86085975, + "num_input_tokens_seen": 14768435, + "step": 698, + "time_per_iteration": 4.194530010223389 + }, + { + "auxiliary_loss_clip": 0.01239177, + "auxiliary_loss_mlp": 0.01093516, + "balance_loss_clip": 1.06274605, + "balance_loss_mlp": 1.06175828, + "epoch": 0.042026153614910564, + "flos": 23076376882560.0, + "grad_norm": 2.1234669437327955, + "language_loss": 0.91715962, + "learning_rate": 3.998483387701495e-06, + "loss": 0.94048655, + "num_input_tokens_seen": 14786690, + "step": 699, + "time_per_iteration": 2.6399078369140625 + }, + { + "auxiliary_loss_clip": 0.01113327, + "auxiliary_loss_mlp": 0.0102038, + "balance_loss_clip": 1.03020263, + "balance_loss_mlp": 1.01403797, + "epoch": 0.042086276867578536, + "flos": 64495243370880.0, + "grad_norm": 0.9035134571641164, + "language_loss": 0.67873394, + "learning_rate": 3.998468185604312e-06, + "loss": 0.70007098, + "num_input_tokens_seen": 14853840, + "step": 700, + "time_per_iteration": 3.192026376724243 + }, + { + "auxiliary_loss_clip": 0.01246765, + "auxiliary_loss_mlp": 0.01082955, + "balance_loss_clip": 1.06717515, + "balance_loss_mlp": 1.05017269, + "epoch": 0.04214640012024651, + "flos": 15487016065920.0, + "grad_norm": 2.2754848646841888, + "language_loss": 0.884673, + "learning_rate": 3.998452907725016e-06, + "loss": 0.90797025, + "num_input_tokens_seen": 14869580, + "step": 701, + "time_per_iteration": 2.5790441036224365 + }, + { + "auxiliary_loss_clip": 0.01242428, + "auxiliary_loss_mlp": 0.01080259, + "balance_loss_clip": 1.06793952, + "balance_loss_mlp": 1.04833448, + "epoch": 0.04220652337291447, + "flos": 23877040993920.0, + "grad_norm": 2.000128536077818, + "language_loss": 0.67100394, + "learning_rate": 3.998437554064184e-06, + "loss": 0.69423079, + "num_input_tokens_seen": 14891065, + "step": 702, + "time_per_iteration": 2.6247870922088623 + }, + { + "auxiliary_loss_clip": 0.01107168, + "auxiliary_loss_mlp": 0.01005563, + "balance_loss_clip": 1.02512407, + "balance_loss_mlp": 0.99922067, + "epoch": 0.042266646625582445, + "flos": 63795451628160.0, + "grad_norm": 0.8439205282656718, + "language_loss": 0.60756463, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.62869191, + "num_input_tokens_seen": 14954815, + "step": 703, + "time_per_iteration": 3.1991655826568604 + }, + { + "auxiliary_loss_clip": 0.01107933, + "auxiliary_loss_mlp": 0.01006502, + "balance_loss_clip": 1.02562141, + "balance_loss_mlp": 0.99973089, + "epoch": 0.04232676987825041, + "flos": 50018863345920.0, + "grad_norm": 1.0471369072250156, + "language_loss": 0.57677412, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.59791845, + "num_input_tokens_seen": 15003050, + "step": 704, + "time_per_iteration": 3.037705659866333 + }, + { + "auxiliary_loss_clip": 0.01241513, + "auxiliary_loss_mlp": 0.01072126, + "balance_loss_clip": 1.06549489, + "balance_loss_mlp": 1.0406549, + "epoch": 0.04238689313091838, + "flos": 21616105368960.0, + "grad_norm": 2.9488804643242488, + "language_loss": 0.87553984, + "learning_rate": 3.998391038398319e-06, + "loss": 0.89867628, + "num_input_tokens_seen": 15021990, + "step": 705, + "time_per_iteration": 2.6233222484588623 + }, + { + "auxiliary_loss_clip": 0.01230342, + "auxiliary_loss_mlp": 0.0107194, + "balance_loss_clip": 1.0605582, + "balance_loss_mlp": 1.04204249, + "epoch": 0.042447016383586354, + "flos": 19135109070720.0, + "grad_norm": 2.556815837902013, + "language_loss": 0.71071029, + "learning_rate": 3.998375381617201e-06, + "loss": 0.73373306, + "num_input_tokens_seen": 15040700, + "step": 706, + "time_per_iteration": 2.560434579849243 + }, + { + "auxiliary_loss_clip": 0.0123412, + "auxiliary_loss_mlp": 0.01070349, + "balance_loss_clip": 1.06249404, + "balance_loss_mlp": 1.03799582, + "epoch": 0.04250713963625432, + "flos": 24426007528320.0, + "grad_norm": 2.0814078632624167, + "language_loss": 0.93418455, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.95722926, + "num_input_tokens_seen": 15056725, + "step": 707, + "time_per_iteration": 2.6130473613739014 + }, + { + "auxiliary_loss_clip": 0.01237541, + "auxiliary_loss_mlp": 0.01067908, + "balance_loss_clip": 1.05994225, + "balance_loss_mlp": 1.03617477, + "epoch": 0.04256726288892229, + "flos": 30367391333760.0, + "grad_norm": 2.424205580643553, + "language_loss": 0.81514043, + "learning_rate": 3.998343840719776e-06, + "loss": 0.83819497, + "num_input_tokens_seen": 15077550, + "step": 708, + "time_per_iteration": 2.656277894973755 + }, + { + "auxiliary_loss_clip": 0.01243932, + "auxiliary_loss_mlp": 0.0108167, + "balance_loss_clip": 1.06461239, + "balance_loss_mlp": 1.04934049, + "epoch": 0.04262738614159026, + "flos": 16362661818240.0, + "grad_norm": 2.0883592727868145, + "language_loss": 0.82027614, + "learning_rate": 3.998327956604666e-06, + "loss": 0.8435322, + "num_input_tokens_seen": 15094955, + "step": 709, + "time_per_iteration": 2.5758891105651855 + }, + { + "auxiliary_loss_clip": 0.01243538, + "auxiliary_loss_mlp": 0.01071217, + "balance_loss_clip": 1.06374872, + "balance_loss_mlp": 1.03960264, + "epoch": 0.04268750939425823, + "flos": 20412379768320.0, + "grad_norm": 2.7686525844665133, + "language_loss": 0.8502059, + "learning_rate": 3.99831199671276e-06, + "loss": 0.87335348, + "num_input_tokens_seen": 15113395, + "step": 710, + "time_per_iteration": 2.571559429168701 + }, + { + "auxiliary_loss_clip": 0.0124498, + "auxiliary_loss_mlp": 0.01072229, + "balance_loss_clip": 1.06788397, + "balance_loss_mlp": 1.04166365, + "epoch": 0.0427476326469262, + "flos": 20302959962880.0, + "grad_norm": 7.911177124524585, + "language_loss": 0.84914303, + "learning_rate": 3.998295961044662e-06, + "loss": 0.87231517, + "num_input_tokens_seen": 15132920, + "step": 711, + "time_per_iteration": 2.569959878921509 + }, + { + "auxiliary_loss_clip": 0.01237769, + "auxiliary_loss_mlp": 0.01074338, + "balance_loss_clip": 1.06188083, + "balance_loss_mlp": 1.04229426, + "epoch": 0.042807755899594166, + "flos": 21650794928640.0, + "grad_norm": 1.7189790042473796, + "language_loss": 0.85439789, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.87751901, + "num_input_tokens_seen": 15153115, + "step": 712, + "time_per_iteration": 2.6200509071350098 + }, + { + "auxiliary_loss_clip": 0.01242397, + "auxiliary_loss_mlp": 0.01069523, + "balance_loss_clip": 1.06085837, + "balance_loss_mlp": 1.03989983, + "epoch": 0.04286787915226214, + "flos": 21435007973760.0, + "grad_norm": 5.490507523621204, + "language_loss": 0.91178697, + "learning_rate": 3.998263662382328e-06, + "loss": 0.93490618, + "num_input_tokens_seen": 15172770, + "step": 713, + "time_per_iteration": 2.6353416442871094 + }, + { + "auxiliary_loss_clip": 0.01104693, + "auxiliary_loss_mlp": 0.01006514, + "balance_loss_clip": 1.02325606, + "balance_loss_mlp": 0.99955195, + "epoch": 0.04292800240493011, + "flos": 66397970615040.0, + "grad_norm": 0.9310328114407391, + "language_loss": 0.63725489, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.65836698, + "num_input_tokens_seen": 15240055, + "step": 714, + "time_per_iteration": 3.2544445991516113 + }, + { + "auxiliary_loss_clip": 0.01239175, + "auxiliary_loss_mlp": 0.01085992, + "balance_loss_clip": 1.06602359, + "balance_loss_mlp": 1.05552244, + "epoch": 0.042988125657598075, + "flos": 31650264552960.0, + "grad_norm": 1.8449858143817996, + "language_loss": 0.75010103, + "learning_rate": 3.998231060622563e-06, + "loss": 0.77335274, + "num_input_tokens_seen": 15261585, + "step": 715, + "time_per_iteration": 2.7048466205596924 + }, + { + "auxiliary_loss_clip": 0.01242734, + "auxiliary_loss_mlp": 0.01074126, + "balance_loss_clip": 1.0666225, + "balance_loss_mlp": 1.04227352, + "epoch": 0.04304824891026605, + "flos": 33248468292480.0, + "grad_norm": 1.9505519101092619, + "language_loss": 0.72289199, + "learning_rate": 3.998214646082688e-06, + "loss": 0.74606061, + "num_input_tokens_seen": 15281160, + "step": 716, + "time_per_iteration": 2.7807397842407227 + }, + { + "auxiliary_loss_clip": 0.01104303, + "auxiliary_loss_mlp": 0.01006894, + "balance_loss_clip": 1.02277207, + "balance_loss_mlp": 0.99997944, + "epoch": 0.04310837216293401, + "flos": 64064782782720.0, + "grad_norm": 0.9245106661639481, + "language_loss": 0.65587437, + "learning_rate": 3.998198155770314e-06, + "loss": 0.67698634, + "num_input_tokens_seen": 15344505, + "step": 717, + "time_per_iteration": 3.250870943069458 + }, + { + "auxiliary_loss_clip": 0.01103971, + "auxiliary_loss_mlp": 0.01009587, + "balance_loss_clip": 1.02238059, + "balance_loss_mlp": 1.00267255, + "epoch": 0.043168495415601985, + "flos": 61343757849600.0, + "grad_norm": 0.9849394627593366, + "language_loss": 0.58785796, + "learning_rate": 3.998181589686065e-06, + "loss": 0.60899353, + "num_input_tokens_seen": 15404050, + "step": 718, + "time_per_iteration": 3.0402464866638184 + }, + { + "auxiliary_loss_clip": 0.0124025, + "auxiliary_loss_mlp": 0.0107507, + "balance_loss_clip": 1.06784248, + "balance_loss_mlp": 1.0424546, + "epoch": 0.04322861866826996, + "flos": 20704261685760.0, + "grad_norm": 1.9557310597444375, + "language_loss": 0.91440111, + "learning_rate": 3.99816494783057e-06, + "loss": 0.9375543, + "num_input_tokens_seen": 15424190, + "step": 719, + "time_per_iteration": 2.6500089168548584 + }, + { + "auxiliary_loss_clip": 0.01235843, + "auxiliary_loss_mlp": 0.01072906, + "balance_loss_clip": 1.06020999, + "balance_loss_mlp": 1.04296041, + "epoch": 0.04328874192093792, + "flos": 30373352991360.0, + "grad_norm": 1.7057721639328365, + "language_loss": 0.66461253, + "learning_rate": 3.99814823020446e-06, + "loss": 0.68770003, + "num_input_tokens_seen": 15446500, + "step": 720, + "time_per_iteration": 2.673184871673584 + }, + { + "auxiliary_loss_clip": 0.01234245, + "auxiliary_loss_mlp": 0.01072069, + "balance_loss_clip": 1.06111717, + "balance_loss_mlp": 1.04131258, + "epoch": 0.043348865173605894, + "flos": 21944795748480.0, + "grad_norm": 1.9491363249287763, + "language_loss": 0.77460182, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.79766488, + "num_input_tokens_seen": 15465830, + "step": 721, + "time_per_iteration": 2.6695611476898193 + }, + { + "auxiliary_loss_clip": 0.01241854, + "auxiliary_loss_mlp": 0.01087169, + "balance_loss_clip": 1.06622314, + "balance_loss_mlp": 1.05719972, + "epoch": 0.04340898842627386, + "flos": 15264225959040.0, + "grad_norm": 2.8383174670702718, + "language_loss": 0.88298881, + "learning_rate": 3.998114567642933e-06, + "loss": 0.90627909, + "num_input_tokens_seen": 15479985, + "step": 722, + "time_per_iteration": 2.661313533782959 + }, + { + "auxiliary_loss_clip": 0.01244836, + "auxiliary_loss_mlp": 0.01076885, + "balance_loss_clip": 1.06665182, + "balance_loss_mlp": 1.0480125, + "epoch": 0.04346911167894183, + "flos": 27965434913280.0, + "grad_norm": 5.515838365549148, + "language_loss": 0.84387141, + "learning_rate": 3.998097622708792e-06, + "loss": 0.86708868, + "num_input_tokens_seen": 15501545, + "step": 723, + "time_per_iteration": 2.6447954177856445 + }, + { + "auxiliary_loss_clip": 0.01245825, + "auxiliary_loss_mlp": 0.01081354, + "balance_loss_clip": 1.06723523, + "balance_loss_mlp": 1.05019248, + "epoch": 0.0435292349316098, + "flos": 29242202820480.0, + "grad_norm": 1.7852936089408447, + "language_loss": 0.82789439, + "learning_rate": 3.99808060200659e-06, + "loss": 0.85116619, + "num_input_tokens_seen": 15521725, + "step": 724, + "time_per_iteration": 2.676985263824463 + }, + { + "auxiliary_loss_clip": 0.0124127, + "auxiliary_loss_mlp": 0.01087491, + "balance_loss_clip": 1.06535757, + "balance_loss_mlp": 1.05609179, + "epoch": 0.04358935818427777, + "flos": 20558356640640.0, + "grad_norm": 2.011685360503238, + "language_loss": 0.79444051, + "learning_rate": 3.998063505536971e-06, + "loss": 0.81772816, + "num_input_tokens_seen": 15540910, + "step": 725, + "time_per_iteration": 2.6241447925567627 + }, + { + "auxiliary_loss_clip": 0.01251777, + "auxiliary_loss_mlp": 0.01074923, + "balance_loss_clip": 1.06783843, + "balance_loss_mlp": 1.04309392, + "epoch": 0.04364948143694574, + "flos": 14464926564480.0, + "grad_norm": 2.2160842755462817, + "language_loss": 0.87175703, + "learning_rate": 3.998046333300584e-06, + "loss": 0.89502406, + "num_input_tokens_seen": 15558640, + "step": 726, + "time_per_iteration": 2.555551052093506 + }, + { + "auxiliary_loss_clip": 0.01100917, + "auxiliary_loss_mlp": 0.01015411, + "balance_loss_clip": 1.02171838, + "balance_loss_mlp": 1.00947404, + "epoch": 0.043709604689613706, + "flos": 50067268922880.0, + "grad_norm": 0.908981905466007, + "language_loss": 0.55868411, + "learning_rate": 3.998029085298079e-06, + "loss": 0.5798474, + "num_input_tokens_seen": 15612975, + "step": 727, + "time_per_iteration": 3.375901699066162 + }, + { + "auxiliary_loss_clip": 0.01245647, + "auxiliary_loss_mlp": 0.0108809, + "balance_loss_clip": 1.06717396, + "balance_loss_mlp": 1.05614173, + "epoch": 0.04376972794228168, + "flos": 13991588115840.0, + "grad_norm": 2.282663852625415, + "language_loss": 0.82326066, + "learning_rate": 3.998011761530112e-06, + "loss": 0.84659809, + "num_input_tokens_seen": 15631070, + "step": 728, + "time_per_iteration": 2.605970621109009 + }, + { + "auxiliary_loss_clip": 0.01237902, + "auxiliary_loss_mlp": 0.01073495, + "balance_loss_clip": 1.06600416, + "balance_loss_mlp": 1.04321551, + "epoch": 0.04382985119494965, + "flos": 22009901149440.0, + "grad_norm": 2.1303486954703152, + "language_loss": 0.76890069, + "learning_rate": 3.997994361997338e-06, + "loss": 0.7920146, + "num_input_tokens_seen": 15647825, + "step": 729, + "time_per_iteration": 2.652466297149658 + }, + { + "auxiliary_loss_clip": 0.01243746, + "auxiliary_loss_mlp": 0.01079207, + "balance_loss_clip": 1.06438255, + "balance_loss_mlp": 1.04859376, + "epoch": 0.043889974447617615, + "flos": 24206521472640.0, + "grad_norm": 2.1385115795714107, + "language_loss": 0.95153189, + "learning_rate": 3.997976886700417e-06, + "loss": 0.97476137, + "num_input_tokens_seen": 15668260, + "step": 730, + "time_per_iteration": 2.734614133834839 + }, + { + "auxiliary_loss_clip": 0.01238581, + "auxiliary_loss_mlp": 0.01074727, + "balance_loss_clip": 1.06093788, + "balance_loss_mlp": 1.04315984, + "epoch": 0.04395009770028559, + "flos": 17274541415040.0, + "grad_norm": 2.333073864238008, + "language_loss": 0.88456279, + "learning_rate": 3.997959335640013e-06, + "loss": 0.90769589, + "num_input_tokens_seen": 15685630, + "step": 731, + "time_per_iteration": 2.5912294387817383 + }, + { + "auxiliary_loss_clip": 0.01242247, + "auxiliary_loss_mlp": 0.01076563, + "balance_loss_clip": 1.06636512, + "balance_loss_mlp": 1.04757094, + "epoch": 0.04401022095295355, + "flos": 12310286261760.0, + "grad_norm": 3.0398759554531254, + "language_loss": 0.88683128, + "learning_rate": 3.997941708816791e-06, + "loss": 0.9100194, + "num_input_tokens_seen": 15698645, + "step": 732, + "time_per_iteration": 2.5897367000579834 + }, + { + "auxiliary_loss_clip": 0.01242736, + "auxiliary_loss_mlp": 0.01087795, + "balance_loss_clip": 1.06544232, + "balance_loss_mlp": 1.05646718, + "epoch": 0.044070344205621524, + "flos": 20959658363520.0, + "grad_norm": 2.304959545118842, + "language_loss": 0.85829747, + "learning_rate": 3.997924006231419e-06, + "loss": 0.88160276, + "num_input_tokens_seen": 15716775, + "step": 733, + "time_per_iteration": 2.650681972503662 + }, + { + "auxiliary_loss_clip": 0.01246603, + "auxiliary_loss_mlp": 0.01088724, + "balance_loss_clip": 1.06722379, + "balance_loss_mlp": 1.05544066, + "epoch": 0.044130467458289496, + "flos": 13845288021120.0, + "grad_norm": 2.207780377909299, + "language_loss": 0.91189414, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.93524742, + "num_input_tokens_seen": 15733320, + "step": 734, + "time_per_iteration": 2.5956180095672607 + }, + { + "auxiliary_loss_clip": 0.01238395, + "auxiliary_loss_mlp": 0.01067579, + "balance_loss_clip": 1.06596422, + "balance_loss_mlp": 1.03781235, + "epoch": 0.04419059071095746, + "flos": 28655063107200.0, + "grad_norm": 1.9297536072777384, + "language_loss": 0.77884138, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.8019011, + "num_input_tokens_seen": 15752705, + "step": 735, + "time_per_iteration": 2.603809118270874 + }, + { + "auxiliary_loss_clip": 0.01234188, + "auxiliary_loss_mlp": 0.01070499, + "balance_loss_clip": 1.06063068, + "balance_loss_mlp": 1.04091144, + "epoch": 0.04425071396362543, + "flos": 28183304856960.0, + "grad_norm": 2.266122200005257, + "language_loss": 0.8832593, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.90630615, + "num_input_tokens_seen": 15772800, + "step": 736, + "time_per_iteration": 5.841086149215698 + }, + { + "auxiliary_loss_clip": 0.01235947, + "auxiliary_loss_mlp": 0.01081098, + "balance_loss_clip": 1.06597185, + "balance_loss_mlp": 1.05165362, + "epoch": 0.0443108372162934, + "flos": 23658452778240.0, + "grad_norm": 1.8984177574034653, + "language_loss": 0.84481263, + "learning_rate": 3.997852438281901e-06, + "loss": 0.8679831, + "num_input_tokens_seen": 15793665, + "step": 737, + "time_per_iteration": 4.1386003494262695 + }, + { + "auxiliary_loss_clip": 0.01240863, + "auxiliary_loss_mlp": 0.01072388, + "balance_loss_clip": 1.0653491, + "balance_loss_mlp": 1.03961766, + "epoch": 0.04437096046896137, + "flos": 33979861025280.0, + "grad_norm": 2.2366199062134706, + "language_loss": 0.84712577, + "learning_rate": 3.997834356895906e-06, + "loss": 0.87025833, + "num_input_tokens_seen": 15813175, + "step": 738, + "time_per_iteration": 4.447159290313721 + }, + { + "auxiliary_loss_clip": 0.01098733, + "auxiliary_loss_mlp": 0.0102196, + "balance_loss_clip": 1.02144337, + "balance_loss_mlp": 1.01685739, + "epoch": 0.04443108372162934, + "flos": 67397506375680.0, + "grad_norm": 0.8779518557387592, + "language_loss": 0.59179878, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.61300576, + "num_input_tokens_seen": 15872050, + "step": 739, + "time_per_iteration": 3.0780396461486816 + }, + { + "auxiliary_loss_clip": 0.012386, + "auxiliary_loss_mlp": 0.01067387, + "balance_loss_clip": 1.06604302, + "balance_loss_mlp": 1.03717899, + "epoch": 0.04449120697429731, + "flos": 29752672953600.0, + "grad_norm": 2.295102845773205, + "language_loss": 0.91329807, + "learning_rate": 3.997797966850369e-06, + "loss": 0.93635798, + "num_input_tokens_seen": 15891085, + "step": 740, + "time_per_iteration": 2.6687562465667725 + }, + { + "auxiliary_loss_clip": 0.01243424, + "auxiliary_loss_mlp": 0.01067832, + "balance_loss_clip": 1.06807768, + "balance_loss_mlp": 1.03929377, + "epoch": 0.04455133022696528, + "flos": 36502119072000.0, + "grad_norm": 2.0543845689042484, + "language_loss": 0.71875739, + "learning_rate": 3.997779658192205e-06, + "loss": 0.74186987, + "num_input_tokens_seen": 15914225, + "step": 741, + "time_per_iteration": 2.707231283187866 + }, + { + "auxiliary_loss_clip": 0.01233192, + "auxiliary_loss_mlp": 0.01084138, + "balance_loss_clip": 1.062482, + "balance_loss_mlp": 1.05476475, + "epoch": 0.044611453479633245, + "flos": 28803661672320.0, + "grad_norm": 1.7086571433899975, + "language_loss": 0.88933527, + "learning_rate": 3.997761273778037e-06, + "loss": 0.91250861, + "num_input_tokens_seen": 15934540, + "step": 742, + "time_per_iteration": 2.6647751331329346 + }, + { + "auxiliary_loss_clip": 0.01237248, + "auxiliary_loss_mlp": 0.0106534, + "balance_loss_clip": 1.06481838, + "balance_loss_mlp": 1.03367805, + "epoch": 0.04467157673230122, + "flos": 20010970304640.0, + "grad_norm": 1.9055071619943689, + "language_loss": 0.83840811, + "learning_rate": 3.997742813608561e-06, + "loss": 0.86143398, + "num_input_tokens_seen": 15952560, + "step": 743, + "time_per_iteration": 2.697864055633545 + }, + { + "auxiliary_loss_clip": 0.01239398, + "auxiliary_loss_mlp": 0.01073846, + "balance_loss_clip": 1.06395566, + "balance_loss_mlp": 1.04373407, + "epoch": 0.04473169998496919, + "flos": 18004964480640.0, + "grad_norm": 2.2041873634107696, + "language_loss": 0.80026019, + "learning_rate": 3.997724277684479e-06, + "loss": 0.82339263, + "num_input_tokens_seen": 15970620, + "step": 744, + "time_per_iteration": 2.6551101207733154 + }, + { + "auxiliary_loss_clip": 0.01236158, + "auxiliary_loss_mlp": 0.01076186, + "balance_loss_clip": 1.06385589, + "balance_loss_mlp": 1.04665816, + "epoch": 0.044791823237637154, + "flos": 20631722169600.0, + "grad_norm": 2.139129927663487, + "language_loss": 0.85502481, + "learning_rate": 3.99770566600649e-06, + "loss": 0.87814826, + "num_input_tokens_seen": 15987325, + "step": 745, + "time_per_iteration": 2.6686010360717773 + }, + { + "auxiliary_loss_clip": 0.01235001, + "auxiliary_loss_mlp": 0.01066107, + "balance_loss_clip": 1.06320596, + "balance_loss_mlp": 1.03594685, + "epoch": 0.04485194649030513, + "flos": 31176171918720.0, + "grad_norm": 1.8251828520192552, + "language_loss": 0.69291008, + "learning_rate": 3.997686978575302e-06, + "loss": 0.71592116, + "num_input_tokens_seen": 16008310, + "step": 746, + "time_per_iteration": 2.6782095432281494 + }, + { + "auxiliary_loss_clip": 0.01244022, + "auxiliary_loss_mlp": 0.01081644, + "balance_loss_clip": 1.07012939, + "balance_loss_mlp": 1.05000615, + "epoch": 0.04491206974297309, + "flos": 26143291831680.0, + "grad_norm": 3.6053643469900982, + "language_loss": 0.68531066, + "learning_rate": 3.997668215391625e-06, + "loss": 0.70856726, + "num_input_tokens_seen": 16029620, + "step": 747, + "time_per_iteration": 2.6589114665985107 + }, + { + "auxiliary_loss_clip": 0.0124018, + "auxiliary_loss_mlp": 0.01083594, + "balance_loss_clip": 1.0652504, + "balance_loss_mlp": 1.05183625, + "epoch": 0.044972192995641064, + "flos": 20667668705280.0, + "grad_norm": 1.8376208182131786, + "language_loss": 0.66778374, + "learning_rate": 3.997649376456168e-06, + "loss": 0.69102144, + "num_input_tokens_seen": 16049065, + "step": 748, + "time_per_iteration": 2.674691677093506 + }, + { + "auxiliary_loss_clip": 0.01243343, + "auxiliary_loss_mlp": 0.01085665, + "balance_loss_clip": 1.07101417, + "balance_loss_mlp": 1.05596995, + "epoch": 0.045032316248309036, + "flos": 16106834177280.0, + "grad_norm": 2.4197486882062322, + "language_loss": 0.76684916, + "learning_rate": 3.997630461769647e-06, + "loss": 0.7901392, + "num_input_tokens_seen": 16066765, + "step": 749, + "time_per_iteration": 2.5940611362457275 + }, + { + "auxiliary_loss_clip": 0.01243381, + "auxiliary_loss_mlp": 0.01083303, + "balance_loss_clip": 1.06892776, + "balance_loss_mlp": 1.05338168, + "epoch": 0.045092439500977, + "flos": 17858843953920.0, + "grad_norm": 1.926675828378473, + "language_loss": 0.88739896, + "learning_rate": 3.997611471332778e-06, + "loss": 0.91066581, + "num_input_tokens_seen": 16085980, + "step": 750, + "time_per_iteration": 2.551717758178711 + }, + { + "auxiliary_loss_clip": 0.01238484, + "auxiliary_loss_mlp": 0.01077419, + "balance_loss_clip": 1.062783, + "balance_loss_mlp": 1.04404092, + "epoch": 0.04515256275364497, + "flos": 24462815990400.0, + "grad_norm": 3.4910287963746116, + "language_loss": 0.74371743, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.76687646, + "num_input_tokens_seen": 16106260, + "step": 751, + "time_per_iteration": 2.6299028396606445 + }, + { + "auxiliary_loss_clip": 0.0123577, + "auxiliary_loss_mlp": 0.01078322, + "balance_loss_clip": 1.06347609, + "balance_loss_mlp": 1.04884171, + "epoch": 0.04521268600631294, + "flos": 20916385453440.0, + "grad_norm": 3.3938056459605583, + "language_loss": 0.69115144, + "learning_rate": 3.997573263210883e-06, + "loss": 0.71429229, + "num_input_tokens_seen": 16123475, + "step": 752, + "time_per_iteration": 2.571223020553589 + }, + { + "auxiliary_loss_clip": 0.01235899, + "auxiliary_loss_mlp": 0.01060876, + "balance_loss_clip": 1.0627141, + "balance_loss_mlp": 1.03212225, + "epoch": 0.04527280925898091, + "flos": 13371374954880.0, + "grad_norm": 2.69328062598792, + "language_loss": 0.92126763, + "learning_rate": 3.997554045527305e-06, + "loss": 0.94423538, + "num_input_tokens_seen": 16138335, + "step": 753, + "time_per_iteration": 2.6100237369537354 + }, + { + "auxiliary_loss_clip": 0.01239023, + "auxiliary_loss_mlp": 0.01080271, + "balance_loss_clip": 1.06628633, + "balance_loss_mlp": 1.05116034, + "epoch": 0.04533293251164888, + "flos": 23254565276160.0, + "grad_norm": 4.138305317267875, + "language_loss": 0.91373456, + "learning_rate": 3.997534752096277e-06, + "loss": 0.93692756, + "num_input_tokens_seen": 16157110, + "step": 754, + "time_per_iteration": 2.642747402191162 + }, + { + "auxiliary_loss_clip": 0.01229195, + "auxiliary_loss_mlp": 0.01078016, + "balance_loss_clip": 1.06402516, + "balance_loss_mlp": 1.04725957, + "epoch": 0.04539305576431685, + "flos": 12422004537600.0, + "grad_norm": 4.559941934311277, + "language_loss": 0.78558046, + "learning_rate": 3.997515382918531e-06, + "loss": 0.80865264, + "num_input_tokens_seen": 16174155, + "step": 755, + "time_per_iteration": 2.6316659450531006 + }, + { + "auxiliary_loss_clip": 0.01240044, + "auxiliary_loss_mlp": 0.01081048, + "balance_loss_clip": 1.06624937, + "balance_loss_mlp": 1.05099559, + "epoch": 0.04545317901698482, + "flos": 16070995382400.0, + "grad_norm": 2.193539224658874, + "language_loss": 0.78473848, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.80794942, + "num_input_tokens_seen": 16192240, + "step": 756, + "time_per_iteration": 2.6390748023986816 + }, + { + "auxiliary_loss_clip": 0.01101224, + "auxiliary_loss_mlp": 0.01013849, + "balance_loss_clip": 1.02455997, + "balance_loss_mlp": 1.0089612, + "epoch": 0.045513302269652785, + "flos": 66396139021440.0, + "grad_norm": 0.8202876780471967, + "language_loss": 0.62756521, + "learning_rate": 3.997476417325827e-06, + "loss": 0.64871597, + "num_input_tokens_seen": 16255775, + "step": 757, + "time_per_iteration": 3.2393198013305664 + }, + { + "auxiliary_loss_clip": 0.01235136, + "auxiliary_loss_mlp": 0.01071767, + "balance_loss_clip": 1.06455243, + "balance_loss_mlp": 1.04346693, + "epoch": 0.04557342552232076, + "flos": 21471169991040.0, + "grad_norm": 1.6528285304744148, + "language_loss": 0.84211069, + "learning_rate": 3.997456820912346e-06, + "loss": 0.86517978, + "num_input_tokens_seen": 16277015, + "step": 758, + "time_per_iteration": 2.6508655548095703 + }, + { + "auxiliary_loss_clip": 0.01228461, + "auxiliary_loss_mlp": 0.01067033, + "balance_loss_clip": 1.05912399, + "balance_loss_mlp": 1.0391618, + "epoch": 0.04563354877498873, + "flos": 23732680233600.0, + "grad_norm": 2.695805662282291, + "language_loss": 0.88150775, + "learning_rate": 3.997437148755101e-06, + "loss": 0.9044627, + "num_input_tokens_seen": 16296005, + "step": 759, + "time_per_iteration": 2.7782890796661377 + }, + { + "auxiliary_loss_clip": 0.01240589, + "auxiliary_loss_mlp": 0.01078815, + "balance_loss_clip": 1.06747675, + "balance_loss_mlp": 1.04846466, + "epoch": 0.045693672027656694, + "flos": 25735741142400.0, + "grad_norm": 2.392455009776849, + "language_loss": 0.73440695, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.75760102, + "num_input_tokens_seen": 16315300, + "step": 760, + "time_per_iteration": 2.7138822078704834 + }, + { + "auxiliary_loss_clip": 0.01240372, + "auxiliary_loss_mlp": 0.01079791, + "balance_loss_clip": 1.07095265, + "balance_loss_mlp": 1.05162191, + "epoch": 0.045753795280324666, + "flos": 19719016560000.0, + "grad_norm": 3.497321311688565, + "language_loss": 0.81781888, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.84102058, + "num_input_tokens_seen": 16333820, + "step": 761, + "time_per_iteration": 2.631303310394287 + }, + { + "auxiliary_loss_clip": 0.01231969, + "auxiliary_loss_mlp": 0.01078623, + "balance_loss_clip": 1.06324267, + "balance_loss_mlp": 1.04922605, + "epoch": 0.04581391853299264, + "flos": 23255786338560.0, + "grad_norm": 2.0632320043111965, + "language_loss": 0.79811668, + "learning_rate": 3.997377677828266e-06, + "loss": 0.82122266, + "num_input_tokens_seen": 16355290, + "step": 762, + "time_per_iteration": 2.646928071975708 + }, + { + "auxiliary_loss_clip": 0.01093869, + "auxiliary_loss_mlp": 0.01027943, + "balance_loss_clip": 1.01857328, + "balance_loss_mlp": 1.02288842, + "epoch": 0.0458740417856606, + "flos": 64231155601920.0, + "grad_norm": 1.0128965743658471, + "language_loss": 0.58723813, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.60845619, + "num_input_tokens_seen": 16415995, + "step": 763, + "time_per_iteration": 3.1712563037872314 + }, + { + "auxiliary_loss_clip": 0.012343, + "auxiliary_loss_mlp": 0.01082461, + "balance_loss_clip": 1.06205368, + "balance_loss_mlp": 1.0531354, + "epoch": 0.045934165038328575, + "flos": 20770121272320.0, + "grad_norm": 4.978761831483118, + "language_loss": 0.87544954, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.89861715, + "num_input_tokens_seen": 16433120, + "step": 764, + "time_per_iteration": 2.5985426902770996 + }, + { + "auxiliary_loss_clip": 0.01236145, + "auxiliary_loss_mlp": 0.01087868, + "balance_loss_clip": 1.06553543, + "balance_loss_mlp": 1.05854285, + "epoch": 0.04599428829099654, + "flos": 30262891691520.0, + "grad_norm": 2.0894169515773067, + "language_loss": 0.85966802, + "learning_rate": 3.997317525234592e-06, + "loss": 0.88290817, + "num_input_tokens_seen": 16453360, + "step": 765, + "time_per_iteration": 2.6572606563568115 + }, + { + "auxiliary_loss_clip": 0.01239644, + "auxiliary_loss_mlp": 0.01077398, + "balance_loss_clip": 1.06530261, + "balance_loss_mlp": 1.04573584, + "epoch": 0.04605441154366451, + "flos": 23038921975680.0, + "grad_norm": 2.628046285830335, + "language_loss": 0.88265938, + "learning_rate": 3.997297322892056e-06, + "loss": 0.90582979, + "num_input_tokens_seen": 16471160, + "step": 766, + "time_per_iteration": 2.673226833343506 + }, + { + "auxiliary_loss_clip": 0.01235506, + "auxiliary_loss_mlp": 0.0107998, + "balance_loss_clip": 1.06371713, + "balance_loss_mlp": 1.05115545, + "epoch": 0.046114534796332485, + "flos": 22017407091840.0, + "grad_norm": 2.343908591401411, + "language_loss": 0.84302223, + "learning_rate": 3.997277044811806e-06, + "loss": 0.86617708, + "num_input_tokens_seen": 16488940, + "step": 767, + "time_per_iteration": 2.683429002761841 + }, + { + "auxiliary_loss_clip": 0.01236229, + "auxiliary_loss_mlp": 0.01067844, + "balance_loss_clip": 1.06769753, + "balance_loss_mlp": 1.03791094, + "epoch": 0.04617465804900045, + "flos": 29862380067840.0, + "grad_norm": 1.9268984031305718, + "language_loss": 0.8669976, + "learning_rate": 3.99725669099461e-06, + "loss": 0.89003831, + "num_input_tokens_seen": 16509505, + "step": 768, + "time_per_iteration": 2.8125200271606445 + }, + { + "auxiliary_loss_clip": 0.01234175, + "auxiliary_loss_mlp": 0.01076069, + "balance_loss_clip": 1.06150854, + "balance_loss_mlp": 1.04738712, + "epoch": 0.04623478130166842, + "flos": 25630056351360.0, + "grad_norm": 2.115272554881108, + "language_loss": 0.75152099, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.77462339, + "num_input_tokens_seen": 16528840, + "step": 769, + "time_per_iteration": 2.7286128997802734 + }, + { + "auxiliary_loss_clip": 0.01229956, + "auxiliary_loss_mlp": 0.01072391, + "balance_loss_clip": 1.06326365, + "balance_loss_mlp": 1.04462695, + "epoch": 0.04629490455433639, + "flos": 20449080489600.0, + "grad_norm": 1.8368669953292174, + "language_loss": 0.86292851, + "learning_rate": 3.997215756152471e-06, + "loss": 0.885952, + "num_input_tokens_seen": 16548335, + "step": 770, + "time_per_iteration": 2.68608021736145 + }, + { + "auxiliary_loss_clip": 0.01239009, + "auxiliary_loss_mlp": 0.01072125, + "balance_loss_clip": 1.06274092, + "balance_loss_mlp": 1.04284704, + "epoch": 0.04635502780700436, + "flos": 23148736830720.0, + "grad_norm": 2.058802627607224, + "language_loss": 0.86842889, + "learning_rate": 3.99719517512908e-06, + "loss": 0.89154023, + "num_input_tokens_seen": 16567725, + "step": 771, + "time_per_iteration": 2.637509822845459 + }, + { + "auxiliary_loss_clip": 0.01239449, + "auxiliary_loss_mlp": 0.01079651, + "balance_loss_clip": 1.06184912, + "balance_loss_mlp": 1.04884768, + "epoch": 0.04641515105967233, + "flos": 23292020183040.0, + "grad_norm": 1.87920888608735, + "language_loss": 0.83691382, + "learning_rate": 3.997174518371848e-06, + "loss": 0.8601048, + "num_input_tokens_seen": 16588175, + "step": 772, + "time_per_iteration": 2.745006561279297 + }, + { + "auxiliary_loss_clip": 0.01236322, + "auxiliary_loss_mlp": 0.0107061, + "balance_loss_clip": 1.06672883, + "balance_loss_mlp": 1.04220271, + "epoch": 0.046475274312340296, + "flos": 25115204759040.0, + "grad_norm": 1.9655107083336736, + "language_loss": 0.73639083, + "learning_rate": 3.997153785881557e-06, + "loss": 0.75946015, + "num_input_tokens_seen": 16607735, + "step": 773, + "time_per_iteration": 2.869290828704834 + }, + { + "auxiliary_loss_clip": 0.01231219, + "auxiliary_loss_mlp": 0.01071681, + "balance_loss_clip": 1.06529772, + "balance_loss_mlp": 1.04054356, + "epoch": 0.04653539756500827, + "flos": 25264916645760.0, + "grad_norm": 2.096431798380756, + "language_loss": 0.78228974, + "learning_rate": 3.997132977658996e-06, + "loss": 0.80531871, + "num_input_tokens_seen": 16627225, + "step": 774, + "time_per_iteration": 2.6967568397521973 + }, + { + "auxiliary_loss_clip": 0.01230587, + "auxiliary_loss_mlp": 0.01069519, + "balance_loss_clip": 1.06347871, + "balance_loss_mlp": 1.04131365, + "epoch": 0.046595520817676234, + "flos": 35404150089600.0, + "grad_norm": 2.018140205527256, + "language_loss": 0.73187691, + "learning_rate": 3.997112093704952e-06, + "loss": 0.75487792, + "num_input_tokens_seen": 16647785, + "step": 775, + "time_per_iteration": 2.737140417098999 + }, + { + "auxiliary_loss_clip": 0.01231996, + "auxiliary_loss_mlp": 0.01066454, + "balance_loss_clip": 1.06187618, + "balance_loss_mlp": 1.03650832, + "epoch": 0.046655644070344206, + "flos": 18112516778880.0, + "grad_norm": 1.668093168561758, + "language_loss": 0.77180624, + "learning_rate": 3.997091134020217e-06, + "loss": 0.7947908, + "num_input_tokens_seen": 16667555, + "step": 776, + "time_per_iteration": 4.154085159301758 + }, + { + "auxiliary_loss_clip": 0.0122577, + "auxiliary_loss_mlp": 0.01071334, + "balance_loss_clip": 1.06031108, + "balance_loss_mlp": 1.04352236, + "epoch": 0.04671576732301218, + "flos": 29205286617600.0, + "grad_norm": 1.9054628166827923, + "language_loss": 0.7087816, + "learning_rate": 3.997070098605585e-06, + "loss": 0.73175263, + "num_input_tokens_seen": 16686875, + "step": 777, + "time_per_iteration": 4.176887512207031 + }, + { + "auxiliary_loss_clip": 0.0122979, + "auxiliary_loss_mlp": 0.01076806, + "balance_loss_clip": 1.06275606, + "balance_loss_mlp": 1.04705119, + "epoch": 0.04677589057568014, + "flos": 30478319510400.0, + "grad_norm": 1.8083238359854679, + "language_loss": 0.77069759, + "learning_rate": 3.997048987461856e-06, + "loss": 0.79376352, + "num_input_tokens_seen": 16706420, + "step": 778, + "time_per_iteration": 5.943394422531128 + }, + { + "auxiliary_loss_clip": 0.01227067, + "auxiliary_loss_mlp": 0.01064982, + "balance_loss_clip": 1.06043744, + "balance_loss_mlp": 1.03563297, + "epoch": 0.046836013828348115, + "flos": 20557674282240.0, + "grad_norm": 2.1737778598926463, + "language_loss": 0.79181123, + "learning_rate": 3.997027800589829e-06, + "loss": 0.81473172, + "num_input_tokens_seen": 16726390, + "step": 779, + "time_per_iteration": 2.611804485321045 + }, + { + "auxiliary_loss_clip": 0.01219629, + "auxiliary_loss_mlp": 0.01070238, + "balance_loss_clip": 1.05842376, + "balance_loss_mlp": 1.04271269, + "epoch": 0.04689613708101608, + "flos": 25447378757760.0, + "grad_norm": 1.888854926622149, + "language_loss": 0.77364886, + "learning_rate": 3.997006537990308e-06, + "loss": 0.79654753, + "num_input_tokens_seen": 16748965, + "step": 780, + "time_per_iteration": 2.668239116668701 + }, + { + "auxiliary_loss_clip": 0.012253, + "auxiliary_loss_mlp": 0.01073321, + "balance_loss_clip": 1.06098521, + "balance_loss_mlp": 1.04605746, + "epoch": 0.04695626033368405, + "flos": 23001395241600.0, + "grad_norm": 1.7616538282563206, + "language_loss": 0.76700419, + "learning_rate": 3.996985199664099e-06, + "loss": 0.78999043, + "num_input_tokens_seen": 16768620, + "step": 781, + "time_per_iteration": 2.5979926586151123 + }, + { + "auxiliary_loss_clip": 0.01236637, + "auxiliary_loss_mlp": 0.01077479, + "balance_loss_clip": 1.0639379, + "balance_loss_mlp": 1.04836786, + "epoch": 0.047016383586352024, + "flos": 29133357632640.0, + "grad_norm": 3.0946494667490856, + "language_loss": 0.73786414, + "learning_rate": 3.99696378561201e-06, + "loss": 0.76100528, + "num_input_tokens_seen": 16789755, + "step": 782, + "time_per_iteration": 2.708855390548706 + }, + { + "auxiliary_loss_clip": 0.0122968, + "auxiliary_loss_mlp": 0.01069368, + "balance_loss_clip": 1.06431556, + "balance_loss_mlp": 1.04253423, + "epoch": 0.04707650683901999, + "flos": 14976330451200.0, + "grad_norm": 2.1459158015790183, + "language_loss": 0.80524659, + "learning_rate": 3.996942295834855e-06, + "loss": 0.82823706, + "num_input_tokens_seen": 16807585, + "step": 783, + "time_per_iteration": 2.6355738639831543 + }, + { + "auxiliary_loss_clip": 0.01222415, + "auxiliary_loss_mlp": 0.01063155, + "balance_loss_clip": 1.06221437, + "balance_loss_mlp": 1.03663135, + "epoch": 0.04713663009168796, + "flos": 21651118151040.0, + "grad_norm": 1.9084512066318515, + "language_loss": 0.81687874, + "learning_rate": 3.996920730333448e-06, + "loss": 0.83973444, + "num_input_tokens_seen": 16827220, + "step": 784, + "time_per_iteration": 2.64365291595459 + }, + { + "auxiliary_loss_clip": 0.01226632, + "auxiliary_loss_mlp": 0.01074549, + "balance_loss_clip": 1.0582943, + "balance_loss_mlp": 1.04719007, + "epoch": 0.04719675334435593, + "flos": 21325408600320.0, + "grad_norm": 3.970707764370453, + "language_loss": 0.80619848, + "learning_rate": 3.996899089108607e-06, + "loss": 0.82921028, + "num_input_tokens_seen": 16846230, + "step": 785, + "time_per_iteration": 2.682971715927124 + }, + { + "auxiliary_loss_clip": 0.01231621, + "auxiliary_loss_mlp": 0.01063774, + "balance_loss_clip": 1.06683421, + "balance_loss_mlp": 1.03784585, + "epoch": 0.0472568765970239, + "flos": 17931383470080.0, + "grad_norm": 2.074448818096939, + "language_loss": 0.89784658, + "learning_rate": 3.996877372161152e-06, + "loss": 0.92080051, + "num_input_tokens_seen": 16865325, + "step": 786, + "time_per_iteration": 2.6072235107421875 + }, + { + "auxiliary_loss_clip": 0.01227201, + "auxiliary_loss_mlp": 0.01069453, + "balance_loss_clip": 1.05475712, + "balance_loss_mlp": 1.03912568, + "epoch": 0.04731699984969187, + "flos": 18077324428800.0, + "grad_norm": 6.783818284100465, + "language_loss": 0.76794451, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.79091108, + "num_input_tokens_seen": 16882930, + "step": 787, + "time_per_iteration": 2.595069646835327 + }, + { + "auxiliary_loss_clip": 0.01233526, + "auxiliary_loss_mlp": 0.01070856, + "balance_loss_clip": 1.06563127, + "balance_loss_mlp": 1.04248405, + "epoch": 0.047377123102359836, + "flos": 23185078416000.0, + "grad_norm": 2.309745026689568, + "language_loss": 0.81301165, + "learning_rate": 3.996833711101698e-06, + "loss": 0.83605546, + "num_input_tokens_seen": 16900710, + "step": 788, + "time_per_iteration": 2.633812427520752 + }, + { + "auxiliary_loss_clip": 0.01225447, + "auxiliary_loss_mlp": 0.01078934, + "balance_loss_clip": 1.06370282, + "balance_loss_mlp": 1.04934621, + "epoch": 0.04743724635502781, + "flos": 22747794243840.0, + "grad_norm": 2.941245147417381, + "language_loss": 0.84428835, + "learning_rate": 3.996811766991355e-06, + "loss": 0.86733222, + "num_input_tokens_seen": 16919210, + "step": 789, + "time_per_iteration": 2.6711082458496094 + }, + { + "auxiliary_loss_clip": 0.01230866, + "auxiliary_loss_mlp": 0.01071483, + "balance_loss_clip": 1.06367648, + "balance_loss_mlp": 1.0441606, + "epoch": 0.04749736960769577, + "flos": 17238702620160.0, + "grad_norm": 2.0289407228390615, + "language_loss": 0.81787878, + "learning_rate": 3.996789747161709e-06, + "loss": 0.84090227, + "num_input_tokens_seen": 16937125, + "step": 790, + "time_per_iteration": 2.6136717796325684 + }, + { + "auxiliary_loss_clip": 0.01224033, + "auxiliary_loss_mlp": 0.01064065, + "balance_loss_clip": 1.05880189, + "balance_loss_mlp": 1.03546715, + "epoch": 0.047557492860363745, + "flos": 40479261592320.0, + "grad_norm": 2.9735437778568965, + "language_loss": 0.88116109, + "learning_rate": 3.996767651613597e-06, + "loss": 0.90404207, + "num_input_tokens_seen": 16958610, + "step": 791, + "time_per_iteration": 2.747586727142334 + }, + { + "auxiliary_loss_clip": 0.01226267, + "auxiliary_loss_mlp": 0.01066471, + "balance_loss_clip": 1.06144643, + "balance_loss_mlp": 1.03743124, + "epoch": 0.04761761611303172, + "flos": 18698004466560.0, + "grad_norm": 2.1239226540804537, + "language_loss": 0.90671498, + "learning_rate": 3.996745480347854e-06, + "loss": 0.92964232, + "num_input_tokens_seen": 16977300, + "step": 792, + "time_per_iteration": 2.591477870941162 + }, + { + "auxiliary_loss_clip": 0.01226882, + "auxiliary_loss_mlp": 0.0107926, + "balance_loss_clip": 1.05968022, + "balance_loss_mlp": 1.05225897, + "epoch": 0.04767773936569968, + "flos": 20921987975040.0, + "grad_norm": 1.9120988315570397, + "language_loss": 0.73246223, + "learning_rate": 3.996723233365324e-06, + "loss": 0.75552362, + "num_input_tokens_seen": 16994950, + "step": 793, + "time_per_iteration": 2.6319899559020996 + }, + { + "auxiliary_loss_clip": 0.01231301, + "auxiliary_loss_mlp": 0.01070716, + "balance_loss_clip": 1.06213653, + "balance_loss_mlp": 1.04146254, + "epoch": 0.047737862618367655, + "flos": 23732680233600.0, + "grad_norm": 1.86347948201136, + "language_loss": 0.86139679, + "learning_rate": 3.996700910666847e-06, + "loss": 0.88441694, + "num_input_tokens_seen": 17014760, + "step": 794, + "time_per_iteration": 2.6835687160491943 + }, + { + "auxiliary_loss_clip": 0.01228204, + "auxiliary_loss_mlp": 0.01077895, + "balance_loss_clip": 1.05969596, + "balance_loss_mlp": 1.04935622, + "epoch": 0.04779798587103562, + "flos": 23695764030720.0, + "grad_norm": 2.370166301863074, + "language_loss": 0.69069195, + "learning_rate": 3.996678512253272e-06, + "loss": 0.71375293, + "num_input_tokens_seen": 17032715, + "step": 795, + "time_per_iteration": 2.669261932373047 + }, + { + "auxiliary_loss_clip": 0.01225748, + "auxiliary_loss_mlp": 0.01076275, + "balance_loss_clip": 1.06129098, + "balance_loss_mlp": 1.04756904, + "epoch": 0.04785810912370359, + "flos": 23183641872000.0, + "grad_norm": 1.744925212230271, + "language_loss": 0.810256, + "learning_rate": 3.996656038125449e-06, + "loss": 0.83327615, + "num_input_tokens_seen": 17052215, + "step": 796, + "time_per_iteration": 2.5800065994262695 + }, + { + "auxiliary_loss_clip": 0.01228235, + "auxiliary_loss_mlp": 0.01065433, + "balance_loss_clip": 1.06224668, + "balance_loss_mlp": 1.03638172, + "epoch": 0.047918232376371564, + "flos": 18040623707520.0, + "grad_norm": 1.979164246440182, + "language_loss": 0.8128069, + "learning_rate": 3.996633488284228e-06, + "loss": 0.83574355, + "num_input_tokens_seen": 17069225, + "step": 797, + "time_per_iteration": 2.58878493309021 + }, + { + "auxiliary_loss_clip": 0.01100259, + "auxiliary_loss_mlp": 0.01007215, + "balance_loss_clip": 1.02779806, + "balance_loss_mlp": 1.00266171, + "epoch": 0.04797835562903953, + "flos": 62442588758400.0, + "grad_norm": 0.912416075283383, + "language_loss": 0.64532876, + "learning_rate": 3.996610862730465e-06, + "loss": 0.66640353, + "num_input_tokens_seen": 17126680, + "step": 798, + "time_per_iteration": 3.0779380798339844 + }, + { + "auxiliary_loss_clip": 0.01229665, + "auxiliary_loss_mlp": 0.01068747, + "balance_loss_clip": 1.05799031, + "balance_loss_mlp": 1.04121017, + "epoch": 0.0480384788817075, + "flos": 21507296094720.0, + "grad_norm": 2.0206600610723333, + "language_loss": 0.91274291, + "learning_rate": 3.996588161465018e-06, + "loss": 0.935727, + "num_input_tokens_seen": 17144835, + "step": 799, + "time_per_iteration": 2.660438299179077 + }, + { + "auxiliary_loss_clip": 0.01230751, + "auxiliary_loss_mlp": 0.010715, + "balance_loss_clip": 1.06640434, + "balance_loss_mlp": 1.04274678, + "epoch": 0.048098602134375466, + "flos": 21726710323200.0, + "grad_norm": 2.0752654205923866, + "language_loss": 0.86825287, + "learning_rate": 3.996565384488748e-06, + "loss": 0.89127541, + "num_input_tokens_seen": 17165030, + "step": 800, + "time_per_iteration": 2.6700456142425537 + }, + { + "auxiliary_loss_clip": 0.01229893, + "auxiliary_loss_mlp": 0.01072058, + "balance_loss_clip": 1.06186771, + "balance_loss_mlp": 1.04618931, + "epoch": 0.04815872538704344, + "flos": 22931082368640.0, + "grad_norm": 2.5310108886746976, + "language_loss": 0.83949852, + "learning_rate": 3.996542531802518e-06, + "loss": 0.86251807, + "num_input_tokens_seen": 17184895, + "step": 801, + "time_per_iteration": 2.7724695205688477 + }, + { + "auxiliary_loss_clip": 0.01227846, + "auxiliary_loss_mlp": 0.010756, + "balance_loss_clip": 1.06226814, + "balance_loss_mlp": 1.04847932, + "epoch": 0.04821884863971141, + "flos": 43174716042240.0, + "grad_norm": 1.9607091513106172, + "language_loss": 0.79818648, + "learning_rate": 3.996519603407196e-06, + "loss": 0.82122099, + "num_input_tokens_seen": 17208225, + "step": 802, + "time_per_iteration": 2.861309766769409 + }, + { + "auxiliary_loss_clip": 0.0122832, + "auxiliary_loss_mlp": 0.01069086, + "balance_loss_clip": 1.06392837, + "balance_loss_mlp": 1.04278886, + "epoch": 0.048278971892379376, + "flos": 18620006083200.0, + "grad_norm": 1.798745906633195, + "language_loss": 0.86600745, + "learning_rate": 3.996496599303649e-06, + "loss": 0.88898146, + "num_input_tokens_seen": 17226305, + "step": 803, + "time_per_iteration": 2.612684965133667 + }, + { + "auxiliary_loss_clip": 0.01222438, + "auxiliary_loss_mlp": 0.01063116, + "balance_loss_clip": 1.06214345, + "balance_loss_mlp": 1.03643703, + "epoch": 0.04833909514504735, + "flos": 20230061310720.0, + "grad_norm": 5.958214069975319, + "language_loss": 0.85139012, + "learning_rate": 3.996473519492753e-06, + "loss": 0.8742457, + "num_input_tokens_seen": 17244545, + "step": 804, + "time_per_iteration": 2.596965789794922 + }, + { + "auxiliary_loss_clip": 0.01225485, + "auxiliary_loss_mlp": 0.0106948, + "balance_loss_clip": 1.06206632, + "balance_loss_mlp": 1.04222918, + "epoch": 0.04839921839771532, + "flos": 24645170361600.0, + "grad_norm": 1.9492340448514227, + "language_loss": 0.85939878, + "learning_rate": 3.99645036397538e-06, + "loss": 0.88234842, + "num_input_tokens_seen": 17265730, + "step": 805, + "time_per_iteration": 2.6773781776428223 + }, + { + "auxiliary_loss_clip": 0.01221339, + "auxiliary_loss_mlp": 0.01071867, + "balance_loss_clip": 1.05968738, + "balance_loss_mlp": 1.04591477, + "epoch": 0.048459341650383285, + "flos": 24827452905600.0, + "grad_norm": 1.8764849579047527, + "language_loss": 0.68025368, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.70318574, + "num_input_tokens_seen": 17284820, + "step": 806, + "time_per_iteration": 2.6270596981048584 + }, + { + "auxiliary_loss_clip": 0.01221043, + "auxiliary_loss_mlp": 0.01060505, + "balance_loss_clip": 1.06064904, + "balance_loss_mlp": 1.03384972, + "epoch": 0.04851946490305126, + "flos": 22163204396160.0, + "grad_norm": 8.586680684018, + "language_loss": 0.76488906, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.78770459, + "num_input_tokens_seen": 17305085, + "step": 807, + "time_per_iteration": 2.6783089637756348 + }, + { + "auxiliary_loss_clip": 0.01218859, + "auxiliary_loss_mlp": 0.01068871, + "balance_loss_clip": 1.05734789, + "balance_loss_mlp": 1.04290676, + "epoch": 0.04857958815571922, + "flos": 19792022952960.0, + "grad_norm": 2.4056749627509157, + "language_loss": 0.86882269, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.89170003, + "num_input_tokens_seen": 17322715, + "step": 808, + "time_per_iteration": 2.6447641849517822 + }, + { + "auxiliary_loss_clip": 0.01227529, + "auxiliary_loss_mlp": 0.01069446, + "balance_loss_clip": 1.06140316, + "balance_loss_mlp": 1.0424329, + "epoch": 0.048639711408387194, + "flos": 18697968552960.0, + "grad_norm": 2.6040733531164424, + "language_loss": 0.89710444, + "learning_rate": 3.996356984858732e-06, + "loss": 0.92007422, + "num_input_tokens_seen": 17341455, + "step": 809, + "time_per_iteration": 2.6679790019989014 + }, + { + "auxiliary_loss_clip": 0.01226608, + "auxiliary_loss_mlp": 0.01067211, + "balance_loss_clip": 1.0643065, + "balance_loss_mlp": 1.04060316, + "epoch": 0.048699834661055166, + "flos": 24863507182080.0, + "grad_norm": 3.0721319202916324, + "language_loss": 0.84918916, + "learning_rate": 3.996333450822208e-06, + "loss": 0.87212729, + "num_input_tokens_seen": 17360765, + "step": 810, + "time_per_iteration": 2.696772575378418 + }, + { + "auxiliary_loss_clip": 0.01227202, + "auxiliary_loss_mlp": 0.01067343, + "balance_loss_clip": 1.0622344, + "balance_loss_mlp": 1.04049683, + "epoch": 0.04875995791372313, + "flos": 20704010290560.0, + "grad_norm": 1.8136675943398954, + "language_loss": 0.80799425, + "learning_rate": 3.99630984108452e-06, + "loss": 0.83093977, + "num_input_tokens_seen": 17380625, + "step": 811, + "time_per_iteration": 2.653808355331421 + }, + { + "auxiliary_loss_clip": 0.01217843, + "auxiliary_loss_mlp": 0.01070621, + "balance_loss_clip": 1.05928314, + "balance_loss_mlp": 1.04466903, + "epoch": 0.048820081166391104, + "flos": 18588297352320.0, + "grad_norm": 1.7193599003225197, + "language_loss": 0.74634516, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.76922977, + "num_input_tokens_seen": 17399355, + "step": 812, + "time_per_iteration": 2.7274649143218994 + }, + { + "auxiliary_loss_clip": 0.01222659, + "auxiliary_loss_mlp": 0.01073562, + "balance_loss_clip": 1.06445217, + "balance_loss_mlp": 1.04862356, + "epoch": 0.04888020441905907, + "flos": 22707322594560.0, + "grad_norm": 1.9311665765462733, + "language_loss": 0.90124279, + "learning_rate": 3.996262394509233e-06, + "loss": 0.92420495, + "num_input_tokens_seen": 17418240, + "step": 813, + "time_per_iteration": 2.654874801635742 + }, + { + "auxiliary_loss_clip": 0.0122, + "auxiliary_loss_mlp": 0.01057827, + "balance_loss_clip": 1.06157589, + "balance_loss_mlp": 1.03248262, + "epoch": 0.04894032767172704, + "flos": 22784351310720.0, + "grad_norm": 1.9238840150723209, + "language_loss": 0.74904704, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.77182531, + "num_input_tokens_seen": 17436250, + "step": 814, + "time_per_iteration": 2.7381603717803955 + }, + { + "auxiliary_loss_clip": 0.01223782, + "auxiliary_loss_mlp": 0.01069686, + "balance_loss_clip": 1.06125045, + "balance_loss_mlp": 1.04289961, + "epoch": 0.04900045092439501, + "flos": 25516147345920.0, + "grad_norm": 2.1966001004582596, + "language_loss": 0.83816808, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.86110282, + "num_input_tokens_seen": 17455750, + "step": 815, + "time_per_iteration": 2.7289621829986572 + }, + { + "auxiliary_loss_clip": 0.01227011, + "auxiliary_loss_mlp": 0.01060571, + "balance_loss_clip": 1.06326818, + "balance_loss_mlp": 1.0344646, + "epoch": 0.04906057417706298, + "flos": 25958136199680.0, + "grad_norm": 2.3329994981275943, + "language_loss": 0.90796101, + "learning_rate": 3.996190656910043e-06, + "loss": 0.93083686, + "num_input_tokens_seen": 17474995, + "step": 816, + "time_per_iteration": 4.174290180206299 + }, + { + "auxiliary_loss_clip": 0.01226278, + "auxiliary_loss_mlp": 0.0105651, + "balance_loss_clip": 1.06172895, + "balance_loss_mlp": 1.03054583, + "epoch": 0.04912069742973095, + "flos": 18624638937600.0, + "grad_norm": 2.2253098946667853, + "language_loss": 0.79834002, + "learning_rate": 3.996166592984268e-06, + "loss": 0.82116789, + "num_input_tokens_seen": 17493395, + "step": 817, + "time_per_iteration": 4.2819907665252686 + }, + { + "auxiliary_loss_clip": 0.01222491, + "auxiliary_loss_mlp": 0.01072358, + "balance_loss_clip": 1.06228495, + "balance_loss_mlp": 1.04563141, + "epoch": 0.049180820682398915, + "flos": 23699786353920.0, + "grad_norm": 1.9292138186207266, + "language_loss": 0.8532303, + "learning_rate": 3.996142453363656e-06, + "loss": 0.8761788, + "num_input_tokens_seen": 17514565, + "step": 818, + "time_per_iteration": 7.687308073043823 + }, + { + "auxiliary_loss_clip": 0.01228571, + "auxiliary_loss_mlp": 0.01064433, + "balance_loss_clip": 1.06170368, + "balance_loss_mlp": 1.0369786, + "epoch": 0.04924094393506689, + "flos": 22420396753920.0, + "grad_norm": 2.1064810754058407, + "language_loss": 0.75623614, + "learning_rate": 3.996118238049124e-06, + "loss": 0.77916616, + "num_input_tokens_seen": 17534590, + "step": 819, + "time_per_iteration": 2.5708072185516357 + }, + { + "auxiliary_loss_clip": 0.01227988, + "auxiliary_loss_mlp": 0.010616, + "balance_loss_clip": 1.06580663, + "balance_loss_mlp": 1.03785336, + "epoch": 0.04930106718773486, + "flos": 15738246766080.0, + "grad_norm": 2.8685299631500487, + "language_loss": 0.85082126, + "learning_rate": 3.996093947041586e-06, + "loss": 0.87371719, + "num_input_tokens_seen": 17551900, + "step": 820, + "time_per_iteration": 2.695204973220825 + }, + { + "auxiliary_loss_clip": 0.01224953, + "auxiliary_loss_mlp": 0.01065985, + "balance_loss_clip": 1.06082845, + "balance_loss_mlp": 1.04037917, + "epoch": 0.049361190440402825, + "flos": 26250628648320.0, + "grad_norm": 1.734636988660555, + "language_loss": 0.90459162, + "learning_rate": 3.996069580341966e-06, + "loss": 0.92750102, + "num_input_tokens_seen": 17571485, + "step": 821, + "time_per_iteration": 2.6284992694854736 + }, + { + "auxiliary_loss_clip": 0.01222526, + "auxiliary_loss_mlp": 0.01080357, + "balance_loss_clip": 1.06015635, + "balance_loss_mlp": 1.05485809, + "epoch": 0.0494213136930708, + "flos": 21252366293760.0, + "grad_norm": 1.7915267676548876, + "language_loss": 0.89795959, + "learning_rate": 3.996045137951188e-06, + "loss": 0.92098844, + "num_input_tokens_seen": 17591410, + "step": 822, + "time_per_iteration": 2.6085855960845947 + }, + { + "auxiliary_loss_clip": 0.0122571, + "auxiliary_loss_mlp": 0.01062887, + "balance_loss_clip": 1.0639379, + "balance_loss_mlp": 1.03472972, + "epoch": 0.04948143694573876, + "flos": 27965506740480.0, + "grad_norm": 2.28747155105076, + "language_loss": 0.67558801, + "learning_rate": 3.996020619870178e-06, + "loss": 0.69847399, + "num_input_tokens_seen": 17612010, + "step": 823, + "time_per_iteration": 2.644277572631836 + }, + { + "auxiliary_loss_clip": 0.01099376, + "auxiliary_loss_mlp": 0.0100741, + "balance_loss_clip": 1.0267303, + "balance_loss_mlp": 1.00266516, + "epoch": 0.049541560198406734, + "flos": 66180995533440.0, + "grad_norm": 1.3456360586087317, + "language_loss": 0.62254131, + "learning_rate": 3.995996026099866e-06, + "loss": 0.64360917, + "num_input_tokens_seen": 17673430, + "step": 824, + "time_per_iteration": 3.230381488800049 + }, + { + "auxiliary_loss_clip": 0.01228758, + "auxiliary_loss_mlp": 0.01066541, + "balance_loss_clip": 1.06346989, + "balance_loss_mlp": 1.03909945, + "epoch": 0.049601683451074706, + "flos": 22892693708160.0, + "grad_norm": 1.8854339538524305, + "language_loss": 0.90479428, + "learning_rate": 3.995971356641185e-06, + "loss": 0.92774737, + "num_input_tokens_seen": 17689545, + "step": 825, + "time_per_iteration": 2.58868670463562 + }, + { + "auxiliary_loss_clip": 0.01227734, + "auxiliary_loss_mlp": 0.01066527, + "balance_loss_clip": 1.06315517, + "balance_loss_mlp": 1.03844118, + "epoch": 0.04966180670374267, + "flos": 21433643256960.0, + "grad_norm": 2.307419213246734, + "language_loss": 0.66851091, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.69145352, + "num_input_tokens_seen": 17705965, + "step": 826, + "time_per_iteration": 2.59468412399292 + }, + { + "auxiliary_loss_clip": 0.01230149, + "auxiliary_loss_mlp": 0.01069061, + "balance_loss_clip": 1.06421614, + "balance_loss_mlp": 1.04216766, + "epoch": 0.04972192995641064, + "flos": 23107367341440.0, + "grad_norm": 1.8316571551414482, + "language_loss": 0.78298402, + "learning_rate": 3.995921790662459e-06, + "loss": 0.80597603, + "num_input_tokens_seen": 17724580, + "step": 827, + "time_per_iteration": 2.7148005962371826 + }, + { + "auxiliary_loss_clip": 0.01230507, + "auxiliary_loss_mlp": 0.01079145, + "balance_loss_clip": 1.06385946, + "balance_loss_mlp": 1.05119085, + "epoch": 0.04978205320907861, + "flos": 40406147458560.0, + "grad_norm": 1.6017511297862308, + "language_loss": 0.78696525, + "learning_rate": 3.995896894144294e-06, + "loss": 0.81006181, + "num_input_tokens_seen": 17747755, + "step": 828, + "time_per_iteration": 2.86991548538208 + }, + { + "auxiliary_loss_clip": 0.0121958, + "auxiliary_loss_mlp": 0.01059689, + "balance_loss_clip": 1.05939984, + "balance_loss_mlp": 1.03390431, + "epoch": 0.04984217646174658, + "flos": 25228539146880.0, + "grad_norm": 2.48577103336206, + "language_loss": 0.83530867, + "learning_rate": 3.995871921941519e-06, + "loss": 0.85810131, + "num_input_tokens_seen": 17768550, + "step": 829, + "time_per_iteration": 2.655895948410034 + }, + { + "auxiliary_loss_clip": 0.01226863, + "auxiliary_loss_mlp": 0.01080723, + "balance_loss_clip": 1.06109536, + "balance_loss_mlp": 1.05068195, + "epoch": 0.04990229971441455, + "flos": 15959636242560.0, + "grad_norm": 2.078538436430036, + "language_loss": 0.74857247, + "learning_rate": 3.99584687405508e-06, + "loss": 0.77164829, + "num_input_tokens_seen": 17786080, + "step": 830, + "time_per_iteration": 2.5820400714874268 + }, + { + "auxiliary_loss_clip": 0.0122584, + "auxiliary_loss_mlp": 0.01074077, + "balance_loss_clip": 1.06154907, + "balance_loss_mlp": 1.04667115, + "epoch": 0.04996242296708252, + "flos": 18405116968320.0, + "grad_norm": 1.8327841960194244, + "language_loss": 0.79279459, + "learning_rate": 3.995821750485929e-06, + "loss": 0.81579381, + "num_input_tokens_seen": 17803635, + "step": 831, + "time_per_iteration": 2.5980231761932373 + }, + { + "auxiliary_loss_clip": 0.01173206, + "auxiliary_loss_mlp": 0.01072743, + "balance_loss_clip": 1.0542444, + "balance_loss_mlp": 1.04725623, + "epoch": 0.05002254621975049, + "flos": 17858053854720.0, + "grad_norm": 3.034319898285603, + "language_loss": 0.91497368, + "learning_rate": 3.995796551235016e-06, + "loss": 0.93743312, + "num_input_tokens_seen": 17822190, + "step": 832, + "time_per_iteration": 2.7498815059661865 + }, + { + "auxiliary_loss_clip": 0.01194428, + "auxiliary_loss_mlp": 0.01081719, + "balance_loss_clip": 1.05826366, + "balance_loss_mlp": 1.05667353, + "epoch": 0.050082669472418455, + "flos": 45660273367680.0, + "grad_norm": 1.887029338258115, + "language_loss": 0.83167893, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.85444039, + "num_input_tokens_seen": 17846915, + "step": 833, + "time_per_iteration": 2.863208770751953 + }, + { + "auxiliary_loss_clip": 0.01199525, + "auxiliary_loss_mlp": 0.01061962, + "balance_loss_clip": 1.05888343, + "balance_loss_mlp": 1.03468657, + "epoch": 0.05014279272508643, + "flos": 37962067363200.0, + "grad_norm": 2.8753922020214033, + "language_loss": 0.82409853, + "learning_rate": 3.995745925691733e-06, + "loss": 0.84671336, + "num_input_tokens_seen": 17867270, + "step": 834, + "time_per_iteration": 2.7868030071258545 + }, + { + "auxiliary_loss_clip": 0.01216246, + "auxiliary_loss_mlp": 0.01064427, + "balance_loss_clip": 1.06272483, + "balance_loss_mlp": 1.03672278, + "epoch": 0.0502029159777544, + "flos": 20996179516800.0, + "grad_norm": 2.2306487397141646, + "language_loss": 0.92186153, + "learning_rate": 3.995720499401282e-06, + "loss": 0.94466823, + "num_input_tokens_seen": 17884880, + "step": 835, + "time_per_iteration": 2.6224496364593506 + }, + { + "auxiliary_loss_clip": 0.01229494, + "auxiliary_loss_mlp": 0.01074922, + "balance_loss_clip": 1.06143415, + "balance_loss_mlp": 1.0464313, + "epoch": 0.050263039230422364, + "flos": 15888066393600.0, + "grad_norm": 2.196832783808158, + "language_loss": 0.76143622, + "learning_rate": 3.995694997432911e-06, + "loss": 0.78448039, + "num_input_tokens_seen": 17903695, + "step": 836, + "time_per_iteration": 2.5648462772369385 + }, + { + "auxiliary_loss_clip": 0.01211162, + "auxiliary_loss_mlp": 0.01075977, + "balance_loss_clip": 1.06259084, + "balance_loss_mlp": 1.04992962, + "epoch": 0.050323162483090336, + "flos": 23732752060800.0, + "grad_norm": 2.100773352560791, + "language_loss": 0.83627856, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.85914999, + "num_input_tokens_seen": 17920745, + "step": 837, + "time_per_iteration": 2.7420156002044678 + }, + { + "auxiliary_loss_clip": 0.01198815, + "auxiliary_loss_mlp": 0.0078439, + "balance_loss_clip": 1.06345344, + "balance_loss_mlp": 1.00053763, + "epoch": 0.0503832857357583, + "flos": 20266223328000.0, + "grad_norm": 2.1353335821274477, + "language_loss": 0.72857559, + "learning_rate": 3.995643766466275e-06, + "loss": 0.7484076, + "num_input_tokens_seen": 17938220, + "step": 838, + "time_per_iteration": 2.679177761077881 + }, + { + "auxiliary_loss_clip": 0.01189223, + "auxiliary_loss_mlp": 0.01071526, + "balance_loss_clip": 1.05415273, + "balance_loss_mlp": 1.04510927, + "epoch": 0.05044340898842627, + "flos": 17785011548160.0, + "grad_norm": 1.8138261016039334, + "language_loss": 0.83462799, + "learning_rate": 3.995618037469953e-06, + "loss": 0.85723549, + "num_input_tokens_seen": 17957325, + "step": 839, + "time_per_iteration": 2.69063663482666 + }, + { + "auxiliary_loss_clip": 0.01220356, + "auxiliary_loss_mlp": 0.01069331, + "balance_loss_clip": 1.05991399, + "balance_loss_mlp": 1.04411805, + "epoch": 0.050503532241094246, + "flos": 22966526113920.0, + "grad_norm": 1.7513762525269907, + "language_loss": 0.85775483, + "learning_rate": 3.995592232799595e-06, + "loss": 0.88065171, + "num_input_tokens_seen": 17975875, + "step": 840, + "time_per_iteration": 2.6477303504943848 + }, + { + "auxiliary_loss_clip": 0.01192112, + "auxiliary_loss_mlp": 0.01064377, + "balance_loss_clip": 1.05451894, + "balance_loss_mlp": 1.036291, + "epoch": 0.05056365549376221, + "flos": 22776989022720.0, + "grad_norm": 1.7956760046069329, + "language_loss": 0.9457823, + "learning_rate": 3.99556635245618e-06, + "loss": 0.96834719, + "num_input_tokens_seen": 17994340, + "step": 841, + "time_per_iteration": 2.8354220390319824 + }, + { + "auxiliary_loss_clip": 0.0122473, + "auxiliary_loss_mlp": 0.01070125, + "balance_loss_clip": 1.06219172, + "balance_loss_mlp": 1.04329097, + "epoch": 0.05062377874643018, + "flos": 30916968399360.0, + "grad_norm": 2.3106044659054104, + "language_loss": 0.77566791, + "learning_rate": 3.995540396440688e-06, + "loss": 0.79861641, + "num_input_tokens_seen": 18015260, + "step": 842, + "time_per_iteration": 2.6909749507904053 + }, + { + "auxiliary_loss_clip": 0.01214637, + "auxiliary_loss_mlp": 0.01071033, + "balance_loss_clip": 1.06270838, + "balance_loss_mlp": 1.04391265, + "epoch": 0.05068390199909815, + "flos": 19647159402240.0, + "grad_norm": 2.8849837971101864, + "language_loss": 0.78126526, + "learning_rate": 3.995514364754105e-06, + "loss": 0.80412203, + "num_input_tokens_seen": 18033960, + "step": 843, + "time_per_iteration": 2.6534156799316406 + }, + { + "auxiliary_loss_clip": 0.01212948, + "auxiliary_loss_mlp": 0.01063612, + "balance_loss_clip": 1.06317043, + "balance_loss_mlp": 1.03894806, + "epoch": 0.05074402525176612, + "flos": 37962103276800.0, + "grad_norm": 1.9320015451631862, + "language_loss": 0.83256191, + "learning_rate": 3.995488257397417e-06, + "loss": 0.85532749, + "num_input_tokens_seen": 18056700, + "step": 844, + "time_per_iteration": 2.7682149410247803 + }, + { + "auxiliary_loss_clip": 0.01216308, + "auxiliary_loss_mlp": 0.01067162, + "balance_loss_clip": 1.06307864, + "balance_loss_mlp": 1.04138875, + "epoch": 0.05080414850443409, + "flos": 22054610603520.0, + "grad_norm": 2.113957107027846, + "language_loss": 0.77108061, + "learning_rate": 3.995462074371614e-06, + "loss": 0.79391527, + "num_input_tokens_seen": 18075815, + "step": 845, + "time_per_iteration": 2.6720399856567383 + }, + { + "auxiliary_loss_clip": 0.01206643, + "auxiliary_loss_mlp": 0.01065522, + "balance_loss_clip": 1.05881417, + "balance_loss_mlp": 1.03885484, + "epoch": 0.05086427175710206, + "flos": 20225787592320.0, + "grad_norm": 1.8497392628450484, + "language_loss": 0.87773871, + "learning_rate": 3.99543581567769e-06, + "loss": 0.90046036, + "num_input_tokens_seen": 18095095, + "step": 846, + "time_per_iteration": 2.696049690246582 + }, + { + "auxiliary_loss_clip": 0.01206291, + "auxiliary_loss_mlp": 0.01069231, + "balance_loss_clip": 1.06204462, + "balance_loss_mlp": 1.04330277, + "epoch": 0.05092439500977003, + "flos": 15159223526400.0, + "grad_norm": 1.695550491545423, + "language_loss": 0.87364423, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.89639944, + "num_input_tokens_seen": 18112675, + "step": 847, + "time_per_iteration": 2.666907548904419 + }, + { + "auxiliary_loss_clip": 0.01175052, + "auxiliary_loss_mlp": 0.01071976, + "balance_loss_clip": 1.06267309, + "balance_loss_mlp": 1.0447005, + "epoch": 0.050984518262437994, + "flos": 22055149307520.0, + "grad_norm": 2.5687168450386637, + "language_loss": 0.81878662, + "learning_rate": 3.995383071289462e-06, + "loss": 0.84125686, + "num_input_tokens_seen": 18130745, + "step": 848, + "time_per_iteration": 2.782135486602783 + }, + { + "auxiliary_loss_clip": 0.0122638, + "auxiliary_loss_mlp": 0.01071388, + "balance_loss_clip": 1.06619906, + "balance_loss_mlp": 1.04544854, + "epoch": 0.05104464151510597, + "flos": 30225329043840.0, + "grad_norm": 1.678404869397893, + "language_loss": 0.87187904, + "learning_rate": 3.995356585597158e-06, + "loss": 0.89485669, + "num_input_tokens_seen": 18152410, + "step": 849, + "time_per_iteration": 2.787992000579834 + }, + { + "auxiliary_loss_clip": 0.01220251, + "auxiliary_loss_mlp": 0.0106131, + "balance_loss_clip": 1.06049275, + "balance_loss_mlp": 1.03545308, + "epoch": 0.05110476476777394, + "flos": 18332900674560.0, + "grad_norm": 2.125711462362114, + "language_loss": 0.8315587, + "learning_rate": 3.995330024240732e-06, + "loss": 0.85437429, + "num_input_tokens_seen": 18170870, + "step": 850, + "time_per_iteration": 2.6548752784729004 + }, + { + "auxiliary_loss_clip": 0.01210598, + "auxiliary_loss_mlp": 0.01063491, + "balance_loss_clip": 1.06061506, + "balance_loss_mlp": 1.0379566, + "epoch": 0.051164888020441904, + "flos": 37998732170880.0, + "grad_norm": 2.2115645013354253, + "language_loss": 0.65423882, + "learning_rate": 3.995303387221192e-06, + "loss": 0.67697972, + "num_input_tokens_seen": 18191555, + "step": 851, + "time_per_iteration": 2.817197322845459 + }, + { + "auxiliary_loss_clip": 0.0120566, + "auxiliary_loss_mlp": 0.01075745, + "balance_loss_clip": 1.05822444, + "balance_loss_mlp": 1.04761147, + "epoch": 0.051225011273109876, + "flos": 23038634666880.0, + "grad_norm": 2.3720786299251073, + "language_loss": 0.83587611, + "learning_rate": 3.995276674539547e-06, + "loss": 0.8586902, + "num_input_tokens_seen": 18208620, + "step": 852, + "time_per_iteration": 2.685727119445801 + }, + { + "auxiliary_loss_clip": 0.01193575, + "auxiliary_loss_mlp": 0.01074152, + "balance_loss_clip": 1.05924761, + "balance_loss_mlp": 1.04737723, + "epoch": 0.05128513452577785, + "flos": 18259822454400.0, + "grad_norm": 2.1832763559951234, + "language_loss": 0.80761266, + "learning_rate": 3.995249886196811e-06, + "loss": 0.8302899, + "num_input_tokens_seen": 18226370, + "step": 853, + "time_per_iteration": 2.6078240871429443 + }, + { + "auxiliary_loss_clip": 0.01222394, + "auxiliary_loss_mlp": 0.01065268, + "balance_loss_clip": 1.06223083, + "balance_loss_mlp": 1.03780222, + "epoch": 0.05134525777844581, + "flos": 27198957571200.0, + "grad_norm": 1.8511550328562763, + "language_loss": 0.75617325, + "learning_rate": 3.995223022193999e-06, + "loss": 0.77904987, + "num_input_tokens_seen": 18247075, + "step": 854, + "time_per_iteration": 2.633543014526367 + }, + { + "auxiliary_loss_clip": 0.01202415, + "auxiliary_loss_mlp": 0.01065973, + "balance_loss_clip": 1.06141627, + "balance_loss_mlp": 1.03828049, + "epoch": 0.051405381031113785, + "flos": 28362247436160.0, + "grad_norm": 2.04057054323539, + "language_loss": 0.81722355, + "learning_rate": 3.99519608253213e-06, + "loss": 0.83990741, + "num_input_tokens_seen": 18265680, + "step": 855, + "time_per_iteration": 2.760880708694458 + }, + { + "auxiliary_loss_clip": 0.01076712, + "auxiliary_loss_mlp": 0.00762392, + "balance_loss_clip": 1.0358243, + "balance_loss_mlp": 1.00074518, + "epoch": 0.05146550428378175, + "flos": 65618169327360.0, + "grad_norm": 0.9894594919315515, + "language_loss": 0.65634769, + "learning_rate": 3.995169067212227e-06, + "loss": 0.67473871, + "num_input_tokens_seen": 18327015, + "step": 856, + "time_per_iteration": 6.271182298660278 + }, + { + "auxiliary_loss_clip": 0.01194232, + "auxiliary_loss_mlp": 0.01056626, + "balance_loss_clip": 1.05972147, + "balance_loss_mlp": 1.02994716, + "epoch": 0.05152562753644972, + "flos": 22054861998720.0, + "grad_norm": 1.8001295724347575, + "language_loss": 0.77139348, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.79390204, + "num_input_tokens_seen": 18345235, + "step": 857, + "time_per_iteration": 4.905239582061768 + }, + { + "auxiliary_loss_clip": 0.01183581, + "auxiliary_loss_mlp": 0.01059685, + "balance_loss_clip": 1.05640614, + "balance_loss_mlp": 1.03291047, + "epoch": 0.051585750789117694, + "flos": 18509544783360.0, + "grad_norm": 2.111656321737554, + "language_loss": 0.89194518, + "learning_rate": 3.995114809602412e-06, + "loss": 0.91437781, + "num_input_tokens_seen": 18362350, + "step": 858, + "time_per_iteration": 2.7349045276641846 + }, + { + "auxiliary_loss_clip": 0.01196113, + "auxiliary_loss_mlp": 0.01060739, + "balance_loss_clip": 1.06114125, + "balance_loss_mlp": 1.03398848, + "epoch": 0.05164587404178566, + "flos": 23730238108800.0, + "grad_norm": 2.030377637624243, + "language_loss": 0.75684321, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.77941179, + "num_input_tokens_seen": 18383390, + "step": 859, + "time_per_iteration": 2.7611751556396484 + }, + { + "auxiliary_loss_clip": 0.01186313, + "auxiliary_loss_mlp": 0.0107269, + "balance_loss_clip": 1.05708003, + "balance_loss_mlp": 1.04354358, + "epoch": 0.05170599729445363, + "flos": 16252882876800.0, + "grad_norm": 2.134655488493178, + "language_loss": 0.91122925, + "learning_rate": 3.995060249372788e-06, + "loss": 0.93381929, + "num_input_tokens_seen": 18399220, + "step": 860, + "time_per_iteration": 2.666740894317627 + }, + { + "auxiliary_loss_clip": 0.0122488, + "auxiliary_loss_mlp": 0.01060586, + "balance_loss_clip": 1.06531346, + "balance_loss_mlp": 1.03536153, + "epoch": 0.0517661205471216, + "flos": 23985922095360.0, + "grad_norm": 1.7954568874114027, + "language_loss": 0.82378531, + "learning_rate": 3.99503285577813e-06, + "loss": 0.84663993, + "num_input_tokens_seen": 18419005, + "step": 861, + "time_per_iteration": 2.6337814331054688 + }, + { + "auxiliary_loss_clip": 0.01198486, + "auxiliary_loss_mlp": 0.01060236, + "balance_loss_clip": 1.06147969, + "balance_loss_mlp": 1.03437924, + "epoch": 0.05182624379978957, + "flos": 29277718392960.0, + "grad_norm": 2.5785699637959776, + "language_loss": 0.78664875, + "learning_rate": 3.995005386531627e-06, + "loss": 0.80923599, + "num_input_tokens_seen": 18440550, + "step": 862, + "time_per_iteration": 2.7570109367370605 + }, + { + "auxiliary_loss_clip": 0.01189664, + "auxiliary_loss_mlp": 0.01070327, + "balance_loss_clip": 1.058797, + "balance_loss_mlp": 1.04547238, + "epoch": 0.05188636705245754, + "flos": 24170826332160.0, + "grad_norm": 1.7880881456146414, + "language_loss": 0.89090264, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.91350257, + "num_input_tokens_seen": 18461950, + "step": 863, + "time_per_iteration": 2.7118866443634033 + }, + { + "auxiliary_loss_clip": 0.01201772, + "auxiliary_loss_mlp": 0.01064316, + "balance_loss_clip": 1.06488204, + "balance_loss_mlp": 1.0369451, + "epoch": 0.051946490305125506, + "flos": 26760703731840.0, + "grad_norm": 2.081656150811602, + "language_loss": 0.76119763, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.78385854, + "num_input_tokens_seen": 18480555, + "step": 864, + "time_per_iteration": 2.6946637630462646 + }, + { + "auxiliary_loss_clip": 0.01186585, + "auxiliary_loss_mlp": 0.01067959, + "balance_loss_clip": 1.05559874, + "balance_loss_mlp": 1.04046965, + "epoch": 0.05200661355779348, + "flos": 21502519585920.0, + "grad_norm": 1.9374308734697678, + "language_loss": 0.7908361, + "learning_rate": 3.994922524891474e-06, + "loss": 0.81338149, + "num_input_tokens_seen": 18499645, + "step": 865, + "time_per_iteration": 2.7700579166412354 + }, + { + "auxiliary_loss_clip": 0.01210067, + "auxiliary_loss_mlp": 0.01067568, + "balance_loss_clip": 1.06164694, + "balance_loss_mlp": 1.04152083, + "epoch": 0.05206673681046144, + "flos": 18114492026880.0, + "grad_norm": 2.269489500676155, + "language_loss": 0.85860598, + "learning_rate": 3.994894753048032e-06, + "loss": 0.88138229, + "num_input_tokens_seen": 18516810, + "step": 866, + "time_per_iteration": 2.659614086151123 + }, + { + "auxiliary_loss_clip": 0.01186536, + "auxiliary_loss_mlp": 0.01070465, + "balance_loss_clip": 1.06327558, + "balance_loss_mlp": 1.04371393, + "epoch": 0.052126860063129415, + "flos": 17524191916800.0, + "grad_norm": 2.1733876112564565, + "language_loss": 0.87495244, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.89752245, + "num_input_tokens_seen": 18532510, + "step": 867, + "time_per_iteration": 2.740238904953003 + }, + { + "auxiliary_loss_clip": 0.01167585, + "auxiliary_loss_mlp": 0.01078445, + "balance_loss_clip": 1.05696058, + "balance_loss_mlp": 1.05437636, + "epoch": 0.05218698331579739, + "flos": 32598054771840.0, + "grad_norm": 1.8498678854952728, + "language_loss": 0.63917863, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.66163892, + "num_input_tokens_seen": 18557380, + "step": 868, + "time_per_iteration": 2.9310383796691895 + }, + { + "auxiliary_loss_clip": 0.01225135, + "auxiliary_loss_mlp": 0.01069894, + "balance_loss_clip": 1.06287289, + "balance_loss_mlp": 1.04173636, + "epoch": 0.05224710656846535, + "flos": 22127293774080.0, + "grad_norm": 2.742912036955754, + "language_loss": 0.83379138, + "learning_rate": 3.994810983642281e-06, + "loss": 0.85674161, + "num_input_tokens_seen": 18575720, + "step": 869, + "time_per_iteration": 2.6453137397766113 + }, + { + "auxiliary_loss_clip": 0.01216406, + "auxiliary_loss_mlp": 0.01056401, + "balance_loss_clip": 1.0645746, + "balance_loss_mlp": 1.03053236, + "epoch": 0.052307229821133325, + "flos": 11145092976000.0, + "grad_norm": 2.188953802542244, + "language_loss": 0.87822217, + "learning_rate": 3.994782909218751e-06, + "loss": 0.90095031, + "num_input_tokens_seen": 18592185, + "step": 870, + "time_per_iteration": 2.7044875621795654 + }, + { + "auxiliary_loss_clip": 0.01226316, + "auxiliary_loss_mlp": 0.01064746, + "balance_loss_clip": 1.06603277, + "balance_loss_mlp": 1.03965199, + "epoch": 0.05236735307380129, + "flos": 19128070005120.0, + "grad_norm": 1.975067156516721, + "language_loss": 0.80651748, + "learning_rate": 3.994754759152854e-06, + "loss": 0.82942802, + "num_input_tokens_seen": 18609560, + "step": 871, + "time_per_iteration": 2.6892175674438477 + }, + { + "auxiliary_loss_clip": 0.0119502, + "auxiliary_loss_mlp": 0.01064309, + "balance_loss_clip": 1.0650804, + "balance_loss_mlp": 1.0396452, + "epoch": 0.05242747632646926, + "flos": 20960663944320.0, + "grad_norm": 1.7402390708810018, + "language_loss": 0.81330585, + "learning_rate": 3.994726533445656e-06, + "loss": 0.83589917, + "num_input_tokens_seen": 18629405, + "step": 872, + "time_per_iteration": 2.8044185638427734 + }, + { + "auxiliary_loss_clip": 0.0107835, + "auxiliary_loss_mlp": 0.01020667, + "balance_loss_clip": 1.03168392, + "balance_loss_mlp": 1.01515913, + "epoch": 0.052487599579137234, + "flos": 65020542842880.0, + "grad_norm": 0.883483589670371, + "language_loss": 0.61589074, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.63688087, + "num_input_tokens_seen": 18681480, + "step": 873, + "time_per_iteration": 3.1711297035217285 + }, + { + "auxiliary_loss_clip": 0.01197438, + "auxiliary_loss_mlp": 0.01056818, + "balance_loss_clip": 1.06202292, + "balance_loss_mlp": 1.03120041, + "epoch": 0.0525477228318052, + "flos": 23288859786240.0, + "grad_norm": 2.1995328011281488, + "language_loss": 0.88965189, + "learning_rate": 3.994669855111643e-06, + "loss": 0.91219449, + "num_input_tokens_seen": 18700390, + "step": 874, + "time_per_iteration": 2.8240153789520264 + }, + { + "auxiliary_loss_clip": 0.01197247, + "auxiliary_loss_mlp": 0.01063458, + "balance_loss_clip": 1.0614326, + "balance_loss_mlp": 1.03682709, + "epoch": 0.05260784608447317, + "flos": 32230221546240.0, + "grad_norm": 1.858649685360537, + "language_loss": 0.74537963, + "learning_rate": 3.994641402486977e-06, + "loss": 0.76798666, + "num_input_tokens_seen": 18721280, + "step": 875, + "time_per_iteration": 2.9111931324005127 + }, + { + "auxiliary_loss_clip": 0.01206205, + "auxiliary_loss_mlp": 0.01058912, + "balance_loss_clip": 1.06306934, + "balance_loss_mlp": 1.03210175, + "epoch": 0.052667969337141136, + "flos": 24463211040000.0, + "grad_norm": 1.7697857141051123, + "language_loss": 0.92843151, + "learning_rate": 3.99461287422531e-06, + "loss": 0.95108265, + "num_input_tokens_seen": 18741545, + "step": 876, + "time_per_iteration": 2.800252676010132 + }, + { + "auxiliary_loss_clip": 0.01100151, + "auxiliary_loss_mlp": 0.01006341, + "balance_loss_clip": 1.02669787, + "balance_loss_mlp": 1.0020256, + "epoch": 0.05272809258980911, + "flos": 57784329567360.0, + "grad_norm": 0.8383495859932864, + "language_loss": 0.62929404, + "learning_rate": 3.994584270327722e-06, + "loss": 0.65035897, + "num_input_tokens_seen": 18801400, + "step": 877, + "time_per_iteration": 3.2090368270874023 + }, + { + "auxiliary_loss_clip": 0.01200578, + "auxiliary_loss_mlp": 0.0106702, + "balance_loss_clip": 1.06150424, + "balance_loss_mlp": 1.03931606, + "epoch": 0.05278821584247708, + "flos": 17420805596160.0, + "grad_norm": 2.042786693643985, + "language_loss": 0.85383844, + "learning_rate": 3.994555590795299e-06, + "loss": 0.87651443, + "num_input_tokens_seen": 18819670, + "step": 878, + "time_per_iteration": 2.823835849761963 + }, + { + "auxiliary_loss_clip": 0.0122514, + "auxiliary_loss_mlp": 0.01061117, + "balance_loss_clip": 1.0635035, + "balance_loss_mlp": 1.03551078, + "epoch": 0.052848339095145046, + "flos": 26137258346880.0, + "grad_norm": 1.7462717669338121, + "language_loss": 0.83076209, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.8536247, + "num_input_tokens_seen": 18840580, + "step": 879, + "time_per_iteration": 2.743673086166382 + }, + { + "auxiliary_loss_clip": 0.0119139, + "auxiliary_loss_mlp": 0.01066471, + "balance_loss_clip": 1.06152987, + "balance_loss_mlp": 1.04013824, + "epoch": 0.05290846234781302, + "flos": 16472081623680.0, + "grad_norm": 1.9601789563010765, + "language_loss": 0.84284604, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.86542469, + "num_input_tokens_seen": 18859295, + "step": 880, + "time_per_iteration": 2.7560529708862305 + }, + { + "auxiliary_loss_clip": 0.01184956, + "auxiliary_loss_mlp": 0.01065063, + "balance_loss_clip": 1.05969453, + "balance_loss_mlp": 1.03887296, + "epoch": 0.05296858560048098, + "flos": 19865173000320.0, + "grad_norm": 2.4477328752698564, + "language_loss": 0.86870736, + "learning_rate": 3.994469098399906e-06, + "loss": 0.89120758, + "num_input_tokens_seen": 18877485, + "step": 881, + "time_per_iteration": 2.855395555496216 + }, + { + "auxiliary_loss_clip": 0.01207858, + "auxiliary_loss_mlp": 0.01070235, + "balance_loss_clip": 1.05984437, + "balance_loss_mlp": 1.04238808, + "epoch": 0.053028708853148955, + "flos": 24388588535040.0, + "grad_norm": 1.7611192020675561, + "language_loss": 0.87967896, + "learning_rate": 3.994440116339046e-06, + "loss": 0.90245986, + "num_input_tokens_seen": 18898275, + "step": 882, + "time_per_iteration": 2.8480119705200195 + }, + { + "auxiliary_loss_clip": 0.01224906, + "auxiliary_loss_mlp": 0.01057944, + "balance_loss_clip": 1.06268644, + "balance_loss_mlp": 1.03059733, + "epoch": 0.05308883210581693, + "flos": 36393166143360.0, + "grad_norm": 2.3555018967788635, + "language_loss": 0.69469339, + "learning_rate": 3.994411058648816e-06, + "loss": 0.71752191, + "num_input_tokens_seen": 18920665, + "step": 883, + "time_per_iteration": 2.8808236122131348 + }, + { + "auxiliary_loss_clip": 0.01166777, + "auxiliary_loss_mlp": 0.01063991, + "balance_loss_clip": 1.05333591, + "balance_loss_mlp": 1.03855157, + "epoch": 0.05314895535848489, + "flos": 22855095146880.0, + "grad_norm": 2.039016812023355, + "language_loss": 0.76100993, + "learning_rate": 3.994381925330319e-06, + "loss": 0.78331757, + "num_input_tokens_seen": 18939835, + "step": 884, + "time_per_iteration": 2.8462212085723877 + }, + { + "auxiliary_loss_clip": 0.01172569, + "auxiliary_loss_mlp": 0.01066856, + "balance_loss_clip": 1.06269383, + "balance_loss_mlp": 1.04147613, + "epoch": 0.053209078611152864, + "flos": 12860330204160.0, + "grad_norm": 1.9865896222141148, + "language_loss": 0.86195529, + "learning_rate": 3.994352716384659e-06, + "loss": 0.88434947, + "num_input_tokens_seen": 18958405, + "step": 885, + "time_per_iteration": 2.7825753688812256 + }, + { + "auxiliary_loss_clip": 0.0118405, + "auxiliary_loss_mlp": 0.01068976, + "balance_loss_clip": 1.05229151, + "balance_loss_mlp": 1.04203486, + "epoch": 0.05326920186382083, + "flos": 12164596698240.0, + "grad_norm": 2.608647457747672, + "language_loss": 0.85971159, + "learning_rate": 3.994323431812945e-06, + "loss": 0.88224185, + "num_input_tokens_seen": 18975445, + "step": 886, + "time_per_iteration": 2.7393639087677 + }, + { + "auxiliary_loss_clip": 0.0117343, + "auxiliary_loss_mlp": 0.01065966, + "balance_loss_clip": 1.05620933, + "balance_loss_mlp": 1.03879774, + "epoch": 0.0533293251164888, + "flos": 22704485420160.0, + "grad_norm": 2.040002880698432, + "language_loss": 0.8961553, + "learning_rate": 3.994294071616286e-06, + "loss": 0.91854936, + "num_input_tokens_seen": 18991930, + "step": 887, + "time_per_iteration": 2.8606581687927246 + }, + { + "auxiliary_loss_clip": 0.01144444, + "auxiliary_loss_mlp": 0.01072438, + "balance_loss_clip": 1.04453194, + "balance_loss_mlp": 1.04411352, + "epoch": 0.053389448369156774, + "flos": 26940939200640.0, + "grad_norm": 2.062562868466936, + "language_loss": 0.74852538, + "learning_rate": 3.994264635795796e-06, + "loss": 0.77069414, + "num_input_tokens_seen": 19009790, + "step": 888, + "time_per_iteration": 2.8675312995910645 + }, + { + "auxiliary_loss_clip": 0.01164085, + "auxiliary_loss_mlp": 0.01072324, + "balance_loss_clip": 1.05659473, + "balance_loss_mlp": 1.04525173, + "epoch": 0.05344957162182474, + "flos": 25556331686400.0, + "grad_norm": 1.7884280759117637, + "language_loss": 0.88440782, + "learning_rate": 3.994235124352592e-06, + "loss": 0.9067719, + "num_input_tokens_seen": 19030170, + "step": 889, + "time_per_iteration": 2.9419636726379395 + }, + { + "auxiliary_loss_clip": 0.0121577, + "auxiliary_loss_mlp": 0.0105125, + "balance_loss_clip": 1.06085157, + "balance_loss_mlp": 1.02607334, + "epoch": 0.05350969487449271, + "flos": 19719591177600.0, + "grad_norm": 1.9333059575084248, + "language_loss": 0.88386381, + "learning_rate": 3.994205537287791e-06, + "loss": 0.90653402, + "num_input_tokens_seen": 19048075, + "step": 890, + "time_per_iteration": 2.7030327320098877 + }, + { + "auxiliary_loss_clip": 0.01195034, + "auxiliary_loss_mlp": 0.01069003, + "balance_loss_clip": 1.05835462, + "balance_loss_mlp": 1.04450595, + "epoch": 0.053569818127160676, + "flos": 27016351804800.0, + "grad_norm": 2.435204176890571, + "language_loss": 0.93450797, + "learning_rate": 3.994175874602517e-06, + "loss": 0.95714831, + "num_input_tokens_seen": 19067465, + "step": 891, + "time_per_iteration": 2.81527042388916 + }, + { + "auxiliary_loss_clip": 0.01190797, + "auxiliary_loss_mlp": 0.01066955, + "balance_loss_clip": 1.05605483, + "balance_loss_mlp": 1.03909576, + "epoch": 0.05362994137982865, + "flos": 13188338225280.0, + "grad_norm": 2.3400199158693087, + "language_loss": 0.71625131, + "learning_rate": 3.994146136297893e-06, + "loss": 0.73882878, + "num_input_tokens_seen": 19085505, + "step": 892, + "time_per_iteration": 2.825984239578247 + }, + { + "auxiliary_loss_clip": 0.01191313, + "auxiliary_loss_mlp": 0.0078394, + "balance_loss_clip": 1.05727172, + "balance_loss_mlp": 1.00024366, + "epoch": 0.05369006463249662, + "flos": 28658008022400.0, + "grad_norm": 1.6058100223173828, + "language_loss": 0.82331586, + "learning_rate": 3.994116322375049e-06, + "loss": 0.84306836, + "num_input_tokens_seen": 19104360, + "step": 893, + "time_per_iteration": 2.8618266582489014 + }, + { + "auxiliary_loss_clip": 0.01192677, + "auxiliary_loss_mlp": 0.01063531, + "balance_loss_clip": 1.0572021, + "balance_loss_mlp": 1.03850877, + "epoch": 0.053750187885164585, + "flos": 28913153304960.0, + "grad_norm": 2.0228714136718122, + "language_loss": 0.82052565, + "learning_rate": 3.994086432835114e-06, + "loss": 0.84308773, + "num_input_tokens_seen": 19124680, + "step": 894, + "time_per_iteration": 2.8347885608673096 + }, + { + "auxiliary_loss_clip": 0.0120111, + "auxiliary_loss_mlp": 0.01065233, + "balance_loss_clip": 1.0570271, + "balance_loss_mlp": 1.03997254, + "epoch": 0.05381031113783256, + "flos": 15158828476800.0, + "grad_norm": 2.260594705980758, + "language_loss": 0.76133072, + "learning_rate": 3.994056467679221e-06, + "loss": 0.78399414, + "num_input_tokens_seen": 19142895, + "step": 895, + "time_per_iteration": 2.7288858890533447 + }, + { + "auxiliary_loss_clip": 0.01200143, + "auxiliary_loss_mlp": 0.01060588, + "balance_loss_clip": 1.06422663, + "balance_loss_mlp": 1.03547084, + "epoch": 0.05387043439050053, + "flos": 21835232288640.0, + "grad_norm": 2.0450623179174974, + "language_loss": 0.86767507, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.89028239, + "num_input_tokens_seen": 19163125, + "step": 896, + "time_per_iteration": 4.404265642166138 + }, + { + "auxiliary_loss_clip": 0.0122203, + "auxiliary_loss_mlp": 0.00782931, + "balance_loss_clip": 1.06062579, + "balance_loss_mlp": 1.0002867, + "epoch": 0.053930557643168495, + "flos": 17310308382720.0, + "grad_norm": 3.0866230440609805, + "language_loss": 0.8797363, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.89978594, + "num_input_tokens_seen": 19179385, + "step": 897, + "time_per_iteration": 4.843130588531494 + }, + { + "auxiliary_loss_clip": 0.01201639, + "auxiliary_loss_mlp": 0.01063724, + "balance_loss_clip": 1.05896854, + "balance_loss_mlp": 1.03658032, + "epoch": 0.05399068089583647, + "flos": 17348481561600.0, + "grad_norm": 1.8270040910241792, + "language_loss": 0.90170419, + "learning_rate": 3.993966118527175e-06, + "loss": 0.92435783, + "num_input_tokens_seen": 19198725, + "step": 898, + "time_per_iteration": 2.695235252380371 + }, + { + "auxiliary_loss_clip": 0.01200189, + "auxiliary_loss_mlp": 0.01076438, + "balance_loss_clip": 1.05787873, + "balance_loss_mlp": 1.05105805, + "epoch": 0.05405080414850443, + "flos": 17486952491520.0, + "grad_norm": 2.793625116693953, + "language_loss": 0.91544139, + "learning_rate": 3.993935850918845e-06, + "loss": 0.93820769, + "num_input_tokens_seen": 19212380, + "step": 899, + "time_per_iteration": 2.7509548664093018 + }, + { + "auxiliary_loss_clip": 0.01186479, + "auxiliary_loss_mlp": 0.01068594, + "balance_loss_clip": 1.05614042, + "balance_loss_mlp": 1.04154527, + "epoch": 0.054110927401172404, + "flos": 24496787278080.0, + "grad_norm": 1.983572968760697, + "language_loss": 0.75742769, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.77997845, + "num_input_tokens_seen": 19232235, + "step": 900, + "time_per_iteration": 2.771371364593506 + }, + { + "auxiliary_loss_clip": 0.01211506, + "auxiliary_loss_mlp": 0.01058176, + "balance_loss_clip": 1.05839145, + "balance_loss_mlp": 1.03401244, + "epoch": 0.054171050653840376, + "flos": 22930040874240.0, + "grad_norm": 2.192527627735503, + "language_loss": 0.74331856, + "learning_rate": 3.993875088872592e-06, + "loss": 0.76601535, + "num_input_tokens_seen": 19251460, + "step": 901, + "time_per_iteration": 2.859912157058716 + }, + { + "auxiliary_loss_clip": 0.01177502, + "auxiliary_loss_mlp": 0.01065445, + "balance_loss_clip": 1.0569309, + "balance_loss_mlp": 1.04166329, + "epoch": 0.05423117390650834, + "flos": 12933192942720.0, + "grad_norm": 2.352700712836257, + "language_loss": 0.85287452, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.87530404, + "num_input_tokens_seen": 19269060, + "step": 902, + "time_per_iteration": 2.7940642833709717 + }, + { + "auxiliary_loss_clip": 0.01161069, + "auxiliary_loss_mlp": 0.01066664, + "balance_loss_clip": 1.04903233, + "balance_loss_mlp": 1.04112983, + "epoch": 0.05429129715917631, + "flos": 19901335017600.0, + "grad_norm": 1.9620711230312637, + "language_loss": 0.86385572, + "learning_rate": 3.993814024394569e-06, + "loss": 0.88613302, + "num_input_tokens_seen": 19288620, + "step": 903, + "time_per_iteration": 2.9258980751037598 + }, + { + "auxiliary_loss_clip": 0.0121005, + "auxiliary_loss_mlp": 0.01059616, + "balance_loss_clip": 1.06094384, + "balance_loss_mlp": 1.03534508, + "epoch": 0.05435142041184428, + "flos": 16908611610240.0, + "grad_norm": 2.175127974944855, + "language_loss": 0.74927866, + "learning_rate": 3.993783378746537e-06, + "loss": 0.7719754, + "num_input_tokens_seen": 19306615, + "step": 904, + "time_per_iteration": 2.7239954471588135 + }, + { + "auxiliary_loss_clip": 0.01208402, + "auxiliary_loss_mlp": 0.01067543, + "balance_loss_clip": 1.06052148, + "balance_loss_mlp": 1.04325962, + "epoch": 0.05441154366451225, + "flos": 23948323534080.0, + "grad_norm": 2.5191963984804535, + "language_loss": 0.85946918, + "learning_rate": 3.993752657494039e-06, + "loss": 0.88222867, + "num_input_tokens_seen": 19321680, + "step": 905, + "time_per_iteration": 2.693896532058716 + }, + { + "auxiliary_loss_clip": 0.01198232, + "auxiliary_loss_mlp": 0.01078072, + "balance_loss_clip": 1.06483209, + "balance_loss_mlp": 1.05400348, + "epoch": 0.05447166691718022, + "flos": 19975382904960.0, + "grad_norm": 1.7753581401878566, + "language_loss": 0.74413162, + "learning_rate": 3.993721860638241e-06, + "loss": 0.7668947, + "num_input_tokens_seen": 19339760, + "step": 906, + "time_per_iteration": 2.6679019927978516 + }, + { + "auxiliary_loss_clip": 0.01192373, + "auxiliary_loss_mlp": 0.01064381, + "balance_loss_clip": 1.05954027, + "balance_loss_mlp": 1.0397284, + "epoch": 0.05453179016984819, + "flos": 24936513575040.0, + "grad_norm": 2.3037248114268896, + "language_loss": 0.87340188, + "learning_rate": 3.993690988180309e-06, + "loss": 0.89596951, + "num_input_tokens_seen": 19359585, + "step": 907, + "time_per_iteration": 2.7363240718841553 + }, + { + "auxiliary_loss_clip": 0.01205519, + "auxiliary_loss_mlp": 0.01068463, + "balance_loss_clip": 1.0616293, + "balance_loss_mlp": 1.04332149, + "epoch": 0.05459191342251616, + "flos": 18115102558080.0, + "grad_norm": 1.6666873589767146, + "language_loss": 0.86928803, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.89202785, + "num_input_tokens_seen": 19378590, + "step": 908, + "time_per_iteration": 2.6266026496887207 + }, + { + "auxiliary_loss_clip": 0.01198848, + "auxiliary_loss_mlp": 0.01067336, + "balance_loss_clip": 1.05974221, + "balance_loss_mlp": 1.04107404, + "epoch": 0.054652036675184125, + "flos": 19208295031680.0, + "grad_norm": 2.1282794409977215, + "language_loss": 0.89792144, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.92058325, + "num_input_tokens_seen": 19397910, + "step": 909, + "time_per_iteration": 2.7163166999816895 + }, + { + "auxiliary_loss_clip": 0.01200393, + "auxiliary_loss_mlp": 0.01073374, + "balance_loss_clip": 1.06157839, + "balance_loss_mlp": 1.04742169, + "epoch": 0.0547121599278521, + "flos": 16325745615360.0, + "grad_norm": 2.095924869989121, + "language_loss": 0.70949811, + "learning_rate": 3.99359791720544e-06, + "loss": 0.73223579, + "num_input_tokens_seen": 19415950, + "step": 910, + "time_per_iteration": 2.6697354316711426 + }, + { + "auxiliary_loss_clip": 0.01187784, + "auxiliary_loss_mlp": 0.01054671, + "balance_loss_clip": 1.05651259, + "balance_loss_mlp": 1.02975583, + "epoch": 0.05477228318052007, + "flos": 20339014239360.0, + "grad_norm": 1.6633724338567386, + "language_loss": 0.83651805, + "learning_rate": 3.993566742350714e-06, + "loss": 0.85894263, + "num_input_tokens_seen": 19435275, + "step": 911, + "time_per_iteration": 2.692798137664795 + }, + { + "auxiliary_loss_clip": 0.01187113, + "auxiliary_loss_mlp": 0.01073028, + "balance_loss_clip": 1.05334687, + "balance_loss_mlp": 1.04719508, + "epoch": 0.054832406433188034, + "flos": 21973092687360.0, + "grad_norm": 2.283907419545301, + "language_loss": 0.76320881, + "learning_rate": 3.993535491899736e-06, + "loss": 0.78581023, + "num_input_tokens_seen": 19452090, + "step": 912, + "time_per_iteration": 2.6653189659118652 + }, + { + "auxiliary_loss_clip": 0.01186313, + "auxiliary_loss_mlp": 0.01051652, + "balance_loss_clip": 1.05707574, + "balance_loss_mlp": 1.0271548, + "epoch": 0.054892529685856006, + "flos": 16398931576320.0, + "grad_norm": 2.366460016615147, + "language_loss": 0.82826668, + "learning_rate": 3.993504165853694e-06, + "loss": 0.85064626, + "num_input_tokens_seen": 19470865, + "step": 913, + "time_per_iteration": 2.6826348304748535 + }, + { + "auxiliary_loss_clip": 0.01194515, + "auxiliary_loss_mlp": 0.01060483, + "balance_loss_clip": 1.0581125, + "balance_loss_mlp": 1.03651023, + "epoch": 0.05495265293852397, + "flos": 23912341084800.0, + "grad_norm": 3.3338391252510586, + "language_loss": 0.8373239, + "learning_rate": 3.993472764213772e-06, + "loss": 0.85987389, + "num_input_tokens_seen": 19492145, + "step": 914, + "time_per_iteration": 2.7358829975128174 + }, + { + "auxiliary_loss_clip": 0.0120705, + "auxiliary_loss_mlp": 0.0078227, + "balance_loss_clip": 1.06039774, + "balance_loss_mlp": 1.00027478, + "epoch": 0.055012776191191944, + "flos": 23586954756480.0, + "grad_norm": 2.520244909384168, + "language_loss": 0.90146536, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.92135859, + "num_input_tokens_seen": 19511015, + "step": 915, + "time_per_iteration": 2.9398341178894043 + }, + { + "auxiliary_loss_clip": 0.01201461, + "auxiliary_loss_mlp": 0.01059252, + "balance_loss_clip": 1.06274199, + "balance_loss_mlp": 1.03558862, + "epoch": 0.055072899443859916, + "flos": 17528501548800.0, + "grad_norm": 2.182721785653499, + "language_loss": 0.89710975, + "learning_rate": 3.993409734157064e-06, + "loss": 0.91971689, + "num_input_tokens_seen": 19529040, + "step": 916, + "time_per_iteration": 2.7210159301757812 + }, + { + "auxiliary_loss_clip": 0.01175226, + "auxiliary_loss_mlp": 0.01066073, + "balance_loss_clip": 1.05741024, + "balance_loss_mlp": 1.04103947, + "epoch": 0.05513302269652788, + "flos": 21687172427520.0, + "grad_norm": 1.7899379897310368, + "language_loss": 0.8016991, + "learning_rate": 3.993378105742666e-06, + "loss": 0.82411212, + "num_input_tokens_seen": 19549540, + "step": 917, + "time_per_iteration": 2.7923104763031006 + }, + { + "auxiliary_loss_clip": 0.01139072, + "auxiliary_loss_mlp": 0.0105947, + "balance_loss_clip": 1.05135942, + "balance_loss_mlp": 1.03414989, + "epoch": 0.05519314594919585, + "flos": 21613340021760.0, + "grad_norm": 2.106744179667805, + "language_loss": 0.79437333, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.81635869, + "num_input_tokens_seen": 19567570, + "step": 918, + "time_per_iteration": 2.8051092624664307 + }, + { + "auxiliary_loss_clip": 0.01196947, + "auxiliary_loss_mlp": 0.01055679, + "balance_loss_clip": 1.05616307, + "balance_loss_mlp": 1.03166997, + "epoch": 0.05525326920186382, + "flos": 21798567480960.0, + "grad_norm": 2.454030193031321, + "language_loss": 0.89019686, + "learning_rate": 3.99331462214778e-06, + "loss": 0.91272312, + "num_input_tokens_seen": 19585330, + "step": 919, + "time_per_iteration": 2.6846773624420166 + }, + { + "auxiliary_loss_clip": 0.01213326, + "auxiliary_loss_mlp": 0.01069349, + "balance_loss_clip": 1.05950904, + "balance_loss_mlp": 1.04417229, + "epoch": 0.05531339245453179, + "flos": 28439635288320.0, + "grad_norm": 2.246354931091656, + "language_loss": 0.8746047, + "learning_rate": 3.993282766969699e-06, + "loss": 0.89743137, + "num_input_tokens_seen": 19604970, + "step": 920, + "time_per_iteration": 2.6699845790863037 + }, + { + "auxiliary_loss_clip": 0.01190424, + "auxiliary_loss_mlp": 0.0106036, + "balance_loss_clip": 1.06023288, + "balance_loss_mlp": 1.03657782, + "epoch": 0.05537351570719976, + "flos": 37375143131520.0, + "grad_norm": 1.975714125194334, + "language_loss": 0.6568011, + "learning_rate": 3.993250836206136e-06, + "loss": 0.67930895, + "num_input_tokens_seen": 19626235, + "step": 921, + "time_per_iteration": 2.833644390106201 + }, + { + "auxiliary_loss_clip": 0.01209678, + "auxiliary_loss_mlp": 0.01065483, + "balance_loss_clip": 1.06060767, + "balance_loss_mlp": 1.03874445, + "epoch": 0.05543363895986773, + "flos": 20084479488000.0, + "grad_norm": 1.7242493696651606, + "language_loss": 0.71861136, + "learning_rate": 3.993218829858301e-06, + "loss": 0.74136293, + "num_input_tokens_seen": 19644305, + "step": 922, + "time_per_iteration": 2.6168808937072754 + }, + { + "auxiliary_loss_clip": 0.01187138, + "auxiliary_loss_mlp": 0.01067213, + "balance_loss_clip": 1.05423355, + "balance_loss_mlp": 1.04223895, + "epoch": 0.0554937622125357, + "flos": 24533200690560.0, + "grad_norm": 2.6848185900705412, + "language_loss": 0.82304025, + "learning_rate": 3.993186747927408e-06, + "loss": 0.8455838, + "num_input_tokens_seen": 19662130, + "step": 923, + "time_per_iteration": 2.7298316955566406 + }, + { + "auxiliary_loss_clip": 0.01202941, + "auxiliary_loss_mlp": 0.01064106, + "balance_loss_clip": 1.05725455, + "balance_loss_mlp": 1.03933442, + "epoch": 0.055553885465203665, + "flos": 14320063013760.0, + "grad_norm": 1.9334372940525173, + "language_loss": 0.78759122, + "learning_rate": 3.993154590414675e-06, + "loss": 0.81026167, + "num_input_tokens_seen": 19680715, + "step": 924, + "time_per_iteration": 2.6869630813598633 + }, + { + "auxiliary_loss_clip": 0.0116422, + "auxiliary_loss_mlp": 0.01053758, + "balance_loss_clip": 1.05395627, + "balance_loss_mlp": 1.02844954, + "epoch": 0.05561400871787164, + "flos": 27381132374400.0, + "grad_norm": 2.005203138116014, + "language_loss": 1.02005315, + "learning_rate": 3.993122357321319e-06, + "loss": 1.04223299, + "num_input_tokens_seen": 19700535, + "step": 925, + "time_per_iteration": 2.716089963912964 + }, + { + "auxiliary_loss_clip": 0.01163201, + "auxiliary_loss_mlp": 0.01052104, + "balance_loss_clip": 1.05070591, + "balance_loss_mlp": 1.02739179, + "epoch": 0.05567413197053961, + "flos": 23221096778880.0, + "grad_norm": 2.0106641835017482, + "language_loss": 0.80939209, + "learning_rate": 3.993090048648564e-06, + "loss": 0.83154511, + "num_input_tokens_seen": 19718825, + "step": 926, + "time_per_iteration": 2.895803451538086 + }, + { + "auxiliary_loss_clip": 0.01207515, + "auxiliary_loss_mlp": 0.01068168, + "balance_loss_clip": 1.05892682, + "balance_loss_mlp": 1.0419066, + "epoch": 0.055734255223207574, + "flos": 25264952559360.0, + "grad_norm": 2.9732625845644045, + "language_loss": 0.73220479, + "learning_rate": 3.993057664397634e-06, + "loss": 0.75496161, + "num_input_tokens_seen": 19739080, + "step": 927, + "time_per_iteration": 2.677725076675415 + }, + { + "auxiliary_loss_clip": 0.01101002, + "auxiliary_loss_mlp": 0.01015011, + "balance_loss_clip": 1.02922702, + "balance_loss_mlp": 1.01014709, + "epoch": 0.055794378475875546, + "flos": 66503116702080.0, + "grad_norm": 0.8406874373244947, + "language_loss": 0.59841412, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.61957431, + "num_input_tokens_seen": 19802960, + "step": 928, + "time_per_iteration": 3.187382221221924 + }, + { + "auxiliary_loss_clip": 0.01202438, + "auxiliary_loss_mlp": 0.01065066, + "balance_loss_clip": 1.05921853, + "balance_loss_mlp": 1.04070008, + "epoch": 0.05585450172854351, + "flos": 25337635729920.0, + "grad_norm": 2.0668361967965994, + "language_loss": 0.95411372, + "learning_rate": 3.992992669166168e-06, + "loss": 0.97678876, + "num_input_tokens_seen": 19822765, + "step": 929, + "time_per_iteration": 2.6930506229400635 + }, + { + "auxiliary_loss_clip": 0.01171806, + "auxiliary_loss_mlp": 0.01068051, + "balance_loss_clip": 1.05343258, + "balance_loss_mlp": 1.04101443, + "epoch": 0.05591462498121148, + "flos": 33911738881920.0, + "grad_norm": 2.1442452677256627, + "language_loss": 0.71756601, + "learning_rate": 3.992960058188094e-06, + "loss": 0.7399646, + "num_input_tokens_seen": 19843590, + "step": 930, + "time_per_iteration": 2.803219795227051 + }, + { + "auxiliary_loss_clip": 0.01188277, + "auxiliary_loss_mlp": 0.01058888, + "balance_loss_clip": 1.05783677, + "balance_loss_mlp": 1.03377056, + "epoch": 0.055974748233879455, + "flos": 17930880679680.0, + "grad_norm": 2.381261552273062, + "language_loss": 0.85279298, + "learning_rate": 3.992927371636776e-06, + "loss": 0.87526459, + "num_input_tokens_seen": 19860230, + "step": 931, + "time_per_iteration": 2.6215872764587402 + }, + { + "auxiliary_loss_clip": 0.01203533, + "auxiliary_loss_mlp": 0.00783076, + "balance_loss_clip": 1.05677414, + "balance_loss_mlp": 1.00025761, + "epoch": 0.05603487148654742, + "flos": 24021976371840.0, + "grad_norm": 2.2861197477099973, + "language_loss": 0.83645165, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.85631776, + "num_input_tokens_seen": 19880795, + "step": 932, + "time_per_iteration": 2.664062261581421 + }, + { + "auxiliary_loss_clip": 0.01200637, + "auxiliary_loss_mlp": 0.0107041, + "balance_loss_clip": 1.05897784, + "balance_loss_mlp": 1.04407716, + "epoch": 0.05609499473921539, + "flos": 17307758517120.0, + "grad_norm": 1.8036739452122519, + "language_loss": 0.73694205, + "learning_rate": 3.992861771819365e-06, + "loss": 0.7596525, + "num_input_tokens_seen": 19897960, + "step": 933, + "time_per_iteration": 2.631620168685913 + }, + { + "auxiliary_loss_clip": 0.01153445, + "auxiliary_loss_mlp": 0.01076903, + "balance_loss_clip": 1.04885209, + "balance_loss_mlp": 1.05060577, + "epoch": 0.05615511799188336, + "flos": 20994742972800.0, + "grad_norm": 2.385249039382274, + "language_loss": 0.86660421, + "learning_rate": 3.99282885855576e-06, + "loss": 0.88890779, + "num_input_tokens_seen": 19913315, + "step": 934, + "time_per_iteration": 2.7739439010620117 + }, + { + "auxiliary_loss_clip": 0.01164295, + "auxiliary_loss_mlp": 0.0108083, + "balance_loss_clip": 1.05509257, + "balance_loss_mlp": 1.0557723, + "epoch": 0.05621524124455133, + "flos": 17273535834240.0, + "grad_norm": 2.2740258482680433, + "language_loss": 0.80388415, + "learning_rate": 3.992795869723885e-06, + "loss": 0.82633543, + "num_input_tokens_seen": 19928790, + "step": 935, + "time_per_iteration": 5.93512487411499 + }, + { + "auxiliary_loss_clip": 0.01093927, + "auxiliary_loss_mlp": 0.01019701, + "balance_loss_clip": 1.02288604, + "balance_loss_mlp": 1.01540911, + "epoch": 0.0562753644972193, + "flos": 58719370458240.0, + "grad_norm": 0.820561718243334, + "language_loss": 0.69191676, + "learning_rate": 3.99276280532499e-06, + "loss": 0.71305299, + "num_input_tokens_seen": 19988785, + "step": 936, + "time_per_iteration": 4.862478733062744 + }, + { + "auxiliary_loss_clip": 0.01213648, + "auxiliary_loss_mlp": 0.01068507, + "balance_loss_clip": 1.05806684, + "balance_loss_mlp": 1.04429567, + "epoch": 0.05633548774988727, + "flos": 17457039440640.0, + "grad_norm": 1.9573264311231433, + "language_loss": 0.7572521, + "learning_rate": 3.992729665360331e-06, + "loss": 0.78007358, + "num_input_tokens_seen": 20007685, + "step": 937, + "time_per_iteration": 4.219425916671753 + }, + { + "auxiliary_loss_clip": 0.01085529, + "auxiliary_loss_mlp": 0.01013805, + "balance_loss_clip": 1.02476001, + "balance_loss_mlp": 1.00944233, + "epoch": 0.05639561100255524, + "flos": 70654928083200.0, + "grad_norm": 0.9053055994078011, + "language_loss": 0.64309287, + "learning_rate": 3.992696449831162e-06, + "loss": 0.66408622, + "num_input_tokens_seen": 20072750, + "step": 938, + "time_per_iteration": 3.1298794746398926 + }, + { + "auxiliary_loss_clip": 0.01171203, + "auxiliary_loss_mlp": 0.01068815, + "balance_loss_clip": 1.05175185, + "balance_loss_mlp": 1.0426966, + "epoch": 0.056455734255223204, + "flos": 20485996692480.0, + "grad_norm": 2.7427540631348832, + "language_loss": 0.79751205, + "learning_rate": 3.992663158738745e-06, + "loss": 0.8199122, + "num_input_tokens_seen": 20089070, + "step": 939, + "time_per_iteration": 2.6863484382629395 + }, + { + "auxiliary_loss_clip": 0.01175528, + "auxiliary_loss_mlp": 0.01068297, + "balance_loss_clip": 1.0509069, + "balance_loss_mlp": 1.04338217, + "epoch": 0.056515857507891176, + "flos": 22053569109120.0, + "grad_norm": 1.8374791395473227, + "language_loss": 0.73919088, + "learning_rate": 3.992629792084341e-06, + "loss": 0.76162916, + "num_input_tokens_seen": 20108790, + "step": 940, + "time_per_iteration": 2.7111120223999023 + }, + { + "auxiliary_loss_clip": 0.01198483, + "auxiliary_loss_mlp": 0.01058511, + "balance_loss_clip": 1.05900669, + "balance_loss_mlp": 1.03252339, + "epoch": 0.05657598076055915, + "flos": 24025316336640.0, + "grad_norm": 2.2993716569389813, + "language_loss": 0.70622003, + "learning_rate": 3.992596349869216e-06, + "loss": 0.72878999, + "num_input_tokens_seen": 20128455, + "step": 941, + "time_per_iteration": 2.657594680786133 + }, + { + "auxiliary_loss_clip": 0.01135396, + "auxiliary_loss_mlp": 0.01059543, + "balance_loss_clip": 1.04961574, + "balance_loss_mlp": 1.03382993, + "epoch": 0.05663610401322711, + "flos": 20480609652480.0, + "grad_norm": 2.0678542992190847, + "language_loss": 0.80921417, + "learning_rate": 3.992562832094637e-06, + "loss": 0.83116359, + "num_input_tokens_seen": 20145775, + "step": 942, + "time_per_iteration": 2.7379891872406006 + }, + { + "auxiliary_loss_clip": 0.01186767, + "auxiliary_loss_mlp": 0.01062055, + "balance_loss_clip": 1.05228579, + "balance_loss_mlp": 1.03554332, + "epoch": 0.056696227265895086, + "flos": 21069042255360.0, + "grad_norm": 2.245249922529115, + "language_loss": 0.88858449, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.91107273, + "num_input_tokens_seen": 20164315, + "step": 943, + "time_per_iteration": 2.6502583026885986 + }, + { + "auxiliary_loss_clip": 0.01199122, + "auxiliary_loss_mlp": 0.0105963, + "balance_loss_clip": 1.05991781, + "balance_loss_mlp": 1.03534663, + "epoch": 0.05675635051856306, + "flos": 17821317219840.0, + "grad_norm": 2.5514256959015995, + "language_loss": 0.74771839, + "learning_rate": 3.992495569872206e-06, + "loss": 0.77030593, + "num_input_tokens_seen": 20182760, + "step": 944, + "time_per_iteration": 2.676079034805298 + }, + { + "auxiliary_loss_clip": 0.01204502, + "auxiliary_loss_mlp": 0.01064591, + "balance_loss_clip": 1.05980551, + "balance_loss_mlp": 1.04085672, + "epoch": 0.05681647377123102, + "flos": 23114945111040.0, + "grad_norm": 1.5959266123312272, + "language_loss": 0.79406166, + "learning_rate": 3.992461825426906e-06, + "loss": 0.81675267, + "num_input_tokens_seen": 20203830, + "step": 945, + "time_per_iteration": 2.734299421310425 + }, + { + "auxiliary_loss_clip": 0.01195984, + "auxiliary_loss_mlp": 0.0105672, + "balance_loss_clip": 1.05686593, + "balance_loss_mlp": 1.03156662, + "epoch": 0.056876597023898995, + "flos": 16070528505600.0, + "grad_norm": 2.5637081249861824, + "language_loss": 0.82651746, + "learning_rate": 3.992428005427252e-06, + "loss": 0.84904456, + "num_input_tokens_seen": 20220365, + "step": 946, + "time_per_iteration": 2.6636929512023926 + }, + { + "auxiliary_loss_clip": 0.0122014, + "auxiliary_loss_mlp": 0.01061449, + "balance_loss_clip": 1.06224144, + "balance_loss_mlp": 1.03524721, + "epoch": 0.05693672027656696, + "flos": 16835641130880.0, + "grad_norm": 1.8433174156507384, + "language_loss": 0.79031301, + "learning_rate": 3.992394109874529e-06, + "loss": 0.81312895, + "num_input_tokens_seen": 20238640, + "step": 947, + "time_per_iteration": 2.623671293258667 + }, + { + "auxiliary_loss_clip": 0.0117587, + "auxiliary_loss_mlp": 0.01061489, + "balance_loss_clip": 1.05605412, + "balance_loss_mlp": 1.03569245, + "epoch": 0.05699684352923493, + "flos": 21389113370880.0, + "grad_norm": 6.8661947111986725, + "language_loss": 0.85425055, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.87662411, + "num_input_tokens_seen": 20251025, + "step": 948, + "time_per_iteration": 2.7410409450531006 + }, + { + "auxiliary_loss_clip": 0.01214005, + "auxiliary_loss_mlp": 0.01063231, + "balance_loss_clip": 1.05969238, + "balance_loss_mlp": 1.03598022, + "epoch": 0.057056966781902904, + "flos": 15560309767680.0, + "grad_norm": 3.649211317819821, + "language_loss": 0.87346625, + "learning_rate": 3.992326092115019e-06, + "loss": 0.89623863, + "num_input_tokens_seen": 20269775, + "step": 949, + "time_per_iteration": 2.6893157958984375 + }, + { + "auxiliary_loss_clip": 0.01194543, + "auxiliary_loss_mlp": 0.0106695, + "balance_loss_clip": 1.05799937, + "balance_loss_mlp": 1.04266715, + "epoch": 0.05711709003457087, + "flos": 19937856170880.0, + "grad_norm": 1.8324883776363103, + "language_loss": 0.7874645, + "learning_rate": 3.992291969910811e-06, + "loss": 0.8100794, + "num_input_tokens_seen": 20287715, + "step": 950, + "time_per_iteration": 2.623924732208252 + }, + { + "auxiliary_loss_clip": 0.01180518, + "auxiliary_loss_mlp": 0.01068771, + "balance_loss_clip": 1.05322623, + "balance_loss_mlp": 1.04384422, + "epoch": 0.05717721328723884, + "flos": 30332701774080.0, + "grad_norm": 3.8045132244795816, + "language_loss": 0.82477522, + "learning_rate": 3.992257772158691e-06, + "loss": 0.8472681, + "num_input_tokens_seen": 20307070, + "step": 951, + "time_per_iteration": 2.697479724884033 + }, + { + "auxiliary_loss_clip": 0.01167302, + "auxiliary_loss_mlp": 0.01061039, + "balance_loss_clip": 1.04906607, + "balance_loss_mlp": 1.03375173, + "epoch": 0.05723733653990681, + "flos": 23654358627840.0, + "grad_norm": 2.4180383362968634, + "language_loss": 0.86899263, + "learning_rate": 3.992223498859958e-06, + "loss": 0.89127606, + "num_input_tokens_seen": 20324945, + "step": 952, + "time_per_iteration": 2.707716226577759 + }, + { + "auxiliary_loss_clip": 0.01191405, + "auxiliary_loss_mlp": 0.01064705, + "balance_loss_clip": 1.05511189, + "balance_loss_mlp": 1.03630924, + "epoch": 0.05729745979257478, + "flos": 22055759838720.0, + "grad_norm": 2.195434645270168, + "language_loss": 0.79087842, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.81343949, + "num_input_tokens_seen": 20346135, + "step": 953, + "time_per_iteration": 2.671255588531494 + }, + { + "auxiliary_loss_clip": 0.01190026, + "auxiliary_loss_mlp": 0.01066447, + "balance_loss_clip": 1.05984342, + "balance_loss_mlp": 1.04056656, + "epoch": 0.05735758304524275, + "flos": 19604353368960.0, + "grad_norm": 2.2066085695914466, + "language_loss": 0.86644447, + "learning_rate": 3.992154725627848e-06, + "loss": 0.88900924, + "num_input_tokens_seen": 20364450, + "step": 954, + "time_per_iteration": 2.671657085418701 + }, + { + "auxiliary_loss_clip": 0.01210569, + "auxiliary_loss_mlp": 0.01062619, + "balance_loss_clip": 1.06119955, + "balance_loss_mlp": 1.03723955, + "epoch": 0.057417706297910716, + "flos": 19099018880640.0, + "grad_norm": 2.2872795023766113, + "language_loss": 0.88071024, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.90344214, + "num_input_tokens_seen": 20383500, + "step": 955, + "time_per_iteration": 2.69960880279541 + }, + { + "auxiliary_loss_clip": 0.01179864, + "auxiliary_loss_mlp": 0.01068889, + "balance_loss_clip": 1.0523231, + "balance_loss_mlp": 1.04209054, + "epoch": 0.05747782955057869, + "flos": 16654507822080.0, + "grad_norm": 1.9113555723128555, + "language_loss": 0.89160776, + "learning_rate": 3.992085650224914e-06, + "loss": 0.91409534, + "num_input_tokens_seen": 20400295, + "step": 956, + "time_per_iteration": 2.667868137359619 + }, + { + "auxiliary_loss_clip": 0.01167867, + "auxiliary_loss_mlp": 0.01060669, + "balance_loss_clip": 1.05720079, + "balance_loss_mlp": 1.03450251, + "epoch": 0.05753795280324665, + "flos": 14502058248960.0, + "grad_norm": 3.2877973901728095, + "language_loss": 0.75473189, + "learning_rate": 3.99205099921266e-06, + "loss": 0.77701724, + "num_input_tokens_seen": 20419085, + "step": 957, + "time_per_iteration": 2.6938796043395996 + }, + { + "auxiliary_loss_clip": 0.0117627, + "auxiliary_loss_mlp": 0.01072849, + "balance_loss_clip": 1.05432248, + "balance_loss_mlp": 1.0448705, + "epoch": 0.057598076055914625, + "flos": 18076318848000.0, + "grad_norm": 2.0004055711005257, + "language_loss": 0.79582155, + "learning_rate": 3.992016272661633e-06, + "loss": 0.81831264, + "num_input_tokens_seen": 20437465, + "step": 958, + "time_per_iteration": 2.6933834552764893 + }, + { + "auxiliary_loss_clip": 0.01186244, + "auxiliary_loss_mlp": 0.01059908, + "balance_loss_clip": 1.05851364, + "balance_loss_mlp": 1.03572011, + "epoch": 0.0576581993085826, + "flos": 22124600254080.0, + "grad_norm": 2.669863855173802, + "language_loss": 0.8840394, + "learning_rate": 3.99198147057315e-06, + "loss": 0.906501, + "num_input_tokens_seen": 20456235, + "step": 959, + "time_per_iteration": 2.7094578742980957 + }, + { + "auxiliary_loss_clip": 0.01169479, + "auxiliary_loss_mlp": 0.01063656, + "balance_loss_clip": 1.05511999, + "balance_loss_mlp": 1.03881276, + "epoch": 0.05771832256125056, + "flos": 33181746779520.0, + "grad_norm": 2.0960373333994764, + "language_loss": 0.78850955, + "learning_rate": 3.991946592948529e-06, + "loss": 0.8108409, + "num_input_tokens_seen": 20476825, + "step": 960, + "time_per_iteration": 2.822922945022583 + }, + { + "auxiliary_loss_clip": 0.0113413, + "auxiliary_loss_mlp": 0.01067189, + "balance_loss_clip": 1.05177355, + "balance_loss_mlp": 1.04020023, + "epoch": 0.057778445813918534, + "flos": 24170143973760.0, + "grad_norm": 2.063464892179025, + "language_loss": 0.92986894, + "learning_rate": 3.991911639789094e-06, + "loss": 0.95188212, + "num_input_tokens_seen": 20496965, + "step": 961, + "time_per_iteration": 2.793952226638794 + }, + { + "auxiliary_loss_clip": 0.01182535, + "auxiliary_loss_mlp": 0.0106764, + "balance_loss_clip": 1.0554297, + "balance_loss_mlp": 1.04091299, + "epoch": 0.0578385690665865, + "flos": 29643037666560.0, + "grad_norm": 2.0649993155313067, + "language_loss": 0.68164188, + "learning_rate": 3.991876611096169e-06, + "loss": 0.70414358, + "num_input_tokens_seen": 20518035, + "step": 962, + "time_per_iteration": 2.8396694660186768 + }, + { + "auxiliary_loss_clip": 0.01159524, + "auxiliary_loss_mlp": 0.01073851, + "balance_loss_clip": 1.05128908, + "balance_loss_mlp": 1.04909074, + "epoch": 0.05789869231925447, + "flos": 20885430908160.0, + "grad_norm": 2.2685465488517074, + "language_loss": 0.8848027, + "learning_rate": 3.991841506871084e-06, + "loss": 0.90713644, + "num_input_tokens_seen": 20534740, + "step": 963, + "time_per_iteration": 2.7077019214630127 + }, + { + "auxiliary_loss_clip": 0.01183778, + "auxiliary_loss_mlp": 0.01061251, + "balance_loss_clip": 1.06018209, + "balance_loss_mlp": 1.03516829, + "epoch": 0.057958815571922444, + "flos": 26031106679040.0, + "grad_norm": 2.392959969035536, + "language_loss": 0.85288298, + "learning_rate": 3.99180632711517e-06, + "loss": 0.87533331, + "num_input_tokens_seen": 20553485, + "step": 964, + "time_per_iteration": 2.7218217849731445 + }, + { + "auxiliary_loss_clip": 0.01188683, + "auxiliary_loss_mlp": 0.01069422, + "balance_loss_clip": 1.05959499, + "balance_loss_mlp": 1.04325557, + "epoch": 0.05801893882459041, + "flos": 18077683564800.0, + "grad_norm": 3.087349735715565, + "language_loss": 0.78159416, + "learning_rate": 3.99177107182976e-06, + "loss": 0.80417526, + "num_input_tokens_seen": 20572155, + "step": 965, + "time_per_iteration": 2.6902661323547363 + }, + { + "auxiliary_loss_clip": 0.01156531, + "auxiliary_loss_mlp": 0.0107109, + "balance_loss_clip": 1.0523715, + "balance_loss_mlp": 1.04462528, + "epoch": 0.05807906207725838, + "flos": 17748885444480.0, + "grad_norm": 1.9742288518319486, + "language_loss": 0.81403655, + "learning_rate": 3.99173574101619e-06, + "loss": 0.83631277, + "num_input_tokens_seen": 20590395, + "step": 966, + "time_per_iteration": 2.7423267364501953 + }, + { + "auxiliary_loss_clip": 0.01198908, + "auxiliary_loss_mlp": 0.01065021, + "balance_loss_clip": 1.058887, + "balance_loss_mlp": 1.04113197, + "epoch": 0.058139185329926346, + "flos": 18040372312320.0, + "grad_norm": 1.8776530142118544, + "language_loss": 0.76480806, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.78744727, + "num_input_tokens_seen": 20608435, + "step": 967, + "time_per_iteration": 2.642885446548462 + }, + { + "auxiliary_loss_clip": 0.01084339, + "auxiliary_loss_mlp": 0.0103139, + "balance_loss_clip": 1.02675521, + "balance_loss_mlp": 1.0269078, + "epoch": 0.05819930858259432, + "flos": 62363297485440.0, + "grad_norm": 0.985564929959949, + "language_loss": 0.57357776, + "learning_rate": 3.991664852809939e-06, + "loss": 0.59473509, + "num_input_tokens_seen": 20668575, + "step": 968, + "time_per_iteration": 3.1017024517059326 + }, + { + "auxiliary_loss_clip": 0.01188824, + "auxiliary_loss_mlp": 0.01057715, + "balance_loss_clip": 1.05784404, + "balance_loss_mlp": 1.03147697, + "epoch": 0.05825943183526229, + "flos": 19135360465920.0, + "grad_norm": 2.1276337565108485, + "language_loss": 0.82286429, + "learning_rate": 3.991629295419945e-06, + "loss": 0.84532964, + "num_input_tokens_seen": 20687355, + "step": 969, + "time_per_iteration": 2.669055461883545 + }, + { + "auxiliary_loss_clip": 0.01206272, + "auxiliary_loss_mlp": 0.00782724, + "balance_loss_clip": 1.06255269, + "balance_loss_mlp": 1.00024962, + "epoch": 0.058319555087930255, + "flos": 29022465369600.0, + "grad_norm": 7.916507288074279, + "language_loss": 0.7803669, + "learning_rate": 3.991593662507167e-06, + "loss": 0.80025685, + "num_input_tokens_seen": 20705710, + "step": 970, + "time_per_iteration": 2.733030080795288 + }, + { + "auxiliary_loss_clip": 0.01181452, + "auxiliary_loss_mlp": 0.01064945, + "balance_loss_clip": 1.05691695, + "balance_loss_mlp": 1.03887415, + "epoch": 0.05837967834059823, + "flos": 18879999701760.0, + "grad_norm": 3.163102883752813, + "language_loss": 0.92229038, + "learning_rate": 3.991557954072958e-06, + "loss": 0.94475436, + "num_input_tokens_seen": 20722405, + "step": 971, + "time_per_iteration": 2.730377435684204 + }, + { + "auxiliary_loss_clip": 0.01180948, + "auxiliary_loss_mlp": 0.01062613, + "balance_loss_clip": 1.05320477, + "balance_loss_mlp": 1.03722143, + "epoch": 0.05843980159326619, + "flos": 25703062744320.0, + "grad_norm": 1.700187330091603, + "language_loss": 0.85959208, + "learning_rate": 3.991522170118673e-06, + "loss": 0.88202775, + "num_input_tokens_seen": 20741480, + "step": 972, + "time_per_iteration": 2.687185049057007 + }, + { + "auxiliary_loss_clip": 0.0116993, + "auxiliary_loss_mlp": 0.01079713, + "balance_loss_clip": 1.05714142, + "balance_loss_mlp": 1.05601454, + "epoch": 0.058499924845934165, + "flos": 25552129795200.0, + "grad_norm": 2.00599255988541, + "language_loss": 0.87503272, + "learning_rate": 3.991486310645667e-06, + "loss": 0.89752913, + "num_input_tokens_seen": 20759685, + "step": 973, + "time_per_iteration": 2.7166664600372314 + }, + { + "auxiliary_loss_clip": 0.01206524, + "auxiliary_loss_mlp": 0.00784111, + "balance_loss_clip": 1.06111121, + "balance_loss_mlp": 1.00026989, + "epoch": 0.05856004809860214, + "flos": 16436171001600.0, + "grad_norm": 1.879365930358842, + "language_loss": 0.74800295, + "learning_rate": 3.991450375655301e-06, + "loss": 0.76790935, + "num_input_tokens_seen": 20778180, + "step": 974, + "time_per_iteration": 2.713594675064087 + }, + { + "auxiliary_loss_clip": 0.01197101, + "auxiliary_loss_mlp": 0.00782207, + "balance_loss_clip": 1.059551, + "balance_loss_mlp": 1.00025892, + "epoch": 0.0586201713512701, + "flos": 39458824116480.0, + "grad_norm": 1.5923993506380014, + "language_loss": 0.76874506, + "learning_rate": 3.991414365148936e-06, + "loss": 0.78853816, + "num_input_tokens_seen": 20802705, + "step": 975, + "time_per_iteration": 7.600914716720581 + }, + { + "auxiliary_loss_clip": 0.01215491, + "auxiliary_loss_mlp": 0.01069506, + "balance_loss_clip": 1.06030774, + "balance_loss_mlp": 1.0444721, + "epoch": 0.058680294603938074, + "flos": 23365170230400.0, + "grad_norm": 3.6132976830219734, + "language_loss": 0.76748288, + "learning_rate": 3.99137827912794e-06, + "loss": 0.79033279, + "num_input_tokens_seen": 20822540, + "step": 976, + "time_per_iteration": 4.324799537658691 + }, + { + "auxiliary_loss_clip": 0.01176132, + "auxiliary_loss_mlp": 0.01077003, + "balance_loss_clip": 1.05271626, + "balance_loss_mlp": 1.04963279, + "epoch": 0.05874041785660604, + "flos": 32232017226240.0, + "grad_norm": 1.943198757110789, + "language_loss": 0.87343585, + "learning_rate": 3.991342117593679e-06, + "loss": 0.89596725, + "num_input_tokens_seen": 20844175, + "step": 977, + "time_per_iteration": 2.7742488384246826 + }, + { + "auxiliary_loss_clip": 0.01187161, + "auxiliary_loss_mlp": 0.01067914, + "balance_loss_clip": 1.06209528, + "balance_loss_mlp": 1.04231977, + "epoch": 0.05880054110927401, + "flos": 22310043194880.0, + "grad_norm": 1.718987046197629, + "language_loss": 0.7969116, + "learning_rate": 3.991305880547527e-06, + "loss": 0.81946236, + "num_input_tokens_seen": 20864730, + "step": 978, + "time_per_iteration": 2.733372926712036 + }, + { + "auxiliary_loss_clip": 0.01136264, + "auxiliary_loss_mlp": 0.01076585, + "balance_loss_clip": 1.05591321, + "balance_loss_mlp": 1.04927468, + "epoch": 0.05886066436194198, + "flos": 27380450016000.0, + "grad_norm": 1.8692877257975375, + "language_loss": 0.80665666, + "learning_rate": 3.991269567990855e-06, + "loss": 0.82878518, + "num_input_tokens_seen": 20885200, + "step": 979, + "time_per_iteration": 3.2624220848083496 + }, + { + "auxiliary_loss_clip": 0.01074686, + "auxiliary_loss_mlp": 0.01029701, + "balance_loss_clip": 1.02640033, + "balance_loss_mlp": 1.02495658, + "epoch": 0.05892078761460995, + "flos": 59584493525760.0, + "grad_norm": 0.9436493040005753, + "language_loss": 0.59004962, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.6110934, + "num_input_tokens_seen": 20940325, + "step": 980, + "time_per_iteration": 3.4688587188720703 + }, + { + "auxiliary_loss_clip": 0.01211665, + "auxiliary_loss_mlp": 0.01078603, + "balance_loss_clip": 1.06178868, + "balance_loss_mlp": 1.05242431, + "epoch": 0.05898091086727792, + "flos": 15414081500160.0, + "grad_norm": 2.2770545408130514, + "language_loss": 0.86436182, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.88726455, + "num_input_tokens_seen": 20958220, + "step": 981, + "time_per_iteration": 2.5824644565582275 + }, + { + "auxiliary_loss_clip": 0.01190085, + "auxiliary_loss_mlp": 0.0106921, + "balance_loss_clip": 1.05943286, + "balance_loss_mlp": 1.04629803, + "epoch": 0.059041034119945886, + "flos": 23655328295040.0, + "grad_norm": 2.1333982175691855, + "language_loss": 0.79293346, + "learning_rate": 3.991160177271513e-06, + "loss": 0.81552643, + "num_input_tokens_seen": 20978920, + "step": 982, + "time_per_iteration": 2.68428897857666 + }, + { + "auxiliary_loss_clip": 0.01192274, + "auxiliary_loss_mlp": 0.01068234, + "balance_loss_clip": 1.05926657, + "balance_loss_mlp": 1.04356933, + "epoch": 0.05910115737261386, + "flos": 24754087376640.0, + "grad_norm": 2.319627739094249, + "language_loss": 0.84413779, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.86674285, + "num_input_tokens_seen": 20999490, + "step": 983, + "time_per_iteration": 2.7006261348724365 + }, + { + "auxiliary_loss_clip": 0.0120015, + "auxiliary_loss_mlp": 0.01072669, + "balance_loss_clip": 1.05969584, + "balance_loss_mlp": 1.04799283, + "epoch": 0.05916128062528183, + "flos": 11728749070080.0, + "grad_norm": 1.8014395118859294, + "language_loss": 0.84510243, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.86783063, + "num_input_tokens_seen": 21017865, + "step": 984, + "time_per_iteration": 2.640246868133545 + }, + { + "auxiliary_loss_clip": 0.01188594, + "auxiliary_loss_mlp": 0.01055296, + "balance_loss_clip": 1.05650342, + "balance_loss_mlp": 1.03171611, + "epoch": 0.059221403877949795, + "flos": 21902995296000.0, + "grad_norm": 2.473231587287368, + "language_loss": 0.77611595, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.7985549, + "num_input_tokens_seen": 21035900, + "step": 985, + "time_per_iteration": 2.626371383666992 + }, + { + "auxiliary_loss_clip": 0.01150113, + "auxiliary_loss_mlp": 0.01060814, + "balance_loss_clip": 1.05341148, + "balance_loss_mlp": 1.03542209, + "epoch": 0.05928152713061777, + "flos": 20514580940160.0, + "grad_norm": 1.9082382068459252, + "language_loss": 0.90593231, + "learning_rate": 3.991013265915661e-06, + "loss": 0.92804158, + "num_input_tokens_seen": 21053235, + "step": 986, + "time_per_iteration": 2.7834935188293457 + }, + { + "auxiliary_loss_clip": 0.01200704, + "auxiliary_loss_mlp": 0.01061312, + "balance_loss_clip": 1.05555892, + "balance_loss_mlp": 1.03425193, + "epoch": 0.05934165038328574, + "flos": 24495135252480.0, + "grad_norm": 2.216017383423336, + "language_loss": 0.75688565, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.77950585, + "num_input_tokens_seen": 21073090, + "step": 987, + "time_per_iteration": 2.6669981479644775 + }, + { + "auxiliary_loss_clip": 0.01203558, + "auxiliary_loss_mlp": 0.01057756, + "balance_loss_clip": 1.06134868, + "balance_loss_mlp": 1.03331852, + "epoch": 0.059401773635953704, + "flos": 38728041914880.0, + "grad_norm": 2.2869993581633827, + "language_loss": 0.71867943, + "learning_rate": 3.990939357235621e-06, + "loss": 0.7412926, + "num_input_tokens_seen": 21094895, + "step": 988, + "time_per_iteration": 2.805851697921753 + }, + { + "auxiliary_loss_clip": 0.0105006, + "auxiliary_loss_mlp": 0.0101134, + "balance_loss_clip": 1.02230322, + "balance_loss_mlp": 1.00688171, + "epoch": 0.059461896888621676, + "flos": 58023565125120.0, + "grad_norm": 0.9416454944601763, + "language_loss": 0.7124939, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73310792, + "num_input_tokens_seen": 21147555, + "step": 989, + "time_per_iteration": 3.100306749343872 + }, + { + "auxiliary_loss_clip": 0.0117797, + "auxiliary_loss_mlp": 0.01072264, + "balance_loss_clip": 1.05793095, + "balance_loss_mlp": 1.04389191, + "epoch": 0.05952202014128964, + "flos": 22127760650880.0, + "grad_norm": 2.0167260155417113, + "language_loss": 0.78245646, + "learning_rate": 3.990865146569105e-06, + "loss": 0.80495882, + "num_input_tokens_seen": 21167845, + "step": 990, + "time_per_iteration": 2.8133904933929443 + }, + { + "auxiliary_loss_clip": 0.01198295, + "auxiliary_loss_mlp": 0.01053485, + "balance_loss_clip": 1.06166339, + "balance_loss_mlp": 1.02761686, + "epoch": 0.059582143393957614, + "flos": 20445776438400.0, + "grad_norm": 2.2411623387553727, + "language_loss": 0.86522102, + "learning_rate": 3.990827927994434e-06, + "loss": 0.88773882, + "num_input_tokens_seen": 21185085, + "step": 991, + "time_per_iteration": 2.6964831352233887 + }, + { + "auxiliary_loss_clip": 0.0121783, + "auxiliary_loss_mlp": 0.01064707, + "balance_loss_clip": 1.0613625, + "balance_loss_mlp": 1.03943431, + "epoch": 0.059642266646625586, + "flos": 20594877793920.0, + "grad_norm": 1.8566945591898132, + "language_loss": 0.76738375, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.79020917, + "num_input_tokens_seen": 21204230, + "step": 992, + "time_per_iteration": 2.646942377090454 + }, + { + "auxiliary_loss_clip": 0.01146457, + "auxiliary_loss_mlp": 0.01062309, + "balance_loss_clip": 1.05571234, + "balance_loss_mlp": 1.03834832, + "epoch": 0.05970238989929355, + "flos": 19352655792000.0, + "grad_norm": 2.3469050968731233, + "language_loss": 0.75117075, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.77325845, + "num_input_tokens_seen": 21222655, + "step": 993, + "time_per_iteration": 2.7642974853515625 + }, + { + "auxiliary_loss_clip": 0.01157785, + "auxiliary_loss_mlp": 0.01075532, + "balance_loss_clip": 1.05397618, + "balance_loss_mlp": 1.04774487, + "epoch": 0.05976251315196152, + "flos": 30264040926720.0, + "grad_norm": 2.725207959052886, + "language_loss": 0.79177904, + "learning_rate": 3.990715819321712e-06, + "loss": 0.81411219, + "num_input_tokens_seen": 21242310, + "step": 994, + "time_per_iteration": 2.8414714336395264 + }, + { + "auxiliary_loss_clip": 0.01214724, + "auxiliary_loss_mlp": 0.01079016, + "balance_loss_clip": 1.06264019, + "balance_loss_mlp": 1.05361295, + "epoch": 0.05982263640462949, + "flos": 23185150243200.0, + "grad_norm": 2.8097993094234983, + "language_loss": 0.79917169, + "learning_rate": 3.99067829878596e-06, + "loss": 0.82210916, + "num_input_tokens_seen": 21261410, + "step": 995, + "time_per_iteration": 2.6524364948272705 + }, + { + "auxiliary_loss_clip": 0.0116696, + "auxiliary_loss_mlp": 0.01068218, + "balance_loss_clip": 1.05704355, + "balance_loss_mlp": 1.04208767, + "epoch": 0.05988275965729746, + "flos": 27850879463040.0, + "grad_norm": 1.902030256537741, + "language_loss": 0.87013257, + "learning_rate": 3.990640702763487e-06, + "loss": 0.89248431, + "num_input_tokens_seen": 21280080, + "step": 996, + "time_per_iteration": 2.7431676387786865 + }, + { + "auxiliary_loss_clip": 0.01177854, + "auxiliary_loss_mlp": 0.01081123, + "balance_loss_clip": 1.05684328, + "balance_loss_mlp": 1.05055761, + "epoch": 0.05994288290996543, + "flos": 24680003575680.0, + "grad_norm": 2.971039758986745, + "language_loss": 0.87273014, + "learning_rate": 3.990603031255718e-06, + "loss": 0.89531994, + "num_input_tokens_seen": 21296765, + "step": 997, + "time_per_iteration": 2.748448371887207 + }, + { + "auxiliary_loss_clip": 0.01069915, + "auxiliary_loss_mlp": 0.01014417, + "balance_loss_clip": 1.02303648, + "balance_loss_mlp": 1.00972033, + "epoch": 0.0600030061626334, + "flos": 69929568835200.0, + "grad_norm": 1.0091092068179202, + "language_loss": 0.75381488, + "learning_rate": 3.990565284264083e-06, + "loss": 0.7746582, + "num_input_tokens_seen": 21363345, + "step": 998, + "time_per_iteration": 3.2950518131256104 + }, + { + "auxiliary_loss_clip": 0.01170062, + "auxiliary_loss_mlp": 0.01065521, + "balance_loss_clip": 1.05893683, + "balance_loss_mlp": 1.03893745, + "epoch": 0.06006312941530137, + "flos": 26540140268160.0, + "grad_norm": 1.8197691299520968, + "language_loss": 0.76053095, + "learning_rate": 3.990527461790013e-06, + "loss": 0.7828868, + "num_input_tokens_seen": 21385290, + "step": 999, + "time_per_iteration": 2.733802556991577 + }, + { + "auxiliary_loss_clip": 0.01197834, + "auxiliary_loss_mlp": 0.01059542, + "balance_loss_clip": 1.05646563, + "balance_loss_mlp": 1.03339899, + "epoch": 0.060123252667969335, + "flos": 27344000689920.0, + "grad_norm": 2.5948629341774874, + "language_loss": 0.82992184, + "learning_rate": 3.990489563834943e-06, + "loss": 0.85249555, + "num_input_tokens_seen": 21407625, + "step": 1000, + "time_per_iteration": 2.710981845855713 + }, + { + "auxiliary_loss_clip": 0.0118571, + "auxiliary_loss_mlp": 0.01062188, + "balance_loss_clip": 1.05856955, + "balance_loss_mlp": 1.03480577, + "epoch": 0.06018337592063731, + "flos": 27016710940800.0, + "grad_norm": 2.111409807940472, + "language_loss": 0.85820085, + "learning_rate": 3.990451590400309e-06, + "loss": 0.88067985, + "num_input_tokens_seen": 21426835, + "step": 1001, + "time_per_iteration": 2.73445463180542 + }, + { + "auxiliary_loss_clip": 0.01191917, + "auxiliary_loss_mlp": 0.01062059, + "balance_loss_clip": 1.06167853, + "balance_loss_mlp": 1.03719211, + "epoch": 0.06024349917330528, + "flos": 25592960580480.0, + "grad_norm": 1.8359711451165206, + "language_loss": 0.74128318, + "learning_rate": 3.990413541487551e-06, + "loss": 0.76382297, + "num_input_tokens_seen": 21444920, + "step": 1002, + "time_per_iteration": 2.8861100673675537 + }, + { + "auxiliary_loss_clip": 0.01214316, + "auxiliary_loss_mlp": 0.01062589, + "balance_loss_clip": 1.06316125, + "balance_loss_mlp": 1.03737664, + "epoch": 0.060303622425973244, + "flos": 26133271937280.0, + "grad_norm": 2.1835040648243997, + "language_loss": 0.75520515, + "learning_rate": 3.990375417098112e-06, + "loss": 0.77797419, + "num_input_tokens_seen": 21463555, + "step": 1003, + "time_per_iteration": 2.632889747619629 + }, + { + "auxiliary_loss_clip": 0.01187709, + "auxiliary_loss_mlp": 0.01064806, + "balance_loss_clip": 1.05934548, + "balance_loss_mlp": 1.03928304, + "epoch": 0.060363745678641216, + "flos": 20377187418240.0, + "grad_norm": 2.3150099602993155, + "language_loss": 0.70349169, + "learning_rate": 3.990337217233437e-06, + "loss": 0.72601682, + "num_input_tokens_seen": 21481990, + "step": 1004, + "time_per_iteration": 2.6947617530822754 + }, + { + "auxiliary_loss_clip": 0.01212815, + "auxiliary_loss_mlp": 0.01077454, + "balance_loss_clip": 1.06629324, + "balance_loss_mlp": 1.05168116, + "epoch": 0.06042386893130918, + "flos": 17749172753280.0, + "grad_norm": 2.276868338025253, + "language_loss": 0.83444524, + "learning_rate": 3.990298941894976e-06, + "loss": 0.85734791, + "num_input_tokens_seen": 21500385, + "step": 1005, + "time_per_iteration": 2.581683397293091 + }, + { + "auxiliary_loss_clip": 0.01077621, + "auxiliary_loss_mlp": 0.01004707, + "balance_loss_clip": 1.02541244, + "balance_loss_mlp": 1.00029612, + "epoch": 0.06048399218397715, + "flos": 68538496872960.0, + "grad_norm": 0.903813421793838, + "language_loss": 0.59018111, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.61100447, + "num_input_tokens_seen": 21561040, + "step": 1006, + "time_per_iteration": 3.222104787826538 + }, + { + "auxiliary_loss_clip": 0.01183553, + "auxiliary_loss_mlp": 0.01059038, + "balance_loss_clip": 1.05334234, + "balance_loss_mlp": 1.03284812, + "epoch": 0.060544115436645125, + "flos": 23258515772160.0, + "grad_norm": 2.1584333764290853, + "language_loss": 0.74229443, + "learning_rate": 3.990222164802503e-06, + "loss": 0.76472032, + "num_input_tokens_seen": 21580655, + "step": 1007, + "time_per_iteration": 2.7130653858184814 + }, + { + "auxiliary_loss_clip": 0.0119408, + "auxiliary_loss_mlp": 0.01060431, + "balance_loss_clip": 1.06008601, + "balance_loss_mlp": 1.03493261, + "epoch": 0.06060423868931309, + "flos": 23878441624320.0, + "grad_norm": 1.7956876298455304, + "language_loss": 0.8081426, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.8306877, + "num_input_tokens_seen": 21599650, + "step": 1008, + "time_per_iteration": 2.7151994705200195 + }, + { + "auxiliary_loss_clip": 0.01175291, + "auxiliary_loss_mlp": 0.01056357, + "balance_loss_clip": 1.05982351, + "balance_loss_mlp": 1.0305717, + "epoch": 0.06066436194198106, + "flos": 18728061171840.0, + "grad_norm": 2.3069524306559837, + "language_loss": 0.78198558, + "learning_rate": 3.990145085832335e-06, + "loss": 0.8043021, + "num_input_tokens_seen": 21617550, + "step": 1009, + "time_per_iteration": 2.7313599586486816 + }, + { + "auxiliary_loss_clip": 0.01194621, + "auxiliary_loss_mlp": 0.01061233, + "balance_loss_clip": 1.06150866, + "balance_loss_mlp": 1.03726041, + "epoch": 0.06072448519464903, + "flos": 24640465680000.0, + "grad_norm": 1.7452257697216769, + "language_loss": 0.93148172, + "learning_rate": 3.990106433146769e-06, + "loss": 0.95404023, + "num_input_tokens_seen": 21635865, + "step": 1010, + "time_per_iteration": 2.7233662605285645 + }, + { + "auxiliary_loss_clip": 0.01148246, + "auxiliary_loss_mlp": 0.00784144, + "balance_loss_clip": 1.05304599, + "balance_loss_mlp": 1.00029802, + "epoch": 0.060784608447317, + "flos": 17378825575680.0, + "grad_norm": 2.9999367504779517, + "language_loss": 0.72022474, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.73954868, + "num_input_tokens_seen": 21653945, + "step": 1011, + "time_per_iteration": 2.804858446121216 + }, + { + "auxiliary_loss_clip": 0.01194231, + "auxiliary_loss_mlp": 0.01077344, + "balance_loss_clip": 1.05968046, + "balance_loss_mlp": 1.04868615, + "epoch": 0.06084473169998497, + "flos": 23692208584320.0, + "grad_norm": 1.9573218215833301, + "language_loss": 0.87526691, + "learning_rate": 3.990028901381999e-06, + "loss": 0.89798272, + "num_input_tokens_seen": 21671230, + "step": 1012, + "time_per_iteration": 2.6466245651245117 + }, + { + "auxiliary_loss_clip": 0.01184459, + "auxiliary_loss_mlp": 0.01064264, + "balance_loss_clip": 1.05652905, + "balance_loss_mlp": 1.03838325, + "epoch": 0.06090485495265294, + "flos": 23546339452800.0, + "grad_norm": 1.9062230938156723, + "language_loss": 0.76947677, + "learning_rate": 3.989990022305734e-06, + "loss": 0.79196405, + "num_input_tokens_seen": 21691155, + "step": 1013, + "time_per_iteration": 4.297588586807251 + }, + { + "auxiliary_loss_clip": 0.01207383, + "auxiliary_loss_mlp": 0.00783488, + "balance_loss_clip": 1.06573224, + "balance_loss_mlp": 1.00034499, + "epoch": 0.06096497820532091, + "flos": 20339301548160.0, + "grad_norm": 2.441711811862119, + "language_loss": 0.86151874, + "learning_rate": 3.98995106776885e-06, + "loss": 0.88142747, + "num_input_tokens_seen": 21707405, + "step": 1014, + "time_per_iteration": 4.301488637924194 + }, + { + "auxiliary_loss_clip": 0.0121503, + "auxiliary_loss_mlp": 0.01072817, + "balance_loss_clip": 1.06605387, + "balance_loss_mlp": 1.04508948, + "epoch": 0.061025101457988874, + "flos": 26939035779840.0, + "grad_norm": 2.4309754772209184, + "language_loss": 0.73197287, + "learning_rate": 3.98991203777282e-06, + "loss": 0.75485134, + "num_input_tokens_seen": 21728090, + "step": 1015, + "time_per_iteration": 4.384514808654785 + }, + { + "auxiliary_loss_clip": 0.01187374, + "auxiliary_loss_mlp": 0.01068593, + "balance_loss_clip": 1.06084347, + "balance_loss_mlp": 1.04228365, + "epoch": 0.061085224710656846, + "flos": 25375054723200.0, + "grad_norm": 1.5896529502124837, + "language_loss": 0.79109907, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.81365877, + "num_input_tokens_seen": 21747950, + "step": 1016, + "time_per_iteration": 4.3249351978302 + }, + { + "auxiliary_loss_clip": 0.01173015, + "auxiliary_loss_mlp": 0.0105746, + "balance_loss_clip": 1.06036103, + "balance_loss_mlp": 1.03249741, + "epoch": 0.06114534796332482, + "flos": 24824759385600.0, + "grad_norm": 1.6772682648410928, + "language_loss": 0.76014191, + "learning_rate": 3.989833751409254e-06, + "loss": 0.78244662, + "num_input_tokens_seen": 21767900, + "step": 1017, + "time_per_iteration": 2.7983243465423584 + }, + { + "auxiliary_loss_clip": 0.01188817, + "auxiliary_loss_mlp": 0.01074603, + "balance_loss_clip": 1.06584609, + "balance_loss_mlp": 1.0483532, + "epoch": 0.061205471215992784, + "flos": 20631434860800.0, + "grad_norm": 2.001716657382839, + "language_loss": 0.85798436, + "learning_rate": 3.989794495044685e-06, + "loss": 0.88061857, + "num_input_tokens_seen": 21787375, + "step": 1018, + "time_per_iteration": 2.702399253845215 + }, + { + "auxiliary_loss_clip": 0.01174344, + "auxiliary_loss_mlp": 0.01069438, + "balance_loss_clip": 1.06325769, + "balance_loss_mlp": 1.04231787, + "epoch": 0.061265594468660756, + "flos": 16508351381760.0, + "grad_norm": 2.9546929267460813, + "language_loss": 0.76985347, + "learning_rate": 3.989755163226909e-06, + "loss": 0.79229128, + "num_input_tokens_seen": 21806275, + "step": 1019, + "time_per_iteration": 2.780104875564575 + }, + { + "auxiliary_loss_clip": 0.01160861, + "auxiliary_loss_mlp": 0.0106141, + "balance_loss_clip": 1.05355084, + "balance_loss_mlp": 1.03511262, + "epoch": 0.06132571772132872, + "flos": 26246211275520.0, + "grad_norm": 2.1848809329980288, + "language_loss": 0.84122044, + "learning_rate": 3.989715755957418e-06, + "loss": 0.86344314, + "num_input_tokens_seen": 21826430, + "step": 1020, + "time_per_iteration": 2.785963535308838 + }, + { + "auxiliary_loss_clip": 0.01198473, + "auxiliary_loss_mlp": 0.01063342, + "balance_loss_clip": 1.06365371, + "balance_loss_mlp": 1.03604269, + "epoch": 0.06138584097399669, + "flos": 37414788768000.0, + "grad_norm": 1.933053672026977, + "language_loss": 0.79114467, + "learning_rate": 3.989676273237705e-06, + "loss": 0.81376278, + "num_input_tokens_seen": 21847800, + "step": 1021, + "time_per_iteration": 2.7968955039978027 + }, + { + "auxiliary_loss_clip": 0.01189659, + "auxiliary_loss_mlp": 0.01064044, + "balance_loss_clip": 1.06159925, + "balance_loss_mlp": 1.04114437, + "epoch": 0.061445964226664665, + "flos": 17420661941760.0, + "grad_norm": 2.089525934673828, + "language_loss": 0.87768298, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.90022004, + "num_input_tokens_seen": 21863385, + "step": 1022, + "time_per_iteration": 2.70906138420105 + }, + { + "auxiliary_loss_clip": 0.01198737, + "auxiliary_loss_mlp": 0.0106635, + "balance_loss_clip": 1.06627858, + "balance_loss_mlp": 1.04079151, + "epoch": 0.06150608747933263, + "flos": 22600021691520.0, + "grad_norm": 1.7284486983379121, + "language_loss": 0.82892007, + "learning_rate": 3.989597081453611e-06, + "loss": 0.85157096, + "num_input_tokens_seen": 21881880, + "step": 1023, + "time_per_iteration": 2.71539568901062 + }, + { + "auxiliary_loss_clip": 0.01100664, + "auxiliary_loss_mlp": 0.01010751, + "balance_loss_clip": 1.03727341, + "balance_loss_mlp": 1.00614953, + "epoch": 0.0615662107320006, + "flos": 56741482005120.0, + "grad_norm": 0.8894752517384502, + "language_loss": 0.6505782, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67169237, + "num_input_tokens_seen": 21940550, + "step": 1024, + "time_per_iteration": 3.175217628479004 + }, + { + "auxiliary_loss_clip": 0.01167458, + "auxiliary_loss_mlp": 0.01073669, + "balance_loss_clip": 1.05906856, + "balance_loss_mlp": 1.04553604, + "epoch": 0.06162633398466857, + "flos": 22564793427840.0, + "grad_norm": 2.320347485789288, + "language_loss": 0.88069236, + "learning_rate": 3.989517587886636e-06, + "loss": 0.90310359, + "num_input_tokens_seen": 21958390, + "step": 1025, + "time_per_iteration": 2.690725564956665 + }, + { + "auxiliary_loss_clip": 0.01197064, + "auxiliary_loss_mlp": 0.01066504, + "balance_loss_clip": 1.06452, + "balance_loss_mlp": 1.04173219, + "epoch": 0.06168645723733654, + "flos": 25593104234880.0, + "grad_norm": 2.5217294712155414, + "language_loss": 0.84536898, + "learning_rate": 3.989477727938335e-06, + "loss": 0.86800468, + "num_input_tokens_seen": 21978625, + "step": 1026, + "time_per_iteration": 2.7420806884765625 + }, + { + "auxiliary_loss_clip": 0.01160797, + "auxiliary_loss_mlp": 0.0107525, + "balance_loss_clip": 1.05669701, + "balance_loss_mlp": 1.04934609, + "epoch": 0.06174658049000451, + "flos": 15997917162240.0, + "grad_norm": 2.354014397396182, + "language_loss": 0.8228389, + "learning_rate": 3.989437792548839e-06, + "loss": 0.84519935, + "num_input_tokens_seen": 21996035, + "step": 1027, + "time_per_iteration": 2.6683874130249023 + }, + { + "auxiliary_loss_clip": 0.01160199, + "auxiliary_loss_mlp": 0.01067253, + "balance_loss_clip": 1.06181073, + "balance_loss_mlp": 1.04232645, + "epoch": 0.06180670374267248, + "flos": 11285970117120.0, + "grad_norm": 4.43492107605727, + "language_loss": 0.83898664, + "learning_rate": 3.989397781719663e-06, + "loss": 0.86126107, + "num_input_tokens_seen": 22011625, + "step": 1028, + "time_per_iteration": 2.705387592315674 + }, + { + "auxiliary_loss_clip": 0.0106503, + "auxiliary_loss_mlp": 0.01008074, + "balance_loss_clip": 1.02410197, + "balance_loss_mlp": 1.00347257, + "epoch": 0.06186682699534045, + "flos": 65130142216320.0, + "grad_norm": 0.9383255649985517, + "language_loss": 0.604738, + "learning_rate": 3.989357695452323e-06, + "loss": 0.62546903, + "num_input_tokens_seen": 22066035, + "step": 1029, + "time_per_iteration": 3.0268616676330566 + }, + { + "auxiliary_loss_clip": 0.01176182, + "auxiliary_loss_mlp": 0.01074173, + "balance_loss_clip": 1.05641246, + "balance_loss_mlp": 1.04737473, + "epoch": 0.061926950248008414, + "flos": 21105742976640.0, + "grad_norm": 4.246634693563946, + "language_loss": 0.82589179, + "learning_rate": 3.98931753374834e-06, + "loss": 0.84839535, + "num_input_tokens_seen": 22085015, + "step": 1030, + "time_per_iteration": 2.7035892009735107 + }, + { + "auxiliary_loss_clip": 0.0122298, + "auxiliary_loss_mlp": 0.01077745, + "balance_loss_clip": 1.06850278, + "balance_loss_mlp": 1.05185235, + "epoch": 0.061987073500676386, + "flos": 17748454481280.0, + "grad_norm": 2.585240230669548, + "language_loss": 0.79576576, + "learning_rate": 3.989277296609237e-06, + "loss": 0.81877303, + "num_input_tokens_seen": 22102775, + "step": 1031, + "time_per_iteration": 2.60622501373291 + }, + { + "auxiliary_loss_clip": 0.01188957, + "auxiliary_loss_mlp": 0.01076754, + "balance_loss_clip": 1.06396544, + "balance_loss_mlp": 1.04982424, + "epoch": 0.06204719675334436, + "flos": 21836237869440.0, + "grad_norm": 1.8815476991595563, + "language_loss": 0.77384412, + "learning_rate": 3.98923698403654e-06, + "loss": 0.79650116, + "num_input_tokens_seen": 22121680, + "step": 1032, + "time_per_iteration": 2.6753971576690674 + }, + { + "auxiliary_loss_clip": 0.01198757, + "auxiliary_loss_mlp": 0.01074736, + "balance_loss_clip": 1.05916619, + "balance_loss_mlp": 1.04848623, + "epoch": 0.06210732000601232, + "flos": 19353697286400.0, + "grad_norm": 3.147941025479245, + "language_loss": 0.89323574, + "learning_rate": 3.989196596031776e-06, + "loss": 0.91597068, + "num_input_tokens_seen": 22138155, + "step": 1033, + "time_per_iteration": 2.7313079833984375 + }, + { + "auxiliary_loss_clip": 0.01209161, + "auxiliary_loss_mlp": 0.01066082, + "balance_loss_clip": 1.06214237, + "balance_loss_mlp": 1.04119134, + "epoch": 0.062167443258680295, + "flos": 24749382695040.0, + "grad_norm": 2.1035343880884145, + "language_loss": 0.8455385, + "learning_rate": 3.989156132596479e-06, + "loss": 0.8682909, + "num_input_tokens_seen": 22157420, + "step": 1034, + "time_per_iteration": 2.7541439533233643 + }, + { + "auxiliary_loss_clip": 0.01180042, + "auxiliary_loss_mlp": 0.01057312, + "balance_loss_clip": 1.05896068, + "balance_loss_mlp": 1.03155136, + "epoch": 0.06222756651134827, + "flos": 34458478773120.0, + "grad_norm": 1.8983498110529735, + "language_loss": 0.8082794, + "learning_rate": 3.989115593732182e-06, + "loss": 0.83065289, + "num_input_tokens_seen": 22178620, + "step": 1035, + "time_per_iteration": 2.7965424060821533 + }, + { + "auxiliary_loss_clip": 0.01158806, + "auxiliary_loss_mlp": 0.01072478, + "balance_loss_clip": 1.05936599, + "balance_loss_mlp": 1.04432034, + "epoch": 0.06228768976401623, + "flos": 25666469763840.0, + "grad_norm": 2.145216314952277, + "language_loss": 0.78365827, + "learning_rate": 3.989074979440421e-06, + "loss": 0.80597103, + "num_input_tokens_seen": 22197125, + "step": 1036, + "time_per_iteration": 2.7858450412750244 + }, + { + "auxiliary_loss_clip": 0.01192097, + "auxiliary_loss_mlp": 0.01071382, + "balance_loss_clip": 1.05977845, + "balance_loss_mlp": 1.04663444, + "epoch": 0.062347813016684205, + "flos": 25295619795840.0, + "grad_norm": 1.9535870339716077, + "language_loss": 0.86544567, + "learning_rate": 3.989034289722739e-06, + "loss": 0.88808048, + "num_input_tokens_seen": 22217575, + "step": 1037, + "time_per_iteration": 2.685373306274414 + }, + { + "auxiliary_loss_clip": 0.01197778, + "auxiliary_loss_mlp": 0.01057095, + "balance_loss_clip": 1.06127763, + "balance_loss_mlp": 1.02966499, + "epoch": 0.06240793626935217, + "flos": 26907039740160.0, + "grad_norm": 2.697396725345887, + "language_loss": 0.8067717, + "learning_rate": 3.988993524580676e-06, + "loss": 0.82932043, + "num_input_tokens_seen": 22236840, + "step": 1038, + "time_per_iteration": 2.7305831909179688 + }, + { + "auxiliary_loss_clip": 0.01145896, + "auxiliary_loss_mlp": 0.01072721, + "balance_loss_clip": 1.05226004, + "balance_loss_mlp": 1.04330015, + "epoch": 0.06246805952202014, + "flos": 21615782146560.0, + "grad_norm": 1.8888526922505675, + "language_loss": 0.85465872, + "learning_rate": 3.98895268401578e-06, + "loss": 0.87684488, + "num_input_tokens_seen": 22256465, + "step": 1039, + "time_per_iteration": 2.7351109981536865 + }, + { + "auxiliary_loss_clip": 0.01188545, + "auxiliary_loss_mlp": 0.01070323, + "balance_loss_clip": 1.05834138, + "balance_loss_mlp": 1.04472923, + "epoch": 0.0625281827746881, + "flos": 19311896833920.0, + "grad_norm": 2.217895985816133, + "language_loss": 0.81172895, + "learning_rate": 3.9889117680296e-06, + "loss": 0.83431756, + "num_input_tokens_seen": 22274025, + "step": 1040, + "time_per_iteration": 2.6532907485961914 + }, + { + "auxiliary_loss_clip": 0.0121654, + "auxiliary_loss_mlp": 0.0106312, + "balance_loss_clip": 1.06718016, + "balance_loss_mlp": 1.03808582, + "epoch": 0.06258830602735609, + "flos": 27745769289600.0, + "grad_norm": 2.1960038080149817, + "language_loss": 0.69304991, + "learning_rate": 3.988870776623685e-06, + "loss": 0.71584648, + "num_input_tokens_seen": 22292245, + "step": 1041, + "time_per_iteration": 2.6445486545562744 + }, + { + "auxiliary_loss_clip": 0.01214659, + "auxiliary_loss_mlp": 0.01057975, + "balance_loss_clip": 1.06247008, + "balance_loss_mlp": 1.03182077, + "epoch": 0.06264842928002405, + "flos": 23222605150080.0, + "grad_norm": 2.7326158002445, + "language_loss": 0.81187552, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.83460188, + "num_input_tokens_seen": 22311455, + "step": 1042, + "time_per_iteration": 2.6111559867858887 + }, + { + "auxiliary_loss_clip": 0.01211653, + "auxiliary_loss_mlp": 0.01052676, + "balance_loss_clip": 1.06253886, + "balance_loss_mlp": 1.02871442, + "epoch": 0.06270855253269202, + "flos": 38399495189760.0, + "grad_norm": 1.7165873820424848, + "language_loss": 0.76349056, + "learning_rate": 3.988788567558874e-06, + "loss": 0.78613389, + "num_input_tokens_seen": 22333750, + "step": 1043, + "time_per_iteration": 2.761768341064453 + }, + { + "auxiliary_loss_clip": 0.0118944, + "auxiliary_loss_mlp": 0.01063189, + "balance_loss_clip": 1.06111181, + "balance_loss_mlp": 1.03912091, + "epoch": 0.06276867578535998, + "flos": 22453542028800.0, + "grad_norm": 8.34017761542712, + "language_loss": 0.92031956, + "learning_rate": 3.988747349903097e-06, + "loss": 0.94284582, + "num_input_tokens_seen": 22351940, + "step": 1044, + "time_per_iteration": 2.636179208755493 + }, + { + "auxiliary_loss_clip": 0.01192566, + "auxiliary_loss_mlp": 0.01070128, + "balance_loss_clip": 1.05862689, + "balance_loss_mlp": 1.0456785, + "epoch": 0.06282879903802796, + "flos": 22930435923840.0, + "grad_norm": 2.3486674311430944, + "language_loss": 0.85913992, + "learning_rate": 3.988706056833821e-06, + "loss": 0.88176692, + "num_input_tokens_seen": 22372085, + "step": 1045, + "time_per_iteration": 2.7749502658843994 + }, + { + "auxiliary_loss_clip": 0.01179197, + "auxiliary_loss_mlp": 0.01065176, + "balance_loss_clip": 1.05804443, + "balance_loss_mlp": 1.04053521, + "epoch": 0.06288892229069593, + "flos": 34819237019520.0, + "grad_norm": 1.9846122850853416, + "language_loss": 0.7796576, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.80210131, + "num_input_tokens_seen": 22392020, + "step": 1046, + "time_per_iteration": 2.803135871887207 + }, + { + "auxiliary_loss_clip": 0.01197344, + "auxiliary_loss_mlp": 0.01069269, + "balance_loss_clip": 1.06361508, + "balance_loss_mlp": 1.04558206, + "epoch": 0.06294904554336389, + "flos": 19427134642560.0, + "grad_norm": 2.174325060947129, + "language_loss": 0.77326387, + "learning_rate": 3.988623244461039e-06, + "loss": 0.79592997, + "num_input_tokens_seen": 22411180, + "step": 1047, + "time_per_iteration": 2.647446632385254 + }, + { + "auxiliary_loss_clip": 0.01200907, + "auxiliary_loss_mlp": 0.0105799, + "balance_loss_clip": 1.06238222, + "balance_loss_mlp": 1.03314662, + "epoch": 0.06300916879603187, + "flos": 40661867358720.0, + "grad_norm": 2.4899372640825046, + "language_loss": 0.77190751, + "learning_rate": 3.988581725160672e-06, + "loss": 0.79449654, + "num_input_tokens_seen": 22435105, + "step": 1048, + "time_per_iteration": 2.8167293071746826 + }, + { + "auxiliary_loss_clip": 0.0118184, + "auxiliary_loss_mlp": 0.01064361, + "balance_loss_clip": 1.0613215, + "balance_loss_mlp": 1.03914821, + "epoch": 0.06306929204869983, + "flos": 23804142341760.0, + "grad_norm": 4.606540291271834, + "language_loss": 0.77258086, + "learning_rate": 3.988540130453087e-06, + "loss": 0.79504287, + "num_input_tokens_seen": 22452710, + "step": 1049, + "time_per_iteration": 2.6908538341522217 + }, + { + "auxiliary_loss_clip": 0.01194538, + "auxiliary_loss_mlp": 0.0105682, + "balance_loss_clip": 1.06043661, + "balance_loss_mlp": 1.03290701, + "epoch": 0.0631294153013678, + "flos": 18915802583040.0, + "grad_norm": 2.515998307474139, + "language_loss": 0.83302009, + "learning_rate": 3.988498460339862e-06, + "loss": 0.85553372, + "num_input_tokens_seen": 22470175, + "step": 1050, + "time_per_iteration": 2.62186861038208 + }, + { + "auxiliary_loss_clip": 0.01210654, + "auxiliary_loss_mlp": 0.01062894, + "balance_loss_clip": 1.06468701, + "balance_loss_mlp": 1.04008913, + "epoch": 0.06318953855403578, + "flos": 24280174310400.0, + "grad_norm": 5.5478202090132065, + "language_loss": 0.76564771, + "learning_rate": 3.988456714822575e-06, + "loss": 0.78838319, + "num_input_tokens_seen": 22490020, + "step": 1051, + "time_per_iteration": 2.732269525527954 + }, + { + "auxiliary_loss_clip": 0.01188416, + "auxiliary_loss_mlp": 0.01069443, + "balance_loss_clip": 1.06340146, + "balance_loss_mlp": 1.04492211, + "epoch": 0.06324966180670374, + "flos": 22528918719360.0, + "grad_norm": 1.9993900469270787, + "language_loss": 0.80410004, + "learning_rate": 3.98841489390281e-06, + "loss": 0.82667863, + "num_input_tokens_seen": 22509685, + "step": 1052, + "time_per_iteration": 2.7683873176574707 + }, + { + "auxiliary_loss_clip": 0.01211333, + "auxiliary_loss_mlp": 0.01058255, + "balance_loss_clip": 1.06324601, + "balance_loss_mlp": 1.03468728, + "epoch": 0.06330978505937171, + "flos": 15778107884160.0, + "grad_norm": 2.370007457349547, + "language_loss": 0.77433288, + "learning_rate": 3.988372997582155e-06, + "loss": 0.79702866, + "num_input_tokens_seen": 22527905, + "step": 1053, + "time_per_iteration": 5.757168531417847 + }, + { + "auxiliary_loss_clip": 0.01190721, + "auxiliary_loss_mlp": 0.00780448, + "balance_loss_clip": 1.06378174, + "balance_loss_mlp": 1.00028598, + "epoch": 0.06336990831203967, + "flos": 21471098163840.0, + "grad_norm": 3.085258828985267, + "language_loss": 0.84931248, + "learning_rate": 3.988331025862195e-06, + "loss": 0.86902416, + "num_input_tokens_seen": 22546335, + "step": 1054, + "time_per_iteration": 2.7733829021453857 + }, + { + "auxiliary_loss_clip": 0.01172281, + "auxiliary_loss_mlp": 0.01061232, + "balance_loss_clip": 1.05722666, + "balance_loss_mlp": 1.03753328, + "epoch": 0.06343003156470765, + "flos": 18478877546880.0, + "grad_norm": 2.0168531459993435, + "language_loss": 0.85884213, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.88117731, + "num_input_tokens_seen": 22563885, + "step": 1055, + "time_per_iteration": 4.490305185317993 + }, + { + "auxiliary_loss_clip": 0.01164237, + "auxiliary_loss_mlp": 0.01069785, + "balance_loss_clip": 1.05727792, + "balance_loss_mlp": 1.04534709, + "epoch": 0.06349015481737562, + "flos": 25154886309120.0, + "grad_norm": 2.4509218988768, + "language_loss": 0.8113938, + "learning_rate": 3.988246856230734e-06, + "loss": 0.83373404, + "num_input_tokens_seen": 22583035, + "step": 1056, + "time_per_iteration": 5.345282793045044 + }, + { + "auxiliary_loss_clip": 0.01144181, + "auxiliary_loss_mlp": 0.01061125, + "balance_loss_clip": 1.04991364, + "balance_loss_mlp": 1.03449368, + "epoch": 0.06355027807004358, + "flos": 26871775562880.0, + "grad_norm": 2.2117272688527128, + "language_loss": 0.81083393, + "learning_rate": 3.988204658322426e-06, + "loss": 0.83288693, + "num_input_tokens_seen": 22605055, + "step": 1057, + "time_per_iteration": 2.866757392883301 + }, + { + "auxiliary_loss_clip": 0.01139076, + "auxiliary_loss_mlp": 0.01061742, + "balance_loss_clip": 1.04970908, + "balance_loss_mlp": 1.03918755, + "epoch": 0.06361040132271156, + "flos": 21396691140480.0, + "grad_norm": 1.9636971972870172, + "language_loss": 0.83353591, + "learning_rate": 3.988162385021196e-06, + "loss": 0.85554409, + "num_input_tokens_seen": 22623760, + "step": 1058, + "time_per_iteration": 2.767024278640747 + }, + { + "auxiliary_loss_clip": 0.0117752, + "auxiliary_loss_mlp": 0.01059639, + "balance_loss_clip": 1.0576936, + "balance_loss_mlp": 1.03408027, + "epoch": 0.06367052457537953, + "flos": 25733765894400.0, + "grad_norm": 2.137077300251244, + "language_loss": 0.87556928, + "learning_rate": 3.988120036328651e-06, + "loss": 0.89794087, + "num_input_tokens_seen": 22643000, + "step": 1059, + "time_per_iteration": 2.794734239578247 + }, + { + "auxiliary_loss_clip": 0.01169658, + "auxiliary_loss_mlp": 0.01063463, + "balance_loss_clip": 1.06196678, + "balance_loss_mlp": 1.0383693, + "epoch": 0.0637306478280475, + "flos": 17631420992640.0, + "grad_norm": 2.543966627588717, + "language_loss": 0.91561133, + "learning_rate": 3.988077612246394e-06, + "loss": 0.93794256, + "num_input_tokens_seen": 22660460, + "step": 1060, + "time_per_iteration": 2.8223626613616943 + }, + { + "auxiliary_loss_clip": 0.01173933, + "auxiliary_loss_mlp": 0.01065151, + "balance_loss_clip": 1.05715585, + "balance_loss_mlp": 1.03981876, + "epoch": 0.06379077108071547, + "flos": 13662610427520.0, + "grad_norm": 1.9401711052692647, + "language_loss": 0.87242293, + "learning_rate": 3.988035112776035e-06, + "loss": 0.89481378, + "num_input_tokens_seen": 22679270, + "step": 1061, + "time_per_iteration": 2.7783865928649902 + }, + { + "auxiliary_loss_clip": 0.01190039, + "auxiliary_loss_mlp": 0.01059971, + "balance_loss_clip": 1.05976009, + "balance_loss_mlp": 1.03388786, + "epoch": 0.06385089433338344, + "flos": 28478849961600.0, + "grad_norm": 5.360593029379932, + "language_loss": 0.77407908, + "learning_rate": 3.987992537919185e-06, + "loss": 0.79657912, + "num_input_tokens_seen": 22699330, + "step": 1062, + "time_per_iteration": 2.872587203979492 + }, + { + "auxiliary_loss_clip": 0.01172912, + "auxiliary_loss_mlp": 0.01061175, + "balance_loss_clip": 1.05884075, + "balance_loss_mlp": 1.03798842, + "epoch": 0.0639110175860514, + "flos": 24311057028480.0, + "grad_norm": 2.2658654128491436, + "language_loss": 0.86522883, + "learning_rate": 3.987949887677459e-06, + "loss": 0.88756967, + "num_input_tokens_seen": 22717945, + "step": 1063, + "time_per_iteration": 2.7915029525756836 + }, + { + "auxiliary_loss_clip": 0.01207773, + "auxiliary_loss_mlp": 0.01062698, + "balance_loss_clip": 1.05969334, + "balance_loss_mlp": 1.03846335, + "epoch": 0.06397114083871938, + "flos": 22090772620800.0, + "grad_norm": 2.302236346678267, + "language_loss": 0.79908657, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.82179129, + "num_input_tokens_seen": 22736790, + "step": 1064, + "time_per_iteration": 2.6880991458892822 + }, + { + "auxiliary_loss_clip": 0.01198826, + "auxiliary_loss_mlp": 0.01066465, + "balance_loss_clip": 1.0603801, + "balance_loss_mlp": 1.04149103, + "epoch": 0.06403126409138735, + "flos": 19572824206080.0, + "grad_norm": 3.1552731138796215, + "language_loss": 0.84327948, + "learning_rate": 3.987864361045851e-06, + "loss": 0.8659324, + "num_input_tokens_seen": 22754745, + "step": 1065, + "time_per_iteration": 2.6956398487091064 + }, + { + "auxiliary_loss_clip": 0.01168098, + "auxiliary_loss_mlp": 0.01054905, + "balance_loss_clip": 1.0597136, + "balance_loss_mlp": 1.03162324, + "epoch": 0.06409138734405531, + "flos": 40807413267840.0, + "grad_norm": 1.52830872536012, + "language_loss": 0.68177885, + "learning_rate": 3.987821484659211e-06, + "loss": 0.70400894, + "num_input_tokens_seen": 22776780, + "step": 1066, + "time_per_iteration": 2.9867773056030273 + }, + { + "auxiliary_loss_clip": 0.01214184, + "auxiliary_loss_mlp": 0.01070649, + "balance_loss_clip": 1.06780005, + "balance_loss_mlp": 1.04609215, + "epoch": 0.06415151059672328, + "flos": 20441610460800.0, + "grad_norm": 1.8546001537284342, + "language_loss": 0.90349269, + "learning_rate": 3.987778532894181e-06, + "loss": 0.926341, + "num_input_tokens_seen": 22793915, + "step": 1067, + "time_per_iteration": 2.685896873474121 + }, + { + "auxiliary_loss_clip": 0.01188134, + "auxiliary_loss_mlp": 0.01063022, + "balance_loss_clip": 1.0623709, + "balance_loss_mlp": 1.03969264, + "epoch": 0.06421163384939126, + "flos": 18072045129600.0, + "grad_norm": 2.189788428445167, + "language_loss": 0.83437371, + "learning_rate": 3.987735505752391e-06, + "loss": 0.85688531, + "num_input_tokens_seen": 22812670, + "step": 1068, + "time_per_iteration": 2.851602554321289 + }, + { + "auxiliary_loss_clip": 0.01178972, + "auxiliary_loss_mlp": 0.01057745, + "balance_loss_clip": 1.05909026, + "balance_loss_mlp": 1.03426039, + "epoch": 0.06427175710205922, + "flos": 25119442563840.0, + "grad_norm": 3.045176948020938, + "language_loss": 0.89311272, + "learning_rate": 3.987692403235471e-06, + "loss": 0.9154799, + "num_input_tokens_seen": 22832440, + "step": 1069, + "time_per_iteration": 2.7825255393981934 + }, + { + "auxiliary_loss_clip": 0.01185672, + "auxiliary_loss_mlp": 0.01071834, + "balance_loss_clip": 1.06158304, + "balance_loss_mlp": 1.04663301, + "epoch": 0.06433188035472719, + "flos": 17380549428480.0, + "grad_norm": 2.7038488706194808, + "language_loss": 0.95759481, + "learning_rate": 3.987649225345056e-06, + "loss": 0.98016989, + "num_input_tokens_seen": 22845495, + "step": 1070, + "time_per_iteration": 2.715296506881714 + }, + { + "auxiliary_loss_clip": 0.01140792, + "auxiliary_loss_mlp": 0.01056718, + "balance_loss_clip": 1.05607581, + "balance_loss_mlp": 1.03027749, + "epoch": 0.06439200360739517, + "flos": 23546267625600.0, + "grad_norm": 1.630790580283393, + "language_loss": 0.8811003, + "learning_rate": 3.987605972082782e-06, + "loss": 0.90307534, + "num_input_tokens_seen": 22865390, + "step": 1071, + "time_per_iteration": 2.8445394039154053 + }, + { + "auxiliary_loss_clip": 0.01155172, + "auxiliary_loss_mlp": 0.01054986, + "balance_loss_clip": 1.05483651, + "balance_loss_mlp": 1.03102481, + "epoch": 0.06445212686006313, + "flos": 21979772616960.0, + "grad_norm": 1.8349443396730127, + "language_loss": 0.76116478, + "learning_rate": 3.987562643450292e-06, + "loss": 0.78326637, + "num_input_tokens_seen": 22885495, + "step": 1072, + "time_per_iteration": 2.8330819606781006 + }, + { + "auxiliary_loss_clip": 0.01172997, + "auxiliary_loss_mlp": 0.01070104, + "balance_loss_clip": 1.05975842, + "balance_loss_mlp": 1.04362798, + "epoch": 0.0645122501127311, + "flos": 25921291824000.0, + "grad_norm": 2.724283900767911, + "language_loss": 0.80849886, + "learning_rate": 3.987519239449226e-06, + "loss": 0.83092993, + "num_input_tokens_seen": 22904845, + "step": 1073, + "time_per_iteration": 2.748286247253418 + }, + { + "auxiliary_loss_clip": 0.01194712, + "auxiliary_loss_mlp": 0.01062452, + "balance_loss_clip": 1.06345201, + "balance_loss_mlp": 1.03825283, + "epoch": 0.06457237336539907, + "flos": 25626034028160.0, + "grad_norm": 5.0538746884234245, + "language_loss": 0.80282539, + "learning_rate": 3.987475760081233e-06, + "loss": 0.82539707, + "num_input_tokens_seen": 22925940, + "step": 1074, + "time_per_iteration": 2.7482337951660156 + }, + { + "auxiliary_loss_clip": 0.01173366, + "auxiliary_loss_mlp": 0.01057774, + "balance_loss_clip": 1.05920076, + "balance_loss_mlp": 1.03256142, + "epoch": 0.06463249661806704, + "flos": 19463979018240.0, + "grad_norm": 2.0209756517373707, + "language_loss": 0.79249811, + "learning_rate": 3.987432205347958e-06, + "loss": 0.8148095, + "num_input_tokens_seen": 22944375, + "step": 1075, + "time_per_iteration": 2.6937224864959717 + }, + { + "auxiliary_loss_clip": 0.01171297, + "auxiliary_loss_mlp": 0.01063569, + "balance_loss_clip": 1.05735481, + "balance_loss_mlp": 1.04025126, + "epoch": 0.064692619870735, + "flos": 24498044254080.0, + "grad_norm": 2.9028991302223357, + "language_loss": 0.88208115, + "learning_rate": 3.987388575251055e-06, + "loss": 0.90442967, + "num_input_tokens_seen": 22959145, + "step": 1076, + "time_per_iteration": 2.878103256225586 + }, + { + "auxiliary_loss_clip": 0.01192915, + "auxiliary_loss_mlp": 0.01052877, + "balance_loss_clip": 1.06164443, + "balance_loss_mlp": 1.0288558, + "epoch": 0.06475274312340297, + "flos": 17018677860480.0, + "grad_norm": 2.225760792581628, + "language_loss": 0.80876106, + "learning_rate": 3.98734486979218e-06, + "loss": 0.83121902, + "num_input_tokens_seen": 22978100, + "step": 1077, + "time_per_iteration": 2.7221076488494873 + }, + { + "auxiliary_loss_clip": 0.01200466, + "auxiliary_loss_mlp": 0.01064019, + "balance_loss_clip": 1.0656153, + "balance_loss_mlp": 1.03866291, + "epoch": 0.06481286637607095, + "flos": 24572379450240.0, + "grad_norm": 2.256787147683815, + "language_loss": 0.91727465, + "learning_rate": 3.987301088972986e-06, + "loss": 0.93991947, + "num_input_tokens_seen": 22997285, + "step": 1078, + "time_per_iteration": 2.862365484237671 + }, + { + "auxiliary_loss_clip": 0.0122435, + "auxiliary_loss_mlp": 0.01060225, + "balance_loss_clip": 1.06826639, + "balance_loss_mlp": 1.03552508, + "epoch": 0.06487298962873891, + "flos": 21105635235840.0, + "grad_norm": 2.080056711608912, + "language_loss": 0.78349572, + "learning_rate": 3.987257232795137e-06, + "loss": 0.80634147, + "num_input_tokens_seen": 23016285, + "step": 1079, + "time_per_iteration": 2.6435368061065674 + }, + { + "auxiliary_loss_clip": 0.01156927, + "auxiliary_loss_mlp": 0.01063794, + "balance_loss_clip": 1.05512071, + "balance_loss_mlp": 1.03899896, + "epoch": 0.06493311288140688, + "flos": 24608182331520.0, + "grad_norm": 2.274862403364013, + "language_loss": 0.68702769, + "learning_rate": 3.987213301260294e-06, + "loss": 0.70923495, + "num_input_tokens_seen": 23036420, + "step": 1080, + "time_per_iteration": 2.7782626152038574 + }, + { + "auxiliary_loss_clip": 0.01175684, + "auxiliary_loss_mlp": 0.01062351, + "balance_loss_clip": 1.06640029, + "balance_loss_mlp": 1.03610086, + "epoch": 0.06499323613407486, + "flos": 25337994865920.0, + "grad_norm": 1.886196453243775, + "language_loss": 0.72291583, + "learning_rate": 3.987169294370123e-06, + "loss": 0.74529618, + "num_input_tokens_seen": 23056945, + "step": 1081, + "time_per_iteration": 2.7983880043029785 + }, + { + "auxiliary_loss_clip": 0.01139671, + "auxiliary_loss_mlp": 0.01066686, + "balance_loss_clip": 1.0504055, + "balance_loss_mlp": 1.04076982, + "epoch": 0.06505335938674282, + "flos": 20375714960640.0, + "grad_norm": 3.3093934650613566, + "language_loss": 0.84059012, + "learning_rate": 3.987125212126294e-06, + "loss": 0.86265367, + "num_input_tokens_seen": 23074940, + "step": 1082, + "time_per_iteration": 2.8351900577545166 + }, + { + "auxiliary_loss_clip": 0.01204185, + "auxiliary_loss_mlp": 0.01063692, + "balance_loss_clip": 1.06306195, + "balance_loss_mlp": 1.03809738, + "epoch": 0.06511348263941079, + "flos": 25337923038720.0, + "grad_norm": 2.894360492506304, + "language_loss": 0.82550305, + "learning_rate": 3.987081054530478e-06, + "loss": 0.84818184, + "num_input_tokens_seen": 23093420, + "step": 1083, + "time_per_iteration": 2.866729974746704 + }, + { + "auxiliary_loss_clip": 0.01168245, + "auxiliary_loss_mlp": 0.01062938, + "balance_loss_clip": 1.06021011, + "balance_loss_mlp": 1.03655696, + "epoch": 0.06517360589207877, + "flos": 20332801186560.0, + "grad_norm": 2.468736383036802, + "language_loss": 0.79289383, + "learning_rate": 3.987036821584348e-06, + "loss": 0.81520569, + "num_input_tokens_seen": 23111550, + "step": 1084, + "time_per_iteration": 2.816601276397705 + }, + { + "auxiliary_loss_clip": 0.01174068, + "auxiliary_loss_mlp": 0.0106167, + "balance_loss_clip": 1.05854714, + "balance_loss_mlp": 1.03667152, + "epoch": 0.06523372914474673, + "flos": 31681650061440.0, + "grad_norm": 2.571590277205686, + "language_loss": 0.66443276, + "learning_rate": 3.986992513289584e-06, + "loss": 0.68679011, + "num_input_tokens_seen": 23130335, + "step": 1085, + "time_per_iteration": 2.8260092735290527 + }, + { + "auxiliary_loss_clip": 0.01170818, + "auxiliary_loss_mlp": 0.01062435, + "balance_loss_clip": 1.0600934, + "balance_loss_mlp": 1.03833067, + "epoch": 0.0652938523974147, + "flos": 20778165918720.0, + "grad_norm": 2.0478791529086977, + "language_loss": 0.76548934, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.78782183, + "num_input_tokens_seen": 23152380, + "step": 1086, + "time_per_iteration": 2.7937023639678955 + }, + { + "auxiliary_loss_clip": 0.01198609, + "auxiliary_loss_mlp": 0.01059288, + "balance_loss_clip": 1.06335294, + "balance_loss_mlp": 1.03519547, + "epoch": 0.06535397565008266, + "flos": 16690993061760.0, + "grad_norm": 2.1629448601391017, + "language_loss": 0.85109925, + "learning_rate": 3.986903670660872e-06, + "loss": 0.87367821, + "num_input_tokens_seen": 23171630, + "step": 1087, + "time_per_iteration": 2.7510013580322266 + }, + { + "auxiliary_loss_clip": 0.01184978, + "auxiliary_loss_mlp": 0.01059017, + "balance_loss_clip": 1.06293821, + "balance_loss_mlp": 1.03510392, + "epoch": 0.06541409890275064, + "flos": 26868220116480.0, + "grad_norm": 1.7886353193129139, + "language_loss": 0.77776635, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.80020636, + "num_input_tokens_seen": 23192520, + "step": 1088, + "time_per_iteration": 2.7792751789093018 + }, + { + "auxiliary_loss_clip": 0.01192707, + "auxiliary_loss_mlp": 0.01067634, + "balance_loss_clip": 1.06569457, + "balance_loss_mlp": 1.04498422, + "epoch": 0.06547422215541861, + "flos": 20521620005760.0, + "grad_norm": 3.0334087154373375, + "language_loss": 0.71050513, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.73310852, + "num_input_tokens_seen": 23210710, + "step": 1089, + "time_per_iteration": 2.8832852840423584 + }, + { + "auxiliary_loss_clip": 0.01173663, + "auxiliary_loss_mlp": 0.00781529, + "balance_loss_clip": 1.06159782, + "balance_loss_mlp": 1.00019014, + "epoch": 0.06553434540808657, + "flos": 22016616992640.0, + "grad_norm": 2.02973275746688, + "language_loss": 0.85650897, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.87606084, + "num_input_tokens_seen": 23230305, + "step": 1090, + "time_per_iteration": 2.7933149337768555 + }, + { + "auxiliary_loss_clip": 0.01214666, + "auxiliary_loss_mlp": 0.0105885, + "balance_loss_clip": 1.06735325, + "balance_loss_mlp": 1.03460288, + "epoch": 0.06559446866075455, + "flos": 24608649208320.0, + "grad_norm": 2.137212216289862, + "language_loss": 0.71829313, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.74102825, + "num_input_tokens_seen": 23249015, + "step": 1091, + "time_per_iteration": 2.646592855453491 + }, + { + "auxiliary_loss_clip": 0.01121055, + "auxiliary_loss_mlp": 0.0106405, + "balance_loss_clip": 1.05242276, + "balance_loss_mlp": 1.03961205, + "epoch": 0.06565459191342252, + "flos": 24274679529600.0, + "grad_norm": 2.2773849385721956, + "language_loss": 0.82839823, + "learning_rate": 3.986680245605936e-06, + "loss": 0.85024923, + "num_input_tokens_seen": 23265105, + "step": 1092, + "time_per_iteration": 4.799649715423584 + }, + { + "auxiliary_loss_clip": 0.01215092, + "auxiliary_loss_mlp": 0.01059151, + "balance_loss_clip": 1.0640471, + "balance_loss_mlp": 1.03352082, + "epoch": 0.06571471516609048, + "flos": 24787124910720.0, + "grad_norm": 2.268968080418226, + "language_loss": 0.71134168, + "learning_rate": 3.986635334582814e-06, + "loss": 0.73408413, + "num_input_tokens_seen": 23283950, + "step": 1093, + "time_per_iteration": 5.3356239795684814 + }, + { + "auxiliary_loss_clip": 0.01190682, + "auxiliary_loss_mlp": 0.01064498, + "balance_loss_clip": 1.06751943, + "balance_loss_mlp": 1.0392611, + "epoch": 0.06577483841875846, + "flos": 26214071581440.0, + "grad_norm": 3.829837904337144, + "language_loss": 0.87996346, + "learning_rate": 3.986590348226282e-06, + "loss": 0.90251523, + "num_input_tokens_seen": 23305005, + "step": 1094, + "time_per_iteration": 2.853489637374878 + }, + { + "auxiliary_loss_clip": 0.01192742, + "auxiliary_loss_mlp": 0.01065068, + "balance_loss_clip": 1.06367433, + "balance_loss_mlp": 1.03843689, + "epoch": 0.06583496167142643, + "flos": 25080802508160.0, + "grad_norm": 1.6736216436017588, + "language_loss": 0.81483954, + "learning_rate": 3.986545286538044e-06, + "loss": 0.8374176, + "num_input_tokens_seen": 23323220, + "step": 1095, + "time_per_iteration": 5.1613922119140625 + }, + { + "auxiliary_loss_clip": 0.01166049, + "auxiliary_loss_mlp": 0.01058945, + "balance_loss_clip": 1.06295943, + "balance_loss_mlp": 1.03598547, + "epoch": 0.06589508492409439, + "flos": 25629804956160.0, + "grad_norm": 2.0200125290673068, + "language_loss": 0.69789279, + "learning_rate": 3.986500149519811e-06, + "loss": 0.72014272, + "num_input_tokens_seen": 23342235, + "step": 1096, + "time_per_iteration": 2.804025173187256 + }, + { + "auxiliary_loss_clip": 0.01201939, + "auxiliary_loss_mlp": 0.01070786, + "balance_loss_clip": 1.06405246, + "balance_loss_mlp": 1.04614568, + "epoch": 0.06595520817676236, + "flos": 23621249266560.0, + "grad_norm": 1.7011375462517908, + "language_loss": 0.77430046, + "learning_rate": 3.986454937173292e-06, + "loss": 0.79702777, + "num_input_tokens_seen": 23363680, + "step": 1097, + "time_per_iteration": 2.7658958435058594 + }, + { + "auxiliary_loss_clip": 0.01215996, + "auxiliary_loss_mlp": 0.01063445, + "balance_loss_clip": 1.06707537, + "balance_loss_mlp": 1.03959155, + "epoch": 0.06601533142943034, + "flos": 33801708545280.0, + "grad_norm": 1.8316558452843608, + "language_loss": 0.78217584, + "learning_rate": 3.986409649500203e-06, + "loss": 0.80497026, + "num_input_tokens_seen": 23385590, + "step": 1098, + "time_per_iteration": 2.865684747695923 + }, + { + "auxiliary_loss_clip": 0.01197349, + "auxiliary_loss_mlp": 0.01069192, + "balance_loss_clip": 1.06328607, + "balance_loss_mlp": 1.04443276, + "epoch": 0.0660754546820983, + "flos": 20259184262400.0, + "grad_norm": 1.9237510259783663, + "language_loss": 0.81525648, + "learning_rate": 3.986364286502261e-06, + "loss": 0.83792192, + "num_input_tokens_seen": 23402945, + "step": 1099, + "time_per_iteration": 2.690377950668335 + }, + { + "auxiliary_loss_clip": 0.01179995, + "auxiliary_loss_mlp": 0.0105819, + "balance_loss_clip": 1.0578239, + "balance_loss_mlp": 1.03428841, + "epoch": 0.06613557793476627, + "flos": 19354164163200.0, + "grad_norm": 1.9906927310803755, + "language_loss": 0.82793295, + "learning_rate": 3.986318848181186e-06, + "loss": 0.8503148, + "num_input_tokens_seen": 23421410, + "step": 1100, + "time_per_iteration": 2.7613909244537354 + }, + { + "auxiliary_loss_clip": 0.01191263, + "auxiliary_loss_mlp": 0.0105903, + "balance_loss_clip": 1.06985724, + "balance_loss_mlp": 1.03529549, + "epoch": 0.06619570118743424, + "flos": 13772568936960.0, + "grad_norm": 2.079994286400427, + "language_loss": 0.73502243, + "learning_rate": 3.986273334538702e-06, + "loss": 0.75752538, + "num_input_tokens_seen": 23438870, + "step": 1101, + "time_per_iteration": 2.7795870304107666 + }, + { + "auxiliary_loss_clip": 0.01199256, + "auxiliary_loss_mlp": 0.01061171, + "balance_loss_clip": 1.06278944, + "balance_loss_mlp": 1.03773487, + "epoch": 0.06625582444010221, + "flos": 17857874286720.0, + "grad_norm": 2.875757629612747, + "language_loss": 0.85861301, + "learning_rate": 3.986227745576533e-06, + "loss": 0.88121736, + "num_input_tokens_seen": 23456975, + "step": 1102, + "time_per_iteration": 2.737269401550293 + }, + { + "auxiliary_loss_clip": 0.01191982, + "auxiliary_loss_mlp": 0.01058639, + "balance_loss_clip": 1.06898165, + "balance_loss_mlp": 1.03410578, + "epoch": 0.06631594769277017, + "flos": 11838707579520.0, + "grad_norm": 2.8924251757501778, + "language_loss": 0.81655926, + "learning_rate": 3.98618208129641e-06, + "loss": 0.83906543, + "num_input_tokens_seen": 23473440, + "step": 1103, + "time_per_iteration": 2.9345293045043945 + }, + { + "auxiliary_loss_clip": 0.01203522, + "auxiliary_loss_mlp": 0.00780451, + "balance_loss_clip": 1.06721628, + "balance_loss_mlp": 1.00042021, + "epoch": 0.06637607094543815, + "flos": 19793351756160.0, + "grad_norm": 5.176370819061919, + "language_loss": 0.81749105, + "learning_rate": 3.986136341700063e-06, + "loss": 0.83733076, + "num_input_tokens_seen": 23493880, + "step": 1104, + "time_per_iteration": 2.753657102584839 + }, + { + "auxiliary_loss_clip": 0.0116508, + "auxiliary_loss_mlp": 0.01050687, + "balance_loss_clip": 1.0576005, + "balance_loss_mlp": 1.02608228, + "epoch": 0.06643619419810612, + "flos": 25485659677440.0, + "grad_norm": 1.5448539486038575, + "language_loss": 0.80422902, + "learning_rate": 3.986090526789227e-06, + "loss": 0.82638663, + "num_input_tokens_seen": 23514920, + "step": 1105, + "time_per_iteration": 2.8904521465301514 + }, + { + "auxiliary_loss_clip": 0.01179397, + "auxiliary_loss_mlp": 0.0106197, + "balance_loss_clip": 1.06348729, + "balance_loss_mlp": 1.0391891, + "epoch": 0.06649631745077408, + "flos": 16946533393920.0, + "grad_norm": 2.7426455725749896, + "language_loss": 0.96762037, + "learning_rate": 3.986044636565639e-06, + "loss": 0.99003398, + "num_input_tokens_seen": 23531635, + "step": 1106, + "time_per_iteration": 2.890073299407959 + }, + { + "auxiliary_loss_clip": 0.01198065, + "auxiliary_loss_mlp": 0.01059975, + "balance_loss_clip": 1.06069684, + "balance_loss_mlp": 1.03511953, + "epoch": 0.06655644070344206, + "flos": 17858592558720.0, + "grad_norm": 1.9297768479693453, + "language_loss": 0.82528949, + "learning_rate": 3.985998671031039e-06, + "loss": 0.84786987, + "num_input_tokens_seen": 23551020, + "step": 1107, + "time_per_iteration": 2.778857469558716 + }, + { + "auxiliary_loss_clip": 0.01104176, + "auxiliary_loss_mlp": 0.01010935, + "balance_loss_clip": 1.04708242, + "balance_loss_mlp": 1.0072155, + "epoch": 0.06661656395611003, + "flos": 61419350021760.0, + "grad_norm": 0.7967940032222198, + "language_loss": 0.56789279, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.58904392, + "num_input_tokens_seen": 23610675, + "step": 1108, + "time_per_iteration": 3.2717819213867188 + }, + { + "auxiliary_loss_clip": 0.0118327, + "auxiliary_loss_mlp": 0.01062625, + "balance_loss_clip": 1.05651307, + "balance_loss_mlp": 1.0376507, + "epoch": 0.066676687208778, + "flos": 20662856282880.0, + "grad_norm": 2.682842555407744, + "language_loss": 0.7287578, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.75121677, + "num_input_tokens_seen": 23628710, + "step": 1109, + "time_per_iteration": 2.829623222351074 + }, + { + "auxiliary_loss_clip": 0.01148971, + "auxiliary_loss_mlp": 0.01071895, + "balance_loss_clip": 1.05459642, + "balance_loss_mlp": 1.04714715, + "epoch": 0.06673681046144596, + "flos": 20923280864640.0, + "grad_norm": 1.7914435942805436, + "language_loss": 0.78140426, + "learning_rate": 3.985860322578614e-06, + "loss": 0.80361295, + "num_input_tokens_seen": 23649160, + "step": 1110, + "time_per_iteration": 2.892786741256714 + }, + { + "auxiliary_loss_clip": 0.01153553, + "auxiliary_loss_mlp": 0.0106147, + "balance_loss_clip": 1.05590594, + "balance_loss_mlp": 1.03700781, + "epoch": 0.06679693371411394, + "flos": 31065818359680.0, + "grad_norm": 2.5260725451831805, + "language_loss": 0.71425366, + "learning_rate": 3.985814055817427e-06, + "loss": 0.73640382, + "num_input_tokens_seen": 23671995, + "step": 1111, + "time_per_iteration": 2.9349052906036377 + }, + { + "auxiliary_loss_clip": 0.01170538, + "auxiliary_loss_mlp": 0.01066103, + "balance_loss_clip": 1.05776191, + "balance_loss_mlp": 1.04199934, + "epoch": 0.0668570569667819, + "flos": 21726135705600.0, + "grad_norm": 1.8396663794990693, + "language_loss": 0.78767776, + "learning_rate": 3.985767713753971e-06, + "loss": 0.81004417, + "num_input_tokens_seen": 23690705, + "step": 1112, + "time_per_iteration": 2.8676345348358154 + }, + { + "auxiliary_loss_clip": 0.01153291, + "auxiliary_loss_mlp": 0.01065421, + "balance_loss_clip": 1.05340791, + "balance_loss_mlp": 1.04163861, + "epoch": 0.06691718021944987, + "flos": 22747255539840.0, + "grad_norm": 2.071048188460824, + "language_loss": 0.78481978, + "learning_rate": 3.985721296390005e-06, + "loss": 0.80700684, + "num_input_tokens_seen": 23709990, + "step": 1113, + "time_per_iteration": 2.8688411712646484 + }, + { + "auxiliary_loss_clip": 0.0114872, + "auxiliary_loss_mlp": 0.01057074, + "balance_loss_clip": 1.05157375, + "balance_loss_mlp": 1.03376842, + "epoch": 0.06697730347211785, + "flos": 16545626720640.0, + "grad_norm": 1.7560007918285245, + "language_loss": 0.82399213, + "learning_rate": 3.985674803727289e-06, + "loss": 0.84605002, + "num_input_tokens_seen": 23728485, + "step": 1114, + "time_per_iteration": 2.832458019256592 + }, + { + "auxiliary_loss_clip": 0.01075626, + "auxiliary_loss_mlp": 0.01006906, + "balance_loss_clip": 1.04995251, + "balance_loss_mlp": 1.00271022, + "epoch": 0.06703742672478581, + "flos": 59782326658560.0, + "grad_norm": 0.8370646888074905, + "language_loss": 0.58147323, + "learning_rate": 3.985628235767584e-06, + "loss": 0.60229862, + "num_input_tokens_seen": 23786650, + "step": 1115, + "time_per_iteration": 3.550837755203247 + }, + { + "auxiliary_loss_clip": 0.01177193, + "auxiliary_loss_mlp": 0.01059174, + "balance_loss_clip": 1.05986214, + "balance_loss_mlp": 1.03381801, + "epoch": 0.06709754997745378, + "flos": 16800197385600.0, + "grad_norm": 2.8944873563712235, + "language_loss": 0.91280693, + "learning_rate": 3.985581592512658e-06, + "loss": 0.93517065, + "num_input_tokens_seen": 23802555, + "step": 1116, + "time_per_iteration": 2.994608163833618 + }, + { + "auxiliary_loss_clip": 0.01169376, + "auxiliary_loss_mlp": 0.0078227, + "balance_loss_clip": 1.05839634, + "balance_loss_mlp": 1.00045347, + "epoch": 0.06715767323012176, + "flos": 22123917895680.0, + "grad_norm": 1.9249158333763592, + "language_loss": 0.87154609, + "learning_rate": 3.985534873964279e-06, + "loss": 0.89106256, + "num_input_tokens_seen": 23822945, + "step": 1117, + "time_per_iteration": 2.794400453567505 + }, + { + "auxiliary_loss_clip": 0.01095782, + "auxiliary_loss_mlp": 0.01003785, + "balance_loss_clip": 1.0387876, + "balance_loss_mlp": 0.99963647, + "epoch": 0.06721779648278972, + "flos": 66618100137600.0, + "grad_norm": 0.8644388721740246, + "language_loss": 0.5981611, + "learning_rate": 3.985488080124218e-06, + "loss": 0.61915678, + "num_input_tokens_seen": 23874075, + "step": 1118, + "time_per_iteration": 3.1695809364318848 + }, + { + "auxiliary_loss_clip": 0.01178972, + "auxiliary_loss_mlp": 0.01051993, + "balance_loss_clip": 1.05301392, + "balance_loss_mlp": 1.02780545, + "epoch": 0.06727791973545769, + "flos": 22382474970240.0, + "grad_norm": 3.6923711141076447, + "language_loss": 0.83045954, + "learning_rate": 3.985441210994251e-06, + "loss": 0.85276914, + "num_input_tokens_seen": 23889720, + "step": 1119, + "time_per_iteration": 2.7538814544677734 + }, + { + "auxiliary_loss_clip": 0.01182384, + "auxiliary_loss_mlp": 0.01058422, + "balance_loss_clip": 1.06102347, + "balance_loss_mlp": 1.03566504, + "epoch": 0.06733804298812565, + "flos": 24280210224000.0, + "grad_norm": 4.541743494234462, + "language_loss": 0.8451674, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.86757541, + "num_input_tokens_seen": 23909385, + "step": 1120, + "time_per_iteration": 2.76581072807312 + }, + { + "auxiliary_loss_clip": 0.0121565, + "auxiliary_loss_mlp": 0.01064916, + "balance_loss_clip": 1.06757379, + "balance_loss_mlp": 1.04028773, + "epoch": 0.06739816624079363, + "flos": 15918230839680.0, + "grad_norm": 2.503866645162978, + "language_loss": 0.78722781, + "learning_rate": 3.985347246871708e-06, + "loss": 0.81003344, + "num_input_tokens_seen": 23926830, + "step": 1121, + "time_per_iteration": 2.651175022125244 + }, + { + "auxiliary_loss_clip": 0.01080914, + "auxiliary_loss_mlp": 0.01011889, + "balance_loss_clip": 1.03108025, + "balance_loss_mlp": 1.00802636, + "epoch": 0.0674582894934616, + "flos": 71398567353600.0, + "grad_norm": 0.7540288133642103, + "language_loss": 0.58320796, + "learning_rate": 3.985300151882694e-06, + "loss": 0.60413599, + "num_input_tokens_seen": 23992640, + "step": 1122, + "time_per_iteration": 3.3794541358947754 + }, + { + "auxiliary_loss_clip": 0.01145486, + "auxiliary_loss_mlp": 0.01066136, + "balance_loss_clip": 1.05581403, + "balance_loss_mlp": 1.04167438, + "epoch": 0.06751841274612956, + "flos": 25264952559360.0, + "grad_norm": 2.3361170394687076, + "language_loss": 0.71965349, + "learning_rate": 3.985252981610901e-06, + "loss": 0.74176967, + "num_input_tokens_seen": 24011135, + "step": 1123, + "time_per_iteration": 2.8049354553222656 + }, + { + "auxiliary_loss_clip": 0.01144994, + "auxiliary_loss_mlp": 0.01064196, + "balance_loss_clip": 1.05373979, + "balance_loss_mlp": 1.03612232, + "epoch": 0.06757853599879754, + "flos": 23802741711360.0, + "grad_norm": 1.7380479869896208, + "language_loss": 0.78987843, + "learning_rate": 3.985205736058114e-06, + "loss": 0.81197035, + "num_input_tokens_seen": 24030695, + "step": 1124, + "time_per_iteration": 2.8595056533813477 + }, + { + "auxiliary_loss_clip": 0.01189686, + "auxiliary_loss_mlp": 0.01055169, + "balance_loss_clip": 1.05663013, + "balance_loss_mlp": 1.03200674, + "epoch": 0.0676386592514655, + "flos": 21033742164480.0, + "grad_norm": 3.1450673626590793, + "language_loss": 0.70999855, + "learning_rate": 3.985158415226128e-06, + "loss": 0.73244709, + "num_input_tokens_seen": 24050680, + "step": 1125, + "time_per_iteration": 2.726163625717163 + }, + { + "auxiliary_loss_clip": 0.01165518, + "auxiliary_loss_mlp": 0.01068918, + "balance_loss_clip": 1.05826426, + "balance_loss_mlp": 1.04290628, + "epoch": 0.06769878250413347, + "flos": 25556331686400.0, + "grad_norm": 3.340323364887528, + "language_loss": 0.81440383, + "learning_rate": 3.985111019116736e-06, + "loss": 0.83674812, + "num_input_tokens_seen": 24067205, + "step": 1126, + "time_per_iteration": 2.7356598377227783 + }, + { + "auxiliary_loss_clip": 0.0107201, + "auxiliary_loss_mlp": 0.01004999, + "balance_loss_clip": 1.0293622, + "balance_loss_mlp": 1.00092208, + "epoch": 0.06775890575680145, + "flos": 70655251305600.0, + "grad_norm": 0.77802311726495, + "language_loss": 0.59720373, + "learning_rate": 3.985063547731735e-06, + "loss": 0.6179738, + "num_input_tokens_seen": 24131320, + "step": 1127, + "time_per_iteration": 3.2627320289611816 + }, + { + "auxiliary_loss_clip": 0.01206438, + "auxiliary_loss_mlp": 0.01055509, + "balance_loss_clip": 1.06308687, + "balance_loss_mlp": 1.03189397, + "epoch": 0.06781902900946941, + "flos": 24235500769920.0, + "grad_norm": 2.2535941175889054, + "language_loss": 0.81097019, + "learning_rate": 3.985016001072925e-06, + "loss": 0.83358967, + "num_input_tokens_seen": 24149930, + "step": 1128, + "time_per_iteration": 2.6652371883392334 + }, + { + "auxiliary_loss_clip": 0.01158345, + "auxiliary_loss_mlp": 0.01052658, + "balance_loss_clip": 1.05360484, + "balance_loss_mlp": 1.02804112, + "epoch": 0.06787915226213738, + "flos": 22417523665920.0, + "grad_norm": 2.24200367657907, + "language_loss": 0.75559127, + "learning_rate": 3.984968379142109e-06, + "loss": 0.77770138, + "num_input_tokens_seen": 24169590, + "step": 1129, + "time_per_iteration": 2.7023732662200928 + }, + { + "auxiliary_loss_clip": 0.01117595, + "auxiliary_loss_mlp": 0.01053995, + "balance_loss_clip": 1.04627228, + "balance_loss_mlp": 1.03006983, + "epoch": 0.06793927551480534, + "flos": 37706922080640.0, + "grad_norm": 1.890559803272908, + "language_loss": 0.71710479, + "learning_rate": 3.984920681941094e-06, + "loss": 0.73882067, + "num_input_tokens_seen": 24189965, + "step": 1130, + "time_per_iteration": 3.0757689476013184 + }, + { + "auxiliary_loss_clip": 0.01158117, + "auxiliary_loss_mlp": 0.010592, + "balance_loss_clip": 1.05734515, + "balance_loss_mlp": 1.03481019, + "epoch": 0.06799939876747332, + "flos": 20631398947200.0, + "grad_norm": 2.24421862356218, + "language_loss": 0.80776262, + "learning_rate": 3.984872909471688e-06, + "loss": 0.82993579, + "num_input_tokens_seen": 24208045, + "step": 1131, + "time_per_iteration": 5.00832724571228 + }, + { + "auxiliary_loss_clip": 0.01195331, + "auxiliary_loss_mlp": 0.01070142, + "balance_loss_clip": 1.06155944, + "balance_loss_mlp": 1.04614532, + "epoch": 0.06805952202014129, + "flos": 14864755829760.0, + "grad_norm": 2.0533244923502463, + "language_loss": 0.80371779, + "learning_rate": 3.984825061735701e-06, + "loss": 0.8263725, + "num_input_tokens_seen": 24223805, + "step": 1132, + "time_per_iteration": 4.487931251525879 + }, + { + "auxiliary_loss_clip": 0.01170581, + "auxiliary_loss_mlp": 0.01061867, + "balance_loss_clip": 1.05438542, + "balance_loss_mlp": 1.03756022, + "epoch": 0.06811964527280925, + "flos": 48909434947200.0, + "grad_norm": 1.7182324226465766, + "language_loss": 0.6341064, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.65643084, + "num_input_tokens_seen": 24249475, + "step": 1133, + "time_per_iteration": 4.48089337348938 + }, + { + "auxiliary_loss_clip": 0.01125599, + "auxiliary_loss_mlp": 0.01055984, + "balance_loss_clip": 1.04700482, + "balance_loss_mlp": 1.02973366, + "epoch": 0.06817976852547723, + "flos": 15377273038080.0, + "grad_norm": 1.9264963116598819, + "language_loss": 0.74771935, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.76953518, + "num_input_tokens_seen": 24267980, + "step": 1134, + "time_per_iteration": 5.287277936935425 + }, + { + "auxiliary_loss_clip": 0.01169269, + "auxiliary_loss_mlp": 0.00782536, + "balance_loss_clip": 1.05878353, + "balance_loss_mlp": 1.00042605, + "epoch": 0.0682398917781452, + "flos": 20155690200960.0, + "grad_norm": 2.151108605399924, + "language_loss": 0.86871451, + "learning_rate": 3.984681066946423e-06, + "loss": 0.88823259, + "num_input_tokens_seen": 24286805, + "step": 1135, + "time_per_iteration": 2.8024110794067383 + }, + { + "auxiliary_loss_clip": 0.0117656, + "auxiliary_loss_mlp": 0.007818, + "balance_loss_clip": 1.0543226, + "balance_loss_mlp": 1.00046515, + "epoch": 0.06830001503081316, + "flos": 23440618748160.0, + "grad_norm": 2.521942237810997, + "language_loss": 0.78131735, + "learning_rate": 3.984632918162291e-06, + "loss": 0.80090094, + "num_input_tokens_seen": 24305855, + "step": 1136, + "time_per_iteration": 2.7595040798187256 + }, + { + "auxiliary_loss_clip": 0.01185832, + "auxiliary_loss_mlp": 0.01063587, + "balance_loss_clip": 1.05952621, + "balance_loss_mlp": 1.03868449, + "epoch": 0.06836013828348114, + "flos": 34349813153280.0, + "grad_norm": 2.275643110468061, + "language_loss": 0.83968467, + "learning_rate": 3.984584694120679e-06, + "loss": 0.86217892, + "num_input_tokens_seen": 24326535, + "step": 1137, + "time_per_iteration": 2.7738285064697266 + }, + { + "auxiliary_loss_clip": 0.01153105, + "auxiliary_loss_mlp": 0.01059471, + "balance_loss_clip": 1.05239427, + "balance_loss_mlp": 1.0348897, + "epoch": 0.06842026153614911, + "flos": 23148844571520.0, + "grad_norm": 2.068206081593879, + "language_loss": 0.788486, + "learning_rate": 3.984536394823418e-06, + "loss": 0.81061178, + "num_input_tokens_seen": 24345810, + "step": 1138, + "time_per_iteration": 2.804537296295166 + }, + { + "auxiliary_loss_clip": 0.01209658, + "auxiliary_loss_mlp": 0.01058353, + "balance_loss_clip": 1.06288362, + "balance_loss_mlp": 1.03415346, + "epoch": 0.06848038478881707, + "flos": 24608972430720.0, + "grad_norm": 2.3335265924104096, + "language_loss": 0.85507643, + "learning_rate": 3.984488020272336e-06, + "loss": 0.87775654, + "num_input_tokens_seen": 24366095, + "step": 1139, + "time_per_iteration": 2.746884822845459 + }, + { + "auxiliary_loss_clip": 0.01153855, + "auxiliary_loss_mlp": 0.01063721, + "balance_loss_clip": 1.05325532, + "balance_loss_mlp": 1.03679228, + "epoch": 0.06854050804148504, + "flos": 40880994278400.0, + "grad_norm": 1.9254794009430078, + "language_loss": 0.74899161, + "learning_rate": 3.984439570469271e-06, + "loss": 0.7711674, + "num_input_tokens_seen": 24388665, + "step": 1140, + "time_per_iteration": 2.938143253326416 + }, + { + "auxiliary_loss_clip": 0.01186218, + "auxiliary_loss_mlp": 0.00782227, + "balance_loss_clip": 1.06101704, + "balance_loss_mlp": 1.00036597, + "epoch": 0.06860063129415302, + "flos": 31686354743040.0, + "grad_norm": 2.1250887020504767, + "language_loss": 0.68258876, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.70227319, + "num_input_tokens_seen": 24407705, + "step": 1141, + "time_per_iteration": 2.8180530071258545 + }, + { + "auxiliary_loss_clip": 0.01197117, + "auxiliary_loss_mlp": 0.01067748, + "balance_loss_clip": 1.05978489, + "balance_loss_mlp": 1.04266596, + "epoch": 0.06866075454682098, + "flos": 26542007775360.0, + "grad_norm": 1.8460768582410394, + "language_loss": 0.78959155, + "learning_rate": 3.984342445114538e-06, + "loss": 0.81224018, + "num_input_tokens_seen": 24428390, + "step": 1142, + "time_per_iteration": 2.712876558303833 + }, + { + "auxiliary_loss_clip": 0.01186915, + "auxiliary_loss_mlp": 0.01060882, + "balance_loss_clip": 1.06245089, + "balance_loss_mlp": 1.03702831, + "epoch": 0.06872087779948895, + "flos": 29789768724480.0, + "grad_norm": 1.7867268614306446, + "language_loss": 0.68287402, + "learning_rate": 3.984293769566553e-06, + "loss": 0.70535195, + "num_input_tokens_seen": 24450810, + "step": 1143, + "time_per_iteration": 2.752659320831299 + }, + { + "auxiliary_loss_clip": 0.01177843, + "auxiliary_loss_mlp": 0.01059894, + "balance_loss_clip": 1.05798244, + "balance_loss_mlp": 1.03773308, + "epoch": 0.06878100105215693, + "flos": 26941118768640.0, + "grad_norm": 1.7582250309313294, + "language_loss": 0.74307454, + "learning_rate": 3.98424501877395e-06, + "loss": 0.76545191, + "num_input_tokens_seen": 24469965, + "step": 1144, + "time_per_iteration": 2.6448662281036377 + }, + { + "auxiliary_loss_clip": 0.01189197, + "auxiliary_loss_mlp": 0.0106544, + "balance_loss_clip": 1.0565474, + "balance_loss_mlp": 1.04039407, + "epoch": 0.06884112430482489, + "flos": 10670748946560.0, + "grad_norm": 2.699041414372958, + "language_loss": 0.91755033, + "learning_rate": 3.984196192738577e-06, + "loss": 0.94009674, + "num_input_tokens_seen": 24486370, + "step": 1145, + "time_per_iteration": 2.6621482372283936 + }, + { + "auxiliary_loss_clip": 0.01212189, + "auxiliary_loss_mlp": 0.0106819, + "balance_loss_clip": 1.06225932, + "balance_loss_mlp": 1.04258406, + "epoch": 0.06890124755749286, + "flos": 20193647898240.0, + "grad_norm": 2.2014676012481487, + "language_loss": 0.81726635, + "learning_rate": 3.984147291462285e-06, + "loss": 0.84007025, + "num_input_tokens_seen": 24503780, + "step": 1146, + "time_per_iteration": 2.623964548110962 + }, + { + "auxiliary_loss_clip": 0.01204602, + "auxiliary_loss_mlp": 0.01065301, + "balance_loss_clip": 1.06215203, + "balance_loss_mlp": 1.04191244, + "epoch": 0.06896137081016084, + "flos": 20449224144000.0, + "grad_norm": 2.1265245828428108, + "language_loss": 0.84968954, + "learning_rate": 3.98409831494693e-06, + "loss": 0.8723886, + "num_input_tokens_seen": 24522320, + "step": 1147, + "time_per_iteration": 2.5898265838623047 + }, + { + "auxiliary_loss_clip": 0.01156886, + "auxiliary_loss_mlp": 0.01064453, + "balance_loss_clip": 1.05563867, + "balance_loss_mlp": 1.03949046, + "epoch": 0.0690214940628288, + "flos": 18368703555840.0, + "grad_norm": 1.7557033260323716, + "language_loss": 0.86094105, + "learning_rate": 3.984049263194367e-06, + "loss": 0.88315445, + "num_input_tokens_seen": 24540445, + "step": 1148, + "time_per_iteration": 2.748782157897949 + }, + { + "auxiliary_loss_clip": 0.01173365, + "auxiliary_loss_mlp": 0.01060047, + "balance_loss_clip": 1.05569541, + "balance_loss_mlp": 1.03370178, + "epoch": 0.06908161731549677, + "flos": 20558033418240.0, + "grad_norm": 2.322434023005448, + "language_loss": 0.69602191, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.71835601, + "num_input_tokens_seen": 24557105, + "step": 1149, + "time_per_iteration": 2.741854429244995 + }, + { + "auxiliary_loss_clip": 0.01207871, + "auxiliary_loss_mlp": 0.01051245, + "balance_loss_clip": 1.06034219, + "balance_loss_mlp": 1.02692604, + "epoch": 0.06914174056816474, + "flos": 27563666313600.0, + "grad_norm": 1.9440351937259064, + "language_loss": 0.8374452, + "learning_rate": 3.983950933985064e-06, + "loss": 0.86003637, + "num_input_tokens_seen": 24578240, + "step": 1150, + "time_per_iteration": 2.6919586658477783 + }, + { + "auxiliary_loss_clip": 0.01181406, + "auxiliary_loss_mlp": 0.01058015, + "balance_loss_clip": 1.06063652, + "balance_loss_mlp": 1.03380394, + "epoch": 0.06920186382083271, + "flos": 15304015249920.0, + "grad_norm": 4.11905785776886, + "language_loss": 0.81464434, + "learning_rate": 3.983901656532052e-06, + "loss": 0.83703858, + "num_input_tokens_seen": 24593585, + "step": 1151, + "time_per_iteration": 2.7979934215545654 + }, + { + "auxiliary_loss_clip": 0.01206831, + "auxiliary_loss_mlp": 0.01058184, + "balance_loss_clip": 1.06409955, + "balance_loss_mlp": 1.03434169, + "epoch": 0.06926198707350067, + "flos": 25191227894400.0, + "grad_norm": 2.0324362571668724, + "language_loss": 0.85408235, + "learning_rate": 3.983852303849291e-06, + "loss": 0.87673247, + "num_input_tokens_seen": 24613110, + "step": 1152, + "time_per_iteration": 2.686021089553833 + }, + { + "auxiliary_loss_clip": 0.01190935, + "auxiliary_loss_mlp": 0.01062076, + "balance_loss_clip": 1.06250155, + "balance_loss_mlp": 1.03866374, + "epoch": 0.06932211032616864, + "flos": 13256137146240.0, + "grad_norm": 2.182544196511779, + "language_loss": 0.90594423, + "learning_rate": 3.983802875938651e-06, + "loss": 0.92847437, + "num_input_tokens_seen": 24628795, + "step": 1153, + "time_per_iteration": 2.58366060256958 + }, + { + "auxiliary_loss_clip": 0.01169877, + "auxiliary_loss_mlp": 0.01055253, + "balance_loss_clip": 1.05681062, + "balance_loss_mlp": 1.03088629, + "epoch": 0.06938223357883662, + "flos": 24827381078400.0, + "grad_norm": 2.1214794624630846, + "language_loss": 0.81526846, + "learning_rate": 3.983753372802008e-06, + "loss": 0.83751976, + "num_input_tokens_seen": 24645480, + "step": 1154, + "time_per_iteration": 2.696794271469116 + }, + { + "auxiliary_loss_clip": 0.01188774, + "auxiliary_loss_mlp": 0.01066335, + "balance_loss_clip": 1.0691216, + "balance_loss_mlp": 1.04200506, + "epoch": 0.06944235683150458, + "flos": 27267977554560.0, + "grad_norm": 2.102018399986892, + "language_loss": 0.75022292, + "learning_rate": 3.983703794441237e-06, + "loss": 0.77277398, + "num_input_tokens_seen": 24664630, + "step": 1155, + "time_per_iteration": 2.7718143463134766 + }, + { + "auxiliary_loss_clip": 0.01180696, + "auxiliary_loss_mlp": 0.00782152, + "balance_loss_clip": 1.05586052, + "balance_loss_mlp": 1.00041056, + "epoch": 0.06950248008417255, + "flos": 25808065176960.0, + "grad_norm": 1.7459449483933205, + "language_loss": 0.7110405, + "learning_rate": 3.98365414085822e-06, + "loss": 0.73066902, + "num_input_tokens_seen": 24684210, + "step": 1156, + "time_per_iteration": 2.7014200687408447 + }, + { + "auxiliary_loss_clip": 0.01179101, + "auxiliary_loss_mlp": 0.00782674, + "balance_loss_clip": 1.0593586, + "balance_loss_mlp": 1.00037348, + "epoch": 0.06956260333684053, + "flos": 22271546793600.0, + "grad_norm": 2.067241397655847, + "language_loss": 0.74882817, + "learning_rate": 3.98360441205484e-06, + "loss": 0.76844591, + "num_input_tokens_seen": 24702490, + "step": 1157, + "time_per_iteration": 2.7571897506713867 + }, + { + "auxiliary_loss_clip": 0.01178249, + "auxiliary_loss_mlp": 0.01061737, + "balance_loss_clip": 1.05653787, + "balance_loss_mlp": 1.03697729, + "epoch": 0.0696227265895085, + "flos": 29681390413440.0, + "grad_norm": 1.9827644507913538, + "language_loss": 0.7165724, + "learning_rate": 3.983554608032982e-06, + "loss": 0.73897225, + "num_input_tokens_seen": 24724340, + "step": 1158, + "time_per_iteration": 2.839745044708252 + }, + { + "auxiliary_loss_clip": 0.01207855, + "auxiliary_loss_mlp": 0.01058558, + "balance_loss_clip": 1.0605582, + "balance_loss_mlp": 1.03370285, + "epoch": 0.06968284984217646, + "flos": 25523545547520.0, + "grad_norm": 1.9692207215605615, + "language_loss": 0.79595017, + "learning_rate": 3.983504728794533e-06, + "loss": 0.8186143, + "num_input_tokens_seen": 24745550, + "step": 1159, + "time_per_iteration": 2.7535817623138428 + }, + { + "auxiliary_loss_clip": 0.01212717, + "auxiliary_loss_mlp": 0.01068535, + "balance_loss_clip": 1.06535673, + "balance_loss_mlp": 1.04094958, + "epoch": 0.06974297309484444, + "flos": 20698192287360.0, + "grad_norm": 3.5530789367722373, + "language_loss": 0.80517769, + "learning_rate": 3.983454774341387e-06, + "loss": 0.82799017, + "num_input_tokens_seen": 24762575, + "step": 1160, + "time_per_iteration": 2.7455785274505615 + }, + { + "auxiliary_loss_clip": 0.0119075, + "auxiliary_loss_mlp": 0.01057887, + "balance_loss_clip": 1.05680609, + "balance_loss_mlp": 1.03294837, + "epoch": 0.0698030963475124, + "flos": 26505199313280.0, + "grad_norm": 1.6303409062485206, + "language_loss": 0.7607069, + "learning_rate": 3.983404744675437e-06, + "loss": 0.78319323, + "num_input_tokens_seen": 24782605, + "step": 1161, + "time_per_iteration": 2.773775100708008 + }, + { + "auxiliary_loss_clip": 0.01175787, + "auxiliary_loss_mlp": 0.01062083, + "balance_loss_clip": 1.05773759, + "balance_loss_mlp": 1.03673923, + "epoch": 0.06986321960018037, + "flos": 23040430346880.0, + "grad_norm": 1.6605796421434038, + "language_loss": 0.82758528, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.84996402, + "num_input_tokens_seen": 24802910, + "step": 1162, + "time_per_iteration": 2.7426044940948486 + }, + { + "auxiliary_loss_clip": 0.01182513, + "auxiliary_loss_mlp": 0.01058124, + "balance_loss_clip": 1.05717576, + "balance_loss_mlp": 1.03092098, + "epoch": 0.06992334285284833, + "flos": 28584822061440.0, + "grad_norm": 1.9523155091610094, + "language_loss": 0.79563475, + "learning_rate": 3.983304459712716e-06, + "loss": 0.81804121, + "num_input_tokens_seen": 24823305, + "step": 1163, + "time_per_iteration": 2.720947742462158 + }, + { + "auxiliary_loss_clip": 0.01190519, + "auxiliary_loss_mlp": 0.01063375, + "balance_loss_clip": 1.05861616, + "balance_loss_mlp": 1.03722012, + "epoch": 0.06998346610551631, + "flos": 20595344670720.0, + "grad_norm": 2.213365660843382, + "language_loss": 0.79187214, + "learning_rate": 3.983254204419749e-06, + "loss": 0.81441104, + "num_input_tokens_seen": 24842155, + "step": 1164, + "time_per_iteration": 2.6554183959960938 + }, + { + "auxiliary_loss_clip": 0.01143916, + "auxiliary_loss_mlp": 0.01067459, + "balance_loss_clip": 1.05240798, + "balance_loss_mlp": 1.03875315, + "epoch": 0.07004358935818428, + "flos": 22528810978560.0, + "grad_norm": 1.421930435008642, + "language_loss": 0.72855628, + "learning_rate": 3.983203873921583e-06, + "loss": 0.75067008, + "num_input_tokens_seen": 24862080, + "step": 1165, + "time_per_iteration": 2.753063440322876 + }, + { + "auxiliary_loss_clip": 0.01183824, + "auxiliary_loss_mlp": 0.01059612, + "balance_loss_clip": 1.06135893, + "balance_loss_mlp": 1.03522193, + "epoch": 0.07010371261085224, + "flos": 28949997680640.0, + "grad_norm": 2.453348821242437, + "language_loss": 0.81136239, + "learning_rate": 3.983153468220128e-06, + "loss": 0.83379674, + "num_input_tokens_seen": 24886165, + "step": 1166, + "time_per_iteration": 2.802016496658325 + }, + { + "auxiliary_loss_clip": 0.011718, + "auxiliary_loss_mlp": 0.01053529, + "balance_loss_clip": 1.05450797, + "balance_loss_mlp": 1.02754176, + "epoch": 0.07016383586352022, + "flos": 23659171050240.0, + "grad_norm": 2.457667377154448, + "language_loss": 0.84640259, + "learning_rate": 3.983102987317295e-06, + "loss": 0.86865586, + "num_input_tokens_seen": 24905775, + "step": 1167, + "time_per_iteration": 2.7066097259521484 + }, + { + "auxiliary_loss_clip": 0.01193446, + "auxiliary_loss_mlp": 0.01064209, + "balance_loss_clip": 1.06136739, + "balance_loss_mlp": 1.03887713, + "epoch": 0.07022395911618819, + "flos": 19792130693760.0, + "grad_norm": 2.6158204436543, + "language_loss": 0.89524722, + "learning_rate": 3.983052431214997e-06, + "loss": 0.91782373, + "num_input_tokens_seen": 24924295, + "step": 1168, + "time_per_iteration": 2.6258392333984375 + }, + { + "auxiliary_loss_clip": 0.01190821, + "auxiliary_loss_mlp": 0.01065905, + "balance_loss_clip": 1.06090224, + "balance_loss_mlp": 1.03705645, + "epoch": 0.07028408236885615, + "flos": 21689147675520.0, + "grad_norm": 2.6445150319591035, + "language_loss": 0.89008862, + "learning_rate": 3.983001799915153e-06, + "loss": 0.91265589, + "num_input_tokens_seen": 24943210, + "step": 1169, + "time_per_iteration": 2.6858527660369873 + }, + { + "auxiliary_loss_clip": 0.01211063, + "auxiliary_loss_mlp": 0.01065533, + "balance_loss_clip": 1.06400895, + "balance_loss_mlp": 1.03950977, + "epoch": 0.07034420562152413, + "flos": 25630271832960.0, + "grad_norm": 1.9672897290124218, + "language_loss": 0.83834457, + "learning_rate": 3.982951093419681e-06, + "loss": 0.86111057, + "num_input_tokens_seen": 24960360, + "step": 1170, + "time_per_iteration": 2.6278069019317627 + }, + { + "auxiliary_loss_clip": 0.01180333, + "auxiliary_loss_mlp": 0.00782328, + "balance_loss_clip": 1.0613637, + "balance_loss_mlp": 1.00041986, + "epoch": 0.0704043288741921, + "flos": 20810449267200.0, + "grad_norm": 1.8542795171503503, + "language_loss": 0.75687242, + "learning_rate": 3.982900311730506e-06, + "loss": 0.77649903, + "num_input_tokens_seen": 24978290, + "step": 1171, + "time_per_iteration": 5.806530475616455 + }, + { + "auxiliary_loss_clip": 0.01179645, + "auxiliary_loss_mlp": 0.0106394, + "balance_loss_clip": 1.06133175, + "balance_loss_mlp": 1.03919196, + "epoch": 0.07046445212686006, + "flos": 25593176062080.0, + "grad_norm": 2.482864122539831, + "language_loss": 0.88865125, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.91108704, + "num_input_tokens_seen": 24997055, + "step": 1172, + "time_per_iteration": 4.371561288833618 + }, + { + "auxiliary_loss_clip": 0.01197698, + "auxiliary_loss_mlp": 0.01054991, + "balance_loss_clip": 1.06532764, + "balance_loss_mlp": 1.02858603, + "epoch": 0.07052457537952803, + "flos": 25556978131200.0, + "grad_norm": 1.6816354314161714, + "language_loss": 0.82075119, + "learning_rate": 3.982798522778748e-06, + "loss": 0.84327805, + "num_input_tokens_seen": 25017490, + "step": 1173, + "time_per_iteration": 4.611542463302612 + }, + { + "auxiliary_loss_clip": 0.01200886, + "auxiliary_loss_mlp": 0.01060851, + "balance_loss_clip": 1.06317592, + "balance_loss_mlp": 1.03503036, + "epoch": 0.070584698632196, + "flos": 17968515154560.0, + "grad_norm": 2.007232853627583, + "language_loss": 0.82071686, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.8433342, + "num_input_tokens_seen": 25035660, + "step": 1174, + "time_per_iteration": 2.6334969997406006 + }, + { + "auxiliary_loss_clip": 0.01180907, + "auxiliary_loss_mlp": 0.01059972, + "balance_loss_clip": 1.05857778, + "balance_loss_mlp": 1.03473568, + "epoch": 0.07064482188486397, + "flos": 25370888745600.0, + "grad_norm": 2.09222115072597, + "language_loss": 0.85013211, + "learning_rate": 3.982696433075317e-06, + "loss": 0.87254095, + "num_input_tokens_seen": 25054785, + "step": 1175, + "time_per_iteration": 2.861591339111328 + }, + { + "auxiliary_loss_clip": 0.01196955, + "auxiliary_loss_mlp": 0.01069941, + "balance_loss_clip": 1.06447482, + "balance_loss_mlp": 1.04605186, + "epoch": 0.07070494513753194, + "flos": 24899848767360.0, + "grad_norm": 1.7270820646539309, + "language_loss": 0.83103871, + "learning_rate": 3.982645275446563e-06, + "loss": 0.85370767, + "num_input_tokens_seen": 25075180, + "step": 1176, + "time_per_iteration": 2.754521608352661 + }, + { + "auxiliary_loss_clip": 0.01152261, + "auxiliary_loss_mlp": 0.01062154, + "balance_loss_clip": 1.05370057, + "balance_loss_mlp": 1.0352838, + "epoch": 0.07076506839019991, + "flos": 22338447874560.0, + "grad_norm": 3.4939498355716996, + "language_loss": 0.74409902, + "learning_rate": 3.982594042635701e-06, + "loss": 0.7662431, + "num_input_tokens_seen": 25093035, + "step": 1177, + "time_per_iteration": 2.692426919937134 + }, + { + "auxiliary_loss_clip": 0.01188551, + "auxiliary_loss_mlp": 0.0106394, + "balance_loss_clip": 1.06080353, + "balance_loss_mlp": 1.03801203, + "epoch": 0.07082519164286788, + "flos": 18660800954880.0, + "grad_norm": 1.8240190288677762, + "language_loss": 0.85965598, + "learning_rate": 3.982542734644673e-06, + "loss": 0.88218087, + "num_input_tokens_seen": 25112520, + "step": 1178, + "time_per_iteration": 2.7197048664093018 + }, + { + "auxiliary_loss_clip": 0.01082521, + "auxiliary_loss_mlp": 0.01013999, + "balance_loss_clip": 1.03661168, + "balance_loss_mlp": 1.01023197, + "epoch": 0.07088531489553584, + "flos": 63654107610240.0, + "grad_norm": 0.8453670789764802, + "language_loss": 0.63256603, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65353125, + "num_input_tokens_seen": 25177760, + "step": 1179, + "time_per_iteration": 3.3419978618621826 + }, + { + "auxiliary_loss_clip": 0.01211274, + "auxiliary_loss_mlp": 0.01073372, + "balance_loss_clip": 1.06935215, + "balance_loss_mlp": 1.04858887, + "epoch": 0.07094543814820382, + "flos": 21572688804480.0, + "grad_norm": 3.2714198066984177, + "language_loss": 0.83388901, + "learning_rate": 3.98243989312991e-06, + "loss": 0.85673553, + "num_input_tokens_seen": 25195260, + "step": 1180, + "time_per_iteration": 2.631992816925049 + }, + { + "auxiliary_loss_clip": 0.01182661, + "auxiliary_loss_mlp": 0.01071326, + "balance_loss_clip": 1.06119037, + "balance_loss_mlp": 1.04624391, + "epoch": 0.07100556140087179, + "flos": 22089946608000.0, + "grad_norm": 2.0409456536886386, + "language_loss": 0.88649988, + "learning_rate": 3.982388359610074e-06, + "loss": 0.90903974, + "num_input_tokens_seen": 25212740, + "step": 1181, + "time_per_iteration": 2.696789264678955 + }, + { + "auxiliary_loss_clip": 0.01180377, + "auxiliary_loss_mlp": 0.01070036, + "balance_loss_clip": 1.06187141, + "balance_loss_mlp": 1.04516935, + "epoch": 0.07106568465353975, + "flos": 47922286400640.0, + "grad_norm": 1.8294049229574356, + "language_loss": 0.83244783, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.85495198, + "num_input_tokens_seen": 25236420, + "step": 1182, + "time_per_iteration": 2.9415605068206787 + }, + { + "auxiliary_loss_clip": 0.01193669, + "auxiliary_loss_mlp": 0.01067019, + "balance_loss_clip": 1.0641923, + "balance_loss_mlp": 1.04150808, + "epoch": 0.07112580790620772, + "flos": 23440798316160.0, + "grad_norm": 3.5892595189310903, + "language_loss": 0.79067838, + "learning_rate": 3.982285067055262e-06, + "loss": 0.81328523, + "num_input_tokens_seen": 25255120, + "step": 1183, + "time_per_iteration": 2.7284862995147705 + }, + { + "auxiliary_loss_clip": 0.01211976, + "auxiliary_loss_mlp": 0.01064792, + "balance_loss_clip": 1.06126475, + "balance_loss_mlp": 1.03866172, + "epoch": 0.0711859311588757, + "flos": 31868888682240.0, + "grad_norm": 2.5463322111759354, + "language_loss": 0.788867, + "learning_rate": 3.982233308024204e-06, + "loss": 0.81163466, + "num_input_tokens_seen": 25275150, + "step": 1184, + "time_per_iteration": 2.7531635761260986 + }, + { + "auxiliary_loss_clip": 0.01152059, + "auxiliary_loss_mlp": 0.01062006, + "balance_loss_clip": 1.05961919, + "balance_loss_mlp": 1.03752065, + "epoch": 0.07124605441154366, + "flos": 19610315026560.0, + "grad_norm": 1.904751850318294, + "language_loss": 0.76806915, + "learning_rate": 3.98218147382666e-06, + "loss": 0.79020983, + "num_input_tokens_seen": 25293680, + "step": 1185, + "time_per_iteration": 2.732539176940918 + }, + { + "auxiliary_loss_clip": 0.01208288, + "auxiliary_loss_mlp": 0.01073792, + "balance_loss_clip": 1.06328642, + "balance_loss_mlp": 1.04903185, + "epoch": 0.07130617766421163, + "flos": 14684448533760.0, + "grad_norm": 2.1301142092644696, + "language_loss": 0.65472758, + "learning_rate": 3.982129564464596e-06, + "loss": 0.67754835, + "num_input_tokens_seen": 25310050, + "step": 1186, + "time_per_iteration": 2.757812261581421 + }, + { + "auxiliary_loss_clip": 0.01195497, + "auxiliary_loss_mlp": 0.01057322, + "balance_loss_clip": 1.06479859, + "balance_loss_mlp": 1.03274107, + "epoch": 0.07136630091687961, + "flos": 26067915141120.0, + "grad_norm": 2.1671481434625894, + "language_loss": 0.69743419, + "learning_rate": 3.98207757993998e-06, + "loss": 0.71996236, + "num_input_tokens_seen": 25331020, + "step": 1187, + "time_per_iteration": 2.746615409851074 + }, + { + "auxiliary_loss_clip": 0.01151827, + "auxiliary_loss_mlp": 0.01067347, + "balance_loss_clip": 1.05412316, + "balance_loss_mlp": 1.04367232, + "epoch": 0.07142642416954757, + "flos": 15669190869120.0, + "grad_norm": 2.8037131445876597, + "language_loss": 0.7861973, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.80838895, + "num_input_tokens_seen": 25347875, + "step": 1188, + "time_per_iteration": 2.738281726837158 + }, + { + "auxiliary_loss_clip": 0.01203626, + "auxiliary_loss_mlp": 0.01059966, + "balance_loss_clip": 1.06304908, + "balance_loss_mlp": 1.03530121, + "epoch": 0.07148654742221554, + "flos": 19755322231680.0, + "grad_norm": 1.8909260147246576, + "language_loss": 0.84754103, + "learning_rate": 3.981973385410981e-06, + "loss": 0.87017697, + "num_input_tokens_seen": 25366715, + "step": 1189, + "time_per_iteration": 2.5770246982574463 + }, + { + "auxiliary_loss_clip": 0.01173135, + "auxiliary_loss_mlp": 0.0078213, + "balance_loss_clip": 1.06234396, + "balance_loss_mlp": 1.00041807, + "epoch": 0.07154667067488352, + "flos": 23471824688640.0, + "grad_norm": 5.212083930118342, + "language_loss": 0.76932275, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.78887534, + "num_input_tokens_seen": 25385450, + "step": 1190, + "time_per_iteration": 2.7057712078094482 + }, + { + "auxiliary_loss_clip": 0.01208346, + "auxiliary_loss_mlp": 0.01074705, + "balance_loss_clip": 1.06283545, + "balance_loss_mlp": 1.04751348, + "epoch": 0.07160679392755148, + "flos": 18332936588160.0, + "grad_norm": 2.5312098602102084, + "language_loss": 0.75201792, + "learning_rate": 3.981868890255468e-06, + "loss": 0.7748484, + "num_input_tokens_seen": 25403940, + "step": 1191, + "time_per_iteration": 2.6071674823760986 + }, + { + "auxiliary_loss_clip": 0.01162268, + "auxiliary_loss_mlp": 0.01063437, + "balance_loss_clip": 1.0519917, + "balance_loss_mlp": 1.03649545, + "epoch": 0.07166691718021945, + "flos": 17747017937280.0, + "grad_norm": 2.470839013019174, + "language_loss": 0.74334443, + "learning_rate": 3.981816529947719e-06, + "loss": 0.76560152, + "num_input_tokens_seen": 25420410, + "step": 1192, + "time_per_iteration": 2.661078453063965 + }, + { + "auxiliary_loss_clip": 0.01202036, + "auxiliary_loss_mlp": 0.01054727, + "balance_loss_clip": 1.05904579, + "balance_loss_mlp": 1.03099298, + "epoch": 0.07172704043288743, + "flos": 22451925916800.0, + "grad_norm": 2.443309122344248, + "language_loss": 0.78010541, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.8026731, + "num_input_tokens_seen": 25439415, + "step": 1193, + "time_per_iteration": 2.5603158473968506 + }, + { + "auxiliary_loss_clip": 0.01186747, + "auxiliary_loss_mlp": 0.01059465, + "balance_loss_clip": 1.06358278, + "balance_loss_mlp": 1.03319085, + "epoch": 0.07178716368555539, + "flos": 23222210100480.0, + "grad_norm": 2.1011663585924585, + "language_loss": 0.85497916, + "learning_rate": 3.981711583882166e-06, + "loss": 0.87744129, + "num_input_tokens_seen": 25458715, + "step": 1194, + "time_per_iteration": 2.6819851398468018 + }, + { + "auxiliary_loss_clip": 0.01184191, + "auxiliary_loss_mlp": 0.01067737, + "balance_loss_clip": 1.05706751, + "balance_loss_mlp": 1.04135609, + "epoch": 0.07184728693822336, + "flos": 25150828072320.0, + "grad_norm": 2.0205668140023185, + "language_loss": 0.8183766, + "learning_rate": 3.981658998128341e-06, + "loss": 0.84089589, + "num_input_tokens_seen": 25477985, + "step": 1195, + "time_per_iteration": 2.6646647453308105 + }, + { + "auxiliary_loss_clip": 0.01165951, + "auxiliary_loss_mlp": 0.01063438, + "balance_loss_clip": 1.0578239, + "balance_loss_mlp": 1.03976321, + "epoch": 0.07190741019089132, + "flos": 22711237176960.0, + "grad_norm": 2.161995064372768, + "language_loss": 0.80093575, + "learning_rate": 3.981606337229808e-06, + "loss": 0.82322967, + "num_input_tokens_seen": 25497110, + "step": 1196, + "time_per_iteration": 2.7217979431152344 + }, + { + "auxiliary_loss_clip": 0.01176131, + "auxiliary_loss_mlp": 0.00784114, + "balance_loss_clip": 1.06106043, + "balance_loss_mlp": 1.00034249, + "epoch": 0.0719675334435593, + "flos": 29349791032320.0, + "grad_norm": 2.5905261146074263, + "language_loss": 0.71339291, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.73299539, + "num_input_tokens_seen": 25516555, + "step": 1197, + "time_per_iteration": 2.7931766510009766 + }, + { + "auxiliary_loss_clip": 0.01157444, + "auxiliary_loss_mlp": 0.01055247, + "balance_loss_clip": 1.06130266, + "balance_loss_mlp": 1.03074968, + "epoch": 0.07202765669622727, + "flos": 17639788861440.0, + "grad_norm": 3.074283933156949, + "language_loss": 0.85951984, + "learning_rate": 3.98150079000661e-06, + "loss": 0.88164675, + "num_input_tokens_seen": 25533895, + "step": 1198, + "time_per_iteration": 2.7241532802581787 + }, + { + "auxiliary_loss_clip": 0.01160083, + "auxiliary_loss_mlp": 0.0106501, + "balance_loss_clip": 1.0597434, + "balance_loss_mlp": 1.03944004, + "epoch": 0.07208777994889523, + "flos": 21434038306560.0, + "grad_norm": 2.052617638295489, + "language_loss": 0.83840948, + "learning_rate": 3.981447903685947e-06, + "loss": 0.86066043, + "num_input_tokens_seen": 25554195, + "step": 1199, + "time_per_iteration": 2.71362566947937 + }, + { + "auxiliary_loss_clip": 0.01212755, + "auxiliary_loss_mlp": 0.01060557, + "balance_loss_clip": 1.06877887, + "balance_loss_mlp": 1.03709614, + "epoch": 0.07214790320156321, + "flos": 26940867373440.0, + "grad_norm": 3.1601590133124837, + "language_loss": 0.7623595, + "learning_rate": 3.981394942228581e-06, + "loss": 0.78509259, + "num_input_tokens_seen": 25574155, + "step": 1200, + "time_per_iteration": 2.6913061141967773 + }, + { + "auxiliary_loss_clip": 0.0119008, + "auxiliary_loss_mlp": 0.010701, + "balance_loss_clip": 1.06442261, + "balance_loss_mlp": 1.04487491, + "epoch": 0.07220802645423118, + "flos": 23879949995520.0, + "grad_norm": 2.2017873087036226, + "language_loss": 0.83013475, + "learning_rate": 3.98134190563652e-06, + "loss": 0.85273659, + "num_input_tokens_seen": 25592735, + "step": 1201, + "time_per_iteration": 2.6983115673065186 + }, + { + "auxiliary_loss_clip": 0.01196941, + "auxiliary_loss_mlp": 0.01065672, + "balance_loss_clip": 1.06197119, + "balance_loss_mlp": 1.03952968, + "epoch": 0.07226814970689914, + "flos": 19243631036160.0, + "grad_norm": 20.835065187143087, + "language_loss": 0.68601412, + "learning_rate": 3.981288793911775e-06, + "loss": 0.70864022, + "num_input_tokens_seen": 25611510, + "step": 1202, + "time_per_iteration": 2.691742420196533 + }, + { + "auxiliary_loss_clip": 0.01182684, + "auxiliary_loss_mlp": 0.00782201, + "balance_loss_clip": 1.06256962, + "balance_loss_mlp": 1.00038218, + "epoch": 0.07232827295956712, + "flos": 19172025273600.0, + "grad_norm": 1.9661831136137597, + "language_loss": 0.87487721, + "learning_rate": 3.98123560705636e-06, + "loss": 0.89452606, + "num_input_tokens_seen": 25629560, + "step": 1203, + "time_per_iteration": 2.7832019329071045 + }, + { + "auxiliary_loss_clip": 0.01154778, + "auxiliary_loss_mlp": 0.01065748, + "balance_loss_clip": 1.05210066, + "balance_loss_mlp": 1.04065442, + "epoch": 0.07238839621223508, + "flos": 17639752947840.0, + "grad_norm": 1.731721557525142, + "language_loss": 0.78053147, + "learning_rate": 3.981182345072293e-06, + "loss": 0.80273676, + "num_input_tokens_seen": 25648330, + "step": 1204, + "time_per_iteration": 2.7754547595977783 + }, + { + "auxiliary_loss_clip": 0.01191832, + "auxiliary_loss_mlp": 0.01065794, + "balance_loss_clip": 1.06211591, + "balance_loss_mlp": 1.04084373, + "epoch": 0.07244851946490305, + "flos": 28292401440000.0, + "grad_norm": 1.5043252978087258, + "language_loss": 0.82094097, + "learning_rate": 3.981129007961593e-06, + "loss": 0.84351724, + "num_input_tokens_seen": 25669470, + "step": 1205, + "time_per_iteration": 2.680457353591919 + }, + { + "auxiliary_loss_clip": 0.01180244, + "auxiliary_loss_mlp": 0.00782807, + "balance_loss_clip": 1.06221068, + "balance_loss_mlp": 1.00036049, + "epoch": 0.07250864271757101, + "flos": 22564829341440.0, + "grad_norm": 1.6438962430217685, + "language_loss": 0.76715982, + "learning_rate": 3.981075595726283e-06, + "loss": 0.78679025, + "num_input_tokens_seen": 25690470, + "step": 1206, + "time_per_iteration": 2.7028439044952393 + }, + { + "auxiliary_loss_clip": 0.01188223, + "auxiliary_loss_mlp": 0.01059861, + "balance_loss_clip": 1.06262684, + "balance_loss_mlp": 1.03442228, + "epoch": 0.072568765970239, + "flos": 21762405463680.0, + "grad_norm": 1.9378198243304647, + "language_loss": 0.77272987, + "learning_rate": 3.981022108368387e-06, + "loss": 0.79521072, + "num_input_tokens_seen": 25709205, + "step": 1207, + "time_per_iteration": 2.779289960861206 + }, + { + "auxiliary_loss_clip": 0.01185538, + "auxiliary_loss_mlp": 0.01053693, + "balance_loss_clip": 1.05844951, + "balance_loss_mlp": 1.03062558, + "epoch": 0.07262888922290696, + "flos": 25519702792320.0, + "grad_norm": 1.8716528383816402, + "language_loss": 0.79480875, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.81720108, + "num_input_tokens_seen": 25728485, + "step": 1208, + "time_per_iteration": 2.682965040206909 + }, + { + "auxiliary_loss_clip": 0.01184899, + "auxiliary_loss_mlp": 0.01054862, + "balance_loss_clip": 1.05801737, + "balance_loss_mlp": 1.03198612, + "epoch": 0.07268901247557492, + "flos": 21246548290560.0, + "grad_norm": 2.5612886109689765, + "language_loss": 0.78537548, + "learning_rate": 3.980914908292955e-06, + "loss": 0.80777311, + "num_input_tokens_seen": 25747730, + "step": 1209, + "time_per_iteration": 2.6582658290863037 + }, + { + "auxiliary_loss_clip": 0.01191905, + "auxiliary_loss_mlp": 0.01067741, + "balance_loss_clip": 1.05931175, + "balance_loss_mlp": 1.04408956, + "epoch": 0.0727491357282429, + "flos": 25479302970240.0, + "grad_norm": 2.351303434522043, + "language_loss": 0.80920583, + "learning_rate": 3.980861195579486e-06, + "loss": 0.83180225, + "num_input_tokens_seen": 25768050, + "step": 1210, + "time_per_iteration": 4.241993427276611 + }, + { + "auxiliary_loss_clip": 0.0117493, + "auxiliary_loss_mlp": 0.01063711, + "balance_loss_clip": 1.06087565, + "balance_loss_mlp": 1.03891551, + "epoch": 0.07280925898091087, + "flos": 24462169545600.0, + "grad_norm": 1.875347829314158, + "language_loss": 0.84302205, + "learning_rate": 3.98080740775156e-06, + "loss": 0.86540848, + "num_input_tokens_seen": 25787985, + "step": 1211, + "time_per_iteration": 4.289919853210449 + }, + { + "auxiliary_loss_clip": 0.01162055, + "auxiliary_loss_mlp": 0.01060218, + "balance_loss_clip": 1.05356658, + "balance_loss_mlp": 1.03629231, + "epoch": 0.07286938223357883, + "flos": 18288191220480.0, + "grad_norm": 2.991110515222773, + "language_loss": 0.90684664, + "learning_rate": 3.98075354481122e-06, + "loss": 0.92906934, + "num_input_tokens_seen": 25803620, + "step": 1212, + "time_per_iteration": 2.660780906677246 + }, + { + "auxiliary_loss_clip": 0.01202443, + "auxiliary_loss_mlp": 0.01058817, + "balance_loss_clip": 1.0623759, + "balance_loss_mlp": 1.03490353, + "epoch": 0.07292950548624681, + "flos": 21214803646080.0, + "grad_norm": 1.7918815842724805, + "language_loss": 0.72358596, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.74619853, + "num_input_tokens_seen": 25823315, + "step": 1213, + "time_per_iteration": 4.303524017333984 + }, + { + "auxiliary_loss_clip": 0.01153662, + "auxiliary_loss_mlp": 0.01055706, + "balance_loss_clip": 1.05658662, + "balance_loss_mlp": 1.03089869, + "epoch": 0.07298962873891478, + "flos": 24642009964800.0, + "grad_norm": 1.8655932637344164, + "language_loss": 0.84356117, + "learning_rate": 3.980645593601465e-06, + "loss": 0.86565483, + "num_input_tokens_seen": 25842605, + "step": 1214, + "time_per_iteration": 2.7505569458007812 + }, + { + "auxiliary_loss_clip": 0.01208881, + "auxiliary_loss_mlp": 0.01062075, + "balance_loss_clip": 1.06484771, + "balance_loss_mlp": 1.03723145, + "epoch": 0.07304975199158274, + "flos": 27052765217280.0, + "grad_norm": 2.025651344907852, + "language_loss": 0.84113681, + "learning_rate": 3.980591505336144e-06, + "loss": 0.86384636, + "num_input_tokens_seen": 25863030, + "step": 1215, + "time_per_iteration": 2.7235965728759766 + }, + { + "auxiliary_loss_clip": 0.01149957, + "auxiliary_loss_mlp": 0.01062992, + "balance_loss_clip": 1.05138278, + "balance_loss_mlp": 1.03744531, + "epoch": 0.07310987524425071, + "flos": 33549544091520.0, + "grad_norm": 1.9312816725096997, + "language_loss": 0.80926049, + "learning_rate": 3.980537341966595e-06, + "loss": 0.83139002, + "num_input_tokens_seen": 25888015, + "step": 1216, + "time_per_iteration": 2.9129130840301514 + }, + { + "auxiliary_loss_clip": 0.01167944, + "auxiliary_loss_mlp": 0.01060276, + "balance_loss_clip": 1.05619049, + "balance_loss_mlp": 1.03680408, + "epoch": 0.07316999849691869, + "flos": 28110944908800.0, + "grad_norm": 3.2846247291101975, + "language_loss": 0.75949144, + "learning_rate": 3.980483103494872e-06, + "loss": 0.78177369, + "num_input_tokens_seen": 25908660, + "step": 1217, + "time_per_iteration": 2.7106521129608154 + }, + { + "auxiliary_loss_clip": 0.01169026, + "auxiliary_loss_mlp": 0.01056631, + "balance_loss_clip": 1.06182647, + "balance_loss_mlp": 1.03477991, + "epoch": 0.07323012174958665, + "flos": 14392602529920.0, + "grad_norm": 1.9658490798069863, + "language_loss": 0.86455309, + "learning_rate": 3.98042878992303e-06, + "loss": 0.88680959, + "num_input_tokens_seen": 25927215, + "step": 1218, + "time_per_iteration": 2.5911786556243896 + }, + { + "auxiliary_loss_clip": 0.01192266, + "auxiliary_loss_mlp": 0.0106258, + "balance_loss_clip": 1.06015348, + "balance_loss_mlp": 1.03916681, + "epoch": 0.07329024500225462, + "flos": 21616428591360.0, + "grad_norm": 2.2310702082820675, + "language_loss": 0.86782354, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.89037204, + "num_input_tokens_seen": 25945500, + "step": 1219, + "time_per_iteration": 2.608562707901001 + }, + { + "auxiliary_loss_clip": 0.01201545, + "auxiliary_loss_mlp": 0.01058282, + "balance_loss_clip": 1.06024373, + "balance_loss_mlp": 1.03539419, + "epoch": 0.0733503682549226, + "flos": 13224141106560.0, + "grad_norm": 2.095886373367052, + "language_loss": 0.84608674, + "learning_rate": 3.980319937487235e-06, + "loss": 0.86868501, + "num_input_tokens_seen": 25963105, + "step": 1220, + "time_per_iteration": 2.469189405441284 + }, + { + "auxiliary_loss_clip": 0.01158855, + "auxiliary_loss_mlp": 0.01063399, + "balance_loss_clip": 1.05358922, + "balance_loss_mlp": 1.03942597, + "epoch": 0.07341049150759056, + "flos": 20886975192960.0, + "grad_norm": 2.648884311755534, + "language_loss": 0.77114344, + "learning_rate": 3.98026539862741e-06, + "loss": 0.79336596, + "num_input_tokens_seen": 25981690, + "step": 1221, + "time_per_iteration": 2.671762466430664 + }, + { + "auxiliary_loss_clip": 0.01158201, + "auxiliary_loss_mlp": 0.01064916, + "balance_loss_clip": 1.05726743, + "balance_loss_mlp": 1.04082406, + "epoch": 0.07347061476025853, + "flos": 15413614623360.0, + "grad_norm": 2.5357389392469942, + "language_loss": 0.91631913, + "learning_rate": 3.980210784675722e-06, + "loss": 0.93855029, + "num_input_tokens_seen": 25999890, + "step": 1222, + "time_per_iteration": 2.6973063945770264 + }, + { + "auxiliary_loss_clip": 0.01135907, + "auxiliary_loss_mlp": 0.01064872, + "balance_loss_clip": 1.05333126, + "balance_loss_mlp": 1.04169726, + "epoch": 0.0735307380129265, + "flos": 11108859131520.0, + "grad_norm": 2.8024324299253047, + "language_loss": 0.90976465, + "learning_rate": 3.980156095634242e-06, + "loss": 0.93177247, + "num_input_tokens_seen": 26016445, + "step": 1223, + "time_per_iteration": 2.8141093254089355 + }, + { + "auxiliary_loss_clip": 0.01202875, + "auxiliary_loss_mlp": 0.01077185, + "balance_loss_clip": 1.06232905, + "balance_loss_mlp": 1.05341494, + "epoch": 0.07359086126559447, + "flos": 23732392924800.0, + "grad_norm": 1.9348534518871447, + "language_loss": 0.82161939, + "learning_rate": 3.980101331505045e-06, + "loss": 0.84442002, + "num_input_tokens_seen": 26036080, + "step": 1224, + "time_per_iteration": 2.640432119369507 + }, + { + "auxiliary_loss_clip": 0.01200329, + "auxiliary_loss_mlp": 0.01057586, + "balance_loss_clip": 1.05987597, + "balance_loss_mlp": 1.03229022, + "epoch": 0.07365098451826244, + "flos": 20993270515200.0, + "grad_norm": 2.31744406237409, + "language_loss": 0.83194047, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.85451961, + "num_input_tokens_seen": 26055805, + "step": 1225, + "time_per_iteration": 2.6159210205078125 + }, + { + "auxiliary_loss_clip": 0.01170115, + "auxiliary_loss_mlp": 0.01056068, + "balance_loss_clip": 1.05743551, + "balance_loss_mlp": 1.03190422, + "epoch": 0.0737111077709304, + "flos": 19933582452480.0, + "grad_norm": 2.2959030425986544, + "language_loss": 0.90388274, + "learning_rate": 3.979991577991808e-06, + "loss": 0.9261446, + "num_input_tokens_seen": 26073905, + "step": 1226, + "time_per_iteration": 2.6527435779571533 + }, + { + "auxiliary_loss_clip": 0.01207799, + "auxiliary_loss_mlp": 0.0104599, + "balance_loss_clip": 1.05913424, + "balance_loss_mlp": 1.02080154, + "epoch": 0.07377123102359838, + "flos": 16581537342720.0, + "grad_norm": 2.579592162134606, + "language_loss": 0.76626784, + "learning_rate": 3.97993658861193e-06, + "loss": 0.78880572, + "num_input_tokens_seen": 26091700, + "step": 1227, + "time_per_iteration": 2.596151351928711 + }, + { + "auxiliary_loss_clip": 0.0118909, + "auxiliary_loss_mlp": 0.01053386, + "balance_loss_clip": 1.06296694, + "balance_loss_mlp": 1.02954459, + "epoch": 0.07383135427626634, + "flos": 28328563457280.0, + "grad_norm": 7.788838200212175, + "language_loss": 0.8555491, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.87797379, + "num_input_tokens_seen": 26114105, + "step": 1228, + "time_per_iteration": 2.6955716609954834 + }, + { + "auxiliary_loss_clip": 0.01191175, + "auxiliary_loss_mlp": 0.01062669, + "balance_loss_clip": 1.05897212, + "balance_loss_mlp": 1.03860044, + "epoch": 0.07389147752893431, + "flos": 20047168235520.0, + "grad_norm": 2.2575099517148898, + "language_loss": 0.79598552, + "learning_rate": 3.97982638461608e-06, + "loss": 0.818524, + "num_input_tokens_seen": 26131165, + "step": 1229, + "time_per_iteration": 2.6544861793518066 + }, + { + "auxiliary_loss_clip": 0.01192886, + "auxiliary_loss_mlp": 0.00782044, + "balance_loss_clip": 1.05966699, + "balance_loss_mlp": 1.00032902, + "epoch": 0.07395160078160229, + "flos": 18114132890880.0, + "grad_norm": 2.2881874382496377, + "language_loss": 0.78209347, + "learning_rate": 3.979771170004287e-06, + "loss": 0.80184281, + "num_input_tokens_seen": 26150040, + "step": 1230, + "time_per_iteration": 2.6001133918762207 + }, + { + "auxiliary_loss_clip": 0.0120142, + "auxiliary_loss_mlp": 0.01052342, + "balance_loss_clip": 1.06209648, + "balance_loss_mlp": 1.02739108, + "epoch": 0.07401172403427025, + "flos": 23586918842880.0, + "grad_norm": 2.038847041772147, + "language_loss": 0.8136946, + "learning_rate": 3.979715880319372e-06, + "loss": 0.83623219, + "num_input_tokens_seen": 26169380, + "step": 1231, + "time_per_iteration": 2.6364073753356934 + }, + { + "auxiliary_loss_clip": 0.01179975, + "auxiliary_loss_mlp": 0.01070917, + "balance_loss_clip": 1.05690873, + "balance_loss_mlp": 1.04599047, + "epoch": 0.07407184728693822, + "flos": 26359904799360.0, + "grad_norm": 2.096832924731062, + "language_loss": 0.95204866, + "learning_rate": 3.979660515563434e-06, + "loss": 0.97455758, + "num_input_tokens_seen": 26189420, + "step": 1232, + "time_per_iteration": 2.7929203510284424 + }, + { + "auxiliary_loss_clip": 0.01187282, + "auxiliary_loss_mlp": 0.01059661, + "balance_loss_clip": 1.06202245, + "balance_loss_mlp": 1.03733301, + "epoch": 0.0741319705396062, + "flos": 22200443821440.0, + "grad_norm": 1.7778448126368063, + "language_loss": 0.80695188, + "learning_rate": 3.979605075738569e-06, + "loss": 0.82942128, + "num_input_tokens_seen": 26209300, + "step": 1233, + "time_per_iteration": 2.7945051193237305 + }, + { + "auxiliary_loss_clip": 0.01209245, + "auxiliary_loss_mlp": 0.0106207, + "balance_loss_clip": 1.06238747, + "balance_loss_mlp": 1.03602231, + "epoch": 0.07419209379227416, + "flos": 39200482523520.0, + "grad_norm": 2.136728864247421, + "language_loss": 0.70708907, + "learning_rate": 3.979549560846883e-06, + "loss": 0.72980225, + "num_input_tokens_seen": 26228110, + "step": 1234, + "time_per_iteration": 2.9646782875061035 + }, + { + "auxiliary_loss_clip": 0.01167486, + "auxiliary_loss_mlp": 0.01068879, + "balance_loss_clip": 1.0542618, + "balance_loss_mlp": 1.04265285, + "epoch": 0.07425221704494213, + "flos": 22781657790720.0, + "grad_norm": 1.7921102377369336, + "language_loss": 0.76852918, + "learning_rate": 3.979493970890478e-06, + "loss": 0.79089284, + "num_input_tokens_seen": 26247020, + "step": 1235, + "time_per_iteration": 2.820577621459961 + }, + { + "auxiliary_loss_clip": 0.01198028, + "auxiliary_loss_mlp": 0.01055883, + "balance_loss_clip": 1.05918813, + "balance_loss_mlp": 1.0321244, + "epoch": 0.0743123402976101, + "flos": 22272983337600.0, + "grad_norm": 2.3018318065058097, + "language_loss": 0.82748145, + "learning_rate": 3.979438305871464e-06, + "loss": 0.85002053, + "num_input_tokens_seen": 26265750, + "step": 1236, + "time_per_iteration": 2.6302287578582764 + }, + { + "auxiliary_loss_clip": 0.01154783, + "auxiliary_loss_mlp": 0.00782014, + "balance_loss_clip": 1.05519629, + "balance_loss_mlp": 1.00039148, + "epoch": 0.07437246355027807, + "flos": 29315029645440.0, + "grad_norm": 1.7985383717833268, + "language_loss": 0.7595011, + "learning_rate": 3.979382565791951e-06, + "loss": 0.77886909, + "num_input_tokens_seen": 26287905, + "step": 1237, + "time_per_iteration": 2.721931219100952 + }, + { + "auxiliary_loss_clip": 0.01135551, + "auxiliary_loss_mlp": 0.00783311, + "balance_loss_clip": 1.0505693, + "balance_loss_mlp": 1.00031757, + "epoch": 0.07443258680294604, + "flos": 31944732249600.0, + "grad_norm": 1.6915170784810407, + "language_loss": 0.77458763, + "learning_rate": 3.979326750654053e-06, + "loss": 0.79377621, + "num_input_tokens_seen": 26311795, + "step": 1238, + "time_per_iteration": 2.831620931625366 + }, + { + "auxiliary_loss_clip": 0.01177529, + "auxiliary_loss_mlp": 0.01057762, + "balance_loss_clip": 1.05673254, + "balance_loss_mlp": 1.03311002, + "epoch": 0.074492710055614, + "flos": 22675290641280.0, + "grad_norm": 1.9053364150897723, + "language_loss": 0.867737, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.89008987, + "num_input_tokens_seen": 26330330, + "step": 1239, + "time_per_iteration": 2.6697263717651367 + }, + { + "auxiliary_loss_clip": 0.01159844, + "auxiliary_loss_mlp": 0.01050954, + "balance_loss_clip": 1.05222142, + "balance_loss_mlp": 1.02532458, + "epoch": 0.07455283330828198, + "flos": 21284901037440.0, + "grad_norm": 26.978042105238785, + "language_loss": 0.89356089, + "learning_rate": 3.979214895211569e-06, + "loss": 0.91566885, + "num_input_tokens_seen": 26348865, + "step": 1240, + "time_per_iteration": 2.846013069152832 + }, + { + "auxiliary_loss_clip": 0.01174117, + "auxiliary_loss_mlp": 0.01063539, + "balance_loss_clip": 1.05857158, + "balance_loss_mlp": 1.03713393, + "epoch": 0.07461295656094995, + "flos": 24388408967040.0, + "grad_norm": 1.9346624045484253, + "language_loss": 0.88873678, + "learning_rate": 3.979158854911225e-06, + "loss": 0.91111326, + "num_input_tokens_seen": 26368210, + "step": 1241, + "time_per_iteration": 2.6926562786102295 + }, + { + "auxiliary_loss_clip": 0.01079637, + "auxiliary_loss_mlp": 0.01009562, + "balance_loss_clip": 1.03489435, + "balance_loss_mlp": 1.00405502, + "epoch": 0.07467307981361791, + "flos": 62109660574080.0, + "grad_norm": 0.8973011136706247, + "language_loss": 0.63067901, + "learning_rate": 3.979102739560979e-06, + "loss": 0.65157104, + "num_input_tokens_seen": 26424890, + "step": 1242, + "time_per_iteration": 3.298609972000122 + }, + { + "auxiliary_loss_clip": 0.01164269, + "auxiliary_loss_mlp": 0.01068833, + "balance_loss_clip": 1.05246222, + "balance_loss_mlp": 1.03819644, + "epoch": 0.07473320306628589, + "flos": 24863148046080.0, + "grad_norm": 3.87499965477456, + "language_loss": 0.62926078, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.65159178, + "num_input_tokens_seen": 26446405, + "step": 1243, + "time_per_iteration": 2.7774572372436523 + }, + { + "auxiliary_loss_clip": 0.01188864, + "auxiliary_loss_mlp": 0.01059918, + "balance_loss_clip": 1.05716145, + "balance_loss_mlp": 1.03499091, + "epoch": 0.07479332631895386, + "flos": 24897442556160.0, + "grad_norm": 1.6252135866538246, + "language_loss": 0.76259589, + "learning_rate": 3.978990283719296e-06, + "loss": 0.78508377, + "num_input_tokens_seen": 26466070, + "step": 1244, + "time_per_iteration": 2.714459180831909 + }, + { + "auxiliary_loss_clip": 0.01184345, + "auxiliary_loss_mlp": 0.00783076, + "balance_loss_clip": 1.0611167, + "balance_loss_mlp": 1.00038469, + "epoch": 0.07485344957162182, + "flos": 17815247821440.0, + "grad_norm": 5.636002853507256, + "language_loss": 0.69419599, + "learning_rate": 3.978933943232123e-06, + "loss": 0.71387023, + "num_input_tokens_seen": 26479350, + "step": 1245, + "time_per_iteration": 2.640895366668701 + }, + { + "auxiliary_loss_clip": 0.01203955, + "auxiliary_loss_mlp": 0.01062684, + "balance_loss_clip": 1.06098139, + "balance_loss_mlp": 1.0372088, + "epoch": 0.0749135728242898, + "flos": 25010202326400.0, + "grad_norm": 2.5525245798098757, + "language_loss": 0.88635457, + "learning_rate": 3.978877527703576e-06, + "loss": 0.90902102, + "num_input_tokens_seen": 26498255, + "step": 1246, + "time_per_iteration": 2.747765302658081 + }, + { + "auxiliary_loss_clip": 0.01212369, + "auxiliary_loss_mlp": 0.01077452, + "balance_loss_clip": 1.06102896, + "balance_loss_mlp": 1.049402, + "epoch": 0.07497369607695777, + "flos": 17822071405440.0, + "grad_norm": 2.675073323546491, + "language_loss": 0.8825295, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.90542769, + "num_input_tokens_seen": 26515375, + "step": 1247, + "time_per_iteration": 2.6810224056243896 + }, + { + "auxiliary_loss_clip": 0.0118495, + "auxiliary_loss_mlp": 0.01069489, + "balance_loss_clip": 1.06058884, + "balance_loss_mlp": 1.04383492, + "epoch": 0.07503381932962573, + "flos": 15121086261120.0, + "grad_norm": 2.620559853720615, + "language_loss": 0.64849806, + "learning_rate": 3.978764471530921e-06, + "loss": 0.67104244, + "num_input_tokens_seen": 26533595, + "step": 1248, + "time_per_iteration": 2.706862449645996 + }, + { + "auxiliary_loss_clip": 0.01181878, + "auxiliary_loss_mlp": 0.00782677, + "balance_loss_clip": 1.0575974, + "balance_loss_mlp": 1.0004611, + "epoch": 0.0750939425822937, + "flos": 12816734071680.0, + "grad_norm": 2.872208543000993, + "language_loss": 0.74216163, + "learning_rate": 3.978707830891102e-06, + "loss": 0.7618072, + "num_input_tokens_seen": 26549405, + "step": 1249, + "time_per_iteration": 4.309665679931641 + }, + { + "auxiliary_loss_clip": 0.01168375, + "auxiliary_loss_mlp": 0.01079691, + "balance_loss_clip": 1.0579834, + "balance_loss_mlp": 1.05296445, + "epoch": 0.07515406583496168, + "flos": 24206844695040.0, + "grad_norm": 2.679176110316805, + "language_loss": 0.82353318, + "learning_rate": 3.978651115218482e-06, + "loss": 0.84601378, + "num_input_tokens_seen": 26567200, + "step": 1250, + "time_per_iteration": 4.367432594299316 + }, + { + "auxiliary_loss_clip": 0.011507, + "auxiliary_loss_mlp": 0.01064103, + "balance_loss_clip": 1.05736125, + "balance_loss_mlp": 1.0380677, + "epoch": 0.07521418908762964, + "flos": 26688164215680.0, + "grad_norm": 2.015636709873133, + "language_loss": 0.6679548, + "learning_rate": 3.978594324515215e-06, + "loss": 0.69010288, + "num_input_tokens_seen": 26586190, + "step": 1251, + "time_per_iteration": 4.339111089706421 + }, + { + "auxiliary_loss_clip": 0.01061099, + "auxiliary_loss_mlp": 0.01007289, + "balance_loss_clip": 1.02992618, + "balance_loss_mlp": 1.00314093, + "epoch": 0.0752743123402976, + "flos": 59095140589440.0, + "grad_norm": 0.9014655793512963, + "language_loss": 0.7038399, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.72452378, + "num_input_tokens_seen": 26650710, + "step": 1252, + "time_per_iteration": 4.984445333480835 + }, + { + "auxiliary_loss_clip": 0.0120348, + "auxiliary_loss_mlp": 0.01071343, + "balance_loss_clip": 1.06016684, + "balance_loss_mlp": 1.04651129, + "epoch": 0.07533443559296558, + "flos": 23477032160640.0, + "grad_norm": 2.2789224049077226, + "language_loss": 0.79936707, + "learning_rate": 3.97848051802535e-06, + "loss": 0.82211524, + "num_input_tokens_seen": 26669000, + "step": 1253, + "time_per_iteration": 2.613696575164795 + }, + { + "auxiliary_loss_clip": 0.01165402, + "auxiliary_loss_mlp": 0.01062493, + "balance_loss_clip": 1.05703712, + "balance_loss_mlp": 1.03758967, + "epoch": 0.07539455884563355, + "flos": 20879110114560.0, + "grad_norm": 3.1057458778243263, + "language_loss": 0.93360364, + "learning_rate": 3.978423502243069e-06, + "loss": 0.95588255, + "num_input_tokens_seen": 26683075, + "step": 1254, + "time_per_iteration": 2.7332606315612793 + }, + { + "auxiliary_loss_clip": 0.011733, + "auxiliary_loss_mlp": 0.01064454, + "balance_loss_clip": 1.06050682, + "balance_loss_mlp": 1.03958726, + "epoch": 0.07545468209830151, + "flos": 27672906551040.0, + "grad_norm": 2.090631066181037, + "language_loss": 0.88087487, + "learning_rate": 3.97836641143877e-06, + "loss": 0.90325236, + "num_input_tokens_seen": 26701875, + "step": 1255, + "time_per_iteration": 2.713636875152588 + }, + { + "auxiliary_loss_clip": 0.01202338, + "auxiliary_loss_mlp": 0.01071467, + "balance_loss_clip": 1.06138325, + "balance_loss_mlp": 1.04531264, + "epoch": 0.0755148053509695, + "flos": 14136990370560.0, + "grad_norm": 1.9772348994273161, + "language_loss": 0.79305708, + "learning_rate": 3.978309245614618e-06, + "loss": 0.81579506, + "num_input_tokens_seen": 26719050, + "step": 1256, + "time_per_iteration": 2.688812255859375 + }, + { + "auxiliary_loss_clip": 0.01064506, + "auxiliary_loss_mlp": 0.01008663, + "balance_loss_clip": 1.0281384, + "balance_loss_mlp": 1.0043, + "epoch": 0.07557492860363746, + "flos": 58235257929600.0, + "grad_norm": 0.7721513084275832, + "language_loss": 0.58031851, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.6010502, + "num_input_tokens_seen": 26780650, + "step": 1257, + "time_per_iteration": 3.290971517562866 + }, + { + "auxiliary_loss_clip": 0.01154091, + "auxiliary_loss_mlp": 0.01065293, + "balance_loss_clip": 1.06175375, + "balance_loss_mlp": 1.04035461, + "epoch": 0.07563505185630542, + "flos": 24644380262400.0, + "grad_norm": 2.5700283098608026, + "language_loss": 0.90029764, + "learning_rate": 3.978194688915432e-06, + "loss": 0.92249143, + "num_input_tokens_seen": 26798725, + "step": 1258, + "time_per_iteration": 2.800297975540161 + }, + { + "auxiliary_loss_clip": 0.01169581, + "auxiliary_loss_mlp": 0.01064585, + "balance_loss_clip": 1.06184185, + "balance_loss_mlp": 1.03797793, + "epoch": 0.07569517510897339, + "flos": 15522998515200.0, + "grad_norm": 2.1868972302346377, + "language_loss": 0.81404132, + "learning_rate": 3.978137298044741e-06, + "loss": 0.83638299, + "num_input_tokens_seen": 26817005, + "step": 1259, + "time_per_iteration": 2.767717123031616 + }, + { + "auxiliary_loss_clip": 0.01194891, + "auxiliary_loss_mlp": 0.01062022, + "balance_loss_clip": 1.06317782, + "balance_loss_mlp": 1.03766739, + "epoch": 0.07575529836164137, + "flos": 22928532503040.0, + "grad_norm": 1.8876128491153832, + "language_loss": 0.7609086, + "learning_rate": 3.978079832162885e-06, + "loss": 0.78347778, + "num_input_tokens_seen": 26836655, + "step": 1260, + "time_per_iteration": 2.859339714050293 + }, + { + "auxiliary_loss_clip": 0.01160098, + "auxiliary_loss_mlp": 0.01068568, + "balance_loss_clip": 1.05432057, + "balance_loss_mlp": 1.04222322, + "epoch": 0.07581542161430933, + "flos": 19500428344320.0, + "grad_norm": 1.7028037437197219, + "language_loss": 0.84734851, + "learning_rate": 3.978022291272044e-06, + "loss": 0.86963522, + "num_input_tokens_seen": 26854925, + "step": 1261, + "time_per_iteration": 2.773087978363037 + }, + { + "auxiliary_loss_clip": 0.01212087, + "auxiliary_loss_mlp": 0.0106726, + "balance_loss_clip": 1.06821966, + "balance_loss_mlp": 1.04273915, + "epoch": 0.0758755448669773, + "flos": 24973465691520.0, + "grad_norm": 1.8668314773439494, + "language_loss": 0.82578814, + "learning_rate": 3.977964675374399e-06, + "loss": 0.84858155, + "num_input_tokens_seen": 26876170, + "step": 1262, + "time_per_iteration": 2.681764841079712 + }, + { + "auxiliary_loss_clip": 0.01206367, + "auxiliary_loss_mlp": 0.0106285, + "balance_loss_clip": 1.06333947, + "balance_loss_mlp": 1.03685009, + "epoch": 0.07593566811964528, + "flos": 22747973811840.0, + "grad_norm": 2.501362251414687, + "language_loss": 0.82448232, + "learning_rate": 3.977906984472136e-06, + "loss": 0.84717447, + "num_input_tokens_seen": 26895005, + "step": 1263, + "time_per_iteration": 2.6262786388397217 + }, + { + "auxiliary_loss_clip": 0.01166059, + "auxiliary_loss_mlp": 0.01068738, + "balance_loss_clip": 1.06484997, + "balance_loss_mlp": 1.04334641, + "epoch": 0.07599579137231324, + "flos": 23112395245440.0, + "grad_norm": 2.171520639750579, + "language_loss": 0.76149648, + "learning_rate": 3.977849218567442e-06, + "loss": 0.78384447, + "num_input_tokens_seen": 26913930, + "step": 1264, + "time_per_iteration": 2.7735466957092285 + }, + { + "auxiliary_loss_clip": 0.01181777, + "auxiliary_loss_mlp": 0.01061673, + "balance_loss_clip": 1.06183577, + "balance_loss_mlp": 1.03704381, + "epoch": 0.07605591462498121, + "flos": 14502058248960.0, + "grad_norm": 2.252731793921747, + "language_loss": 0.80919051, + "learning_rate": 3.977791377662507e-06, + "loss": 0.83162498, + "num_input_tokens_seen": 26931485, + "step": 1265, + "time_per_iteration": 2.6076793670654297 + }, + { + "auxiliary_loss_clip": 0.01143593, + "auxiliary_loss_mlp": 0.01068856, + "balance_loss_clip": 1.05383801, + "balance_loss_mlp": 1.0411638, + "epoch": 0.07611603787764919, + "flos": 23514199758720.0, + "grad_norm": 2.117217065332582, + "language_loss": 0.65244937, + "learning_rate": 3.977733461759524e-06, + "loss": 0.67457378, + "num_input_tokens_seen": 26951670, + "step": 1266, + "time_per_iteration": 2.714848041534424 + }, + { + "auxiliary_loss_clip": 0.0116364, + "auxiliary_loss_mlp": 0.01066982, + "balance_loss_clip": 1.05869627, + "balance_loss_mlp": 1.04194832, + "epoch": 0.07617616113031715, + "flos": 21507188353920.0, + "grad_norm": 2.0157381540709416, + "language_loss": 0.79570109, + "learning_rate": 3.977675470860691e-06, + "loss": 0.81800735, + "num_input_tokens_seen": 26970335, + "step": 1267, + "time_per_iteration": 2.692220687866211 + }, + { + "auxiliary_loss_clip": 0.01186526, + "auxiliary_loss_mlp": 0.01060572, + "balance_loss_clip": 1.06368709, + "balance_loss_mlp": 1.03644359, + "epoch": 0.07623628438298512, + "flos": 14573161221120.0, + "grad_norm": 2.573855585409162, + "language_loss": 0.72936547, + "learning_rate": 3.977617404968205e-06, + "loss": 0.75183642, + "num_input_tokens_seen": 26986025, + "step": 1268, + "time_per_iteration": 2.666487216949463 + }, + { + "auxiliary_loss_clip": 0.01189272, + "auxiliary_loss_mlp": 0.01056943, + "balance_loss_clip": 1.05925119, + "balance_loss_mlp": 1.03146791, + "epoch": 0.07629640763565308, + "flos": 14720395069440.0, + "grad_norm": 2.3531002902867018, + "language_loss": 0.82087409, + "learning_rate": 3.977559264084269e-06, + "loss": 0.84333622, + "num_input_tokens_seen": 27004045, + "step": 1269, + "time_per_iteration": 2.6196024417877197 + }, + { + "auxiliary_loss_clip": 0.01198264, + "auxiliary_loss_mlp": 0.01062408, + "balance_loss_clip": 1.06528163, + "balance_loss_mlp": 1.03656352, + "epoch": 0.07635653088832106, + "flos": 14902929008640.0, + "grad_norm": 2.6660741307472424, + "language_loss": 0.88614184, + "learning_rate": 3.977501048211088e-06, + "loss": 0.90874851, + "num_input_tokens_seen": 27022070, + "step": 1270, + "time_per_iteration": 2.6423919200897217 + }, + { + "auxiliary_loss_clip": 0.01195764, + "auxiliary_loss_mlp": 0.01062092, + "balance_loss_clip": 1.06443572, + "balance_loss_mlp": 1.0371294, + "epoch": 0.07641665414098903, + "flos": 26651571235200.0, + "grad_norm": 2.486841045046768, + "language_loss": 0.7104162, + "learning_rate": 3.977442757350869e-06, + "loss": 0.73299474, + "num_input_tokens_seen": 27041755, + "step": 1271, + "time_per_iteration": 2.6679437160491943 + }, + { + "auxiliary_loss_clip": 0.01157818, + "auxiliary_loss_mlp": 0.01068131, + "balance_loss_clip": 1.05973268, + "balance_loss_mlp": 1.04282308, + "epoch": 0.07647677739365699, + "flos": 25192808092800.0, + "grad_norm": 1.5691807400142836, + "language_loss": 0.82570392, + "learning_rate": 3.977384391505823e-06, + "loss": 0.84796339, + "num_input_tokens_seen": 27061540, + "step": 1272, + "time_per_iteration": 2.7613680362701416 + }, + { + "auxiliary_loss_clip": 0.01176176, + "auxiliary_loss_mlp": 0.00782751, + "balance_loss_clip": 1.05822372, + "balance_loss_mlp": 1.00051665, + "epoch": 0.07653690064632497, + "flos": 20558141159040.0, + "grad_norm": 1.811509476700225, + "language_loss": 0.79854733, + "learning_rate": 3.977325950678162e-06, + "loss": 0.81813657, + "num_input_tokens_seen": 27081395, + "step": 1273, + "time_per_iteration": 2.696317434310913 + }, + { + "auxiliary_loss_clip": 0.01185133, + "auxiliary_loss_mlp": 0.01064308, + "balance_loss_clip": 1.06556833, + "balance_loss_mlp": 1.03910685, + "epoch": 0.07659702389899294, + "flos": 22269320150400.0, + "grad_norm": 1.7399681078894738, + "language_loss": 0.81519866, + "learning_rate": 3.977267434870103e-06, + "loss": 0.83769304, + "num_input_tokens_seen": 27101175, + "step": 1274, + "time_per_iteration": 2.8570950031280518 + }, + { + "auxiliary_loss_clip": 0.0118748, + "auxiliary_loss_mlp": 0.01078696, + "balance_loss_clip": 1.06516898, + "balance_loss_mlp": 1.05164731, + "epoch": 0.0766571471516609, + "flos": 32636120209920.0, + "grad_norm": 2.6845981005996453, + "language_loss": 0.73083639, + "learning_rate": 3.977208844083865e-06, + "loss": 0.75349814, + "num_input_tokens_seen": 27124505, + "step": 1275, + "time_per_iteration": 2.75947904586792 + }, + { + "auxiliary_loss_clip": 0.0121081, + "auxiliary_loss_mlp": 0.01063745, + "balance_loss_clip": 1.06740415, + "balance_loss_mlp": 1.03694642, + "epoch": 0.07671727040432888, + "flos": 15267386355840.0, + "grad_norm": 2.828157953752124, + "language_loss": 0.79507053, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.81781602, + "num_input_tokens_seen": 27140960, + "step": 1276, + "time_per_iteration": 2.626683473587036 + }, + { + "auxiliary_loss_clip": 0.01198279, + "auxiliary_loss_mlp": 0.01058719, + "balance_loss_clip": 1.06486118, + "balance_loss_mlp": 1.03485298, + "epoch": 0.07677739365699685, + "flos": 28184094956160.0, + "grad_norm": 2.406514987231471, + "language_loss": 0.58915478, + "learning_rate": 3.97709143758574e-06, + "loss": 0.61172473, + "num_input_tokens_seen": 27160985, + "step": 1277, + "time_per_iteration": 2.6684958934783936 + }, + { + "auxiliary_loss_clip": 0.01201282, + "auxiliary_loss_mlp": 0.01064396, + "balance_loss_clip": 1.06430948, + "balance_loss_mlp": 1.03919542, + "epoch": 0.07683751690966481, + "flos": 18296128126080.0, + "grad_norm": 2.8024245322836046, + "language_loss": 0.74957907, + "learning_rate": 3.977032621878305e-06, + "loss": 0.77223587, + "num_input_tokens_seen": 27178390, + "step": 1278, + "time_per_iteration": 2.723675012588501 + }, + { + "auxiliary_loss_clip": 0.01160972, + "auxiliary_loss_mlp": 0.01063133, + "balance_loss_clip": 1.0584681, + "balance_loss_mlp": 1.0390408, + "epoch": 0.07689764016233278, + "flos": 21981101420160.0, + "grad_norm": 5.339853944094037, + "language_loss": 0.88594604, + "learning_rate": 3.976973731201596e-06, + "loss": 0.90818715, + "num_input_tokens_seen": 27197505, + "step": 1279, + "time_per_iteration": 2.655036211013794 + }, + { + "auxiliary_loss_clip": 0.01172627, + "auxiliary_loss_mlp": 0.01066586, + "balance_loss_clip": 1.06065845, + "balance_loss_mlp": 1.04077685, + "epoch": 0.07695776341500075, + "flos": 22235995307520.0, + "grad_norm": 2.4937131241937256, + "language_loss": 0.8300451, + "learning_rate": 3.976914765557845e-06, + "loss": 0.85243726, + "num_input_tokens_seen": 27214260, + "step": 1280, + "time_per_iteration": 2.7717065811157227 + }, + { + "auxiliary_loss_clip": 0.01194022, + "auxiliary_loss_mlp": 0.01066533, + "balance_loss_clip": 1.06593037, + "balance_loss_mlp": 1.04104638, + "epoch": 0.07701788666766872, + "flos": 16143750380160.0, + "grad_norm": 2.044864943195716, + "language_loss": 0.7581439, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.78074944, + "num_input_tokens_seen": 27232525, + "step": 1281, + "time_per_iteration": 2.7444865703582764 + }, + { + "auxiliary_loss_clip": 0.01170775, + "auxiliary_loss_mlp": 0.01062526, + "balance_loss_clip": 1.05879402, + "balance_loss_mlp": 1.03669322, + "epoch": 0.07707800992033668, + "flos": 19463045264640.0, + "grad_norm": 1.8925477349429178, + "language_loss": 0.75091648, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.77324951, + "num_input_tokens_seen": 27249800, + "step": 1282, + "time_per_iteration": 2.829145908355713 + }, + { + "auxiliary_loss_clip": 0.01213222, + "auxiliary_loss_mlp": 0.01071082, + "balance_loss_clip": 1.07007408, + "balance_loss_mlp": 1.04549992, + "epoch": 0.07713813317300466, + "flos": 18990281433600.0, + "grad_norm": 2.1558853998977527, + "language_loss": 0.83863324, + "learning_rate": 3.976737418846713e-06, + "loss": 0.8614763, + "num_input_tokens_seen": 27268895, + "step": 1283, + "time_per_iteration": 2.6955173015594482 + }, + { + "auxiliary_loss_clip": 0.0119621, + "auxiliary_loss_mlp": 0.01066889, + "balance_loss_clip": 1.06603825, + "balance_loss_mlp": 1.03925657, + "epoch": 0.07719825642567263, + "flos": 18113953322880.0, + "grad_norm": 2.520477290704422, + "language_loss": 0.75147104, + "learning_rate": 3.976678153357181e-06, + "loss": 0.77410209, + "num_input_tokens_seen": 27288180, + "step": 1284, + "time_per_iteration": 2.6589291095733643 + }, + { + "auxiliary_loss_clip": 0.01182212, + "auxiliary_loss_mlp": 0.01068485, + "balance_loss_clip": 1.06304765, + "balance_loss_mlp": 1.0438329, + "epoch": 0.0772583796783406, + "flos": 42194426993280.0, + "grad_norm": 5.2953301239297295, + "language_loss": 0.76224041, + "learning_rate": 3.976618812911817e-06, + "loss": 0.78474742, + "num_input_tokens_seen": 27311815, + "step": 1285, + "time_per_iteration": 2.847702741622925 + }, + { + "auxiliary_loss_clip": 0.01216302, + "auxiliary_loss_mlp": 0.01071451, + "balance_loss_clip": 1.07193899, + "balance_loss_mlp": 1.04729891, + "epoch": 0.07731850293100857, + "flos": 24753692327040.0, + "grad_norm": 2.0564733507641, + "language_loss": 0.84193194, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.86480945, + "num_input_tokens_seen": 27331890, + "step": 1286, + "time_per_iteration": 2.713963270187378 + }, + { + "auxiliary_loss_clip": 0.01180469, + "auxiliary_loss_mlp": 0.01061062, + "balance_loss_clip": 1.06331325, + "balance_loss_mlp": 1.03646958, + "epoch": 0.07737862618367654, + "flos": 17565884628480.0, + "grad_norm": 2.810253293244863, + "language_loss": 0.76899689, + "learning_rate": 3.97649990716259e-06, + "loss": 0.79141217, + "num_input_tokens_seen": 27348320, + "step": 1287, + "time_per_iteration": 2.669168472290039 + }, + { + "auxiliary_loss_clip": 0.011763, + "auxiliary_loss_mlp": 0.01061108, + "balance_loss_clip": 1.05891848, + "balance_loss_mlp": 1.03696775, + "epoch": 0.0774387494363445, + "flos": 25627147349760.0, + "grad_norm": 1.6525652726351308, + "language_loss": 0.84699571, + "learning_rate": 3.976440341863237e-06, + "loss": 0.86936986, + "num_input_tokens_seen": 27367670, + "step": 1288, + "time_per_iteration": 2.7794599533081055 + }, + { + "auxiliary_loss_clip": 0.01206182, + "auxiliary_loss_mlp": 0.0106604, + "balance_loss_clip": 1.06214797, + "balance_loss_mlp": 1.04203176, + "epoch": 0.07749887268901248, + "flos": 12239865648000.0, + "grad_norm": 2.0424090794957523, + "language_loss": 0.85576034, + "learning_rate": 3.976380701617068e-06, + "loss": 0.87848258, + "num_input_tokens_seen": 27385485, + "step": 1289, + "time_per_iteration": 4.232934236526489 + }, + { + "auxiliary_loss_clip": 0.01207527, + "auxiliary_loss_mlp": 0.01052975, + "balance_loss_clip": 1.06487668, + "balance_loss_mlp": 1.0291574, + "epoch": 0.07755899594168045, + "flos": 25081736261760.0, + "grad_norm": 2.840721047922519, + "language_loss": 0.85548425, + "learning_rate": 3.976320986426344e-06, + "loss": 0.87808931, + "num_input_tokens_seen": 27405110, + "step": 1290, + "time_per_iteration": 4.218302965164185 + }, + { + "auxiliary_loss_clip": 0.0117374, + "auxiliary_loss_mlp": 0.01066698, + "balance_loss_clip": 1.06411862, + "balance_loss_mlp": 1.04041266, + "epoch": 0.07761911919434841, + "flos": 14246410176000.0, + "grad_norm": 2.3756178078405976, + "language_loss": 0.91390574, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.93631011, + "num_input_tokens_seen": 27422855, + "step": 1291, + "time_per_iteration": 4.468304395675659 + }, + { + "auxiliary_loss_clip": 0.01081301, + "auxiliary_loss_mlp": 0.01043026, + "balance_loss_clip": 1.04092944, + "balance_loss_mlp": 1.03894901, + "epoch": 0.07767924244701638, + "flos": 67237202954880.0, + "grad_norm": 0.8973948861970446, + "language_loss": 0.65065891, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.67190224, + "num_input_tokens_seen": 27487190, + "step": 1292, + "time_per_iteration": 3.3142755031585693 + }, + { + "auxiliary_loss_clip": 0.01195822, + "auxiliary_loss_mlp": 0.01062751, + "balance_loss_clip": 1.06527543, + "balance_loss_mlp": 1.03846776, + "epoch": 0.07773936569968436, + "flos": 28550635292160.0, + "grad_norm": 1.7595227960044768, + "language_loss": 0.87530363, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.89788938, + "num_input_tokens_seen": 27510465, + "step": 1293, + "time_per_iteration": 2.801603078842163 + }, + { + "auxiliary_loss_clip": 0.01116633, + "auxiliary_loss_mlp": 0.01078659, + "balance_loss_clip": 1.05041039, + "balance_loss_mlp": 1.05012059, + "epoch": 0.07779948895235232, + "flos": 27490264871040.0, + "grad_norm": 2.2898991349098528, + "language_loss": 0.84518278, + "learning_rate": 3.976081376263239e-06, + "loss": 0.8671357, + "num_input_tokens_seen": 27528645, + "step": 1294, + "time_per_iteration": 2.898597002029419 + }, + { + "auxiliary_loss_clip": 0.01158796, + "auxiliary_loss_mlp": 0.01059505, + "balance_loss_clip": 1.05967593, + "balance_loss_mlp": 1.0342207, + "epoch": 0.07785961220502029, + "flos": 18223301301120.0, + "grad_norm": 2.7292442592472073, + "language_loss": 0.79365373, + "learning_rate": 3.976021286383768e-06, + "loss": 0.81583679, + "num_input_tokens_seen": 27546165, + "step": 1295, + "time_per_iteration": 2.8481552600860596 + }, + { + "auxiliary_loss_clip": 0.01155886, + "auxiliary_loss_mlp": 0.01061351, + "balance_loss_clip": 1.06015158, + "balance_loss_mlp": 1.0356493, + "epoch": 0.07791973545768827, + "flos": 24608218245120.0, + "grad_norm": 3.472740252224496, + "language_loss": 0.88351864, + "learning_rate": 3.975961121573371e-06, + "loss": 0.90569103, + "num_input_tokens_seen": 27566520, + "step": 1296, + "time_per_iteration": 2.697831392288208 + }, + { + "auxiliary_loss_clip": 0.0120756, + "auxiliary_loss_mlp": 0.01074146, + "balance_loss_clip": 1.06552935, + "balance_loss_mlp": 1.04791999, + "epoch": 0.07797985871035623, + "flos": 14282069402880.0, + "grad_norm": 2.384603846473911, + "language_loss": 0.9625901, + "learning_rate": 3.9759008818343305e-06, + "loss": 0.98540717, + "num_input_tokens_seen": 27581960, + "step": 1297, + "time_per_iteration": 2.62660551071167 + }, + { + "auxiliary_loss_clip": 0.01175852, + "auxiliary_loss_mlp": 0.01069298, + "balance_loss_clip": 1.06147313, + "balance_loss_mlp": 1.04517019, + "epoch": 0.0780399819630242, + "flos": 26610453141120.0, + "grad_norm": 2.15152040651991, + "language_loss": 0.7600193, + "learning_rate": 3.97584056716893e-06, + "loss": 0.78247076, + "num_input_tokens_seen": 27601415, + "step": 1298, + "time_per_iteration": 2.8040499687194824 + }, + { + "auxiliary_loss_clip": 0.0114505, + "auxiliary_loss_mlp": 0.00783981, + "balance_loss_clip": 1.05864501, + "balance_loss_mlp": 1.0006063, + "epoch": 0.07810010521569218, + "flos": 21834514016640.0, + "grad_norm": 1.6697657327886877, + "language_loss": 0.8097105, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.82900077, + "num_input_tokens_seen": 27621490, + "step": 1299, + "time_per_iteration": 2.7667653560638428 + }, + { + "auxiliary_loss_clip": 0.01162638, + "auxiliary_loss_mlp": 0.01064395, + "balance_loss_clip": 1.06191885, + "balance_loss_mlp": 1.0393368, + "epoch": 0.07816022846836014, + "flos": 25081233471360.0, + "grad_norm": 1.9748762517467437, + "language_loss": 0.86755943, + "learning_rate": 3.975719713068202e-06, + "loss": 0.8898297, + "num_input_tokens_seen": 27640600, + "step": 1300, + "time_per_iteration": 2.7819204330444336 + }, + { + "auxiliary_loss_clip": 0.0120807, + "auxiliary_loss_mlp": 0.01056805, + "balance_loss_clip": 1.06663537, + "balance_loss_mlp": 1.03180683, + "epoch": 0.0782203517210281, + "flos": 40917515431680.0, + "grad_norm": 3.040560411644486, + "language_loss": 0.71822268, + "learning_rate": 3.975659173637458e-06, + "loss": 0.74087137, + "num_input_tokens_seen": 27663070, + "step": 1301, + "time_per_iteration": 2.845107316970825 + }, + { + "auxiliary_loss_clip": 0.01196566, + "auxiliary_loss_mlp": 0.01075534, + "balance_loss_clip": 1.06426311, + "balance_loss_mlp": 1.05100083, + "epoch": 0.07828047497369607, + "flos": 41172014269440.0, + "grad_norm": 1.6425838754876312, + "language_loss": 0.70782864, + "learning_rate": 3.97559855928952e-06, + "loss": 0.73054957, + "num_input_tokens_seen": 27686425, + "step": 1302, + "time_per_iteration": 2.898069381713867 + }, + { + "auxiliary_loss_clip": 0.01162032, + "auxiliary_loss_mlp": 0.00783256, + "balance_loss_clip": 1.06019354, + "balance_loss_mlp": 1.00062823, + "epoch": 0.07834059822636405, + "flos": 23508130360320.0, + "grad_norm": 2.067506704059933, + "language_loss": 0.82100385, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.84045678, + "num_input_tokens_seen": 27704900, + "step": 1303, + "time_per_iteration": 2.7862839698791504 + }, + { + "auxiliary_loss_clip": 0.01191742, + "auxiliary_loss_mlp": 0.01074585, + "balance_loss_clip": 1.06583321, + "balance_loss_mlp": 1.04908574, + "epoch": 0.07840072147903202, + "flos": 20193899293440.0, + "grad_norm": 1.8830773419754625, + "language_loss": 0.75206572, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.77472901, + "num_input_tokens_seen": 27724890, + "step": 1304, + "time_per_iteration": 2.7380170822143555 + }, + { + "auxiliary_loss_clip": 0.01211207, + "auxiliary_loss_mlp": 0.01074343, + "balance_loss_clip": 1.07114935, + "balance_loss_mlp": 1.04922605, + "epoch": 0.07846084473169998, + "flos": 21360816432000.0, + "grad_norm": 1.6118444643214749, + "language_loss": 0.76141047, + "learning_rate": 3.975416266765542e-06, + "loss": 0.784266, + "num_input_tokens_seen": 27743115, + "step": 1305, + "time_per_iteration": 2.6788928508758545 + }, + { + "auxiliary_loss_clip": 0.01137547, + "auxiliary_loss_mlp": 0.01064795, + "balance_loss_clip": 1.05611205, + "balance_loss_mlp": 1.04021358, + "epoch": 0.07852096798436796, + "flos": 25410965345280.0, + "grad_norm": 1.9541638070229452, + "language_loss": 0.85011744, + "learning_rate": 3.975355352771841e-06, + "loss": 0.87214082, + "num_input_tokens_seen": 27763570, + "step": 1306, + "time_per_iteration": 3.048137903213501 + }, + { + "auxiliary_loss_clip": 0.01194779, + "auxiliary_loss_mlp": 0.01049822, + "balance_loss_clip": 1.06754708, + "balance_loss_mlp": 1.02668333, + "epoch": 0.07858109123703592, + "flos": 24571481610240.0, + "grad_norm": 6.108459548145404, + "language_loss": 0.90882134, + "learning_rate": 3.975294363872468e-06, + "loss": 0.93126732, + "num_input_tokens_seen": 27780030, + "step": 1307, + "time_per_iteration": 3.1597135066986084 + }, + { + "auxiliary_loss_clip": 0.01145989, + "auxiliary_loss_mlp": 0.01060478, + "balance_loss_clip": 1.05529833, + "balance_loss_mlp": 1.034729, + "epoch": 0.07864121448970389, + "flos": 20698874645760.0, + "grad_norm": 3.4991416096159136, + "language_loss": 0.83695096, + "learning_rate": 3.975233300069735e-06, + "loss": 0.85901558, + "num_input_tokens_seen": 27796225, + "step": 1308, + "time_per_iteration": 2.749174118041992 + }, + { + "auxiliary_loss_clip": 0.01151044, + "auxiliary_loss_mlp": 0.01061966, + "balance_loss_clip": 1.05445218, + "balance_loss_mlp": 1.03789735, + "epoch": 0.07870133774237187, + "flos": 22966526113920.0, + "grad_norm": 1.7092634116882437, + "language_loss": 0.77521002, + "learning_rate": 3.975172161365958e-06, + "loss": 0.7973401, + "num_input_tokens_seen": 27815975, + "step": 1309, + "time_per_iteration": 2.752854108810425 + }, + { + "auxiliary_loss_clip": 0.01200102, + "auxiliary_loss_mlp": 0.01070583, + "balance_loss_clip": 1.06396675, + "balance_loss_mlp": 1.04449987, + "epoch": 0.07876146099503983, + "flos": 18842832103680.0, + "grad_norm": 1.8729662604656268, + "language_loss": 0.80561006, + "learning_rate": 3.975110947763453e-06, + "loss": 0.82831693, + "num_input_tokens_seen": 27832255, + "step": 1310, + "time_per_iteration": 2.6966710090637207 + }, + { + "auxiliary_loss_clip": 0.01173381, + "auxiliary_loss_mlp": 0.0078245, + "balance_loss_clip": 1.06193507, + "balance_loss_mlp": 1.00060987, + "epoch": 0.0788215842477078, + "flos": 23805794367360.0, + "grad_norm": 1.796715978968241, + "language_loss": 0.73187977, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.75143808, + "num_input_tokens_seen": 27852180, + "step": 1311, + "time_per_iteration": 2.7588090896606445 + }, + { + "auxiliary_loss_clip": 0.01188438, + "auxiliary_loss_mlp": 0.01078546, + "balance_loss_clip": 1.06358969, + "balance_loss_mlp": 1.05342865, + "epoch": 0.07888170750037576, + "flos": 21579907438080.0, + "grad_norm": 1.7490617386556226, + "language_loss": 0.86002982, + "learning_rate": 3.974988295871553e-06, + "loss": 0.88269973, + "num_input_tokens_seen": 27871435, + "step": 1312, + "time_per_iteration": 2.6969683170318604 + }, + { + "auxiliary_loss_clip": 0.01178338, + "auxiliary_loss_mlp": 0.01059112, + "balance_loss_clip": 1.06324685, + "balance_loss_mlp": 1.03633142, + "epoch": 0.07894183075304374, + "flos": 19864849777920.0, + "grad_norm": 1.825664315845032, + "language_loss": 0.82087892, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.84325337, + "num_input_tokens_seen": 27890625, + "step": 1313, + "time_per_iteration": 2.6936304569244385 + }, + { + "auxiliary_loss_clip": 0.01184798, + "auxiliary_loss_mlp": 0.00783631, + "balance_loss_clip": 1.06229842, + "balance_loss_mlp": 1.00053823, + "epoch": 0.07900195400571171, + "flos": 16143463071360.0, + "grad_norm": 2.837190319075622, + "language_loss": 0.73569417, + "learning_rate": 3.97486534441264e-06, + "loss": 0.75537837, + "num_input_tokens_seen": 27906530, + "step": 1314, + "time_per_iteration": 2.653505325317383 + }, + { + "auxiliary_loss_clip": 0.01154585, + "auxiliary_loss_mlp": 0.00782352, + "balance_loss_clip": 1.05730104, + "balance_loss_mlp": 1.00044668, + "epoch": 0.07906207725837967, + "flos": 23730417676800.0, + "grad_norm": 1.6153694611764058, + "language_loss": 0.79490477, + "learning_rate": 3.974803756351379e-06, + "loss": 0.81427419, + "num_input_tokens_seen": 27926725, + "step": 1315, + "time_per_iteration": 2.797306776046753 + }, + { + "auxiliary_loss_clip": 0.01189107, + "auxiliary_loss_mlp": 0.01060743, + "balance_loss_clip": 1.05841756, + "balance_loss_mlp": 1.03487444, + "epoch": 0.07912220051104765, + "flos": 24315905364480.0, + "grad_norm": 1.6362349035659796, + "language_loss": 0.73546493, + "learning_rate": 3.974742093405362e-06, + "loss": 0.75796348, + "num_input_tokens_seen": 27947875, + "step": 1316, + "time_per_iteration": 2.688997507095337 + }, + { + "auxiliary_loss_clip": 0.01162651, + "auxiliary_loss_mlp": 0.01066617, + "balance_loss_clip": 1.05845332, + "balance_loss_mlp": 1.0418098, + "epoch": 0.07918232376371562, + "flos": 18880035615360.0, + "grad_norm": 2.157376902111077, + "language_loss": 0.65540409, + "learning_rate": 3.974680355576927e-06, + "loss": 0.67769682, + "num_input_tokens_seen": 27965040, + "step": 1317, + "time_per_iteration": 2.6998519897460938 + }, + { + "auxiliary_loss_clip": 0.01177674, + "auxiliary_loss_mlp": 0.01068635, + "balance_loss_clip": 1.06280386, + "balance_loss_mlp": 1.0428021, + "epoch": 0.07924244701638358, + "flos": 27376284038400.0, + "grad_norm": 2.382161374765057, + "language_loss": 0.73105192, + "learning_rate": 3.974618542868415e-06, + "loss": 0.75351495, + "num_input_tokens_seen": 27985330, + "step": 1318, + "time_per_iteration": 2.8350789546966553 + }, + { + "auxiliary_loss_clip": 0.01139638, + "auxiliary_loss_mlp": 0.01058798, + "balance_loss_clip": 1.05582452, + "balance_loss_mlp": 1.03515935, + "epoch": 0.07930257026905156, + "flos": 25120340403840.0, + "grad_norm": 2.635941883481154, + "language_loss": 0.90381306, + "learning_rate": 3.97455665528217e-06, + "loss": 0.92579746, + "num_input_tokens_seen": 28007615, + "step": 1319, + "time_per_iteration": 2.8553895950317383 + }, + { + "auxiliary_loss_clip": 0.01175059, + "auxiliary_loss_mlp": 0.01055333, + "balance_loss_clip": 1.05662942, + "balance_loss_mlp": 1.03122926, + "epoch": 0.07936269352171953, + "flos": 21834478103040.0, + "grad_norm": 1.9449065990449943, + "language_loss": 0.80134505, + "learning_rate": 3.974494692820539e-06, + "loss": 0.82364893, + "num_input_tokens_seen": 28027765, + "step": 1320, + "time_per_iteration": 2.6651997566223145 + }, + { + "auxiliary_loss_clip": 0.01181808, + "auxiliary_loss_mlp": 0.01060151, + "balance_loss_clip": 1.06380332, + "balance_loss_mlp": 1.03657198, + "epoch": 0.07942281677438749, + "flos": 16939889377920.0, + "grad_norm": 2.1078540484546746, + "language_loss": 0.6901226, + "learning_rate": 3.974432655485872e-06, + "loss": 0.71254218, + "num_input_tokens_seen": 28044225, + "step": 1321, + "time_per_iteration": 2.6500401496887207 + }, + { + "auxiliary_loss_clip": 0.01189002, + "auxiliary_loss_mlp": 0.01060598, + "balance_loss_clip": 1.06469131, + "balance_loss_mlp": 1.03688753, + "epoch": 0.07948294002705546, + "flos": 18986941468800.0, + "grad_norm": 1.9310950096267907, + "language_loss": 0.8359012, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.85839725, + "num_input_tokens_seen": 28062915, + "step": 1322, + "time_per_iteration": 2.684978723526001 + }, + { + "auxiliary_loss_clip": 0.01202147, + "auxiliary_loss_mlp": 0.01057117, + "balance_loss_clip": 1.06135976, + "balance_loss_mlp": 1.03304851, + "epoch": 0.07954306327972344, + "flos": 21653452535040.0, + "grad_norm": 2.128262121046283, + "language_loss": 0.90555447, + "learning_rate": 3.974308356206838e-06, + "loss": 0.92814714, + "num_input_tokens_seen": 28082175, + "step": 1323, + "time_per_iteration": 2.6192240715026855 + }, + { + "auxiliary_loss_clip": 0.01164151, + "auxiliary_loss_mlp": 0.01062303, + "balance_loss_clip": 1.06272292, + "balance_loss_mlp": 1.03809166, + "epoch": 0.0796031865323914, + "flos": 23220270766080.0, + "grad_norm": 1.8373443631598505, + "language_loss": 0.82521075, + "learning_rate": 3.974246094267187e-06, + "loss": 0.84747529, + "num_input_tokens_seen": 28102645, + "step": 1324, + "time_per_iteration": 2.8283956050872803 + }, + { + "auxiliary_loss_clip": 0.01180787, + "auxiliary_loss_mlp": 0.01053463, + "balance_loss_clip": 1.06256735, + "balance_loss_mlp": 1.02834535, + "epoch": 0.07966330978505937, + "flos": 23294534135040.0, + "grad_norm": 2.119290865165494, + "language_loss": 0.79162025, + "learning_rate": 3.974183757463925e-06, + "loss": 0.8139627, + "num_input_tokens_seen": 28122805, + "step": 1325, + "time_per_iteration": 2.6996092796325684 + }, + { + "auxiliary_loss_clip": 0.01119286, + "auxiliary_loss_mlp": 0.00785175, + "balance_loss_clip": 1.04844928, + "balance_loss_mlp": 1.00035501, + "epoch": 0.07972343303772735, + "flos": 18363783392640.0, + "grad_norm": 2.2621745256944448, + "language_loss": 0.88038248, + "learning_rate": 3.974121345799418e-06, + "loss": 0.89942712, + "num_input_tokens_seen": 28140530, + "step": 1326, + "time_per_iteration": 2.881410837173462 + }, + { + "auxiliary_loss_clip": 0.012, + "auxiliary_loss_mlp": 0.01056877, + "balance_loss_clip": 1.06257951, + "balance_loss_mlp": 1.03168797, + "epoch": 0.07978355629039531, + "flos": 21762513204480.0, + "grad_norm": 1.8538865301137586, + "language_loss": 0.8328709, + "learning_rate": 3.974058859276032e-06, + "loss": 0.85543966, + "num_input_tokens_seen": 28159640, + "step": 1327, + "time_per_iteration": 2.7277982234954834 + }, + { + "auxiliary_loss_clip": 0.01207207, + "auxiliary_loss_mlp": 0.01056886, + "balance_loss_clip": 1.06532371, + "balance_loss_mlp": 1.03223395, + "epoch": 0.07984367954306328, + "flos": 18551309322240.0, + "grad_norm": 2.3216818645515636, + "language_loss": 0.78599, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.80863088, + "num_input_tokens_seen": 28177050, + "step": 1328, + "time_per_iteration": 4.2137157917022705 + }, + { + "auxiliary_loss_clip": 0.01201442, + "auxiliary_loss_mlp": 0.01052053, + "balance_loss_clip": 1.06778932, + "balance_loss_mlp": 1.02722156, + "epoch": 0.07990380279573125, + "flos": 16904050583040.0, + "grad_norm": 4.209530911932697, + "language_loss": 0.73918134, + "learning_rate": 3.973933661662101e-06, + "loss": 0.76171625, + "num_input_tokens_seen": 28193245, + "step": 1329, + "time_per_iteration": 5.853717565536499 + }, + { + "auxiliary_loss_clip": 0.01169795, + "auxiliary_loss_mlp": 0.01064631, + "balance_loss_clip": 1.06039059, + "balance_loss_mlp": 1.04069376, + "epoch": 0.07996392604839922, + "flos": 24098358643200.0, + "grad_norm": 1.6102544328312476, + "language_loss": 0.81743932, + "learning_rate": 3.973870950576305e-06, + "loss": 0.83978355, + "num_input_tokens_seen": 28213570, + "step": 1330, + "time_per_iteration": 4.307915687561035 + }, + { + "auxiliary_loss_clip": 0.01205148, + "auxiliary_loss_mlp": 0.00780735, + "balance_loss_clip": 1.06445098, + "balance_loss_mlp": 1.00030971, + "epoch": 0.08002404930106718, + "flos": 14278729438080.0, + "grad_norm": 3.0935981151455865, + "language_loss": 0.88962448, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.90948325, + "num_input_tokens_seen": 28229980, + "step": 1331, + "time_per_iteration": 2.645198345184326 + }, + { + "auxiliary_loss_clip": 0.01196019, + "auxiliary_loss_mlp": 0.00781409, + "balance_loss_clip": 1.05950165, + "balance_loss_mlp": 1.00032377, + "epoch": 0.08008417255373516, + "flos": 40406219285760.0, + "grad_norm": 1.8933982437719925, + "language_loss": 0.7335732, + "learning_rate": 3.973745303858942e-06, + "loss": 0.75334752, + "num_input_tokens_seen": 28253840, + "step": 1332, + "time_per_iteration": 2.792128562927246 + }, + { + "auxiliary_loss_clip": 0.01180359, + "auxiliary_loss_mlp": 0.01055118, + "balance_loss_clip": 1.06217384, + "balance_loss_mlp": 1.03216982, + "epoch": 0.08014429580640313, + "flos": 18478913460480.0, + "grad_norm": 1.7464568676953767, + "language_loss": 0.82765031, + "learning_rate": 3.973682368232138e-06, + "loss": 0.85000509, + "num_input_tokens_seen": 28271675, + "step": 1333, + "time_per_iteration": 2.635579824447632 + }, + { + "auxiliary_loss_clip": 0.01160554, + "auxiliary_loss_mlp": 0.01059025, + "balance_loss_clip": 1.05944169, + "balance_loss_mlp": 1.03502798, + "epoch": 0.0802044190590711, + "flos": 22053461368320.0, + "grad_norm": 2.677615191761892, + "language_loss": 0.74862051, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.77081633, + "num_input_tokens_seen": 28291850, + "step": 1334, + "time_per_iteration": 2.8150298595428467 + }, + { + "auxiliary_loss_clip": 0.01176175, + "auxiliary_loss_mlp": 0.01063593, + "balance_loss_clip": 1.06460369, + "balance_loss_mlp": 1.04010868, + "epoch": 0.08026454231173906, + "flos": 24572128055040.0, + "grad_norm": 1.8723728369534094, + "language_loss": 0.79970533, + "learning_rate": 3.973556272454221e-06, + "loss": 0.82210302, + "num_input_tokens_seen": 28310780, + "step": 1335, + "time_per_iteration": 2.6858503818511963 + }, + { + "auxiliary_loss_clip": 0.01068232, + "auxiliary_loss_mlp": 0.01020395, + "balance_loss_clip": 1.04101062, + "balance_loss_mlp": 1.01693749, + "epoch": 0.08032466556440704, + "flos": 52581841459200.0, + "grad_norm": 0.7491611763509133, + "language_loss": 0.56056821, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58145452, + "num_input_tokens_seen": 28369985, + "step": 1336, + "time_per_iteration": 3.324230670928955 + }, + { + "auxiliary_loss_clip": 0.01179495, + "auxiliary_loss_mlp": 0.01064433, + "balance_loss_clip": 1.06005239, + "balance_loss_mlp": 1.04149771, + "epoch": 0.080384788817075, + "flos": 23842602829440.0, + "grad_norm": 2.8990759307469256, + "language_loss": 0.67587668, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.69831598, + "num_input_tokens_seen": 28388670, + "step": 1337, + "time_per_iteration": 2.755451202392578 + }, + { + "auxiliary_loss_clip": 0.01171763, + "auxiliary_loss_mlp": 0.0107788, + "balance_loss_clip": 1.06270492, + "balance_loss_mlp": 1.05304837, + "epoch": 0.08044491206974297, + "flos": 25300719527040.0, + "grad_norm": 1.9421039451316542, + "language_loss": 0.86847901, + "learning_rate": 3.973366567512453e-06, + "loss": 0.89097536, + "num_input_tokens_seen": 28411845, + "step": 1338, + "time_per_iteration": 2.758418560028076 + }, + { + "auxiliary_loss_clip": 0.01136344, + "auxiliary_loss_mlp": 0.01082295, + "balance_loss_clip": 1.04883683, + "balance_loss_mlp": 1.05596161, + "epoch": 0.08050503532241095, + "flos": 22376549226240.0, + "grad_norm": 2.4557709650828157, + "language_loss": 0.87217385, + "learning_rate": 3.973303182868147e-06, + "loss": 0.89436018, + "num_input_tokens_seen": 28427875, + "step": 1339, + "time_per_iteration": 2.72682785987854 + }, + { + "auxiliary_loss_clip": 0.01188632, + "auxiliary_loss_mlp": 0.01055953, + "balance_loss_clip": 1.06334567, + "balance_loss_mlp": 1.03417385, + "epoch": 0.08056515857507891, + "flos": 18369421827840.0, + "grad_norm": 10.603370056653041, + "language_loss": 0.89504963, + "learning_rate": 3.973239723395988e-06, + "loss": 0.91749549, + "num_input_tokens_seen": 28446615, + "step": 1340, + "time_per_iteration": 2.639601469039917 + }, + { + "auxiliary_loss_clip": 0.01080107, + "auxiliary_loss_mlp": 0.01012224, + "balance_loss_clip": 1.02943289, + "balance_loss_mlp": 1.00850451, + "epoch": 0.08062528182774688, + "flos": 51348130980480.0, + "grad_norm": 0.8861598592181924, + "language_loss": 0.64834231, + "learning_rate": 3.97317618909838e-06, + "loss": 0.66926563, + "num_input_tokens_seen": 28505290, + "step": 1341, + "time_per_iteration": 3.0625648498535156 + }, + { + "auxiliary_loss_clip": 0.01197538, + "auxiliary_loss_mlp": 0.01061885, + "balance_loss_clip": 1.0628854, + "balance_loss_mlp": 1.0364095, + "epoch": 0.08068540508041486, + "flos": 17599712261760.0, + "grad_norm": 3.3156125209451286, + "language_loss": 0.89471233, + "learning_rate": 3.973112579977733e-06, + "loss": 0.9173066, + "num_input_tokens_seen": 28522735, + "step": 1342, + "time_per_iteration": 2.6123783588409424 + }, + { + "auxiliary_loss_clip": 0.01177687, + "auxiliary_loss_mlp": 0.01062063, + "balance_loss_clip": 1.0644995, + "balance_loss_mlp": 1.03818512, + "epoch": 0.08074552833308282, + "flos": 10561185486720.0, + "grad_norm": 2.2904075751929365, + "language_loss": 0.76354575, + "learning_rate": 3.973048896036459e-06, + "loss": 0.78594327, + "num_input_tokens_seen": 28539460, + "step": 1343, + "time_per_iteration": 2.7564918994903564 + }, + { + "auxiliary_loss_clip": 0.01064182, + "auxiliary_loss_mlp": 0.01010488, + "balance_loss_clip": 1.02542567, + "balance_loss_mlp": 1.0066731, + "epoch": 0.08080565158575079, + "flos": 60840254954880.0, + "grad_norm": 0.8071281523255156, + "language_loss": 0.57418531, + "learning_rate": 3.972985137276974e-06, + "loss": 0.59493202, + "num_input_tokens_seen": 28599855, + "step": 1344, + "time_per_iteration": 3.170443058013916 + }, + { + "auxiliary_loss_clip": 0.01158029, + "auxiliary_loss_mlp": 0.01063108, + "balance_loss_clip": 1.05839872, + "balance_loss_mlp": 1.03846788, + "epoch": 0.08086577483841875, + "flos": 18332361970560.0, + "grad_norm": 2.5953739346171676, + "language_loss": 0.86569476, + "learning_rate": 3.972921303701695e-06, + "loss": 0.88790607, + "num_input_tokens_seen": 28617585, + "step": 1345, + "time_per_iteration": 2.765254497528076 + }, + { + "auxiliary_loss_clip": 0.01203428, + "auxiliary_loss_mlp": 0.01057879, + "balance_loss_clip": 1.06629944, + "balance_loss_mlp": 1.03603959, + "epoch": 0.08092589809108673, + "flos": 21543601766400.0, + "grad_norm": 1.8653844332842058, + "language_loss": 0.87646407, + "learning_rate": 3.972857395313042e-06, + "loss": 0.89907712, + "num_input_tokens_seen": 28636355, + "step": 1346, + "time_per_iteration": 2.655611991882324 + }, + { + "auxiliary_loss_clip": 0.01191822, + "auxiliary_loss_mlp": 0.0105414, + "balance_loss_clip": 1.06450033, + "balance_loss_mlp": 1.03047693, + "epoch": 0.0809860213437547, + "flos": 22128012046080.0, + "grad_norm": 1.7047476553504466, + "language_loss": 0.9298563, + "learning_rate": 3.972793412113439e-06, + "loss": 0.95231593, + "num_input_tokens_seen": 28656260, + "step": 1347, + "time_per_iteration": 2.718355417251587 + }, + { + "auxiliary_loss_clip": 0.01188696, + "auxiliary_loss_mlp": 0.01066703, + "balance_loss_clip": 1.06260633, + "balance_loss_mlp": 1.04144263, + "epoch": 0.08104614459642266, + "flos": 21725489260800.0, + "grad_norm": 1.9307860049130865, + "language_loss": 0.89506733, + "learning_rate": 3.972729354105312e-06, + "loss": 0.91762137, + "num_input_tokens_seen": 28675865, + "step": 1348, + "time_per_iteration": 2.763735771179199 + }, + { + "auxiliary_loss_clip": 0.01137961, + "auxiliary_loss_mlp": 0.01059733, + "balance_loss_clip": 1.06026649, + "balance_loss_mlp": 1.03730989, + "epoch": 0.08110626784909064, + "flos": 23951878980480.0, + "grad_norm": 1.6214351378274148, + "language_loss": 0.76906884, + "learning_rate": 3.97266522129109e-06, + "loss": 0.79104578, + "num_input_tokens_seen": 28696255, + "step": 1349, + "time_per_iteration": 2.778050661087036 + }, + { + "auxiliary_loss_clip": 0.01202122, + "auxiliary_loss_mlp": 0.01065092, + "balance_loss_clip": 1.06290889, + "balance_loss_mlp": 1.04144049, + "epoch": 0.0811663911017586, + "flos": 19025689265280.0, + "grad_norm": 1.777484449358279, + "language_loss": 0.8877703, + "learning_rate": 3.972601013673205e-06, + "loss": 0.91044247, + "num_input_tokens_seen": 28713905, + "step": 1350, + "time_per_iteration": 2.5871450901031494 + }, + { + "auxiliary_loss_clip": 0.01164889, + "auxiliary_loss_mlp": 0.00780958, + "balance_loss_clip": 1.06011164, + "balance_loss_mlp": 1.00028801, + "epoch": 0.08122651435442657, + "flos": 15341290588800.0, + "grad_norm": 2.7472756845793156, + "language_loss": 0.82298493, + "learning_rate": 3.972536731254092e-06, + "loss": 0.84244347, + "num_input_tokens_seen": 28732075, + "step": 1351, + "time_per_iteration": 2.840271234512329 + }, + { + "auxiliary_loss_clip": 0.01198177, + "auxiliary_loss_mlp": 0.01055773, + "balance_loss_clip": 1.06010592, + "balance_loss_mlp": 1.03090644, + "epoch": 0.08128663760709455, + "flos": 23221563655680.0, + "grad_norm": 2.2808101252466724, + "language_loss": 0.75274944, + "learning_rate": 3.972472374036189e-06, + "loss": 0.775289, + "num_input_tokens_seen": 28751150, + "step": 1352, + "time_per_iteration": 2.733644485473633 + }, + { + "auxiliary_loss_clip": 0.01194643, + "auxiliary_loss_mlp": 0.00783595, + "balance_loss_clip": 1.06613326, + "balance_loss_mlp": 1.00036311, + "epoch": 0.08134676085976252, + "flos": 22965628273920.0, + "grad_norm": 1.678520960707938, + "language_loss": 0.82936156, + "learning_rate": 3.972407942021935e-06, + "loss": 0.84914398, + "num_input_tokens_seen": 28773360, + "step": 1353, + "time_per_iteration": 2.742149829864502 + }, + { + "auxiliary_loss_clip": 0.01068236, + "auxiliary_loss_mlp": 0.01015932, + "balance_loss_clip": 1.02440155, + "balance_loss_mlp": 1.01242769, + "epoch": 0.08140688411243048, + "flos": 64322115816960.0, + "grad_norm": 0.8516312511934722, + "language_loss": 0.59741521, + "learning_rate": 3.972343435213775e-06, + "loss": 0.61825693, + "num_input_tokens_seen": 28833390, + "step": 1354, + "time_per_iteration": 3.1912426948547363 + }, + { + "auxiliary_loss_clip": 0.01150343, + "auxiliary_loss_mlp": 0.01058874, + "balance_loss_clip": 1.0546236, + "balance_loss_mlp": 1.03583086, + "epoch": 0.08146700736509845, + "flos": 22491858862080.0, + "grad_norm": 2.1234068486581643, + "language_loss": 0.82310611, + "learning_rate": 3.972278853614154e-06, + "loss": 0.84519827, + "num_input_tokens_seen": 28852430, + "step": 1355, + "time_per_iteration": 2.782442808151245 + }, + { + "auxiliary_loss_clip": 0.01186948, + "auxiliary_loss_mlp": 0.01062856, + "balance_loss_clip": 1.0600667, + "balance_loss_mlp": 1.03801262, + "epoch": 0.08152713061776642, + "flos": 20447823513600.0, + "grad_norm": 1.8366299277102565, + "language_loss": 0.7135247, + "learning_rate": 3.972214197225521e-06, + "loss": 0.73602271, + "num_input_tokens_seen": 28870685, + "step": 1356, + "time_per_iteration": 2.7777554988861084 + }, + { + "auxiliary_loss_clip": 0.01194666, + "auxiliary_loss_mlp": 0.01056522, + "balance_loss_clip": 1.06462216, + "balance_loss_mlp": 1.03259718, + "epoch": 0.08158725387043439, + "flos": 23550218121600.0, + "grad_norm": 2.050923525150184, + "language_loss": 0.70426142, + "learning_rate": 3.972149466050329e-06, + "loss": 0.72677326, + "num_input_tokens_seen": 28889860, + "step": 1357, + "time_per_iteration": 2.852046012878418 + }, + { + "auxiliary_loss_clip": 0.01186996, + "auxiliary_loss_mlp": 0.01054475, + "balance_loss_clip": 1.06138206, + "balance_loss_mlp": 1.03070426, + "epoch": 0.08164737712310235, + "flos": 22017335264640.0, + "grad_norm": 2.634204556872777, + "language_loss": 0.84203482, + "learning_rate": 3.97208466009103e-06, + "loss": 0.8644495, + "num_input_tokens_seen": 28905865, + "step": 1358, + "time_per_iteration": 2.7127115726470947 + }, + { + "auxiliary_loss_clip": 0.01176629, + "auxiliary_loss_mlp": 0.010566, + "balance_loss_clip": 1.06037402, + "balance_loss_mlp": 1.03154182, + "epoch": 0.08170750037577033, + "flos": 23367827836800.0, + "grad_norm": 2.1726272773281097, + "language_loss": 1.02781308, + "learning_rate": 3.972019779350084e-06, + "loss": 1.05014539, + "num_input_tokens_seen": 28925250, + "step": 1359, + "time_per_iteration": 2.7171826362609863 + }, + { + "auxiliary_loss_clip": 0.01128357, + "auxiliary_loss_mlp": 0.01056774, + "balance_loss_clip": 1.05009234, + "balance_loss_mlp": 1.03263426, + "epoch": 0.0817676236284383, + "flos": 28397978490240.0, + "grad_norm": 2.0494617207464945, + "language_loss": 0.8313604, + "learning_rate": 3.971954823829951e-06, + "loss": 0.85321164, + "num_input_tokens_seen": 28943445, + "step": 1360, + "time_per_iteration": 2.9020919799804688 + }, + { + "auxiliary_loss_clip": 0.01202956, + "auxiliary_loss_mlp": 0.0106887, + "balance_loss_clip": 1.06274688, + "balance_loss_mlp": 1.04469395, + "epoch": 0.08182774688110626, + "flos": 19208905562880.0, + "grad_norm": 5.2377005088202075, + "language_loss": 0.72322488, + "learning_rate": 3.971889793533093e-06, + "loss": 0.74594313, + "num_input_tokens_seen": 28962695, + "step": 1361, + "time_per_iteration": 2.6643178462982178 + }, + { + "auxiliary_loss_clip": 0.01166556, + "auxiliary_loss_mlp": 0.01056311, + "balance_loss_clip": 1.0552367, + "balance_loss_mlp": 1.03184962, + "epoch": 0.08188787013377424, + "flos": 22784099915520.0, + "grad_norm": 28.302545492028134, + "language_loss": 0.76657653, + "learning_rate": 3.971824688461976e-06, + "loss": 0.78880513, + "num_input_tokens_seen": 28982120, + "step": 1362, + "time_per_iteration": 2.7439064979553223 + }, + { + "auxiliary_loss_clip": 0.01199728, + "auxiliary_loss_mlp": 0.01053492, + "balance_loss_clip": 1.06350708, + "balance_loss_mlp": 1.03104496, + "epoch": 0.08194799338644221, + "flos": 16468095214080.0, + "grad_norm": 2.1850191919210338, + "language_loss": 0.72384715, + "learning_rate": 3.971759508619069e-06, + "loss": 0.74637932, + "num_input_tokens_seen": 28998100, + "step": 1363, + "time_per_iteration": 2.7082791328430176 + }, + { + "auxiliary_loss_clip": 0.01202887, + "auxiliary_loss_mlp": 0.01066374, + "balance_loss_clip": 1.06580126, + "balance_loss_mlp": 1.04083955, + "epoch": 0.08200811663911017, + "flos": 23913633974400.0, + "grad_norm": 2.142285699657122, + "language_loss": 0.7726444, + "learning_rate": 3.971694254006844e-06, + "loss": 0.79533696, + "num_input_tokens_seen": 29017095, + "step": 1364, + "time_per_iteration": 2.777156114578247 + }, + { + "auxiliary_loss_clip": 0.01135428, + "auxiliary_loss_mlp": 0.01063854, + "balance_loss_clip": 1.05182433, + "balance_loss_mlp": 1.03645968, + "epoch": 0.08206823989177814, + "flos": 17896550256000.0, + "grad_norm": 1.85589982882842, + "language_loss": 0.82242119, + "learning_rate": 3.971628924627776e-06, + "loss": 0.844414, + "num_input_tokens_seen": 29037240, + "step": 1365, + "time_per_iteration": 2.8192803859710693 + }, + { + "auxiliary_loss_clip": 0.01196582, + "auxiliary_loss_mlp": 0.01059945, + "balance_loss_clip": 1.07006347, + "balance_loss_mlp": 1.03706884, + "epoch": 0.08212836314444612, + "flos": 22088186841600.0, + "grad_norm": 1.7803424706125983, + "language_loss": 0.82062519, + "learning_rate": 3.97156352048434e-06, + "loss": 0.84319043, + "num_input_tokens_seen": 29056250, + "step": 1366, + "time_per_iteration": 2.7482311725616455 + }, + { + "auxiliary_loss_clip": 0.01153262, + "auxiliary_loss_mlp": 0.0107233, + "balance_loss_clip": 1.05320215, + "balance_loss_mlp": 1.04779685, + "epoch": 0.08218848639711408, + "flos": 17597485618560.0, + "grad_norm": 2.010209091244133, + "language_loss": 0.81944495, + "learning_rate": 3.97149804157902e-06, + "loss": 0.84170091, + "num_input_tokens_seen": 29073380, + "step": 1367, + "time_per_iteration": 4.352729797363281 + }, + { + "auxiliary_loss_clip": 0.01206125, + "auxiliary_loss_mlp": 0.01066888, + "balance_loss_clip": 1.06541765, + "balance_loss_mlp": 1.04241478, + "epoch": 0.08224860964978205, + "flos": 17857838373120.0, + "grad_norm": 2.518996379768439, + "language_loss": 0.8331567, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.85588682, + "num_input_tokens_seen": 29091330, + "step": 1368, + "time_per_iteration": 6.077457666397095 + }, + { + "auxiliary_loss_clip": 0.01159992, + "auxiliary_loss_mlp": 0.01049874, + "balance_loss_clip": 1.06314564, + "balance_loss_mlp": 1.02790344, + "epoch": 0.08230873290245003, + "flos": 25227533566080.0, + "grad_norm": 3.198110530618569, + "language_loss": 0.81336468, + "learning_rate": 3.971366859492653e-06, + "loss": 0.8354634, + "num_input_tokens_seen": 29110375, + "step": 1369, + "time_per_iteration": 2.769972085952759 + }, + { + "auxiliary_loss_clip": 0.01137456, + "auxiliary_loss_mlp": 0.00781814, + "balance_loss_clip": 1.05438268, + "balance_loss_mlp": 1.00027657, + "epoch": 0.08236885615511799, + "flos": 31759935753600.0, + "grad_norm": 2.610758273724768, + "language_loss": 0.74818152, + "learning_rate": 3.971301156316582e-06, + "loss": 0.76737428, + "num_input_tokens_seen": 29129395, + "step": 1370, + "time_per_iteration": 4.497304201126099 + }, + { + "auxiliary_loss_clip": 0.0115498, + "auxiliary_loss_mlp": 0.01064278, + "balance_loss_clip": 1.06403351, + "balance_loss_mlp": 1.03987551, + "epoch": 0.08242897940778596, + "flos": 23185832601600.0, + "grad_norm": 1.5246391685186451, + "language_loss": 0.7398203, + "learning_rate": 3.971235378388573e-06, + "loss": 0.76201284, + "num_input_tokens_seen": 29148650, + "step": 1371, + "time_per_iteration": 2.758089065551758 + }, + { + "auxiliary_loss_clip": 0.01097162, + "auxiliary_loss_mlp": 0.0106614, + "balance_loss_clip": 1.05124569, + "balance_loss_mlp": 1.04098701, + "epoch": 0.08248910266045394, + "flos": 34491480393600.0, + "grad_norm": 1.9670948823939327, + "language_loss": 0.70851803, + "learning_rate": 3.971169525711122e-06, + "loss": 0.73015106, + "num_input_tokens_seen": 29170785, + "step": 1372, + "time_per_iteration": 4.069301605224609 + }, + { + "auxiliary_loss_clip": 0.01162292, + "auxiliary_loss_mlp": 0.01056859, + "balance_loss_clip": 1.0571332, + "balance_loss_mlp": 1.03261209, + "epoch": 0.0825492259131219, + "flos": 13436228960640.0, + "grad_norm": 2.750431245604494, + "language_loss": 0.88363653, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.905828, + "num_input_tokens_seen": 29185210, + "step": 1373, + "time_per_iteration": 3.9346964359283447 + }, + { + "auxiliary_loss_clip": 0.01147291, + "auxiliary_loss_mlp": 0.01062343, + "balance_loss_clip": 1.05334187, + "balance_loss_mlp": 1.03878665, + "epoch": 0.08260934916578987, + "flos": 25812446636160.0, + "grad_norm": 2.128923272573014, + "language_loss": 0.82465184, + "learning_rate": 3.971037596117882e-06, + "loss": 0.84674811, + "num_input_tokens_seen": 29205210, + "step": 1374, + "time_per_iteration": 2.933377981185913 + }, + { + "auxiliary_loss_clip": 0.01044322, + "auxiliary_loss_mlp": 0.01017124, + "balance_loss_clip": 1.03154135, + "balance_loss_mlp": 1.0135479, + "epoch": 0.08266947241845783, + "flos": 63460009491840.0, + "grad_norm": 0.8272339650193923, + "language_loss": 0.60641956, + "learning_rate": 3.970971519207095e-06, + "loss": 0.62703401, + "num_input_tokens_seen": 29265350, + "step": 1375, + "time_per_iteration": 3.3287038803100586 + }, + { + "auxiliary_loss_clip": 0.01060461, + "auxiliary_loss_mlp": 0.01013653, + "balance_loss_clip": 1.02398169, + "balance_loss_mlp": 1.01017237, + "epoch": 0.08272959567112581, + "flos": 69993704568960.0, + "grad_norm": 0.9162492148708097, + "language_loss": 0.62171799, + "learning_rate": 3.970905367556871e-06, + "loss": 0.64245915, + "num_input_tokens_seen": 29321475, + "step": 1376, + "time_per_iteration": 3.218834161758423 + }, + { + "auxiliary_loss_clip": 0.01159103, + "auxiliary_loss_mlp": 0.0106347, + "balance_loss_clip": 1.06229186, + "balance_loss_mlp": 1.03942561, + "epoch": 0.08278971892379378, + "flos": 20413205781120.0, + "grad_norm": 1.9191670647860084, + "language_loss": 0.82577401, + "learning_rate": 3.970839141169718e-06, + "loss": 0.84799975, + "num_input_tokens_seen": 29341405, + "step": 1377, + "time_per_iteration": 2.8763558864593506 + }, + { + "auxiliary_loss_clip": 0.01176967, + "auxiliary_loss_mlp": 0.01054072, + "balance_loss_clip": 1.06486619, + "balance_loss_mlp": 1.03011107, + "epoch": 0.08284984217646174, + "flos": 26250233598720.0, + "grad_norm": 1.915539507671093, + "language_loss": 0.84923226, + "learning_rate": 3.970772840048147e-06, + "loss": 0.87154263, + "num_input_tokens_seen": 29361955, + "step": 1378, + "time_per_iteration": 2.8232595920562744 + }, + { + "auxiliary_loss_clip": 0.01185329, + "auxiliary_loss_mlp": 0.01058999, + "balance_loss_clip": 1.06043923, + "balance_loss_mlp": 1.0344305, + "epoch": 0.08290996542912972, + "flos": 27194683852800.0, + "grad_norm": 6.4689921779024795, + "language_loss": 0.87319231, + "learning_rate": 3.970706464194672e-06, + "loss": 0.8956356, + "num_input_tokens_seen": 29382395, + "step": 1379, + "time_per_iteration": 2.756082534790039 + }, + { + "auxiliary_loss_clip": 0.01158173, + "auxiliary_loss_mlp": 0.01061479, + "balance_loss_clip": 1.05779433, + "balance_loss_mlp": 1.03829277, + "epoch": 0.08297008868179769, + "flos": 38618191146240.0, + "grad_norm": 2.078993196749275, + "language_loss": 0.78545237, + "learning_rate": 3.970640013611812e-06, + "loss": 0.8076489, + "num_input_tokens_seen": 29404460, + "step": 1380, + "time_per_iteration": 2.9525601863861084 + }, + { + "auxiliary_loss_clip": 0.01183492, + "auxiliary_loss_mlp": 0.01059448, + "balance_loss_clip": 1.06308961, + "balance_loss_mlp": 1.0344255, + "epoch": 0.08303021193446565, + "flos": 19974736460160.0, + "grad_norm": 2.6608111668609697, + "language_loss": 0.86125714, + "learning_rate": 3.970573488302083e-06, + "loss": 0.88368654, + "num_input_tokens_seen": 29422675, + "step": 1381, + "time_per_iteration": 2.735203742980957 + }, + { + "auxiliary_loss_clip": 0.01197152, + "auxiliary_loss_mlp": 0.00781814, + "balance_loss_clip": 1.06611753, + "balance_loss_mlp": 1.00034571, + "epoch": 0.08309033518713363, + "flos": 13662646341120.0, + "grad_norm": 2.9433398182948203, + "language_loss": 0.87471211, + "learning_rate": 3.970506888268011e-06, + "loss": 0.89450181, + "num_input_tokens_seen": 29439840, + "step": 1382, + "time_per_iteration": 2.6392617225646973 + }, + { + "auxiliary_loss_clip": 0.0115996, + "auxiliary_loss_mlp": 0.01055463, + "balance_loss_clip": 1.06138313, + "balance_loss_mlp": 1.03337312, + "epoch": 0.0831504584398016, + "flos": 17968551068160.0, + "grad_norm": 1.9901989904031434, + "language_loss": 0.77085757, + "learning_rate": 3.970440213512121e-06, + "loss": 0.79301178, + "num_input_tokens_seen": 29457360, + "step": 1383, + "time_per_iteration": 2.756565809249878 + }, + { + "auxiliary_loss_clip": 0.01191549, + "auxiliary_loss_mlp": 0.01058014, + "balance_loss_clip": 1.06211782, + "balance_loss_mlp": 1.03395748, + "epoch": 0.08321058169246956, + "flos": 22601386408320.0, + "grad_norm": 1.818236548161018, + "language_loss": 0.82858944, + "learning_rate": 3.97037346403694e-06, + "loss": 0.85108507, + "num_input_tokens_seen": 29477040, + "step": 1384, + "time_per_iteration": 2.7848587036132812 + }, + { + "auxiliary_loss_clip": 0.01148661, + "auxiliary_loss_mlp": 0.01063605, + "balance_loss_clip": 1.05671442, + "balance_loss_mlp": 1.03610373, + "epoch": 0.08327070494513754, + "flos": 22850426378880.0, + "grad_norm": 3.9982776391866346, + "language_loss": 0.85219657, + "learning_rate": 3.970306639845e-06, + "loss": 0.8743192, + "num_input_tokens_seen": 29492010, + "step": 1385, + "time_per_iteration": 2.803893566131592 + }, + { + "auxiliary_loss_clip": 0.01157001, + "auxiliary_loss_mlp": 0.01061891, + "balance_loss_clip": 1.05823874, + "balance_loss_mlp": 1.03750122, + "epoch": 0.0833308281978055, + "flos": 22782986593920.0, + "grad_norm": 1.7071515381676081, + "language_loss": 0.69195282, + "learning_rate": 3.970239740938835e-06, + "loss": 0.71414173, + "num_input_tokens_seen": 29511850, + "step": 1386, + "time_per_iteration": 3.004786252975464 + }, + { + "auxiliary_loss_clip": 0.01172803, + "auxiliary_loss_mlp": 0.01058809, + "balance_loss_clip": 1.05489016, + "balance_loss_mlp": 1.03483546, + "epoch": 0.08339095145047347, + "flos": 20812604083200.0, + "grad_norm": 1.672791522425571, + "language_loss": 0.81894958, + "learning_rate": 3.97017276732098e-06, + "loss": 0.84126568, + "num_input_tokens_seen": 29531415, + "step": 1387, + "time_per_iteration": 2.7678542137145996 + }, + { + "auxiliary_loss_clip": 0.01179554, + "auxiliary_loss_mlp": 0.01074251, + "balance_loss_clip": 1.06179345, + "balance_loss_mlp": 1.04817975, + "epoch": 0.08345107470314143, + "flos": 18515326872960.0, + "grad_norm": 2.071322011459688, + "language_loss": 0.77205479, + "learning_rate": 3.970105718993978e-06, + "loss": 0.7945928, + "num_input_tokens_seen": 29549525, + "step": 1388, + "time_per_iteration": 2.8246304988861084 + }, + { + "auxiliary_loss_clip": 0.01130856, + "auxiliary_loss_mlp": 0.01062414, + "balance_loss_clip": 1.05684018, + "balance_loss_mlp": 1.03742766, + "epoch": 0.08351119795580941, + "flos": 18807567926400.0, + "grad_norm": 2.0255270252506636, + "language_loss": 0.79527366, + "learning_rate": 3.970038595960369e-06, + "loss": 0.81720638, + "num_input_tokens_seen": 29568705, + "step": 1389, + "time_per_iteration": 2.8606414794921875 + }, + { + "auxiliary_loss_clip": 0.01172785, + "auxiliary_loss_mlp": 0.01064077, + "balance_loss_clip": 1.05787444, + "balance_loss_mlp": 1.03923428, + "epoch": 0.08357132120847738, + "flos": 18441817689600.0, + "grad_norm": 2.546615132743645, + "language_loss": 0.87427586, + "learning_rate": 3.969971398222699e-06, + "loss": 0.89664447, + "num_input_tokens_seen": 29585855, + "step": 1390, + "time_per_iteration": 2.795931577682495 + }, + { + "auxiliary_loss_clip": 0.01160426, + "auxiliary_loss_mlp": 0.01067723, + "balance_loss_clip": 1.05447149, + "balance_loss_mlp": 1.04082966, + "epoch": 0.08363144446114534, + "flos": 25922333318400.0, + "grad_norm": 1.8703157168219726, + "language_loss": 0.86833143, + "learning_rate": 3.969904125783517e-06, + "loss": 0.89061296, + "num_input_tokens_seen": 29607280, + "step": 1391, + "time_per_iteration": 2.811598062515259 + }, + { + "auxiliary_loss_clip": 0.01156119, + "auxiliary_loss_mlp": 0.01076482, + "balance_loss_clip": 1.05575848, + "balance_loss_mlp": 1.05180562, + "epoch": 0.08369156771381332, + "flos": 18041306065920.0, + "grad_norm": 3.7979396758909263, + "language_loss": 0.87688571, + "learning_rate": 3.969836778645371e-06, + "loss": 0.89921176, + "num_input_tokens_seen": 29624130, + "step": 1392, + "time_per_iteration": 2.776819944381714 + }, + { + "auxiliary_loss_clip": 0.01183316, + "auxiliary_loss_mlp": 0.01058545, + "balance_loss_clip": 1.05830503, + "balance_loss_mlp": 1.03500128, + "epoch": 0.08375169096648129, + "flos": 22675111073280.0, + "grad_norm": 8.95243370865895, + "language_loss": 0.80574775, + "learning_rate": 3.969769356810819e-06, + "loss": 0.82816637, + "num_input_tokens_seen": 29643210, + "step": 1393, + "time_per_iteration": 2.735761880874634 + }, + { + "auxiliary_loss_clip": 0.01197686, + "auxiliary_loss_mlp": 0.01058125, + "balance_loss_clip": 1.06329441, + "balance_loss_mlp": 1.03466487, + "epoch": 0.08381181421914925, + "flos": 26103215232000.0, + "grad_norm": 1.7485261130451684, + "language_loss": 0.85064757, + "learning_rate": 3.969701860282415e-06, + "loss": 0.87320572, + "num_input_tokens_seen": 29663920, + "step": 1394, + "time_per_iteration": 2.950211524963379 + }, + { + "auxiliary_loss_clip": 0.01145594, + "auxiliary_loss_mlp": 0.01058123, + "balance_loss_clip": 1.05994248, + "balance_loss_mlp": 1.03432918, + "epoch": 0.08387193747181723, + "flos": 20629782835200.0, + "grad_norm": 1.782466846937859, + "language_loss": 0.82979721, + "learning_rate": 3.969634289062719e-06, + "loss": 0.85183442, + "num_input_tokens_seen": 29683825, + "step": 1395, + "time_per_iteration": 2.883977174758911 + }, + { + "auxiliary_loss_clip": 0.01187279, + "auxiliary_loss_mlp": 0.00782865, + "balance_loss_clip": 1.06065941, + "balance_loss_mlp": 1.00028706, + "epoch": 0.0839320607244852, + "flos": 13443196199040.0, + "grad_norm": 3.330409107955743, + "language_loss": 0.82481396, + "learning_rate": 3.969566643154293e-06, + "loss": 0.84451544, + "num_input_tokens_seen": 29698775, + "step": 1396, + "time_per_iteration": 2.6729378700256348 + }, + { + "auxiliary_loss_clip": 0.0118605, + "auxiliary_loss_mlp": 0.01060468, + "balance_loss_clip": 1.06378388, + "balance_loss_mlp": 1.03475475, + "epoch": 0.08399218397715316, + "flos": 23477247642240.0, + "grad_norm": 1.780410555630689, + "language_loss": 0.76843297, + "learning_rate": 3.969498922559703e-06, + "loss": 0.79089814, + "num_input_tokens_seen": 29719430, + "step": 1397, + "time_per_iteration": 2.64888334274292 + }, + { + "auxiliary_loss_clip": 0.01153742, + "auxiliary_loss_mlp": 0.01050759, + "balance_loss_clip": 1.05790138, + "balance_loss_mlp": 1.02621412, + "epoch": 0.08405230722982113, + "flos": 25920717206400.0, + "grad_norm": 2.1323769932413184, + "language_loss": 0.77941638, + "learning_rate": 3.969431127281516e-06, + "loss": 0.8014614, + "num_input_tokens_seen": 29739685, + "step": 1398, + "time_per_iteration": 2.8302125930786133 + }, + { + "auxiliary_loss_clip": 0.01191086, + "auxiliary_loss_mlp": 0.01052374, + "balance_loss_clip": 1.05962944, + "balance_loss_mlp": 1.02943766, + "epoch": 0.0841124304824891, + "flos": 17967437746560.0, + "grad_norm": 2.150764713624159, + "language_loss": 0.94635069, + "learning_rate": 3.969363257322304e-06, + "loss": 0.96878529, + "num_input_tokens_seen": 29756165, + "step": 1399, + "time_per_iteration": 2.650517702102661 + }, + { + "auxiliary_loss_clip": 0.01172403, + "auxiliary_loss_mlp": 0.0106738, + "balance_loss_clip": 1.0562712, + "balance_loss_mlp": 1.04168999, + "epoch": 0.08417255373515707, + "flos": 25629661301760.0, + "grad_norm": 3.6141849657848137, + "language_loss": 0.81904209, + "learning_rate": 3.96929531268464e-06, + "loss": 0.8414399, + "num_input_tokens_seen": 29776425, + "step": 1400, + "time_per_iteration": 2.777369260787964 + }, + { + "auxiliary_loss_clip": 0.01170173, + "auxiliary_loss_mlp": 0.01064292, + "balance_loss_clip": 1.05968165, + "balance_loss_mlp": 1.03957999, + "epoch": 0.08423267698782504, + "flos": 26249730808320.0, + "grad_norm": 8.998651919840762, + "language_loss": 0.8642807, + "learning_rate": 3.969227293371099e-06, + "loss": 0.88662529, + "num_input_tokens_seen": 29796440, + "step": 1401, + "time_per_iteration": 2.91375732421875 + }, + { + "auxiliary_loss_clip": 0.01196, + "auxiliary_loss_mlp": 0.01066109, + "balance_loss_clip": 1.05935979, + "balance_loss_mlp": 1.04053831, + "epoch": 0.08429280024049302, + "flos": 20119707751680.0, + "grad_norm": 2.9792515680869114, + "language_loss": 0.87500131, + "learning_rate": 3.969159199384263e-06, + "loss": 0.89762247, + "num_input_tokens_seen": 29814755, + "step": 1402, + "time_per_iteration": 2.7827296257019043 + }, + { + "auxiliary_loss_clip": 0.01144907, + "auxiliary_loss_mlp": 0.00781428, + "balance_loss_clip": 1.05105817, + "balance_loss_mlp": 1.00033188, + "epoch": 0.08435292349316098, + "flos": 42924526836480.0, + "grad_norm": 2.1517994230241566, + "language_loss": 0.8905524, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.90981579, + "num_input_tokens_seen": 29834785, + "step": 1403, + "time_per_iteration": 2.931666374206543 + }, + { + "auxiliary_loss_clip": 0.01165276, + "auxiliary_loss_mlp": 0.01061696, + "balance_loss_clip": 1.05570936, + "balance_loss_mlp": 1.03715038, + "epoch": 0.08441304674582895, + "flos": 22857285876480.0, + "grad_norm": 1.790271378285476, + "language_loss": 0.80321431, + "learning_rate": 3.969022787401033e-06, + "loss": 0.82548404, + "num_input_tokens_seen": 29854695, + "step": 1404, + "time_per_iteration": 2.7397725582122803 + }, + { + "auxiliary_loss_clip": 0.01181709, + "auxiliary_loss_mlp": 0.01071408, + "balance_loss_clip": 1.06211567, + "balance_loss_mlp": 1.04649353, + "epoch": 0.08447316999849692, + "flos": 18697501676160.0, + "grad_norm": 2.0849305916509193, + "language_loss": 0.83557045, + "learning_rate": 3.968954469409811e-06, + "loss": 0.85810155, + "num_input_tokens_seen": 29872180, + "step": 1405, + "time_per_iteration": 2.8052847385406494 + }, + { + "auxiliary_loss_clip": 0.0118246, + "auxiliary_loss_mlp": 0.01058347, + "balance_loss_clip": 1.05636072, + "balance_loss_mlp": 1.03588748, + "epoch": 0.08453329325116489, + "flos": 25483971738240.0, + "grad_norm": 1.5225846020503528, + "language_loss": 0.7991904, + "learning_rate": 3.968886076755639e-06, + "loss": 0.82159847, + "num_input_tokens_seen": 29893205, + "step": 1406, + "time_per_iteration": 4.301243305206299 + }, + { + "auxiliary_loss_clip": 0.0117117, + "auxiliary_loss_mlp": 0.01068275, + "balance_loss_clip": 1.05790758, + "balance_loss_mlp": 1.04406369, + "epoch": 0.08459341650383286, + "flos": 20920048640640.0, + "grad_norm": 1.717770739318623, + "language_loss": 0.79441547, + "learning_rate": 3.96881760944111e-06, + "loss": 0.81680995, + "num_input_tokens_seen": 29911970, + "step": 1407, + "time_per_iteration": 2.6535613536834717 + }, + { + "auxiliary_loss_clip": 0.01186501, + "auxiliary_loss_mlp": 0.01057881, + "balance_loss_clip": 1.05982685, + "balance_loss_mlp": 1.03409886, + "epoch": 0.08465353975650082, + "flos": 13043079624960.0, + "grad_norm": 2.191354041218588, + "language_loss": 0.91799384, + "learning_rate": 3.968749067468819e-06, + "loss": 0.94043779, + "num_input_tokens_seen": 29929925, + "step": 1408, + "time_per_iteration": 5.774486064910889 + }, + { + "auxiliary_loss_clip": 0.01058217, + "auxiliary_loss_mlp": 0.01015213, + "balance_loss_clip": 1.0231359, + "balance_loss_mlp": 1.01139832, + "epoch": 0.0847136630091688, + "flos": 60877422552960.0, + "grad_norm": 0.9559717259642487, + "language_loss": 0.61891782, + "learning_rate": 3.968680450841368e-06, + "loss": 0.63965201, + "num_input_tokens_seen": 29985950, + "step": 1409, + "time_per_iteration": 4.9455225467681885 + }, + { + "auxiliary_loss_clip": 0.01188186, + "auxiliary_loss_mlp": 0.01061718, + "balance_loss_clip": 1.05840743, + "balance_loss_mlp": 1.03878236, + "epoch": 0.08477378626183676, + "flos": 22046530043520.0, + "grad_norm": 1.6980375913788566, + "language_loss": 0.86357373, + "learning_rate": 3.968611759561355e-06, + "loss": 0.88607281, + "num_input_tokens_seen": 30004330, + "step": 1410, + "time_per_iteration": 2.640355110168457 + }, + { + "auxiliary_loss_clip": 0.01181512, + "auxiliary_loss_mlp": 0.01053874, + "balance_loss_clip": 1.0583061, + "balance_loss_mlp": 1.02870846, + "epoch": 0.08483390951450473, + "flos": 16690059308160.0, + "grad_norm": 2.248971712939306, + "language_loss": 0.74384397, + "learning_rate": 3.968542993631388e-06, + "loss": 0.7661978, + "num_input_tokens_seen": 30022555, + "step": 1411, + "time_per_iteration": 2.6200830936431885 + }, + { + "auxiliary_loss_clip": 0.01077929, + "auxiliary_loss_mlp": 0.01003535, + "balance_loss_clip": 1.02317524, + "balance_loss_mlp": 0.99991113, + "epoch": 0.08489403276717271, + "flos": 51584640082560.0, + "grad_norm": 0.9014663966204861, + "language_loss": 0.56748837, + "learning_rate": 3.968474153054073e-06, + "loss": 0.58830309, + "num_input_tokens_seen": 30077220, + "step": 1412, + "time_per_iteration": 3.0746512413024902 + }, + { + "auxiliary_loss_clip": 0.01156137, + "auxiliary_loss_mlp": 0.01067795, + "balance_loss_clip": 1.05325568, + "balance_loss_mlp": 1.04265356, + "epoch": 0.08495415601984067, + "flos": 17092330698240.0, + "grad_norm": 2.2757293876932945, + "language_loss": 0.88754624, + "learning_rate": 3.96840523783202e-06, + "loss": 0.90978551, + "num_input_tokens_seen": 30094600, + "step": 1413, + "time_per_iteration": 2.7309420108795166 + }, + { + "auxiliary_loss_clip": 0.01164895, + "auxiliary_loss_mlp": 0.01057479, + "balance_loss_clip": 1.05780244, + "balance_loss_mlp": 1.03295755, + "epoch": 0.08501427927250864, + "flos": 23148413608320.0, + "grad_norm": 1.9781781646219805, + "language_loss": 0.87963474, + "learning_rate": 3.968336247967844e-06, + "loss": 0.90185857, + "num_input_tokens_seen": 30114475, + "step": 1414, + "time_per_iteration": 2.692030668258667 + }, + { + "auxiliary_loss_clip": 0.01168145, + "auxiliary_loss_mlp": 0.01063751, + "balance_loss_clip": 1.05704033, + "balance_loss_mlp": 1.04170966, + "epoch": 0.08507440252517662, + "flos": 19063467394560.0, + "grad_norm": 1.9706021333256292, + "language_loss": 0.77636635, + "learning_rate": 3.96826718346416e-06, + "loss": 0.79868531, + "num_input_tokens_seen": 30133350, + "step": 1415, + "time_per_iteration": 2.8435540199279785 + }, + { + "auxiliary_loss_clip": 0.01182108, + "auxiliary_loss_mlp": 0.01059478, + "balance_loss_clip": 1.0588963, + "balance_loss_mlp": 1.03701878, + "epoch": 0.08513452577784458, + "flos": 60182296600320.0, + "grad_norm": 1.7170282174092708, + "language_loss": 0.70545506, + "learning_rate": 3.968198044323587e-06, + "loss": 0.72787094, + "num_input_tokens_seen": 30159005, + "step": 1416, + "time_per_iteration": 3.021360158920288 + }, + { + "auxiliary_loss_clip": 0.01174166, + "auxiliary_loss_mlp": 0.01066487, + "balance_loss_clip": 1.05930233, + "balance_loss_mlp": 1.04131043, + "epoch": 0.08519464903051255, + "flos": 27308485117440.0, + "grad_norm": 2.8159853289102053, + "language_loss": 0.74938154, + "learning_rate": 3.968128830548748e-06, + "loss": 0.771788, + "num_input_tokens_seen": 30179450, + "step": 1417, + "time_per_iteration": 2.738301992416382 + }, + { + "auxiliary_loss_clip": 0.01171292, + "auxiliary_loss_mlp": 0.01057092, + "balance_loss_clip": 1.05715823, + "balance_loss_mlp": 1.03313112, + "epoch": 0.08525477228318051, + "flos": 20266438809600.0, + "grad_norm": 2.4132423968154635, + "language_loss": 0.8258723, + "learning_rate": 3.968059542142265e-06, + "loss": 0.84815615, + "num_input_tokens_seen": 30197235, + "step": 1418, + "time_per_iteration": 2.671574831008911 + }, + { + "auxiliary_loss_clip": 0.0104499, + "auxiliary_loss_mlp": 0.01004818, + "balance_loss_clip": 1.02242994, + "balance_loss_mlp": 1.0004549, + "epoch": 0.08531489553584849, + "flos": 67615017183360.0, + "grad_norm": 0.8667411864001444, + "language_loss": 0.56638753, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.58688557, + "num_input_tokens_seen": 30257410, + "step": 1419, + "time_per_iteration": 3.199730396270752 + }, + { + "auxiliary_loss_clip": 0.01192231, + "auxiliary_loss_mlp": 0.01067737, + "balance_loss_clip": 1.05757999, + "balance_loss_mlp": 1.04369283, + "epoch": 0.08537501878851646, + "flos": 27526965592320.0, + "grad_norm": 2.2357492693560466, + "language_loss": 0.70111859, + "learning_rate": 3.967920741444886e-06, + "loss": 0.72371829, + "num_input_tokens_seen": 30277865, + "step": 1420, + "time_per_iteration": 2.7176027297973633 + }, + { + "auxiliary_loss_clip": 0.01155207, + "auxiliary_loss_mlp": 0.01050755, + "balance_loss_clip": 1.05377483, + "balance_loss_mlp": 1.02692556, + "epoch": 0.08543514204118442, + "flos": 22784243569920.0, + "grad_norm": 1.5975069204011494, + "language_loss": 0.88011539, + "learning_rate": 3.967851229159252e-06, + "loss": 0.90217495, + "num_input_tokens_seen": 30298545, + "step": 1421, + "time_per_iteration": 2.7552106380462646 + }, + { + "auxiliary_loss_clip": 0.01077473, + "auxiliary_loss_mlp": 0.01013517, + "balance_loss_clip": 1.02364218, + "balance_loss_mlp": 1.01020324, + "epoch": 0.0854952652938524, + "flos": 60990721027200.0, + "grad_norm": 0.9142209544576306, + "language_loss": 0.63506877, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65597868, + "num_input_tokens_seen": 30361725, + "step": 1422, + "time_per_iteration": 3.134183168411255 + }, + { + "auxiliary_loss_clip": 0.01152948, + "auxiliary_loss_mlp": 0.01063847, + "balance_loss_clip": 1.05932307, + "balance_loss_mlp": 1.0406723, + "epoch": 0.08555538854652037, + "flos": 28038046256640.0, + "grad_norm": 1.8757015124159093, + "language_loss": 0.82691669, + "learning_rate": 3.967711980727276e-06, + "loss": 0.84908462, + "num_input_tokens_seen": 30382180, + "step": 1423, + "time_per_iteration": 2.789393424987793 + }, + { + "auxiliary_loss_clip": 0.01153439, + "auxiliary_loss_mlp": 0.01064169, + "balance_loss_clip": 1.0526228, + "balance_loss_mlp": 1.04089928, + "epoch": 0.08561551179918833, + "flos": 23509279595520.0, + "grad_norm": 1.6593534429066656, + "language_loss": 0.75424892, + "learning_rate": 3.967642244586213e-06, + "loss": 0.776425, + "num_input_tokens_seen": 30402980, + "step": 1424, + "time_per_iteration": 2.7805826663970947 + }, + { + "auxiliary_loss_clip": 0.01139579, + "auxiliary_loss_mlp": 0.01060342, + "balance_loss_clip": 1.05769765, + "balance_loss_mlp": 1.03751373, + "epoch": 0.08567563505185631, + "flos": 17926930183680.0, + "grad_norm": 1.7999307606718091, + "language_loss": 0.75948423, + "learning_rate": 3.96757243383196e-06, + "loss": 0.78148341, + "num_input_tokens_seen": 30420800, + "step": 1425, + "time_per_iteration": 2.677889823913574 + }, + { + "auxiliary_loss_clip": 0.0118966, + "auxiliary_loss_mlp": 0.01055231, + "balance_loss_clip": 1.05982256, + "balance_loss_mlp": 1.03230715, + "epoch": 0.08573575830452428, + "flos": 19719519350400.0, + "grad_norm": 2.1792756220437743, + "language_loss": 0.93362999, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.95607889, + "num_input_tokens_seen": 30439620, + "step": 1426, + "time_per_iteration": 2.6270906925201416 + }, + { + "auxiliary_loss_clip": 0.01145994, + "auxiliary_loss_mlp": 0.01066219, + "balance_loss_clip": 1.05707717, + "balance_loss_mlp": 1.0406251, + "epoch": 0.08579588155719224, + "flos": 17931563038080.0, + "grad_norm": 2.3679064075186553, + "language_loss": 0.75424731, + "learning_rate": 3.967432588494471e-06, + "loss": 0.77636945, + "num_input_tokens_seen": 30457300, + "step": 1427, + "time_per_iteration": 2.84614634513855 + }, + { + "auxiliary_loss_clip": 0.01190697, + "auxiliary_loss_mlp": 0.01052992, + "balance_loss_clip": 1.06006169, + "balance_loss_mlp": 1.0305804, + "epoch": 0.08585600480986022, + "flos": 16033324993920.0, + "grad_norm": 3.503048788198607, + "language_loss": 0.82108849, + "learning_rate": 3.96736255391654e-06, + "loss": 0.84352541, + "num_input_tokens_seen": 30471580, + "step": 1428, + "time_per_iteration": 2.5882396697998047 + }, + { + "auxiliary_loss_clip": 0.01173688, + "auxiliary_loss_mlp": 0.0106298, + "balance_loss_clip": 1.05633736, + "balance_loss_mlp": 1.03832793, + "epoch": 0.08591612806252819, + "flos": 28657433404800.0, + "grad_norm": 2.088481658755078, + "language_loss": 0.79929984, + "learning_rate": 3.967292444736023e-06, + "loss": 0.82166648, + "num_input_tokens_seen": 30492720, + "step": 1429, + "time_per_iteration": 2.720500946044922 + }, + { + "auxiliary_loss_clip": 0.01169119, + "auxiliary_loss_mlp": 0.010606, + "balance_loss_clip": 1.05971265, + "balance_loss_mlp": 1.0379504, + "epoch": 0.08597625131519615, + "flos": 20959119659520.0, + "grad_norm": 1.9029222975672677, + "language_loss": 0.87716508, + "learning_rate": 3.967222260955578e-06, + "loss": 0.89946228, + "num_input_tokens_seen": 30509535, + "step": 1430, + "time_per_iteration": 2.6914596557617188 + }, + { + "auxiliary_loss_clip": 0.01144304, + "auxiliary_loss_mlp": 0.01074633, + "balance_loss_clip": 1.05802035, + "balance_loss_mlp": 1.05125606, + "epoch": 0.08603637456786412, + "flos": 23256360956160.0, + "grad_norm": 1.6366623508781384, + "language_loss": 0.81859726, + "learning_rate": 3.96715200257787e-06, + "loss": 0.84078664, + "num_input_tokens_seen": 30529490, + "step": 1431, + "time_per_iteration": 2.834402322769165 + }, + { + "auxiliary_loss_clip": 0.01148362, + "auxiliary_loss_mlp": 0.01054323, + "balance_loss_clip": 1.05620182, + "balance_loss_mlp": 1.03132737, + "epoch": 0.0860964978205321, + "flos": 28694170039680.0, + "grad_norm": 1.5497375505717568, + "language_loss": 0.78109461, + "learning_rate": 3.967081669605559e-06, + "loss": 0.80312145, + "num_input_tokens_seen": 30550205, + "step": 1432, + "time_per_iteration": 2.767860174179077 + }, + { + "auxiliary_loss_clip": 0.01167351, + "auxiliary_loss_mlp": 0.0106333, + "balance_loss_clip": 1.0540905, + "balance_loss_mlp": 1.03914225, + "epoch": 0.08615662107320006, + "flos": 19318397195520.0, + "grad_norm": 1.9631692713893694, + "language_loss": 0.73365706, + "learning_rate": 3.967011262041315e-06, + "loss": 0.75596392, + "num_input_tokens_seen": 30568830, + "step": 1433, + "time_per_iteration": 2.6930699348449707 + }, + { + "auxiliary_loss_clip": 0.01150098, + "auxiliary_loss_mlp": 0.00781967, + "balance_loss_clip": 1.05335927, + "balance_loss_mlp": 1.00044179, + "epoch": 0.08621674432586802, + "flos": 15851688894720.0, + "grad_norm": 2.468588778716135, + "language_loss": 0.85340321, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.87272388, + "num_input_tokens_seen": 30585730, + "step": 1434, + "time_per_iteration": 2.735690116882324 + }, + { + "auxiliary_loss_clip": 0.01170363, + "auxiliary_loss_mlp": 0.01057659, + "balance_loss_clip": 1.05604434, + "balance_loss_mlp": 1.0344249, + "epoch": 0.086276867578536, + "flos": 14100648785280.0, + "grad_norm": 2.160640509122794, + "language_loss": 0.7870298, + "learning_rate": 3.966870223147707e-06, + "loss": 0.80931008, + "num_input_tokens_seen": 30603180, + "step": 1435, + "time_per_iteration": 2.776567220687866 + }, + { + "auxiliary_loss_clip": 0.01047768, + "auxiliary_loss_mlp": 0.01015597, + "balance_loss_clip": 1.023893, + "balance_loss_mlp": 1.01206815, + "epoch": 0.08633699083120397, + "flos": 70184857772160.0, + "grad_norm": 0.8900716332014227, + "language_loss": 0.57975936, + "learning_rate": 3.96679959182369e-06, + "loss": 0.60039294, + "num_input_tokens_seen": 30668895, + "step": 1436, + "time_per_iteration": 3.344207763671875 + }, + { + "auxiliary_loss_clip": 0.0117372, + "auxiliary_loss_mlp": 0.01056829, + "balance_loss_clip": 1.05617976, + "balance_loss_mlp": 1.03153312, + "epoch": 0.08639711408387193, + "flos": 30298874140800.0, + "grad_norm": 2.240343996649645, + "language_loss": 0.69169062, + "learning_rate": 3.966728885918437e-06, + "loss": 0.71399617, + "num_input_tokens_seen": 30688955, + "step": 1437, + "time_per_iteration": 2.7171547412872314 + }, + { + "auxiliary_loss_clip": 0.01121044, + "auxiliary_loss_mlp": 0.01055264, + "balance_loss_clip": 1.05334914, + "balance_loss_mlp": 1.03223276, + "epoch": 0.08645723733653991, + "flos": 20297680663680.0, + "grad_norm": 2.1340571114707245, + "language_loss": 0.72624576, + "learning_rate": 3.966658105434627e-06, + "loss": 0.74800885, + "num_input_tokens_seen": 30706095, + "step": 1438, + "time_per_iteration": 2.7815651893615723 + }, + { + "auxiliary_loss_clip": 0.01179626, + "auxiliary_loss_mlp": 0.01052578, + "balance_loss_clip": 1.06052637, + "balance_loss_mlp": 1.02872419, + "epoch": 0.08651736058920788, + "flos": 32890583134080.0, + "grad_norm": 1.5339762166114281, + "language_loss": 0.64377135, + "learning_rate": 3.966587250374945e-06, + "loss": 0.66609335, + "num_input_tokens_seen": 30729025, + "step": 1439, + "time_per_iteration": 2.8935797214508057 + }, + { + "auxiliary_loss_clip": 0.01153286, + "auxiliary_loss_mlp": 0.01056452, + "balance_loss_clip": 1.05530453, + "balance_loss_mlp": 1.03213322, + "epoch": 0.08657748384187584, + "flos": 22637368857600.0, + "grad_norm": 5.193932354158579, + "language_loss": 0.87521696, + "learning_rate": 3.966516320742077e-06, + "loss": 0.89731431, + "num_input_tokens_seen": 30746155, + "step": 1440, + "time_per_iteration": 2.731531858444214 + }, + { + "auxiliary_loss_clip": 0.01155923, + "auxiliary_loss_mlp": 0.00782787, + "balance_loss_clip": 1.05752945, + "balance_loss_mlp": 1.00043201, + "epoch": 0.08663760709454381, + "flos": 23658380951040.0, + "grad_norm": 2.023462963415533, + "language_loss": 0.83434939, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.85373652, + "num_input_tokens_seen": 30761410, + "step": 1441, + "time_per_iteration": 2.7126500606536865 + }, + { + "auxiliary_loss_clip": 0.01074667, + "auxiliary_loss_mlp": 0.01004602, + "balance_loss_clip": 1.0222367, + "balance_loss_mlp": 1.00100195, + "epoch": 0.08669773034721179, + "flos": 62686564911360.0, + "grad_norm": 0.8541685426878655, + "language_loss": 0.60479522, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62558794, + "num_input_tokens_seen": 30823010, + "step": 1442, + "time_per_iteration": 3.25555157661438 + }, + { + "auxiliary_loss_clip": 0.0116729, + "auxiliary_loss_mlp": 0.01054262, + "balance_loss_clip": 1.05768681, + "balance_loss_mlp": 1.03075421, + "epoch": 0.08675785359987975, + "flos": 20667489137280.0, + "grad_norm": 2.8449103562639073, + "language_loss": 0.79304373, + "learning_rate": 3.96630308443127e-06, + "loss": 0.81525922, + "num_input_tokens_seen": 30841980, + "step": 1443, + "time_per_iteration": 2.7314631938934326 + }, + { + "auxiliary_loss_clip": 0.01180858, + "auxiliary_loss_mlp": 0.01051075, + "balance_loss_clip": 1.05780149, + "balance_loss_mlp": 1.02755547, + "epoch": 0.08681797685254772, + "flos": 26941118768640.0, + "grad_norm": 1.6739262813835734, + "language_loss": 0.82399666, + "learning_rate": 3.966231856532584e-06, + "loss": 0.84631598, + "num_input_tokens_seen": 30863280, + "step": 1444, + "time_per_iteration": 2.7341418266296387 + }, + { + "auxiliary_loss_clip": 0.01196759, + "auxiliary_loss_mlp": 0.01051473, + "balance_loss_clip": 1.06044626, + "balance_loss_mlp": 1.02810788, + "epoch": 0.0868781001052157, + "flos": 17712831168000.0, + "grad_norm": 2.3015461969915747, + "language_loss": 0.87354827, + "learning_rate": 3.966160554074189e-06, + "loss": 0.8960306, + "num_input_tokens_seen": 30881710, + "step": 1445, + "time_per_iteration": 4.25179386138916 + }, + { + "auxiliary_loss_clip": 0.01180784, + "auxiliary_loss_mlp": 0.01055896, + "balance_loss_clip": 1.06094933, + "balance_loss_mlp": 1.03446186, + "epoch": 0.08693822335788366, + "flos": 19896522595200.0, + "grad_norm": 1.8066650797875201, + "language_loss": 0.81863767, + "learning_rate": 3.96608917705879e-06, + "loss": 0.84100449, + "num_input_tokens_seen": 30900225, + "step": 1446, + "time_per_iteration": 4.197181940078735 + }, + { + "auxiliary_loss_clip": 0.01056056, + "auxiliary_loss_mlp": 0.01004371, + "balance_loss_clip": 1.01782191, + "balance_loss_mlp": 1.00031781, + "epoch": 0.08699834661055163, + "flos": 67023747406080.0, + "grad_norm": 0.7255245569613363, + "language_loss": 0.54762936, + "learning_rate": 3.966017725489091e-06, + "loss": 0.56823361, + "num_input_tokens_seen": 30959580, + "step": 1447, + "time_per_iteration": 3.2158126831054688 + }, + { + "auxiliary_loss_clip": 0.0114861, + "auxiliary_loss_mlp": 0.01056824, + "balance_loss_clip": 1.05373001, + "balance_loss_mlp": 1.03518772, + "epoch": 0.0870584698632196, + "flos": 13480507451520.0, + "grad_norm": 2.1586118179593696, + "language_loss": 0.84592307, + "learning_rate": 3.965946199367804e-06, + "loss": 0.86797738, + "num_input_tokens_seen": 30976775, + "step": 1448, + "time_per_iteration": 4.262767314910889 + }, + { + "auxiliary_loss_clip": 0.01194173, + "auxiliary_loss_mlp": 0.01050219, + "balance_loss_clip": 1.05891991, + "balance_loss_mlp": 1.02768826, + "epoch": 0.08711859311588757, + "flos": 16107013745280.0, + "grad_norm": 3.4326906921347096, + "language_loss": 0.80644608, + "learning_rate": 3.965874598697638e-06, + "loss": 0.82888997, + "num_input_tokens_seen": 30990495, + "step": 1449, + "time_per_iteration": 4.553676128387451 + }, + { + "auxiliary_loss_clip": 0.01138548, + "auxiliary_loss_mlp": 0.01052142, + "balance_loss_clip": 1.05437374, + "balance_loss_mlp": 1.02946854, + "epoch": 0.08717871636855554, + "flos": 38472357928320.0, + "grad_norm": 1.5251600336566102, + "language_loss": 0.70971417, + "learning_rate": 3.965802923481313e-06, + "loss": 0.73162109, + "num_input_tokens_seen": 31014080, + "step": 1450, + "time_per_iteration": 2.9082705974578857 + }, + { + "auxiliary_loss_clip": 0.01124466, + "auxiliary_loss_mlp": 0.01054883, + "balance_loss_clip": 1.05164719, + "balance_loss_mlp": 1.03207827, + "epoch": 0.0872388396212235, + "flos": 17600574188160.0, + "grad_norm": 1.9392114767205617, + "language_loss": 0.83684897, + "learning_rate": 3.965731173721542e-06, + "loss": 0.85864246, + "num_input_tokens_seen": 31031210, + "step": 1451, + "time_per_iteration": 2.809880495071411 + }, + { + "auxiliary_loss_clip": 0.01134251, + "auxiliary_loss_mlp": 0.00780873, + "balance_loss_clip": 1.05147851, + "balance_loss_mlp": 1.00039482, + "epoch": 0.08729896287389148, + "flos": 25259385951360.0, + "grad_norm": 2.5160845512367773, + "language_loss": 0.74654591, + "learning_rate": 3.965659349421049e-06, + "loss": 0.76569718, + "num_input_tokens_seen": 31049710, + "step": 1452, + "time_per_iteration": 2.88580060005188 + }, + { + "auxiliary_loss_clip": 0.01157134, + "auxiliary_loss_mlp": 0.01063328, + "balance_loss_clip": 1.05607891, + "balance_loss_mlp": 1.0388428, + "epoch": 0.08735908612655945, + "flos": 15632454234240.0, + "grad_norm": 4.56941406999875, + "language_loss": 0.80543101, + "learning_rate": 3.965587450582556e-06, + "loss": 0.82763565, + "num_input_tokens_seen": 31066160, + "step": 1453, + "time_per_iteration": 2.733632802963257 + }, + { + "auxiliary_loss_clip": 0.01169707, + "auxiliary_loss_mlp": 0.01059533, + "balance_loss_clip": 1.05905569, + "balance_loss_mlp": 1.03625154, + "epoch": 0.08741920937922741, + "flos": 20339660684160.0, + "grad_norm": 2.0102093196988102, + "language_loss": 0.71041977, + "learning_rate": 3.96551547720879e-06, + "loss": 0.73271215, + "num_input_tokens_seen": 31085270, + "step": 1454, + "time_per_iteration": 2.7568745613098145 + }, + { + "auxiliary_loss_clip": 0.0106426, + "auxiliary_loss_mlp": 0.01008112, + "balance_loss_clip": 1.0215131, + "balance_loss_mlp": 1.00463128, + "epoch": 0.08747933263189539, + "flos": 62819795433600.0, + "grad_norm": 0.7713706503543015, + "language_loss": 0.5859946, + "learning_rate": 3.96544342930248e-06, + "loss": 0.6067183, + "num_input_tokens_seen": 31148445, + "step": 1455, + "time_per_iteration": 3.2372186183929443 + }, + { + "auxiliary_loss_clip": 0.01189404, + "auxiliary_loss_mlp": 0.01060742, + "balance_loss_clip": 1.05742884, + "balance_loss_mlp": 1.03688788, + "epoch": 0.08753945588456336, + "flos": 33035877648000.0, + "grad_norm": 1.6485208275358016, + "language_loss": 0.77564865, + "learning_rate": 3.965371306866359e-06, + "loss": 0.79815018, + "num_input_tokens_seen": 31168770, + "step": 1456, + "time_per_iteration": 2.790663003921509 + }, + { + "auxiliary_loss_clip": 0.01127959, + "auxiliary_loss_mlp": 0.01054526, + "balance_loss_clip": 1.04962158, + "balance_loss_mlp": 1.03071976, + "epoch": 0.08759957913723132, + "flos": 35547182046720.0, + "grad_norm": 1.83889407784057, + "language_loss": 0.72420907, + "learning_rate": 3.96529910990316e-06, + "loss": 0.74603397, + "num_input_tokens_seen": 31189270, + "step": 1457, + "time_per_iteration": 2.9099740982055664 + }, + { + "auxiliary_loss_clip": 0.01176549, + "auxiliary_loss_mlp": 0.0104866, + "balance_loss_clip": 1.05627227, + "balance_loss_mlp": 1.02633214, + "epoch": 0.0876597023898993, + "flos": 23911120022400.0, + "grad_norm": 1.5250401870177361, + "language_loss": 0.86412215, + "learning_rate": 3.965226838415622e-06, + "loss": 0.88637424, + "num_input_tokens_seen": 31210385, + "step": 1458, + "time_per_iteration": 2.7517166137695312 + }, + { + "auxiliary_loss_clip": 0.01169535, + "auxiliary_loss_mlp": 0.01061413, + "balance_loss_clip": 1.05884266, + "balance_loss_mlp": 1.03825045, + "epoch": 0.08771982564256726, + "flos": 18114025150080.0, + "grad_norm": 1.7412813512419094, + "language_loss": 0.80268395, + "learning_rate": 3.965154492406486e-06, + "loss": 0.82499349, + "num_input_tokens_seen": 31229745, + "step": 1459, + "time_per_iteration": 2.71455717086792 + }, + { + "auxiliary_loss_clip": 0.01130491, + "auxiliary_loss_mlp": 0.01054334, + "balance_loss_clip": 1.05256546, + "balance_loss_mlp": 1.03018188, + "epoch": 0.08777994889523523, + "flos": 17712005155200.0, + "grad_norm": 2.1450339680450714, + "language_loss": 0.84538847, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.86723673, + "num_input_tokens_seen": 31248280, + "step": 1460, + "time_per_iteration": 2.8737733364105225 + }, + { + "auxiliary_loss_clip": 0.01177787, + "auxiliary_loss_mlp": 0.01057974, + "balance_loss_clip": 1.0572983, + "balance_loss_mlp": 1.03640938, + "epoch": 0.0878400721479032, + "flos": 12819930382080.0, + "grad_norm": 4.917361835698274, + "language_loss": 0.79993135, + "learning_rate": 3.965009576834394e-06, + "loss": 0.82228899, + "num_input_tokens_seen": 31262190, + "step": 1461, + "time_per_iteration": 2.8436062335968018 + }, + { + "auxiliary_loss_clip": 0.01169165, + "auxiliary_loss_mlp": 0.01058947, + "balance_loss_clip": 1.05800629, + "balance_loss_mlp": 1.03704822, + "epoch": 0.08790019540057117, + "flos": 26392690938240.0, + "grad_norm": 1.566202508611165, + "language_loss": 0.76571167, + "learning_rate": 3.964937007276932e-06, + "loss": 0.78799284, + "num_input_tokens_seen": 31283690, + "step": 1462, + "time_per_iteration": 2.7895474433898926 + }, + { + "auxiliary_loss_clip": 0.0117563, + "auxiliary_loss_mlp": 0.01060064, + "balance_loss_clip": 1.05839491, + "balance_loss_mlp": 1.03580475, + "epoch": 0.08796031865323914, + "flos": 19134031662720.0, + "grad_norm": 2.89717114041641, + "language_loss": 0.74710488, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.76946187, + "num_input_tokens_seen": 31302505, + "step": 1463, + "time_per_iteration": 2.760404348373413 + }, + { + "auxiliary_loss_clip": 0.01191543, + "auxiliary_loss_mlp": 0.01061609, + "balance_loss_clip": 1.06145048, + "balance_loss_mlp": 1.03680158, + "epoch": 0.0880204419059071, + "flos": 26064287867520.0, + "grad_norm": 2.431514195311041, + "language_loss": 0.83797103, + "learning_rate": 3.964791644632941e-06, + "loss": 0.8605026, + "num_input_tokens_seen": 31323070, + "step": 1464, + "time_per_iteration": 2.7417759895324707 + }, + { + "auxiliary_loss_clip": 0.011733, + "auxiliary_loss_mlp": 0.01063475, + "balance_loss_clip": 1.05683231, + "balance_loss_mlp": 1.04093289, + "epoch": 0.08808056515857508, + "flos": 22377842115840.0, + "grad_norm": 2.1775753375634963, + "language_loss": 0.78104752, + "learning_rate": 3.964718851551923e-06, + "loss": 0.8034153, + "num_input_tokens_seen": 31341880, + "step": 1465, + "time_per_iteration": 2.6852309703826904 + }, + { + "auxiliary_loss_clip": 0.01199489, + "auxiliary_loss_mlp": 0.01059873, + "balance_loss_clip": 1.0619812, + "balance_loss_mlp": 1.03791499, + "epoch": 0.08814068841124305, + "flos": 23185293897600.0, + "grad_norm": 2.412657222564686, + "language_loss": 0.85187089, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.87446451, + "num_input_tokens_seen": 31361995, + "step": 1466, + "time_per_iteration": 2.706264019012451 + }, + { + "auxiliary_loss_clip": 0.01120627, + "auxiliary_loss_mlp": 0.00782645, + "balance_loss_clip": 1.04989958, + "balance_loss_mlp": 1.00037241, + "epoch": 0.08820081166391101, + "flos": 25155281358720.0, + "grad_norm": 1.9900601596102498, + "language_loss": 0.84168816, + "learning_rate": 3.964573041885641e-06, + "loss": 0.86072087, + "num_input_tokens_seen": 31381515, + "step": 1467, + "time_per_iteration": 2.8636934757232666 + }, + { + "auxiliary_loss_clip": 0.01178935, + "auxiliary_loss_mlp": 0.01055379, + "balance_loss_clip": 1.05910301, + "balance_loss_mlp": 1.03219247, + "epoch": 0.08826093491657899, + "flos": 22231685675520.0, + "grad_norm": 1.660218686828999, + "language_loss": 0.75506544, + "learning_rate": 3.964500025305907e-06, + "loss": 0.77740854, + "num_input_tokens_seen": 31400345, + "step": 1468, + "time_per_iteration": 2.661501884460449 + }, + { + "auxiliary_loss_clip": 0.01181261, + "auxiliary_loss_mlp": 0.01054252, + "balance_loss_clip": 1.0629456, + "balance_loss_mlp": 1.03266358, + "epoch": 0.08832105816924696, + "flos": 22126826897280.0, + "grad_norm": 4.868504388441724, + "language_loss": 0.80322379, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.82557893, + "num_input_tokens_seen": 31419620, + "step": 1469, + "time_per_iteration": 2.7473137378692627 + }, + { + "auxiliary_loss_clip": 0.01198542, + "auxiliary_loss_mlp": 0.01059353, + "balance_loss_clip": 1.0627017, + "balance_loss_mlp": 1.03677487, + "epoch": 0.08838118142191492, + "flos": 17566495159680.0, + "grad_norm": 2.0179242193855806, + "language_loss": 0.77437651, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.79695547, + "num_input_tokens_seen": 31437970, + "step": 1470, + "time_per_iteration": 2.7672410011291504 + }, + { + "auxiliary_loss_clip": 0.01193825, + "auxiliary_loss_mlp": 0.01067102, + "balance_loss_clip": 1.06180143, + "balance_loss_mlp": 1.04281926, + "epoch": 0.0884413046745829, + "flos": 20777196251520.0, + "grad_norm": 1.6812425162011504, + "language_loss": 0.84297001, + "learning_rate": 3.964280528613569e-06, + "loss": 0.86557925, + "num_input_tokens_seen": 31457040, + "step": 1471, + "time_per_iteration": 2.7584216594696045 + }, + { + "auxiliary_loss_clip": 0.01156315, + "auxiliary_loss_mlp": 0.01054307, + "balance_loss_clip": 1.05682266, + "balance_loss_mlp": 1.03342199, + "epoch": 0.08850142792725087, + "flos": 22125462180480.0, + "grad_norm": 1.6938350729430058, + "language_loss": 0.83321345, + "learning_rate": 3.964207214074324e-06, + "loss": 0.85531968, + "num_input_tokens_seen": 31477520, + "step": 1472, + "time_per_iteration": 2.7895469665527344 + }, + { + "auxiliary_loss_clip": 0.01176151, + "auxiliary_loss_mlp": 0.01058616, + "balance_loss_clip": 1.06106544, + "balance_loss_mlp": 1.03529835, + "epoch": 0.08856155117991883, + "flos": 22418744728320.0, + "grad_norm": 2.3638705809965, + "language_loss": 0.82781172, + "learning_rate": 3.964133825052146e-06, + "loss": 0.85015941, + "num_input_tokens_seen": 31495575, + "step": 1473, + "time_per_iteration": 2.7361483573913574 + }, + { + "auxiliary_loss_clip": 0.01129906, + "auxiliary_loss_mlp": 0.01064148, + "balance_loss_clip": 1.05552769, + "balance_loss_mlp": 1.04263091, + "epoch": 0.0886216744325868, + "flos": 29937002572800.0, + "grad_norm": 1.6022277785896435, + "language_loss": 0.78712153, + "learning_rate": 3.964060361549816e-06, + "loss": 0.80906206, + "num_input_tokens_seen": 31520020, + "step": 1474, + "time_per_iteration": 2.894319534301758 + }, + { + "auxiliary_loss_clip": 0.01146238, + "auxiliary_loss_mlp": 0.01068131, + "balance_loss_clip": 1.05575764, + "balance_loss_mlp": 1.04175043, + "epoch": 0.08868179768525478, + "flos": 23982833525760.0, + "grad_norm": 1.6120869011213488, + "language_loss": 0.79030406, + "learning_rate": 3.963986823570121e-06, + "loss": 0.81244779, + "num_input_tokens_seen": 31539265, + "step": 1475, + "time_per_iteration": 2.8806042671203613 + }, + { + "auxiliary_loss_clip": 0.01191986, + "auxiliary_loss_mlp": 0.01047451, + "balance_loss_clip": 1.05980015, + "balance_loss_mlp": 1.02478909, + "epoch": 0.08874192093792274, + "flos": 43177553216640.0, + "grad_norm": 1.4679464237421194, + "language_loss": 0.74202317, + "learning_rate": 3.963913211115848e-06, + "loss": 0.76441753, + "num_input_tokens_seen": 31563425, + "step": 1476, + "time_per_iteration": 2.8381049633026123 + }, + { + "auxiliary_loss_clip": 0.01174628, + "auxiliary_loss_mlp": 0.01059934, + "balance_loss_clip": 1.06217527, + "balance_loss_mlp": 1.03678358, + "epoch": 0.0888020441905907, + "flos": 32852445868800.0, + "grad_norm": 1.712954575149443, + "language_loss": 0.74220836, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.76455402, + "num_input_tokens_seen": 31584525, + "step": 1477, + "time_per_iteration": 2.8452210426330566 + }, + { + "auxiliary_loss_clip": 0.01191865, + "auxiliary_loss_mlp": 0.01051229, + "balance_loss_clip": 1.06062829, + "balance_loss_mlp": 1.0278163, + "epoch": 0.08886216744325869, + "flos": 23149347361920.0, + "grad_norm": 1.95844459768748, + "language_loss": 0.87194049, + "learning_rate": 3.963765762794739e-06, + "loss": 0.89437139, + "num_input_tokens_seen": 31603325, + "step": 1478, + "time_per_iteration": 2.644918203353882 + }, + { + "auxiliary_loss_clip": 0.01176299, + "auxiliary_loss_mlp": 0.01058069, + "balance_loss_clip": 1.0572443, + "balance_loss_mlp": 1.03546739, + "epoch": 0.08892229069592665, + "flos": 23331593992320.0, + "grad_norm": 1.6306868156426517, + "language_loss": 0.77571511, + "learning_rate": 3.963691926933495e-06, + "loss": 0.79805881, + "num_input_tokens_seen": 31624820, + "step": 1479, + "time_per_iteration": 2.738168954849243 + }, + { + "auxiliary_loss_clip": 0.01164179, + "auxiliary_loss_mlp": 0.010526, + "balance_loss_clip": 1.05629039, + "balance_loss_mlp": 1.02801871, + "epoch": 0.08898241394859462, + "flos": 26213784272640.0, + "grad_norm": 2.199164032289915, + "language_loss": 0.77797234, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.80014014, + "num_input_tokens_seen": 31646080, + "step": 1480, + "time_per_iteration": 2.837562322616577 + }, + { + "auxiliary_loss_clip": 0.01180168, + "auxiliary_loss_mlp": 0.01060894, + "balance_loss_clip": 1.05762577, + "balance_loss_mlp": 1.03656292, + "epoch": 0.0890425372012626, + "flos": 23550613171200.0, + "grad_norm": 2.9471668635954273, + "language_loss": 0.66437578, + "learning_rate": 3.963544031823624e-06, + "loss": 0.68678641, + "num_input_tokens_seen": 31665770, + "step": 1481, + "time_per_iteration": 2.742422580718994 + }, + { + "auxiliary_loss_clip": 0.01143445, + "auxiliary_loss_mlp": 0.01055318, + "balance_loss_clip": 1.05510306, + "balance_loss_mlp": 1.03273988, + "epoch": 0.08910266045393056, + "flos": 23002795872000.0, + "grad_norm": 2.124586862599894, + "language_loss": 0.96630967, + "learning_rate": 3.9634699725806065e-06, + "loss": 0.9882974, + "num_input_tokens_seen": 31683805, + "step": 1482, + "time_per_iteration": 2.8150243759155273 + }, + { + "auxiliary_loss_clip": 0.0115336, + "auxiliary_loss_mlp": 0.01057266, + "balance_loss_clip": 1.05521989, + "balance_loss_mlp": 1.03353167, + "epoch": 0.08916278370659853, + "flos": 31936508035200.0, + "grad_norm": 1.7904792435575492, + "language_loss": 0.78683239, + "learning_rate": 3.96339583888261e-06, + "loss": 0.80893862, + "num_input_tokens_seen": 31704630, + "step": 1483, + "time_per_iteration": 2.869084119796753 + }, + { + "auxiliary_loss_clip": 0.0116904, + "auxiliary_loss_mlp": 0.01082082, + "balance_loss_clip": 1.05540919, + "balance_loss_mlp": 1.05829978, + "epoch": 0.08922290695926649, + "flos": 17530404969600.0, + "grad_norm": 2.2229749189835677, + "language_loss": 0.85424453, + "learning_rate": 3.963321630732448e-06, + "loss": 0.87675571, + "num_input_tokens_seen": 31723255, + "step": 1484, + "time_per_iteration": 4.280332326889038 + }, + { + "auxiliary_loss_clip": 0.01199312, + "auxiliary_loss_mlp": 0.01060639, + "balance_loss_clip": 1.06350458, + "balance_loss_mlp": 1.03701186, + "epoch": 0.08928303021193447, + "flos": 32125075459200.0, + "grad_norm": 1.7208139316694195, + "language_loss": 0.80205405, + "learning_rate": 3.963247348132932e-06, + "loss": 0.82465357, + "num_input_tokens_seen": 31747045, + "step": 1485, + "time_per_iteration": 2.761733055114746 + }, + { + "auxiliary_loss_clip": 0.01173167, + "auxiliary_loss_mlp": 0.01056554, + "balance_loss_clip": 1.0563333, + "balance_loss_mlp": 1.03228331, + "epoch": 0.08934315346460243, + "flos": 22125210785280.0, + "grad_norm": 1.8969438127775513, + "language_loss": 0.82859123, + "learning_rate": 3.96317299108688e-06, + "loss": 0.85088843, + "num_input_tokens_seen": 31766615, + "step": 1486, + "time_per_iteration": 4.144649028778076 + }, + { + "auxiliary_loss_clip": 0.01144509, + "auxiliary_loss_mlp": 0.01063805, + "balance_loss_clip": 1.05592823, + "balance_loss_mlp": 1.04021382, + "epoch": 0.0894032767172704, + "flos": 22565583527040.0, + "grad_norm": 2.1520807598980185, + "language_loss": 0.76365155, + "learning_rate": 3.963098559597111e-06, + "loss": 0.78573477, + "num_input_tokens_seen": 31785855, + "step": 1487, + "time_per_iteration": 4.432489395141602 + }, + { + "auxiliary_loss_clip": 0.01157327, + "auxiliary_loss_mlp": 0.01060261, + "balance_loss_clip": 1.05041027, + "balance_loss_mlp": 1.03542995, + "epoch": 0.08946339996993838, + "flos": 20193396503040.0, + "grad_norm": 3.851280697857004, + "language_loss": 0.83030224, + "learning_rate": 3.963024053666449e-06, + "loss": 0.85247803, + "num_input_tokens_seen": 31804210, + "step": 1488, + "time_per_iteration": 2.7262001037597656 + }, + { + "auxiliary_loss_clip": 0.01171869, + "auxiliary_loss_mlp": 0.01051875, + "balance_loss_clip": 1.05546355, + "balance_loss_mlp": 1.02916527, + "epoch": 0.08952352322260634, + "flos": 48360181104000.0, + "grad_norm": 1.7759111472560039, + "language_loss": 0.71783459, + "learning_rate": 3.962949473297718e-06, + "loss": 0.74007201, + "num_input_tokens_seen": 31826150, + "step": 1489, + "time_per_iteration": 4.562536954879761 + }, + { + "auxiliary_loss_clip": 0.01150585, + "auxiliary_loss_mlp": 0.01051382, + "balance_loss_clip": 1.05190349, + "balance_loss_mlp": 1.02830291, + "epoch": 0.08958364647527431, + "flos": 31793081028480.0, + "grad_norm": 1.6999724957706692, + "language_loss": 0.89717221, + "learning_rate": 3.962874818493745e-06, + "loss": 0.91919196, + "num_input_tokens_seen": 31848060, + "step": 1490, + "time_per_iteration": 2.838327646255493 + }, + { + "auxiliary_loss_clip": 0.01184278, + "auxiliary_loss_mlp": 0.01064168, + "balance_loss_clip": 1.05656135, + "balance_loss_mlp": 1.04102957, + "epoch": 0.08964376972794229, + "flos": 23368186972800.0, + "grad_norm": 3.9062133325383126, + "language_loss": 0.73075998, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.7532444, + "num_input_tokens_seen": 31870040, + "step": 1491, + "time_per_iteration": 2.7007367610931396 + }, + { + "auxiliary_loss_clip": 0.01189564, + "auxiliary_loss_mlp": 0.00780167, + "balance_loss_clip": 1.05968356, + "balance_loss_mlp": 1.00023544, + "epoch": 0.08970389298061025, + "flos": 23294785530240.0, + "grad_norm": 1.7021050418948058, + "language_loss": 0.77235049, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.79204774, + "num_input_tokens_seen": 31890400, + "step": 1492, + "time_per_iteration": 2.7799623012542725 + }, + { + "auxiliary_loss_clip": 0.01187114, + "auxiliary_loss_mlp": 0.01057952, + "balance_loss_clip": 1.05902028, + "balance_loss_mlp": 1.03512359, + "epoch": 0.08976401623327822, + "flos": 33761703772800.0, + "grad_norm": 1.9236790530591625, + "language_loss": 0.71429193, + "learning_rate": 3.962650407498707e-06, + "loss": 0.73674262, + "num_input_tokens_seen": 31913435, + "step": 1493, + "time_per_iteration": 2.8479840755462646 + }, + { + "auxiliary_loss_clip": 0.01188796, + "auxiliary_loss_mlp": 0.01057103, + "balance_loss_clip": 1.05757976, + "balance_loss_mlp": 1.03371406, + "epoch": 0.08982413948594618, + "flos": 23911335504000.0, + "grad_norm": 2.6977604073852053, + "language_loss": 0.87175488, + "learning_rate": 3.962575454982109e-06, + "loss": 0.8942138, + "num_input_tokens_seen": 31932435, + "step": 1494, + "time_per_iteration": 2.855658769607544 + }, + { + "auxiliary_loss_clip": 0.0108466, + "auxiliary_loss_mlp": 0.01070478, + "balance_loss_clip": 1.04641223, + "balance_loss_mlp": 1.04551601, + "epoch": 0.08988426273861416, + "flos": 16837544551680.0, + "grad_norm": 1.6162523894431247, + "language_loss": 0.82929438, + "learning_rate": 3.962500428044454e-06, + "loss": 0.85084569, + "num_input_tokens_seen": 31950125, + "step": 1495, + "time_per_iteration": 2.9265449047088623 + }, + { + "auxiliary_loss_clip": 0.01171464, + "auxiliary_loss_mlp": 0.01059756, + "balance_loss_clip": 1.05779243, + "balance_loss_mlp": 1.03682017, + "epoch": 0.08994438599128213, + "flos": 14793365548800.0, + "grad_norm": 9.387255385257733, + "language_loss": 0.70191383, + "learning_rate": 3.962425326688585e-06, + "loss": 0.72422606, + "num_input_tokens_seen": 31968050, + "step": 1496, + "time_per_iteration": 2.773693799972534 + }, + { + "auxiliary_loss_clip": 0.01164171, + "auxiliary_loss_mlp": 0.01049454, + "balance_loss_clip": 1.05397439, + "balance_loss_mlp": 1.02888989, + "epoch": 0.09000450924395009, + "flos": 17384320356480.0, + "grad_norm": 1.6327835891742186, + "language_loss": 0.79752576, + "learning_rate": 3.962350150917351e-06, + "loss": 0.81966203, + "num_input_tokens_seen": 31985675, + "step": 1497, + "time_per_iteration": 2.6850852966308594 + }, + { + "auxiliary_loss_clip": 0.01129609, + "auxiliary_loss_mlp": 0.01054903, + "balance_loss_clip": 1.05307686, + "balance_loss_mlp": 1.03146648, + "epoch": 0.09006463249661807, + "flos": 24280317964800.0, + "grad_norm": 8.517000212139891, + "language_loss": 0.82940567, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.85125089, + "num_input_tokens_seen": 32005180, + "step": 1498, + "time_per_iteration": 2.786205768585205 + }, + { + "auxiliary_loss_clip": 0.01170006, + "auxiliary_loss_mlp": 0.01059397, + "balance_loss_clip": 1.0577898, + "balance_loss_mlp": 1.03718853, + "epoch": 0.09012475574928604, + "flos": 13661928069120.0, + "grad_norm": 2.220597323082783, + "language_loss": 0.78609937, + "learning_rate": 3.962199576140195e-06, + "loss": 0.80839342, + "num_input_tokens_seen": 32022970, + "step": 1499, + "time_per_iteration": 2.71785831451416 + }, + { + "auxiliary_loss_clip": 0.01161539, + "auxiliary_loss_mlp": 0.00780528, + "balance_loss_clip": 1.05444527, + "balance_loss_mlp": 1.00024021, + "epoch": 0.090184879001954, + "flos": 23327751237120.0, + "grad_norm": 2.049001350461653, + "language_loss": 0.93337607, + "learning_rate": 3.962124177139981e-06, + "loss": 0.95279682, + "num_input_tokens_seen": 32043055, + "step": 1500, + "time_per_iteration": 2.7077536582946777 + }, + { + "auxiliary_loss_clip": 0.01148009, + "auxiliary_loss_mlp": 0.01055246, + "balance_loss_clip": 1.05371249, + "balance_loss_mlp": 1.0308435, + "epoch": 0.09024500225462198, + "flos": 23002688131200.0, + "grad_norm": 3.0778515668575492, + "language_loss": 0.74595469, + "learning_rate": 3.962048703735822e-06, + "loss": 0.76798725, + "num_input_tokens_seen": 32061900, + "step": 1501, + "time_per_iteration": 2.7073416709899902 + }, + { + "auxiliary_loss_clip": 0.01056535, + "auxiliary_loss_mlp": 0.01013118, + "balance_loss_clip": 1.03392363, + "balance_loss_mlp": 1.00963676, + "epoch": 0.09030512550728995, + "flos": 62189203242240.0, + "grad_norm": 0.7274487593473578, + "language_loss": 0.58316052, + "learning_rate": 3.96197315593058e-06, + "loss": 0.60385704, + "num_input_tokens_seen": 32122745, + "step": 1502, + "time_per_iteration": 3.274049997329712 + }, + { + "auxiliary_loss_clip": 0.0114469, + "auxiliary_loss_mlp": 0.01062533, + "balance_loss_clip": 1.04626393, + "balance_loss_mlp": 1.03896546, + "epoch": 0.09036524875995791, + "flos": 38800689171840.0, + "grad_norm": 2.1727281711500095, + "language_loss": 0.69501173, + "learning_rate": 3.961897533727119e-06, + "loss": 0.71708393, + "num_input_tokens_seen": 32145125, + "step": 1503, + "time_per_iteration": 2.87554669380188 + }, + { + "auxiliary_loss_clip": 0.01133108, + "auxiliary_loss_mlp": 0.0105903, + "balance_loss_clip": 1.04783726, + "balance_loss_mlp": 1.03660655, + "epoch": 0.09042537201262588, + "flos": 21690081429120.0, + "grad_norm": 2.169205134580129, + "language_loss": 0.86124271, + "learning_rate": 3.961821837128306e-06, + "loss": 0.88316405, + "num_input_tokens_seen": 32166255, + "step": 1504, + "time_per_iteration": 2.844688892364502 + }, + { + "auxiliary_loss_clip": 0.01146301, + "auxiliary_loss_mlp": 0.01069714, + "balance_loss_clip": 1.05341232, + "balance_loss_mlp": 1.04261804, + "epoch": 0.09048549526529386, + "flos": 22267021680000.0, + "grad_norm": 2.178155372989796, + "language_loss": 0.7233696, + "learning_rate": 3.961746066137014e-06, + "loss": 0.74552977, + "num_input_tokens_seen": 32184010, + "step": 1505, + "time_per_iteration": 2.7992677688598633 + }, + { + "auxiliary_loss_clip": 0.01137399, + "auxiliary_loss_mlp": 0.01056414, + "balance_loss_clip": 1.05097985, + "balance_loss_mlp": 1.03302479, + "epoch": 0.09054561851796182, + "flos": 14610939350400.0, + "grad_norm": 2.5107188210784526, + "language_loss": 0.80730999, + "learning_rate": 3.961670220756114e-06, + "loss": 0.82924813, + "num_input_tokens_seen": 32201635, + "step": 1506, + "time_per_iteration": 2.7458760738372803 + }, + { + "auxiliary_loss_clip": 0.01140643, + "auxiliary_loss_mlp": 0.01053315, + "balance_loss_clip": 1.05161858, + "balance_loss_mlp": 1.03197718, + "epoch": 0.09060574177062979, + "flos": 27636169916160.0, + "grad_norm": 2.166956120197676, + "language_loss": 0.75915337, + "learning_rate": 3.961594300988482e-06, + "loss": 0.78109294, + "num_input_tokens_seen": 32221940, + "step": 1507, + "time_per_iteration": 2.873826742172241 + }, + { + "auxiliary_loss_clip": 0.01051873, + "auxiliary_loss_mlp": 0.01005715, + "balance_loss_clip": 1.02043629, + "balance_loss_mlp": 1.00175714, + "epoch": 0.09066586502329776, + "flos": 66085797513600.0, + "grad_norm": 0.7272435825555993, + "language_loss": 0.57699698, + "learning_rate": 3.961518306836998e-06, + "loss": 0.59757286, + "num_input_tokens_seen": 32276495, + "step": 1508, + "time_per_iteration": 3.064926862716675 + }, + { + "auxiliary_loss_clip": 0.01165416, + "auxiliary_loss_mlp": 0.01054804, + "balance_loss_clip": 1.055233, + "balance_loss_mlp": 1.03155804, + "epoch": 0.09072598827596573, + "flos": 18916449027840.0, + "grad_norm": 1.7601330807914457, + "language_loss": 0.85090744, + "learning_rate": 3.961442238304543e-06, + "loss": 0.87310958, + "num_input_tokens_seen": 32294130, + "step": 1509, + "time_per_iteration": 2.6664113998413086 + }, + { + "auxiliary_loss_clip": 0.01168837, + "auxiliary_loss_mlp": 0.01064138, + "balance_loss_clip": 1.05745769, + "balance_loss_mlp": 1.03949761, + "epoch": 0.0907861115286337, + "flos": 24821742643200.0, + "grad_norm": 2.3794507710009203, + "language_loss": 0.84110659, + "learning_rate": 3.961366095394002e-06, + "loss": 0.8634364, + "num_input_tokens_seen": 32313555, + "step": 1510, + "time_per_iteration": 2.783484697341919 + }, + { + "auxiliary_loss_clip": 0.01153141, + "auxiliary_loss_mlp": 0.01058569, + "balance_loss_clip": 1.05423617, + "balance_loss_mlp": 1.03482211, + "epoch": 0.09084623478130167, + "flos": 21652842003840.0, + "grad_norm": 1.8490761573484715, + "language_loss": 0.85247588, + "learning_rate": 3.961289878108262e-06, + "loss": 0.87459302, + "num_input_tokens_seen": 32331430, + "step": 1511, + "time_per_iteration": 2.714620351791382 + }, + { + "auxiliary_loss_clip": 0.01145395, + "auxiliary_loss_mlp": 0.01052919, + "balance_loss_clip": 1.05182219, + "balance_loss_mlp": 1.02983987, + "epoch": 0.09090635803396964, + "flos": 27639258485760.0, + "grad_norm": 1.5734326837562458, + "language_loss": 0.84977764, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.87176073, + "num_input_tokens_seen": 32353705, + "step": 1512, + "time_per_iteration": 2.75361704826355 + }, + { + "auxiliary_loss_clip": 0.01155239, + "auxiliary_loss_mlp": 0.01053669, + "balance_loss_clip": 1.05740952, + "balance_loss_mlp": 1.03185391, + "epoch": 0.0909664812866376, + "flos": 17669127294720.0, + "grad_norm": 3.0235926431973654, + "language_loss": 0.87346804, + "learning_rate": 3.961137220422749e-06, + "loss": 0.89555705, + "num_input_tokens_seen": 32370520, + "step": 1513, + "time_per_iteration": 2.6864211559295654 + }, + { + "auxiliary_loss_clip": 0.01168585, + "auxiliary_loss_mlp": 0.01049408, + "balance_loss_clip": 1.05562937, + "balance_loss_mlp": 1.02841544, + "epoch": 0.09102660453930557, + "flos": 23951448017280.0, + "grad_norm": 1.7883280971870592, + "language_loss": 0.86802679, + "learning_rate": 3.961060780028764e-06, + "loss": 0.89020675, + "num_input_tokens_seen": 32389105, + "step": 1514, + "time_per_iteration": 2.6788065433502197 + }, + { + "auxiliary_loss_clip": 0.01134005, + "auxiliary_loss_mlp": 0.01064386, + "balance_loss_clip": 1.05571628, + "balance_loss_mlp": 1.04252315, + "epoch": 0.09108672779197355, + "flos": 25812949426560.0, + "grad_norm": 1.7666120550996132, + "language_loss": 0.89944756, + "learning_rate": 3.960984265271159e-06, + "loss": 0.92143154, + "num_input_tokens_seen": 32408065, + "step": 1515, + "time_per_iteration": 2.757390022277832 + }, + { + "auxiliary_loss_clip": 0.01162518, + "auxiliary_loss_mlp": 0.01056937, + "balance_loss_clip": 1.05547726, + "balance_loss_mlp": 1.03360808, + "epoch": 0.09114685104464151, + "flos": 29639482220160.0, + "grad_norm": 2.1090985009837646, + "language_loss": 0.85576892, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.87796342, + "num_input_tokens_seen": 32427225, + "step": 1516, + "time_per_iteration": 2.704784870147705 + }, + { + "auxiliary_loss_clip": 0.01158781, + "auxiliary_loss_mlp": 0.01057165, + "balance_loss_clip": 1.05135357, + "balance_loss_mlp": 1.03451526, + "epoch": 0.09120697429730948, + "flos": 33729635905920.0, + "grad_norm": 2.086405156201108, + "language_loss": 0.81167233, + "learning_rate": 3.960831012676692e-06, + "loss": 0.83383185, + "num_input_tokens_seen": 32450510, + "step": 1517, + "time_per_iteration": 2.8586854934692383 + }, + { + "auxiliary_loss_clip": 0.0117857, + "auxiliary_loss_mlp": 0.01065492, + "balance_loss_clip": 1.05741739, + "balance_loss_mlp": 1.04280686, + "epoch": 0.09126709754997746, + "flos": 18401381953920.0, + "grad_norm": 2.104468567304263, + "language_loss": 0.78067243, + "learning_rate": 3.960754274845642e-06, + "loss": 0.80311304, + "num_input_tokens_seen": 32468425, + "step": 1518, + "time_per_iteration": 2.7862088680267334 + }, + { + "auxiliary_loss_clip": 0.01165395, + "auxiliary_loss_mlp": 0.01061371, + "balance_loss_clip": 1.05285823, + "balance_loss_mlp": 1.03900695, + "epoch": 0.09132722080264542, + "flos": 22091957769600.0, + "grad_norm": 1.6816479812467473, + "language_loss": 0.86124098, + "learning_rate": 3.960677462662594e-06, + "loss": 0.88350856, + "num_input_tokens_seen": 32487510, + "step": 1519, + "time_per_iteration": 2.723714828491211 + }, + { + "auxiliary_loss_clip": 0.01163599, + "auxiliary_loss_mlp": 0.01052792, + "balance_loss_clip": 1.05454183, + "balance_loss_mlp": 1.02914131, + "epoch": 0.09138734405531339, + "flos": 21033131633280.0, + "grad_norm": 1.9681293960876167, + "language_loss": 0.73279071, + "learning_rate": 3.96060057613046e-06, + "loss": 0.75495458, + "num_input_tokens_seen": 32507250, + "step": 1520, + "time_per_iteration": 2.8098628520965576 + }, + { + "auxiliary_loss_clip": 0.01161166, + "auxiliary_loss_mlp": 0.01058035, + "balance_loss_clip": 1.05696058, + "balance_loss_mlp": 1.03469419, + "epoch": 0.09144746730798137, + "flos": 20083940784000.0, + "grad_norm": 2.6988457876937066, + "language_loss": 0.85236609, + "learning_rate": 3.960523615252156e-06, + "loss": 0.87455815, + "num_input_tokens_seen": 32526045, + "step": 1521, + "time_per_iteration": 2.7134172916412354 + }, + { + "auxiliary_loss_clip": 0.01120174, + "auxiliary_loss_mlp": 0.01063979, + "balance_loss_clip": 1.05189717, + "balance_loss_mlp": 1.03991079, + "epoch": 0.09150759056064933, + "flos": 22778210085120.0, + "grad_norm": 1.6991603177293335, + "language_loss": 0.83933008, + "learning_rate": 3.960446580030599e-06, + "loss": 0.8611716, + "num_input_tokens_seen": 32546575, + "step": 1522, + "time_per_iteration": 2.93745493888855 + }, + { + "auxiliary_loss_clip": 0.01182362, + "auxiliary_loss_mlp": 0.01064589, + "balance_loss_clip": 1.05630755, + "balance_loss_mlp": 1.04153395, + "epoch": 0.0915677138133173, + "flos": 27564205017600.0, + "grad_norm": 1.647915064434875, + "language_loss": 0.81012994, + "learning_rate": 3.960369470468711e-06, + "loss": 0.8325994, + "num_input_tokens_seen": 32568795, + "step": 1523, + "time_per_iteration": 4.378152847290039 + }, + { + "auxiliary_loss_clip": 0.01157976, + "auxiliary_loss_mlp": 0.00781395, + "balance_loss_clip": 1.05422449, + "balance_loss_mlp": 1.00037968, + "epoch": 0.09162783706598528, + "flos": 17674765729920.0, + "grad_norm": 2.106497620262502, + "language_loss": 0.7460072, + "learning_rate": 3.960292286569418e-06, + "loss": 0.76540089, + "num_input_tokens_seen": 32587010, + "step": 1524, + "time_per_iteration": 2.7146124839782715 + }, + { + "auxiliary_loss_clip": 0.01135228, + "auxiliary_loss_mlp": 0.0106119, + "balance_loss_clip": 1.05092478, + "balance_loss_mlp": 1.03782487, + "epoch": 0.09168796031865324, + "flos": 18478195188480.0, + "grad_norm": 2.0992608845945413, + "language_loss": 0.86498803, + "learning_rate": 3.960215028335644e-06, + "loss": 0.88695222, + "num_input_tokens_seen": 32602375, + "step": 1525, + "time_per_iteration": 4.314826965332031 + }, + { + "auxiliary_loss_clip": 0.01164396, + "auxiliary_loss_mlp": 0.01049506, + "balance_loss_clip": 1.05688822, + "balance_loss_mlp": 1.0263319, + "epoch": 0.0917480835713212, + "flos": 29387605075200.0, + "grad_norm": 2.1146348399758237, + "language_loss": 0.74512708, + "learning_rate": 3.96013769577032e-06, + "loss": 0.76726609, + "num_input_tokens_seen": 32621460, + "step": 1526, + "time_per_iteration": 5.878855466842651 + }, + { + "auxiliary_loss_clip": 0.01186002, + "auxiliary_loss_mlp": 0.01055817, + "balance_loss_clip": 1.05732703, + "balance_loss_mlp": 1.03392982, + "epoch": 0.09180820682398917, + "flos": 19829262378240.0, + "grad_norm": 2.5135282962071215, + "language_loss": 0.77581728, + "learning_rate": 3.960060288876378e-06, + "loss": 0.79823542, + "num_input_tokens_seen": 32640440, + "step": 1527, + "time_per_iteration": 2.693847179412842 + }, + { + "auxiliary_loss_clip": 0.01173605, + "auxiliary_loss_mlp": 0.01052264, + "balance_loss_clip": 1.0534333, + "balance_loss_mlp": 1.02868414, + "epoch": 0.09186833007665715, + "flos": 23841848643840.0, + "grad_norm": 2.655631139677705, + "language_loss": 0.78546697, + "learning_rate": 3.959982807656753e-06, + "loss": 0.80772561, + "num_input_tokens_seen": 32660020, + "step": 1528, + "time_per_iteration": 2.774219512939453 + }, + { + "auxiliary_loss_clip": 0.01146017, + "auxiliary_loss_mlp": 0.01050376, + "balance_loss_clip": 1.0499053, + "balance_loss_mlp": 1.02827477, + "epoch": 0.09192845332932512, + "flos": 12932726065920.0, + "grad_norm": 2.682547324044482, + "language_loss": 0.76732361, + "learning_rate": 3.959905252114384e-06, + "loss": 0.78928751, + "num_input_tokens_seen": 32678170, + "step": 1529, + "time_per_iteration": 4.603156089782715 + }, + { + "auxiliary_loss_clip": 0.01186538, + "auxiliary_loss_mlp": 0.00780856, + "balance_loss_clip": 1.05415928, + "balance_loss_mlp": 1.00045025, + "epoch": 0.09198857658199308, + "flos": 24568177559040.0, + "grad_norm": 1.7410660090049153, + "language_loss": 0.82906747, + "learning_rate": 3.959827622252211e-06, + "loss": 0.84874141, + "num_input_tokens_seen": 32697540, + "step": 1530, + "time_per_iteration": 2.7118582725524902 + }, + { + "auxiliary_loss_clip": 0.01130108, + "auxiliary_loss_mlp": 0.0106509, + "balance_loss_clip": 1.04975331, + "balance_loss_mlp": 1.04220152, + "epoch": 0.09204869983466106, + "flos": 20266941600000.0, + "grad_norm": 2.182960664479704, + "language_loss": 0.84001881, + "learning_rate": 3.959749918073179e-06, + "loss": 0.86197078, + "num_input_tokens_seen": 32716805, + "step": 1531, + "time_per_iteration": 2.791947603225708 + }, + { + "auxiliary_loss_clip": 0.0113655, + "auxiliary_loss_mlp": 0.01051554, + "balance_loss_clip": 1.04906452, + "balance_loss_mlp": 1.02853465, + "epoch": 0.09210882308732903, + "flos": 20885646389760.0, + "grad_norm": 1.7570281394880602, + "language_loss": 0.81253195, + "learning_rate": 3.959672139580233e-06, + "loss": 0.83441293, + "num_input_tokens_seen": 32736385, + "step": 1532, + "time_per_iteration": 2.737739324569702 + }, + { + "auxiliary_loss_clip": 0.01157728, + "auxiliary_loss_mlp": 0.01056753, + "balance_loss_clip": 1.052163, + "balance_loss_mlp": 1.03385305, + "epoch": 0.09216894633999699, + "flos": 30956326727040.0, + "grad_norm": 2.2821036564882182, + "language_loss": 0.84194255, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.86408734, + "num_input_tokens_seen": 32757140, + "step": 1533, + "time_per_iteration": 2.7542598247528076 + }, + { + "auxiliary_loss_clip": 0.01149262, + "auxiliary_loss_mlp": 0.01053623, + "balance_loss_clip": 1.05813503, + "balance_loss_mlp": 1.03190327, + "epoch": 0.09222906959266497, + "flos": 13151565676800.0, + "grad_norm": 1.9396914937933663, + "language_loss": 0.9009546, + "learning_rate": 3.959516359664402e-06, + "loss": 0.92298347, + "num_input_tokens_seen": 32774860, + "step": 1534, + "time_per_iteration": 2.6450984477996826 + }, + { + "auxiliary_loss_clip": 0.01150273, + "auxiliary_loss_mlp": 0.0106298, + "balance_loss_clip": 1.0495038, + "balance_loss_mlp": 1.03849435, + "epoch": 0.09228919284533293, + "flos": 25994477784960.0, + "grad_norm": 5.065477266086046, + "language_loss": 0.75779241, + "learning_rate": 3.959438358247424e-06, + "loss": 0.77992499, + "num_input_tokens_seen": 32795250, + "step": 1535, + "time_per_iteration": 2.730915069580078 + }, + { + "auxiliary_loss_clip": 0.01168283, + "auxiliary_loss_mlp": 0.01045276, + "balance_loss_clip": 1.05278873, + "balance_loss_mlp": 1.02403271, + "epoch": 0.0923493160980009, + "flos": 18660800954880.0, + "grad_norm": 1.8085584532497372, + "language_loss": 0.81631637, + "learning_rate": 3.959360282528346e-06, + "loss": 0.83845198, + "num_input_tokens_seen": 32813805, + "step": 1536, + "time_per_iteration": 2.7326817512512207 + }, + { + "auxiliary_loss_clip": 0.01181977, + "auxiliary_loss_mlp": 0.01053699, + "balance_loss_clip": 1.05431938, + "balance_loss_mlp": 1.03224182, + "epoch": 0.09240943935066886, + "flos": 21140576190720.0, + "grad_norm": 2.0929096884707556, + "language_loss": 0.89092755, + "learning_rate": 3.959282132510131e-06, + "loss": 0.9132843, + "num_input_tokens_seen": 32830960, + "step": 1537, + "time_per_iteration": 2.675771713256836 + }, + { + "auxiliary_loss_clip": 0.01157238, + "auxiliary_loss_mlp": 0.01058647, + "balance_loss_clip": 1.05114293, + "balance_loss_mlp": 1.03605688, + "epoch": 0.09246956260333684, + "flos": 20592435669120.0, + "grad_norm": 1.9480116987165197, + "language_loss": 0.80702311, + "learning_rate": 3.959203908195741e-06, + "loss": 0.82918191, + "num_input_tokens_seen": 32848275, + "step": 1538, + "time_per_iteration": 2.71618390083313 + }, + { + "auxiliary_loss_clip": 0.01060495, + "auxiliary_loss_mlp": 0.0101237, + "balance_loss_clip": 1.03095436, + "balance_loss_mlp": 1.00872231, + "epoch": 0.09252968585600481, + "flos": 67558710614400.0, + "grad_norm": 0.7534074452314953, + "language_loss": 0.57429332, + "learning_rate": 3.959125609588142e-06, + "loss": 0.59502202, + "num_input_tokens_seen": 32917730, + "step": 1539, + "time_per_iteration": 3.3933441638946533 + }, + { + "auxiliary_loss_clip": 0.01159831, + "auxiliary_loss_mlp": 0.01050602, + "balance_loss_clip": 1.05638027, + "balance_loss_mlp": 1.02863121, + "epoch": 0.09258980910867277, + "flos": 17383853479680.0, + "grad_norm": 2.849299216868502, + "language_loss": 0.67554641, + "learning_rate": 3.959047236690304e-06, + "loss": 0.69765073, + "num_input_tokens_seen": 32934910, + "step": 1540, + "time_per_iteration": 2.757084608078003 + }, + { + "auxiliary_loss_clip": 0.01144239, + "auxiliary_loss_mlp": 0.01048444, + "balance_loss_clip": 1.04954028, + "balance_loss_mlp": 1.026438, + "epoch": 0.09264993236134075, + "flos": 19865927185920.0, + "grad_norm": 2.044335478602743, + "language_loss": 0.83917534, + "learning_rate": 3.958968789505198e-06, + "loss": 0.86110216, + "num_input_tokens_seen": 32953840, + "step": 1541, + "time_per_iteration": 2.8497180938720703 + }, + { + "auxiliary_loss_clip": 0.01077839, + "auxiliary_loss_mlp": 0.01013078, + "balance_loss_clip": 1.02602255, + "balance_loss_mlp": 1.0097636, + "epoch": 0.09271005561400872, + "flos": 62284401262080.0, + "grad_norm": 0.8790732834061692, + "language_loss": 0.61881655, + "learning_rate": 3.9588902680358e-06, + "loss": 0.63972563, + "num_input_tokens_seen": 33011410, + "step": 1542, + "time_per_iteration": 3.3079330921173096 + }, + { + "auxiliary_loss_clip": 0.01161232, + "auxiliary_loss_mlp": 0.01059438, + "balance_loss_clip": 1.05441117, + "balance_loss_mlp": 1.03808808, + "epoch": 0.09277017886667668, + "flos": 23329870139520.0, + "grad_norm": 1.6256118826429122, + "language_loss": 0.82802349, + "learning_rate": 3.958811672285086e-06, + "loss": 0.85023022, + "num_input_tokens_seen": 33031675, + "step": 1543, + "time_per_iteration": 2.7408807277679443 + }, + { + "auxiliary_loss_clip": 0.01135873, + "auxiliary_loss_mlp": 0.01060295, + "balance_loss_clip": 1.04848838, + "balance_loss_mlp": 1.03863442, + "epoch": 0.09283030211934466, + "flos": 54745169875200.0, + "grad_norm": 1.706948475246468, + "language_loss": 0.72265279, + "learning_rate": 3.958733002256038e-06, + "loss": 0.74461448, + "num_input_tokens_seen": 33056355, + "step": 1544, + "time_per_iteration": 3.104156255722046 + }, + { + "auxiliary_loss_clip": 0.01166071, + "auxiliary_loss_mlp": 0.01055881, + "balance_loss_clip": 1.05165935, + "balance_loss_mlp": 1.03138375, + "epoch": 0.09289042537201263, + "flos": 30334784762880.0, + "grad_norm": 1.7720844214030114, + "language_loss": 0.77286768, + "learning_rate": 3.958654257951637e-06, + "loss": 0.79508722, + "num_input_tokens_seen": 33079520, + "step": 1545, + "time_per_iteration": 2.808180570602417 + }, + { + "auxiliary_loss_clip": 0.01140161, + "auxiliary_loss_mlp": 0.01050495, + "balance_loss_clip": 1.0526737, + "balance_loss_mlp": 1.02872682, + "epoch": 0.09295054862468059, + "flos": 17746838369280.0, + "grad_norm": 2.7089619481030076, + "language_loss": 0.74396008, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.76586664, + "num_input_tokens_seen": 33096135, + "step": 1546, + "time_per_iteration": 2.7634081840515137 + }, + { + "auxiliary_loss_clip": 0.01163775, + "auxiliary_loss_mlp": 0.0105305, + "balance_loss_clip": 1.05357957, + "balance_loss_mlp": 1.02956545, + "epoch": 0.09301067187734856, + "flos": 23658021815040.0, + "grad_norm": 1.9423225100503794, + "language_loss": 0.84200966, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.86417794, + "num_input_tokens_seen": 33115245, + "step": 1547, + "time_per_iteration": 2.790003776550293 + }, + { + "auxiliary_loss_clip": 0.01141839, + "auxiliary_loss_mlp": 0.01053941, + "balance_loss_clip": 1.04740989, + "balance_loss_mlp": 1.03195918, + "epoch": 0.09307079513001654, + "flos": 27527719777920.0, + "grad_norm": 2.6545433694843488, + "language_loss": 0.67698336, + "learning_rate": 3.958417579416199e-06, + "loss": 0.69894123, + "num_input_tokens_seen": 33136640, + "step": 1548, + "time_per_iteration": 2.8367013931274414 + }, + { + "auxiliary_loss_clip": 0.01123899, + "auxiliary_loss_mlp": 0.01059885, + "balance_loss_clip": 1.04744387, + "balance_loss_mlp": 1.03754544, + "epoch": 0.0931309183826845, + "flos": 20627340710400.0, + "grad_norm": 1.6829727803454704, + "language_loss": 0.8326273, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.85446513, + "num_input_tokens_seen": 33155060, + "step": 1549, + "time_per_iteration": 2.8462016582489014 + }, + { + "auxiliary_loss_clip": 0.01176243, + "auxiliary_loss_mlp": 0.0104617, + "balance_loss_clip": 1.05815506, + "balance_loss_mlp": 1.02473652, + "epoch": 0.09319104163535247, + "flos": 29020921084800.0, + "grad_norm": 1.5528514681372962, + "language_loss": 0.75838119, + "learning_rate": 3.958259422403966e-06, + "loss": 0.78060532, + "num_input_tokens_seen": 33175420, + "step": 1550, + "time_per_iteration": 2.7325351238250732 + }, + { + "auxiliary_loss_clip": 0.01150315, + "auxiliary_loss_mlp": 0.01069257, + "balance_loss_clip": 1.05249369, + "balance_loss_mlp": 1.04483092, + "epoch": 0.09325116488802045, + "flos": 25301545539840.0, + "grad_norm": 2.1922696027472233, + "language_loss": 0.82828665, + "learning_rate": 3.95818023251026e-06, + "loss": 0.85048234, + "num_input_tokens_seen": 33194120, + "step": 1551, + "time_per_iteration": 2.852602481842041 + }, + { + "auxiliary_loss_clip": 0.01064371, + "auxiliary_loss_mlp": 0.00760109, + "balance_loss_clip": 1.02203059, + "balance_loss_mlp": 0.99984246, + "epoch": 0.09331128814068841, + "flos": 61536203942400.0, + "grad_norm": 0.7384225982202158, + "language_loss": 0.61837572, + "learning_rate": 3.958100968362163e-06, + "loss": 0.63662052, + "num_input_tokens_seen": 33261080, + "step": 1552, + "time_per_iteration": 3.3453099727630615 + }, + { + "auxiliary_loss_clip": 0.01059175, + "auxiliary_loss_mlp": 0.01016654, + "balance_loss_clip": 1.02415061, + "balance_loss_mlp": 1.01338792, + "epoch": 0.09337141139335638, + "flos": 53293700171520.0, + "grad_norm": 0.8524917480784928, + "language_loss": 0.58986926, + "learning_rate": 3.958021629962681e-06, + "loss": 0.61062753, + "num_input_tokens_seen": 33330235, + "step": 1553, + "time_per_iteration": 3.37673282623291 + }, + { + "auxiliary_loss_clip": 0.01146955, + "auxiliary_loss_mlp": 0.01056683, + "balance_loss_clip": 1.05026984, + "balance_loss_mlp": 1.03336585, + "epoch": 0.09343153464602436, + "flos": 23476852592640.0, + "grad_norm": 2.3365109182487, + "language_loss": 0.87665397, + "learning_rate": 3.957942217314823e-06, + "loss": 0.8986904, + "num_input_tokens_seen": 33349035, + "step": 1554, + "time_per_iteration": 2.8098127841949463 + }, + { + "auxiliary_loss_clip": 0.01153047, + "auxiliary_loss_mlp": 0.01057257, + "balance_loss_clip": 1.05439448, + "balance_loss_mlp": 1.03393972, + "epoch": 0.09349165789869232, + "flos": 19353481804800.0, + "grad_norm": 4.388884220182432, + "language_loss": 0.81678319, + "learning_rate": 3.957862730421599e-06, + "loss": 0.83888626, + "num_input_tokens_seen": 33368060, + "step": 1555, + "time_per_iteration": 2.726207971572876 + }, + { + "auxiliary_loss_clip": 0.01058869, + "auxiliary_loss_mlp": 0.01003892, + "balance_loss_clip": 1.0202632, + "balance_loss_mlp": 1.00045919, + "epoch": 0.09355178115136029, + "flos": 67502580635520.0, + "grad_norm": 0.8683826280274983, + "language_loss": 0.59606886, + "learning_rate": 3.957783169286024e-06, + "loss": 0.61669648, + "num_input_tokens_seen": 33430825, + "step": 1556, + "time_per_iteration": 3.209326982498169 + }, + { + "auxiliary_loss_clip": 0.01174249, + "auxiliary_loss_mlp": 0.01059741, + "balance_loss_clip": 1.05518138, + "balance_loss_mlp": 1.03727031, + "epoch": 0.09361190440402825, + "flos": 37341638720640.0, + "grad_norm": 1.6803158790244075, + "language_loss": 0.84290808, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.86524796, + "num_input_tokens_seen": 33454855, + "step": 1557, + "time_per_iteration": 2.831650733947754 + }, + { + "auxiliary_loss_clip": 0.01110857, + "auxiliary_loss_mlp": 0.01065156, + "balance_loss_clip": 1.04900038, + "balance_loss_mlp": 1.04112351, + "epoch": 0.09367202765669623, + "flos": 24899705112960.0, + "grad_norm": 1.6725809358966677, + "language_loss": 0.780913, + "learning_rate": 3.957623824299893e-06, + "loss": 0.8026731, + "num_input_tokens_seen": 33476000, + "step": 1558, + "time_per_iteration": 3.0111780166625977 + }, + { + "auxiliary_loss_clip": 0.01164994, + "auxiliary_loss_mlp": 0.01051229, + "balance_loss_clip": 1.0558666, + "balance_loss_mlp": 1.02881753, + "epoch": 0.0937321509093642, + "flos": 15705568368000.0, + "grad_norm": 2.0141986314124414, + "language_loss": 0.80066288, + "learning_rate": 3.957544040455379e-06, + "loss": 0.82282507, + "num_input_tokens_seen": 33493845, + "step": 1559, + "time_per_iteration": 3.024117946624756 + }, + { + "auxiliary_loss_clip": 0.01141277, + "auxiliary_loss_mlp": 0.01061718, + "balance_loss_clip": 1.05060387, + "balance_loss_mlp": 1.04012942, + "epoch": 0.09379227416203216, + "flos": 20483698222080.0, + "grad_norm": 1.8358373674042003, + "language_loss": 0.76418209, + "learning_rate": 3.957464182380599e-06, + "loss": 0.78621197, + "num_input_tokens_seen": 33510850, + "step": 1560, + "time_per_iteration": 2.68558406829834 + }, + { + "auxiliary_loss_clip": 0.01137939, + "auxiliary_loss_mlp": 0.01054925, + "balance_loss_clip": 1.05014277, + "balance_loss_mlp": 1.03213274, + "epoch": 0.09385239741470014, + "flos": 24352498344960.0, + "grad_norm": 3.575155933252121, + "language_loss": 0.80784953, + "learning_rate": 3.95738425007858e-06, + "loss": 0.82977819, + "num_input_tokens_seen": 33530430, + "step": 1561, + "time_per_iteration": 2.759148359298706 + }, + { + "auxiliary_loss_clip": 0.01173652, + "auxiliary_loss_mlp": 0.01052448, + "balance_loss_clip": 1.05276573, + "balance_loss_mlp": 1.02989376, + "epoch": 0.0939125206673681, + "flos": 33291489807360.0, + "grad_norm": 2.448664627367939, + "language_loss": 0.6140722, + "learning_rate": 3.957304243552354e-06, + "loss": 0.63633323, + "num_input_tokens_seen": 33551975, + "step": 1562, + "time_per_iteration": 2.9014978408813477 + }, + { + "auxiliary_loss_clip": 0.01162693, + "auxiliary_loss_mlp": 0.0106374, + "balance_loss_clip": 1.05719543, + "balance_loss_mlp": 1.04213953, + "epoch": 0.09397264392003607, + "flos": 19244923925760.0, + "grad_norm": 3.5098220300578555, + "language_loss": 0.8496151, + "learning_rate": 3.957224162804956e-06, + "loss": 0.87187934, + "num_input_tokens_seen": 33569850, + "step": 1563, + "time_per_iteration": 4.404061555862427 + }, + { + "auxiliary_loss_clip": 0.01164811, + "auxiliary_loss_mlp": 0.01047932, + "balance_loss_clip": 1.05775142, + "balance_loss_mlp": 1.02652228, + "epoch": 0.09403276717270405, + "flos": 19317930318720.0, + "grad_norm": 1.6765528861156813, + "language_loss": 0.76511294, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.78724039, + "num_input_tokens_seen": 33590510, + "step": 1564, + "time_per_iteration": 4.255565166473389 + }, + { + "auxiliary_loss_clip": 0.01151297, + "auxiliary_loss_mlp": 0.01063256, + "balance_loss_clip": 1.05196142, + "balance_loss_mlp": 1.04172707, + "epoch": 0.09409289042537201, + "flos": 23583471137280.0, + "grad_norm": 1.9762038777899962, + "language_loss": 0.80134326, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.82348871, + "num_input_tokens_seen": 33608810, + "step": 1565, + "time_per_iteration": 2.8548545837402344 + }, + { + "auxiliary_loss_clip": 0.01158602, + "auxiliary_loss_mlp": 0.01063767, + "balance_loss_clip": 1.05420566, + "balance_loss_mlp": 1.04233313, + "epoch": 0.09415301367803998, + "flos": 20078446003200.0, + "grad_norm": 1.6810250981626251, + "language_loss": 0.75134379, + "learning_rate": 3.956983475266103e-06, + "loss": 0.77356744, + "num_input_tokens_seen": 33627265, + "step": 1566, + "time_per_iteration": 4.889045715332031 + }, + { + "auxiliary_loss_clip": 0.01145856, + "auxiliary_loss_mlp": 0.00780689, + "balance_loss_clip": 1.05168366, + "balance_loss_mlp": 1.00022864, + "epoch": 0.09421313693070796, + "flos": 21062075016960.0, + "grad_norm": 1.6828919748843199, + "language_loss": 0.77958012, + "learning_rate": 3.956903097664407e-06, + "loss": 0.79884553, + "num_input_tokens_seen": 33644810, + "step": 1567, + "time_per_iteration": 4.445765972137451 + }, + { + "auxiliary_loss_clip": 0.01156815, + "auxiliary_loss_mlp": 0.01056228, + "balance_loss_clip": 1.05256855, + "balance_loss_mlp": 1.03591454, + "epoch": 0.09427326018337592, + "flos": 24316156759680.0, + "grad_norm": 2.008686295040646, + "language_loss": 0.82608044, + "learning_rate": 3.956822645856749e-06, + "loss": 0.84821093, + "num_input_tokens_seen": 33665665, + "step": 1568, + "time_per_iteration": 2.881535768508911 + }, + { + "auxiliary_loss_clip": 0.01187915, + "auxiliary_loss_mlp": 0.01051731, + "balance_loss_clip": 1.05717778, + "balance_loss_mlp": 1.02927184, + "epoch": 0.09433338343604389, + "flos": 20263888944000.0, + "grad_norm": 1.9573151026586577, + "language_loss": 0.76943743, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.79183388, + "num_input_tokens_seen": 33684760, + "step": 1569, + "time_per_iteration": 2.6097726821899414 + }, + { + "auxiliary_loss_clip": 0.01120191, + "auxiliary_loss_mlp": 0.01060805, + "balance_loss_clip": 1.04771852, + "balance_loss_mlp": 1.03625941, + "epoch": 0.09439350668871185, + "flos": 12742973493120.0, + "grad_norm": 3.3813700161908917, + "language_loss": 0.85488856, + "learning_rate": 3.956661519635756e-06, + "loss": 0.87669849, + "num_input_tokens_seen": 33700750, + "step": 1570, + "time_per_iteration": 2.7571377754211426 + }, + { + "auxiliary_loss_clip": 0.01122458, + "auxiliary_loss_mlp": 0.01055939, + "balance_loss_clip": 1.04927301, + "balance_loss_mlp": 1.03183508, + "epoch": 0.09445362994137983, + "flos": 25962266263680.0, + "grad_norm": 1.540414635950846, + "language_loss": 0.76415235, + "learning_rate": 3.95658084522853e-06, + "loss": 0.7859363, + "num_input_tokens_seen": 33724430, + "step": 1571, + "time_per_iteration": 2.913569211959839 + }, + { + "auxiliary_loss_clip": 0.01135683, + "auxiliary_loss_mlp": 0.01057111, + "balance_loss_clip": 1.0490278, + "balance_loss_mlp": 1.0349735, + "epoch": 0.0945137531940478, + "flos": 19715353372800.0, + "grad_norm": 1.6745378641752047, + "language_loss": 0.79397607, + "learning_rate": 3.956500096627561e-06, + "loss": 0.81590402, + "num_input_tokens_seen": 33743455, + "step": 1572, + "time_per_iteration": 2.813410758972168 + }, + { + "auxiliary_loss_clip": 0.01148251, + "auxiliary_loss_mlp": 0.0106927, + "balance_loss_clip": 1.05619979, + "balance_loss_mlp": 1.04524922, + "epoch": 0.09457387644671576, + "flos": 23617047375360.0, + "grad_norm": 1.7559396294879055, + "language_loss": 0.87707287, + "learning_rate": 3.956419273835913e-06, + "loss": 0.89924812, + "num_input_tokens_seen": 33763435, + "step": 1573, + "time_per_iteration": 2.776535987854004 + }, + { + "auxiliary_loss_clip": 0.01161183, + "auxiliary_loss_mlp": 0.01063326, + "balance_loss_clip": 1.05485129, + "balance_loss_mlp": 1.03804219, + "epoch": 0.09463399969938374, + "flos": 26907291135360.0, + "grad_norm": 2.9707854698090097, + "language_loss": 0.81982428, + "learning_rate": 3.95633837685665e-06, + "loss": 0.84206939, + "num_input_tokens_seen": 33784325, + "step": 1574, + "time_per_iteration": 2.7604806423187256 + }, + { + "auxiliary_loss_clip": 0.01156287, + "auxiliary_loss_mlp": 0.01055594, + "balance_loss_clip": 1.05234718, + "balance_loss_mlp": 1.0344342, + "epoch": 0.0946941229520517, + "flos": 23659566099840.0, + "grad_norm": 1.7178511535677499, + "language_loss": 0.80855322, + "learning_rate": 3.95625740569284e-06, + "loss": 0.83067203, + "num_input_tokens_seen": 33802510, + "step": 1575, + "time_per_iteration": 2.713247299194336 + }, + { + "auxiliary_loss_clip": 0.01182326, + "auxiliary_loss_mlp": 0.01068689, + "balance_loss_clip": 1.05578864, + "balance_loss_mlp": 1.04581285, + "epoch": 0.09475424620471967, + "flos": 24134053783680.0, + "grad_norm": 1.9110861379460222, + "language_loss": 0.86483347, + "learning_rate": 3.956176360347553e-06, + "loss": 0.88734365, + "num_input_tokens_seen": 33819980, + "step": 1576, + "time_per_iteration": 2.682644844055176 + }, + { + "auxiliary_loss_clip": 0.01056441, + "auxiliary_loss_mlp": 0.01027284, + "balance_loss_clip": 1.0225811, + "balance_loss_mlp": 1.02344561, + "epoch": 0.09481436945738765, + "flos": 68426168065920.0, + "grad_norm": 0.9789918611127905, + "language_loss": 0.6582402, + "learning_rate": 3.956095240823862e-06, + "loss": 0.67907751, + "num_input_tokens_seen": 33878925, + "step": 1577, + "time_per_iteration": 3.2106685638427734 + }, + { + "auxiliary_loss_clip": 0.01147668, + "auxiliary_loss_mlp": 0.01051958, + "balance_loss_clip": 1.05218005, + "balance_loss_mlp": 1.03098869, + "epoch": 0.09487449271005562, + "flos": 16654076858880.0, + "grad_norm": 1.8223175005615506, + "language_loss": 0.79152733, + "learning_rate": 3.956014047124844e-06, + "loss": 0.81352365, + "num_input_tokens_seen": 33897600, + "step": 1578, + "time_per_iteration": 2.820089340209961 + }, + { + "auxiliary_loss_clip": 0.01185941, + "auxiliary_loss_mlp": 0.01066432, + "balance_loss_clip": 1.05838132, + "balance_loss_mlp": 1.04437804, + "epoch": 0.09493461596272358, + "flos": 24275685110400.0, + "grad_norm": 3.480730999818176, + "language_loss": 0.78161818, + "learning_rate": 3.955932779253578e-06, + "loss": 0.80414188, + "num_input_tokens_seen": 33917365, + "step": 1579, + "time_per_iteration": 2.6518983840942383 + }, + { + "auxiliary_loss_clip": 0.01128319, + "auxiliary_loss_mlp": 0.01065633, + "balance_loss_clip": 1.04771328, + "balance_loss_mlp": 1.04001498, + "epoch": 0.09499473921539155, + "flos": 21870173243520.0, + "grad_norm": 2.0084876987684526, + "language_loss": 0.73410392, + "learning_rate": 3.955851437213144e-06, + "loss": 0.75604343, + "num_input_tokens_seen": 33936680, + "step": 1580, + "time_per_iteration": 2.679461717605591 + }, + { + "auxiliary_loss_clip": 0.01157568, + "auxiliary_loss_mlp": 0.01062628, + "balance_loss_clip": 1.05573344, + "balance_loss_mlp": 1.04095626, + "epoch": 0.09505486246805953, + "flos": 33547137880320.0, + "grad_norm": 14.809542792179553, + "language_loss": 0.77565914, + "learning_rate": 3.955770021006627e-06, + "loss": 0.7978611, + "num_input_tokens_seen": 33960685, + "step": 1581, + "time_per_iteration": 2.765394449234009 + }, + { + "auxiliary_loss_clip": 0.01144835, + "auxiliary_loss_mlp": 0.0106468, + "balance_loss_clip": 1.05426359, + "balance_loss_mlp": 1.04276967, + "epoch": 0.09511498572072749, + "flos": 21215342350080.0, + "grad_norm": 1.8617167187056045, + "language_loss": 0.87230825, + "learning_rate": 3.955688530637116e-06, + "loss": 0.89440346, + "num_input_tokens_seen": 33980015, + "step": 1582, + "time_per_iteration": 2.691364288330078 + }, + { + "auxiliary_loss_clip": 0.01174295, + "auxiliary_loss_mlp": 0.0106431, + "balance_loss_clip": 1.05508888, + "balance_loss_mlp": 1.04039705, + "epoch": 0.09517510897339546, + "flos": 14611262572800.0, + "grad_norm": 1.8512060219658202, + "language_loss": 0.67043924, + "learning_rate": 3.955606966107699e-06, + "loss": 0.69282532, + "num_input_tokens_seen": 33997705, + "step": 1583, + "time_per_iteration": 2.6693732738494873 + }, + { + "auxiliary_loss_clip": 0.01177751, + "auxiliary_loss_mlp": 0.01053743, + "balance_loss_clip": 1.0593859, + "balance_loss_mlp": 1.03035378, + "epoch": 0.09523523222606343, + "flos": 27817339138560.0, + "grad_norm": 2.144216926782962, + "language_loss": 0.70752859, + "learning_rate": 3.95552532742147e-06, + "loss": 0.7298435, + "num_input_tokens_seen": 34017465, + "step": 1584, + "time_per_iteration": 2.7164390087127686 + }, + { + "auxiliary_loss_clip": 0.01138507, + "auxiliary_loss_mlp": 0.0105762, + "balance_loss_clip": 1.05243039, + "balance_loss_mlp": 1.03584039, + "epoch": 0.0952953554787314, + "flos": 20706272847360.0, + "grad_norm": 1.4654737580846544, + "language_loss": 0.8080442, + "learning_rate": 3.955443614581525e-06, + "loss": 0.83000553, + "num_input_tokens_seen": 34038550, + "step": 1585, + "time_per_iteration": 2.879831314086914 + }, + { + "auxiliary_loss_clip": 0.01159374, + "auxiliary_loss_mlp": 0.01057717, + "balance_loss_clip": 1.05387473, + "balance_loss_mlp": 1.03355336, + "epoch": 0.09535547873139937, + "flos": 24787627701120.0, + "grad_norm": 1.638250735795891, + "language_loss": 0.71921158, + "learning_rate": 3.955361827590961e-06, + "loss": 0.74138248, + "num_input_tokens_seen": 34058665, + "step": 1586, + "time_per_iteration": 2.750436544418335 + }, + { + "auxiliary_loss_clip": 0.01048565, + "auxiliary_loss_mlp": 0.01003302, + "balance_loss_clip": 1.03115988, + "balance_loss_mlp": 0.99901009, + "epoch": 0.09541560198406734, + "flos": 71912194905600.0, + "grad_norm": 0.8099482252624973, + "language_loss": 0.55475175, + "learning_rate": 3.955279966452883e-06, + "loss": 0.57527041, + "num_input_tokens_seen": 34109655, + "step": 1587, + "time_per_iteration": 3.0975699424743652 + }, + { + "auxiliary_loss_clip": 0.01128884, + "auxiliary_loss_mlp": 0.0105965, + "balance_loss_clip": 1.04768586, + "balance_loss_mlp": 1.03661847, + "epoch": 0.09547572523673531, + "flos": 28982604251520.0, + "grad_norm": 1.708481785076906, + "language_loss": 0.81062275, + "learning_rate": 3.955198031170391e-06, + "loss": 0.83250809, + "num_input_tokens_seen": 34131115, + "step": 1588, + "time_per_iteration": 2.7718451023101807 + }, + { + "auxiliary_loss_clip": 0.01131602, + "auxiliary_loss_mlp": 0.01056117, + "balance_loss_clip": 1.04894614, + "balance_loss_mlp": 1.03438473, + "epoch": 0.09553584848940327, + "flos": 24133910129280.0, + "grad_norm": 1.5119879232668088, + "language_loss": 0.81481898, + "learning_rate": 3.955116021746594e-06, + "loss": 0.83669615, + "num_input_tokens_seen": 34151925, + "step": 1589, + "time_per_iteration": 2.782468795776367 + }, + { + "auxiliary_loss_clip": 0.0112194, + "auxiliary_loss_mlp": 0.00780573, + "balance_loss_clip": 1.0508883, + "balance_loss_mlp": 1.00013089, + "epoch": 0.09559597174207124, + "flos": 42851376789120.0, + "grad_norm": 1.525287399882202, + "language_loss": 0.64882791, + "learning_rate": 3.955033938184601e-06, + "loss": 0.667853, + "num_input_tokens_seen": 34175395, + "step": 1590, + "time_per_iteration": 3.0783450603485107 + }, + { + "auxiliary_loss_clip": 0.01143501, + "auxiliary_loss_mlp": 0.01058399, + "balance_loss_clip": 1.05087948, + "balance_loss_mlp": 1.0358206, + "epoch": 0.09565609499473922, + "flos": 32670845683200.0, + "grad_norm": 2.0745314237741916, + "language_loss": 0.83290577, + "learning_rate": 3.954951780487526e-06, + "loss": 0.85492468, + "num_input_tokens_seen": 34197760, + "step": 1591, + "time_per_iteration": 2.8393962383270264 + }, + { + "auxiliary_loss_clip": 0.01163486, + "auxiliary_loss_mlp": 0.01065588, + "balance_loss_clip": 1.0522387, + "balance_loss_mlp": 1.04266405, + "epoch": 0.09571621824740718, + "flos": 18478410670080.0, + "grad_norm": 2.825705290827541, + "language_loss": 0.74087322, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.76316392, + "num_input_tokens_seen": 34215330, + "step": 1592, + "time_per_iteration": 2.6828882694244385 + }, + { + "auxiliary_loss_clip": 0.01169239, + "auxiliary_loss_mlp": 0.01055073, + "balance_loss_clip": 1.05161428, + "balance_loss_mlp": 1.03337741, + "epoch": 0.09577634150007515, + "flos": 29387497334400.0, + "grad_norm": 2.18277080043521, + "language_loss": 0.74483889, + "learning_rate": 3.954787242700592e-06, + "loss": 0.76708198, + "num_input_tokens_seen": 34237745, + "step": 1593, + "time_per_iteration": 2.7193498611450195 + }, + { + "auxiliary_loss_clip": 0.01177343, + "auxiliary_loss_mlp": 0.01055096, + "balance_loss_clip": 1.05910873, + "balance_loss_mlp": 1.03307831, + "epoch": 0.09583646475274313, + "flos": 22747830157440.0, + "grad_norm": 1.887493467708827, + "language_loss": 0.69782627, + "learning_rate": 3.954704862616971e-06, + "loss": 0.72015071, + "num_input_tokens_seen": 34256565, + "step": 1594, + "time_per_iteration": 2.635383367538452 + }, + { + "auxiliary_loss_clip": 0.01173222, + "auxiliary_loss_mlp": 0.01051806, + "balance_loss_clip": 1.05618978, + "balance_loss_mlp": 1.03037214, + "epoch": 0.0958965880054111, + "flos": 23218367345280.0, + "grad_norm": 2.1411006117727682, + "language_loss": 0.82780552, + "learning_rate": 3.954622408410747e-06, + "loss": 0.85005581, + "num_input_tokens_seen": 34275970, + "step": 1595, + "time_per_iteration": 2.7158257961273193 + }, + { + "auxiliary_loss_clip": 0.01153253, + "auxiliary_loss_mlp": 0.01054246, + "balance_loss_clip": 1.05143809, + "balance_loss_mlp": 1.0301652, + "epoch": 0.09595671125807906, + "flos": 21324438933120.0, + "grad_norm": 1.7751890788987925, + "language_loss": 0.84513396, + "learning_rate": 3.954539880085045e-06, + "loss": 0.86720896, + "num_input_tokens_seen": 34295490, + "step": 1596, + "time_per_iteration": 2.710228204727173 + }, + { + "auxiliary_loss_clip": 0.01166586, + "auxiliary_loss_mlp": 0.0105804, + "balance_loss_clip": 1.05440903, + "balance_loss_mlp": 1.03376901, + "epoch": 0.09601683451074704, + "flos": 39603472185600.0, + "grad_norm": 1.8335529067237837, + "language_loss": 0.69328064, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.71552688, + "num_input_tokens_seen": 34319990, + "step": 1597, + "time_per_iteration": 2.802959442138672 + }, + { + "auxiliary_loss_clip": 0.01167235, + "auxiliary_loss_mlp": 0.00780978, + "balance_loss_clip": 1.0503217, + "balance_loss_mlp": 1.00010371, + "epoch": 0.096076957763415, + "flos": 23732716147200.0, + "grad_norm": 2.0491570740921885, + "language_loss": 0.7486403, + "learning_rate": 3.954374601087729e-06, + "loss": 0.76812243, + "num_input_tokens_seen": 34339225, + "step": 1598, + "time_per_iteration": 2.6502270698547363 + }, + { + "auxiliary_loss_clip": 0.01176661, + "auxiliary_loss_mlp": 0.01053936, + "balance_loss_clip": 1.05745888, + "balance_loss_mlp": 1.03009462, + "epoch": 0.09613708101608297, + "flos": 34678108483200.0, + "grad_norm": 1.6831440826618358, + "language_loss": 0.68804371, + "learning_rate": 3.954291850422382e-06, + "loss": 0.71034968, + "num_input_tokens_seen": 34361020, + "step": 1599, + "time_per_iteration": 2.74243426322937 + }, + { + "auxiliary_loss_clip": 0.01157322, + "auxiliary_loss_mlp": 0.01059883, + "balance_loss_clip": 1.05754852, + "balance_loss_mlp": 1.0371263, + "epoch": 0.09619720426875093, + "flos": 20740028653440.0, + "grad_norm": 2.9774251326108367, + "language_loss": 0.83950365, + "learning_rate": 3.954209025650093e-06, + "loss": 0.86167574, + "num_input_tokens_seen": 34378630, + "step": 1600, + "time_per_iteration": 2.702907085418701 + }, + { + "auxiliary_loss_clip": 0.01150263, + "auxiliary_loss_mlp": 0.01054168, + "balance_loss_clip": 1.05129707, + "balance_loss_mlp": 1.03093433, + "epoch": 0.09625732752141891, + "flos": 13042720488960.0, + "grad_norm": 2.287254549480118, + "language_loss": 0.80520785, + "learning_rate": 3.954126126774001e-06, + "loss": 0.82725215, + "num_input_tokens_seen": 34397110, + "step": 1601, + "time_per_iteration": 2.693399429321289 + }, + { + "auxiliary_loss_clip": 0.01181247, + "auxiliary_loss_mlp": 0.01054578, + "balance_loss_clip": 1.05711937, + "balance_loss_mlp": 1.03133249, + "epoch": 0.09631745077408688, + "flos": 22273629782400.0, + "grad_norm": 2.4356926646094954, + "language_loss": 0.81959623, + "learning_rate": 3.954043153797251e-06, + "loss": 0.84195447, + "num_input_tokens_seen": 34414165, + "step": 1602, + "time_per_iteration": 2.639479875564575 + }, + { + "auxiliary_loss_clip": 0.01137855, + "auxiliary_loss_mlp": 0.01051495, + "balance_loss_clip": 1.05295444, + "balance_loss_mlp": 1.02681863, + "epoch": 0.09637757402675484, + "flos": 24754266944640.0, + "grad_norm": 3.099164686790191, + "language_loss": 0.62498438, + "learning_rate": 3.953960106722989e-06, + "loss": 0.64687788, + "num_input_tokens_seen": 34434445, + "step": 1603, + "time_per_iteration": 4.341834306716919 + }, + { + "auxiliary_loss_clip": 0.01189954, + "auxiliary_loss_mlp": 0.01054376, + "balance_loss_clip": 1.05902839, + "balance_loss_mlp": 1.02918696, + "epoch": 0.09643769727942282, + "flos": 22525758322560.0, + "grad_norm": 3.121905357886113, + "language_loss": 0.70996022, + "learning_rate": 3.953876985554364e-06, + "loss": 0.73240346, + "num_input_tokens_seen": 34453095, + "step": 1604, + "time_per_iteration": 2.6520893573760986 + }, + { + "auxiliary_loss_clip": 0.01176446, + "auxiliary_loss_mlp": 0.01055314, + "balance_loss_clip": 1.0570209, + "balance_loss_mlp": 1.03358221, + "epoch": 0.09649782053209079, + "flos": 30921026636160.0, + "grad_norm": 2.082890345500055, + "language_loss": 0.7993719, + "learning_rate": 3.953793790294527e-06, + "loss": 0.82168949, + "num_input_tokens_seen": 34473680, + "step": 1605, + "time_per_iteration": 4.5557661056518555 + }, + { + "auxiliary_loss_clip": 0.01161047, + "auxiliary_loss_mlp": 0.01047918, + "balance_loss_clip": 1.05455577, + "balance_loss_mlp": 1.0245893, + "epoch": 0.09655794378475875, + "flos": 25337635729920.0, + "grad_norm": 1.990204665194141, + "language_loss": 0.74550986, + "learning_rate": 3.953710520946634e-06, + "loss": 0.76759952, + "num_input_tokens_seen": 34492610, + "step": 1606, + "time_per_iteration": 2.7172651290893555 + }, + { + "auxiliary_loss_clip": 0.01172416, + "auxiliary_loss_mlp": 0.01046772, + "balance_loss_clip": 1.05834222, + "balance_loss_mlp": 1.02378857, + "epoch": 0.09661806703742673, + "flos": 22346061557760.0, + "grad_norm": 1.6403710807101601, + "language_loss": 0.7571919, + "learning_rate": 3.953627177513843e-06, + "loss": 0.77938372, + "num_input_tokens_seen": 34511855, + "step": 1607, + "time_per_iteration": 4.302686452865601 + }, + { + "auxiliary_loss_clip": 0.01139491, + "auxiliary_loss_mlp": 0.01051546, + "balance_loss_clip": 1.04833579, + "balance_loss_mlp": 1.0289799, + "epoch": 0.0966781902900947, + "flos": 17457578144640.0, + "grad_norm": 1.975850982703557, + "language_loss": 0.86756283, + "learning_rate": 3.953543759999312e-06, + "loss": 0.88947326, + "num_input_tokens_seen": 34528905, + "step": 1608, + "time_per_iteration": 2.6280455589294434 + }, + { + "auxiliary_loss_clip": 0.01126253, + "auxiliary_loss_mlp": 0.01064704, + "balance_loss_clip": 1.05433142, + "balance_loss_mlp": 1.03940821, + "epoch": 0.09673831354276266, + "flos": 36903995412480.0, + "grad_norm": 2.3082762386200266, + "language_loss": 0.71363097, + "learning_rate": 3.953460268406207e-06, + "loss": 0.73554057, + "num_input_tokens_seen": 34548480, + "step": 1609, + "time_per_iteration": 2.9116146564483643 + }, + { + "auxiliary_loss_clip": 0.01149353, + "auxiliary_loss_mlp": 0.01058179, + "balance_loss_clip": 1.0546515, + "balance_loss_mlp": 1.03606534, + "epoch": 0.09679843679543064, + "flos": 20701388597760.0, + "grad_norm": 1.9988414994799784, + "language_loss": 0.84810984, + "learning_rate": 3.953376702737693e-06, + "loss": 0.87018514, + "num_input_tokens_seen": 34565410, + "step": 1610, + "time_per_iteration": 2.8005051612854004 + }, + { + "auxiliary_loss_clip": 0.01161389, + "auxiliary_loss_mlp": 0.01056267, + "balance_loss_clip": 1.05790925, + "balance_loss_mlp": 1.03228188, + "epoch": 0.0968585600480986, + "flos": 23514415240320.0, + "grad_norm": 2.176236379770122, + "language_loss": 0.6696198, + "learning_rate": 3.953293062996939e-06, + "loss": 0.69179636, + "num_input_tokens_seen": 34584840, + "step": 1611, + "time_per_iteration": 2.731931447982788 + }, + { + "auxiliary_loss_clip": 0.01125259, + "auxiliary_loss_mlp": 0.01057116, + "balance_loss_clip": 1.04740572, + "balance_loss_mlp": 1.03385806, + "epoch": 0.09691868330076657, + "flos": 20121072468480.0, + "grad_norm": 1.6508278294088392, + "language_loss": 0.81067657, + "learning_rate": 3.953209349187115e-06, + "loss": 0.83250034, + "num_input_tokens_seen": 34603360, + "step": 1612, + "time_per_iteration": 2.7998390197753906 + }, + { + "auxiliary_loss_clip": 0.01182404, + "auxiliary_loss_mlp": 0.01069551, + "balance_loss_clip": 1.06046534, + "balance_loss_mlp": 1.04600716, + "epoch": 0.09697880655343454, + "flos": 16544692967040.0, + "grad_norm": 3.304939197664143, + "language_loss": 0.80836105, + "learning_rate": 3.953125561311398e-06, + "loss": 0.83088064, + "num_input_tokens_seen": 34620760, + "step": 1613, + "time_per_iteration": 2.624218702316284 + }, + { + "auxiliary_loss_clip": 0.01148565, + "auxiliary_loss_mlp": 0.01054743, + "balance_loss_clip": 1.05542159, + "balance_loss_mlp": 1.03047192, + "epoch": 0.09703892980610251, + "flos": 26104184899200.0, + "grad_norm": 1.7164386274315457, + "language_loss": 0.84289789, + "learning_rate": 3.953041699372964e-06, + "loss": 0.86493099, + "num_input_tokens_seen": 34640695, + "step": 1614, + "time_per_iteration": 2.744340419769287 + }, + { + "auxiliary_loss_clip": 0.01066618, + "auxiliary_loss_mlp": 0.00759744, + "balance_loss_clip": 1.02654934, + "balance_loss_mlp": 1.00008702, + "epoch": 0.09709905305877048, + "flos": 60443622000000.0, + "grad_norm": 0.7127167896900892, + "language_loss": 0.54629624, + "learning_rate": 3.952957763374992e-06, + "loss": 0.56455994, + "num_input_tokens_seen": 34702395, + "step": 1615, + "time_per_iteration": 3.1547679901123047 + }, + { + "auxiliary_loss_clip": 0.01033143, + "auxiliary_loss_mlp": 0.01017555, + "balance_loss_clip": 1.02384067, + "balance_loss_mlp": 1.01381195, + "epoch": 0.09715917631143844, + "flos": 57639932893440.0, + "grad_norm": 0.7689373847786285, + "language_loss": 0.58190405, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60241103, + "num_input_tokens_seen": 34768910, + "step": 1616, + "time_per_iteration": 3.3940556049346924 + }, + { + "auxiliary_loss_clip": 0.01155533, + "auxiliary_loss_mlp": 0.01067983, + "balance_loss_clip": 1.05504358, + "balance_loss_mlp": 1.04205465, + "epoch": 0.09721929956410642, + "flos": 20558212986240.0, + "grad_norm": 1.8932449927934136, + "language_loss": 0.69031835, + "learning_rate": 3.952789669213172e-06, + "loss": 0.7125535, + "num_input_tokens_seen": 34787680, + "step": 1617, + "time_per_iteration": 2.714629888534546 + }, + { + "auxiliary_loss_clip": 0.01152637, + "auxiliary_loss_mlp": 0.01057882, + "balance_loss_clip": 1.05386162, + "balance_loss_mlp": 1.03127456, + "epoch": 0.09727942281677439, + "flos": 27344359825920.0, + "grad_norm": 1.755493071880773, + "language_loss": 0.80910909, + "learning_rate": 3.952705511055698e-06, + "loss": 0.83121431, + "num_input_tokens_seen": 34808330, + "step": 1618, + "time_per_iteration": 2.8081507682800293 + }, + { + "auxiliary_loss_clip": 0.01168356, + "auxiliary_loss_mlp": 0.01058179, + "balance_loss_clip": 1.06048679, + "balance_loss_mlp": 1.03678131, + "epoch": 0.09733954606944235, + "flos": 24900028335360.0, + "grad_norm": 1.667659488760432, + "language_loss": 0.92901695, + "learning_rate": 3.952621278851435e-06, + "loss": 0.95128226, + "num_input_tokens_seen": 34830020, + "step": 1619, + "time_per_iteration": 2.7752275466918945 + }, + { + "auxiliary_loss_clip": 0.01175515, + "auxiliary_loss_mlp": 0.01058252, + "balance_loss_clip": 1.05952573, + "balance_loss_mlp": 1.03512526, + "epoch": 0.09739966932211033, + "flos": 31503928544640.0, + "grad_norm": 2.1973967195348902, + "language_loss": 0.88978708, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.91212475, + "num_input_tokens_seen": 34850330, + "step": 1620, + "time_per_iteration": 2.771176338195801 + }, + { + "auxiliary_loss_clip": 0.01153763, + "auxiliary_loss_mlp": 0.01065329, + "balance_loss_clip": 1.05353975, + "balance_loss_mlp": 1.0397464, + "epoch": 0.0974597925747783, + "flos": 23878764846720.0, + "grad_norm": 2.154793183838835, + "language_loss": 0.77331412, + "learning_rate": 3.952452592315324e-06, + "loss": 0.79550499, + "num_input_tokens_seen": 34871640, + "step": 1621, + "time_per_iteration": 2.6740832328796387 + }, + { + "auxiliary_loss_clip": 0.01131342, + "auxiliary_loss_mlp": 0.01082359, + "balance_loss_clip": 1.04798269, + "balance_loss_mlp": 1.05640674, + "epoch": 0.09751991582744626, + "flos": 17019575700480.0, + "grad_norm": 1.9420195171733425, + "language_loss": 0.77671158, + "learning_rate": 3.952368137989871e-06, + "loss": 0.79884863, + "num_input_tokens_seen": 34888100, + "step": 1622, + "time_per_iteration": 2.7247347831726074 + }, + { + "auxiliary_loss_clip": 0.01150185, + "auxiliary_loss_mlp": 0.01064277, + "balance_loss_clip": 1.05335355, + "balance_loss_mlp": 1.04025626, + "epoch": 0.09758003908011423, + "flos": 28402826826240.0, + "grad_norm": 1.8603109065807166, + "language_loss": 0.85784447, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.87998909, + "num_input_tokens_seen": 34910485, + "step": 1623, + "time_per_iteration": 2.785388469696045 + }, + { + "auxiliary_loss_clip": 0.0117659, + "auxiliary_loss_mlp": 0.01064102, + "balance_loss_clip": 1.05769634, + "balance_loss_mlp": 1.04043913, + "epoch": 0.09764016233278221, + "flos": 18144297336960.0, + "grad_norm": 2.39630116599036, + "language_loss": 0.80534065, + "learning_rate": 3.952199007240184e-06, + "loss": 0.82774758, + "num_input_tokens_seen": 34928615, + "step": 1624, + "time_per_iteration": 2.6818184852600098 + }, + { + "auxiliary_loss_clip": 0.01176335, + "auxiliary_loss_mlp": 0.01056788, + "balance_loss_clip": 1.05616927, + "balance_loss_mlp": 1.03465128, + "epoch": 0.09770028558545017, + "flos": 15265842071040.0, + "grad_norm": 2.44379144971104, + "language_loss": 0.85556966, + "learning_rate": 3.952114330822364e-06, + "loss": 0.8779009, + "num_input_tokens_seen": 34946045, + "step": 1625, + "time_per_iteration": 2.6594324111938477 + }, + { + "auxiliary_loss_clip": 0.01181411, + "auxiliary_loss_mlp": 0.0106682, + "balance_loss_clip": 1.06004012, + "balance_loss_mlp": 1.04411101, + "epoch": 0.09776040883811814, + "flos": 23472435219840.0, + "grad_norm": 2.058269503362464, + "language_loss": 0.85431635, + "learning_rate": 3.952029580380172e-06, + "loss": 0.87679869, + "num_input_tokens_seen": 34962865, + "step": 1626, + "time_per_iteration": 2.7384841442108154 + }, + { + "auxiliary_loss_clip": 0.01165311, + "auxiliary_loss_mlp": 0.007823, + "balance_loss_clip": 1.05467701, + "balance_loss_mlp": 1.000211, + "epoch": 0.09782053209078612, + "flos": 24499480798080.0, + "grad_norm": 2.0701580273163036, + "language_loss": 0.83370024, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.85317636, + "num_input_tokens_seen": 34983505, + "step": 1627, + "time_per_iteration": 2.8269948959350586 + }, + { + "auxiliary_loss_clip": 0.01168188, + "auxiliary_loss_mlp": 0.01065332, + "balance_loss_clip": 1.05557203, + "balance_loss_mlp": 1.04275417, + "epoch": 0.09788065534345408, + "flos": 21580158833280.0, + "grad_norm": 1.8143281262319713, + "language_loss": 0.84674478, + "learning_rate": 3.951859857435534e-06, + "loss": 0.86907995, + "num_input_tokens_seen": 35001825, + "step": 1628, + "time_per_iteration": 2.6151821613311768 + }, + { + "auxiliary_loss_clip": 0.01170257, + "auxiliary_loss_mlp": 0.01058367, + "balance_loss_clip": 1.05374515, + "balance_loss_mlp": 1.03558636, + "epoch": 0.09794077859612205, + "flos": 23842459175040.0, + "grad_norm": 1.5658807312485334, + "language_loss": 0.75531614, + "learning_rate": 3.951774884939523e-06, + "loss": 0.77760237, + "num_input_tokens_seen": 35023075, + "step": 1629, + "time_per_iteration": 2.6794557571411133 + }, + { + "auxiliary_loss_clip": 0.01129604, + "auxiliary_loss_mlp": 0.01056904, + "balance_loss_clip": 1.0577755, + "balance_loss_mlp": 1.03169131, + "epoch": 0.09800090184879003, + "flos": 23659889322240.0, + "grad_norm": 1.6755762488260617, + "language_loss": 0.78487194, + "learning_rate": 3.951689838432013e-06, + "loss": 0.80673707, + "num_input_tokens_seen": 35043480, + "step": 1630, + "time_per_iteration": 2.7986228466033936 + }, + { + "auxiliary_loss_clip": 0.01167766, + "auxiliary_loss_mlp": 0.01063441, + "balance_loss_clip": 1.05938148, + "balance_loss_mlp": 1.03804946, + "epoch": 0.09806102510145799, + "flos": 17055773631360.0, + "grad_norm": 1.8175370389297836, + "language_loss": 0.86677933, + "learning_rate": 3.951604717916228e-06, + "loss": 0.88909143, + "num_input_tokens_seen": 35061490, + "step": 1631, + "time_per_iteration": 2.6350157260894775 + }, + { + "auxiliary_loss_clip": 0.01171369, + "auxiliary_loss_mlp": 0.01058643, + "balance_loss_clip": 1.0610745, + "balance_loss_mlp": 1.03625536, + "epoch": 0.09812114835412596, + "flos": 23878477537920.0, + "grad_norm": 2.2030333753544773, + "language_loss": 0.82996809, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.85226822, + "num_input_tokens_seen": 35079670, + "step": 1632, + "time_per_iteration": 2.7990314960479736 + }, + { + "auxiliary_loss_clip": 0.01148453, + "auxiliary_loss_mlp": 0.01064004, + "balance_loss_clip": 1.05554819, + "balance_loss_mlp": 1.04102039, + "epoch": 0.09818127160679392, + "flos": 20595488325120.0, + "grad_norm": 1.531777801288569, + "language_loss": 0.7882973, + "learning_rate": 3.951434254872751e-06, + "loss": 0.81042188, + "num_input_tokens_seen": 35099205, + "step": 1633, + "time_per_iteration": 2.735353708267212 + }, + { + "auxiliary_loss_clip": 0.01170992, + "auxiliary_loss_mlp": 0.01061681, + "balance_loss_clip": 1.05558002, + "balance_loss_mlp": 1.03731489, + "epoch": 0.0982413948594619, + "flos": 15487339288320.0, + "grad_norm": 2.4037572513069687, + "language_loss": 0.73209554, + "learning_rate": 3.951348912351521e-06, + "loss": 0.75442231, + "num_input_tokens_seen": 35115270, + "step": 1634, + "time_per_iteration": 2.688596248626709 + }, + { + "auxiliary_loss_clip": 0.01162743, + "auxiliary_loss_mlp": 0.01071164, + "balance_loss_clip": 1.05591321, + "balance_loss_mlp": 1.04672611, + "epoch": 0.09830151811212987, + "flos": 24207958016640.0, + "grad_norm": 3.2244021303311405, + "language_loss": 0.72553629, + "learning_rate": 3.951263495834947e-06, + "loss": 0.74787533, + "num_input_tokens_seen": 35134065, + "step": 1635, + "time_per_iteration": 2.720266342163086 + }, + { + "auxiliary_loss_clip": 0.01154765, + "auxiliary_loss_mlp": 0.01068349, + "balance_loss_clip": 1.05526268, + "balance_loss_mlp": 1.04177701, + "epoch": 0.09836164136479783, + "flos": 20594590485120.0, + "grad_norm": 1.7699592352066487, + "language_loss": 0.78026646, + "learning_rate": 3.951178005326264e-06, + "loss": 0.80249763, + "num_input_tokens_seen": 35154870, + "step": 1636, + "time_per_iteration": 2.9618239402770996 + }, + { + "auxiliary_loss_clip": 0.01162744, + "auxiliary_loss_mlp": 0.01060716, + "balance_loss_clip": 1.05561686, + "balance_loss_mlp": 1.0368979, + "epoch": 0.09842176461746581, + "flos": 19934157070080.0, + "grad_norm": 1.8332710343018006, + "language_loss": 0.69524407, + "learning_rate": 3.951092440828715e-06, + "loss": 0.71747863, + "num_input_tokens_seen": 35171850, + "step": 1637, + "time_per_iteration": 2.671178102493286 + }, + { + "auxiliary_loss_clip": 0.01188316, + "auxiliary_loss_mlp": 0.01058851, + "balance_loss_clip": 1.05926394, + "balance_loss_mlp": 1.03500926, + "epoch": 0.09848188787013377, + "flos": 21214659991680.0, + "grad_norm": 2.3775286970935503, + "language_loss": 0.77050996, + "learning_rate": 3.951006802345545e-06, + "loss": 0.79298162, + "num_input_tokens_seen": 35188795, + "step": 1638, + "time_per_iteration": 2.62457537651062 + }, + { + "auxiliary_loss_clip": 0.01140265, + "auxiliary_loss_mlp": 0.01052026, + "balance_loss_clip": 1.05538166, + "balance_loss_mlp": 1.02941203, + "epoch": 0.09854201112280174, + "flos": 30154226071680.0, + "grad_norm": 1.4014263071342075, + "language_loss": 0.72620296, + "learning_rate": 3.950921089880003e-06, + "loss": 0.74812591, + "num_input_tokens_seen": 35212100, + "step": 1639, + "time_per_iteration": 2.7499618530273438 + }, + { + "auxiliary_loss_clip": 0.01173752, + "auxiliary_loss_mlp": 0.01051382, + "balance_loss_clip": 1.0582087, + "balance_loss_mlp": 1.02831531, + "epoch": 0.09860213437546972, + "flos": 21795730306560.0, + "grad_norm": 1.7213189449892274, + "language_loss": 0.88679075, + "learning_rate": 3.950835303435337e-06, + "loss": 0.90904212, + "num_input_tokens_seen": 35230390, + "step": 1640, + "time_per_iteration": 2.664133071899414 + }, + { + "auxiliary_loss_clip": 0.01177786, + "auxiliary_loss_mlp": 0.01044457, + "balance_loss_clip": 1.05981517, + "balance_loss_mlp": 1.02130616, + "epoch": 0.09866225762813768, + "flos": 21835555511040.0, + "grad_norm": 2.0701766566296915, + "language_loss": 0.80567038, + "learning_rate": 3.950749443014801e-06, + "loss": 0.82789278, + "num_input_tokens_seen": 35250405, + "step": 1641, + "time_per_iteration": 2.645353317260742 + }, + { + "auxiliary_loss_clip": 0.011756, + "auxiliary_loss_mlp": 0.01062641, + "balance_loss_clip": 1.05896795, + "balance_loss_mlp": 1.03742838, + "epoch": 0.09872238088080565, + "flos": 17599855916160.0, + "grad_norm": 2.64335263522248, + "language_loss": 0.86117625, + "learning_rate": 3.95066350862165e-06, + "loss": 0.88355863, + "num_input_tokens_seen": 35262820, + "step": 1642, + "time_per_iteration": 5.81004524230957 + }, + { + "auxiliary_loss_clip": 0.01151329, + "auxiliary_loss_mlp": 0.01056693, + "balance_loss_clip": 1.05857074, + "balance_loss_mlp": 1.03404331, + "epoch": 0.09878250413347361, + "flos": 27636134002560.0, + "grad_norm": 2.7092208079201607, + "language_loss": 0.8058275, + "learning_rate": 3.950577500259144e-06, + "loss": 0.82790768, + "num_input_tokens_seen": 35284490, + "step": 1643, + "time_per_iteration": 2.7235090732574463 + }, + { + "auxiliary_loss_clip": 0.01174075, + "auxiliary_loss_mlp": 0.01077435, + "balance_loss_clip": 1.05761337, + "balance_loss_mlp": 1.05470192, + "epoch": 0.0988426273861416, + "flos": 16544728880640.0, + "grad_norm": 2.0561742686210676, + "language_loss": 0.82546467, + "learning_rate": 3.950491417930543e-06, + "loss": 0.84797978, + "num_input_tokens_seen": 35302815, + "step": 1644, + "time_per_iteration": 4.318823575973511 + }, + { + "auxiliary_loss_clip": 0.01163142, + "auxiliary_loss_mlp": 0.00782463, + "balance_loss_clip": 1.05607629, + "balance_loss_mlp": 1.00010633, + "epoch": 0.09890275063880956, + "flos": 21215270522880.0, + "grad_norm": 1.6945489721625269, + "language_loss": 0.68219113, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.70164716, + "num_input_tokens_seen": 35321175, + "step": 1645, + "time_per_iteration": 2.6626670360565186 + }, + { + "auxiliary_loss_clip": 0.01059795, + "auxiliary_loss_mlp": 0.01047617, + "balance_loss_clip": 1.02852345, + "balance_loss_mlp": 1.04404068, + "epoch": 0.09896287389147752, + "flos": 59379372910080.0, + "grad_norm": 0.8512889940087613, + "language_loss": 0.60885167, + "learning_rate": 3.950319031388119e-06, + "loss": 0.62992585, + "num_input_tokens_seen": 35381740, + "step": 1646, + "time_per_iteration": 4.752669095993042 + }, + { + "auxiliary_loss_clip": 0.01147006, + "auxiliary_loss_mlp": 0.0105976, + "balance_loss_clip": 1.0574733, + "balance_loss_mlp": 1.03464222, + "epoch": 0.0990229971441455, + "flos": 29642678530560.0, + "grad_norm": 5.785751121573768, + "language_loss": 0.73211443, + "learning_rate": 3.950232727180833e-06, + "loss": 0.7541821, + "num_input_tokens_seen": 35403760, + "step": 1647, + "time_per_iteration": 2.783442974090576 + }, + { + "auxiliary_loss_clip": 0.01161789, + "auxiliary_loss_mlp": 0.01066314, + "balance_loss_clip": 1.06016421, + "balance_loss_mlp": 1.04445136, + "epoch": 0.09908312039681347, + "flos": 21834873152640.0, + "grad_norm": 1.828428298130997, + "language_loss": 0.84094375, + "learning_rate": 3.950146349020525e-06, + "loss": 0.86322474, + "num_input_tokens_seen": 35424050, + "step": 1648, + "time_per_iteration": 2.709559679031372 + }, + { + "auxiliary_loss_clip": 0.01065954, + "auxiliary_loss_mlp": 0.01020799, + "balance_loss_clip": 1.02565169, + "balance_loss_mlp": 1.01722264, + "epoch": 0.09914324364948143, + "flos": 57564304807680.0, + "grad_norm": 0.7317434537206132, + "language_loss": 0.55672908, + "learning_rate": 3.950059896910473e-06, + "loss": 0.5775966, + "num_input_tokens_seen": 35481690, + "step": 1649, + "time_per_iteration": 3.0944156646728516 + }, + { + "auxiliary_loss_clip": 0.0117133, + "auxiliary_loss_mlp": 0.01049543, + "balance_loss_clip": 1.05603158, + "balance_loss_mlp": 1.02723897, + "epoch": 0.09920336690214941, + "flos": 34123934476800.0, + "grad_norm": 2.195431109372502, + "language_loss": 0.8975327, + "learning_rate": 3.949973370853954e-06, + "loss": 0.91974139, + "num_input_tokens_seen": 35498635, + "step": 1650, + "time_per_iteration": 2.7438554763793945 + }, + { + "auxiliary_loss_clip": 0.01033978, + "auxiliary_loss_mlp": 0.00758727, + "balance_loss_clip": 1.02943921, + "balance_loss_mlp": 0.9997822, + "epoch": 0.09926349015481738, + "flos": 71216428464000.0, + "grad_norm": 0.8036050505402587, + "language_loss": 0.63734978, + "learning_rate": 3.94988677085425e-06, + "loss": 0.65527683, + "num_input_tokens_seen": 35565720, + "step": 1651, + "time_per_iteration": 3.40269136428833 + }, + { + "auxiliary_loss_clip": 0.01170347, + "auxiliary_loss_mlp": 0.01062486, + "balance_loss_clip": 1.05790281, + "balance_loss_mlp": 1.03842974, + "epoch": 0.09932361340748534, + "flos": 23148700917120.0, + "grad_norm": 1.9744130417114842, + "language_loss": 0.88115525, + "learning_rate": 3.949800096914643e-06, + "loss": 0.90348363, + "num_input_tokens_seen": 35586000, + "step": 1652, + "time_per_iteration": 2.6695117950439453 + }, + { + "auxiliary_loss_clip": 0.0116773, + "auxiliary_loss_mlp": 0.01062073, + "balance_loss_clip": 1.06095552, + "balance_loss_mlp": 1.03895831, + "epoch": 0.09938373666015332, + "flos": 19828651847040.0, + "grad_norm": 2.166773052437996, + "language_loss": 0.81789082, + "learning_rate": 3.949713349038422e-06, + "loss": 0.84018886, + "num_input_tokens_seen": 35604355, + "step": 1653, + "time_per_iteration": 2.7136831283569336 + }, + { + "auxiliary_loss_clip": 0.01173152, + "auxiliary_loss_mlp": 0.00780466, + "balance_loss_clip": 1.05683279, + "balance_loss_mlp": 1.00016594, + "epoch": 0.09944385991282129, + "flos": 22090664880000.0, + "grad_norm": 1.662037391605293, + "language_loss": 0.79489207, + "learning_rate": 3.949626527228875e-06, + "loss": 0.81442821, + "num_input_tokens_seen": 35625495, + "step": 1654, + "time_per_iteration": 2.645875930786133 + }, + { + "auxiliary_loss_clip": 0.01187918, + "auxiliary_loss_mlp": 0.01056849, + "balance_loss_clip": 1.06405056, + "balance_loss_mlp": 1.03561759, + "epoch": 0.09950398316548925, + "flos": 19828867328640.0, + "grad_norm": 1.7263610037420916, + "language_loss": 0.81038272, + "learning_rate": 3.949539631489295e-06, + "loss": 0.83283037, + "num_input_tokens_seen": 35645030, + "step": 1655, + "time_per_iteration": 2.630404233932495 + }, + { + "auxiliary_loss_clip": 0.01181205, + "auxiliary_loss_mlp": 0.01055977, + "balance_loss_clip": 1.05679035, + "balance_loss_mlp": 1.03294599, + "epoch": 0.09956410641815722, + "flos": 25003701964800.0, + "grad_norm": 2.426795421082641, + "language_loss": 0.80429518, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.82666701, + "num_input_tokens_seen": 35664305, + "step": 1656, + "time_per_iteration": 2.6283950805664062 + }, + { + "auxiliary_loss_clip": 0.01170003, + "auxiliary_loss_mlp": 0.01061881, + "balance_loss_clip": 1.05787742, + "balance_loss_mlp": 1.03870714, + "epoch": 0.0996242296708252, + "flos": 19317714837120.0, + "grad_norm": 1.4960238412267362, + "language_loss": 0.89040691, + "learning_rate": 3.949365618233217e-06, + "loss": 0.91272575, + "num_input_tokens_seen": 35684060, + "step": 1657, + "time_per_iteration": 2.653674602508545 + }, + { + "auxiliary_loss_clip": 0.01165842, + "auxiliary_loss_mlp": 0.01057352, + "balance_loss_clip": 1.05830753, + "balance_loss_mlp": 1.0329144, + "epoch": 0.09968435292349316, + "flos": 21871609787520.0, + "grad_norm": 2.1866084372248062, + "language_loss": 0.84684521, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.86907715, + "num_input_tokens_seen": 35703250, + "step": 1658, + "time_per_iteration": 2.6897473335266113 + }, + { + "auxiliary_loss_clip": 0.01069806, + "auxiliary_loss_mlp": 0.01015844, + "balance_loss_clip": 1.02042234, + "balance_loss_mlp": 1.01292348, + "epoch": 0.09974447617616113, + "flos": 65384533313280.0, + "grad_norm": 0.9123227767672076, + "language_loss": 0.60828507, + "learning_rate": 3.949191309296585e-06, + "loss": 0.62914157, + "num_input_tokens_seen": 35762165, + "step": 1659, + "time_per_iteration": 3.273890495300293 + }, + { + "auxiliary_loss_clip": 0.01152432, + "auxiliary_loss_mlp": 0.01051829, + "balance_loss_clip": 1.05082798, + "balance_loss_mlp": 1.02814245, + "epoch": 0.0998045994288291, + "flos": 23659817495040.0, + "grad_norm": 1.9344290476513741, + "language_loss": 0.84892076, + "learning_rate": 3.949104043956321e-06, + "loss": 0.87096334, + "num_input_tokens_seen": 35781520, + "step": 1660, + "time_per_iteration": 2.788018226623535 + }, + { + "auxiliary_loss_clip": 0.01149163, + "auxiliary_loss_mlp": 0.01060092, + "balance_loss_clip": 1.05374026, + "balance_loss_mlp": 1.03514171, + "epoch": 0.09986472268149707, + "flos": 19609704495360.0, + "grad_norm": 1.9493882663610318, + "language_loss": 0.80024737, + "learning_rate": 3.949016704705836e-06, + "loss": 0.82234001, + "num_input_tokens_seen": 35799565, + "step": 1661, + "time_per_iteration": 2.6537399291992188 + }, + { + "auxiliary_loss_clip": 0.01172787, + "auxiliary_loss_mlp": 0.01055532, + "balance_loss_clip": 1.05715156, + "balance_loss_mlp": 1.03153503, + "epoch": 0.09992484593416504, + "flos": 26213317395840.0, + "grad_norm": 2.0152235709188377, + "language_loss": 0.83560598, + "learning_rate": 3.948929291548443e-06, + "loss": 0.85788912, + "num_input_tokens_seen": 35821085, + "step": 1662, + "time_per_iteration": 2.753807783126831 + }, + { + "auxiliary_loss_clip": 0.01154838, + "auxiliary_loss_mlp": 0.01061466, + "balance_loss_clip": 1.05079484, + "balance_loss_mlp": 1.03616929, + "epoch": 0.09998496918683301, + "flos": 17493632421120.0, + "grad_norm": 1.9355779644050557, + "language_loss": 0.88865256, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.91081554, + "num_input_tokens_seen": 35839840, + "step": 1663, + "time_per_iteration": 2.6829047203063965 + }, + { + "auxiliary_loss_clip": 0.0118246, + "auxiliary_loss_mlp": 0.01061692, + "balance_loss_clip": 1.06228638, + "balance_loss_mlp": 1.03825521, + "epoch": 0.10004509243950098, + "flos": 22784925928320.0, + "grad_norm": 1.7925330820671084, + "language_loss": 0.70140731, + "learning_rate": 3.948754243526191e-06, + "loss": 0.72384882, + "num_input_tokens_seen": 35861545, + "step": 1664, + "time_per_iteration": 2.809300184249878 + }, + { + "auxiliary_loss_clip": 0.01142878, + "auxiliary_loss_mlp": 0.01055306, + "balance_loss_clip": 1.05475903, + "balance_loss_mlp": 1.03312087, + "epoch": 0.10010521569216894, + "flos": 16253385667200.0, + "grad_norm": 2.4978474602303895, + "language_loss": 0.78981555, + "learning_rate": 3.94866660866797e-06, + "loss": 0.81179744, + "num_input_tokens_seen": 35878295, + "step": 1665, + "time_per_iteration": 2.7010488510131836 + }, + { + "auxiliary_loss_clip": 0.01175861, + "auxiliary_loss_mlp": 0.01070341, + "balance_loss_clip": 1.06286561, + "balance_loss_mlp": 1.04742861, + "epoch": 0.10016533894483691, + "flos": 23402589223680.0, + "grad_norm": 3.1438625724360945, + "language_loss": 0.70054829, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.7230103, + "num_input_tokens_seen": 35898990, + "step": 1666, + "time_per_iteration": 2.689879894256592 + }, + { + "auxiliary_loss_clip": 0.01110848, + "auxiliary_loss_mlp": 0.01074593, + "balance_loss_clip": 1.05082703, + "balance_loss_mlp": 1.04946339, + "epoch": 0.10022546219750489, + "flos": 19354164163200.0, + "grad_norm": 1.7583449522195267, + "language_loss": 0.78647351, + "learning_rate": 3.948491117273956e-06, + "loss": 0.80832791, + "num_input_tokens_seen": 35916225, + "step": 1667, + "time_per_iteration": 2.8973352909088135 + }, + { + "auxiliary_loss_clip": 0.01153352, + "auxiliary_loss_mlp": 0.01062819, + "balance_loss_clip": 1.05452693, + "balance_loss_mlp": 1.03752255, + "epoch": 0.10028558545017285, + "flos": 27085766837760.0, + "grad_norm": 2.4011089045072187, + "language_loss": 0.77357388, + "learning_rate": 3.948403260744817e-06, + "loss": 0.7957356, + "num_input_tokens_seen": 35934630, + "step": 1668, + "time_per_iteration": 3.2600321769714355 + }, + { + "auxiliary_loss_clip": 0.01184879, + "auxiliary_loss_mlp": 0.01059367, + "balance_loss_clip": 1.05833495, + "balance_loss_mlp": 1.03523922, + "epoch": 0.10034570870284082, + "flos": 25847136195840.0, + "grad_norm": 1.7407668002390366, + "language_loss": 0.77520061, + "learning_rate": 3.948315330332031e-06, + "loss": 0.79764307, + "num_input_tokens_seen": 35953855, + "step": 1669, + "time_per_iteration": 2.6899471282958984 + }, + { + "auxiliary_loss_clip": 0.0118887, + "auxiliary_loss_mlp": 0.01067842, + "balance_loss_clip": 1.05948365, + "balance_loss_mlp": 1.04416728, + "epoch": 0.1004058319555088, + "flos": 26249587153920.0, + "grad_norm": 5.441134829238958, + "language_loss": 0.85160148, + "learning_rate": 3.948227326038933e-06, + "loss": 0.87416857, + "num_input_tokens_seen": 35974555, + "step": 1670, + "time_per_iteration": 2.616867780685425 + }, + { + "auxiliary_loss_clip": 0.011763, + "auxiliary_loss_mlp": 0.01055607, + "balance_loss_clip": 1.05584121, + "balance_loss_mlp": 1.03354108, + "epoch": 0.10046595520817676, + "flos": 25374480105600.0, + "grad_norm": 1.4849262119454174, + "language_loss": 0.76836258, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.79068166, + "num_input_tokens_seen": 35996830, + "step": 1671, + "time_per_iteration": 2.658254384994507 + }, + { + "auxiliary_loss_clip": 0.01061447, + "auxiliary_loss_mlp": 0.01017561, + "balance_loss_clip": 1.02178144, + "balance_loss_mlp": 1.01454473, + "epoch": 0.10052607846084473, + "flos": 67461821677440.0, + "grad_norm": 0.7781454358921105, + "language_loss": 0.60718858, + "learning_rate": 3.948051095825149e-06, + "loss": 0.62797856, + "num_input_tokens_seen": 36054465, + "step": 1672, + "time_per_iteration": 3.1269097328186035 + }, + { + "auxiliary_loss_clip": 0.01143177, + "auxiliary_loss_mlp": 0.01063346, + "balance_loss_clip": 1.05112922, + "balance_loss_mlp": 1.04055333, + "epoch": 0.10058620171351271, + "flos": 21360493209600.0, + "grad_norm": 2.433278134910662, + "language_loss": 0.7711426, + "learning_rate": 3.947962869911147e-06, + "loss": 0.79320776, + "num_input_tokens_seen": 36073480, + "step": 1673, + "time_per_iteration": 2.6931638717651367 + }, + { + "auxiliary_loss_clip": 0.01132094, + "auxiliary_loss_mlp": 0.01056611, + "balance_loss_clip": 1.04989302, + "balance_loss_mlp": 1.03262639, + "epoch": 0.10064632496618067, + "flos": 16800125558400.0, + "grad_norm": 2.074683072839241, + "language_loss": 0.73173523, + "learning_rate": 3.947874570130197e-06, + "loss": 0.75362229, + "num_input_tokens_seen": 36091830, + "step": 1674, + "time_per_iteration": 2.7188127040863037 + }, + { + "auxiliary_loss_clip": 0.01172389, + "auxiliary_loss_mlp": 0.00779533, + "balance_loss_clip": 1.0556165, + "balance_loss_mlp": 1.00024796, + "epoch": 0.10070644821884864, + "flos": 23624445576960.0, + "grad_norm": 2.1982379565146872, + "language_loss": 0.79456973, + "learning_rate": 3.947786196485649e-06, + "loss": 0.81408894, + "num_input_tokens_seen": 36111400, + "step": 1675, + "time_per_iteration": 2.712090253829956 + }, + { + "auxiliary_loss_clip": 0.01182659, + "auxiliary_loss_mlp": 0.01063327, + "balance_loss_clip": 1.05801332, + "balance_loss_mlp": 1.04239404, + "epoch": 0.1007665714715166, + "flos": 24462564595200.0, + "grad_norm": 2.408955682155161, + "language_loss": 0.8120935, + "learning_rate": 3.947697748980853e-06, + "loss": 0.83455336, + "num_input_tokens_seen": 36129345, + "step": 1676, + "time_per_iteration": 2.685472249984741 + }, + { + "auxiliary_loss_clip": 0.01175397, + "auxiliary_loss_mlp": 0.01057105, + "balance_loss_clip": 1.05950332, + "balance_loss_mlp": 1.03546858, + "epoch": 0.10082669472418458, + "flos": 16799119977600.0, + "grad_norm": 2.008035557658629, + "language_loss": 0.86132157, + "learning_rate": 3.947609227619163e-06, + "loss": 0.88364655, + "num_input_tokens_seen": 36146255, + "step": 1677, + "time_per_iteration": 2.6589157581329346 + }, + { + "auxiliary_loss_clip": 0.01162997, + "auxiliary_loss_mlp": 0.010508, + "balance_loss_clip": 1.05363441, + "balance_loss_mlp": 1.02896047, + "epoch": 0.10088681797685255, + "flos": 13553513844480.0, + "grad_norm": 2.160847391025828, + "language_loss": 0.86006588, + "learning_rate": 3.947520632403936e-06, + "loss": 0.88220382, + "num_input_tokens_seen": 36164050, + "step": 1678, + "time_per_iteration": 2.694347858428955 + }, + { + "auxiliary_loss_clip": 0.0116292, + "auxiliary_loss_mlp": 0.01056376, + "balance_loss_clip": 1.0587275, + "balance_loss_mlp": 1.03406048, + "epoch": 0.10094694122952051, + "flos": 25265706744960.0, + "grad_norm": 12.700254532531051, + "language_loss": 0.89978886, + "learning_rate": 3.947431963338532e-06, + "loss": 0.92198181, + "num_input_tokens_seen": 36183530, + "step": 1679, + "time_per_iteration": 2.6741397380828857 + }, + { + "auxiliary_loss_clip": 0.01071086, + "auxiliary_loss_mlp": 0.0101685, + "balance_loss_clip": 1.02328789, + "balance_loss_mlp": 1.01360798, + "epoch": 0.10100706448218849, + "flos": 69854299885440.0, + "grad_norm": 0.7882499243548835, + "language_loss": 0.52985126, + "learning_rate": 3.947343220426312e-06, + "loss": 0.55073065, + "num_input_tokens_seen": 36248550, + "step": 1680, + "time_per_iteration": 3.169893503189087 + }, + { + "auxiliary_loss_clip": 0.01185252, + "auxiliary_loss_mlp": 0.00779951, + "balance_loss_clip": 1.06022644, + "balance_loss_mlp": 1.00017488, + "epoch": 0.10106718773485646, + "flos": 20007163463040.0, + "grad_norm": 1.6642182724084642, + "language_loss": 0.76869059, + "learning_rate": 3.947254403670641e-06, + "loss": 0.7883426, + "num_input_tokens_seen": 36266065, + "step": 1681, + "time_per_iteration": 4.146950006484985 + }, + { + "auxiliary_loss_clip": 0.01156046, + "auxiliary_loss_mlp": 0.01059972, + "balance_loss_clip": 1.0539515, + "balance_loss_mlp": 1.03469992, + "epoch": 0.10112731098752442, + "flos": 13479825093120.0, + "grad_norm": 2.3884003317971225, + "language_loss": 0.93957508, + "learning_rate": 3.947165513074889e-06, + "loss": 0.96173531, + "num_input_tokens_seen": 36280960, + "step": 1682, + "time_per_iteration": 4.220505237579346 + }, + { + "auxiliary_loss_clip": 0.01173183, + "auxiliary_loss_mlp": 0.01053261, + "balance_loss_clip": 1.05487084, + "balance_loss_mlp": 1.03133821, + "epoch": 0.1011874342401924, + "flos": 18515901490560.0, + "grad_norm": 3.5300660189263917, + "language_loss": 0.87618893, + "learning_rate": 3.947076548642425e-06, + "loss": 0.89845335, + "num_input_tokens_seen": 36299010, + "step": 1683, + "time_per_iteration": 2.635636329650879 + }, + { + "auxiliary_loss_clip": 0.01128888, + "auxiliary_loss_mlp": 0.01063089, + "balance_loss_clip": 1.04814756, + "balance_loss_mlp": 1.04008126, + "epoch": 0.10124755749286037, + "flos": 20702861055360.0, + "grad_norm": 2.3337760241024923, + "language_loss": 0.74566805, + "learning_rate": 3.946987510376624e-06, + "loss": 0.76758784, + "num_input_tokens_seen": 36318400, + "step": 1684, + "time_per_iteration": 4.417364835739136 + }, + { + "auxiliary_loss_clip": 0.01053031, + "auxiliary_loss_mlp": 0.0101182, + "balance_loss_clip": 1.02547038, + "balance_loss_mlp": 1.00853014, + "epoch": 0.10130768074552833, + "flos": 56109456247680.0, + "grad_norm": 0.7564631726021327, + "language_loss": 0.61085057, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.63149905, + "num_input_tokens_seen": 36381815, + "step": 1685, + "time_per_iteration": 4.87179970741272 + }, + { + "auxiliary_loss_clip": 0.01157045, + "auxiliary_loss_mlp": 0.01056064, + "balance_loss_clip": 1.05233479, + "balance_loss_mlp": 1.0341655, + "epoch": 0.1013678039981963, + "flos": 33402346156800.0, + "grad_norm": 4.297801792672815, + "language_loss": 0.61381406, + "learning_rate": 3.946809212358516e-06, + "loss": 0.6359452, + "num_input_tokens_seen": 36404320, + "step": 1686, + "time_per_iteration": 2.8289108276367188 + }, + { + "auxiliary_loss_clip": 0.01144631, + "auxiliary_loss_mlp": 0.01059888, + "balance_loss_clip": 1.05645001, + "balance_loss_mlp": 1.03678524, + "epoch": 0.10142792725086427, + "flos": 31905338008320.0, + "grad_norm": 2.21923850158845, + "language_loss": 0.81216162, + "learning_rate": 3.946719952612972e-06, + "loss": 0.83420682, + "num_input_tokens_seen": 36427510, + "step": 1687, + "time_per_iteration": 2.947535276412964 + }, + { + "auxiliary_loss_clip": 0.0117612, + "auxiliary_loss_mlp": 0.0105614, + "balance_loss_clip": 1.05933213, + "balance_loss_mlp": 1.03403926, + "epoch": 0.10148805050353224, + "flos": 28475905046400.0, + "grad_norm": 1.7955898786084035, + "language_loss": 0.71943259, + "learning_rate": 3.94663061904761e-06, + "loss": 0.74175525, + "num_input_tokens_seen": 36448230, + "step": 1688, + "time_per_iteration": 2.693249225616455 + }, + { + "auxiliary_loss_clip": 0.01151953, + "auxiliary_loss_mlp": 0.01063362, + "balance_loss_clip": 1.05288756, + "balance_loss_mlp": 1.04079556, + "epoch": 0.1015481737562002, + "flos": 25148888737920.0, + "grad_norm": 2.636795901714516, + "language_loss": 0.86876953, + "learning_rate": 3.94654121166582e-06, + "loss": 0.89092261, + "num_input_tokens_seen": 36464395, + "step": 1689, + "time_per_iteration": 2.677992820739746 + }, + { + "auxiliary_loss_clip": 0.01172188, + "auxiliary_loss_mlp": 0.01057982, + "balance_loss_clip": 1.05476904, + "balance_loss_mlp": 1.0378834, + "epoch": 0.10160829700886818, + "flos": 30882781630080.0, + "grad_norm": 2.2211105929909696, + "language_loss": 0.88170946, + "learning_rate": 3.946451730470993e-06, + "loss": 0.90401113, + "num_input_tokens_seen": 36486475, + "step": 1690, + "time_per_iteration": 2.707209348678589 + }, + { + "auxiliary_loss_clip": 0.01158767, + "auxiliary_loss_mlp": 0.01052386, + "balance_loss_clip": 1.05507553, + "balance_loss_mlp": 1.02973664, + "epoch": 0.10166842026153615, + "flos": 20412020632320.0, + "grad_norm": 2.08291471600754, + "language_loss": 0.83348423, + "learning_rate": 3.946362175466521e-06, + "loss": 0.85559577, + "num_input_tokens_seen": 36505310, + "step": 1691, + "time_per_iteration": 2.6521170139312744 + }, + { + "auxiliary_loss_clip": 0.01162159, + "auxiliary_loss_mlp": 0.01051716, + "balance_loss_clip": 1.05550599, + "balance_loss_mlp": 1.03016281, + "epoch": 0.10172854351420411, + "flos": 33476968661760.0, + "grad_norm": 1.704519528530946, + "language_loss": 0.66773653, + "learning_rate": 3.946272546655801e-06, + "loss": 0.68987525, + "num_input_tokens_seen": 36529820, + "step": 1692, + "time_per_iteration": 2.799353837966919 + }, + { + "auxiliary_loss_clip": 0.01144502, + "auxiliary_loss_mlp": 0.0107473, + "balance_loss_clip": 1.05057836, + "balance_loss_mlp": 1.05258095, + "epoch": 0.1017886667668721, + "flos": 23550325862400.0, + "grad_norm": 1.8345924563029705, + "language_loss": 0.75939322, + "learning_rate": 3.94618284404223e-06, + "loss": 0.78158557, + "num_input_tokens_seen": 36549000, + "step": 1693, + "time_per_iteration": 2.6711113452911377 + }, + { + "auxiliary_loss_clip": 0.01132621, + "auxiliary_loss_mlp": 0.01057162, + "balance_loss_clip": 1.04893303, + "balance_loss_mlp": 1.03289056, + "epoch": 0.10184879001954006, + "flos": 23296078419840.0, + "grad_norm": 1.7027745569702395, + "language_loss": 0.87503564, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.89693356, + "num_input_tokens_seen": 36567515, + "step": 1694, + "time_per_iteration": 2.749119520187378 + }, + { + "auxiliary_loss_clip": 0.01130673, + "auxiliary_loss_mlp": 0.01058451, + "balance_loss_clip": 1.04954553, + "balance_loss_mlp": 1.033095, + "epoch": 0.10190891327220802, + "flos": 18333116156160.0, + "grad_norm": 1.7649462193878245, + "language_loss": 0.79299057, + "learning_rate": 3.946003217420147e-06, + "loss": 0.8148818, + "num_input_tokens_seen": 36586190, + "step": 1695, + "time_per_iteration": 2.839081048965454 + }, + { + "auxiliary_loss_clip": 0.0112732, + "auxiliary_loss_mlp": 0.01061103, + "balance_loss_clip": 1.04818296, + "balance_loss_mlp": 1.03772628, + "epoch": 0.10196903652487599, + "flos": 26465374108800.0, + "grad_norm": 2.7190993931598446, + "language_loss": 0.86494684, + "learning_rate": 3.945913293418447e-06, + "loss": 0.88683105, + "num_input_tokens_seen": 36607495, + "step": 1696, + "time_per_iteration": 2.7802348136901855 + }, + { + "auxiliary_loss_clip": 0.01168675, + "auxiliary_loss_mlp": 0.01054661, + "balance_loss_clip": 1.05711746, + "balance_loss_mlp": 1.03315568, + "epoch": 0.10202915977754397, + "flos": 21869526798720.0, + "grad_norm": 1.7889048836535288, + "language_loss": 0.82350796, + "learning_rate": 3.945823295627519e-06, + "loss": 0.84574133, + "num_input_tokens_seen": 36628555, + "step": 1697, + "time_per_iteration": 2.667962074279785 + }, + { + "auxiliary_loss_clip": 0.01184333, + "auxiliary_loss_mlp": 0.01055548, + "balance_loss_clip": 1.05680871, + "balance_loss_mlp": 1.033149, + "epoch": 0.10208928303021193, + "flos": 22309755886080.0, + "grad_norm": 2.0464291543972006, + "language_loss": 0.81198204, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.83438087, + "num_input_tokens_seen": 36646250, + "step": 1698, + "time_per_iteration": 2.6484432220458984 + }, + { + "auxiliary_loss_clip": 0.01150498, + "auxiliary_loss_mlp": 0.01053546, + "balance_loss_clip": 1.05696845, + "balance_loss_mlp": 1.03226686, + "epoch": 0.1021494062828799, + "flos": 22125569921280.0, + "grad_norm": 2.3020250981163226, + "language_loss": 0.75612724, + "learning_rate": 3.945643078691637e-06, + "loss": 0.77816761, + "num_input_tokens_seen": 36666675, + "step": 1699, + "time_per_iteration": 2.8040614128112793 + }, + { + "auxiliary_loss_clip": 0.01162088, + "auxiliary_loss_mlp": 0.01050379, + "balance_loss_clip": 1.06041551, + "balance_loss_mlp": 1.02827764, + "epoch": 0.10220952953554788, + "flos": 19646728439040.0, + "grad_norm": 1.6839869206777538, + "language_loss": 0.80395639, + "learning_rate": 3.945552859553516e-06, + "loss": 0.8260811, + "num_input_tokens_seen": 36685225, + "step": 1700, + "time_per_iteration": 2.6701290607452393 + }, + { + "auxiliary_loss_clip": 0.0117076, + "auxiliary_loss_mlp": 0.0104804, + "balance_loss_clip": 1.05714083, + "balance_loss_mlp": 1.02653444, + "epoch": 0.10226965278821584, + "flos": 29787290686080.0, + "grad_norm": 2.102621975458346, + "language_loss": 0.76877582, + "learning_rate": 3.945462566639836e-06, + "loss": 0.79096377, + "num_input_tokens_seen": 36705985, + "step": 1701, + "time_per_iteration": 2.748201847076416 + }, + { + "auxiliary_loss_clip": 0.01182259, + "auxiliary_loss_mlp": 0.01050364, + "balance_loss_clip": 1.06157088, + "balance_loss_mlp": 1.02852523, + "epoch": 0.10232977604088381, + "flos": 27016818681600.0, + "grad_norm": 2.1099726588763965, + "language_loss": 0.77922845, + "learning_rate": 3.945372199954019e-06, + "loss": 0.80155474, + "num_input_tokens_seen": 36725815, + "step": 1702, + "time_per_iteration": 2.6703274250030518 + }, + { + "auxiliary_loss_clip": 0.01156323, + "auxiliary_loss_mlp": 0.01052524, + "balance_loss_clip": 1.05596721, + "balance_loss_mlp": 1.03126872, + "epoch": 0.10238989929355179, + "flos": 20777519473920.0, + "grad_norm": 2.2326457826946293, + "language_loss": 0.94093609, + "learning_rate": 3.945281759499494e-06, + "loss": 0.96302462, + "num_input_tokens_seen": 36742345, + "step": 1703, + "time_per_iteration": 2.6712698936462402 + }, + { + "auxiliary_loss_clip": 0.01034483, + "auxiliary_loss_mlp": 0.01037784, + "balance_loss_clip": 1.02765131, + "balance_loss_mlp": 1.03315914, + "epoch": 0.10245002254621975, + "flos": 57698322451200.0, + "grad_norm": 0.8815387598011586, + "language_loss": 0.55096036, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57168299, + "num_input_tokens_seen": 36798775, + "step": 1704, + "time_per_iteration": 3.2863855361938477 + }, + { + "auxiliary_loss_clip": 0.01186822, + "auxiliary_loss_mlp": 0.01053701, + "balance_loss_clip": 1.06026638, + "balance_loss_mlp": 1.03088403, + "epoch": 0.10251014579888772, + "flos": 16800125558400.0, + "grad_norm": 2.051901555713709, + "language_loss": 0.84025991, + "learning_rate": 3.945100657298039e-06, + "loss": 0.86266518, + "num_input_tokens_seen": 36816295, + "step": 1705, + "time_per_iteration": 2.8991851806640625 + }, + { + "auxiliary_loss_clip": 0.01045354, + "auxiliary_loss_mlp": 0.01018361, + "balance_loss_clip": 1.02622223, + "balance_loss_mlp": 1.01526153, + "epoch": 0.1025702690515557, + "flos": 68565500922240.0, + "grad_norm": 0.7692746082941451, + "language_loss": 0.60408181, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.62471896, + "num_input_tokens_seen": 36882030, + "step": 1706, + "time_per_iteration": 3.2174558639526367 + }, + { + "auxiliary_loss_clip": 0.01149922, + "auxiliary_loss_mlp": 0.01051211, + "balance_loss_clip": 1.05388391, + "balance_loss_mlp": 1.02812052, + "epoch": 0.10263039230422366, + "flos": 14866623336960.0, + "grad_norm": 2.201796189576969, + "language_loss": 0.85937822, + "learning_rate": 3.94491926006294e-06, + "loss": 0.88138962, + "num_input_tokens_seen": 36899245, + "step": 1707, + "time_per_iteration": 2.689208507537842 + }, + { + "auxiliary_loss_clip": 0.01169165, + "auxiliary_loss_mlp": 0.0105297, + "balance_loss_clip": 1.05941081, + "balance_loss_mlp": 1.03114319, + "epoch": 0.10269051555689163, + "flos": 25337599816320.0, + "grad_norm": 1.471109036018689, + "language_loss": 0.73299325, + "learning_rate": 3.944828450816369e-06, + "loss": 0.75521457, + "num_input_tokens_seen": 36920950, + "step": 1708, + "time_per_iteration": 2.679760456085205 + }, + { + "auxiliary_loss_clip": 0.01155833, + "auxiliary_loss_mlp": 0.00780571, + "balance_loss_clip": 1.05718231, + "balance_loss_mlp": 1.00042295, + "epoch": 0.10275063880955959, + "flos": 21068826773760.0, + "grad_norm": 1.7051644476897239, + "language_loss": 0.91616452, + "learning_rate": 3.944737567821709e-06, + "loss": 0.93552846, + "num_input_tokens_seen": 36938900, + "step": 1709, + "time_per_iteration": 2.6754679679870605 + }, + { + "auxiliary_loss_clip": 0.01124911, + "auxiliary_loss_mlp": 0.01057008, + "balance_loss_clip": 1.05144072, + "balance_loss_mlp": 1.0343945, + "epoch": 0.10281076206222757, + "flos": 30366780802560.0, + "grad_norm": 2.1056252966717275, + "language_loss": 0.88004494, + "learning_rate": 3.944646611082406e-06, + "loss": 0.90186411, + "num_input_tokens_seen": 36957010, + "step": 1710, + "time_per_iteration": 2.708723306655884 + }, + { + "auxiliary_loss_clip": 0.01171004, + "auxiliary_loss_mlp": 0.0105967, + "balance_loss_clip": 1.05658317, + "balance_loss_mlp": 1.036973, + "epoch": 0.10287088531489554, + "flos": 22418313765120.0, + "grad_norm": 1.7046493271202992, + "language_loss": 0.79370153, + "learning_rate": 3.944555580601908e-06, + "loss": 0.81600821, + "num_input_tokens_seen": 36977690, + "step": 1711, + "time_per_iteration": 2.631908416748047 + }, + { + "auxiliary_loss_clip": 0.01156003, + "auxiliary_loss_mlp": 0.01055126, + "balance_loss_clip": 1.05841637, + "balance_loss_mlp": 1.03189242, + "epoch": 0.1029310085675635, + "flos": 25115994858240.0, + "grad_norm": 3.2168061349371135, + "language_loss": 0.73666596, + "learning_rate": 3.944464476383668e-06, + "loss": 0.75877726, + "num_input_tokens_seen": 36997300, + "step": 1712, + "time_per_iteration": 2.7107467651367188 + }, + { + "auxiliary_loss_clip": 0.01133407, + "auxiliary_loss_mlp": 0.01056055, + "balance_loss_clip": 1.05496907, + "balance_loss_mlp": 1.03334546, + "epoch": 0.10299113182023148, + "flos": 19865639877120.0, + "grad_norm": 1.974447377126898, + "language_loss": 0.87049067, + "learning_rate": 3.94437329843114e-06, + "loss": 0.89238536, + "num_input_tokens_seen": 37016110, + "step": 1713, + "time_per_iteration": 2.6532411575317383 + }, + { + "auxiliary_loss_clip": 0.0116832, + "auxiliary_loss_mlp": 0.01060237, + "balance_loss_clip": 1.05669498, + "balance_loss_mlp": 1.03877962, + "epoch": 0.10305125507289944, + "flos": 20447608032000.0, + "grad_norm": 1.57388574383124, + "language_loss": 0.72406238, + "learning_rate": 3.944282046747782e-06, + "loss": 0.74634796, + "num_input_tokens_seen": 37036405, + "step": 1714, + "time_per_iteration": 2.5987610816955566 + }, + { + "auxiliary_loss_clip": 0.01174482, + "auxiliary_loss_mlp": 0.01063165, + "balance_loss_clip": 1.05715692, + "balance_loss_mlp": 1.03934693, + "epoch": 0.10311137832556741, + "flos": 26250772302720.0, + "grad_norm": 2.1959530175190434, + "language_loss": 0.91065919, + "learning_rate": 3.944190721337053e-06, + "loss": 0.93303567, + "num_input_tokens_seen": 37057580, + "step": 1715, + "time_per_iteration": 2.743833303451538 + }, + { + "auxiliary_loss_clip": 0.01170297, + "auxiliary_loss_mlp": 0.01054891, + "balance_loss_clip": 1.05448914, + "balance_loss_mlp": 1.03305221, + "epoch": 0.10317150157823539, + "flos": 35298932175360.0, + "grad_norm": 1.8741123562687005, + "language_loss": 0.75969976, + "learning_rate": 3.944099322202418e-06, + "loss": 0.78195167, + "num_input_tokens_seen": 37079120, + "step": 1716, + "time_per_iteration": 2.748903274536133 + }, + { + "auxiliary_loss_clip": 0.01162664, + "auxiliary_loss_mlp": 0.01061895, + "balance_loss_clip": 1.05617428, + "balance_loss_mlp": 1.03804111, + "epoch": 0.10323162483090335, + "flos": 25739943033600.0, + "grad_norm": 3.178190042364093, + "language_loss": 0.85308528, + "learning_rate": 3.944007849347342e-06, + "loss": 0.87533092, + "num_input_tokens_seen": 37099710, + "step": 1717, + "time_per_iteration": 2.690772533416748 + }, + { + "auxiliary_loss_clip": 0.01127019, + "auxiliary_loss_mlp": 0.01067935, + "balance_loss_clip": 1.05048633, + "balance_loss_mlp": 1.04436755, + "epoch": 0.10329174808357132, + "flos": 16289870906880.0, + "grad_norm": 1.8474438265561113, + "language_loss": 0.82945001, + "learning_rate": 3.943916302775292e-06, + "loss": 0.85139954, + "num_input_tokens_seen": 37117775, + "step": 1718, + "time_per_iteration": 2.7029476165771484 + }, + { + "auxiliary_loss_clip": 0.01171184, + "auxiliary_loss_mlp": 0.01049869, + "balance_loss_clip": 1.05912328, + "balance_loss_mlp": 1.02701616, + "epoch": 0.10335187133623928, + "flos": 36687166963200.0, + "grad_norm": 1.7728224248964342, + "language_loss": 0.73396438, + "learning_rate": 3.943824682489742e-06, + "loss": 0.75617492, + "num_input_tokens_seen": 37140280, + "step": 1719, + "time_per_iteration": 2.7653820514678955 + }, + { + "auxiliary_loss_clip": 0.01168859, + "auxiliary_loss_mlp": 0.01048444, + "balance_loss_clip": 1.05861163, + "balance_loss_mlp": 1.02786827, + "epoch": 0.10341199458890726, + "flos": 14975648092800.0, + "grad_norm": 1.7819459058763836, + "language_loss": 0.92692196, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.94909501, + "num_input_tokens_seen": 37158350, + "step": 1720, + "time_per_iteration": 4.1962480545043945 + }, + { + "auxiliary_loss_clip": 0.01139894, + "auxiliary_loss_mlp": 0.01051033, + "balance_loss_clip": 1.05092323, + "balance_loss_mlp": 1.02827597, + "epoch": 0.10347211784157523, + "flos": 21031587348480.0, + "grad_norm": 1.6861044154168399, + "language_loss": 0.79497123, + "learning_rate": 3.943641220792039e-06, + "loss": 0.81688046, + "num_input_tokens_seen": 37177120, + "step": 1721, + "time_per_iteration": 4.524151802062988 + }, + { + "auxiliary_loss_clip": 0.01130482, + "auxiliary_loss_mlp": 0.01067754, + "balance_loss_clip": 1.05380797, + "balance_loss_mlp": 1.04109859, + "epoch": 0.1035322410942432, + "flos": 19792094780160.0, + "grad_norm": 1.951940775381607, + "language_loss": 0.80707669, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.829059, + "num_input_tokens_seen": 37195895, + "step": 1722, + "time_per_iteration": 2.7972562313079834 + }, + { + "auxiliary_loss_clip": 0.01059018, + "auxiliary_loss_mlp": 0.01038991, + "balance_loss_clip": 1.02668202, + "balance_loss_mlp": 1.03536737, + "epoch": 0.10359236434691117, + "flos": 52698874947840.0, + "grad_norm": 0.9413879826908518, + "language_loss": 0.67161834, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69259846, + "num_input_tokens_seen": 37247270, + "step": 1723, + "time_per_iteration": 4.899553060531616 + }, + { + "auxiliary_loss_clip": 0.01169875, + "auxiliary_loss_mlp": 0.01062977, + "balance_loss_clip": 1.05482125, + "balance_loss_mlp": 1.04193664, + "epoch": 0.10365248759957914, + "flos": 18405404277120.0, + "grad_norm": 2.8641520576523116, + "language_loss": 0.77715755, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.7994861, + "num_input_tokens_seen": 37265595, + "step": 1724, + "time_per_iteration": 2.7613437175750732 + }, + { + "auxiliary_loss_clip": 0.01151829, + "auxiliary_loss_mlp": 0.01069246, + "balance_loss_clip": 1.05667496, + "balance_loss_mlp": 1.04753852, + "epoch": 0.1037126108522471, + "flos": 47553555335040.0, + "grad_norm": 2.6433978354033543, + "language_loss": 0.74533165, + "learning_rate": 3.943273412987676e-06, + "loss": 0.76754242, + "num_input_tokens_seen": 37286660, + "step": 1725, + "time_per_iteration": 4.557274580001831 + }, + { + "auxiliary_loss_clip": 0.01137065, + "auxiliary_loss_mlp": 0.01081067, + "balance_loss_clip": 1.05264461, + "balance_loss_mlp": 1.05832207, + "epoch": 0.10377273410491508, + "flos": 22816670572800.0, + "grad_norm": 2.2241153649877865, + "language_loss": 0.75043738, + "learning_rate": 3.943181276805054e-06, + "loss": 0.77261865, + "num_input_tokens_seen": 37304915, + "step": 1726, + "time_per_iteration": 2.7098495960235596 + }, + { + "auxiliary_loss_clip": 0.01150932, + "auxiliary_loss_mlp": 0.0107864, + "balance_loss_clip": 1.05345368, + "balance_loss_mlp": 1.05610991, + "epoch": 0.10383285735758305, + "flos": 26138694890880.0, + "grad_norm": 2.783771441956431, + "language_loss": 0.73243797, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.75473368, + "num_input_tokens_seen": 37325265, + "step": 1727, + "time_per_iteration": 2.74774169921875 + }, + { + "auxiliary_loss_clip": 0.01157922, + "auxiliary_loss_mlp": 0.01068007, + "balance_loss_clip": 1.05303776, + "balance_loss_mlp": 1.04625082, + "epoch": 0.10389298061025101, + "flos": 17091791994240.0, + "grad_norm": 2.172978726198527, + "language_loss": 0.84373868, + "learning_rate": 3.942996783386422e-06, + "loss": 0.86599791, + "num_input_tokens_seen": 37341650, + "step": 1728, + "time_per_iteration": 2.675724744796753 + }, + { + "auxiliary_loss_clip": 0.01154897, + "auxiliary_loss_mlp": 0.01060505, + "balance_loss_clip": 1.0545603, + "balance_loss_mlp": 1.0393219, + "epoch": 0.10395310386291898, + "flos": 20776513893120.0, + "grad_norm": 2.1406499008555513, + "language_loss": 0.70776087, + "learning_rate": 3.942904426157406e-06, + "loss": 0.7299149, + "num_input_tokens_seen": 37360270, + "step": 1729, + "time_per_iteration": 2.6885008811950684 + }, + { + "auxiliary_loss_clip": 0.01158623, + "auxiliary_loss_mlp": 0.01068311, + "balance_loss_clip": 1.05437422, + "balance_loss_mlp": 1.04520774, + "epoch": 0.10401322711558696, + "flos": 12820540913280.0, + "grad_norm": 2.4133379049648283, + "language_loss": 0.81237471, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.83464402, + "num_input_tokens_seen": 37375225, + "step": 1730, + "time_per_iteration": 2.6659536361694336 + }, + { + "auxiliary_loss_clip": 0.01085856, + "auxiliary_loss_mlp": 0.01063394, + "balance_loss_clip": 1.04733562, + "balance_loss_mlp": 1.04314065, + "epoch": 0.10407335036825492, + "flos": 23184683366400.0, + "grad_norm": 1.6634499611984725, + "language_loss": 0.75829297, + "learning_rate": 3.942719490677489e-06, + "loss": 0.77978551, + "num_input_tokens_seen": 37395165, + "step": 1731, + "time_per_iteration": 3.043125629425049 + }, + { + "auxiliary_loss_clip": 0.01129913, + "auxiliary_loss_mlp": 0.01065783, + "balance_loss_clip": 1.0526607, + "balance_loss_mlp": 1.04604149, + "epoch": 0.10413347362092289, + "flos": 26104184899200.0, + "grad_norm": 1.8280179918091173, + "language_loss": 0.8268069, + "learning_rate": 3.9426269124336e-06, + "loss": 0.84876388, + "num_input_tokens_seen": 37414845, + "step": 1732, + "time_per_iteration": 2.96221661567688 + }, + { + "auxiliary_loss_clip": 0.01141505, + "auxiliary_loss_mlp": 0.01067805, + "balance_loss_clip": 1.05805755, + "balance_loss_mlp": 1.04852867, + "epoch": 0.10419359687359087, + "flos": 12641059630080.0, + "grad_norm": 1.9919813178368582, + "language_loss": 0.83320522, + "learning_rate": 3.942534260525104e-06, + "loss": 0.85529828, + "num_input_tokens_seen": 37432490, + "step": 1733, + "time_per_iteration": 2.7364420890808105 + }, + { + "auxiliary_loss_clip": 0.01153374, + "auxiliary_loss_mlp": 0.0106675, + "balance_loss_clip": 1.05592012, + "balance_loss_mlp": 1.04654372, + "epoch": 0.10425372012625883, + "flos": 12125094716160.0, + "grad_norm": 2.4441875881355597, + "language_loss": 0.76683885, + "learning_rate": 3.942441534955514e-06, + "loss": 0.78904009, + "num_input_tokens_seen": 37449435, + "step": 1734, + "time_per_iteration": 2.669623851776123 + }, + { + "auxiliary_loss_clip": 0.0113597, + "auxiliary_loss_mlp": 0.01052567, + "balance_loss_clip": 1.05042601, + "balance_loss_mlp": 1.03255177, + "epoch": 0.1043138433789268, + "flos": 25337563902720.0, + "grad_norm": 1.6775801166329647, + "language_loss": 0.74826896, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.7701543, + "num_input_tokens_seen": 37469105, + "step": 1735, + "time_per_iteration": 2.8477160930633545 + }, + { + "auxiliary_loss_clip": 0.01167698, + "auxiliary_loss_mlp": 0.01055716, + "balance_loss_clip": 1.05678105, + "balance_loss_mlp": 1.0344727, + "epoch": 0.10437396663159478, + "flos": 29167149352320.0, + "grad_norm": 1.7228393064183538, + "language_loss": 0.78835273, + "learning_rate": 3.94225586284712e-06, + "loss": 0.81058681, + "num_input_tokens_seen": 37490540, + "step": 1736, + "time_per_iteration": 2.690453052520752 + }, + { + "auxiliary_loss_clip": 0.0116734, + "auxiliary_loss_mlp": 0.01064692, + "balance_loss_clip": 1.05800533, + "balance_loss_mlp": 1.04357982, + "epoch": 0.10443408988426274, + "flos": 25080946162560.0, + "grad_norm": 1.8549131823334455, + "language_loss": 0.7058785, + "learning_rate": 3.942162916315356e-06, + "loss": 0.72819883, + "num_input_tokens_seen": 37511905, + "step": 1737, + "time_per_iteration": 2.6296744346618652 + }, + { + "auxiliary_loss_clip": 0.01150138, + "auxiliary_loss_mlp": 0.01059407, + "balance_loss_clip": 1.04806042, + "balance_loss_mlp": 1.03600669, + "epoch": 0.1044942131369307, + "flos": 26759662237440.0, + "grad_norm": 2.415613377802324, + "language_loss": 0.81624997, + "learning_rate": 3.942069896136581e-06, + "loss": 0.83834541, + "num_input_tokens_seen": 37533635, + "step": 1738, + "time_per_iteration": 2.7436723709106445 + }, + { + "auxiliary_loss_clip": 0.01181471, + "auxiliary_loss_mlp": 0.01062035, + "balance_loss_clip": 1.05579174, + "balance_loss_mlp": 1.03950453, + "epoch": 0.10455433638959867, + "flos": 18442571875200.0, + "grad_norm": 2.1004590024567897, + "language_loss": 0.75419426, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.77662933, + "num_input_tokens_seen": 37552035, + "step": 1739, + "time_per_iteration": 2.585538148880005 + }, + { + "auxiliary_loss_clip": 0.01146716, + "auxiliary_loss_mlp": 0.01054893, + "balance_loss_clip": 1.05417264, + "balance_loss_mlp": 1.03348303, + "epoch": 0.10461445964226665, + "flos": 23218977876480.0, + "grad_norm": 1.586314706443492, + "language_loss": 0.77523744, + "learning_rate": 3.941883634852104e-06, + "loss": 0.79725355, + "num_input_tokens_seen": 37571540, + "step": 1740, + "time_per_iteration": 2.8947789669036865 + }, + { + "auxiliary_loss_clip": 0.01152077, + "auxiliary_loss_mlp": 0.01049503, + "balance_loss_clip": 1.05725431, + "balance_loss_mlp": 1.0288676, + "epoch": 0.10467458289493461, + "flos": 24345243797760.0, + "grad_norm": 1.964868695493703, + "language_loss": 0.85976374, + "learning_rate": 3.941790393753467e-06, + "loss": 0.88177955, + "num_input_tokens_seen": 37588265, + "step": 1741, + "time_per_iteration": 2.7706260681152344 + }, + { + "auxiliary_loss_clip": 0.01158134, + "auxiliary_loss_mlp": 0.01056311, + "balance_loss_clip": 1.05614483, + "balance_loss_mlp": 1.03350592, + "epoch": 0.10473470614760258, + "flos": 21287953693440.0, + "grad_norm": 5.197245251055922, + "language_loss": 0.75592613, + "learning_rate": 3.941697079021942e-06, + "loss": 0.77807057, + "num_input_tokens_seen": 37606860, + "step": 1742, + "time_per_iteration": 2.784748077392578 + }, + { + "auxiliary_loss_clip": 0.0113066, + "auxiliary_loss_mlp": 0.01057571, + "balance_loss_clip": 1.05678856, + "balance_loss_mlp": 1.03735304, + "epoch": 0.10479482940027056, + "flos": 21687208341120.0, + "grad_norm": 2.1426857583950416, + "language_loss": 0.87614191, + "learning_rate": 3.94160369066107e-06, + "loss": 0.89802414, + "num_input_tokens_seen": 37625210, + "step": 1743, + "time_per_iteration": 2.819350004196167 + }, + { + "auxiliary_loss_clip": 0.01139959, + "auxiliary_loss_mlp": 0.01048534, + "balance_loss_clip": 1.0552268, + "balance_loss_mlp": 1.0254786, + "epoch": 0.10485495265293852, + "flos": 21573694385280.0, + "grad_norm": 2.060686178474056, + "language_loss": 0.75927812, + "learning_rate": 3.941510228674391e-06, + "loss": 0.7811631, + "num_input_tokens_seen": 37644110, + "step": 1744, + "time_per_iteration": 2.7817211151123047 + }, + { + "auxiliary_loss_clip": 0.01170232, + "auxiliary_loss_mlp": 0.01054483, + "balance_loss_clip": 1.05992889, + "balance_loss_mlp": 1.03442037, + "epoch": 0.10491507590560649, + "flos": 37961923708800.0, + "grad_norm": 1.9689383181633062, + "language_loss": 0.78905094, + "learning_rate": 3.941416693065451e-06, + "loss": 0.81129813, + "num_input_tokens_seen": 37665800, + "step": 1745, + "time_per_iteration": 2.88080096244812 + }, + { + "auxiliary_loss_clip": 0.01180482, + "auxiliary_loss_mlp": 0.01060479, + "balance_loss_clip": 1.05740213, + "balance_loss_mlp": 1.03920031, + "epoch": 0.10497519915827447, + "flos": 26396282298240.0, + "grad_norm": 2.64819141351011, + "language_loss": 0.82568693, + "learning_rate": 3.941323083837794e-06, + "loss": 0.84809649, + "num_input_tokens_seen": 37685095, + "step": 1746, + "time_per_iteration": 2.7068004608154297 + }, + { + "auxiliary_loss_clip": 0.01158367, + "auxiliary_loss_mlp": 0.0105595, + "balance_loss_clip": 1.05737162, + "balance_loss_mlp": 1.03448033, + "epoch": 0.10503532241094243, + "flos": 40662190581120.0, + "grad_norm": 1.6274602877205533, + "language_loss": 0.70573747, + "learning_rate": 3.941229400994971e-06, + "loss": 0.7278806, + "num_input_tokens_seen": 37707445, + "step": 1747, + "time_per_iteration": 2.8689963817596436 + }, + { + "auxiliary_loss_clip": 0.01159389, + "auxiliary_loss_mlp": 0.01056346, + "balance_loss_clip": 1.06035507, + "balance_loss_mlp": 1.03492367, + "epoch": 0.1050954456636104, + "flos": 29789409588480.0, + "grad_norm": 2.386885173400054, + "language_loss": 0.8447504, + "learning_rate": 3.941135644540535e-06, + "loss": 0.86690772, + "num_input_tokens_seen": 37728325, + "step": 1748, + "time_per_iteration": 2.8022749423980713 + }, + { + "auxiliary_loss_clip": 0.01175489, + "auxiliary_loss_mlp": 0.01049407, + "balance_loss_clip": 1.05471563, + "balance_loss_mlp": 1.02701974, + "epoch": 0.10515556891627838, + "flos": 23948754497280.0, + "grad_norm": 1.759895679837136, + "language_loss": 0.71681082, + "learning_rate": 3.941041814478041e-06, + "loss": 0.73905981, + "num_input_tokens_seen": 37748910, + "step": 1749, + "time_per_iteration": 2.6568849086761475 + }, + { + "auxiliary_loss_clip": 0.01158221, + "auxiliary_loss_mlp": 0.01058697, + "balance_loss_clip": 1.05427456, + "balance_loss_mlp": 1.03590393, + "epoch": 0.10521569216894634, + "flos": 18259606972800.0, + "grad_norm": 2.95022560634889, + "language_loss": 0.81510806, + "learning_rate": 3.940947910811047e-06, + "loss": 0.83727717, + "num_input_tokens_seen": 37765745, + "step": 1750, + "time_per_iteration": 2.6282739639282227 + }, + { + "auxiliary_loss_clip": 0.01156475, + "auxiliary_loss_mlp": 0.01062657, + "balance_loss_clip": 1.06022298, + "balance_loss_mlp": 1.03973269, + "epoch": 0.10527581542161431, + "flos": 15630909949440.0, + "grad_norm": 2.2218325288878953, + "language_loss": 0.92364043, + "learning_rate": 3.940853933543114e-06, + "loss": 0.94583178, + "num_input_tokens_seen": 37780520, + "step": 1751, + "time_per_iteration": 2.703376531600952 + }, + { + "auxiliary_loss_clip": 0.01165779, + "auxiliary_loss_mlp": 0.01053304, + "balance_loss_clip": 1.0570029, + "balance_loss_mlp": 1.03171563, + "epoch": 0.10533593867428227, + "flos": 18296559089280.0, + "grad_norm": 2.0356912608722877, + "language_loss": 0.79293752, + "learning_rate": 3.940759882677805e-06, + "loss": 0.81512833, + "num_input_tokens_seen": 37799515, + "step": 1752, + "time_per_iteration": 2.6501150131225586 + }, + { + "auxiliary_loss_clip": 0.01116865, + "auxiliary_loss_mlp": 0.01055489, + "balance_loss_clip": 1.05116987, + "balance_loss_mlp": 1.03264856, + "epoch": 0.10539606192695025, + "flos": 29023219555200.0, + "grad_norm": 2.022904639316529, + "language_loss": 0.75978744, + "learning_rate": 3.940665758218686e-06, + "loss": 0.78151095, + "num_input_tokens_seen": 37818695, + "step": 1753, + "time_per_iteration": 2.871335744857788 + }, + { + "auxiliary_loss_clip": 0.01141721, + "auxiliary_loss_mlp": 0.01057356, + "balance_loss_clip": 1.05547547, + "balance_loss_mlp": 1.03415775, + "epoch": 0.10545618517961822, + "flos": 19969313506560.0, + "grad_norm": 2.0563919939847914, + "language_loss": 0.83969283, + "learning_rate": 3.940571560169328e-06, + "loss": 0.86168355, + "num_input_tokens_seen": 37837860, + "step": 1754, + "time_per_iteration": 2.685591459274292 + }, + { + "auxiliary_loss_clip": 0.01136802, + "auxiliary_loss_mlp": 0.01053577, + "balance_loss_clip": 1.05587101, + "balance_loss_mlp": 1.03034329, + "epoch": 0.10551630843228618, + "flos": 16143427157760.0, + "grad_norm": 2.7567281016961087, + "language_loss": 0.68732727, + "learning_rate": 3.940477288533302e-06, + "loss": 0.70923102, + "num_input_tokens_seen": 37856260, + "step": 1755, + "time_per_iteration": 2.754117727279663 + }, + { + "auxiliary_loss_clip": 0.01161626, + "auxiliary_loss_mlp": 0.010623, + "balance_loss_clip": 1.05367684, + "balance_loss_mlp": 1.040187, + "epoch": 0.10557643168495416, + "flos": 23440115957760.0, + "grad_norm": 2.26658946748733, + "language_loss": 0.76382339, + "learning_rate": 3.940382943314182e-06, + "loss": 0.7860626, + "num_input_tokens_seen": 37876960, + "step": 1756, + "time_per_iteration": 2.686790943145752 + }, + { + "auxiliary_loss_clip": 0.01182062, + "auxiliary_loss_mlp": 0.01062906, + "balance_loss_clip": 1.05688286, + "balance_loss_mlp": 1.04203284, + "epoch": 0.10563655493762213, + "flos": 21799034357760.0, + "grad_norm": 1.5917029795724482, + "language_loss": 0.79926664, + "learning_rate": 3.940288524515547e-06, + "loss": 0.82171631, + "num_input_tokens_seen": 37897070, + "step": 1757, + "time_per_iteration": 2.6543681621551514 + }, + { + "auxiliary_loss_clip": 0.01149304, + "auxiliary_loss_mlp": 0.01057523, + "balance_loss_clip": 1.0524838, + "balance_loss_mlp": 1.03563643, + "epoch": 0.10569667819029009, + "flos": 53800863275520.0, + "grad_norm": 1.6583181970862437, + "language_loss": 0.78714895, + "learning_rate": 3.940194032140976e-06, + "loss": 0.80921721, + "num_input_tokens_seen": 37923635, + "step": 1758, + "time_per_iteration": 3.013157367706299 + }, + { + "auxiliary_loss_clip": 0.01165597, + "auxiliary_loss_mlp": 0.01054919, + "balance_loss_clip": 1.05894113, + "balance_loss_mlp": 1.03347349, + "epoch": 0.10575680144295807, + "flos": 22925515760640.0, + "grad_norm": 1.870482409236857, + "language_loss": 0.91388202, + "learning_rate": 3.940099466194054e-06, + "loss": 0.93608713, + "num_input_tokens_seen": 37942650, + "step": 1759, + "time_per_iteration": 4.1841137409210205 + }, + { + "auxiliary_loss_clip": 0.0115455, + "auxiliary_loss_mlp": 0.01056708, + "balance_loss_clip": 1.05242109, + "balance_loss_mlp": 1.03346229, + "epoch": 0.10581692469562604, + "flos": 14136667148160.0, + "grad_norm": 2.509404173865799, + "language_loss": 0.77406812, + "learning_rate": 3.940004826678365e-06, + "loss": 0.79618067, + "num_input_tokens_seen": 37960660, + "step": 1760, + "time_per_iteration": 4.476959228515625 + }, + { + "auxiliary_loss_clip": 0.01161737, + "auxiliary_loss_mlp": 0.01064522, + "balance_loss_clip": 1.0536418, + "balance_loss_mlp": 1.04053712, + "epoch": 0.105877047948294, + "flos": 25958674903680.0, + "grad_norm": 2.27300461956159, + "language_loss": 0.88896096, + "learning_rate": 3.939910113597498e-06, + "loss": 0.91122353, + "num_input_tokens_seen": 37978625, + "step": 1761, + "time_per_iteration": 2.6907520294189453 + }, + { + "auxiliary_loss_clip": 0.01110571, + "auxiliary_loss_mlp": 0.00782389, + "balance_loss_clip": 1.04964042, + "balance_loss_mlp": 1.00012767, + "epoch": 0.10593717120096197, + "flos": 30664768032000.0, + "grad_norm": 2.010693315376097, + "language_loss": 0.7809304, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.79986, + "num_input_tokens_seen": 38000005, + "step": 1762, + "time_per_iteration": 2.869051456451416 + }, + { + "auxiliary_loss_clip": 0.01053171, + "auxiliary_loss_mlp": 0.0105371, + "balance_loss_clip": 1.02694225, + "balance_loss_mlp": 1.05056334, + "epoch": 0.10599729445362994, + "flos": 66436682497920.0, + "grad_norm": 0.8956567750819878, + "language_loss": 0.60503203, + "learning_rate": 3.939720466754602e-06, + "loss": 0.6261009, + "num_input_tokens_seen": 38066165, + "step": 1763, + "time_per_iteration": 5.049196720123291 + }, + { + "auxiliary_loss_clip": 0.01156865, + "auxiliary_loss_mlp": 0.01048706, + "balance_loss_clip": 1.05424261, + "balance_loss_mlp": 1.02708137, + "epoch": 0.10605741770629791, + "flos": 23948179879680.0, + "grad_norm": 2.0510547250099633, + "language_loss": 0.80232942, + "learning_rate": 3.939625532999763e-06, + "loss": 0.82438517, + "num_input_tokens_seen": 38086150, + "step": 1764, + "time_per_iteration": 4.288762807846069 + }, + { + "auxiliary_loss_clip": 0.01136032, + "auxiliary_loss_mlp": 0.01055975, + "balance_loss_clip": 1.04879069, + "balance_loss_mlp": 1.03218043, + "epoch": 0.10611754095896588, + "flos": 19387524919680.0, + "grad_norm": 1.693202084864273, + "language_loss": 0.801691, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.82361102, + "num_input_tokens_seen": 38104205, + "step": 1765, + "time_per_iteration": 2.931269407272339 + }, + { + "auxiliary_loss_clip": 0.01163261, + "auxiliary_loss_mlp": 0.01058956, + "balance_loss_clip": 1.05457163, + "balance_loss_mlp": 1.0367949, + "epoch": 0.10617766421163385, + "flos": 22237755073920.0, + "grad_norm": 1.7665774264343403, + "language_loss": 0.76864165, + "learning_rate": 3.939435444841306e-06, + "loss": 0.79086387, + "num_input_tokens_seen": 38122005, + "step": 1766, + "time_per_iteration": 2.5976176261901855 + }, + { + "auxiliary_loss_clip": 0.01182495, + "auxiliary_loss_mlp": 0.01059246, + "balance_loss_clip": 1.05923963, + "balance_loss_mlp": 1.03766894, + "epoch": 0.10623778746430182, + "flos": 28404407024640.0, + "grad_norm": 1.6265727447650185, + "language_loss": 0.77311498, + "learning_rate": 3.939340290444895e-06, + "loss": 0.79553241, + "num_input_tokens_seen": 38143365, + "step": 1767, + "time_per_iteration": 2.6356630325317383 + }, + { + "auxiliary_loss_clip": 0.01006515, + "auxiliary_loss_mlp": 0.01018751, + "balance_loss_clip": 1.03004837, + "balance_loss_mlp": 1.0151509, + "epoch": 0.10629791071696978, + "flos": 64234639221120.0, + "grad_norm": 0.9172341423433896, + "language_loss": 0.57889944, + "learning_rate": 3.939245062508506e-06, + "loss": 0.59915209, + "num_input_tokens_seen": 38210035, + "step": 1768, + "time_per_iteration": 3.6866471767425537 + }, + { + "auxiliary_loss_clip": 0.01144481, + "auxiliary_loss_mlp": 0.01047419, + "balance_loss_clip": 1.0546546, + "balance_loss_mlp": 1.02687907, + "epoch": 0.10635803396963776, + "flos": 22747578762240.0, + "grad_norm": 1.4529696494540971, + "language_loss": 0.86711109, + "learning_rate": 3.939149761035749e-06, + "loss": 0.8890301, + "num_input_tokens_seen": 38231230, + "step": 1769, + "time_per_iteration": 3.936905860900879 + }, + { + "auxiliary_loss_clip": 0.01141219, + "auxiliary_loss_mlp": 0.00780338, + "balance_loss_clip": 1.05321527, + "balance_loss_mlp": 1.00008726, + "epoch": 0.10641815722230573, + "flos": 31395586147200.0, + "grad_norm": 1.8275276693890916, + "language_loss": 0.61906171, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.63827729, + "num_input_tokens_seen": 38253890, + "step": 1770, + "time_per_iteration": 2.8926138877868652 + }, + { + "auxiliary_loss_clip": 0.01057689, + "auxiliary_loss_mlp": 0.01010808, + "balance_loss_clip": 1.02007711, + "balance_loss_mlp": 1.00775671, + "epoch": 0.1064782804749737, + "flos": 58552527784320.0, + "grad_norm": 0.9163874753670794, + "language_loss": 0.57049137, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.59117633, + "num_input_tokens_seen": 38304290, + "step": 1771, + "time_per_iteration": 3.0783088207244873 + }, + { + "auxiliary_loss_clip": 0.01146276, + "auxiliary_loss_mlp": 0.01065918, + "balance_loss_clip": 1.05574095, + "balance_loss_mlp": 1.04465103, + "epoch": 0.10653840372764166, + "flos": 23987825516160.0, + "grad_norm": 12.794881398939157, + "language_loss": 0.88265753, + "learning_rate": 3.938863415435429e-06, + "loss": 0.90477949, + "num_input_tokens_seen": 38324725, + "step": 1772, + "time_per_iteration": 2.770202159881592 + }, + { + "auxiliary_loss_clip": 0.0118421, + "auxiliary_loss_mlp": 0.01058161, + "balance_loss_clip": 1.05697048, + "balance_loss_mlp": 1.03497458, + "epoch": 0.10659852698030964, + "flos": 18294655668480.0, + "grad_norm": 2.576940958490313, + "language_loss": 0.76030588, + "learning_rate": 3.93876781985337e-06, + "loss": 0.78272957, + "num_input_tokens_seen": 38340735, + "step": 1773, + "time_per_iteration": 2.6177070140838623 + }, + { + "auxiliary_loss_clip": 0.01122733, + "auxiliary_loss_mlp": 0.01067657, + "balance_loss_clip": 1.04691553, + "balance_loss_mlp": 1.04205084, + "epoch": 0.1066586502329776, + "flos": 32160591031680.0, + "grad_norm": 1.868288871406422, + "language_loss": 0.8330853, + "learning_rate": 3.938672150753041e-06, + "loss": 0.85498923, + "num_input_tokens_seen": 38361315, + "step": 1774, + "time_per_iteration": 2.7396061420440674 + }, + { + "auxiliary_loss_clip": 0.01156305, + "auxiliary_loss_mlp": 0.00780518, + "balance_loss_clip": 1.05627465, + "balance_loss_mlp": 1.00011277, + "epoch": 0.10671877348564557, + "flos": 17785155202560.0, + "grad_norm": 2.73383407032925, + "language_loss": 0.76446521, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.78383344, + "num_input_tokens_seen": 38377425, + "step": 1775, + "time_per_iteration": 2.624208927154541 + }, + { + "auxiliary_loss_clip": 0.01063199, + "auxiliary_loss_mlp": 0.01007654, + "balance_loss_clip": 1.01726675, + "balance_loss_mlp": 1.00443542, + "epoch": 0.10677889673831355, + "flos": 63510177813120.0, + "grad_norm": 0.8200823962511624, + "language_loss": 0.57477289, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.5954814, + "num_input_tokens_seen": 38440275, + "step": 1776, + "time_per_iteration": 3.1782386302948 + }, + { + "auxiliary_loss_clip": 0.01150087, + "auxiliary_loss_mlp": 0.01066244, + "balance_loss_clip": 1.05192852, + "balance_loss_mlp": 1.0407691, + "epoch": 0.10683901999098151, + "flos": 22017694400640.0, + "grad_norm": 1.4232532718517703, + "language_loss": 0.83442962, + "learning_rate": 3.938384702378727e-06, + "loss": 0.85659301, + "num_input_tokens_seen": 38461820, + "step": 1777, + "time_per_iteration": 2.7342305183410645 + }, + { + "auxiliary_loss_clip": 0.01113855, + "auxiliary_loss_mlp": 0.00780712, + "balance_loss_clip": 1.04919302, + "balance_loss_mlp": 1.00015831, + "epoch": 0.10689914324364948, + "flos": 25042952551680.0, + "grad_norm": 1.8326039994575831, + "language_loss": 0.87207437, + "learning_rate": 3.938288739241625e-06, + "loss": 0.89102006, + "num_input_tokens_seen": 38482235, + "step": 1778, + "time_per_iteration": 2.859834671020508 + }, + { + "auxiliary_loss_clip": 0.01152509, + "auxiliary_loss_mlp": 0.00780436, + "balance_loss_clip": 1.06804752, + "balance_loss_mlp": 1.00019765, + "epoch": 0.10695926649631746, + "flos": 16435129507200.0, + "grad_norm": 2.4525249429301823, + "language_loss": 0.84165859, + "learning_rate": 3.938192702604417e-06, + "loss": 0.86098808, + "num_input_tokens_seen": 38500690, + "step": 1779, + "time_per_iteration": 2.81423020362854 + }, + { + "auxiliary_loss_clip": 0.01141718, + "auxiliary_loss_mlp": 0.00779857, + "balance_loss_clip": 1.05215359, + "balance_loss_mlp": 1.0001775, + "epoch": 0.10701938974898542, + "flos": 16979211792000.0, + "grad_norm": 1.9378348403129941, + "language_loss": 0.66915894, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.68837464, + "num_input_tokens_seen": 38518405, + "step": 1780, + "time_per_iteration": 2.616684913635254 + }, + { + "auxiliary_loss_clip": 0.01166288, + "auxiliary_loss_mlp": 0.01054109, + "balance_loss_clip": 1.05843914, + "balance_loss_mlp": 1.03268683, + "epoch": 0.10707951300165339, + "flos": 15888102307200.0, + "grad_norm": 1.9168180254288365, + "language_loss": 0.92058647, + "learning_rate": 3.938000408844265e-06, + "loss": 0.94279045, + "num_input_tokens_seen": 38535060, + "step": 1781, + "time_per_iteration": 2.6167802810668945 + }, + { + "auxiliary_loss_clip": 0.0113109, + "auxiliary_loss_mlp": 0.01064554, + "balance_loss_clip": 1.0531441, + "balance_loss_mlp": 1.04344225, + "epoch": 0.10713963625432135, + "flos": 14247164361600.0, + "grad_norm": 1.8357670097294174, + "language_loss": 0.79336482, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.81532121, + "num_input_tokens_seen": 38552855, + "step": 1782, + "time_per_iteration": 2.7669336795806885 + }, + { + "auxiliary_loss_clip": 0.01158369, + "auxiliary_loss_mlp": 0.01061646, + "balance_loss_clip": 1.05510604, + "balance_loss_mlp": 1.04016423, + "epoch": 0.10719975950698933, + "flos": 16756780821120.0, + "grad_norm": 2.0914095256513945, + "language_loss": 0.79086542, + "learning_rate": 3.937807821127436e-06, + "loss": 0.81306553, + "num_input_tokens_seen": 38570075, + "step": 1783, + "time_per_iteration": 2.6349542140960693 + }, + { + "auxiliary_loss_clip": 0.01164267, + "auxiliary_loss_mlp": 0.01065333, + "balance_loss_clip": 1.0570296, + "balance_loss_mlp": 1.04299295, + "epoch": 0.1072598827596573, + "flos": 22710626645760.0, + "grad_norm": 2.1874612027367806, + "language_loss": 0.86421812, + "learning_rate": 3.937711417044395e-06, + "loss": 0.88651407, + "num_input_tokens_seen": 38587970, + "step": 1784, + "time_per_iteration": 2.8452541828155518 + }, + { + "auxiliary_loss_clip": 0.01153461, + "auxiliary_loss_mlp": 0.01055605, + "balance_loss_clip": 1.05502176, + "balance_loss_mlp": 1.03321707, + "epoch": 0.10732000601232526, + "flos": 23258264376960.0, + "grad_norm": 2.4649130783319553, + "language_loss": 1.01192284, + "learning_rate": 3.937614939483143e-06, + "loss": 1.03401351, + "num_input_tokens_seen": 38605840, + "step": 1785, + "time_per_iteration": 2.690018653869629 + }, + { + "auxiliary_loss_clip": 0.01168517, + "auxiliary_loss_mlp": 0.01060763, + "balance_loss_clip": 1.05854678, + "balance_loss_mlp": 1.03984189, + "epoch": 0.10738012926499324, + "flos": 24207060176640.0, + "grad_norm": 1.397915549237645, + "language_loss": 0.84951413, + "learning_rate": 3.937518388447339e-06, + "loss": 0.87180698, + "num_input_tokens_seen": 38627070, + "step": 1786, + "time_per_iteration": 2.637430191040039 + }, + { + "auxiliary_loss_clip": 0.01183118, + "auxiliary_loss_mlp": 0.01059079, + "balance_loss_clip": 1.05716729, + "balance_loss_mlp": 1.03520155, + "epoch": 0.1074402525176612, + "flos": 20923065383040.0, + "grad_norm": 1.7951357311742837, + "language_loss": 0.78861409, + "learning_rate": 3.937421763940642e-06, + "loss": 0.81103605, + "num_input_tokens_seen": 38645840, + "step": 1787, + "time_per_iteration": 2.54508900642395 + }, + { + "auxiliary_loss_clip": 0.01174896, + "auxiliary_loss_mlp": 0.01047406, + "balance_loss_clip": 1.05971575, + "balance_loss_mlp": 1.02528071, + "epoch": 0.10750037577032917, + "flos": 16946928443520.0, + "grad_norm": 1.8536072321218278, + "language_loss": 0.82307518, + "learning_rate": 3.937325065966719e-06, + "loss": 0.84529817, + "num_input_tokens_seen": 38664770, + "step": 1788, + "time_per_iteration": 2.706247568130493 + }, + { + "auxiliary_loss_clip": 0.01180896, + "auxiliary_loss_mlp": 0.01064682, + "balance_loss_clip": 1.05843878, + "balance_loss_mlp": 1.04427314, + "epoch": 0.10756049902299715, + "flos": 20266546550400.0, + "grad_norm": 2.110245519520894, + "language_loss": 0.77840686, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.80086267, + "num_input_tokens_seen": 38683865, + "step": 1789, + "time_per_iteration": 2.6274654865264893 + }, + { + "auxiliary_loss_clip": 0.01185566, + "auxiliary_loss_mlp": 0.01065099, + "balance_loss_clip": 1.0604099, + "balance_loss_mlp": 1.04049408, + "epoch": 0.10762062227566511, + "flos": 23586523793280.0, + "grad_norm": 2.7248977042722524, + "language_loss": 0.74817526, + "learning_rate": 3.937131449631859e-06, + "loss": 0.77068192, + "num_input_tokens_seen": 38702485, + "step": 1790, + "time_per_iteration": 2.624382972717285 + }, + { + "auxiliary_loss_clip": 0.01178128, + "auxiliary_loss_mlp": 0.00780572, + "balance_loss_clip": 1.06110644, + "balance_loss_mlp": 1.00021124, + "epoch": 0.10768074552833308, + "flos": 24310626065280.0, + "grad_norm": 2.350797373347828, + "language_loss": 0.78764236, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.80722934, + "num_input_tokens_seen": 38722475, + "step": 1791, + "time_per_iteration": 2.696162223815918 + }, + { + "auxiliary_loss_clip": 0.01134133, + "auxiliary_loss_mlp": 0.01065057, + "balance_loss_clip": 1.05280125, + "balance_loss_mlp": 1.04117918, + "epoch": 0.10774086878100106, + "flos": 25299965341440.0, + "grad_norm": 1.5879424734455678, + "language_loss": 0.70638013, + "learning_rate": 3.936937539472126e-06, + "loss": 0.7283721, + "num_input_tokens_seen": 38743285, + "step": 1792, + "time_per_iteration": 2.770874261856079 + }, + { + "auxiliary_loss_clip": 0.01149934, + "auxiliary_loss_mlp": 0.01051019, + "balance_loss_clip": 1.05610943, + "balance_loss_mlp": 1.02764249, + "epoch": 0.10780099203366902, + "flos": 22054035985920.0, + "grad_norm": 1.920104493539276, + "language_loss": 0.76565266, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.78766215, + "num_input_tokens_seen": 38763035, + "step": 1793, + "time_per_iteration": 2.7218761444091797 + }, + { + "auxiliary_loss_clip": 0.01116412, + "auxiliary_loss_mlp": 0.01064574, + "balance_loss_clip": 1.05029237, + "balance_loss_mlp": 1.0414238, + "epoch": 0.10786111528633699, + "flos": 22747471021440.0, + "grad_norm": 1.7475786500241859, + "language_loss": 0.85103315, + "learning_rate": 3.936743335516936e-06, + "loss": 0.87284303, + "num_input_tokens_seen": 38784900, + "step": 1794, + "time_per_iteration": 2.7590620517730713 + }, + { + "auxiliary_loss_clip": 0.01115198, + "auxiliary_loss_mlp": 0.01055294, + "balance_loss_clip": 1.04807687, + "balance_loss_mlp": 1.03146446, + "epoch": 0.10792123853900495, + "flos": 20851064570880.0, + "grad_norm": 2.5236234593460924, + "language_loss": 0.74585378, + "learning_rate": 3.936646123375246e-06, + "loss": 0.76755869, + "num_input_tokens_seen": 38804695, + "step": 1795, + "time_per_iteration": 2.8500585556030273 + }, + { + "auxiliary_loss_clip": 0.01124895, + "auxiliary_loss_mlp": 0.01058294, + "balance_loss_clip": 1.04831553, + "balance_loss_mlp": 1.03479767, + "epoch": 0.10798136179167293, + "flos": 17748705876480.0, + "grad_norm": 2.842374039298248, + "language_loss": 0.81653619, + "learning_rate": 3.936548837795741e-06, + "loss": 0.83836806, + "num_input_tokens_seen": 38822395, + "step": 1796, + "time_per_iteration": 2.7549750804901123 + }, + { + "auxiliary_loss_clip": 0.01140492, + "auxiliary_loss_mlp": 0.01083966, + "balance_loss_clip": 1.05246449, + "balance_loss_mlp": 1.05721593, + "epoch": 0.1080414850443409, + "flos": 13589639948160.0, + "grad_norm": 2.59635455269928, + "language_loss": 0.74233043, + "learning_rate": 3.936451478782111e-06, + "loss": 0.764575, + "num_input_tokens_seen": 38839865, + "step": 1797, + "time_per_iteration": 2.6396753787994385 + }, + { + "auxiliary_loss_clip": 0.01160286, + "auxiliary_loss_mlp": 0.01049954, + "balance_loss_clip": 1.05505061, + "balance_loss_mlp": 1.02874684, + "epoch": 0.10810160829700886, + "flos": 16253421580800.0, + "grad_norm": 2.0852339617015025, + "language_loss": 0.81855786, + "learning_rate": 3.936354046338046e-06, + "loss": 0.84066033, + "num_input_tokens_seen": 38857300, + "step": 1798, + "time_per_iteration": 2.7105324268341064 + }, + { + "auxiliary_loss_clip": 0.01142859, + "auxiliary_loss_mlp": 0.01054502, + "balance_loss_clip": 1.05379176, + "balance_loss_mlp": 1.03117299, + "epoch": 0.10816173154967684, + "flos": 15158002464000.0, + "grad_norm": 2.4443000829323687, + "language_loss": 0.85516405, + "learning_rate": 3.936256540467242e-06, + "loss": 0.87713766, + "num_input_tokens_seen": 38874960, + "step": 1799, + "time_per_iteration": 4.159978628158569 + }, + { + "auxiliary_loss_clip": 0.01154352, + "auxiliary_loss_mlp": 0.01062903, + "balance_loss_clip": 1.05493283, + "balance_loss_mlp": 1.04114687, + "epoch": 0.10822185480234481, + "flos": 17785334770560.0, + "grad_norm": 2.7405734706827825, + "language_loss": 0.77434146, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.79651403, + "num_input_tokens_seen": 38893610, + "step": 1800, + "time_per_iteration": 4.52047872543335 + }, + { + "auxiliary_loss_clip": 0.01178634, + "auxiliary_loss_mlp": 0.0104758, + "balance_loss_clip": 1.05722904, + "balance_loss_mlp": 1.02689719, + "epoch": 0.10828197805501277, + "flos": 25556654908800.0, + "grad_norm": 1.582468034859118, + "language_loss": 0.72897375, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.75123584, + "num_input_tokens_seen": 38913485, + "step": 1801, + "time_per_iteration": 4.291400909423828 + }, + { + "auxiliary_loss_clip": 0.01190595, + "auxiliary_loss_mlp": 0.01056056, + "balance_loss_clip": 1.06095624, + "balance_loss_mlp": 1.03478956, + "epoch": 0.10834210130768075, + "flos": 28984435845120.0, + "grad_norm": 1.951139287607183, + "language_loss": 0.6634692, + "learning_rate": 3.935963582331381e-06, + "loss": 0.68593562, + "num_input_tokens_seen": 38935650, + "step": 1802, + "time_per_iteration": 2.722628355026245 + }, + { + "auxiliary_loss_clip": 0.01155661, + "auxiliary_loss_mlp": 0.01059375, + "balance_loss_clip": 1.05326533, + "balance_loss_mlp": 1.03695142, + "epoch": 0.10840222456034872, + "flos": 20264212166400.0, + "grad_norm": 2.084551157592464, + "language_loss": 0.81612957, + "learning_rate": 3.935865782790621e-06, + "loss": 0.8382799, + "num_input_tokens_seen": 38954130, + "step": 1803, + "time_per_iteration": 4.239379167556763 + }, + { + "auxiliary_loss_clip": 0.01163104, + "auxiliary_loss_mlp": 0.01061781, + "balance_loss_clip": 1.0567112, + "balance_loss_mlp": 1.03921473, + "epoch": 0.10846234781301668, + "flos": 19863054097920.0, + "grad_norm": 1.9102934552723363, + "language_loss": 0.91127038, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.93351918, + "num_input_tokens_seen": 38972905, + "step": 1804, + "time_per_iteration": 2.5836737155914307 + }, + { + "auxiliary_loss_clip": 0.01136188, + "auxiliary_loss_mlp": 0.01060133, + "balance_loss_clip": 1.05617714, + "balance_loss_mlp": 1.03718543, + "epoch": 0.10852247106568465, + "flos": 26469037296000.0, + "grad_norm": 2.5742522317806262, + "language_loss": 0.76198906, + "learning_rate": 3.935669963488139e-06, + "loss": 0.78395224, + "num_input_tokens_seen": 38993255, + "step": 1805, + "time_per_iteration": 2.783137321472168 + }, + { + "auxiliary_loss_clip": 0.01149468, + "auxiliary_loss_mlp": 0.01050946, + "balance_loss_clip": 1.05419612, + "balance_loss_mlp": 1.03050184, + "epoch": 0.10858259431835263, + "flos": 30081506987520.0, + "grad_norm": 1.7049574807827799, + "language_loss": 0.85876733, + "learning_rate": 3.935571943733843e-06, + "loss": 0.88077152, + "num_input_tokens_seen": 39012610, + "step": 1806, + "time_per_iteration": 2.8148701190948486 + }, + { + "auxiliary_loss_clip": 0.01168733, + "auxiliary_loss_mlp": 0.00779888, + "balance_loss_clip": 1.05462408, + "balance_loss_mlp": 1.00006652, + "epoch": 0.10864271757102059, + "flos": 19063180085760.0, + "grad_norm": 2.554050049117878, + "language_loss": 0.8108198, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.83030605, + "num_input_tokens_seen": 39030120, + "step": 1807, + "time_per_iteration": 2.6275649070739746 + }, + { + "auxiliary_loss_clip": 0.01139085, + "auxiliary_loss_mlp": 0.01055438, + "balance_loss_clip": 1.05193985, + "balance_loss_mlp": 1.03522038, + "epoch": 0.10870284082368856, + "flos": 24715052271360.0, + "grad_norm": 1.834914777588586, + "language_loss": 0.78910971, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.81105494, + "num_input_tokens_seen": 39049875, + "step": 1808, + "time_per_iteration": 2.722910165786743 + }, + { + "auxiliary_loss_clip": 0.01157997, + "auxiliary_loss_mlp": 0.01056971, + "balance_loss_clip": 1.05918014, + "balance_loss_mlp": 1.03548992, + "epoch": 0.10876296407635654, + "flos": 20627663932800.0, + "grad_norm": 1.6201371380093192, + "language_loss": 0.79013431, + "learning_rate": 3.935277444103342e-06, + "loss": 0.81228393, + "num_input_tokens_seen": 39068935, + "step": 1809, + "time_per_iteration": 2.7261481285095215 + }, + { + "auxiliary_loss_clip": 0.01180468, + "auxiliary_loss_mlp": 0.01057915, + "balance_loss_clip": 1.0568099, + "balance_loss_mlp": 1.03705359, + "epoch": 0.1088230873290245, + "flos": 21579835610880.0, + "grad_norm": 1.9004896030263678, + "language_loss": 0.85129547, + "learning_rate": 3.935179130783046e-06, + "loss": 0.87367928, + "num_input_tokens_seen": 39087370, + "step": 1810, + "time_per_iteration": 2.672696828842163 + }, + { + "auxiliary_loss_clip": 0.01124301, + "auxiliary_loss_mlp": 0.01057363, + "balance_loss_clip": 1.04580724, + "balance_loss_mlp": 1.0335803, + "epoch": 0.10888321058169247, + "flos": 26469037296000.0, + "grad_norm": 1.5993643379141278, + "language_loss": 0.63822675, + "learning_rate": 3.935080744080564e-06, + "loss": 0.66004336, + "num_input_tokens_seen": 39106635, + "step": 1811, + "time_per_iteration": 2.7731611728668213 + }, + { + "auxiliary_loss_clip": 0.01151891, + "auxiliary_loss_mlp": 0.01050225, + "balance_loss_clip": 1.05335796, + "balance_loss_mlp": 1.02836192, + "epoch": 0.10894333383436045, + "flos": 25848608653440.0, + "grad_norm": 1.9284151803363307, + "language_loss": 0.74238706, + "learning_rate": 3.934982283999626e-06, + "loss": 0.76440823, + "num_input_tokens_seen": 39126335, + "step": 1812, + "time_per_iteration": 2.727743625640869 + }, + { + "auxiliary_loss_clip": 0.01142498, + "auxiliary_loss_mlp": 0.01057826, + "balance_loss_clip": 1.05199611, + "balance_loss_mlp": 1.03546214, + "epoch": 0.10900345708702841, + "flos": 19537093152000.0, + "grad_norm": 1.5783196636767667, + "language_loss": 0.72746086, + "learning_rate": 3.934883750543966e-06, + "loss": 0.74946409, + "num_input_tokens_seen": 39144820, + "step": 1813, + "time_per_iteration": 2.798297166824341 + }, + { + "auxiliary_loss_clip": 0.0113892, + "auxiliary_loss_mlp": 0.01056639, + "balance_loss_clip": 1.0511452, + "balance_loss_mlp": 1.03515792, + "epoch": 0.10906358033969638, + "flos": 23623296341760.0, + "grad_norm": 1.635228619121262, + "language_loss": 0.82981038, + "learning_rate": 3.93478514371732e-06, + "loss": 0.85176599, + "num_input_tokens_seen": 39165945, + "step": 1814, + "time_per_iteration": 2.7120048999786377 + }, + { + "auxiliary_loss_clip": 0.01141958, + "auxiliary_loss_mlp": 0.01058857, + "balance_loss_clip": 1.0537864, + "balance_loss_mlp": 1.03787625, + "epoch": 0.10912370359236434, + "flos": 21214731818880.0, + "grad_norm": 1.9556743991494996, + "language_loss": 0.84310579, + "learning_rate": 3.934686463523429e-06, + "loss": 0.86511397, + "num_input_tokens_seen": 39183520, + "step": 1815, + "time_per_iteration": 2.788870096206665 + }, + { + "auxiliary_loss_clip": 0.01146878, + "auxiliary_loss_mlp": 0.01055141, + "balance_loss_clip": 1.05443966, + "balance_loss_mlp": 1.03182411, + "epoch": 0.10918382684503232, + "flos": 13553190622080.0, + "grad_norm": 2.5374826422013195, + "language_loss": 0.71670222, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.73872244, + "num_input_tokens_seen": 39201190, + "step": 1816, + "time_per_iteration": 2.8424103260040283 + }, + { + "auxiliary_loss_clip": 0.01164173, + "auxiliary_loss_mlp": 0.01064184, + "balance_loss_clip": 1.05216932, + "balance_loss_mlp": 1.04052126, + "epoch": 0.10924395009770028, + "flos": 27964321591680.0, + "grad_norm": 2.016899555923086, + "language_loss": 0.72880268, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.75108624, + "num_input_tokens_seen": 39221210, + "step": 1817, + "time_per_iteration": 2.7320947647094727 + }, + { + "auxiliary_loss_clip": 0.01116915, + "auxiliary_loss_mlp": 0.01057856, + "balance_loss_clip": 1.05173278, + "balance_loss_mlp": 1.03517008, + "epoch": 0.10930407335036825, + "flos": 25593750679680.0, + "grad_norm": 1.5988628345308824, + "language_loss": 0.67275256, + "learning_rate": 3.934389982775706e-06, + "loss": 0.69450033, + "num_input_tokens_seen": 39242025, + "step": 1818, + "time_per_iteration": 2.8700790405273438 + }, + { + "auxiliary_loss_clip": 0.01155804, + "auxiliary_loss_mlp": 0.01065952, + "balance_loss_clip": 1.05673873, + "balance_loss_mlp": 1.04313517, + "epoch": 0.10936419660303623, + "flos": 18406194376320.0, + "grad_norm": 3.593580913512793, + "language_loss": 0.73149616, + "learning_rate": 3.934291009150275e-06, + "loss": 0.75371373, + "num_input_tokens_seen": 39259870, + "step": 1819, + "time_per_iteration": 2.7091007232666016 + }, + { + "auxiliary_loss_clip": 0.01142955, + "auxiliary_loss_mlp": 0.00779155, + "balance_loss_clip": 1.05341268, + "balance_loss_mlp": 1.00027704, + "epoch": 0.1094243198557042, + "flos": 23840052963840.0, + "grad_norm": 4.531598275817935, + "language_loss": 0.73764241, + "learning_rate": 3.934191962176335e-06, + "loss": 0.75686359, + "num_input_tokens_seen": 39278500, + "step": 1820, + "time_per_iteration": 2.6513099670410156 + }, + { + "auxiliary_loss_clip": 0.01179358, + "auxiliary_loss_mlp": 0.01056073, + "balance_loss_clip": 1.05747604, + "balance_loss_mlp": 1.03297031, + "epoch": 0.10948444310837216, + "flos": 14643940970880.0, + "grad_norm": 2.2567103978329337, + "language_loss": 0.82532805, + "learning_rate": 3.934092841857642e-06, + "loss": 0.84768236, + "num_input_tokens_seen": 39294800, + "step": 1821, + "time_per_iteration": 2.5348384380340576 + }, + { + "auxiliary_loss_clip": 0.01148016, + "auxiliary_loss_mlp": 0.01052031, + "balance_loss_clip": 1.05133605, + "balance_loss_mlp": 1.03077567, + "epoch": 0.10954456636104014, + "flos": 27818811596160.0, + "grad_norm": 2.0770330480401578, + "language_loss": 0.76271641, + "learning_rate": 3.933993648197955e-06, + "loss": 0.7847169, + "num_input_tokens_seen": 39314625, + "step": 1822, + "time_per_iteration": 2.730079174041748 + }, + { + "auxiliary_loss_clip": 0.01142446, + "auxiliary_loss_mlp": 0.01049259, + "balance_loss_clip": 1.04849207, + "balance_loss_mlp": 1.02856421, + "epoch": 0.1096046896137081, + "flos": 33620934372480.0, + "grad_norm": 1.734419613996414, + "language_loss": 0.79309607, + "learning_rate": 3.933894381201034e-06, + "loss": 0.81501311, + "num_input_tokens_seen": 39336465, + "step": 1823, + "time_per_iteration": 2.756969928741455 + }, + { + "auxiliary_loss_clip": 0.01148165, + "auxiliary_loss_mlp": 0.01049595, + "balance_loss_clip": 1.05160606, + "balance_loss_mlp": 1.02745807, + "epoch": 0.10966481286637607, + "flos": 26980010219520.0, + "grad_norm": 1.4318009514182364, + "language_loss": 0.79590744, + "learning_rate": 3.933795040870645e-06, + "loss": 0.81788504, + "num_input_tokens_seen": 39357930, + "step": 1824, + "time_per_iteration": 2.798168182373047 + }, + { + "auxiliary_loss_clip": 0.01142146, + "auxiliary_loss_mlp": 0.01055513, + "balance_loss_clip": 1.05104232, + "balance_loss_mlp": 1.03381693, + "epoch": 0.10972493611904403, + "flos": 23036551678080.0, + "grad_norm": 2.127143421089703, + "language_loss": 0.88138539, + "learning_rate": 3.933695627210554e-06, + "loss": 0.90336192, + "num_input_tokens_seen": 39376380, + "step": 1825, + "time_per_iteration": 2.6804513931274414 + }, + { + "auxiliary_loss_clip": 0.01128623, + "auxiliary_loss_mlp": 0.01056127, + "balance_loss_clip": 1.04586983, + "balance_loss_mlp": 1.03439498, + "epoch": 0.10978505937171201, + "flos": 38104632443520.0, + "grad_norm": 1.721192594935189, + "language_loss": 0.76441038, + "learning_rate": 3.933596140224532e-06, + "loss": 0.78625786, + "num_input_tokens_seen": 39399935, + "step": 1826, + "time_per_iteration": 2.8315086364746094 + }, + { + "auxiliary_loss_clip": 0.01063155, + "auxiliary_loss_mlp": 0.01016957, + "balance_loss_clip": 1.02709544, + "balance_loss_mlp": 1.01409554, + "epoch": 0.10984518262437998, + "flos": 59849694616320.0, + "grad_norm": 0.8518463216820418, + "language_loss": 0.54997343, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57077461, + "num_input_tokens_seen": 39460685, + "step": 1827, + "time_per_iteration": 3.1425766944885254 + }, + { + "auxiliary_loss_clip": 0.01072651, + "auxiliary_loss_mlp": 0.01010167, + "balance_loss_clip": 1.02693772, + "balance_loss_mlp": 1.00717473, + "epoch": 0.10990530587704794, + "flos": 66719837410560.0, + "grad_norm": 0.7375455878808789, + "language_loss": 0.55382878, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57465696, + "num_input_tokens_seen": 39524765, + "step": 1828, + "time_per_iteration": 3.168165922164917 + }, + { + "auxiliary_loss_clip": 0.01156998, + "auxiliary_loss_mlp": 0.01059335, + "balance_loss_clip": 1.05407059, + "balance_loss_mlp": 1.03618491, + "epoch": 0.10996542912971592, + "flos": 25447199189760.0, + "grad_norm": 2.250827401167328, + "language_loss": 0.84010404, + "learning_rate": 3.933297239348612e-06, + "loss": 0.86226743, + "num_input_tokens_seen": 39543640, + "step": 1829, + "time_per_iteration": 2.7341628074645996 + }, + { + "auxiliary_loss_clip": 0.01130747, + "auxiliary_loss_mlp": 0.01053464, + "balance_loss_clip": 1.0547024, + "balance_loss_mlp": 1.03036165, + "epoch": 0.11002555238238389, + "flos": 44018186186880.0, + "grad_norm": 2.342204785330024, + "language_loss": 0.88880253, + "learning_rate": 3.933197459096614e-06, + "loss": 0.91064465, + "num_input_tokens_seen": 39567525, + "step": 1830, + "time_per_iteration": 2.9093260765075684 + }, + { + "auxiliary_loss_clip": 0.01049643, + "auxiliary_loss_mlp": 0.01009685, + "balance_loss_clip": 1.02618647, + "balance_loss_mlp": 1.00681162, + "epoch": 0.11008567563505185, + "flos": 54065133590400.0, + "grad_norm": 0.6882192363357665, + "language_loss": 0.55566543, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.57625872, + "num_input_tokens_seen": 39628470, + "step": 1831, + "time_per_iteration": 3.1713974475860596 + }, + { + "auxiliary_loss_clip": 0.01156783, + "auxiliary_loss_mlp": 0.01073931, + "balance_loss_clip": 1.05708003, + "balance_loss_mlp": 1.04965997, + "epoch": 0.11014579888771983, + "flos": 24243150366720.0, + "grad_norm": 2.4937725361201495, + "language_loss": 0.90836191, + "learning_rate": 3.932997678675282e-06, + "loss": 0.93066907, + "num_input_tokens_seen": 39646670, + "step": 1832, + "time_per_iteration": 2.6786489486694336 + }, + { + "auxiliary_loss_clip": 0.0106111, + "auxiliary_loss_mlp": 0.01010664, + "balance_loss_clip": 1.02332854, + "balance_loss_mlp": 1.00769615, + "epoch": 0.1102059221403878, + "flos": 57743965658880.0, + "grad_norm": 0.7154576595208243, + "language_loss": 0.59911001, + "learning_rate": 3.932897678513523e-06, + "loss": 0.61982775, + "num_input_tokens_seen": 39712915, + "step": 1833, + "time_per_iteration": 3.1802401542663574 + }, + { + "auxiliary_loss_clip": 0.01167201, + "auxiliary_loss_mlp": 0.0105502, + "balance_loss_clip": 1.05312014, + "balance_loss_mlp": 1.03285873, + "epoch": 0.11026604539305576, + "flos": 16795923667200.0, + "grad_norm": 2.6772934272606923, + "language_loss": 0.80799395, + "learning_rate": 3.93279760505609e-06, + "loss": 0.83021617, + "num_input_tokens_seen": 39730650, + "step": 1834, + "time_per_iteration": 2.591374635696411 + }, + { + "auxiliary_loss_clip": 0.01141662, + "auxiliary_loss_mlp": 0.01054827, + "balance_loss_clip": 1.05557871, + "balance_loss_mlp": 1.03004324, + "epoch": 0.11032616864572373, + "flos": 23988076911360.0, + "grad_norm": 2.4853906687508247, + "language_loss": 0.89856094, + "learning_rate": 3.932697458306779e-06, + "loss": 0.92052579, + "num_input_tokens_seen": 39751065, + "step": 1835, + "time_per_iteration": 2.742330312728882 + }, + { + "auxiliary_loss_clip": 0.01131787, + "auxiliary_loss_mlp": 0.01063812, + "balance_loss_clip": 1.0524013, + "balance_loss_mlp": 1.03758645, + "epoch": 0.1103862918983917, + "flos": 19683141851520.0, + "grad_norm": 2.2754442269720023, + "language_loss": 0.63256055, + "learning_rate": 3.932597238269386e-06, + "loss": 0.65451658, + "num_input_tokens_seen": 39769245, + "step": 1836, + "time_per_iteration": 2.6935038566589355 + }, + { + "auxiliary_loss_clip": 0.01138919, + "auxiliary_loss_mlp": 0.01061469, + "balance_loss_clip": 1.05021358, + "balance_loss_mlp": 1.03954661, + "epoch": 0.11044641515105967, + "flos": 32160878340480.0, + "grad_norm": 1.6726289784191204, + "language_loss": 0.72792488, + "learning_rate": 3.932496944947711e-06, + "loss": 0.74992871, + "num_input_tokens_seen": 39790830, + "step": 1837, + "time_per_iteration": 2.7790510654449463 + }, + { + "auxiliary_loss_clip": 0.01165472, + "auxiliary_loss_mlp": 0.01057035, + "balance_loss_clip": 1.05463088, + "balance_loss_mlp": 1.03551781, + "epoch": 0.11050653840372764, + "flos": 16689233295360.0, + "grad_norm": 2.027055787194766, + "language_loss": 0.78489268, + "learning_rate": 3.93239657834556e-06, + "loss": 0.8071177, + "num_input_tokens_seen": 39809475, + "step": 1838, + "time_per_iteration": 4.098532438278198 + }, + { + "auxiliary_loss_clip": 0.01154042, + "auxiliary_loss_mlp": 0.01062407, + "balance_loss_clip": 1.05542612, + "balance_loss_mlp": 1.03970969, + "epoch": 0.11056666165639562, + "flos": 21208877902080.0, + "grad_norm": 2.046221888979386, + "language_loss": 0.71451718, + "learning_rate": 3.932296138466736e-06, + "loss": 0.7366817, + "num_input_tokens_seen": 39826355, + "step": 1839, + "time_per_iteration": 4.205714464187622 + }, + { + "auxiliary_loss_clip": 0.01187588, + "auxiliary_loss_mlp": 0.00781104, + "balance_loss_clip": 1.06183171, + "balance_loss_mlp": 1.00018013, + "epoch": 0.11062678490906358, + "flos": 19165488998400.0, + "grad_norm": 2.623062836625425, + "language_loss": 0.79027873, + "learning_rate": 3.93219562531505e-06, + "loss": 0.80996567, + "num_input_tokens_seen": 39845335, + "step": 1840, + "time_per_iteration": 2.6023378372192383 + }, + { + "auxiliary_loss_clip": 0.01156508, + "auxiliary_loss_mlp": 0.01052512, + "balance_loss_clip": 1.05206251, + "balance_loss_mlp": 1.02887261, + "epoch": 0.11068690816173155, + "flos": 24895287740160.0, + "grad_norm": 1.7551987843009527, + "language_loss": 0.88083529, + "learning_rate": 3.932095038894311e-06, + "loss": 0.90292549, + "num_input_tokens_seen": 39865065, + "step": 1841, + "time_per_iteration": 4.3361639976501465 + }, + { + "auxiliary_loss_clip": 0.01130203, + "auxiliary_loss_mlp": 0.01067683, + "balance_loss_clip": 1.05036247, + "balance_loss_mlp": 1.04453301, + "epoch": 0.11074703141439952, + "flos": 16472368932480.0, + "grad_norm": 3.1603067125494126, + "language_loss": 0.90521991, + "learning_rate": 3.931994379208334e-06, + "loss": 0.92719877, + "num_input_tokens_seen": 39882780, + "step": 1842, + "time_per_iteration": 2.7086760997772217 + }, + { + "auxiliary_loss_clip": 0.01152506, + "auxiliary_loss_mlp": 0.01061227, + "balance_loss_clip": 1.05065131, + "balance_loss_mlp": 1.03982854, + "epoch": 0.11080715466706749, + "flos": 19172420323200.0, + "grad_norm": 2.112801816568727, + "language_loss": 0.85845053, + "learning_rate": 3.931893646260937e-06, + "loss": 0.88058788, + "num_input_tokens_seen": 39900295, + "step": 1843, + "time_per_iteration": 4.263117790222168 + }, + { + "auxiliary_loss_clip": 0.01119254, + "auxiliary_loss_mlp": 0.00783076, + "balance_loss_clip": 1.05050898, + "balance_loss_mlp": 1.00012159, + "epoch": 0.11086727791973545, + "flos": 27704687109120.0, + "grad_norm": 1.4511349711086798, + "language_loss": 0.74735641, + "learning_rate": 3.931792840055941e-06, + "loss": 0.76637971, + "num_input_tokens_seen": 39922075, + "step": 1844, + "time_per_iteration": 2.7999000549316406 + }, + { + "auxiliary_loss_clip": 0.01180395, + "auxiliary_loss_mlp": 0.01055824, + "balance_loss_clip": 1.05662274, + "balance_loss_mlp": 1.03238785, + "epoch": 0.11092740117240343, + "flos": 18514967736960.0, + "grad_norm": 2.017286766878137, + "language_loss": 0.7566812, + "learning_rate": 3.931691960597165e-06, + "loss": 0.77904338, + "num_input_tokens_seen": 39940115, + "step": 1845, + "time_per_iteration": 2.5305535793304443 + }, + { + "auxiliary_loss_clip": 0.01153403, + "auxiliary_loss_mlp": 0.01058911, + "balance_loss_clip": 1.05442989, + "balance_loss_mlp": 1.03807366, + "epoch": 0.1109875244250714, + "flos": 20522446018560.0, + "grad_norm": 1.9628359583393364, + "language_loss": 0.75953126, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.78165436, + "num_input_tokens_seen": 39959920, + "step": 1846, + "time_per_iteration": 2.719325542449951 + }, + { + "auxiliary_loss_clip": 0.01173899, + "auxiliary_loss_mlp": 0.01059369, + "balance_loss_clip": 1.05823123, + "balance_loss_mlp": 1.03717244, + "epoch": 0.11104764767773936, + "flos": 14098601710080.0, + "grad_norm": 2.612459533347621, + "language_loss": 0.8620472, + "learning_rate": 3.931489981933584e-06, + "loss": 0.88437986, + "num_input_tokens_seen": 39974755, + "step": 1847, + "time_per_iteration": 2.7705559730529785 + }, + { + "auxiliary_loss_clip": 0.01181158, + "auxiliary_loss_mlp": 0.01055145, + "balance_loss_clip": 1.05562854, + "balance_loss_mlp": 1.0322808, + "epoch": 0.11110777093040733, + "flos": 20594518657920.0, + "grad_norm": 1.8452742714770096, + "language_loss": 0.76981926, + "learning_rate": 3.931388882736438e-06, + "loss": 0.79218227, + "num_input_tokens_seen": 39993355, + "step": 1848, + "time_per_iteration": 2.605933666229248 + }, + { + "auxiliary_loss_clip": 0.01172398, + "auxiliary_loss_mlp": 0.01056349, + "balance_loss_clip": 1.06262445, + "balance_loss_mlp": 1.03455794, + "epoch": 0.11116789418307531, + "flos": 21870065502720.0, + "grad_norm": 1.6943193134392138, + "language_loss": 0.77621841, + "learning_rate": 3.931287710300832e-06, + "loss": 0.7985059, + "num_input_tokens_seen": 40012410, + "step": 1849, + "time_per_iteration": 2.678415536880493 + }, + { + "auxiliary_loss_clip": 0.01138995, + "auxiliary_loss_mlp": 0.00781122, + "balance_loss_clip": 1.05277848, + "balance_loss_mlp": 1.00010324, + "epoch": 0.11122801743574327, + "flos": 15523106256000.0, + "grad_norm": 3.3234972538165066, + "language_loss": 0.72098577, + "learning_rate": 3.931186464630601e-06, + "loss": 0.74018693, + "num_input_tokens_seen": 40029315, + "step": 1850, + "time_per_iteration": 2.7763028144836426 + }, + { + "auxiliary_loss_clip": 0.01170569, + "auxiliary_loss_mlp": 0.01061108, + "balance_loss_clip": 1.05759382, + "balance_loss_mlp": 1.03874469, + "epoch": 0.11128814068841124, + "flos": 14392279307520.0, + "grad_norm": 2.0638339407107873, + "language_loss": 0.81499028, + "learning_rate": 3.931085145729588e-06, + "loss": 0.83730704, + "num_input_tokens_seen": 40045765, + "step": 1851, + "time_per_iteration": 2.688854694366455 + }, + { + "auxiliary_loss_clip": 0.01164692, + "auxiliary_loss_mlp": 0.01061301, + "balance_loss_clip": 1.05789042, + "balance_loss_mlp": 1.04027295, + "epoch": 0.11134826394107922, + "flos": 16653933204480.0, + "grad_norm": 2.365035468310974, + "language_loss": 0.88270009, + "learning_rate": 3.930983753601631e-06, + "loss": 0.90496004, + "num_input_tokens_seen": 40061660, + "step": 1852, + "time_per_iteration": 2.659914493560791 + }, + { + "auxiliary_loss_clip": 0.01166772, + "auxiliary_loss_mlp": 0.01060698, + "balance_loss_clip": 1.05489326, + "balance_loss_mlp": 1.03791702, + "epoch": 0.11140838719374718, + "flos": 16690993061760.0, + "grad_norm": 2.1825610274136054, + "language_loss": 0.72492862, + "learning_rate": 3.930882288250578e-06, + "loss": 0.74720335, + "num_input_tokens_seen": 40080180, + "step": 1853, + "time_per_iteration": 2.7840964794158936 + }, + { + "auxiliary_loss_clip": 0.01069898, + "auxiliary_loss_mlp": 0.01019902, + "balance_loss_clip": 1.02549517, + "balance_loss_mlp": 1.01701725, + "epoch": 0.11146851044641515, + "flos": 60976355587200.0, + "grad_norm": 0.772231443606995, + "language_loss": 0.53664064, + "learning_rate": 3.930780749680273e-06, + "loss": 0.55753863, + "num_input_tokens_seen": 40138910, + "step": 1854, + "time_per_iteration": 3.089354991912842 + }, + { + "auxiliary_loss_clip": 0.01159576, + "auxiliary_loss_mlp": 0.0105585, + "balance_loss_clip": 1.05390525, + "balance_loss_mlp": 1.03184092, + "epoch": 0.11152863369908313, + "flos": 22193835719040.0, + "grad_norm": 1.863523240792578, + "language_loss": 0.8468501, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.86900431, + "num_input_tokens_seen": 40157745, + "step": 1855, + "time_per_iteration": 2.7361156940460205 + }, + { + "auxiliary_loss_clip": 0.01147504, + "auxiliary_loss_mlp": 0.01064479, + "balance_loss_clip": 1.05225825, + "balance_loss_mlp": 1.0424726, + "epoch": 0.11158875695175109, + "flos": 19537524115200.0, + "grad_norm": 2.1217067547931756, + "language_loss": 0.81187081, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.83399057, + "num_input_tokens_seen": 40175375, + "step": 1856, + "time_per_iteration": 2.7158002853393555 + }, + { + "auxiliary_loss_clip": 0.01168288, + "auxiliary_loss_mlp": 0.01052259, + "balance_loss_clip": 1.05843937, + "balance_loss_mlp": 1.02957392, + "epoch": 0.11164888020441906, + "flos": 25442709989760.0, + "grad_norm": 2.0555738298465314, + "language_loss": 0.82761133, + "learning_rate": 3.93047569469238e-06, + "loss": 0.8498168, + "num_input_tokens_seen": 40195715, + "step": 1857, + "time_per_iteration": 2.647184133529663 + }, + { + "auxiliary_loss_clip": 0.01144196, + "auxiliary_loss_mlp": 0.01044915, + "balance_loss_clip": 1.05255508, + "balance_loss_mlp": 1.02395833, + "epoch": 0.11170900345708702, + "flos": 15632741543040.0, + "grad_norm": 2.3199985887988914, + "language_loss": 0.83131742, + "learning_rate": 3.930373863283608e-06, + "loss": 0.85320854, + "num_input_tokens_seen": 40213975, + "step": 1858, + "time_per_iteration": 2.726905107498169 + }, + { + "auxiliary_loss_clip": 0.01134962, + "auxiliary_loss_mlp": 0.01067658, + "balance_loss_clip": 1.04900265, + "balance_loss_mlp": 1.04350638, + "epoch": 0.111769126709755, + "flos": 23039424766080.0, + "grad_norm": 2.0395414997027657, + "language_loss": 0.9133389, + "learning_rate": 3.930271958674866e-06, + "loss": 0.93536508, + "num_input_tokens_seen": 40233905, + "step": 1859, + "time_per_iteration": 3.0006766319274902 + }, + { + "auxiliary_loss_clip": 0.01167289, + "auxiliary_loss_mlp": 0.01049698, + "balance_loss_clip": 1.05445409, + "balance_loss_mlp": 1.02751315, + "epoch": 0.11182924996242297, + "flos": 20850705434880.0, + "grad_norm": 2.048197345879043, + "language_loss": 0.81528586, + "learning_rate": 3.930169980870018e-06, + "loss": 0.83745575, + "num_input_tokens_seen": 40252810, + "step": 1860, + "time_per_iteration": 2.7216553688049316 + }, + { + "auxiliary_loss_clip": 0.01154007, + "auxiliary_loss_mlp": 0.01060885, + "balance_loss_clip": 1.05737674, + "balance_loss_mlp": 1.03920078, + "epoch": 0.11188937321509093, + "flos": 17455315587840.0, + "grad_norm": 2.00330439318394, + "language_loss": 0.75250578, + "learning_rate": 3.930067929872931e-06, + "loss": 0.77465475, + "num_input_tokens_seen": 40272000, + "step": 1861, + "time_per_iteration": 2.6878490447998047 + }, + { + "auxiliary_loss_clip": 0.01177651, + "auxiliary_loss_mlp": 0.01054452, + "balance_loss_clip": 1.0565964, + "balance_loss_mlp": 1.03360212, + "epoch": 0.11194949646775891, + "flos": 24095916518400.0, + "grad_norm": 1.9427039767358767, + "language_loss": 0.88888168, + "learning_rate": 3.929965805687474e-06, + "loss": 0.91120267, + "num_input_tokens_seen": 40290660, + "step": 1862, + "time_per_iteration": 2.615057945251465 + }, + { + "auxiliary_loss_clip": 0.01164251, + "auxiliary_loss_mlp": 0.01062894, + "balance_loss_clip": 1.05994737, + "balance_loss_mlp": 1.04086459, + "epoch": 0.11200961972042688, + "flos": 25153880728320.0, + "grad_norm": 2.2273555113866847, + "language_loss": 0.87719512, + "learning_rate": 3.92986360831752e-06, + "loss": 0.89946657, + "num_input_tokens_seen": 40307820, + "step": 1863, + "time_per_iteration": 2.6778175830841064 + }, + { + "auxiliary_loss_clip": 0.01158667, + "auxiliary_loss_mlp": 0.01055299, + "balance_loss_clip": 1.05455208, + "balance_loss_mlp": 1.03071773, + "epoch": 0.11206974297309484, + "flos": 21288312829440.0, + "grad_norm": 2.8013407816012226, + "language_loss": 0.64245486, + "learning_rate": 3.929761337766945e-06, + "loss": 0.66459453, + "num_input_tokens_seen": 40327430, + "step": 1864, + "time_per_iteration": 2.724076509475708 + }, + { + "auxiliary_loss_clip": 0.01110154, + "auxiliary_loss_mlp": 0.01047933, + "balance_loss_clip": 1.04924703, + "balance_loss_mlp": 1.02672601, + "epoch": 0.11212986622576282, + "flos": 18915982151040.0, + "grad_norm": 2.0303098144917135, + "language_loss": 0.74043733, + "learning_rate": 3.929658994039627e-06, + "loss": 0.7620182, + "num_input_tokens_seen": 40344545, + "step": 1865, + "time_per_iteration": 2.8119356632232666 + }, + { + "auxiliary_loss_clip": 0.01114683, + "auxiliary_loss_mlp": 0.01070203, + "balance_loss_clip": 1.05348182, + "balance_loss_mlp": 1.04483545, + "epoch": 0.11218998947843078, + "flos": 22054754257920.0, + "grad_norm": 2.7389427033573375, + "language_loss": 0.84692436, + "learning_rate": 3.929556577139446e-06, + "loss": 0.86877316, + "num_input_tokens_seen": 40362300, + "step": 1866, + "time_per_iteration": 2.8022067546844482 + }, + { + "auxiliary_loss_clip": 0.01092364, + "auxiliary_loss_mlp": 0.00781014, + "balance_loss_clip": 1.04227424, + "balance_loss_mlp": 1.00006938, + "epoch": 0.11225011273109875, + "flos": 24571697091840.0, + "grad_norm": 1.704208120094955, + "language_loss": 0.8104012, + "learning_rate": 3.929454087070286e-06, + "loss": 0.82913494, + "num_input_tokens_seen": 40384720, + "step": 1867, + "time_per_iteration": 2.915989875793457 + }, + { + "auxiliary_loss_clip": 0.01179505, + "auxiliary_loss_mlp": 0.01060529, + "balance_loss_clip": 1.05720687, + "balance_loss_mlp": 1.03959608, + "epoch": 0.11231023598376672, + "flos": 28438665621120.0, + "grad_norm": 2.0811636681692844, + "language_loss": 0.86840278, + "learning_rate": 3.929351523836035e-06, + "loss": 0.8908031, + "num_input_tokens_seen": 40404000, + "step": 1868, + "time_per_iteration": 2.6855647563934326 + }, + { + "auxiliary_loss_clip": 0.01161412, + "auxiliary_loss_mlp": 0.00779977, + "balance_loss_clip": 1.06005311, + "balance_loss_mlp": 1.00010097, + "epoch": 0.1123703592364347, + "flos": 14426466076800.0, + "grad_norm": 2.1491178409138376, + "language_loss": 0.68308532, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.70249927, + "num_input_tokens_seen": 40418665, + "step": 1869, + "time_per_iteration": 2.7404487133026123 + }, + { + "auxiliary_loss_clip": 0.01133783, + "auxiliary_loss_mlp": 0.01066188, + "balance_loss_clip": 1.04932964, + "balance_loss_mlp": 1.04225063, + "epoch": 0.11243048248910266, + "flos": 22236282616320.0, + "grad_norm": 1.5255545896853626, + "language_loss": 0.76943326, + "learning_rate": 3.929146177887814e-06, + "loss": 0.79143298, + "num_input_tokens_seen": 40437870, + "step": 1870, + "time_per_iteration": 2.809734344482422 + }, + { + "auxiliary_loss_clip": 0.01129358, + "auxiliary_loss_mlp": 0.01056867, + "balance_loss_clip": 1.0509038, + "balance_loss_mlp": 1.03300166, + "epoch": 0.11249060574177062, + "flos": 18584167288320.0, + "grad_norm": 1.8186132867503446, + "language_loss": 0.76056099, + "learning_rate": 3.929043395181631e-06, + "loss": 0.78242326, + "num_input_tokens_seen": 40455570, + "step": 1871, + "time_per_iteration": 2.727161169052124 + }, + { + "auxiliary_loss_clip": 0.01105662, + "auxiliary_loss_mlp": 0.01051114, + "balance_loss_clip": 1.04993379, + "balance_loss_mlp": 1.03026426, + "epoch": 0.1125507289944386, + "flos": 22856567604480.0, + "grad_norm": 1.9425066802508644, + "language_loss": 0.81811988, + "learning_rate": 3.928940539325929e-06, + "loss": 0.83968765, + "num_input_tokens_seen": 40473600, + "step": 1872, + "time_per_iteration": 2.851868152618408 + }, + { + "auxiliary_loss_clip": 0.01179923, + "auxiliary_loss_mlp": 0.01055722, + "balance_loss_clip": 1.05722499, + "balance_loss_mlp": 1.03359652, + "epoch": 0.11261085224710657, + "flos": 19676390094720.0, + "grad_norm": 2.186176467187071, + "language_loss": 0.8361913, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.85854775, + "num_input_tokens_seen": 40490025, + "step": 1873, + "time_per_iteration": 2.6668763160705566 + }, + { + "auxiliary_loss_clip": 0.01144862, + "auxiliary_loss_mlp": 0.01054726, + "balance_loss_clip": 1.0525465, + "balance_loss_mlp": 1.03196871, + "epoch": 0.11267097549977453, + "flos": 26063246373120.0, + "grad_norm": 1.8822875514234196, + "language_loss": 0.92342389, + "learning_rate": 3.928734608181575e-06, + "loss": 0.94541967, + "num_input_tokens_seen": 40511580, + "step": 1874, + "time_per_iteration": 2.700533866882324 + }, + { + "auxiliary_loss_clip": 0.01140327, + "auxiliary_loss_mlp": 0.01056402, + "balance_loss_clip": 1.05100179, + "balance_loss_mlp": 1.03509891, + "epoch": 0.11273109875244251, + "flos": 21068036674560.0, + "grad_norm": 1.6564425098873434, + "language_loss": 0.75359404, + "learning_rate": 3.928631532900729e-06, + "loss": 0.77556133, + "num_input_tokens_seen": 40530155, + "step": 1875, + "time_per_iteration": 2.7642719745635986 + }, + { + "auxiliary_loss_clip": 0.01167091, + "auxiliary_loss_mlp": 0.01055271, + "balance_loss_clip": 1.05893159, + "balance_loss_mlp": 1.0348264, + "epoch": 0.11279122200511048, + "flos": 27088999061760.0, + "grad_norm": 2.12758140825061, + "language_loss": 0.71578634, + "learning_rate": 3.928528384485984e-06, + "loss": 0.73800993, + "num_input_tokens_seen": 40549500, + "step": 1876, + "time_per_iteration": 2.8505096435546875 + }, + { + "auxiliary_loss_clip": 0.01147417, + "auxiliary_loss_mlp": 0.01054094, + "balance_loss_clip": 1.05223966, + "balance_loss_mlp": 1.03200495, + "epoch": 0.11285134525777844, + "flos": 20187901722240.0, + "grad_norm": 1.8103612630164048, + "language_loss": 0.76795971, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.78997481, + "num_input_tokens_seen": 40567475, + "step": 1877, + "time_per_iteration": 2.6972849369049072 + }, + { + "auxiliary_loss_clip": 0.01168106, + "auxiliary_loss_mlp": 0.01063056, + "balance_loss_clip": 1.05518627, + "balance_loss_mlp": 1.04026341, + "epoch": 0.11291146851044641, + "flos": 12458453863680.0, + "grad_norm": 2.1601834607000368, + "language_loss": 0.87843502, + "learning_rate": 3.928321868270436e-06, + "loss": 0.90074658, + "num_input_tokens_seen": 40583280, + "step": 1878, + "time_per_iteration": 5.6992692947387695 + }, + { + "auxiliary_loss_clip": 0.01140682, + "auxiliary_loss_mlp": 0.01054902, + "balance_loss_clip": 1.05420399, + "balance_loss_mlp": 1.03333724, + "epoch": 0.11297159176311439, + "flos": 23842315520640.0, + "grad_norm": 2.151084139284284, + "language_loss": 0.81623232, + "learning_rate": 3.928218500477466e-06, + "loss": 0.83818817, + "num_input_tokens_seen": 40603080, + "step": 1879, + "time_per_iteration": 2.8688366413116455 + }, + { + "auxiliary_loss_clip": 0.01155904, + "auxiliary_loss_mlp": 0.01059079, + "balance_loss_clip": 1.05238748, + "balance_loss_mlp": 1.03609526, + "epoch": 0.11303171501578235, + "flos": 29930538124800.0, + "grad_norm": 1.941623939252122, + "language_loss": 0.70234305, + "learning_rate": 3.928115059566259e-06, + "loss": 0.72449279, + "num_input_tokens_seen": 40623255, + "step": 1880, + "time_per_iteration": 5.567574739456177 + }, + { + "auxiliary_loss_clip": 0.01155691, + "auxiliary_loss_mlp": 0.01052309, + "balance_loss_clip": 1.05585837, + "balance_loss_mlp": 1.0306015, + "epoch": 0.11309183826845032, + "flos": 16180558842240.0, + "grad_norm": 1.6696082535169858, + "language_loss": 0.72690225, + "learning_rate": 3.928011545540734e-06, + "loss": 0.74898225, + "num_input_tokens_seen": 40641570, + "step": 1881, + "time_per_iteration": 2.792428493499756 + }, + { + "auxiliary_loss_clip": 0.011425, + "auxiliary_loss_mlp": 0.00781179, + "balance_loss_clip": 1.05046606, + "balance_loss_mlp": 1.00008667, + "epoch": 0.1131519615211183, + "flos": 12020702814720.0, + "grad_norm": 2.2964043184115783, + "language_loss": 0.74205768, + "learning_rate": 3.927907958404819e-06, + "loss": 0.76129448, + "num_input_tokens_seen": 40658775, + "step": 1882, + "time_per_iteration": 4.414916515350342 + }, + { + "auxiliary_loss_clip": 0.01177281, + "auxiliary_loss_mlp": 0.01054815, + "balance_loss_clip": 1.05680335, + "balance_loss_mlp": 1.03203452, + "epoch": 0.11321208477378626, + "flos": 26250125857920.0, + "grad_norm": 2.4326158086005965, + "language_loss": 0.7923016, + "learning_rate": 3.92780429816244e-06, + "loss": 0.81462252, + "num_input_tokens_seen": 40679555, + "step": 1883, + "time_per_iteration": 2.762615919113159 + }, + { + "auxiliary_loss_clip": 0.01140926, + "auxiliary_loss_mlp": 0.01058465, + "balance_loss_clip": 1.05226314, + "balance_loss_mlp": 1.03520727, + "epoch": 0.11327220802645423, + "flos": 13626376583040.0, + "grad_norm": 2.2898863699254974, + "language_loss": 0.77047318, + "learning_rate": 3.927700564817529e-06, + "loss": 0.79246712, + "num_input_tokens_seen": 40697295, + "step": 1884, + "time_per_iteration": 2.835468292236328 + }, + { + "auxiliary_loss_clip": 0.01074478, + "auxiliary_loss_mlp": 0.01009476, + "balance_loss_clip": 1.03993821, + "balance_loss_mlp": 1.00620937, + "epoch": 0.1133323312791222, + "flos": 57191802814080.0, + "grad_norm": 0.8138652948403053, + "language_loss": 0.55151373, + "learning_rate": 3.927596758374019e-06, + "loss": 0.5723533, + "num_input_tokens_seen": 40758095, + "step": 1885, + "time_per_iteration": 3.179532289505005 + }, + { + "auxiliary_loss_clip": 0.01083888, + "auxiliary_loss_mlp": 0.01050751, + "balance_loss_clip": 1.04415166, + "balance_loss_mlp": 1.02910316, + "epoch": 0.11339245453179017, + "flos": 24351708245760.0, + "grad_norm": 1.9836288003076585, + "language_loss": 0.90384823, + "learning_rate": 3.927492878835848e-06, + "loss": 0.92519462, + "num_input_tokens_seen": 40777140, + "step": 1886, + "time_per_iteration": 3.038928747177124 + }, + { + "auxiliary_loss_clip": 0.01116325, + "auxiliary_loss_mlp": 0.01057697, + "balance_loss_clip": 1.05137897, + "balance_loss_mlp": 1.03634632, + "epoch": 0.11345257778445814, + "flos": 22670693700480.0, + "grad_norm": 2.0132756022974023, + "language_loss": 0.84852886, + "learning_rate": 3.927388926206953e-06, + "loss": 0.87026906, + "num_input_tokens_seen": 40797505, + "step": 1887, + "time_per_iteration": 3.178863048553467 + }, + { + "auxiliary_loss_clip": 0.01136567, + "auxiliary_loss_mlp": 0.01056557, + "balance_loss_clip": 1.05091035, + "balance_loss_mlp": 1.03549314, + "epoch": 0.11351270103712612, + "flos": 20988242611200.0, + "grad_norm": 2.847610033990257, + "language_loss": 0.75826252, + "learning_rate": 3.927284900491277e-06, + "loss": 0.78019381, + "num_input_tokens_seen": 40812970, + "step": 1888, + "time_per_iteration": 2.7349846363067627 + }, + { + "auxiliary_loss_clip": 0.0113463, + "auxiliary_loss_mlp": 0.01062359, + "balance_loss_clip": 1.05614805, + "balance_loss_mlp": 1.03892243, + "epoch": 0.11357282428979408, + "flos": 37347923600640.0, + "grad_norm": 2.0598279187313624, + "language_loss": 0.68104899, + "learning_rate": 3.927180801692764e-06, + "loss": 0.7030189, + "num_input_tokens_seen": 40837745, + "step": 1889, + "time_per_iteration": 3.144444465637207 + }, + { + "auxiliary_loss_clip": 0.01177206, + "auxiliary_loss_mlp": 0.01049162, + "balance_loss_clip": 1.05653095, + "balance_loss_mlp": 1.02694094, + "epoch": 0.11363294754246205, + "flos": 21757018423680.0, + "grad_norm": 1.7896678692754837, + "language_loss": 0.83947051, + "learning_rate": 3.927076629815362e-06, + "loss": 0.86173415, + "num_input_tokens_seen": 40856490, + "step": 1890, + "time_per_iteration": 2.73126482963562 + }, + { + "auxiliary_loss_clip": 0.01145149, + "auxiliary_loss_mlp": 0.01056017, + "balance_loss_clip": 1.05039728, + "balance_loss_mlp": 1.03395164, + "epoch": 0.11369307079513001, + "flos": 22601637803520.0, + "grad_norm": 2.1678723202845256, + "language_loss": 0.64663875, + "learning_rate": 3.926972384863022e-06, + "loss": 0.66865045, + "num_input_tokens_seen": 40874070, + "step": 1891, + "time_per_iteration": 2.7474160194396973 + }, + { + "auxiliary_loss_clip": 0.01145505, + "auxiliary_loss_mlp": 0.01049015, + "balance_loss_clip": 1.05395687, + "balance_loss_mlp": 1.02773631, + "epoch": 0.11375319404779799, + "flos": 21944257044480.0, + "grad_norm": 2.126575023047711, + "language_loss": 0.87889415, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.90083933, + "num_input_tokens_seen": 40892425, + "step": 1892, + "time_per_iteration": 2.795269250869751 + }, + { + "auxiliary_loss_clip": 0.01119535, + "auxiliary_loss_mlp": 0.01079586, + "balance_loss_clip": 1.05541015, + "balance_loss_mlp": 1.05461168, + "epoch": 0.11381331730046595, + "flos": 26395456285440.0, + "grad_norm": 3.1806920305576973, + "language_loss": 0.72902197, + "learning_rate": 3.926763675749339e-06, + "loss": 0.75101316, + "num_input_tokens_seen": 40912190, + "step": 1893, + "time_per_iteration": 2.890289306640625 + }, + { + "auxiliary_loss_clip": 0.01175698, + "auxiliary_loss_mlp": 0.0106591, + "balance_loss_clip": 1.05438137, + "balance_loss_mlp": 1.04290223, + "epoch": 0.11387344055313392, + "flos": 23804716959360.0, + "grad_norm": 1.8842571229841023, + "language_loss": 0.79247093, + "learning_rate": 3.92665921159591e-06, + "loss": 0.81488699, + "num_input_tokens_seen": 40928395, + "step": 1894, + "time_per_iteration": 2.6820743083953857 + }, + { + "auxiliary_loss_clip": 0.01150233, + "auxiliary_loss_mlp": 0.01061956, + "balance_loss_clip": 1.05356526, + "balance_loss_mlp": 1.03944933, + "epoch": 0.1139335638058019, + "flos": 34522865902080.0, + "grad_norm": 3.429237983174195, + "language_loss": 0.79718482, + "learning_rate": 3.926554674383371e-06, + "loss": 0.81930667, + "num_input_tokens_seen": 40946555, + "step": 1895, + "time_per_iteration": 2.829946994781494 + }, + { + "auxiliary_loss_clip": 0.01075529, + "auxiliary_loss_mlp": 0.01018518, + "balance_loss_clip": 1.03062391, + "balance_loss_mlp": 1.0155375, + "epoch": 0.11399368705846986, + "flos": 70587811520640.0, + "grad_norm": 0.8041110638842961, + "language_loss": 0.63357508, + "learning_rate": 3.926450064115686e-06, + "loss": 0.65451556, + "num_input_tokens_seen": 41004910, + "step": 1896, + "time_per_iteration": 3.3087315559387207 + }, + { + "auxiliary_loss_clip": 0.01147265, + "auxiliary_loss_mlp": 0.0106086, + "balance_loss_clip": 1.05560398, + "balance_loss_mlp": 1.03663635, + "epoch": 0.11405381031113783, + "flos": 21324259365120.0, + "grad_norm": 1.5952307342327186, + "language_loss": 0.85055745, + "learning_rate": 3.926345380796821e-06, + "loss": 0.8726387, + "num_input_tokens_seen": 41026385, + "step": 1897, + "time_per_iteration": 2.8522274494171143 + }, + { + "auxiliary_loss_clip": 0.0117836, + "auxiliary_loss_mlp": 0.00780276, + "balance_loss_clip": 1.05591989, + "balance_loss_mlp": 1.0001986, + "epoch": 0.11411393356380581, + "flos": 19719627091200.0, + "grad_norm": 3.3624139627125587, + "language_loss": 0.79675245, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.81633884, + "num_input_tokens_seen": 41045315, + "step": 1898, + "time_per_iteration": 2.760057210922241 + }, + { + "auxiliary_loss_clip": 0.01115338, + "auxiliary_loss_mlp": 0.01064417, + "balance_loss_clip": 1.04594529, + "balance_loss_mlp": 1.03965724, + "epoch": 0.11417405681647377, + "flos": 17530440883200.0, + "grad_norm": 2.0191769665152903, + "language_loss": 0.73251313, + "learning_rate": 3.926135795021435e-06, + "loss": 0.75431061, + "num_input_tokens_seen": 41063390, + "step": 1899, + "time_per_iteration": 2.7363204956054688 + }, + { + "auxiliary_loss_clip": 0.01042449, + "auxiliary_loss_mlp": 0.01003313, + "balance_loss_clip": 1.03643703, + "balance_loss_mlp": 1.0003922, + "epoch": 0.11423418006914174, + "flos": 59674666619520.0, + "grad_norm": 0.9089505356695228, + "language_loss": 0.63434029, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65479791, + "num_input_tokens_seen": 41124180, + "step": 1900, + "time_per_iteration": 3.2045955657958984 + }, + { + "auxiliary_loss_clip": 0.01113626, + "auxiliary_loss_mlp": 0.01066815, + "balance_loss_clip": 1.04929233, + "balance_loss_mlp": 1.04378414, + "epoch": 0.1142943033218097, + "flos": 22963114321920.0, + "grad_norm": 1.577500478750639, + "language_loss": 0.77943742, + "learning_rate": 3.925925917089001e-06, + "loss": 0.80124187, + "num_input_tokens_seen": 41143485, + "step": 1901, + "time_per_iteration": 2.745089530944824 + }, + { + "auxiliary_loss_clip": 0.01171621, + "auxiliary_loss_mlp": 0.01057834, + "balance_loss_clip": 1.05803061, + "balance_loss_mlp": 1.0359118, + "epoch": 0.11435442657447768, + "flos": 18256267008000.0, + "grad_norm": 2.175933638179557, + "language_loss": 0.84158623, + "learning_rate": 3.925820868573839e-06, + "loss": 0.86388075, + "num_input_tokens_seen": 41161695, + "step": 1902, + "time_per_iteration": 2.6433799266815186 + }, + { + "auxiliary_loss_clip": 0.01159941, + "auxiliary_loss_mlp": 0.01056662, + "balance_loss_clip": 1.05280399, + "balance_loss_mlp": 1.03122306, + "epoch": 0.11441454982714565, + "flos": 24061191045120.0, + "grad_norm": 1.7702735053047673, + "language_loss": 0.77720451, + "learning_rate": 3.925715747031356e-06, + "loss": 0.79937053, + "num_input_tokens_seen": 41181715, + "step": 1903, + "time_per_iteration": 2.6385905742645264 + }, + { + "auxiliary_loss_clip": 0.01145143, + "auxiliary_loss_mlp": 0.0104196, + "balance_loss_clip": 1.05293322, + "balance_loss_mlp": 1.02174175, + "epoch": 0.11447467307981361, + "flos": 25337707557120.0, + "grad_norm": 2.212790565732917, + "language_loss": 0.75751555, + "learning_rate": 3.925610552465539e-06, + "loss": 0.77938658, + "num_input_tokens_seen": 41201770, + "step": 1904, + "time_per_iteration": 2.632152557373047 + }, + { + "auxiliary_loss_clip": 0.01149375, + "auxiliary_loss_mlp": 0.01056532, + "balance_loss_clip": 1.05207586, + "balance_loss_mlp": 1.03279781, + "epoch": 0.11453479633248159, + "flos": 21726063878400.0, + "grad_norm": 2.4422699353972006, + "language_loss": 0.91853034, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.94058943, + "num_input_tokens_seen": 41220590, + "step": 1905, + "time_per_iteration": 2.7421486377716064 + }, + { + "auxiliary_loss_clip": 0.01161686, + "auxiliary_loss_mlp": 0.01050264, + "balance_loss_clip": 1.04978943, + "balance_loss_mlp": 1.02612448, + "epoch": 0.11459491958514956, + "flos": 12969714096000.0, + "grad_norm": 2.5117992419356066, + "language_loss": 0.77484202, + "learning_rate": 3.925399944279861e-06, + "loss": 0.79696143, + "num_input_tokens_seen": 41237250, + "step": 1906, + "time_per_iteration": 2.69333553314209 + }, + { + "auxiliary_loss_clip": 0.0117911, + "auxiliary_loss_mlp": 0.01055129, + "balance_loss_clip": 1.05697322, + "balance_loss_mlp": 1.03222847, + "epoch": 0.11465504283781752, + "flos": 22711273090560.0, + "grad_norm": 2.0720467666322113, + "language_loss": 0.81739306, + "learning_rate": 3.925294530667986e-06, + "loss": 0.83973539, + "num_input_tokens_seen": 41256680, + "step": 1907, + "time_per_iteration": 2.6531317234039307 + }, + { + "auxiliary_loss_clip": 0.0113647, + "auxiliary_loss_mlp": 0.01065473, + "balance_loss_clip": 1.05235374, + "balance_loss_mlp": 1.04227471, + "epoch": 0.1147151660904855, + "flos": 23398387332480.0, + "grad_norm": 2.1769364553121293, + "language_loss": 0.84901214, + "learning_rate": 3.92518904404875e-06, + "loss": 0.87103164, + "num_input_tokens_seen": 41270955, + "step": 1908, + "time_per_iteration": 2.8768258094787598 + }, + { + "auxiliary_loss_clip": 0.01029536, + "auxiliary_loss_mlp": 0.01020856, + "balance_loss_clip": 1.02524137, + "balance_loss_mlp": 1.01694632, + "epoch": 0.11477528934315347, + "flos": 63011843498880.0, + "grad_norm": 0.9197306473097341, + "language_loss": 0.61072773, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63123173, + "num_input_tokens_seen": 41319180, + "step": 1909, + "time_per_iteration": 3.0845727920532227 + }, + { + "auxiliary_loss_clip": 0.01182744, + "auxiliary_loss_mlp": 0.01054075, + "balance_loss_clip": 1.06014562, + "balance_loss_mlp": 1.03219986, + "epoch": 0.11483541259582143, + "flos": 16325601960960.0, + "grad_norm": 7.319166590530674, + "language_loss": 0.79170966, + "learning_rate": 3.924977851804197e-06, + "loss": 0.81407785, + "num_input_tokens_seen": 41337480, + "step": 1910, + "time_per_iteration": 2.708704710006714 + }, + { + "auxiliary_loss_clip": 0.01156489, + "auxiliary_loss_mlp": 0.01052406, + "balance_loss_clip": 1.0580864, + "balance_loss_mlp": 1.03029275, + "epoch": 0.1148955358484894, + "flos": 21580410228480.0, + "grad_norm": 2.117911712245717, + "language_loss": 0.7702589, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.79234779, + "num_input_tokens_seen": 41354650, + "step": 1911, + "time_per_iteration": 2.7597720623016357 + }, + { + "auxiliary_loss_clip": 0.01159986, + "auxiliary_loss_mlp": 0.01054599, + "balance_loss_clip": 1.05726957, + "balance_loss_mlp": 1.03227139, + "epoch": 0.11495565910115738, + "flos": 27673696650240.0, + "grad_norm": 1.677508784227342, + "language_loss": 0.79177421, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.81392002, + "num_input_tokens_seen": 41376935, + "step": 1912, + "time_per_iteration": 2.8143310546875 + }, + { + "auxiliary_loss_clip": 0.01183047, + "auxiliary_loss_mlp": 0.00779659, + "balance_loss_clip": 1.06065917, + "balance_loss_mlp": 1.00014925, + "epoch": 0.11501578235382534, + "flos": 20632368614400.0, + "grad_norm": 2.291252405113977, + "language_loss": 0.77942276, + "learning_rate": 3.924660515982246e-06, + "loss": 0.79904979, + "num_input_tokens_seen": 41396105, + "step": 1913, + "time_per_iteration": 2.696430206298828 + }, + { + "auxiliary_loss_clip": 0.01166892, + "auxiliary_loss_mlp": 0.01052769, + "balance_loss_clip": 1.05442226, + "balance_loss_mlp": 1.02953506, + "epoch": 0.1150759056064933, + "flos": 19829046896640.0, + "grad_norm": 1.8145547055361753, + "language_loss": 0.7003395, + "learning_rate": 3.924554591402939e-06, + "loss": 0.72253609, + "num_input_tokens_seen": 41415600, + "step": 1914, + "time_per_iteration": 2.739251136779785 + }, + { + "auxiliary_loss_clip": 0.01007182, + "auxiliary_loss_mlp": 0.01004682, + "balance_loss_clip": 1.02677619, + "balance_loss_mlp": 1.00191641, + "epoch": 0.11513602885916129, + "flos": 70045776311040.0, + "grad_norm": 0.7558771871458172, + "language_loss": 0.61059874, + "learning_rate": 3.92444859384433e-06, + "loss": 0.6307174, + "num_input_tokens_seen": 41478760, + "step": 1915, + "time_per_iteration": 3.56019926071167 + }, + { + "auxiliary_loss_clip": 0.01166434, + "auxiliary_loss_mlp": 0.01058573, + "balance_loss_clip": 1.05994964, + "balance_loss_mlp": 1.03595936, + "epoch": 0.11519615211182925, + "flos": 15741730385280.0, + "grad_norm": 2.437201506258279, + "language_loss": 0.93116963, + "learning_rate": 3.924342523310436e-06, + "loss": 0.95341969, + "num_input_tokens_seen": 41495720, + "step": 1916, + "time_per_iteration": 3.244772434234619 + }, + { + "auxiliary_loss_clip": 0.01161132, + "auxiliary_loss_mlp": 0.01059827, + "balance_loss_clip": 1.05798697, + "balance_loss_mlp": 1.03470993, + "epoch": 0.11525627536449722, + "flos": 20667632791680.0, + "grad_norm": 1.8909260082350545, + "language_loss": 0.72560197, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.74781156, + "num_input_tokens_seen": 41513585, + "step": 1917, + "time_per_iteration": 4.502236843109131 + }, + { + "auxiliary_loss_clip": 0.01138773, + "auxiliary_loss_mlp": 0.0104964, + "balance_loss_clip": 1.05739903, + "balance_loss_mlp": 1.02700245, + "epoch": 0.1153163986171652, + "flos": 20303283185280.0, + "grad_norm": 9.147356795176979, + "language_loss": 0.74213129, + "learning_rate": 3.92413016333289e-06, + "loss": 0.76401544, + "num_input_tokens_seen": 41533390, + "step": 1918, + "time_per_iteration": 4.344711065292358 + }, + { + "auxiliary_loss_clip": 0.0114898, + "auxiliary_loss_mlp": 0.010469, + "balance_loss_clip": 1.05532503, + "balance_loss_mlp": 1.02450073, + "epoch": 0.11537652186983316, + "flos": 17639321984640.0, + "grad_norm": 3.182152136597976, + "language_loss": 0.86367452, + "learning_rate": 3.92402387389729e-06, + "loss": 0.88563335, + "num_input_tokens_seen": 41551015, + "step": 1919, + "time_per_iteration": 4.540036201477051 + }, + { + "auxiliary_loss_clip": 0.01134044, + "auxiliary_loss_mlp": 0.01067867, + "balance_loss_clip": 1.0496366, + "balance_loss_mlp": 1.04172444, + "epoch": 0.11543664512250112, + "flos": 21069401391360.0, + "grad_norm": 1.93595243799445, + "language_loss": 0.86735415, + "learning_rate": 3.923917511502512e-06, + "loss": 0.8893733, + "num_input_tokens_seen": 41568055, + "step": 1920, + "time_per_iteration": 2.7719242572784424 + }, + { + "auxiliary_loss_clip": 0.011686, + "auxiliary_loss_mlp": 0.010528, + "balance_loss_clip": 1.0593946, + "balance_loss_mlp": 1.0302341, + "epoch": 0.11549676837516909, + "flos": 22747542848640.0, + "grad_norm": 4.512761907267092, + "language_loss": 0.79294932, + "learning_rate": 3.923811076152589e-06, + "loss": 0.81516337, + "num_input_tokens_seen": 41587435, + "step": 1921, + "time_per_iteration": 2.798673629760742 + }, + { + "auxiliary_loss_clip": 0.01174604, + "auxiliary_loss_mlp": 0.01063526, + "balance_loss_clip": 1.05685806, + "balance_loss_mlp": 1.04007721, + "epoch": 0.11555689162783707, + "flos": 19168972617600.0, + "grad_norm": 2.4057040360661484, + "language_loss": 0.78464305, + "learning_rate": 3.923704567851557e-06, + "loss": 0.80702436, + "num_input_tokens_seen": 41604975, + "step": 1922, + "time_per_iteration": 4.352341651916504 + }, + { + "auxiliary_loss_clip": 0.01092284, + "auxiliary_loss_mlp": 0.01064602, + "balance_loss_clip": 1.04645681, + "balance_loss_mlp": 1.04229808, + "epoch": 0.11561701488050503, + "flos": 24572056227840.0, + "grad_norm": 1.8560991769949675, + "language_loss": 0.84293079, + "learning_rate": 3.923597986603456e-06, + "loss": 0.86449969, + "num_input_tokens_seen": 41626155, + "step": 1923, + "time_per_iteration": 3.2956740856170654 + }, + { + "auxiliary_loss_clip": 0.01171957, + "auxiliary_loss_mlp": 0.01056739, + "balance_loss_clip": 1.0600003, + "balance_loss_mlp": 1.03317094, + "epoch": 0.115677138133173, + "flos": 17092546179840.0, + "grad_norm": 1.944851076041885, + "language_loss": 0.80890471, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.83119166, + "num_input_tokens_seen": 41644805, + "step": 1924, + "time_per_iteration": 3.0939247608184814 + }, + { + "auxiliary_loss_clip": 0.01055916, + "auxiliary_loss_mlp": 0.01027131, + "balance_loss_clip": 1.03045607, + "balance_loss_mlp": 1.02436543, + "epoch": 0.11573726138584098, + "flos": 62703875266560.0, + "grad_norm": 0.8171642061509322, + "language_loss": 0.61196578, + "learning_rate": 3.923384605282212e-06, + "loss": 0.63279623, + "num_input_tokens_seen": 41709345, + "step": 1925, + "time_per_iteration": 3.3765265941619873 + }, + { + "auxiliary_loss_clip": 0.01155845, + "auxiliary_loss_mlp": 0.01079328, + "balance_loss_clip": 1.05374098, + "balance_loss_mlp": 1.0549382, + "epoch": 0.11579738463850894, + "flos": 22601135013120.0, + "grad_norm": 1.7772533553430212, + "language_loss": 0.74766397, + "learning_rate": 3.923277805217161e-06, + "loss": 0.77001572, + "num_input_tokens_seen": 41730210, + "step": 1926, + "time_per_iteration": 2.754974126815796 + }, + { + "auxiliary_loss_clip": 0.01116228, + "auxiliary_loss_mlp": 0.00781701, + "balance_loss_clip": 1.04683304, + "balance_loss_mlp": 1.00016665, + "epoch": 0.11585750789117691, + "flos": 21726135705600.0, + "grad_norm": 4.731879086182685, + "language_loss": 0.71978599, + "learning_rate": 3.923170932221222e-06, + "loss": 0.7387653, + "num_input_tokens_seen": 41750270, + "step": 1927, + "time_per_iteration": 2.9454004764556885 + }, + { + "auxiliary_loss_clip": 0.01137955, + "auxiliary_loss_mlp": 0.01058796, + "balance_loss_clip": 1.05250621, + "balance_loss_mlp": 1.03572917, + "epoch": 0.11591763114384489, + "flos": 26287544851200.0, + "grad_norm": 1.5938674022456252, + "language_loss": 0.86854041, + "learning_rate": 3.92306398629845e-06, + "loss": 0.89050794, + "num_input_tokens_seen": 41772975, + "step": 1928, + "time_per_iteration": 2.832750082015991 + }, + { + "auxiliary_loss_clip": 0.01129041, + "auxiliary_loss_mlp": 0.01060836, + "balance_loss_clip": 1.05032003, + "balance_loss_mlp": 1.03706551, + "epoch": 0.11597775439651285, + "flos": 23000461488000.0, + "grad_norm": 1.6639520350020578, + "language_loss": 0.77450585, + "learning_rate": 3.922956967452898e-06, + "loss": 0.79640466, + "num_input_tokens_seen": 41791765, + "step": 1929, + "time_per_iteration": 2.7876811027526855 + }, + { + "auxiliary_loss_clip": 0.01176887, + "auxiliary_loss_mlp": 0.01063611, + "balance_loss_clip": 1.05667901, + "balance_loss_mlp": 1.0424509, + "epoch": 0.11603787764918082, + "flos": 31941715507200.0, + "grad_norm": 1.8085677541874856, + "language_loss": 0.76831949, + "learning_rate": 3.922849875688626e-06, + "loss": 0.79072452, + "num_input_tokens_seen": 41815615, + "step": 1930, + "time_per_iteration": 2.819934844970703 + }, + { + "auxiliary_loss_clip": 0.01145781, + "auxiliary_loss_mlp": 0.01054046, + "balance_loss_clip": 1.05066586, + "balance_loss_mlp": 1.03165817, + "epoch": 0.1160980009018488, + "flos": 22271654534400.0, + "grad_norm": 1.9434791543130712, + "language_loss": 0.72291863, + "learning_rate": 3.922742711009693e-06, + "loss": 0.74491692, + "num_input_tokens_seen": 41834810, + "step": 1931, + "time_per_iteration": 2.8078088760375977 + }, + { + "auxiliary_loss_clip": 0.01146409, + "auxiliary_loss_mlp": 0.01061336, + "balance_loss_clip": 1.05090261, + "balance_loss_mlp": 1.03575325, + "epoch": 0.11615812415451676, + "flos": 22783633038720.0, + "grad_norm": 1.7378937044391531, + "language_loss": 0.8222791, + "learning_rate": 3.922635473420164e-06, + "loss": 0.8443566, + "num_input_tokens_seen": 41854975, + "step": 1932, + "time_per_iteration": 2.7495200634002686 + }, + { + "auxiliary_loss_clip": 0.01030493, + "auxiliary_loss_mlp": 0.01018834, + "balance_loss_clip": 1.02184403, + "balance_loss_mlp": 1.01556778, + "epoch": 0.11621824740718473, + "flos": 67146096107520.0, + "grad_norm": 0.7669378012870447, + "language_loss": 0.61050332, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63099658, + "num_input_tokens_seen": 41911105, + "step": 1933, + "time_per_iteration": 3.256678581237793 + }, + { + "auxiliary_loss_clip": 0.01108577, + "auxiliary_loss_mlp": 0.00780156, + "balance_loss_clip": 1.04764509, + "balance_loss_mlp": 1.00006175, + "epoch": 0.11627837065985269, + "flos": 20375930442240.0, + "grad_norm": 2.830760437639296, + "language_loss": 0.85790741, + "learning_rate": 3.922420779525586e-06, + "loss": 0.8767947, + "num_input_tokens_seen": 41931750, + "step": 1934, + "time_per_iteration": 2.9144253730773926 + }, + { + "auxiliary_loss_clip": 0.01117671, + "auxiliary_loss_mlp": 0.01059839, + "balance_loss_clip": 1.04929256, + "balance_loss_mlp": 1.03453088, + "epoch": 0.11633849391252067, + "flos": 21725812483200.0, + "grad_norm": 2.625764216143105, + "language_loss": 0.66222906, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.68400419, + "num_input_tokens_seen": 41949400, + "step": 1935, + "time_per_iteration": 2.867152452468872 + }, + { + "auxiliary_loss_clip": 0.01183991, + "auxiliary_loss_mlp": 0.01052492, + "balance_loss_clip": 1.05868936, + "balance_loss_mlp": 1.03111792, + "epoch": 0.11639861716518864, + "flos": 18805341283200.0, + "grad_norm": 2.025938843377603, + "language_loss": 0.75678742, + "learning_rate": 3.922205794037456e-06, + "loss": 0.77915227, + "num_input_tokens_seen": 41968100, + "step": 1936, + "time_per_iteration": 2.7282185554504395 + }, + { + "auxiliary_loss_clip": 0.01179718, + "auxiliary_loss_mlp": 0.01049532, + "balance_loss_clip": 1.05632091, + "balance_loss_mlp": 1.02639306, + "epoch": 0.1164587404178566, + "flos": 21214983214080.0, + "grad_norm": 2.0032002399718905, + "language_loss": 0.84086847, + "learning_rate": 3.922098191955998e-06, + "loss": 0.86316097, + "num_input_tokens_seen": 41986375, + "step": 1937, + "time_per_iteration": 2.715386152267456 + }, + { + "auxiliary_loss_clip": 0.01152084, + "auxiliary_loss_mlp": 0.01048961, + "balance_loss_clip": 1.05258632, + "balance_loss_mlp": 1.0268234, + "epoch": 0.11651886367052458, + "flos": 27818632028160.0, + "grad_norm": 3.0485930101216607, + "language_loss": 0.7617709, + "learning_rate": 3.921990516988384e-06, + "loss": 0.78378135, + "num_input_tokens_seen": 42006055, + "step": 1938, + "time_per_iteration": 2.7624804973602295 + }, + { + "auxiliary_loss_clip": 0.01182576, + "auxiliary_loss_mlp": 0.01055104, + "balance_loss_clip": 1.05742419, + "balance_loss_mlp": 1.03250146, + "epoch": 0.11657898692319255, + "flos": 22889569224960.0, + "grad_norm": 1.7682499083089231, + "language_loss": 0.79677606, + "learning_rate": 3.921882769138696e-06, + "loss": 0.81915289, + "num_input_tokens_seen": 42024995, + "step": 1939, + "time_per_iteration": 2.71458101272583 + }, + { + "auxiliary_loss_clip": 0.01148291, + "auxiliary_loss_mlp": 0.01057951, + "balance_loss_clip": 1.05209351, + "balance_loss_mlp": 1.03508627, + "epoch": 0.11663911017586051, + "flos": 24315905364480.0, + "grad_norm": 2.2281245193552475, + "language_loss": 0.85916591, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.88122833, + "num_input_tokens_seen": 42042640, + "step": 1940, + "time_per_iteration": 2.7322728633880615 + }, + { + "auxiliary_loss_clip": 0.01153746, + "auxiliary_loss_mlp": 0.01056301, + "balance_loss_clip": 1.05659437, + "balance_loss_mlp": 1.03548717, + "epoch": 0.11669923342852849, + "flos": 42340152470400.0, + "grad_norm": 1.4952807995381137, + "language_loss": 0.75590646, + "learning_rate": 3.921667054809449e-06, + "loss": 0.77800703, + "num_input_tokens_seen": 42067005, + "step": 1941, + "time_per_iteration": 2.9211390018463135 + }, + { + "auxiliary_loss_clip": 0.01149585, + "auxiliary_loss_mlp": 0.00780203, + "balance_loss_clip": 1.05181897, + "balance_loss_mlp": 1.00006557, + "epoch": 0.11675935668119646, + "flos": 14642288945280.0, + "grad_norm": 2.277225749463833, + "language_loss": 0.88847101, + "learning_rate": 3.921559088338068e-06, + "loss": 0.90776885, + "num_input_tokens_seen": 42082295, + "step": 1942, + "time_per_iteration": 2.7145469188690186 + }, + { + "auxiliary_loss_clip": 0.01165183, + "auxiliary_loss_mlp": 0.01056257, + "balance_loss_clip": 1.05553317, + "balance_loss_mlp": 1.03552663, + "epoch": 0.11681947993386442, + "flos": 35116470063360.0, + "grad_norm": 1.6547450593003057, + "language_loss": 0.67979252, + "learning_rate": 3.921451049000975e-06, + "loss": 0.70200694, + "num_input_tokens_seen": 42105295, + "step": 1943, + "time_per_iteration": 2.789701461791992 + }, + { + "auxiliary_loss_clip": 0.01153022, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.05515063, + "balance_loss_mlp": 1.02591634, + "epoch": 0.11687960318653239, + "flos": 38983259024640.0, + "grad_norm": 1.9817763000300312, + "language_loss": 0.69831288, + "learning_rate": 3.921342936802265e-06, + "loss": 0.72031963, + "num_input_tokens_seen": 42125520, + "step": 1944, + "time_per_iteration": 2.827150583267212 + }, + { + "auxiliary_loss_clip": 0.01155915, + "auxiliary_loss_mlp": 0.01051888, + "balance_loss_clip": 1.05038309, + "balance_loss_mlp": 1.03158641, + "epoch": 0.11693972643920036, + "flos": 25994980575360.0, + "grad_norm": 1.4963028532298175, + "language_loss": 0.82662582, + "learning_rate": 3.921234751746038e-06, + "loss": 0.84870374, + "num_input_tokens_seen": 42146335, + "step": 1945, + "time_per_iteration": 2.7190194129943848 + }, + { + "auxiliary_loss_clip": 0.01137101, + "auxiliary_loss_mlp": 0.01062082, + "balance_loss_clip": 1.04682803, + "balance_loss_mlp": 1.04005265, + "epoch": 0.11699984969186833, + "flos": 27272107618560.0, + "grad_norm": 2.3643045784637735, + "language_loss": 0.76298034, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.78497219, + "num_input_tokens_seen": 42165320, + "step": 1946, + "time_per_iteration": 2.792555093765259 + }, + { + "auxiliary_loss_clip": 0.01134728, + "auxiliary_loss_mlp": 0.01056112, + "balance_loss_clip": 1.0507704, + "balance_loss_mlp": 1.03536999, + "epoch": 0.1170599729445363, + "flos": 15267853232640.0, + "grad_norm": 2.058923240355934, + "language_loss": 0.69014907, + "learning_rate": 3.921018163077448e-06, + "loss": 0.71205747, + "num_input_tokens_seen": 42182955, + "step": 1947, + "time_per_iteration": 2.643807888031006 + }, + { + "auxiliary_loss_clip": 0.01154759, + "auxiliary_loss_mlp": 0.01067767, + "balance_loss_clip": 1.05707347, + "balance_loss_mlp": 1.04604673, + "epoch": 0.11712009619720427, + "flos": 17164439251200.0, + "grad_norm": 2.0690991629011615, + "language_loss": 0.85044622, + "learning_rate": 3.920909759473295e-06, + "loss": 0.87267148, + "num_input_tokens_seen": 42200760, + "step": 1948, + "time_per_iteration": 2.6399292945861816 + }, + { + "auxiliary_loss_clip": 0.01051031, + "auxiliary_loss_mlp": 0.0075782, + "balance_loss_clip": 1.0245688, + "balance_loss_mlp": 0.99997467, + "epoch": 0.11718021944987224, + "flos": 70940991997440.0, + "grad_norm": 0.8206821069070506, + "language_loss": 0.65139282, + "learning_rate": 3.920801283028054e-06, + "loss": 0.66948134, + "num_input_tokens_seen": 42265745, + "step": 1949, + "time_per_iteration": 3.3030900955200195 + }, + { + "auxiliary_loss_clip": 0.01159399, + "auxiliary_loss_mlp": 0.01061163, + "balance_loss_clip": 1.05735683, + "balance_loss_mlp": 1.04054022, + "epoch": 0.1172403427025402, + "flos": 27453456408960.0, + "grad_norm": 1.512876015443777, + "language_loss": 0.71746683, + "learning_rate": 3.920692733745835e-06, + "loss": 0.73967248, + "num_input_tokens_seen": 42286245, + "step": 1950, + "time_per_iteration": 2.739341974258423 + }, + { + "auxiliary_loss_clip": 0.01175731, + "auxiliary_loss_mlp": 0.01061149, + "balance_loss_clip": 1.06152189, + "balance_loss_mlp": 1.03907192, + "epoch": 0.11730046595520818, + "flos": 15668723992320.0, + "grad_norm": 2.1258853115079996, + "language_loss": 0.76671386, + "learning_rate": 3.920584111630755e-06, + "loss": 0.78908259, + "num_input_tokens_seen": 42302710, + "step": 1951, + "time_per_iteration": 2.624788999557495 + }, + { + "auxiliary_loss_clip": 0.01129104, + "auxiliary_loss_mlp": 0.0106562, + "balance_loss_clip": 1.05285251, + "balance_loss_mlp": 1.04435349, + "epoch": 0.11736058920787615, + "flos": 25630164092160.0, + "grad_norm": 1.7264952730121887, + "language_loss": 0.75964963, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.7815969, + "num_input_tokens_seen": 42324115, + "step": 1952, + "time_per_iteration": 2.824826955795288 + }, + { + "auxiliary_loss_clip": 0.01123677, + "auxiliary_loss_mlp": 0.01065929, + "balance_loss_clip": 1.04589534, + "balance_loss_mlp": 1.04451907, + "epoch": 0.11742071246054411, + "flos": 21434289701760.0, + "grad_norm": 2.2111022500713453, + "language_loss": 0.72316217, + "learning_rate": 3.920366648918491e-06, + "loss": 0.74505818, + "num_input_tokens_seen": 42342505, + "step": 1953, + "time_per_iteration": 2.7456531524658203 + }, + { + "auxiliary_loss_clip": 0.01149214, + "auxiliary_loss_mlp": 0.00781136, + "balance_loss_clip": 1.0549686, + "balance_loss_mlp": 1.0000577, + "epoch": 0.11748083571321208, + "flos": 15997845335040.0, + "grad_norm": 2.1208802652878522, + "language_loss": 0.79780388, + "learning_rate": 3.920257808329552e-06, + "loss": 0.81710744, + "num_input_tokens_seen": 42360525, + "step": 1954, + "time_per_iteration": 2.653949737548828 + }, + { + "auxiliary_loss_clip": 0.01112399, + "auxiliary_loss_mlp": 0.01059787, + "balance_loss_clip": 1.04880822, + "balance_loss_mlp": 1.03763783, + "epoch": 0.11754095896588006, + "flos": 16180056051840.0, + "grad_norm": 1.9673692595826442, + "language_loss": 0.8553021, + "learning_rate": 3.920148894924246e-06, + "loss": 0.87702394, + "num_input_tokens_seen": 42377045, + "step": 1955, + "time_per_iteration": 2.7987124919891357 + }, + { + "auxiliary_loss_clip": 0.01163172, + "auxiliary_loss_mlp": 0.00779783, + "balance_loss_clip": 1.05209899, + "balance_loss_mlp": 1.00016606, + "epoch": 0.11760108221854802, + "flos": 13261596013440.0, + "grad_norm": 2.12926288831445, + "language_loss": 0.78105426, + "learning_rate": 3.920039908706701e-06, + "loss": 0.80048382, + "num_input_tokens_seen": 42393960, + "step": 1956, + "time_per_iteration": 2.6247944831848145 + }, + { + "auxiliary_loss_clip": 0.01158287, + "auxiliary_loss_mlp": 0.01058454, + "balance_loss_clip": 1.05559933, + "balance_loss_mlp": 1.03601909, + "epoch": 0.11766120547121599, + "flos": 24498439303680.0, + "grad_norm": 2.264983200322237, + "language_loss": 0.80487299, + "learning_rate": 3.91993084968105e-06, + "loss": 0.82704043, + "num_input_tokens_seen": 42413160, + "step": 1957, + "time_per_iteration": 5.862411260604858 + }, + { + "auxiliary_loss_clip": 0.01168294, + "auxiliary_loss_mlp": 0.0105259, + "balance_loss_clip": 1.05703866, + "balance_loss_mlp": 1.0308696, + "epoch": 0.11772132872388397, + "flos": 17784005967360.0, + "grad_norm": 4.8672025609093215, + "language_loss": 0.77955222, + "learning_rate": 3.919821717851428e-06, + "loss": 0.80176103, + "num_input_tokens_seen": 42432590, + "step": 1958, + "time_per_iteration": 4.4218549728393555 + }, + { + "auxiliary_loss_clip": 0.01149976, + "auxiliary_loss_mlp": 0.0105003, + "balance_loss_clip": 1.05451894, + "balance_loss_mlp": 1.02680755, + "epoch": 0.11778145197655193, + "flos": 13217030213760.0, + "grad_norm": 1.7537692363765556, + "language_loss": 0.77002251, + "learning_rate": 3.919712513221976e-06, + "loss": 0.79202259, + "num_input_tokens_seen": 42450135, + "step": 1959, + "time_per_iteration": 2.674323558807373 + }, + { + "auxiliary_loss_clip": 0.01162585, + "auxiliary_loss_mlp": 0.01057019, + "balance_loss_clip": 1.05857027, + "balance_loss_mlp": 1.03484631, + "epoch": 0.1178415752292199, + "flos": 20230204965120.0, + "grad_norm": 2.2026367524708927, + "language_loss": 0.70078689, + "learning_rate": 3.919603235796832e-06, + "loss": 0.722983, + "num_input_tokens_seen": 42470050, + "step": 1960, + "time_per_iteration": 2.7704508304595947 + }, + { + "auxiliary_loss_clip": 0.01161089, + "auxiliary_loss_mlp": 0.01055224, + "balance_loss_clip": 1.05841374, + "balance_loss_mlp": 1.03228831, + "epoch": 0.11790169848188788, + "flos": 13040134709760.0, + "grad_norm": 2.663996374773888, + "language_loss": 0.81045067, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.83261371, + "num_input_tokens_seen": 42484335, + "step": 1961, + "time_per_iteration": 4.67006778717041 + }, + { + "auxiliary_loss_clip": 0.01163817, + "auxiliary_loss_mlp": 0.00779643, + "balance_loss_clip": 1.05658793, + "balance_loss_mlp": 1.00009537, + "epoch": 0.11796182173455584, + "flos": 22265728790400.0, + "grad_norm": 1.71345119244153, + "language_loss": 0.92273545, + "learning_rate": 3.919384462576049e-06, + "loss": 0.94217002, + "num_input_tokens_seen": 42502720, + "step": 1962, + "time_per_iteration": 2.6559524536132812 + }, + { + "auxiliary_loss_clip": 0.01139826, + "auxiliary_loss_mlp": 0.01058964, + "balance_loss_clip": 1.05222392, + "balance_loss_mlp": 1.03704107, + "epoch": 0.1180219449872238, + "flos": 10635017892480.0, + "grad_norm": 2.157203116008796, + "language_loss": 0.87635934, + "learning_rate": 3.919274966788707e-06, + "loss": 0.8983472, + "num_input_tokens_seen": 42519460, + "step": 1963, + "time_per_iteration": 2.710042715072632 + }, + { + "auxiliary_loss_clip": 0.0115823, + "auxiliary_loss_mlp": 0.00779391, + "balance_loss_clip": 1.05600929, + "balance_loss_mlp": 1.00011134, + "epoch": 0.11808206823989177, + "flos": 20923532259840.0, + "grad_norm": 2.8331529324994333, + "language_loss": 0.83879703, + "learning_rate": 3.919165398222265e-06, + "loss": 0.85817325, + "num_input_tokens_seen": 42539420, + "step": 1964, + "time_per_iteration": 2.734941244125366 + }, + { + "auxiliary_loss_clip": 0.01122529, + "auxiliary_loss_mlp": 0.01069054, + "balance_loss_clip": 1.05171156, + "balance_loss_mlp": 1.04628491, + "epoch": 0.11814219149255975, + "flos": 20777770869120.0, + "grad_norm": 3.9132941826799543, + "language_loss": 0.8313272, + "learning_rate": 3.919055756880879e-06, + "loss": 0.85324299, + "num_input_tokens_seen": 42558225, + "step": 1965, + "time_per_iteration": 2.7427306175231934 + }, + { + "auxiliary_loss_clip": 0.01178673, + "auxiliary_loss_mlp": 0.01053338, + "balance_loss_clip": 1.05815279, + "balance_loss_mlp": 1.03163004, + "epoch": 0.11820231474522772, + "flos": 48759938542080.0, + "grad_norm": 1.6720023918141877, + "language_loss": 0.74227381, + "learning_rate": 3.918946042768707e-06, + "loss": 0.76459396, + "num_input_tokens_seen": 42580790, + "step": 1966, + "time_per_iteration": 2.8265397548675537 + }, + { + "auxiliary_loss_clip": 0.01163407, + "auxiliary_loss_mlp": 0.0106081, + "balance_loss_clip": 1.06309748, + "balance_loss_mlp": 1.03836274, + "epoch": 0.11826243799789568, + "flos": 16690598012160.0, + "grad_norm": 2.5628488285375397, + "language_loss": 0.73137337, + "learning_rate": 3.918836255889908e-06, + "loss": 0.7536155, + "num_input_tokens_seen": 42597355, + "step": 1967, + "time_per_iteration": 2.706193685531616 + }, + { + "auxiliary_loss_clip": 0.01167052, + "auxiliary_loss_mlp": 0.01053471, + "balance_loss_clip": 1.05852592, + "balance_loss_mlp": 1.03141701, + "epoch": 0.11832256125056366, + "flos": 16909868586240.0, + "grad_norm": 5.332816815546028, + "language_loss": 0.8831054, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.90531063, + "num_input_tokens_seen": 42616060, + "step": 1968, + "time_per_iteration": 2.6308343410491943 + }, + { + "auxiliary_loss_clip": 0.01168356, + "auxiliary_loss_mlp": 0.01051817, + "balance_loss_clip": 1.06406927, + "balance_loss_mlp": 1.0294776, + "epoch": 0.11838268450323162, + "flos": 22820405587200.0, + "grad_norm": 2.252087054693662, + "language_loss": 0.67010254, + "learning_rate": 3.918616463849087e-06, + "loss": 0.69230425, + "num_input_tokens_seen": 42636285, + "step": 1969, + "time_per_iteration": 2.662480592727661 + }, + { + "auxiliary_loss_clip": 0.01130071, + "auxiliary_loss_mlp": 0.0106143, + "balance_loss_clip": 1.05177045, + "balance_loss_mlp": 1.03774357, + "epoch": 0.11844280775589959, + "flos": 33545844990720.0, + "grad_norm": 2.153814675458072, + "language_loss": 0.80455101, + "learning_rate": 3.918506458695399e-06, + "loss": 0.82646602, + "num_input_tokens_seen": 42658320, + "step": 1970, + "time_per_iteration": 2.798050880432129 + }, + { + "auxiliary_loss_clip": 0.01060284, + "auxiliary_loss_mlp": 0.01021383, + "balance_loss_clip": 1.02553701, + "balance_loss_mlp": 1.01892686, + "epoch": 0.11850293100856757, + "flos": 66350998604160.0, + "grad_norm": 0.8165911228106061, + "language_loss": 0.66192186, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68273854, + "num_input_tokens_seen": 42721500, + "step": 1971, + "time_per_iteration": 3.167018413543701 + }, + { + "auxiliary_loss_clip": 0.01151504, + "auxiliary_loss_mlp": 0.0105629, + "balance_loss_clip": 1.05294323, + "balance_loss_mlp": 1.03422379, + "epoch": 0.11856305426123553, + "flos": 24681045070080.0, + "grad_norm": 2.1839859106137554, + "language_loss": 0.79782552, + "learning_rate": 3.918286230142327e-06, + "loss": 0.81990343, + "num_input_tokens_seen": 42739825, + "step": 1972, + "time_per_iteration": 2.6908793449401855 + }, + { + "auxiliary_loss_clip": 0.01133219, + "auxiliary_loss_mlp": 0.00778766, + "balance_loss_clip": 1.05341005, + "balance_loss_mlp": 1.00005877, + "epoch": 0.1186231775139035, + "flos": 24280102483200.0, + "grad_norm": 2.0473813607633384, + "language_loss": 0.72843599, + "learning_rate": 3.918176006751292e-06, + "loss": 0.74755585, + "num_input_tokens_seen": 42758695, + "step": 1973, + "time_per_iteration": 2.7801859378814697 + }, + { + "auxiliary_loss_clip": 0.01138022, + "auxiliary_loss_mlp": 0.01049764, + "balance_loss_clip": 1.05580497, + "balance_loss_mlp": 1.02707887, + "epoch": 0.11868330076657148, + "flos": 21757413473280.0, + "grad_norm": 1.6449677647733996, + "language_loss": 0.72019619, + "learning_rate": 3.918065710622832e-06, + "loss": 0.74207413, + "num_input_tokens_seen": 42778510, + "step": 1974, + "time_per_iteration": 2.7337663173675537 + }, + { + "auxiliary_loss_clip": 0.01129602, + "auxiliary_loss_mlp": 0.01043161, + "balance_loss_clip": 1.05265522, + "balance_loss_mlp": 1.02086854, + "epoch": 0.11874342401923944, + "flos": 17193274894080.0, + "grad_norm": 2.017372400194955, + "language_loss": 0.77409399, + "learning_rate": 3.917955341761128e-06, + "loss": 0.79582161, + "num_input_tokens_seen": 42793995, + "step": 1975, + "time_per_iteration": 2.669546604156494 + }, + { + "auxiliary_loss_clip": 0.01131477, + "auxiliary_loss_mlp": 0.01059968, + "balance_loss_clip": 1.05880177, + "balance_loss_mlp": 1.03908277, + "epoch": 0.11880354727190741, + "flos": 15229572312960.0, + "grad_norm": 2.3842578575289, + "language_loss": 0.75110453, + "learning_rate": 3.917844900170364e-06, + "loss": 0.77301902, + "num_input_tokens_seen": 42809000, + "step": 1976, + "time_per_iteration": 2.8439090251922607 + }, + { + "auxiliary_loss_clip": 0.0116819, + "auxiliary_loss_mlp": 0.01049523, + "balance_loss_clip": 1.05999744, + "balance_loss_mlp": 1.02835166, + "epoch": 0.11886367052457537, + "flos": 27309706179840.0, + "grad_norm": 1.8674311015318124, + "language_loss": 0.74877423, + "learning_rate": 3.91773438585473e-06, + "loss": 0.77095133, + "num_input_tokens_seen": 42831585, + "step": 1977, + "time_per_iteration": 2.6747169494628906 + }, + { + "auxiliary_loss_clip": 0.01182095, + "auxiliary_loss_mlp": 0.01059621, + "balance_loss_clip": 1.05954552, + "balance_loss_mlp": 1.03805614, + "epoch": 0.11892379377724335, + "flos": 21798280172160.0, + "grad_norm": 2.1793079873879604, + "language_loss": 0.74207634, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.76449353, + "num_input_tokens_seen": 42848420, + "step": 1978, + "time_per_iteration": 2.631664514541626 + }, + { + "auxiliary_loss_clip": 0.01142323, + "auxiliary_loss_mlp": 0.01050585, + "balance_loss_clip": 1.06037045, + "balance_loss_mlp": 1.0289247, + "epoch": 0.11898391702991132, + "flos": 13991013498240.0, + "grad_norm": 1.7170872786869797, + "language_loss": 0.73256385, + "learning_rate": 3.917513139065616e-06, + "loss": 0.754493, + "num_input_tokens_seen": 42866645, + "step": 1979, + "time_per_iteration": 2.7442541122436523 + }, + { + "auxiliary_loss_clip": 0.01137516, + "auxiliary_loss_mlp": 0.01051378, + "balance_loss_clip": 1.0566175, + "balance_loss_mlp": 1.02968168, + "epoch": 0.11904404028257928, + "flos": 32234567091840.0, + "grad_norm": 1.876224505386343, + "language_loss": 0.98293436, + "learning_rate": 3.917402406600525e-06, + "loss": 1.00482333, + "num_input_tokens_seen": 42888515, + "step": 1980, + "time_per_iteration": 2.787667989730835 + }, + { + "auxiliary_loss_clip": 0.01153629, + "auxiliary_loss_mlp": 0.01053612, + "balance_loss_clip": 1.05595791, + "balance_loss_mlp": 1.03077161, + "epoch": 0.11910416353524726, + "flos": 23586272398080.0, + "grad_norm": 1.7507584506289393, + "language_loss": 0.86265099, + "learning_rate": 3.917291601427342e-06, + "loss": 0.88472342, + "num_input_tokens_seen": 42909035, + "step": 1981, + "time_per_iteration": 2.6680359840393066 + }, + { + "auxiliary_loss_clip": 0.01158736, + "auxiliary_loss_mlp": 0.01064978, + "balance_loss_clip": 1.06144083, + "balance_loss_mlp": 1.04214907, + "epoch": 0.11916428678791523, + "flos": 25333038789120.0, + "grad_norm": 1.8908045276276995, + "language_loss": 0.85375237, + "learning_rate": 3.91718072355027e-06, + "loss": 0.87598956, + "num_input_tokens_seen": 42927555, + "step": 1982, + "time_per_iteration": 2.732797861099243 + }, + { + "auxiliary_loss_clip": 0.01146432, + "auxiliary_loss_mlp": 0.01050259, + "balance_loss_clip": 1.05539966, + "balance_loss_mlp": 1.02843213, + "epoch": 0.11922441004058319, + "flos": 19788431592960.0, + "grad_norm": 2.3856086229742877, + "language_loss": 0.85202634, + "learning_rate": 3.917069772973513e-06, + "loss": 0.87399322, + "num_input_tokens_seen": 42945300, + "step": 1983, + "time_per_iteration": 2.6839804649353027 + }, + { + "auxiliary_loss_clip": 0.01126589, + "auxiliary_loss_mlp": 0.01056051, + "balance_loss_clip": 1.05602145, + "balance_loss_mlp": 1.03399742, + "epoch": 0.11928453329325117, + "flos": 21536347219200.0, + "grad_norm": 3.6641824085676022, + "language_loss": 0.7693429, + "learning_rate": 3.916958749701277e-06, + "loss": 0.79116929, + "num_input_tokens_seen": 42961295, + "step": 1984, + "time_per_iteration": 2.7008767127990723 + }, + { + "auxiliary_loss_clip": 0.01161623, + "auxiliary_loss_mlp": 0.01055251, + "balance_loss_clip": 1.05752373, + "balance_loss_mlp": 1.0334003, + "epoch": 0.11934465654591914, + "flos": 20815010294400.0, + "grad_norm": 1.917528093726237, + "language_loss": 0.83058321, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.85275191, + "num_input_tokens_seen": 42980330, + "step": 1985, + "time_per_iteration": 2.6692728996276855 + }, + { + "auxiliary_loss_clip": 0.01151831, + "auxiliary_loss_mlp": 0.01050086, + "balance_loss_clip": 1.0541923, + "balance_loss_mlp": 1.02835393, + "epoch": 0.1194047797985871, + "flos": 19060486565760.0, + "grad_norm": 1.8732848573733223, + "language_loss": 0.74398553, + "learning_rate": 3.916736485087216e-06, + "loss": 0.76600474, + "num_input_tokens_seen": 42996125, + "step": 1986, + "time_per_iteration": 2.722013473510742 + }, + { + "auxiliary_loss_clip": 0.01146125, + "auxiliary_loss_mlp": 0.01059008, + "balance_loss_clip": 1.05472732, + "balance_loss_mlp": 1.03791952, + "epoch": 0.11946490305125507, + "flos": 27190805184000.0, + "grad_norm": 2.4724436343771083, + "language_loss": 0.72123617, + "learning_rate": 3.916625243753819e-06, + "loss": 0.74328756, + "num_input_tokens_seen": 43014180, + "step": 1987, + "time_per_iteration": 2.814481258392334 + }, + { + "auxiliary_loss_clip": 0.01156854, + "auxiliary_loss_mlp": 0.01054644, + "balance_loss_clip": 1.05747938, + "balance_loss_mlp": 1.03138638, + "epoch": 0.11952502630392305, + "flos": 21140791672320.0, + "grad_norm": 1.9246234449532542, + "language_loss": 0.72007513, + "learning_rate": 3.916513929741799e-06, + "loss": 0.74219012, + "num_input_tokens_seen": 43032120, + "step": 1988, + "time_per_iteration": 2.7242019176483154 + }, + { + "auxiliary_loss_clip": 0.0116348, + "auxiliary_loss_mlp": 0.01062102, + "balance_loss_clip": 1.05559146, + "balance_loss_mlp": 1.03913057, + "epoch": 0.11958514955659101, + "flos": 22124241118080.0, + "grad_norm": 1.7561483239324645, + "language_loss": 0.81144297, + "learning_rate": 3.91640254305538e-06, + "loss": 0.83369875, + "num_input_tokens_seen": 43052215, + "step": 1989, + "time_per_iteration": 2.6259546279907227 + }, + { + "auxiliary_loss_clip": 0.01135956, + "auxiliary_loss_mlp": 0.01057689, + "balance_loss_clip": 1.05254042, + "balance_loss_mlp": 1.03325129, + "epoch": 0.11964527280925898, + "flos": 17421452040960.0, + "grad_norm": 2.5516320258539795, + "language_loss": 0.75881672, + "learning_rate": 3.916291083698784e-06, + "loss": 0.7807532, + "num_input_tokens_seen": 43069720, + "step": 1990, + "time_per_iteration": 2.6779251098632812 + }, + { + "auxiliary_loss_clip": 0.0105322, + "auxiliary_loss_mlp": 0.01019112, + "balance_loss_clip": 1.02816892, + "balance_loss_mlp": 1.01647794, + "epoch": 0.11970539606192696, + "flos": 70679741402880.0, + "grad_norm": 0.8628582727639288, + "language_loss": 0.55184531, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57256866, + "num_input_tokens_seen": 43123130, + "step": 1991, + "time_per_iteration": 3.3713693618774414 + }, + { + "auxiliary_loss_clip": 0.01136423, + "auxiliary_loss_mlp": 0.01053959, + "balance_loss_clip": 1.05748868, + "balance_loss_mlp": 1.03326464, + "epoch": 0.11976551931459492, + "flos": 21215019127680.0, + "grad_norm": 2.286300891386994, + "language_loss": 0.78371406, + "learning_rate": 3.916067946991971e-06, + "loss": 0.80561793, + "num_input_tokens_seen": 43140015, + "step": 1992, + "time_per_iteration": 2.6797914505004883 + }, + { + "auxiliary_loss_clip": 0.0117949, + "auxiliary_loss_mlp": 0.01056635, + "balance_loss_clip": 1.05811, + "balance_loss_mlp": 1.03453374, + "epoch": 0.11982564256726289, + "flos": 25989306226560.0, + "grad_norm": 1.8481811043026504, + "language_loss": 0.78911144, + "learning_rate": 3.915956269650216e-06, + "loss": 0.81147265, + "num_input_tokens_seen": 43160105, + "step": 1993, + "time_per_iteration": 2.691301107406616 + }, + { + "auxiliary_loss_clip": 0.01126423, + "auxiliary_loss_mlp": 0.0106217, + "balance_loss_clip": 1.05012226, + "balance_loss_mlp": 1.04081941, + "epoch": 0.11988576581993086, + "flos": 21650866755840.0, + "grad_norm": 1.644866568705103, + "language_loss": 0.82088816, + "learning_rate": 3.915844519655208e-06, + "loss": 0.84277415, + "num_input_tokens_seen": 43179835, + "step": 1994, + "time_per_iteration": 2.772905111312866 + }, + { + "auxiliary_loss_clip": 0.0115068, + "auxiliary_loss_mlp": 0.01063961, + "balance_loss_clip": 1.05523098, + "balance_loss_mlp": 1.0433259, + "epoch": 0.11994588907259883, + "flos": 17857407409920.0, + "grad_norm": 2.0065598513575247, + "language_loss": 0.88392794, + "learning_rate": 3.915732697011183e-06, + "loss": 0.9060744, + "num_input_tokens_seen": 43197210, + "step": 1995, + "time_per_iteration": 4.206532716751099 + }, + { + "auxiliary_loss_clip": 0.01153482, + "auxiliary_loss_mlp": 0.01066415, + "balance_loss_clip": 1.06005812, + "balance_loss_mlp": 1.0441823, + "epoch": 0.1200060123252668, + "flos": 24462744163200.0, + "grad_norm": 1.8775058007239456, + "language_loss": 0.73949909, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.76169801, + "num_input_tokens_seen": 43215050, + "step": 1996, + "time_per_iteration": 2.7263944149017334 + }, + { + "auxiliary_loss_clip": 0.01141484, + "auxiliary_loss_mlp": 0.01060112, + "balance_loss_clip": 1.05754757, + "balance_loss_mlp": 1.03808212, + "epoch": 0.12006613557793476, + "flos": 18732191235840.0, + "grad_norm": 1.976051865072764, + "language_loss": 0.88125587, + "learning_rate": 3.915508833793048e-06, + "loss": 0.90327179, + "num_input_tokens_seen": 43233900, + "step": 1997, + "time_per_iteration": 4.29426383972168 + }, + { + "auxiliary_loss_clip": 0.01165634, + "auxiliary_loss_mlp": 0.00779568, + "balance_loss_clip": 1.05701697, + "balance_loss_mlp": 1.00001049, + "epoch": 0.12012625883060274, + "flos": 22267739952000.0, + "grad_norm": 2.1091392562336018, + "language_loss": 0.79031086, + "learning_rate": 3.915396793227428e-06, + "loss": 0.80976284, + "num_input_tokens_seen": 43252105, + "step": 1998, + "time_per_iteration": 4.330955266952515 + }, + { + "auxiliary_loss_clip": 0.0116661, + "auxiliary_loss_mlp": 0.00779642, + "balance_loss_clip": 1.0576719, + "balance_loss_mlp": 1.00002396, + "epoch": 0.1201863820832707, + "flos": 21758885930880.0, + "grad_norm": 1.799585336659533, + "language_loss": 0.73583078, + "learning_rate": 3.915284680029769e-06, + "loss": 0.75529337, + "num_input_tokens_seen": 43270315, + "step": 1999, + "time_per_iteration": 2.754770040512085 + }, + { + "auxiliary_loss_clip": 0.01178966, + "auxiliary_loss_mlp": 0.01073097, + "balance_loss_clip": 1.0602119, + "balance_loss_mlp": 1.05115068, + "epoch": 0.12024650533593867, + "flos": 21907987286400.0, + "grad_norm": 2.916355473014409, + "language_loss": 0.74854898, + "learning_rate": 3.915172494204323e-06, + "loss": 0.77106953, + "num_input_tokens_seen": 43289935, + "step": 2000, + "time_per_iteration": 4.3900322914123535 + }, + { + "auxiliary_loss_clip": 0.01149374, + "auxiliary_loss_mlp": 0.01069735, + "balance_loss_clip": 1.05375695, + "balance_loss_mlp": 1.04763341, + "epoch": 0.12030662858860665, + "flos": 21689219502720.0, + "grad_norm": 1.5203973891597686, + "language_loss": 0.8496564, + "learning_rate": 3.915060235755344e-06, + "loss": 0.87184751, + "num_input_tokens_seen": 43309325, + "step": 2001, + "time_per_iteration": 2.6912643909454346 + }, + { + "auxiliary_loss_clip": 0.01154057, + "auxiliary_loss_mlp": 0.01063637, + "balance_loss_clip": 1.05600786, + "balance_loss_mlp": 1.04265642, + "epoch": 0.12036675184127461, + "flos": 12933228856320.0, + "grad_norm": 2.932264271186656, + "language_loss": 0.74711967, + "learning_rate": 3.91494790468709e-06, + "loss": 0.76929653, + "num_input_tokens_seen": 43327010, + "step": 2002, + "time_per_iteration": 2.6991024017333984 + }, + { + "auxiliary_loss_clip": 0.01129169, + "auxiliary_loss_mlp": 0.01066705, + "balance_loss_clip": 1.05340302, + "balance_loss_mlp": 1.0429939, + "epoch": 0.12042687509394258, + "flos": 20851028657280.0, + "grad_norm": 2.117271428042382, + "language_loss": 0.78029454, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.80225325, + "num_input_tokens_seen": 43345650, + "step": 2003, + "time_per_iteration": 2.731381416320801 + }, + { + "auxiliary_loss_clip": 0.01163252, + "auxiliary_loss_mlp": 0.01062886, + "balance_loss_clip": 1.05728662, + "balance_loss_mlp": 1.04073668, + "epoch": 0.12048699834661056, + "flos": 23878513451520.0, + "grad_norm": 1.585850552088038, + "language_loss": 0.72205627, + "learning_rate": 3.914723024709793e-06, + "loss": 0.74431765, + "num_input_tokens_seen": 43365555, + "step": 2004, + "time_per_iteration": 2.725092649459839 + }, + { + "auxiliary_loss_clip": 0.01160616, + "auxiliary_loss_mlp": 0.01069457, + "balance_loss_clip": 1.05870187, + "balance_loss_mlp": 1.04645014, + "epoch": 0.12054712159927852, + "flos": 19756363726080.0, + "grad_norm": 1.9357732467170252, + "language_loss": 0.78415942, + "learning_rate": 3.914610475809279e-06, + "loss": 0.8064602, + "num_input_tokens_seen": 43384990, + "step": 2005, + "time_per_iteration": 2.7232437133789062 + }, + { + "auxiliary_loss_clip": 0.01073016, + "auxiliary_loss_mlp": 0.00758901, + "balance_loss_clip": 1.02995479, + "balance_loss_mlp": 1.00011683, + "epoch": 0.12060724485194649, + "flos": 51672763123200.0, + "grad_norm": 0.9264315537536937, + "language_loss": 0.58087146, + "learning_rate": 3.914497854306543e-06, + "loss": 0.59919059, + "num_input_tokens_seen": 43436335, + "step": 2006, + "time_per_iteration": 2.9570157527923584 + }, + { + "auxiliary_loss_clip": 0.01155081, + "auxiliary_loss_mlp": 0.01053472, + "balance_loss_clip": 1.05803597, + "balance_loss_mlp": 1.03299201, + "epoch": 0.12066736810461445, + "flos": 18990425088000.0, + "grad_norm": 1.6109316320484448, + "language_loss": 0.76524282, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.78732836, + "num_input_tokens_seen": 43456495, + "step": 2007, + "time_per_iteration": 2.763380289077759 + }, + { + "auxiliary_loss_clip": 0.01147254, + "auxiliary_loss_mlp": 0.01064209, + "balance_loss_clip": 1.05931091, + "balance_loss_mlp": 1.04177368, + "epoch": 0.12072749135728243, + "flos": 16471973882880.0, + "grad_norm": 2.449779851562752, + "language_loss": 0.83023942, + "learning_rate": 3.914272393511494e-06, + "loss": 0.85235405, + "num_input_tokens_seen": 43473085, + "step": 2008, + "time_per_iteration": 2.7693119049072266 + }, + { + "auxiliary_loss_clip": 0.01176157, + "auxiliary_loss_mlp": 0.01052894, + "balance_loss_clip": 1.0584172, + "balance_loss_mlp": 1.03135288, + "epoch": 0.1207876146099504, + "flos": 18077108947200.0, + "grad_norm": 2.203355340521787, + "language_loss": 0.83835697, + "learning_rate": 3.91415955422773e-06, + "loss": 0.86064744, + "num_input_tokens_seen": 43491135, + "step": 2009, + "time_per_iteration": 2.640944242477417 + }, + { + "auxiliary_loss_clip": 0.01180076, + "auxiliary_loss_mlp": 0.01053549, + "balance_loss_clip": 1.06196725, + "balance_loss_mlp": 1.02994514, + "epoch": 0.12084773786261836, + "flos": 21871573873920.0, + "grad_norm": 1.6799099601218046, + "language_loss": 0.83870012, + "learning_rate": 3.914046642358844e-06, + "loss": 0.8610363, + "num_input_tokens_seen": 43510440, + "step": 2010, + "time_per_iteration": 2.716127634048462 + }, + { + "auxiliary_loss_clip": 0.01145261, + "auxiliary_loss_mlp": 0.00780804, + "balance_loss_clip": 1.05555713, + "balance_loss_mlp": 1.0000627, + "epoch": 0.12090786111528634, + "flos": 18333044328960.0, + "grad_norm": 1.8933604390076018, + "language_loss": 0.84194541, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.86120605, + "num_input_tokens_seen": 43530145, + "step": 2011, + "time_per_iteration": 2.73793625831604 + }, + { + "auxiliary_loss_clip": 0.01148418, + "auxiliary_loss_mlp": 0.01060974, + "balance_loss_clip": 1.05480969, + "balance_loss_mlp": 1.03905129, + "epoch": 0.1209679843679543, + "flos": 21105850717440.0, + "grad_norm": 2.0524904800028154, + "language_loss": 0.96236968, + "learning_rate": 3.913820600882834e-06, + "loss": 0.98446357, + "num_input_tokens_seen": 43549315, + "step": 2012, + "time_per_iteration": 2.7269980907440186 + }, + { + "auxiliary_loss_clip": 0.01146369, + "auxiliary_loss_mlp": 0.01051396, + "balance_loss_clip": 1.05808425, + "balance_loss_mlp": 1.0289607, + "epoch": 0.12102810762062227, + "flos": 29241053585280.0, + "grad_norm": 1.853151366811655, + "language_loss": 0.80903435, + "learning_rate": 3.913707471284283e-06, + "loss": 0.83101201, + "num_input_tokens_seen": 43569240, + "step": 2013, + "time_per_iteration": 2.740489959716797 + }, + { + "auxiliary_loss_clip": 0.01124703, + "auxiliary_loss_mlp": 0.0105341, + "balance_loss_clip": 1.05300117, + "balance_loss_mlp": 1.02962804, + "epoch": 0.12108823087329025, + "flos": 17930701111680.0, + "grad_norm": 5.099975898232357, + "language_loss": 0.77255923, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.79434031, + "num_input_tokens_seen": 43587710, + "step": 2014, + "time_per_iteration": 2.7361485958099365 + }, + { + "auxiliary_loss_clip": 0.0116607, + "auxiliary_loss_mlp": 0.01051056, + "balance_loss_clip": 1.05832791, + "balance_loss_mlp": 1.02791715, + "epoch": 0.12114835412595822, + "flos": 22091850028800.0, + "grad_norm": 5.8570343294144465, + "language_loss": 0.87169874, + "learning_rate": 3.913480994387535e-06, + "loss": 0.89387, + "num_input_tokens_seen": 43606000, + "step": 2015, + "time_per_iteration": 2.6881515979766846 + }, + { + "auxiliary_loss_clip": 0.01170382, + "auxiliary_loss_mlp": 0.01051162, + "balance_loss_clip": 1.05500197, + "balance_loss_mlp": 1.0289886, + "epoch": 0.12120847737862618, + "flos": 20412343854720.0, + "grad_norm": 2.087765239068409, + "language_loss": 0.69146478, + "learning_rate": 3.913367647097926e-06, + "loss": 0.71368027, + "num_input_tokens_seen": 43624815, + "step": 2016, + "time_per_iteration": 2.7096211910247803 + }, + { + "auxiliary_loss_clip": 0.01152563, + "auxiliary_loss_mlp": 0.0104714, + "balance_loss_clip": 1.05737591, + "balance_loss_mlp": 1.02390599, + "epoch": 0.12126860063129415, + "flos": 22309037614080.0, + "grad_norm": 2.8043603396252865, + "language_loss": 0.79858959, + "learning_rate": 3.913254227253225e-06, + "loss": 0.82058656, + "num_input_tokens_seen": 43643960, + "step": 2017, + "time_per_iteration": 2.7042336463928223 + }, + { + "auxiliary_loss_clip": 0.01156022, + "auxiliary_loss_mlp": 0.0105052, + "balance_loss_clip": 1.05479789, + "balance_loss_mlp": 1.02740538, + "epoch": 0.12132872388396213, + "flos": 13699275235200.0, + "grad_norm": 2.8700241463026654, + "language_loss": 0.68828821, + "learning_rate": 3.913140734857731e-06, + "loss": 0.71035373, + "num_input_tokens_seen": 43662650, + "step": 2018, + "time_per_iteration": 2.7015058994293213 + }, + { + "auxiliary_loss_clip": 0.01136376, + "auxiliary_loss_mlp": 0.01050749, + "balance_loss_clip": 1.05524123, + "balance_loss_mlp": 1.02873111, + "epoch": 0.12138884713663009, + "flos": 26466954307200.0, + "grad_norm": 1.6132330771570709, + "language_loss": 0.72476816, + "learning_rate": 3.91302716991575e-06, + "loss": 0.74663943, + "num_input_tokens_seen": 43684205, + "step": 2019, + "time_per_iteration": 2.8956947326660156 + }, + { + "auxiliary_loss_clip": 0.01107167, + "auxiliary_loss_mlp": 0.01057916, + "balance_loss_clip": 1.05286384, + "balance_loss_mlp": 1.03482556, + "epoch": 0.12144897038929806, + "flos": 26141603892480.0, + "grad_norm": 1.853626515444831, + "language_loss": 0.92125106, + "learning_rate": 3.912913532431586e-06, + "loss": 0.94290185, + "num_input_tokens_seen": 43706320, + "step": 2020, + "time_per_iteration": 2.9980764389038086 + }, + { + "auxiliary_loss_clip": 0.0114145, + "auxiliary_loss_mlp": 0.01055455, + "balance_loss_clip": 1.05289125, + "balance_loss_mlp": 1.03360391, + "epoch": 0.12150909364196603, + "flos": 24717530309760.0, + "grad_norm": 1.9227427415613194, + "language_loss": 0.7772885, + "learning_rate": 3.912799822409549e-06, + "loss": 0.79925752, + "num_input_tokens_seen": 43724805, + "step": 2021, + "time_per_iteration": 3.01798939704895 + }, + { + "auxiliary_loss_clip": 0.0117749, + "auxiliary_loss_mlp": 0.01049007, + "balance_loss_clip": 1.0610733, + "balance_loss_mlp": 1.0277164, + "epoch": 0.121569216894634, + "flos": 25186990089600.0, + "grad_norm": 2.054228820960504, + "language_loss": 0.80712306, + "learning_rate": 3.912686039853952e-06, + "loss": 0.82938808, + "num_input_tokens_seen": 43742320, + "step": 2022, + "time_per_iteration": 2.684309244155884 + }, + { + "auxiliary_loss_clip": 0.01144749, + "auxiliary_loss_mlp": 0.0106163, + "balance_loss_clip": 1.055619, + "balance_loss_mlp": 1.03697765, + "epoch": 0.12162934014730196, + "flos": 13444094039040.0, + "grad_norm": 1.734031517866852, + "language_loss": 0.84842217, + "learning_rate": 3.912572184769108e-06, + "loss": 0.87048596, + "num_input_tokens_seen": 43760665, + "step": 2023, + "time_per_iteration": 2.6886441707611084 + }, + { + "auxiliary_loss_clip": 0.01139348, + "auxiliary_loss_mlp": 0.01053043, + "balance_loss_clip": 1.05162323, + "balance_loss_mlp": 1.03081048, + "epoch": 0.12168946339996994, + "flos": 16946138344320.0, + "grad_norm": 2.3397199529221546, + "language_loss": 0.85514021, + "learning_rate": 3.912458257159335e-06, + "loss": 0.87706411, + "num_input_tokens_seen": 43779020, + "step": 2024, + "time_per_iteration": 2.8043718338012695 + }, + { + "auxiliary_loss_clip": 0.01169767, + "auxiliary_loss_mlp": 0.01055534, + "balance_loss_clip": 1.05277538, + "balance_loss_mlp": 1.03389716, + "epoch": 0.12174958665263791, + "flos": 29821585196160.0, + "grad_norm": 1.8432491304976684, + "language_loss": 0.72088945, + "learning_rate": 3.912344257028954e-06, + "loss": 0.74314243, + "num_input_tokens_seen": 43798850, + "step": 2025, + "time_per_iteration": 2.704876184463501 + }, + { + "auxiliary_loss_clip": 0.01148564, + "auxiliary_loss_mlp": 0.01047618, + "balance_loss_clip": 1.05486572, + "balance_loss_mlp": 1.02555275, + "epoch": 0.12180970990530587, + "flos": 24641902224000.0, + "grad_norm": 1.4969552271445652, + "language_loss": 0.76075011, + "learning_rate": 3.912230184382286e-06, + "loss": 0.78271192, + "num_input_tokens_seen": 43820130, + "step": 2026, + "time_per_iteration": 2.6957921981811523 + }, + { + "auxiliary_loss_clip": 0.01147374, + "auxiliary_loss_mlp": 0.01046261, + "balance_loss_clip": 1.05086374, + "balance_loss_mlp": 1.02474427, + "epoch": 0.12186983315797385, + "flos": 20521691832960.0, + "grad_norm": 2.2064263994277478, + "language_loss": 0.88769746, + "learning_rate": 3.912116039223659e-06, + "loss": 0.90963376, + "num_input_tokens_seen": 43838485, + "step": 2027, + "time_per_iteration": 2.6847639083862305 + }, + { + "auxiliary_loss_clip": 0.01143778, + "auxiliary_loss_mlp": 0.01056715, + "balance_loss_clip": 1.05258501, + "balance_loss_mlp": 1.03667617, + "epoch": 0.12192995641064182, + "flos": 27818344719360.0, + "grad_norm": 1.5725885574076592, + "language_loss": 0.75544459, + "learning_rate": 3.912001821557399e-06, + "loss": 0.77744961, + "num_input_tokens_seen": 43859080, + "step": 2028, + "time_per_iteration": 2.7706027030944824 + }, + { + "auxiliary_loss_clip": 0.01123185, + "auxiliary_loss_mlp": 0.01057136, + "balance_loss_clip": 1.0518471, + "balance_loss_mlp": 1.03554714, + "epoch": 0.12199007966330978, + "flos": 22017119783040.0, + "grad_norm": 2.0550419223931193, + "language_loss": 0.76802504, + "learning_rate": 3.911887531387839e-06, + "loss": 0.78982824, + "num_input_tokens_seen": 43879030, + "step": 2029, + "time_per_iteration": 2.732637405395508 + }, + { + "auxiliary_loss_clip": 0.01156591, + "auxiliary_loss_mlp": 0.01052355, + "balance_loss_clip": 1.05253625, + "balance_loss_mlp": 1.03107572, + "epoch": 0.12205020291597775, + "flos": 23295216493440.0, + "grad_norm": 1.707195979328818, + "language_loss": 0.79164296, + "learning_rate": 3.911773168719313e-06, + "loss": 0.81373239, + "num_input_tokens_seen": 43898505, + "step": 2030, + "time_per_iteration": 2.7254061698913574 + }, + { + "auxiliary_loss_clip": 0.0116997, + "auxiliary_loss_mlp": 0.01051357, + "balance_loss_clip": 1.05618095, + "balance_loss_mlp": 1.02930319, + "epoch": 0.12211032616864573, + "flos": 26031609469440.0, + "grad_norm": 3.038077546298312, + "language_loss": 0.74411637, + "learning_rate": 3.911658733556155e-06, + "loss": 0.76632965, + "num_input_tokens_seen": 43917945, + "step": 2031, + "time_per_iteration": 2.6711080074310303 + }, + { + "auxiliary_loss_clip": 0.01174332, + "auxiliary_loss_mlp": 0.01045812, + "balance_loss_clip": 1.05888343, + "balance_loss_mlp": 1.02545118, + "epoch": 0.12217044942131369, + "flos": 20410943224320.0, + "grad_norm": 1.7636188348969384, + "language_loss": 0.75230348, + "learning_rate": 3.911544225902707e-06, + "loss": 0.7745049, + "num_input_tokens_seen": 43937385, + "step": 2032, + "time_per_iteration": 2.7134530544281006 + }, + { + "auxiliary_loss_clip": 0.01152363, + "auxiliary_loss_mlp": 0.01045735, + "balance_loss_clip": 1.05129802, + "balance_loss_mlp": 1.02538586, + "epoch": 0.12223057267398166, + "flos": 22857142222080.0, + "grad_norm": 1.5809359138264147, + "language_loss": 0.89502287, + "learning_rate": 3.911429645763311e-06, + "loss": 0.91700387, + "num_input_tokens_seen": 43958130, + "step": 2033, + "time_per_iteration": 2.7105965614318848 + }, + { + "auxiliary_loss_clip": 0.01155694, + "auxiliary_loss_mlp": 0.01051169, + "balance_loss_clip": 1.05740523, + "balance_loss_mlp": 1.03005767, + "epoch": 0.12229069592664964, + "flos": 20047563285120.0, + "grad_norm": 1.9580868921695649, + "language_loss": 0.65195286, + "learning_rate": 3.911314993142311e-06, + "loss": 0.67402148, + "num_input_tokens_seen": 43976800, + "step": 2034, + "time_per_iteration": 4.222668886184692 + }, + { + "auxiliary_loss_clip": 0.01152239, + "auxiliary_loss_mlp": 0.01055659, + "balance_loss_clip": 1.05550218, + "balance_loss_mlp": 1.0327704, + "epoch": 0.1223508191793176, + "flos": 22274240313600.0, + "grad_norm": 1.6376942269871653, + "language_loss": 0.76459455, + "learning_rate": 3.911200268044055e-06, + "loss": 0.78667355, + "num_input_tokens_seen": 43996620, + "step": 2035, + "time_per_iteration": 2.7306556701660156 + }, + { + "auxiliary_loss_clip": 0.01176703, + "auxiliary_loss_mlp": 0.01050008, + "balance_loss_clip": 1.0577215, + "balance_loss_mlp": 1.02798975, + "epoch": 0.12241094243198557, + "flos": 21285978445440.0, + "grad_norm": 1.8460180606974623, + "language_loss": 0.71294892, + "learning_rate": 3.911085470472892e-06, + "loss": 0.73521602, + "num_input_tokens_seen": 44016175, + "step": 2036, + "time_per_iteration": 2.7327258586883545 + }, + { + "auxiliary_loss_clip": 0.01144473, + "auxiliary_loss_mlp": 0.01058389, + "balance_loss_clip": 1.05778408, + "balance_loss_mlp": 1.03623962, + "epoch": 0.12247106568465355, + "flos": 17382381022080.0, + "grad_norm": 1.5772021569883852, + "language_loss": 0.83130831, + "learning_rate": 3.910970600433178e-06, + "loss": 0.85333693, + "num_input_tokens_seen": 44035060, + "step": 2037, + "time_per_iteration": 4.248440742492676 + }, + { + "auxiliary_loss_clip": 0.01153641, + "auxiliary_loss_mlp": 0.01060257, + "balance_loss_clip": 1.0556947, + "balance_loss_mlp": 1.0366174, + "epoch": 0.12253118893732151, + "flos": 27045438842880.0, + "grad_norm": 2.676780030246967, + "language_loss": 0.79765236, + "learning_rate": 3.910855657929267e-06, + "loss": 0.81979132, + "num_input_tokens_seen": 44053330, + "step": 2038, + "time_per_iteration": 2.7321341037750244 + }, + { + "auxiliary_loss_clip": 0.010642, + "auxiliary_loss_mlp": 0.00759248, + "balance_loss_clip": 1.02961969, + "balance_loss_mlp": 1.00006962, + "epoch": 0.12259131218998948, + "flos": 53861518368000.0, + "grad_norm": 0.8248048644272604, + "language_loss": 0.58659601, + "learning_rate": 3.910740642965518e-06, + "loss": 0.6048305, + "num_input_tokens_seen": 44107575, + "step": 2039, + "time_per_iteration": 4.739040851593018 + }, + { + "auxiliary_loss_clip": 0.01128, + "auxiliary_loss_mlp": 0.01064411, + "balance_loss_clip": 1.05292714, + "balance_loss_mlp": 1.03912663, + "epoch": 0.12265143544265744, + "flos": 17891917401600.0, + "grad_norm": 2.1548467753138136, + "language_loss": 0.80099291, + "learning_rate": 3.910625555546292e-06, + "loss": 0.82291704, + "num_input_tokens_seen": 44126075, + "step": 2040, + "time_per_iteration": 2.723247766494751 + }, + { + "auxiliary_loss_clip": 0.01149343, + "auxiliary_loss_mlp": 0.01058534, + "balance_loss_clip": 1.05517352, + "balance_loss_mlp": 1.03673029, + "epoch": 0.12271155869532542, + "flos": 21799932197760.0, + "grad_norm": 1.8247690225218605, + "language_loss": 0.82841176, + "learning_rate": 3.910510395675953e-06, + "loss": 0.85049051, + "num_input_tokens_seen": 44145605, + "step": 2041, + "time_per_iteration": 2.699110984802246 + }, + { + "auxiliary_loss_clip": 0.01136001, + "auxiliary_loss_mlp": 0.01053451, + "balance_loss_clip": 1.05120957, + "balance_loss_mlp": 1.03061032, + "epoch": 0.12277168194799339, + "flos": 19828759587840.0, + "grad_norm": 1.9386136063873771, + "language_loss": 0.67272276, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.69461727, + "num_input_tokens_seen": 44164770, + "step": 2042, + "time_per_iteration": 2.7042133808135986 + }, + { + "auxiliary_loss_clip": 0.01133115, + "auxiliary_loss_mlp": 0.01056941, + "balance_loss_clip": 1.05079007, + "balance_loss_mlp": 1.03517294, + "epoch": 0.12283180520066135, + "flos": 23221024951680.0, + "grad_norm": 1.912164915278887, + "language_loss": 0.81765604, + "learning_rate": 3.910279858599409e-06, + "loss": 0.83955657, + "num_input_tokens_seen": 44184025, + "step": 2043, + "time_per_iteration": 2.6942050457000732 + }, + { + "auxiliary_loss_clip": 0.01146416, + "auxiliary_loss_mlp": 0.01052365, + "balance_loss_clip": 1.05161905, + "balance_loss_mlp": 1.03040695, + "epoch": 0.12289192845332933, + "flos": 18588476920320.0, + "grad_norm": 1.7894844734354058, + "language_loss": 0.80192459, + "learning_rate": 3.910164481401946e-06, + "loss": 0.82391244, + "num_input_tokens_seen": 44202950, + "step": 2044, + "time_per_iteration": 2.6227192878723145 + }, + { + "auxiliary_loss_clip": 0.01116285, + "auxiliary_loss_mlp": 0.01052013, + "balance_loss_clip": 1.05284619, + "balance_loss_mlp": 1.03055525, + "epoch": 0.1229520517059973, + "flos": 25769532862080.0, + "grad_norm": 1.7152742607840916, + "language_loss": 0.7794897, + "learning_rate": 3.910049031770853e-06, + "loss": 0.80117267, + "num_input_tokens_seen": 44221115, + "step": 2045, + "time_per_iteration": 2.769017219543457 + }, + { + "auxiliary_loss_clip": 0.01163545, + "auxiliary_loss_mlp": 0.01060468, + "balance_loss_clip": 1.05796146, + "balance_loss_mlp": 1.03827095, + "epoch": 0.12301217495866526, + "flos": 20887154760960.0, + "grad_norm": 1.852572781372854, + "language_loss": 0.67284262, + "learning_rate": 3.90993350971051e-06, + "loss": 0.69508278, + "num_input_tokens_seen": 44240575, + "step": 2046, + "time_per_iteration": 2.6377944946289062 + }, + { + "auxiliary_loss_clip": 0.01173803, + "auxiliary_loss_mlp": 0.01053755, + "balance_loss_clip": 1.06010675, + "balance_loss_mlp": 1.03202295, + "epoch": 0.12307229821133324, + "flos": 22378811783040.0, + "grad_norm": 4.982373490718116, + "language_loss": 0.72730684, + "learning_rate": 3.909817915225297e-06, + "loss": 0.74958241, + "num_input_tokens_seen": 44257145, + "step": 2047, + "time_per_iteration": 2.5791239738464355 + }, + { + "auxiliary_loss_clip": 0.01155159, + "auxiliary_loss_mlp": 0.01060632, + "balance_loss_clip": 1.05398846, + "balance_loss_mlp": 1.03817296, + "epoch": 0.1231324214640012, + "flos": 23367396873600.0, + "grad_norm": 1.8194194024321948, + "language_loss": 0.76583183, + "learning_rate": 3.909702248319597e-06, + "loss": 0.78798974, + "num_input_tokens_seen": 44278035, + "step": 2048, + "time_per_iteration": 2.6997592449188232 + }, + { + "auxiliary_loss_clip": 0.01146796, + "auxiliary_loss_mlp": 0.01047309, + "balance_loss_clip": 1.05524468, + "balance_loss_mlp": 1.02798486, + "epoch": 0.12319254471666917, + "flos": 23767154311680.0, + "grad_norm": 1.8097490634569602, + "language_loss": 0.85359102, + "learning_rate": 3.909586508997797e-06, + "loss": 0.87553203, + "num_input_tokens_seen": 44296980, + "step": 2049, + "time_per_iteration": 2.739617109298706 + }, + { + "auxiliary_loss_clip": 0.01120276, + "auxiliary_loss_mlp": 0.01050145, + "balance_loss_clip": 1.0533725, + "balance_loss_mlp": 1.02887857, + "epoch": 0.12325266796933713, + "flos": 23550146294400.0, + "grad_norm": 2.6582136339172724, + "language_loss": 0.75563407, + "learning_rate": 3.909470697264285e-06, + "loss": 0.77733827, + "num_input_tokens_seen": 44318005, + "step": 2050, + "time_per_iteration": 2.7814078330993652 + }, + { + "auxiliary_loss_clip": 0.01138568, + "auxiliary_loss_mlp": 0.01057939, + "balance_loss_clip": 1.05428278, + "balance_loss_mlp": 1.03608823, + "epoch": 0.12331279122200511, + "flos": 24423996366720.0, + "grad_norm": 1.81408967902731, + "language_loss": 0.81166679, + "learning_rate": 3.909354813123452e-06, + "loss": 0.83363187, + "num_input_tokens_seen": 44335260, + "step": 2051, + "time_per_iteration": 2.7555224895477295 + }, + { + "auxiliary_loss_clip": 0.01171646, + "auxiliary_loss_mlp": 0.00779218, + "balance_loss_clip": 1.05882978, + "balance_loss_mlp": 0.99996465, + "epoch": 0.12337291447467308, + "flos": 25484294960640.0, + "grad_norm": 1.8885516327307212, + "language_loss": 0.80445349, + "learning_rate": 3.909238856579693e-06, + "loss": 0.82396215, + "num_input_tokens_seen": 44355315, + "step": 2052, + "time_per_iteration": 2.7676405906677246 + }, + { + "auxiliary_loss_clip": 0.01165489, + "auxiliary_loss_mlp": 0.010569, + "balance_loss_clip": 1.0581975, + "balance_loss_mlp": 1.03537059, + "epoch": 0.12343303772734104, + "flos": 23550002640000.0, + "grad_norm": 2.171205541070781, + "language_loss": 0.73676848, + "learning_rate": 3.909122827637406e-06, + "loss": 0.75899243, + "num_input_tokens_seen": 44373020, + "step": 2053, + "time_per_iteration": 2.648609161376953 + }, + { + "auxiliary_loss_clip": 0.01168883, + "auxiliary_loss_mlp": 0.00778478, + "balance_loss_clip": 1.05302441, + "balance_loss_mlp": 0.99995315, + "epoch": 0.12349316098000902, + "flos": 47557074867840.0, + "grad_norm": 1.5051513438882418, + "language_loss": 0.7413671, + "learning_rate": 3.909006726300991e-06, + "loss": 0.76084077, + "num_input_tokens_seen": 44397525, + "step": 2054, + "time_per_iteration": 2.871469020843506 + }, + { + "auxiliary_loss_clip": 0.01147607, + "auxiliary_loss_mlp": 0.01044612, + "balance_loss_clip": 1.05402803, + "balance_loss_mlp": 1.02482307, + "epoch": 0.12355328423267699, + "flos": 25045969294080.0, + "grad_norm": 4.50189877271012, + "language_loss": 0.85417157, + "learning_rate": 3.908890552574849e-06, + "loss": 0.8760938, + "num_input_tokens_seen": 44415890, + "step": 2055, + "time_per_iteration": 2.7136077880859375 + }, + { + "auxiliary_loss_clip": 0.01133829, + "auxiliary_loss_mlp": 0.01047458, + "balance_loss_clip": 1.05999517, + "balance_loss_mlp": 1.02802706, + "epoch": 0.12361340748534495, + "flos": 27709140395520.0, + "grad_norm": 2.0629908776416688, + "language_loss": 0.77506042, + "learning_rate": 3.908774306463384e-06, + "loss": 0.79687333, + "num_input_tokens_seen": 44436625, + "step": 2056, + "time_per_iteration": 2.83107852935791 + }, + { + "auxiliary_loss_clip": 0.01158234, + "auxiliary_loss_mlp": 0.01055, + "balance_loss_clip": 1.05444396, + "balance_loss_mlp": 1.03405499, + "epoch": 0.12367353073801293, + "flos": 26140598311680.0, + "grad_norm": 1.9893743253373262, + "language_loss": 0.83361745, + "learning_rate": 3.908657987971009e-06, + "loss": 0.85574985, + "num_input_tokens_seen": 44455265, + "step": 2057, + "time_per_iteration": 2.6987085342407227 + }, + { + "auxiliary_loss_clip": 0.01141319, + "auxiliary_loss_mlp": 0.01051708, + "balance_loss_clip": 1.05057144, + "balance_loss_mlp": 1.02991605, + "epoch": 0.1237336539906809, + "flos": 25156035544320.0, + "grad_norm": 1.4905135493793764, + "language_loss": 0.77818203, + "learning_rate": 3.90854159710213e-06, + "loss": 0.80011231, + "num_input_tokens_seen": 44475815, + "step": 2058, + "time_per_iteration": 2.7149016857147217 + }, + { + "auxiliary_loss_clip": 0.01138087, + "auxiliary_loss_mlp": 0.01058134, + "balance_loss_clip": 1.05117273, + "balance_loss_mlp": 1.03482866, + "epoch": 0.12379377724334886, + "flos": 15304589867520.0, + "grad_norm": 1.8387803476985631, + "language_loss": 0.8342883, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.85625052, + "num_input_tokens_seen": 44494045, + "step": 2059, + "time_per_iteration": 2.7030091285705566 + }, + { + "auxiliary_loss_clip": 0.01133517, + "auxiliary_loss_mlp": 0.01057399, + "balance_loss_clip": 1.05123472, + "balance_loss_mlp": 1.03445077, + "epoch": 0.12385390049601683, + "flos": 21316717509120.0, + "grad_norm": 2.7478129466394217, + "language_loss": 0.81420219, + "learning_rate": 3.908308598252523e-06, + "loss": 0.83611137, + "num_input_tokens_seen": 44509120, + "step": 2060, + "time_per_iteration": 2.738499402999878 + }, + { + "auxiliary_loss_clip": 0.01150334, + "auxiliary_loss_mlp": 0.01054424, + "balance_loss_clip": 1.05367386, + "balance_loss_mlp": 1.0315125, + "epoch": 0.1239140237486848, + "flos": 15116309752320.0, + "grad_norm": 1.8699548955873522, + "language_loss": 0.86224365, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.88429129, + "num_input_tokens_seen": 44525780, + "step": 2061, + "time_per_iteration": 2.6492960453033447 + }, + { + "auxiliary_loss_clip": 0.0115523, + "auxiliary_loss_mlp": 0.01050307, + "balance_loss_clip": 1.05506253, + "balance_loss_mlp": 1.03031528, + "epoch": 0.12397414700135277, + "flos": 21976791788160.0, + "grad_norm": 2.006361909654615, + "language_loss": 0.84949362, + "learning_rate": 3.908075309949906e-06, + "loss": 0.87154901, + "num_input_tokens_seen": 44543125, + "step": 2062, + "time_per_iteration": 2.5925393104553223 + }, + { + "auxiliary_loss_clip": 0.01124676, + "auxiliary_loss_mlp": 0.01058304, + "balance_loss_clip": 1.05198252, + "balance_loss_mlp": 1.03498697, + "epoch": 0.12403427025402074, + "flos": 13400892956160.0, + "grad_norm": 1.6181471799462952, + "language_loss": 0.78765064, + "learning_rate": 3.907958557264774e-06, + "loss": 0.80948043, + "num_input_tokens_seen": 44560275, + "step": 2063, + "time_per_iteration": 2.7551674842834473 + }, + { + "auxiliary_loss_clip": 0.01124369, + "auxiliary_loss_mlp": 0.01057465, + "balance_loss_clip": 1.05492854, + "balance_loss_mlp": 1.03450513, + "epoch": 0.12409439350668872, + "flos": 15304374385920.0, + "grad_norm": 2.9315517002695017, + "language_loss": 0.79452097, + "learning_rate": 3.907841732229663e-06, + "loss": 0.81633931, + "num_input_tokens_seen": 44577640, + "step": 2064, + "time_per_iteration": 2.699711322784424 + }, + { + "auxiliary_loss_clip": 0.01144709, + "auxiliary_loss_mlp": 0.01058768, + "balance_loss_clip": 1.05316699, + "balance_loss_mlp": 1.03847849, + "epoch": 0.12415451675935668, + "flos": 25009376313600.0, + "grad_norm": 2.5611248351266016, + "language_loss": 0.92676973, + "learning_rate": 3.907724834849002e-06, + "loss": 0.9488045, + "num_input_tokens_seen": 44594860, + "step": 2065, + "time_per_iteration": 2.7114996910095215 + }, + { + "auxiliary_loss_clip": 0.01147841, + "auxiliary_loss_mlp": 0.01052058, + "balance_loss_clip": 1.05113554, + "balance_loss_mlp": 1.02943158, + "epoch": 0.12421464001202465, + "flos": 23659673840640.0, + "grad_norm": 1.7498294279318665, + "language_loss": 0.80540735, + "learning_rate": 3.907607865127225e-06, + "loss": 0.82740629, + "num_input_tokens_seen": 44614780, + "step": 2066, + "time_per_iteration": 2.6958389282226562 + }, + { + "auxiliary_loss_clip": 0.01030831, + "auxiliary_loss_mlp": 0.01051436, + "balance_loss_clip": 1.02768898, + "balance_loss_mlp": 1.04884958, + "epoch": 0.12427476326469263, + "flos": 65732904345600.0, + "grad_norm": 0.8715885531008962, + "language_loss": 0.63299954, + "learning_rate": 3.907490823068766e-06, + "loss": 0.6538223, + "num_input_tokens_seen": 44671240, + "step": 2067, + "time_per_iteration": 3.200000762939453 + }, + { + "auxiliary_loss_clip": 0.01117858, + "auxiliary_loss_mlp": 0.01057985, + "balance_loss_clip": 1.04878855, + "balance_loss_mlp": 1.0344646, + "epoch": 0.12433488651736059, + "flos": 24535427333760.0, + "grad_norm": 1.9218217735084064, + "language_loss": 0.93783462, + "learning_rate": 3.907373708678063e-06, + "loss": 0.959593, + "num_input_tokens_seen": 44691050, + "step": 2068, + "time_per_iteration": 2.7631025314331055 + }, + { + "auxiliary_loss_clip": 0.01166393, + "auxiliary_loss_mlp": 0.0105657, + "balance_loss_clip": 1.05994427, + "balance_loss_mlp": 1.03697169, + "epoch": 0.12439500977002856, + "flos": 21031659175680.0, + "grad_norm": 1.8717926968048342, + "language_loss": 0.80861229, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.83084196, + "num_input_tokens_seen": 44709850, + "step": 2069, + "time_per_iteration": 2.6630098819732666 + }, + { + "auxiliary_loss_clip": 0.01113262, + "auxiliary_loss_mlp": 0.01062592, + "balance_loss_clip": 1.04863238, + "balance_loss_mlp": 1.03963184, + "epoch": 0.12445513302269653, + "flos": 26830621555200.0, + "grad_norm": 1.5649570979854035, + "language_loss": 0.777978, + "learning_rate": 3.907139262917696e-06, + "loss": 0.79973656, + "num_input_tokens_seen": 44731475, + "step": 2070, + "time_per_iteration": 2.7750463485717773 + }, + { + "auxiliary_loss_clip": 0.01156875, + "auxiliary_loss_mlp": 0.01052509, + "balance_loss_clip": 1.05520415, + "balance_loss_mlp": 1.03055048, + "epoch": 0.1245152562753645, + "flos": 18368919037440.0, + "grad_norm": 2.2051981544638166, + "language_loss": 0.80743957, + "learning_rate": 3.907021931556922e-06, + "loss": 0.8295334, + "num_input_tokens_seen": 44749685, + "step": 2071, + "time_per_iteration": 2.654171943664551 + }, + { + "auxiliary_loss_clip": 0.01154683, + "auxiliary_loss_mlp": 0.01055767, + "balance_loss_clip": 1.05492425, + "balance_loss_mlp": 1.03405952, + "epoch": 0.12457537952803246, + "flos": 33107986200960.0, + "grad_norm": 2.118828414072521, + "language_loss": 0.78278041, + "learning_rate": 3.906904527881684e-06, + "loss": 0.80488491, + "num_input_tokens_seen": 44772165, + "step": 2072, + "time_per_iteration": 2.753159284591675 + }, + { + "auxiliary_loss_clip": 0.0114568, + "auxiliary_loss_mlp": 0.01055287, + "balance_loss_clip": 1.05651307, + "balance_loss_mlp": 1.03381729, + "epoch": 0.12463550278070043, + "flos": 22270217990400.0, + "grad_norm": 7.360489773093417, + "language_loss": 0.752267, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.77427667, + "num_input_tokens_seen": 44790580, + "step": 2073, + "time_per_iteration": 2.6561899185180664 + }, + { + "auxiliary_loss_clip": 0.01096485, + "auxiliary_loss_mlp": 0.01053193, + "balance_loss_clip": 1.04471385, + "balance_loss_mlp": 1.03086543, + "epoch": 0.12469562603336841, + "flos": 14679025580160.0, + "grad_norm": 1.9234955386089483, + "language_loss": 0.90560025, + "learning_rate": 3.906669503605631e-06, + "loss": 0.92709696, + "num_input_tokens_seen": 44806730, + "step": 2074, + "time_per_iteration": 2.7846343517303467 + }, + { + "auxiliary_loss_clip": 0.01105332, + "auxiliary_loss_mlp": 0.01056651, + "balance_loss_clip": 1.04977274, + "balance_loss_mlp": 1.03346491, + "epoch": 0.12475574928603637, + "flos": 24644775312000.0, + "grad_norm": 2.8321626325497493, + "language_loss": 0.83836985, + "learning_rate": 3.906551883013728e-06, + "loss": 0.8599897, + "num_input_tokens_seen": 44825550, + "step": 2075, + "time_per_iteration": 4.412928342819214 + }, + { + "auxiliary_loss_clip": 0.01107078, + "auxiliary_loss_mlp": 0.01062819, + "balance_loss_clip": 1.04380202, + "balance_loss_mlp": 1.03972864, + "epoch": 0.12481587253870434, + "flos": 21762980081280.0, + "grad_norm": 2.042892519020311, + "language_loss": 0.73648787, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.75818682, + "num_input_tokens_seen": 44844155, + "step": 2076, + "time_per_iteration": 5.925223112106323 + }, + { + "auxiliary_loss_clip": 0.01101731, + "auxiliary_loss_mlp": 0.01048176, + "balance_loss_clip": 1.04774427, + "balance_loss_mlp": 1.02751708, + "epoch": 0.12487599579137232, + "flos": 21432529935360.0, + "grad_norm": 1.8779339700875872, + "language_loss": 0.7622484, + "learning_rate": 3.906316424944469e-06, + "loss": 0.78374755, + "num_input_tokens_seen": 44863780, + "step": 2077, + "time_per_iteration": 2.70566987991333 + }, + { + "auxiliary_loss_clip": 0.01156274, + "auxiliary_loss_mlp": 0.01062042, + "balance_loss_clip": 1.05365288, + "balance_loss_mlp": 1.04001164, + "epoch": 0.12493611904404028, + "flos": 16107624276480.0, + "grad_norm": 2.022280968605665, + "language_loss": 0.82290226, + "learning_rate": 3.906198587476043e-06, + "loss": 0.84508544, + "num_input_tokens_seen": 44881480, + "step": 2078, + "time_per_iteration": 4.302385568618774 + }, + { + "auxiliary_loss_clip": 0.01144821, + "auxiliary_loss_mlp": 0.01050482, + "balance_loss_clip": 1.05281842, + "balance_loss_mlp": 1.02855957, + "epoch": 0.12499624229670825, + "flos": 21580266574080.0, + "grad_norm": 1.6413520418295044, + "language_loss": 0.75195324, + "learning_rate": 3.906080677724374e-06, + "loss": 0.77390629, + "num_input_tokens_seen": 44900390, + "step": 2079, + "time_per_iteration": 2.6915946006774902 + }, + { + "auxiliary_loss_clip": 0.01166758, + "auxiliary_loss_mlp": 0.01058474, + "balance_loss_clip": 1.05881989, + "balance_loss_mlp": 1.03696847, + "epoch": 0.1250563655493762, + "flos": 25699040421120.0, + "grad_norm": 6.733284446627088, + "language_loss": 0.83874094, + "learning_rate": 3.905962695693935e-06, + "loss": 0.86099327, + "num_input_tokens_seen": 44920375, + "step": 2080, + "time_per_iteration": 2.7467572689056396 + }, + { + "auxiliary_loss_clip": 0.01156163, + "auxiliary_loss_mlp": 0.01059409, + "balance_loss_clip": 1.05525088, + "balance_loss_mlp": 1.03885686, + "epoch": 0.12511648880204418, + "flos": 16909509450240.0, + "grad_norm": 1.8581885454518776, + "language_loss": 0.84644079, + "learning_rate": 3.9058446413892e-06, + "loss": 0.86859655, + "num_input_tokens_seen": 44938415, + "step": 2081, + "time_per_iteration": 2.685875654220581 + }, + { + "auxiliary_loss_clip": 0.01156835, + "auxiliary_loss_mlp": 0.01046398, + "balance_loss_clip": 1.05375946, + "balance_loss_mlp": 1.02594149, + "epoch": 0.12517661205471217, + "flos": 17567500740480.0, + "grad_norm": 1.8191819349610059, + "language_loss": 0.76739037, + "learning_rate": 3.905726514814646e-06, + "loss": 0.78942269, + "num_input_tokens_seen": 44957135, + "step": 2082, + "time_per_iteration": 2.6133053302764893 + }, + { + "auxiliary_loss_clip": 0.01152911, + "auxiliary_loss_mlp": 0.0104632, + "balance_loss_clip": 1.05701911, + "balance_loss_mlp": 1.02463615, + "epoch": 0.12523673530738014, + "flos": 16033791870720.0, + "grad_norm": 2.5415589476696265, + "language_loss": 0.79044539, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.81243765, + "num_input_tokens_seen": 44974480, + "step": 2083, + "time_per_iteration": 2.6963307857513428 + }, + { + "auxiliary_loss_clip": 0.01147874, + "auxiliary_loss_mlp": 0.01047351, + "balance_loss_clip": 1.05509973, + "balance_loss_mlp": 1.02421284, + "epoch": 0.1252968585600481, + "flos": 18807747494400.0, + "grad_norm": 2.1696249857299, + "language_loss": 0.89831448, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.92026675, + "num_input_tokens_seen": 44990310, + "step": 2084, + "time_per_iteration": 2.6770403385162354 + }, + { + "auxiliary_loss_clip": 0.01131068, + "auxiliary_loss_mlp": 0.01048299, + "balance_loss_clip": 1.05299771, + "balance_loss_mlp": 1.02729464, + "epoch": 0.12535698181271607, + "flos": 27271568914560.0, + "grad_norm": 1.8896331095253402, + "language_loss": 0.80354226, + "learning_rate": 3.905371701516869e-06, + "loss": 0.82533598, + "num_input_tokens_seen": 45010720, + "step": 2085, + "time_per_iteration": 2.749783515930176 + }, + { + "auxiliary_loss_clip": 0.01170318, + "auxiliary_loss_mlp": 0.01051018, + "balance_loss_clip": 1.05725896, + "balance_loss_mlp": 1.03001356, + "epoch": 0.12541710506538403, + "flos": 22054107813120.0, + "grad_norm": 1.8300316094254767, + "language_loss": 0.88228154, + "learning_rate": 3.905253285907856e-06, + "loss": 0.90449488, + "num_input_tokens_seen": 45030360, + "step": 2086, + "time_per_iteration": 2.603515148162842 + }, + { + "auxiliary_loss_clip": 0.01134598, + "auxiliary_loss_mlp": 0.01044925, + "balance_loss_clip": 1.05278981, + "balance_loss_mlp": 1.02522027, + "epoch": 0.125477228318052, + "flos": 12603173760000.0, + "grad_norm": 2.0471238132540344, + "language_loss": 0.86819696, + "learning_rate": 3.905134798051447e-06, + "loss": 0.88999224, + "num_input_tokens_seen": 45045085, + "step": 2087, + "time_per_iteration": 2.6265859603881836 + }, + { + "auxiliary_loss_clip": 0.01146999, + "auxiliary_loss_mlp": 0.01058875, + "balance_loss_clip": 1.05599046, + "balance_loss_mlp": 1.03651142, + "epoch": 0.12553735157071996, + "flos": 23878549365120.0, + "grad_norm": 2.3362397674907758, + "language_loss": 0.73027468, + "learning_rate": 3.905016237952136e-06, + "loss": 0.75233346, + "num_input_tokens_seen": 45065145, + "step": 2088, + "time_per_iteration": 2.65324330329895 + }, + { + "auxiliary_loss_clip": 0.01062529, + "auxiliary_loss_mlp": 0.01013405, + "balance_loss_clip": 1.02985716, + "balance_loss_mlp": 1.01079392, + "epoch": 0.12559747482338796, + "flos": 69920841830400.0, + "grad_norm": 0.7742255614948045, + "language_loss": 0.61767036, + "learning_rate": 3.904897605614418e-06, + "loss": 0.6384297, + "num_input_tokens_seen": 45126230, + "step": 2089, + "time_per_iteration": 3.1219804286956787 + }, + { + "auxiliary_loss_clip": 0.01149606, + "auxiliary_loss_mlp": 0.01060841, + "balance_loss_clip": 1.05670094, + "balance_loss_mlp": 1.0388943, + "epoch": 0.12565759807605592, + "flos": 24279563779200.0, + "grad_norm": 1.817095421446176, + "language_loss": 0.7781918, + "learning_rate": 3.904778901042793e-06, + "loss": 0.80029625, + "num_input_tokens_seen": 45145545, + "step": 2090, + "time_per_iteration": 2.700425863265991 + }, + { + "auxiliary_loss_clip": 0.01046946, + "auxiliary_loss_mlp": 0.01013884, + "balance_loss_clip": 1.03125095, + "balance_loss_mlp": 1.01101136, + "epoch": 0.12571772132872389, + "flos": 56451180286080.0, + "grad_norm": 0.760599485634597, + "language_loss": 0.59434772, + "learning_rate": 3.90466012424176e-06, + "loss": 0.61495602, + "num_input_tokens_seen": 45206845, + "step": 2091, + "time_per_iteration": 3.0814294815063477 + }, + { + "auxiliary_loss_clip": 0.01159814, + "auxiliary_loss_mlp": 0.01060546, + "balance_loss_clip": 1.05760789, + "balance_loss_mlp": 1.041067, + "epoch": 0.12577784458139185, + "flos": 41245846675200.0, + "grad_norm": 1.6552462178493936, + "language_loss": 0.62916517, + "learning_rate": 3.904541275215825e-06, + "loss": 0.6513688, + "num_input_tokens_seen": 45228495, + "step": 2092, + "time_per_iteration": 2.7813880443573 + }, + { + "auxiliary_loss_clip": 0.01147016, + "auxiliary_loss_mlp": 0.01061963, + "balance_loss_clip": 1.05395663, + "balance_loss_mlp": 1.04069614, + "epoch": 0.12583796783405982, + "flos": 19755501799680.0, + "grad_norm": 2.279616692029291, + "language_loss": 0.80507946, + "learning_rate": 3.904422353969493e-06, + "loss": 0.82716924, + "num_input_tokens_seen": 45245720, + "step": 2093, + "time_per_iteration": 2.6768014430999756 + }, + { + "auxiliary_loss_clip": 0.01146976, + "auxiliary_loss_mlp": 0.01075616, + "balance_loss_clip": 1.0524025, + "balance_loss_mlp": 1.05380058, + "epoch": 0.12589809108672778, + "flos": 22602104680320.0, + "grad_norm": 1.7347385846840702, + "language_loss": 0.76003867, + "learning_rate": 3.904303360507276e-06, + "loss": 0.78226459, + "num_input_tokens_seen": 45265650, + "step": 2094, + "time_per_iteration": 2.6730611324310303 + }, + { + "auxiliary_loss_clip": 0.01117887, + "auxiliary_loss_mlp": 0.01069309, + "balance_loss_clip": 1.0500071, + "balance_loss_mlp": 1.04892457, + "epoch": 0.12595821433939577, + "flos": 45222845541120.0, + "grad_norm": 1.5703706409155747, + "language_loss": 0.76664734, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.78851926, + "num_input_tokens_seen": 45287790, + "step": 2095, + "time_per_iteration": 2.958367109298706 + }, + { + "auxiliary_loss_clip": 0.01147751, + "auxiliary_loss_mlp": 0.01058477, + "balance_loss_clip": 1.05202031, + "balance_loss_mlp": 1.03782988, + "epoch": 0.12601833759206374, + "flos": 14319811618560.0, + "grad_norm": 2.2556524892449326, + "language_loss": 0.83266854, + "learning_rate": 3.904065156953232e-06, + "loss": 0.85473078, + "num_input_tokens_seen": 45305720, + "step": 2096, + "time_per_iteration": 2.7097342014312744 + }, + { + "auxiliary_loss_clip": 0.01163652, + "auxiliary_loss_mlp": 0.01056552, + "balance_loss_clip": 1.05806553, + "balance_loss_mlp": 1.03577375, + "epoch": 0.1260784608447317, + "flos": 21288241002240.0, + "grad_norm": 1.7589400475615893, + "language_loss": 0.75478256, + "learning_rate": 3.903945946870439e-06, + "loss": 0.77698463, + "num_input_tokens_seen": 45325290, + "step": 2097, + "time_per_iteration": 2.642056703567505 + }, + { + "auxiliary_loss_clip": 0.01156719, + "auxiliary_loss_mlp": 0.01063976, + "balance_loss_clip": 1.05648863, + "balance_loss_mlp": 1.04527175, + "epoch": 0.12613858409739967, + "flos": 26251311006720.0, + "grad_norm": 1.8828235460619742, + "language_loss": 0.87110066, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.89330757, + "num_input_tokens_seen": 45344465, + "step": 2098, + "time_per_iteration": 2.63826584815979 + }, + { + "auxiliary_loss_clip": 0.01117414, + "auxiliary_loss_mlp": 0.01058025, + "balance_loss_clip": 1.04983974, + "balance_loss_mlp": 1.03475559, + "epoch": 0.12619870735006763, + "flos": 21579979265280.0, + "grad_norm": 1.8855647331078333, + "language_loss": 0.69494271, + "learning_rate": 3.903707310115912e-06, + "loss": 0.7166971, + "num_input_tokens_seen": 45362465, + "step": 2099, + "time_per_iteration": 2.7813057899475098 + }, + { + "auxiliary_loss_clip": 0.01142696, + "auxiliary_loss_mlp": 0.01061431, + "balance_loss_clip": 1.04979372, + "balance_loss_mlp": 1.03923464, + "epoch": 0.1262588306027356, + "flos": 23367037737600.0, + "grad_norm": 2.0457253500590498, + "language_loss": 0.81949925, + "learning_rate": 3.903587883453228e-06, + "loss": 0.84154058, + "num_input_tokens_seen": 45382700, + "step": 2100, + "time_per_iteration": 2.704871416091919 + }, + { + "auxiliary_loss_clip": 0.01159613, + "auxiliary_loss_mlp": 0.01055067, + "balance_loss_clip": 1.0620985, + "balance_loss_mlp": 1.03408623, + "epoch": 0.12631895385540357, + "flos": 23949185460480.0, + "grad_norm": 1.7810176086536167, + "language_loss": 0.80399859, + "learning_rate": 3.903468384606302e-06, + "loss": 0.82614541, + "num_input_tokens_seen": 45401005, + "step": 2101, + "time_per_iteration": 2.7071452140808105 + }, + { + "auxiliary_loss_clip": 0.0106985, + "auxiliary_loss_mlp": 0.01010859, + "balance_loss_clip": 1.02823138, + "balance_loss_mlp": 1.00803375, + "epoch": 0.12637907710807156, + "flos": 70282138780800.0, + "grad_norm": 0.7128618749962091, + "language_loss": 0.57087427, + "learning_rate": 3.903348813579662e-06, + "loss": 0.59168136, + "num_input_tokens_seen": 45466555, + "step": 2102, + "time_per_iteration": 3.20320987701416 + }, + { + "auxiliary_loss_clip": 0.01140495, + "auxiliary_loss_mlp": 0.01056574, + "balance_loss_clip": 1.053671, + "balance_loss_mlp": 1.03661788, + "epoch": 0.12643920036073952, + "flos": 18915084311040.0, + "grad_norm": 2.0306165352193988, + "language_loss": 0.93653679, + "learning_rate": 3.903229170377845e-06, + "loss": 0.95850742, + "num_input_tokens_seen": 45485165, + "step": 2103, + "time_per_iteration": 2.6628894805908203 + }, + { + "auxiliary_loss_clip": 0.01144405, + "auxiliary_loss_mlp": 0.01040745, + "balance_loss_clip": 1.04991472, + "balance_loss_mlp": 1.02174282, + "epoch": 0.1264993236134075, + "flos": 27782470010880.0, + "grad_norm": 1.5962316578756222, + "language_loss": 0.7804662, + "learning_rate": 3.903109455005387e-06, + "loss": 0.80231774, + "num_input_tokens_seen": 45504630, + "step": 2104, + "time_per_iteration": 2.6215474605560303 + }, + { + "auxiliary_loss_clip": 0.01135927, + "auxiliary_loss_mlp": 0.01056343, + "balance_loss_clip": 1.05414486, + "balance_loss_mlp": 1.03683996, + "epoch": 0.12655944686607545, + "flos": 24754697907840.0, + "grad_norm": 1.7362499149688688, + "language_loss": 0.80728614, + "learning_rate": 3.902989667466828e-06, + "loss": 0.82920885, + "num_input_tokens_seen": 45524885, + "step": 2105, + "time_per_iteration": 2.74128794670105 + }, + { + "auxiliary_loss_clip": 0.01162904, + "auxiliary_loss_mlp": 0.01056367, + "balance_loss_clip": 1.05482686, + "balance_loss_mlp": 1.03514743, + "epoch": 0.12661957011874342, + "flos": 24133048202880.0, + "grad_norm": 1.9810187943106816, + "language_loss": 0.83402872, + "learning_rate": 3.90286980776671e-06, + "loss": 0.85622144, + "num_input_tokens_seen": 45545000, + "step": 2106, + "time_per_iteration": 2.676694631576538 + }, + { + "auxiliary_loss_clip": 0.01126632, + "auxiliary_loss_mlp": 0.01052067, + "balance_loss_clip": 1.05697966, + "balance_loss_mlp": 1.03147984, + "epoch": 0.12667969337141138, + "flos": 24569614103040.0, + "grad_norm": 1.6951691508845637, + "language_loss": 0.73469931, + "learning_rate": 3.902749875909578e-06, + "loss": 0.7564863, + "num_input_tokens_seen": 45564210, + "step": 2107, + "time_per_iteration": 2.7506372928619385 + }, + { + "auxiliary_loss_clip": 0.01162931, + "auxiliary_loss_mlp": 0.01044317, + "balance_loss_clip": 1.05320692, + "balance_loss_mlp": 1.02599406, + "epoch": 0.12673981662407935, + "flos": 22961677777920.0, + "grad_norm": 2.0116792159666477, + "language_loss": 0.79395336, + "learning_rate": 3.90262987189998e-06, + "loss": 0.81602579, + "num_input_tokens_seen": 45583030, + "step": 2108, + "time_per_iteration": 2.6611146926879883 + }, + { + "auxiliary_loss_clip": 0.01168073, + "auxiliary_loss_mlp": 0.01049192, + "balance_loss_clip": 1.05300844, + "balance_loss_mlp": 1.02945089, + "epoch": 0.12679993987674734, + "flos": 17274864637440.0, + "grad_norm": 1.9298328790617403, + "language_loss": 0.7561394, + "learning_rate": 3.902509795742467e-06, + "loss": 0.77831209, + "num_input_tokens_seen": 45602265, + "step": 2109, + "time_per_iteration": 2.5963573455810547 + }, + { + "auxiliary_loss_clip": 0.01111025, + "auxiliary_loss_mlp": 0.01053822, + "balance_loss_clip": 1.04636049, + "balance_loss_mlp": 1.0335331, + "epoch": 0.1268600631294153, + "flos": 17275080119040.0, + "grad_norm": 1.6171901700648081, + "language_loss": 0.82806516, + "learning_rate": 3.902389647441592e-06, + "loss": 0.84971368, + "num_input_tokens_seen": 45620595, + "step": 2110, + "time_per_iteration": 2.6745550632476807 + }, + { + "auxiliary_loss_clip": 0.01145969, + "auxiliary_loss_mlp": 0.00778071, + "balance_loss_clip": 1.05419564, + "balance_loss_mlp": 0.99996144, + "epoch": 0.12692018638208327, + "flos": 24061047390720.0, + "grad_norm": 1.6765217216011241, + "language_loss": 0.78092968, + "learning_rate": 3.90226942700191e-06, + "loss": 0.80017006, + "num_input_tokens_seen": 45641140, + "step": 2111, + "time_per_iteration": 2.65983510017395 + }, + { + "auxiliary_loss_clip": 0.01130932, + "auxiliary_loss_mlp": 0.01076547, + "balance_loss_clip": 1.05490458, + "balance_loss_mlp": 1.05352807, + "epoch": 0.12698030963475124, + "flos": 31831900652160.0, + "grad_norm": 2.15738266202174, + "language_loss": 0.77103376, + "learning_rate": 3.902149134427982e-06, + "loss": 0.79310858, + "num_input_tokens_seen": 45662315, + "step": 2112, + "time_per_iteration": 2.870299816131592 + }, + { + "auxiliary_loss_clip": 0.01129438, + "auxiliary_loss_mlp": 0.01074863, + "balance_loss_clip": 1.05213726, + "balance_loss_mlp": 1.05427516, + "epoch": 0.1270404328874192, + "flos": 25187744275200.0, + "grad_norm": 1.9191529425470424, + "language_loss": 0.85806453, + "learning_rate": 3.902028769724367e-06, + "loss": 0.88010758, + "num_input_tokens_seen": 45680335, + "step": 2113, + "time_per_iteration": 4.26338267326355 + }, + { + "auxiliary_loss_clip": 0.01137468, + "auxiliary_loss_mlp": 0.01078067, + "balance_loss_clip": 1.05511892, + "balance_loss_mlp": 1.05670488, + "epoch": 0.12710055614008717, + "flos": 15997342544640.0, + "grad_norm": 1.9721234476704599, + "language_loss": 0.74027002, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.7624253, + "num_input_tokens_seen": 45696240, + "step": 2114, + "time_per_iteration": 2.7573230266571045 + }, + { + "auxiliary_loss_clip": 0.01156713, + "auxiliary_loss_mlp": 0.01060574, + "balance_loss_clip": 1.05770111, + "balance_loss_mlp": 1.03924704, + "epoch": 0.12716067939275516, + "flos": 15085642515840.0, + "grad_norm": 1.7921743813213327, + "language_loss": 0.83240676, + "learning_rate": 3.901787823946341e-06, + "loss": 0.85457963, + "num_input_tokens_seen": 45713695, + "step": 2115, + "time_per_iteration": 4.1369829177856445 + }, + { + "auxiliary_loss_clip": 0.01154653, + "auxiliary_loss_mlp": 0.01065557, + "balance_loss_clip": 1.05875492, + "balance_loss_mlp": 1.04476702, + "epoch": 0.12722080264542313, + "flos": 28366736636160.0, + "grad_norm": 1.4840591347809418, + "language_loss": 0.87010503, + "learning_rate": 3.901667242881065e-06, + "loss": 0.89230716, + "num_input_tokens_seen": 45736655, + "step": 2116, + "time_per_iteration": 2.73896861076355 + }, + { + "auxiliary_loss_clip": 0.01139498, + "auxiliary_loss_mlp": 0.00777066, + "balance_loss_clip": 1.05413389, + "balance_loss_mlp": 0.99995339, + "epoch": 0.1272809258980911, + "flos": 32379897519360.0, + "grad_norm": 1.753205985010591, + "language_loss": 0.70374918, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.72291481, + "num_input_tokens_seen": 45758195, + "step": 2117, + "time_per_iteration": 2.783156156539917 + }, + { + "auxiliary_loss_clip": 0.01127455, + "auxiliary_loss_mlp": 0.0106424, + "balance_loss_clip": 1.04978406, + "balance_loss_mlp": 1.04068434, + "epoch": 0.12734104915075906, + "flos": 16034402401920.0, + "grad_norm": 1.9957647698478755, + "language_loss": 0.86237884, + "learning_rate": 3.901425864420852e-06, + "loss": 0.8842957, + "num_input_tokens_seen": 45774280, + "step": 2118, + "time_per_iteration": 4.322036266326904 + }, + { + "auxiliary_loss_clip": 0.01161417, + "auxiliary_loss_mlp": 0.01049008, + "balance_loss_clip": 1.05827069, + "balance_loss_mlp": 1.02951694, + "epoch": 0.12740117240342702, + "flos": 18260325244800.0, + "grad_norm": 1.705293179953873, + "language_loss": 0.87577266, + "learning_rate": 3.901305067035068e-06, + "loss": 0.89787692, + "num_input_tokens_seen": 45792760, + "step": 2119, + "time_per_iteration": 2.6559741497039795 + }, + { + "auxiliary_loss_clip": 0.01145426, + "auxiliary_loss_mlp": 0.0077754, + "balance_loss_clip": 1.05233431, + "balance_loss_mlp": 0.99984539, + "epoch": 0.127461295656095, + "flos": 12121790664960.0, + "grad_norm": 2.05013605026053, + "language_loss": 0.87824571, + "learning_rate": 3.901184197551605e-06, + "loss": 0.89747536, + "num_input_tokens_seen": 45804300, + "step": 2120, + "time_per_iteration": 2.6154048442840576 + }, + { + "auxiliary_loss_clip": 0.01170497, + "auxiliary_loss_mlp": 0.01046075, + "balance_loss_clip": 1.05822706, + "balance_loss_mlp": 1.02626204, + "epoch": 0.12752141890876295, + "flos": 23149095966720.0, + "grad_norm": 1.9784951602308867, + "language_loss": 0.75584805, + "learning_rate": 3.901063255975046e-06, + "loss": 0.77801377, + "num_input_tokens_seen": 45823780, + "step": 2121, + "time_per_iteration": 2.579265832901001 + }, + { + "auxiliary_loss_clip": 0.0111249, + "auxiliary_loss_mlp": 0.01047949, + "balance_loss_clip": 1.04741263, + "balance_loss_mlp": 1.02727842, + "epoch": 0.12758154216143094, + "flos": 21615997628160.0, + "grad_norm": 2.0293629108662405, + "language_loss": 0.82732606, + "learning_rate": 3.900942242309978e-06, + "loss": 0.84893048, + "num_input_tokens_seen": 45840495, + "step": 2122, + "time_per_iteration": 2.793870210647583 + }, + { + "auxiliary_loss_clip": 0.01151713, + "auxiliary_loss_mlp": 0.01049724, + "balance_loss_clip": 1.05901408, + "balance_loss_mlp": 1.02983987, + "epoch": 0.1276416654140989, + "flos": 15924874855680.0, + "grad_norm": 1.7660235451894624, + "language_loss": 0.78699338, + "learning_rate": 3.90082115656099e-06, + "loss": 0.80900776, + "num_input_tokens_seen": 45857735, + "step": 2123, + "time_per_iteration": 2.70546293258667 + }, + { + "auxiliary_loss_clip": 0.01172823, + "auxiliary_loss_mlp": 0.01055328, + "balance_loss_clip": 1.05931985, + "balance_loss_mlp": 1.03478789, + "epoch": 0.12770178866676687, + "flos": 22382690451840.0, + "grad_norm": 1.5643885422181942, + "language_loss": 0.78931451, + "learning_rate": 3.900699998732673e-06, + "loss": 0.81159604, + "num_input_tokens_seen": 45876485, + "step": 2124, + "time_per_iteration": 2.661712408065796 + }, + { + "auxiliary_loss_clip": 0.01160474, + "auxiliary_loss_mlp": 0.00776885, + "balance_loss_clip": 1.05457389, + "balance_loss_mlp": 0.99987447, + "epoch": 0.12776191191943484, + "flos": 21652482867840.0, + "grad_norm": 1.9695028631977674, + "language_loss": 0.75605726, + "learning_rate": 3.900578768829623e-06, + "loss": 0.7754308, + "num_input_tokens_seen": 45894645, + "step": 2125, + "time_per_iteration": 2.696021556854248 + }, + { + "auxiliary_loss_clip": 0.01158163, + "auxiliary_loss_mlp": 0.00777059, + "balance_loss_clip": 1.05398965, + "balance_loss_mlp": 1.00002348, + "epoch": 0.1278220351721028, + "flos": 25735561574400.0, + "grad_norm": 3.019802885219414, + "language_loss": 0.78016824, + "learning_rate": 3.900457466856434e-06, + "loss": 0.79952049, + "num_input_tokens_seen": 45913755, + "step": 2126, + "time_per_iteration": 2.721435308456421 + }, + { + "auxiliary_loss_clip": 0.01124637, + "auxiliary_loss_mlp": 0.010537, + "balance_loss_clip": 1.05406642, + "balance_loss_mlp": 1.03504348, + "epoch": 0.12788215842477077, + "flos": 41243224982400.0, + "grad_norm": 1.3825945270792501, + "language_loss": 0.6927852, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.71456861, + "num_input_tokens_seen": 45936095, + "step": 2127, + "time_per_iteration": 2.902101993560791 + }, + { + "auxiliary_loss_clip": 0.01030231, + "auxiliary_loss_mlp": 0.00759051, + "balance_loss_clip": 1.02830005, + "balance_loss_mlp": 1.00050259, + "epoch": 0.12794228167743876, + "flos": 70877430881280.0, + "grad_norm": 0.853491438999862, + "language_loss": 0.62831402, + "learning_rate": 3.900214646718047e-06, + "loss": 0.64620686, + "num_input_tokens_seen": 46004655, + "step": 2128, + "time_per_iteration": 3.3387396335601807 + }, + { + "auxiliary_loss_clip": 0.01145823, + "auxiliary_loss_mlp": 0.01047815, + "balance_loss_clip": 1.05080712, + "balance_loss_mlp": 1.02599955, + "epoch": 0.12800240493010673, + "flos": 16289727252480.0, + "grad_norm": 2.066959353069841, + "language_loss": 0.77626479, + "learning_rate": 3.900093128562056e-06, + "loss": 0.7982012, + "num_input_tokens_seen": 46023610, + "step": 2129, + "time_per_iteration": 2.611309766769409 + }, + { + "auxiliary_loss_clip": 0.01122914, + "auxiliary_loss_mlp": 0.01052577, + "balance_loss_clip": 1.05058527, + "balance_loss_mlp": 1.03029668, + "epoch": 0.1280625281827747, + "flos": 20631542601600.0, + "grad_norm": 2.1214737401843893, + "language_loss": 0.79263359, + "learning_rate": 3.899971538354343e-06, + "loss": 0.81438851, + "num_input_tokens_seen": 46041725, + "step": 2130, + "time_per_iteration": 2.753243923187256 + }, + { + "auxiliary_loss_clip": 0.01139626, + "auxiliary_loss_mlp": 0.01052453, + "balance_loss_clip": 1.05133748, + "balance_loss_mlp": 1.03147244, + "epoch": 0.12812265143544266, + "flos": 22638230784000.0, + "grad_norm": 1.7780274650921335, + "language_loss": 0.70945668, + "learning_rate": 3.899849876099518e-06, + "loss": 0.73137754, + "num_input_tokens_seen": 46061095, + "step": 2131, + "time_per_iteration": 2.6809306144714355 + }, + { + "auxiliary_loss_clip": 0.01102824, + "auxiliary_loss_mlp": 0.01052393, + "balance_loss_clip": 1.04982638, + "balance_loss_mlp": 1.03163886, + "epoch": 0.12818277468811062, + "flos": 34714701463680.0, + "grad_norm": 2.2916674504462655, + "language_loss": 0.72298968, + "learning_rate": 3.899728141802197e-06, + "loss": 0.74454176, + "num_input_tokens_seen": 46082670, + "step": 2132, + "time_per_iteration": 2.8769233226776123 + }, + { + "auxiliary_loss_clip": 0.01102594, + "auxiliary_loss_mlp": 0.01055993, + "balance_loss_clip": 1.04384947, + "balance_loss_mlp": 1.03348672, + "epoch": 0.1282428979407786, + "flos": 23112107936640.0, + "grad_norm": 2.0316054281953155, + "language_loss": 0.82128644, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.84287226, + "num_input_tokens_seen": 46102410, + "step": 2133, + "time_per_iteration": 2.766897678375244 + }, + { + "auxiliary_loss_clip": 0.01163396, + "auxiliary_loss_mlp": 0.01057069, + "balance_loss_clip": 1.05397773, + "balance_loss_mlp": 1.03458595, + "epoch": 0.12830302119344655, + "flos": 20886508316160.0, + "grad_norm": 3.232115826630309, + "language_loss": 0.80001891, + "learning_rate": 3.899484457098528e-06, + "loss": 0.82222354, + "num_input_tokens_seen": 46121145, + "step": 2134, + "time_per_iteration": 2.6347672939300537 + }, + { + "auxiliary_loss_clip": 0.01159056, + "auxiliary_loss_mlp": 0.01046209, + "balance_loss_clip": 1.05907345, + "balance_loss_mlp": 1.02614641, + "epoch": 0.12836314444611455, + "flos": 21397768548480.0, + "grad_norm": 1.731952504909339, + "language_loss": 0.82657921, + "learning_rate": 3.899362506701421e-06, + "loss": 0.84863198, + "num_input_tokens_seen": 46140740, + "step": 2135, + "time_per_iteration": 2.6393656730651855 + }, + { + "auxiliary_loss_clip": 0.0114208, + "auxiliary_loss_mlp": 0.0105553, + "balance_loss_clip": 1.05345035, + "balance_loss_mlp": 1.03411996, + "epoch": 0.1284232676987825, + "flos": 13662466773120.0, + "grad_norm": 2.1083924470752278, + "language_loss": 0.7764526, + "learning_rate": 3.899240484280298e-06, + "loss": 0.79842871, + "num_input_tokens_seen": 46156805, + "step": 2136, + "time_per_iteration": 2.7195920944213867 + }, + { + "auxiliary_loss_clip": 0.01020946, + "auxiliary_loss_mlp": 0.01003991, + "balance_loss_clip": 1.01967573, + "balance_loss_mlp": 1.00096273, + "epoch": 0.12848339095145048, + "flos": 59994737735040.0, + "grad_norm": 0.8964253308146478, + "language_loss": 0.59152198, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61177135, + "num_input_tokens_seen": 46222085, + "step": 2137, + "time_per_iteration": 3.416015625 + }, + { + "auxiliary_loss_clip": 0.01153694, + "auxiliary_loss_mlp": 0.01054623, + "balance_loss_clip": 1.05178177, + "balance_loss_mlp": 1.03483438, + "epoch": 0.12854351420411844, + "flos": 13881378211200.0, + "grad_norm": 3.244493357011547, + "language_loss": 0.82344306, + "learning_rate": 3.898996223384512e-06, + "loss": 0.84552622, + "num_input_tokens_seen": 46239970, + "step": 2138, + "time_per_iteration": 2.65515398979187 + }, + { + "auxiliary_loss_clip": 0.01159586, + "auxiliary_loss_mlp": 0.01049293, + "balance_loss_clip": 1.05592752, + "balance_loss_mlp": 1.02665496, + "epoch": 0.1286036374567864, + "flos": 22637943475200.0, + "grad_norm": 2.5417837252920323, + "language_loss": 0.78691363, + "learning_rate": 3.898873984919113e-06, + "loss": 0.8090024, + "num_input_tokens_seen": 46257740, + "step": 2139, + "time_per_iteration": 2.651132345199585 + }, + { + "auxiliary_loss_clip": 0.01136892, + "auxiliary_loss_mlp": 0.01045928, + "balance_loss_clip": 1.05267286, + "balance_loss_mlp": 1.02582908, + "epoch": 0.12866376070945437, + "flos": 16324775948160.0, + "grad_norm": 1.9541049485452633, + "language_loss": 0.85289955, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.87472773, + "num_input_tokens_seen": 46275445, + "step": 2140, + "time_per_iteration": 2.730156183242798 + }, + { + "auxiliary_loss_clip": 0.01143134, + "auxiliary_loss_mlp": 0.01044337, + "balance_loss_clip": 1.05203128, + "balance_loss_mlp": 1.02482224, + "epoch": 0.12872388396212234, + "flos": 11874546374400.0, + "grad_norm": 1.8185491602156885, + "language_loss": 0.86268306, + "learning_rate": 3.898629291976476e-06, + "loss": 0.88455778, + "num_input_tokens_seen": 46291710, + "step": 2141, + "time_per_iteration": 2.62223482131958 + }, + { + "auxiliary_loss_clip": 0.01146971, + "auxiliary_loss_mlp": 0.01045813, + "balance_loss_clip": 1.0528295, + "balance_loss_mlp": 1.02548814, + "epoch": 0.12878400721479033, + "flos": 28366700722560.0, + "grad_norm": 3.1267362471736684, + "language_loss": 0.68282312, + "learning_rate": 3.898506837508518e-06, + "loss": 0.70475101, + "num_input_tokens_seen": 46311335, + "step": 2142, + "time_per_iteration": 2.71232271194458 + }, + { + "auxiliary_loss_clip": 0.01165678, + "auxiliary_loss_mlp": 0.0077895, + "balance_loss_clip": 1.05764627, + "balance_loss_mlp": 0.99990749, + "epoch": 0.1288441304674583, + "flos": 25885632597120.0, + "grad_norm": 2.373838274123079, + "language_loss": 0.83479214, + "learning_rate": 3.89838431104899e-06, + "loss": 0.85423845, + "num_input_tokens_seen": 46330985, + "step": 2143, + "time_per_iteration": 2.677692174911499 + }, + { + "auxiliary_loss_clip": 0.01175134, + "auxiliary_loss_mlp": 0.00777405, + "balance_loss_clip": 1.0598439, + "balance_loss_mlp": 0.99994075, + "epoch": 0.12890425372012626, + "flos": 20813789232000.0, + "grad_norm": 1.5662270309624111, + "language_loss": 0.81703234, + "learning_rate": 3.898261712602539e-06, + "loss": 0.83655775, + "num_input_tokens_seen": 46351295, + "step": 2144, + "time_per_iteration": 2.712620496749878 + }, + { + "auxiliary_loss_clip": 0.01130321, + "auxiliary_loss_mlp": 0.01053521, + "balance_loss_clip": 1.04658103, + "balance_loss_mlp": 1.03145528, + "epoch": 0.12896437697279423, + "flos": 22565870835840.0, + "grad_norm": 1.8026346290528672, + "language_loss": 0.78304374, + "learning_rate": 3.898139042173813e-06, + "loss": 0.80488217, + "num_input_tokens_seen": 46368600, + "step": 2145, + "time_per_iteration": 2.6766605377197266 + }, + { + "auxiliary_loss_clip": 0.01170585, + "auxiliary_loss_mlp": 0.01047893, + "balance_loss_clip": 1.0543592, + "balance_loss_mlp": 1.02662635, + "epoch": 0.1290245002254622, + "flos": 17493776075520.0, + "grad_norm": 2.147087506474235, + "language_loss": 0.82865375, + "learning_rate": 3.898016299767465e-06, + "loss": 0.85083848, + "num_input_tokens_seen": 46387370, + "step": 2146, + "time_per_iteration": 2.5860395431518555 + }, + { + "auxiliary_loss_clip": 0.01141916, + "auxiliary_loss_mlp": 0.0105138, + "balance_loss_clip": 1.05367482, + "balance_loss_mlp": 1.03062606, + "epoch": 0.12908462347813016, + "flos": 36315957859200.0, + "grad_norm": 2.344626501147968, + "language_loss": 0.71275079, + "learning_rate": 3.897893485388149e-06, + "loss": 0.73468375, + "num_input_tokens_seen": 46409570, + "step": 2147, + "time_per_iteration": 2.7870359420776367 + }, + { + "auxiliary_loss_clip": 0.01147238, + "auxiliary_loss_mlp": 0.01052291, + "balance_loss_clip": 1.05527067, + "balance_loss_mlp": 1.03297925, + "epoch": 0.12914474673079815, + "flos": 22528703237760.0, + "grad_norm": 2.120275205230366, + "language_loss": 0.71432978, + "learning_rate": 3.897770599040521e-06, + "loss": 0.73632509, + "num_input_tokens_seen": 46429320, + "step": 2148, + "time_per_iteration": 2.6865081787109375 + }, + { + "auxiliary_loss_clip": 0.01168479, + "auxiliary_loss_mlp": 0.01049575, + "balance_loss_clip": 1.05762172, + "balance_loss_mlp": 1.03016782, + "epoch": 0.12920486998346611, + "flos": 21471888263040.0, + "grad_norm": 1.6388902851592406, + "language_loss": 0.79064089, + "learning_rate": 3.897647640729242e-06, + "loss": 0.81282145, + "num_input_tokens_seen": 46450155, + "step": 2149, + "time_per_iteration": 2.6041862964630127 + }, + { + "auxiliary_loss_clip": 0.01159527, + "auxiliary_loss_mlp": 0.01046069, + "balance_loss_clip": 1.05377793, + "balance_loss_mlp": 1.02531469, + "epoch": 0.12926499323613408, + "flos": 27308556944640.0, + "grad_norm": 2.034796374339078, + "language_loss": 0.75976646, + "learning_rate": 3.897524610458975e-06, + "loss": 0.78182244, + "num_input_tokens_seen": 46470280, + "step": 2150, + "time_per_iteration": 2.647224187850952 + }, + { + "auxiliary_loss_clip": 0.01155787, + "auxiliary_loss_mlp": 0.01055192, + "balance_loss_clip": 1.05445433, + "balance_loss_mlp": 1.03491461, + "epoch": 0.12932511648880204, + "flos": 22091131756800.0, + "grad_norm": 2.3830500835005592, + "language_loss": 0.70986372, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.73197353, + "num_input_tokens_seen": 46487605, + "step": 2151, + "time_per_iteration": 2.7008492946624756 + }, + { + "auxiliary_loss_clip": 0.01167835, + "auxiliary_loss_mlp": 0.0104951, + "balance_loss_clip": 1.05603719, + "balance_loss_mlp": 1.03017378, + "epoch": 0.12938523974147, + "flos": 20302780394880.0, + "grad_norm": 2.058334480733051, + "language_loss": 0.83964819, + "learning_rate": 3.897278334060137e-06, + "loss": 0.86182165, + "num_input_tokens_seen": 46505100, + "step": 2152, + "time_per_iteration": 2.6467373371124268 + }, + { + "auxiliary_loss_clip": 0.01158553, + "auxiliary_loss_mlp": 0.01058416, + "balance_loss_clip": 1.05283821, + "balance_loss_mlp": 1.03888893, + "epoch": 0.12944536299413797, + "flos": 19499961467520.0, + "grad_norm": 1.5624811365269535, + "language_loss": 0.78585124, + "learning_rate": 3.897155087940906e-06, + "loss": 0.80802095, + "num_input_tokens_seen": 46524020, + "step": 2153, + "time_per_iteration": 4.286921262741089 + }, + { + "auxiliary_loss_clip": 0.01113716, + "auxiliary_loss_mlp": 0.00777812, + "balance_loss_clip": 1.04707122, + "balance_loss_mlp": 0.99989671, + "epoch": 0.12950548624680594, + "flos": 27707919333120.0, + "grad_norm": 1.6189787343362376, + "language_loss": 0.80253434, + "learning_rate": 3.897031769881364e-06, + "loss": 0.82144964, + "num_input_tokens_seen": 46544640, + "step": 2154, + "time_per_iteration": 2.7602338790893555 + }, + { + "auxiliary_loss_clip": 0.01149958, + "auxiliary_loss_mlp": 0.0105188, + "balance_loss_clip": 1.05262971, + "balance_loss_mlp": 1.03099442, + "epoch": 0.12956560949947393, + "flos": 17565740974080.0, + "grad_norm": 1.8080432584650143, + "language_loss": 0.83717728, + "learning_rate": 3.896908379886188e-06, + "loss": 0.85919571, + "num_input_tokens_seen": 46561395, + "step": 2155, + "time_per_iteration": 5.696707010269165 + }, + { + "auxiliary_loss_clip": 0.01161999, + "auxiliary_loss_mlp": 0.01056273, + "balance_loss_clip": 1.05426383, + "balance_loss_mlp": 1.03611445, + "epoch": 0.1296257327521419, + "flos": 20740711011840.0, + "grad_norm": 2.4972858828122666, + "language_loss": 0.76114857, + "learning_rate": 3.896784917960055e-06, + "loss": 0.78333133, + "num_input_tokens_seen": 46579395, + "step": 2156, + "time_per_iteration": 2.6279313564300537 + }, + { + "auxiliary_loss_clip": 0.01105089, + "auxiliary_loss_mlp": 0.01056603, + "balance_loss_clip": 1.0510118, + "balance_loss_mlp": 1.03679013, + "epoch": 0.12968585600480986, + "flos": 16395735265920.0, + "grad_norm": 1.6652476704410177, + "language_loss": 0.86493659, + "learning_rate": 3.896661384107648e-06, + "loss": 0.88655347, + "num_input_tokens_seen": 46597090, + "step": 2157, + "time_per_iteration": 4.4089202880859375 + }, + { + "auxiliary_loss_clip": 0.01170107, + "auxiliary_loss_mlp": 0.01055814, + "balance_loss_clip": 1.05253935, + "balance_loss_mlp": 1.0349642, + "epoch": 0.12974597925747783, + "flos": 28329533124480.0, + "grad_norm": 2.5240136552338956, + "language_loss": 0.80393612, + "learning_rate": 3.896537778333651e-06, + "loss": 0.8261953, + "num_input_tokens_seen": 46617355, + "step": 2158, + "time_per_iteration": 2.702765703201294 + }, + { + "auxiliary_loss_clip": 0.01177017, + "auxiliary_loss_mlp": 0.01060365, + "balance_loss_clip": 1.05905974, + "balance_loss_mlp": 1.04050517, + "epoch": 0.1298061025101458, + "flos": 9683025782400.0, + "grad_norm": 2.5307604694159607, + "language_loss": 0.74881256, + "learning_rate": 3.896414100642752e-06, + "loss": 0.77118635, + "num_input_tokens_seen": 46633130, + "step": 2159, + "time_per_iteration": 2.534163475036621 + }, + { + "auxiliary_loss_clip": 0.01122909, + "auxiliary_loss_mlp": 0.01058309, + "balance_loss_clip": 1.04594469, + "balance_loss_mlp": 1.03471708, + "epoch": 0.12986622576281376, + "flos": 27709535445120.0, + "grad_norm": 1.954419432637739, + "language_loss": 0.8259204, + "learning_rate": 3.89629035103964e-06, + "loss": 0.84773254, + "num_input_tokens_seen": 46650575, + "step": 2160, + "time_per_iteration": 2.7358646392822266 + }, + { + "auxiliary_loss_clip": 0.01154348, + "auxiliary_loss_mlp": 0.01047243, + "balance_loss_clip": 1.05873609, + "balance_loss_mlp": 1.02732301, + "epoch": 0.12992634901548175, + "flos": 18802719590400.0, + "grad_norm": 1.7252123805741888, + "language_loss": 0.82310414, + "learning_rate": 3.896166529529008e-06, + "loss": 0.84512007, + "num_input_tokens_seen": 46668780, + "step": 2161, + "time_per_iteration": 2.7029623985290527 + }, + { + "auxiliary_loss_clip": 0.01145886, + "auxiliary_loss_mlp": 0.01060381, + "balance_loss_clip": 1.05145073, + "balance_loss_mlp": 1.03911448, + "epoch": 0.12998647226814972, + "flos": 29127575543040.0, + "grad_norm": 2.0780374068601253, + "language_loss": 0.82668459, + "learning_rate": 3.896042636115551e-06, + "loss": 0.84874725, + "num_input_tokens_seen": 46687550, + "step": 2162, + "time_per_iteration": 2.674825668334961 + }, + { + "auxiliary_loss_clip": 0.0113921, + "auxiliary_loss_mlp": 0.0105953, + "balance_loss_clip": 1.05468941, + "balance_loss_mlp": 1.03957474, + "epoch": 0.13004659552081768, + "flos": 19573686132480.0, + "grad_norm": 3.928222506771022, + "language_loss": 0.72579277, + "learning_rate": 3.895918670803968e-06, + "loss": 0.7477802, + "num_input_tokens_seen": 46706730, + "step": 2163, + "time_per_iteration": 2.678394079208374 + }, + { + "auxiliary_loss_clip": 0.01173873, + "auxiliary_loss_mlp": 0.00778662, + "balance_loss_clip": 1.05635965, + "balance_loss_mlp": 0.99994016, + "epoch": 0.13010671877348565, + "flos": 22490709626880.0, + "grad_norm": 2.0196348424542827, + "language_loss": 0.81330699, + "learning_rate": 3.895794633598958e-06, + "loss": 0.83283234, + "num_input_tokens_seen": 46724250, + "step": 2164, + "time_per_iteration": 2.6116931438446045 + }, + { + "auxiliary_loss_clip": 0.01119834, + "auxiliary_loss_mlp": 0.01050661, + "balance_loss_clip": 1.04808033, + "balance_loss_mlp": 1.03061032, + "epoch": 0.1301668420261536, + "flos": 23878226142720.0, + "grad_norm": 2.274563635903502, + "language_loss": 0.72262049, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.74432552, + "num_input_tokens_seen": 46744105, + "step": 2165, + "time_per_iteration": 2.7646515369415283 + }, + { + "auxiliary_loss_clip": 0.01109832, + "auxiliary_loss_mlp": 0.01048351, + "balance_loss_clip": 1.05059505, + "balance_loss_mlp": 1.02707219, + "epoch": 0.13022696527882158, + "flos": 23150065633920.0, + "grad_norm": 2.8383873988269217, + "language_loss": 0.74749964, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.76908153, + "num_input_tokens_seen": 46764250, + "step": 2166, + "time_per_iteration": 2.7939398288726807 + }, + { + "auxiliary_loss_clip": 0.01170298, + "auxiliary_loss_mlp": 0.01048037, + "balance_loss_clip": 1.05364752, + "balance_loss_mlp": 1.02827251, + "epoch": 0.13028708853148954, + "flos": 26908548111360.0, + "grad_norm": 1.5379857106114436, + "language_loss": 0.83098066, + "learning_rate": 3.895422090670421e-06, + "loss": 0.85316396, + "num_input_tokens_seen": 46786865, + "step": 2167, + "time_per_iteration": 2.700505495071411 + }, + { + "auxiliary_loss_clip": 0.01108628, + "auxiliary_loss_mlp": 0.01059921, + "balance_loss_clip": 1.04567361, + "balance_loss_mlp": 1.03841531, + "epoch": 0.13034721178415754, + "flos": 21251468453760.0, + "grad_norm": 1.6054044551173634, + "language_loss": 0.83578718, + "learning_rate": 3.89529776593877e-06, + "loss": 0.85747266, + "num_input_tokens_seen": 46807030, + "step": 2168, + "time_per_iteration": 2.839285135269165 + }, + { + "auxiliary_loss_clip": 0.01079188, + "auxiliary_loss_mlp": 0.01063413, + "balance_loss_clip": 1.04247975, + "balance_loss_mlp": 1.03861713, + "epoch": 0.1304073350368255, + "flos": 18767239931520.0, + "grad_norm": 1.950315007602454, + "language_loss": 0.79910588, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.8205319, + "num_input_tokens_seen": 46826280, + "step": 2169, + "time_per_iteration": 2.8150076866149902 + }, + { + "auxiliary_loss_clip": 0.01174566, + "auxiliary_loss_mlp": 0.01044893, + "balance_loss_clip": 1.05822575, + "balance_loss_mlp": 1.02339983, + "epoch": 0.13046745828949347, + "flos": 28364653647360.0, + "grad_norm": 2.4117618540057766, + "language_loss": 0.66804767, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.69024229, + "num_input_tokens_seen": 46846505, + "step": 2170, + "time_per_iteration": 2.722769021987915 + }, + { + "auxiliary_loss_clip": 0.0114216, + "auxiliary_loss_mlp": 0.01046684, + "balance_loss_clip": 1.05424142, + "balance_loss_mlp": 1.02637053, + "epoch": 0.13052758154216143, + "flos": 29605044055680.0, + "grad_norm": 1.9089846415842238, + "language_loss": 0.66768706, + "learning_rate": 3.8949243605434e-06, + "loss": 0.68957549, + "num_input_tokens_seen": 46867380, + "step": 2171, + "time_per_iteration": 2.7474682331085205 + }, + { + "auxiliary_loss_clip": 0.01157431, + "auxiliary_loss_mlp": 0.01049079, + "balance_loss_clip": 1.05283058, + "balance_loss_mlp": 1.02701378, + "epoch": 0.1305877047948294, + "flos": 19390864884480.0, + "grad_norm": 2.103440896006443, + "language_loss": 0.72157478, + "learning_rate": 3.894799748360537e-06, + "loss": 0.74363995, + "num_input_tokens_seen": 46886810, + "step": 2172, + "time_per_iteration": 2.8062691688537598 + }, + { + "auxiliary_loss_clip": 0.01131178, + "auxiliary_loss_mlp": 0.01045812, + "balance_loss_clip": 1.05676126, + "balance_loss_mlp": 1.0248909, + "epoch": 0.13064782804749736, + "flos": 16873527000960.0, + "grad_norm": 1.8662964619330822, + "language_loss": 0.75331408, + "learning_rate": 3.894675064326678e-06, + "loss": 0.77508402, + "num_input_tokens_seen": 46905620, + "step": 2173, + "time_per_iteration": 2.749630928039551 + }, + { + "auxiliary_loss_clip": 0.01132129, + "auxiliary_loss_mlp": 0.01056024, + "balance_loss_clip": 1.05241716, + "balance_loss_mlp": 1.03388715, + "epoch": 0.13070795130016533, + "flos": 24499085748480.0, + "grad_norm": 2.8034072456055426, + "language_loss": 0.70175481, + "learning_rate": 3.894550308446551e-06, + "loss": 0.72363639, + "num_input_tokens_seen": 46925120, + "step": 2174, + "time_per_iteration": 2.723314046859741 + }, + { + "auxiliary_loss_clip": 0.01047643, + "auxiliary_loss_mlp": 0.01015006, + "balance_loss_clip": 1.02629197, + "balance_loss_mlp": 1.01260972, + "epoch": 0.13076807455283332, + "flos": 71054505953280.0, + "grad_norm": 0.7998489021914615, + "language_loss": 0.59026134, + "learning_rate": 3.894425480724886e-06, + "loss": 0.61088777, + "num_input_tokens_seen": 46988195, + "step": 2175, + "time_per_iteration": 3.318049192428589 + }, + { + "auxiliary_loss_clip": 0.01159762, + "auxiliary_loss_mlp": 0.01053929, + "balance_loss_clip": 1.05441868, + "balance_loss_mlp": 1.03342521, + "epoch": 0.13082819780550128, + "flos": 20264499475200.0, + "grad_norm": 2.2309284705459707, + "language_loss": 0.80365628, + "learning_rate": 3.894300581166417e-06, + "loss": 0.82579315, + "num_input_tokens_seen": 47004720, + "step": 2176, + "time_per_iteration": 2.631732702255249 + }, + { + "auxiliary_loss_clip": 0.01169648, + "auxiliary_loss_mlp": 0.01047517, + "balance_loss_clip": 1.05513525, + "balance_loss_mlp": 1.02529645, + "epoch": 0.13088832105816925, + "flos": 34203441231360.0, + "grad_norm": 1.6906214681317566, + "language_loss": 0.74661696, + "learning_rate": 3.894175609775881e-06, + "loss": 0.76878858, + "num_input_tokens_seen": 47024255, + "step": 2177, + "time_per_iteration": 2.701422691345215 + }, + { + "auxiliary_loss_clip": 0.01131124, + "auxiliary_loss_mlp": 0.0105144, + "balance_loss_clip": 1.051373, + "balance_loss_mlp": 1.02905297, + "epoch": 0.13094844431083721, + "flos": 17894970057600.0, + "grad_norm": 1.8043513019060269, + "language_loss": 0.82266748, + "learning_rate": 3.894050566558015e-06, + "loss": 0.84449303, + "num_input_tokens_seen": 47042465, + "step": 2178, + "time_per_iteration": 2.6934497356414795 + }, + { + "auxiliary_loss_clip": 0.01170524, + "auxiliary_loss_mlp": 0.01047895, + "balance_loss_clip": 1.05729508, + "balance_loss_mlp": 1.02705729, + "epoch": 0.13100856756350518, + "flos": 17311313963520.0, + "grad_norm": 2.9251611149508276, + "language_loss": 0.74291968, + "learning_rate": 3.893925451517562e-06, + "loss": 0.76510382, + "num_input_tokens_seen": 47060370, + "step": 2179, + "time_per_iteration": 2.6111502647399902 + }, + { + "auxiliary_loss_clip": 0.01128297, + "auxiliary_loss_mlp": 0.01052407, + "balance_loss_clip": 1.04917574, + "balance_loss_mlp": 1.03184354, + "epoch": 0.13106869081617314, + "flos": 22200551562240.0, + "grad_norm": 1.9805514150688242, + "language_loss": 0.84366202, + "learning_rate": 3.893800264659266e-06, + "loss": 0.8654691, + "num_input_tokens_seen": 47081415, + "step": 2180, + "time_per_iteration": 2.731229543685913 + }, + { + "auxiliary_loss_clip": 0.01162028, + "auxiliary_loss_mlp": 0.0105845, + "balance_loss_clip": 1.05875921, + "balance_loss_mlp": 1.03757644, + "epoch": 0.13112881406884114, + "flos": 21763123735680.0, + "grad_norm": 1.8389866248015785, + "language_loss": 0.89840436, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.92060918, + "num_input_tokens_seen": 47099860, + "step": 2181, + "time_per_iteration": 2.643890380859375 + }, + { + "auxiliary_loss_clip": 0.01153771, + "auxiliary_loss_mlp": 0.01051982, + "balance_loss_clip": 1.05222976, + "balance_loss_mlp": 1.03126323, + "epoch": 0.1311889373215091, + "flos": 23331091201920.0, + "grad_norm": 2.117586475019142, + "language_loss": 0.68813586, + "learning_rate": 3.893549675508137e-06, + "loss": 0.7101934, + "num_input_tokens_seen": 47118540, + "step": 2182, + "time_per_iteration": 2.6198863983154297 + }, + { + "auxiliary_loss_clip": 0.01123039, + "auxiliary_loss_mlp": 0.01051411, + "balance_loss_clip": 1.0502702, + "balance_loss_mlp": 1.0292381, + "epoch": 0.13124906057417707, + "flos": 21467363149440.0, + "grad_norm": 1.787500136217105, + "language_loss": 0.78694725, + "learning_rate": 3.893424273224806e-06, + "loss": 0.8086918, + "num_input_tokens_seen": 47136710, + "step": 2183, + "time_per_iteration": 2.715517520904541 + }, + { + "auxiliary_loss_clip": 0.01169106, + "auxiliary_loss_mlp": 0.01047098, + "balance_loss_clip": 1.05452895, + "balance_loss_mlp": 1.02586675, + "epoch": 0.13130918382684503, + "flos": 23255319461760.0, + "grad_norm": 26.753588494231124, + "language_loss": 0.85792655, + "learning_rate": 3.893298799142636e-06, + "loss": 0.88008863, + "num_input_tokens_seen": 47157155, + "step": 2184, + "time_per_iteration": 2.632539987564087 + }, + { + "auxiliary_loss_clip": 0.01138714, + "auxiliary_loss_mlp": 0.01054657, + "balance_loss_clip": 1.05349112, + "balance_loss_mlp": 1.03230524, + "epoch": 0.131369307079513, + "flos": 20850274471680.0, + "grad_norm": 2.50466124454056, + "language_loss": 0.82703435, + "learning_rate": 3.893173253266387e-06, + "loss": 0.84896809, + "num_input_tokens_seen": 47176820, + "step": 2185, + "time_per_iteration": 2.6809136867523193 + }, + { + "auxiliary_loss_clip": 0.01144077, + "auxiliary_loss_mlp": 0.01054121, + "balance_loss_clip": 1.05262399, + "balance_loss_mlp": 1.03236496, + "epoch": 0.13142943033218096, + "flos": 17858341163520.0, + "grad_norm": 1.8949462712827352, + "language_loss": 0.72956109, + "learning_rate": 3.893047635600818e-06, + "loss": 0.75154305, + "num_input_tokens_seen": 47195855, + "step": 2186, + "time_per_iteration": 2.628096342086792 + }, + { + "auxiliary_loss_clip": 0.01157778, + "auxiliary_loss_mlp": 0.01050695, + "balance_loss_clip": 1.05436552, + "balance_loss_mlp": 1.02783096, + "epoch": 0.13148955358484893, + "flos": 20996035862400.0, + "grad_norm": 1.9822444068613732, + "language_loss": 0.80363685, + "learning_rate": 3.892921946150693e-06, + "loss": 0.82572162, + "num_input_tokens_seen": 47214535, + "step": 2187, + "time_per_iteration": 2.762223720550537 + }, + { + "auxiliary_loss_clip": 0.01027324, + "auxiliary_loss_mlp": 0.0101023, + "balance_loss_clip": 1.02364707, + "balance_loss_mlp": 1.00792885, + "epoch": 0.13154967683751692, + "flos": 70172467580160.0, + "grad_norm": 0.8471850380496847, + "language_loss": 0.59082437, + "learning_rate": 3.892796184920778e-06, + "loss": 0.61119986, + "num_input_tokens_seen": 47270300, + "step": 2188, + "time_per_iteration": 3.302457571029663 + }, + { + "auxiliary_loss_clip": 0.01095126, + "auxiliary_loss_mlp": 0.01059346, + "balance_loss_clip": 1.04827487, + "balance_loss_mlp": 1.03676724, + "epoch": 0.1316098000901849, + "flos": 20376145923840.0, + "grad_norm": 1.7340345041340466, + "language_loss": 0.74211109, + "learning_rate": 3.892670351915842e-06, + "loss": 0.76365584, + "num_input_tokens_seen": 47290720, + "step": 2189, + "time_per_iteration": 2.7990496158599854 + }, + { + "auxiliary_loss_clip": 0.01160124, + "auxiliary_loss_mlp": 0.01049098, + "balance_loss_clip": 1.05551052, + "balance_loss_mlp": 1.02799821, + "epoch": 0.13166992334285285, + "flos": 23221132692480.0, + "grad_norm": 1.8160574809616576, + "language_loss": 0.73152113, + "learning_rate": 3.892544447140657e-06, + "loss": 0.75361335, + "num_input_tokens_seen": 47311820, + "step": 2190, + "time_per_iteration": 2.6485326290130615 + }, + { + "auxiliary_loss_clip": 0.01160351, + "auxiliary_loss_mlp": 0.01058461, + "balance_loss_clip": 1.05671644, + "balance_loss_mlp": 1.03811169, + "epoch": 0.13173004659552082, + "flos": 23330947547520.0, + "grad_norm": 1.8825588242208007, + "language_loss": 0.74617779, + "learning_rate": 3.892418470599996e-06, + "loss": 0.76836598, + "num_input_tokens_seen": 47331605, + "step": 2191, + "time_per_iteration": 2.644484281539917 + }, + { + "auxiliary_loss_clip": 0.0112783, + "auxiliary_loss_mlp": 0.01054712, + "balance_loss_clip": 1.05129039, + "balance_loss_mlp": 1.03356445, + "epoch": 0.13179016984818878, + "flos": 21251504367360.0, + "grad_norm": 1.8823393822145031, + "language_loss": 0.79093283, + "learning_rate": 3.892292422298637e-06, + "loss": 0.81275827, + "num_input_tokens_seen": 47350455, + "step": 2192, + "time_per_iteration": 2.735225200653076 + }, + { + "auxiliary_loss_clip": 0.0111282, + "auxiliary_loss_mlp": 0.01051113, + "balance_loss_clip": 1.04457211, + "balance_loss_mlp": 1.02936912, + "epoch": 0.13185029310085675, + "flos": 17778690754560.0, + "grad_norm": 1.7242105632860862, + "language_loss": 0.85350716, + "learning_rate": 3.892166302241361e-06, + "loss": 0.87514639, + "num_input_tokens_seen": 47368225, + "step": 2193, + "time_per_iteration": 4.262877941131592 + }, + { + "auxiliary_loss_clip": 0.0104173, + "auxiliary_loss_mlp": 0.01015651, + "balance_loss_clip": 1.02609122, + "balance_loss_mlp": 1.01280212, + "epoch": 0.1319104163535247, + "flos": 69851785933440.0, + "grad_norm": 0.7746813180799224, + "language_loss": 0.54112649, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.56170022, + "num_input_tokens_seen": 47427125, + "step": 2194, + "time_per_iteration": 6.223008394241333 + }, + { + "auxiliary_loss_clip": 0.01168022, + "auxiliary_loss_mlp": 0.01048581, + "balance_loss_clip": 1.05420566, + "balance_loss_mlp": 1.02828002, + "epoch": 0.1319705396061927, + "flos": 25193095401600.0, + "grad_norm": 2.1079865649821925, + "language_loss": 0.72433972, + "learning_rate": 3.891913846878185e-06, + "loss": 0.74650574, + "num_input_tokens_seen": 47450275, + "step": 2195, + "time_per_iteration": 2.6357345581054688 + }, + { + "auxiliary_loss_clip": 0.01136503, + "auxiliary_loss_mlp": 0.00778731, + "balance_loss_clip": 1.05176425, + "balance_loss_mlp": 0.99996454, + "epoch": 0.13203066285886067, + "flos": 20740459616640.0, + "grad_norm": 1.5737174748369949, + "language_loss": 0.78126895, + "learning_rate": 3.891787511581859e-06, + "loss": 0.8004213, + "num_input_tokens_seen": 47469155, + "step": 2196, + "time_per_iteration": 2.7118594646453857 + }, + { + "auxiliary_loss_clip": 0.01162447, + "auxiliary_loss_mlp": 0.010526, + "balance_loss_clip": 1.05453539, + "balance_loss_mlp": 1.03210831, + "epoch": 0.13209078611152864, + "flos": 22054395121920.0, + "grad_norm": 1.9385650447291836, + "language_loss": 0.74632496, + "learning_rate": 3.89166110454876e-06, + "loss": 0.76847541, + "num_input_tokens_seen": 47488405, + "step": 2197, + "time_per_iteration": 4.270530939102173 + }, + { + "auxiliary_loss_clip": 0.01173786, + "auxiliary_loss_mlp": 0.01050846, + "balance_loss_clip": 1.05440533, + "balance_loss_mlp": 1.02947164, + "epoch": 0.1321509093641966, + "flos": 16284950743680.0, + "grad_norm": 1.785688190112577, + "language_loss": 0.79566747, + "learning_rate": 3.891534625783685e-06, + "loss": 0.81791383, + "num_input_tokens_seen": 47505650, + "step": 2198, + "time_per_iteration": 2.6145474910736084 + }, + { + "auxiliary_loss_clip": 0.01170264, + "auxiliary_loss_mlp": 0.01057159, + "balance_loss_clip": 1.05536175, + "balance_loss_mlp": 1.03647637, + "epoch": 0.13221103261686457, + "flos": 16983018633600.0, + "grad_norm": 2.56313218775589, + "language_loss": 0.82932216, + "learning_rate": 3.891408075291425e-06, + "loss": 0.85159647, + "num_input_tokens_seen": 47521540, + "step": 2199, + "time_per_iteration": 2.5715503692626953 + }, + { + "auxiliary_loss_clip": 0.01122554, + "auxiliary_loss_mlp": 0.01052148, + "balance_loss_clip": 1.05047798, + "balance_loss_mlp": 1.03045249, + "epoch": 0.13227115586953253, + "flos": 34233605677440.0, + "grad_norm": 1.8710902505917797, + "language_loss": 0.69579422, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.71754128, + "num_input_tokens_seen": 47543625, + "step": 2200, + "time_per_iteration": 2.8001365661621094 + }, + { + "auxiliary_loss_clip": 0.01167798, + "auxiliary_loss_mlp": 0.01058155, + "balance_loss_clip": 1.05345917, + "balance_loss_mlp": 1.03618431, + "epoch": 0.13233127912220052, + "flos": 20704656735360.0, + "grad_norm": 1.647659287704997, + "language_loss": 0.84624702, + "learning_rate": 3.891154759144557e-06, + "loss": 0.86850655, + "num_input_tokens_seen": 47563740, + "step": 2201, + "time_per_iteration": 2.6485981941223145 + }, + { + "auxiliary_loss_clip": 0.0117188, + "auxiliary_loss_mlp": 0.01055627, + "balance_loss_clip": 1.05427861, + "balance_loss_mlp": 1.03431273, + "epoch": 0.1323914023748685, + "flos": 25805048434560.0, + "grad_norm": 1.7446392584198542, + "language_loss": 0.87088037, + "learning_rate": 3.891027993499554e-06, + "loss": 0.8931554, + "num_input_tokens_seen": 47582655, + "step": 2202, + "time_per_iteration": 2.5921456813812256 + }, + { + "auxiliary_loss_clip": 0.01139991, + "auxiliary_loss_mlp": 0.01053413, + "balance_loss_clip": 1.05299544, + "balance_loss_mlp": 1.03267026, + "epoch": 0.13245152562753645, + "flos": 21251540280960.0, + "grad_norm": 2.405254380671628, + "language_loss": 0.72801507, + "learning_rate": 3.89090115614658e-06, + "loss": 0.7499491, + "num_input_tokens_seen": 47600875, + "step": 2203, + "time_per_iteration": 2.6257405281066895 + }, + { + "auxiliary_loss_clip": 0.01124508, + "auxiliary_loss_mlp": 0.0105959, + "balance_loss_clip": 1.05080879, + "balance_loss_mlp": 1.03916979, + "epoch": 0.13251164888020442, + "flos": 26610955931520.0, + "grad_norm": 2.044348475010678, + "language_loss": 0.73170948, + "learning_rate": 3.890774247090444e-06, + "loss": 0.75355047, + "num_input_tokens_seen": 47619250, + "step": 2204, + "time_per_iteration": 2.753830909729004 + }, + { + "auxiliary_loss_clip": 0.01160826, + "auxiliary_loss_mlp": 0.01054406, + "balance_loss_clip": 1.05474758, + "balance_loss_mlp": 1.03225708, + "epoch": 0.13257177213287238, + "flos": 29826541272960.0, + "grad_norm": 2.094172729236468, + "language_loss": 0.78377104, + "learning_rate": 3.89064726633596e-06, + "loss": 0.80592328, + "num_input_tokens_seen": 47639445, + "step": 2205, + "time_per_iteration": 2.730682134628296 + }, + { + "auxiliary_loss_clip": 0.01125154, + "auxiliary_loss_mlp": 0.01048818, + "balance_loss_clip": 1.04975629, + "balance_loss_mlp": 1.02782559, + "epoch": 0.13263189538554035, + "flos": 21288456483840.0, + "grad_norm": 1.8609089802832188, + "language_loss": 0.78638101, + "learning_rate": 3.890520213887941e-06, + "loss": 0.80812073, + "num_input_tokens_seen": 47658740, + "step": 2206, + "time_per_iteration": 2.691962718963623 + }, + { + "auxiliary_loss_clip": 0.01124965, + "auxiliary_loss_mlp": 0.01045957, + "balance_loss_clip": 1.04958403, + "balance_loss_mlp": 1.02649069, + "epoch": 0.13269201863820831, + "flos": 16874101618560.0, + "grad_norm": 2.2777192787220066, + "language_loss": 0.74672282, + "learning_rate": 3.890393089751208e-06, + "loss": 0.76843208, + "num_input_tokens_seen": 47676880, + "step": 2207, + "time_per_iteration": 2.7062454223632812 + }, + { + "auxiliary_loss_clip": 0.01143208, + "auxiliary_loss_mlp": 0.01047941, + "balance_loss_clip": 1.05257845, + "balance_loss_mlp": 1.02672219, + "epoch": 0.1327521418908763, + "flos": 23768914078080.0, + "grad_norm": 1.692212064021935, + "language_loss": 0.84061795, + "learning_rate": 3.890265893930578e-06, + "loss": 0.8625294, + "num_input_tokens_seen": 47696635, + "step": 2208, + "time_per_iteration": 2.687717914581299 + }, + { + "auxiliary_loss_clip": 0.01152573, + "auxiliary_loss_mlp": 0.0105274, + "balance_loss_clip": 1.05847478, + "balance_loss_mlp": 1.03411973, + "epoch": 0.13281226514354427, + "flos": 26505594362880.0, + "grad_norm": 1.7032258459750478, + "language_loss": 0.85587811, + "learning_rate": 3.890138626430876e-06, + "loss": 0.8779313, + "num_input_tokens_seen": 47717760, + "step": 2209, + "time_per_iteration": 2.646015167236328 + }, + { + "auxiliary_loss_clip": 0.01138084, + "auxiliary_loss_mlp": 0.00778828, + "balance_loss_clip": 1.05316806, + "balance_loss_mlp": 1.00002563, + "epoch": 0.13287238839621224, + "flos": 24498762526080.0, + "grad_norm": 2.237247968175465, + "language_loss": 0.81797457, + "learning_rate": 3.890011287256929e-06, + "loss": 0.83714366, + "num_input_tokens_seen": 47737685, + "step": 2210, + "time_per_iteration": 2.676262378692627 + }, + { + "auxiliary_loss_clip": 0.0104445, + "auxiliary_loss_mlp": 0.00757817, + "balance_loss_clip": 1.03801322, + "balance_loss_mlp": 1.00007725, + "epoch": 0.1329325116488802, + "flos": 67694344369920.0, + "grad_norm": 0.7515252652740232, + "language_loss": 0.58031559, + "learning_rate": 3.889883876413563e-06, + "loss": 0.59833825, + "num_input_tokens_seen": 47802415, + "step": 2211, + "time_per_iteration": 3.3914146423339844 + }, + { + "auxiliary_loss_clip": 0.01064712, + "auxiliary_loss_mlp": 0.01012978, + "balance_loss_clip": 1.04205871, + "balance_loss_mlp": 1.01083231, + "epoch": 0.13299263490154817, + "flos": 72261894741120.0, + "grad_norm": 0.8012428422082742, + "language_loss": 0.55299425, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57377112, + "num_input_tokens_seen": 47871485, + "step": 2212, + "time_per_iteration": 3.2910914421081543 + }, + { + "auxiliary_loss_clip": 0.01132433, + "auxiliary_loss_mlp": 0.01054299, + "balance_loss_clip": 1.05107963, + "balance_loss_mlp": 1.0331986, + "epoch": 0.13305275815421613, + "flos": 17931275729280.0, + "grad_norm": 2.484635795733661, + "language_loss": 0.74228692, + "learning_rate": 3.889628839737908e-06, + "loss": 0.7641542, + "num_input_tokens_seen": 47888315, + "step": 2213, + "time_per_iteration": 2.755777597427368 + }, + { + "auxiliary_loss_clip": 0.01114671, + "auxiliary_loss_mlp": 0.01051459, + "balance_loss_clip": 1.04682255, + "balance_loss_mlp": 1.03231359, + "epoch": 0.13311288140688413, + "flos": 22340889999360.0, + "grad_norm": 1.850943077435394, + "language_loss": 0.79699469, + "learning_rate": 3.889501213915291e-06, + "loss": 0.81865597, + "num_input_tokens_seen": 47906600, + "step": 2214, + "time_per_iteration": 2.702603340148926 + }, + { + "auxiliary_loss_clip": 0.01143494, + "auxiliary_loss_mlp": 0.01052411, + "balance_loss_clip": 1.05555344, + "balance_loss_mlp": 1.03171659, + "epoch": 0.1331730046595521, + "flos": 31868888682240.0, + "grad_norm": 1.8782588426913054, + "language_loss": 0.69341159, + "learning_rate": 3.889373516442597e-06, + "loss": 0.71537066, + "num_input_tokens_seen": 47927630, + "step": 2215, + "time_per_iteration": 2.769237518310547 + }, + { + "auxiliary_loss_clip": 0.01167307, + "auxiliary_loss_mlp": 0.01051423, + "balance_loss_clip": 1.06098068, + "balance_loss_mlp": 1.03132463, + "epoch": 0.13323312791222006, + "flos": 22566589107840.0, + "grad_norm": 1.884566493826098, + "language_loss": 0.81262428, + "learning_rate": 3.889245747324671e-06, + "loss": 0.83481157, + "num_input_tokens_seen": 47947935, + "step": 2216, + "time_per_iteration": 2.7427120208740234 + }, + { + "auxiliary_loss_clip": 0.01163681, + "auxiliary_loss_mlp": 0.01056545, + "balance_loss_clip": 1.06198788, + "balance_loss_mlp": 1.03631544, + "epoch": 0.13329325116488802, + "flos": 15085319293440.0, + "grad_norm": 3.783334161704178, + "language_loss": 0.87299347, + "learning_rate": 3.889117906566356e-06, + "loss": 0.89519572, + "num_input_tokens_seen": 47965515, + "step": 2217, + "time_per_iteration": 2.709527015686035 + }, + { + "auxiliary_loss_clip": 0.01152703, + "auxiliary_loss_mlp": 0.01056364, + "balance_loss_clip": 1.06054497, + "balance_loss_mlp": 1.0343225, + "epoch": 0.133353374417556, + "flos": 27453671890560.0, + "grad_norm": 4.412823416345162, + "language_loss": 0.73105222, + "learning_rate": 3.888989994172501e-06, + "loss": 0.75314289, + "num_input_tokens_seen": 47985675, + "step": 2218, + "time_per_iteration": 2.697733163833618 + }, + { + "auxiliary_loss_clip": 0.01129106, + "auxiliary_loss_mlp": 0.01051151, + "balance_loss_clip": 1.0535965, + "balance_loss_mlp": 1.02993202, + "epoch": 0.13341349767022395, + "flos": 24094695456000.0, + "grad_norm": 1.7935349411013712, + "language_loss": 0.86911142, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.89091408, + "num_input_tokens_seen": 48004985, + "step": 2219, + "time_per_iteration": 2.7641642093658447 + }, + { + "auxiliary_loss_clip": 0.01141172, + "auxiliary_loss_mlp": 0.0106326, + "balance_loss_clip": 1.05751657, + "balance_loss_mlp": 1.04406714, + "epoch": 0.13347362092289192, + "flos": 24133335511680.0, + "grad_norm": 1.8604531362737113, + "language_loss": 0.77244747, + "learning_rate": 3.888733954497574e-06, + "loss": 0.79449183, + "num_input_tokens_seen": 48024965, + "step": 2220, + "time_per_iteration": 2.732160806655884 + }, + { + "auxiliary_loss_clip": 0.01146487, + "auxiliary_loss_mlp": 0.01048662, + "balance_loss_clip": 1.05399704, + "balance_loss_mlp": 1.03001785, + "epoch": 0.1335337441755599, + "flos": 18436538390400.0, + "grad_norm": 2.3004113327688955, + "language_loss": 0.79467338, + "learning_rate": 3.888605827226212e-06, + "loss": 0.81662482, + "num_input_tokens_seen": 48040890, + "step": 2221, + "time_per_iteration": 2.685612440109253 + }, + { + "auxiliary_loss_clip": 0.01062777, + "auxiliary_loss_mlp": 0.01021711, + "balance_loss_clip": 1.03293467, + "balance_loss_mlp": 1.0194701, + "epoch": 0.13359386742822787, + "flos": 50611997652480.0, + "grad_norm": 0.9755051104211709, + "language_loss": 0.68938822, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.71023309, + "num_input_tokens_seen": 48091855, + "step": 2222, + "time_per_iteration": 3.0336835384368896 + }, + { + "auxiliary_loss_clip": 0.01130152, + "auxiliary_loss_mlp": 0.01058574, + "balance_loss_clip": 1.05544209, + "balance_loss_mlp": 1.03940475, + "epoch": 0.13365399068089584, + "flos": 22778569221120.0, + "grad_norm": 2.1295993667823416, + "language_loss": 0.67389107, + "learning_rate": 3.888349357839982e-06, + "loss": 0.69577825, + "num_input_tokens_seen": 48111350, + "step": 2223, + "time_per_iteration": 2.7134146690368652 + }, + { + "auxiliary_loss_clip": 0.01161386, + "auxiliary_loss_mlp": 0.01060571, + "balance_loss_clip": 1.05785358, + "balance_loss_mlp": 1.04010296, + "epoch": 0.1337141139335638, + "flos": 12531603911040.0, + "grad_norm": 4.277142483609355, + "language_loss": 0.82505226, + "learning_rate": 3.88822101573484e-06, + "loss": 0.84727186, + "num_input_tokens_seen": 48129840, + "step": 2224, + "time_per_iteration": 2.608372926712036 + }, + { + "auxiliary_loss_clip": 0.01173412, + "auxiliary_loss_mlp": 0.01050086, + "balance_loss_clip": 1.0573926, + "balance_loss_mlp": 1.0290221, + "epoch": 0.13377423718623177, + "flos": 23038957889280.0, + "grad_norm": 1.9890294619132924, + "language_loss": 0.66270435, + "learning_rate": 3.888092602028167e-06, + "loss": 0.68493932, + "num_input_tokens_seen": 48149240, + "step": 2225, + "time_per_iteration": 2.6304945945739746 + }, + { + "auxiliary_loss_clip": 0.01153626, + "auxiliary_loss_mlp": 0.01051637, + "balance_loss_clip": 1.05233717, + "balance_loss_mlp": 1.03180075, + "epoch": 0.13383436043889974, + "flos": 16216397637120.0, + "grad_norm": 2.2915668787246997, + "language_loss": 0.89469218, + "learning_rate": 3.887964116724835e-06, + "loss": 0.91674477, + "num_input_tokens_seen": 48166330, + "step": 2226, + "time_per_iteration": 2.6002328395843506 + }, + { + "auxiliary_loss_clip": 0.01150395, + "auxiliary_loss_mlp": 0.01054296, + "balance_loss_clip": 1.0549798, + "balance_loss_mlp": 1.03423262, + "epoch": 0.1338944836915677, + "flos": 24279671520000.0, + "grad_norm": 1.7271512115821777, + "language_loss": 0.73209751, + "learning_rate": 3.887835559829712e-06, + "loss": 0.75414443, + "num_input_tokens_seen": 48187600, + "step": 2227, + "time_per_iteration": 2.706193447113037 + }, + { + "auxiliary_loss_clip": 0.01157707, + "auxiliary_loss_mlp": 0.01047387, + "balance_loss_clip": 1.05518484, + "balance_loss_mlp": 1.02683568, + "epoch": 0.1339546069442357, + "flos": 17598742594560.0, + "grad_norm": 2.848999829625599, + "language_loss": 0.85160232, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.87365323, + "num_input_tokens_seen": 48204400, + "step": 2228, + "time_per_iteration": 2.689209222793579 + }, + { + "auxiliary_loss_clip": 0.01132803, + "auxiliary_loss_mlp": 0.01052829, + "balance_loss_clip": 1.04935181, + "balance_loss_mlp": 1.03126431, + "epoch": 0.13401473019690366, + "flos": 18990065952000.0, + "grad_norm": 1.909679794697233, + "language_loss": 0.81460214, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.83645844, + "num_input_tokens_seen": 48222180, + "step": 2229, + "time_per_iteration": 2.6380228996276855 + }, + { + "auxiliary_loss_clip": 0.0110557, + "auxiliary_loss_mlp": 0.01052684, + "balance_loss_clip": 1.04774594, + "balance_loss_mlp": 1.03233457, + "epoch": 0.13407485344957162, + "flos": 26943812288640.0, + "grad_norm": 1.7464076089691416, + "language_loss": 0.73822236, + "learning_rate": 3.887449459642378e-06, + "loss": 0.7598049, + "num_input_tokens_seen": 48243245, + "step": 2230, + "time_per_iteration": 2.7332983016967773 + }, + { + "auxiliary_loss_clip": 0.01125236, + "auxiliary_loss_mlp": 0.01058977, + "balance_loss_clip": 1.05213606, + "balance_loss_mlp": 1.03890252, + "epoch": 0.1341349767022396, + "flos": 20339373375360.0, + "grad_norm": 1.6827882777998602, + "language_loss": 0.80133682, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.82317901, + "num_input_tokens_seen": 48262600, + "step": 2231, + "time_per_iteration": 2.6759045124053955 + }, + { + "auxiliary_loss_clip": 0.01111387, + "auxiliary_loss_mlp": 0.01057582, + "balance_loss_clip": 1.04997492, + "balance_loss_mlp": 1.03499198, + "epoch": 0.13419509995490755, + "flos": 29862020931840.0, + "grad_norm": 1.746756846769887, + "language_loss": 0.72152746, + "learning_rate": 3.887191701647992e-06, + "loss": 0.74321723, + "num_input_tokens_seen": 48285075, + "step": 2232, + "time_per_iteration": 4.391890048980713 + }, + { + "auxiliary_loss_clip": 0.0112104, + "auxiliary_loss_mlp": 0.01051805, + "balance_loss_clip": 1.0481019, + "balance_loss_mlp": 1.03039551, + "epoch": 0.13425522320757552, + "flos": 26942986275840.0, + "grad_norm": 2.4719586176391686, + "language_loss": 0.65116024, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.67288864, + "num_input_tokens_seen": 48301285, + "step": 2233, + "time_per_iteration": 4.234508037567139 + }, + { + "auxiliary_loss_clip": 0.01167005, + "auxiliary_loss_mlp": 0.0104461, + "balance_loss_clip": 1.05189967, + "balance_loss_mlp": 1.02421367, + "epoch": 0.1343153464602435, + "flos": 15777281871360.0, + "grad_norm": 2.4864430088666656, + "language_loss": 0.80878961, + "learning_rate": 3.886933657403615e-06, + "loss": 0.8309058, + "num_input_tokens_seen": 48317835, + "step": 2234, + "time_per_iteration": 4.175215005874634 + }, + { + "auxiliary_loss_clip": 0.01140761, + "auxiliary_loss_mlp": 0.01054039, + "balance_loss_clip": 1.05052733, + "balance_loss_mlp": 1.03268874, + "epoch": 0.13437546971291148, + "flos": 24314756129280.0, + "grad_norm": 2.0569321713284827, + "language_loss": 0.82114553, + "learning_rate": 3.886804527949909e-06, + "loss": 0.84309351, + "num_input_tokens_seen": 48335670, + "step": 2235, + "time_per_iteration": 2.6588025093078613 + }, + { + "auxiliary_loss_clip": 0.01149093, + "auxiliary_loss_mlp": 0.01052015, + "balance_loss_clip": 1.05040097, + "balance_loss_mlp": 1.02983022, + "epoch": 0.13443559296557944, + "flos": 26650673395200.0, + "grad_norm": 1.6363146905087136, + "language_loss": 0.86092007, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.88293117, + "num_input_tokens_seen": 48357805, + "step": 2236, + "time_per_iteration": 4.349383592605591 + }, + { + "auxiliary_loss_clip": 0.01166751, + "auxiliary_loss_mlp": 0.01047925, + "balance_loss_clip": 1.05288053, + "balance_loss_mlp": 1.02724242, + "epoch": 0.1344957162182474, + "flos": 21796197183360.0, + "grad_norm": 1.82135056053112, + "language_loss": 0.77258497, + "learning_rate": 3.886546054403946e-06, + "loss": 0.79473174, + "num_input_tokens_seen": 48377845, + "step": 2237, + "time_per_iteration": 2.6398766040802 + }, + { + "auxiliary_loss_clip": 0.01145425, + "auxiliary_loss_mlp": 0.01051006, + "balance_loss_clip": 1.05016851, + "balance_loss_mlp": 1.02919102, + "epoch": 0.13455583947091537, + "flos": 19865568049920.0, + "grad_norm": 2.440947698046141, + "language_loss": 0.78772336, + "learning_rate": 3.886416710321491e-06, + "loss": 0.80968761, + "num_input_tokens_seen": 48394735, + "step": 2238, + "time_per_iteration": 2.6556923389434814 + }, + { + "auxiliary_loss_clip": 0.01141594, + "auxiliary_loss_mlp": 0.01050085, + "balance_loss_clip": 1.05123293, + "balance_loss_mlp": 1.02878201, + "epoch": 0.13461596272358334, + "flos": 30846835094400.0, + "grad_norm": 2.9136729194949735, + "language_loss": 0.68486369, + "learning_rate": 3.886287294705924e-06, + "loss": 0.70678043, + "num_input_tokens_seen": 48414200, + "step": 2239, + "time_per_iteration": 2.6778814792633057 + }, + { + "auxiliary_loss_clip": 0.01147129, + "auxiliary_loss_mlp": 0.01052633, + "balance_loss_clip": 1.0515976, + "balance_loss_mlp": 1.03197384, + "epoch": 0.1346760859762513, + "flos": 12494436312960.0, + "grad_norm": 2.3763106012672925, + "language_loss": 0.81277847, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.8347761, + "num_input_tokens_seen": 48431065, + "step": 2240, + "time_per_iteration": 2.5920939445495605 + }, + { + "auxiliary_loss_clip": 0.01107793, + "auxiliary_loss_mlp": 0.01049909, + "balance_loss_clip": 1.04459488, + "balance_loss_mlp": 1.02884459, + "epoch": 0.1347362092289193, + "flos": 21836022387840.0, + "grad_norm": 1.7269080191231387, + "language_loss": 0.77183759, + "learning_rate": 3.886028248895093e-06, + "loss": 0.79341465, + "num_input_tokens_seen": 48450335, + "step": 2241, + "time_per_iteration": 2.7224419116973877 + }, + { + "auxiliary_loss_clip": 0.0116331, + "auxiliary_loss_mlp": 0.01041419, + "balance_loss_clip": 1.05439126, + "balance_loss_mlp": 1.02324009, + "epoch": 0.13479633248158726, + "flos": 23509459163520.0, + "grad_norm": 2.0305903786470743, + "language_loss": 0.83062387, + "learning_rate": 3.88589861870965e-06, + "loss": 0.85267115, + "num_input_tokens_seen": 48468555, + "step": 2242, + "time_per_iteration": 2.5794169902801514 + }, + { + "auxiliary_loss_clip": 0.01170048, + "auxiliary_loss_mlp": 0.01056609, + "balance_loss_clip": 1.05504107, + "balance_loss_mlp": 1.03469825, + "epoch": 0.13485645573425523, + "flos": 29344332165120.0, + "grad_norm": 2.465549548535016, + "language_loss": 0.6498239, + "learning_rate": 3.885768917010744e-06, + "loss": 0.67209053, + "num_input_tokens_seen": 48488515, + "step": 2243, + "time_per_iteration": 2.6709110736846924 + }, + { + "auxiliary_loss_clip": 0.01125086, + "auxiliary_loss_mlp": 0.01046786, + "balance_loss_clip": 1.04593956, + "balance_loss_mlp": 1.02618706, + "epoch": 0.1349165789869232, + "flos": 28037112503040.0, + "grad_norm": 1.7770524512670738, + "language_loss": 0.72633034, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.74804902, + "num_input_tokens_seen": 48510515, + "step": 2244, + "time_per_iteration": 2.713803768157959 + }, + { + "auxiliary_loss_clip": 0.0115377, + "auxiliary_loss_mlp": 0.0105148, + "balance_loss_clip": 1.05312431, + "balance_loss_mlp": 1.03209639, + "epoch": 0.13497670223959116, + "flos": 22853730430080.0, + "grad_norm": 1.7564166456764931, + "language_loss": 0.86023217, + "learning_rate": 3.88550929909221e-06, + "loss": 0.88228464, + "num_input_tokens_seen": 48529940, + "step": 2245, + "time_per_iteration": 2.626560926437378 + }, + { + "auxiliary_loss_clip": 0.01149467, + "auxiliary_loss_mlp": 0.0105327, + "balance_loss_clip": 1.05035663, + "balance_loss_mlp": 1.03346968, + "epoch": 0.13503682549225912, + "flos": 16504580453760.0, + "grad_norm": 1.7861449859595755, + "language_loss": 0.78912753, + "learning_rate": 3.88537938288243e-06, + "loss": 0.8111549, + "num_input_tokens_seen": 48548190, + "step": 2246, + "time_per_iteration": 2.6543703079223633 + }, + { + "auxiliary_loss_clip": 0.010304, + "auxiliary_loss_mlp": 0.01015407, + "balance_loss_clip": 1.03666449, + "balance_loss_mlp": 1.01285601, + "epoch": 0.1350969487449271, + "flos": 70756303242240.0, + "grad_norm": 0.7509256694227144, + "language_loss": 0.6054731, + "learning_rate": 3.885249395178874e-06, + "loss": 0.62593114, + "num_input_tokens_seen": 48613165, + "step": 2247, + "time_per_iteration": 3.3349809646606445 + }, + { + "auxiliary_loss_clip": 0.01162017, + "auxiliary_loss_mlp": 0.01056869, + "balance_loss_clip": 1.05492628, + "balance_loss_mlp": 1.03470767, + "epoch": 0.13515707199759508, + "flos": 23075981832960.0, + "grad_norm": 2.562042993856578, + "language_loss": 0.80841738, + "learning_rate": 3.885119335986473e-06, + "loss": 0.83060622, + "num_input_tokens_seen": 48631705, + "step": 2248, + "time_per_iteration": 2.6279287338256836 + }, + { + "auxiliary_loss_clip": 0.0114073, + "auxiliary_loss_mlp": 0.01049128, + "balance_loss_clip": 1.05086231, + "balance_loss_mlp": 1.03054309, + "epoch": 0.13521719525026304, + "flos": 23186371305600.0, + "grad_norm": 1.9247838227480492, + "language_loss": 0.77108699, + "learning_rate": 3.884989205310157e-06, + "loss": 0.79298556, + "num_input_tokens_seen": 48649740, + "step": 2249, + "time_per_iteration": 2.7100210189819336 + }, + { + "auxiliary_loss_clip": 0.0112733, + "auxiliary_loss_mlp": 0.01057649, + "balance_loss_clip": 1.05325472, + "balance_loss_mlp": 1.03863478, + "epoch": 0.135277318502931, + "flos": 24790931752320.0, + "grad_norm": 1.7403695434994237, + "language_loss": 0.84457541, + "learning_rate": 3.884859003154862e-06, + "loss": 0.86642522, + "num_input_tokens_seen": 48671565, + "step": 2250, + "time_per_iteration": 2.789350986480713 + }, + { + "auxiliary_loss_clip": 0.01155547, + "auxiliary_loss_mlp": 0.0105348, + "balance_loss_clip": 1.05310512, + "balance_loss_mlp": 1.03243995, + "epoch": 0.13533744175559898, + "flos": 21908525990400.0, + "grad_norm": 3.018154510939524, + "language_loss": 0.81796515, + "learning_rate": 3.884728729525524e-06, + "loss": 0.84005541, + "num_input_tokens_seen": 48690425, + "step": 2251, + "time_per_iteration": 2.685617208480835 + }, + { + "auxiliary_loss_clip": 0.01165433, + "auxiliary_loss_mlp": 0.01060257, + "balance_loss_clip": 1.05235004, + "balance_loss_mlp": 1.03888273, + "epoch": 0.13539756500826694, + "flos": 21211643249280.0, + "grad_norm": 1.7680273527580506, + "language_loss": 0.86173487, + "learning_rate": 3.884598384427084e-06, + "loss": 0.88399172, + "num_input_tokens_seen": 48707505, + "step": 2252, + "time_per_iteration": 2.597219467163086 + }, + { + "auxiliary_loss_clip": 0.01052296, + "auxiliary_loss_mlp": 0.01018557, + "balance_loss_clip": 1.02446079, + "balance_loss_mlp": 1.01632786, + "epoch": 0.1354576882609349, + "flos": 63242103634560.0, + "grad_norm": 0.8028920055572067, + "language_loss": 0.61837333, + "learning_rate": 3.884467967864485e-06, + "loss": 0.6390819, + "num_input_tokens_seen": 48775895, + "step": 2253, + "time_per_iteration": 3.25115704536438 + }, + { + "auxiliary_loss_clip": 0.01155107, + "auxiliary_loss_mlp": 0.01055639, + "balance_loss_clip": 1.0539906, + "balance_loss_mlp": 1.03587449, + "epoch": 0.1355178115136029, + "flos": 25483037984640.0, + "grad_norm": 1.6376691715964824, + "language_loss": 0.89441288, + "learning_rate": 3.884337479842671e-06, + "loss": 0.91652036, + "num_input_tokens_seen": 48798370, + "step": 2254, + "time_per_iteration": 2.6803932189941406 + }, + { + "auxiliary_loss_clip": 0.01131786, + "auxiliary_loss_mlp": 0.01063066, + "balance_loss_clip": 1.04506016, + "balance_loss_mlp": 1.03872383, + "epoch": 0.13557793476627086, + "flos": 21616967295360.0, + "grad_norm": 2.1104776784573787, + "language_loss": 0.84626925, + "learning_rate": 3.884206920366591e-06, + "loss": 0.86821771, + "num_input_tokens_seen": 48817955, + "step": 2255, + "time_per_iteration": 2.7074074745178223 + }, + { + "auxiliary_loss_clip": 0.01165481, + "auxiliary_loss_mlp": 0.01058458, + "balance_loss_clip": 1.05211091, + "balance_loss_mlp": 1.03767991, + "epoch": 0.13563805801893883, + "flos": 24928253447040.0, + "grad_norm": 4.791676738707355, + "language_loss": 0.74684238, + "learning_rate": 3.884076289441196e-06, + "loss": 0.76908177, + "num_input_tokens_seen": 48836330, + "step": 2256, + "time_per_iteration": 2.590178966522217 + }, + { + "auxiliary_loss_clip": 0.01127027, + "auxiliary_loss_mlp": 0.01054317, + "balance_loss_clip": 1.04977024, + "balance_loss_mlp": 1.03338361, + "epoch": 0.1356981812716068, + "flos": 14750272206720.0, + "grad_norm": 5.890843360804152, + "language_loss": 0.8309083, + "learning_rate": 3.88394558707144e-06, + "loss": 0.85272169, + "num_input_tokens_seen": 48851890, + "step": 2257, + "time_per_iteration": 2.642096519470215 + }, + { + "auxiliary_loss_clip": 0.0114984, + "auxiliary_loss_mlp": 0.00780177, + "balance_loss_clip": 1.05128407, + "balance_loss_mlp": 1.00013828, + "epoch": 0.13575830452427476, + "flos": 11108571822720.0, + "grad_norm": 2.1957250492246505, + "language_loss": 0.82045269, + "learning_rate": 3.883814813262277e-06, + "loss": 0.83975297, + "num_input_tokens_seen": 48865510, + "step": 2258, + "time_per_iteration": 2.6279473304748535 + }, + { + "auxiliary_loss_clip": 0.01155515, + "auxiliary_loss_mlp": 0.01054519, + "balance_loss_clip": 1.05172098, + "balance_loss_mlp": 1.03152323, + "epoch": 0.13581842777694272, + "flos": 17960290940160.0, + "grad_norm": 2.6364031487830464, + "language_loss": 0.82694167, + "learning_rate": 3.883683968018669e-06, + "loss": 0.849042, + "num_input_tokens_seen": 48882360, + "step": 2259, + "time_per_iteration": 2.677804708480835 + }, + { + "auxiliary_loss_clip": 0.01127201, + "auxiliary_loss_mlp": 0.01054646, + "balance_loss_clip": 1.0495683, + "balance_loss_mlp": 1.03547728, + "epoch": 0.1358785510296107, + "flos": 22857142222080.0, + "grad_norm": 2.0790748617118853, + "language_loss": 0.73916006, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.76097858, + "num_input_tokens_seen": 48902700, + "step": 2260, + "time_per_iteration": 2.7416799068450928 + }, + { + "auxiliary_loss_clip": 0.01144177, + "auxiliary_loss_mlp": 0.01056881, + "balance_loss_clip": 1.05196047, + "balance_loss_mlp": 1.03691387, + "epoch": 0.13593867428227868, + "flos": 25739404329600.0, + "grad_norm": 3.546593987683097, + "language_loss": 0.74799728, + "learning_rate": 3.883422063247961e-06, + "loss": 0.77000785, + "num_input_tokens_seen": 48922525, + "step": 2261, + "time_per_iteration": 2.675342559814453 + }, + { + "auxiliary_loss_clip": 0.01170469, + "auxiliary_loss_mlp": 0.01050986, + "balance_loss_clip": 1.05486035, + "balance_loss_mlp": 1.03043413, + "epoch": 0.13599879753494665, + "flos": 31249214225280.0, + "grad_norm": 2.967396076139427, + "language_loss": 0.63602281, + "learning_rate": 3.883291003730794e-06, + "loss": 0.65823734, + "num_input_tokens_seen": 48942510, + "step": 2262, + "time_per_iteration": 2.660538911819458 + }, + { + "auxiliary_loss_clip": 0.01148004, + "auxiliary_loss_mlp": 0.01052118, + "balance_loss_clip": 1.0516696, + "balance_loss_mlp": 1.03216195, + "epoch": 0.1360589207876146, + "flos": 23915034604800.0, + "grad_norm": 2.301949377353301, + "language_loss": 0.81810403, + "learning_rate": 3.883159872799043e-06, + "loss": 0.84010524, + "num_input_tokens_seen": 48962625, + "step": 2263, + "time_per_iteration": 2.840043783187866 + }, + { + "auxiliary_loss_clip": 0.01098888, + "auxiliary_loss_mlp": 0.01064302, + "balance_loss_clip": 1.04875195, + "balance_loss_mlp": 1.0410558, + "epoch": 0.13611904404028258, + "flos": 19974197756160.0, + "grad_norm": 1.7561035968690553, + "language_loss": 0.87737143, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.89900339, + "num_input_tokens_seen": 48982525, + "step": 2264, + "time_per_iteration": 2.784648895263672 + }, + { + "auxiliary_loss_clip": 0.01157618, + "auxiliary_loss_mlp": 0.01049521, + "balance_loss_clip": 1.05161715, + "balance_loss_mlp": 1.02709746, + "epoch": 0.13617916729295054, + "flos": 15340644144000.0, + "grad_norm": 3.151792845640157, + "language_loss": 0.7115528, + "learning_rate": 3.882897396711683e-06, + "loss": 0.7336241, + "num_input_tokens_seen": 48997605, + "step": 2265, + "time_per_iteration": 2.6108245849609375 + }, + { + "auxiliary_loss_clip": 0.01111831, + "auxiliary_loss_mlp": 0.01042545, + "balance_loss_clip": 1.05199265, + "balance_loss_mlp": 1.02256525, + "epoch": 0.1362392905456185, + "flos": 27451445247360.0, + "grad_norm": 4.918827494175735, + "language_loss": 0.6671263, + "learning_rate": 3.882766051566027e-06, + "loss": 0.68867004, + "num_input_tokens_seen": 49018535, + "step": 2266, + "time_per_iteration": 2.7810373306274414 + }, + { + "auxiliary_loss_clip": 0.01127539, + "auxiliary_loss_mlp": 0.01057589, + "balance_loss_clip": 1.05683684, + "balance_loss_mlp": 1.03739524, + "epoch": 0.1362994137982865, + "flos": 25009017177600.0, + "grad_norm": 1.707924588861666, + "language_loss": 0.7634865, + "learning_rate": 3.882634635025694e-06, + "loss": 0.78533769, + "num_input_tokens_seen": 49038865, + "step": 2267, + "time_per_iteration": 2.7682721614837646 + }, + { + "auxiliary_loss_clip": 0.01133448, + "auxiliary_loss_mlp": 0.01048207, + "balance_loss_clip": 1.04668903, + "balance_loss_mlp": 1.02641535, + "epoch": 0.13635953705095447, + "flos": 20303031790080.0, + "grad_norm": 2.9531688260339934, + "language_loss": 0.81653506, + "learning_rate": 3.882503147095667e-06, + "loss": 0.83835161, + "num_input_tokens_seen": 49058010, + "step": 2268, + "time_per_iteration": 2.645081043243408 + }, + { + "auxiliary_loss_clip": 0.01155147, + "auxiliary_loss_mlp": 0.01048448, + "balance_loss_clip": 1.05424881, + "balance_loss_mlp": 1.02738333, + "epoch": 0.13641966030362243, + "flos": 31358418549120.0, + "grad_norm": 1.9923150848418427, + "language_loss": 0.75975174, + "learning_rate": 3.882371587780931e-06, + "loss": 0.78178769, + "num_input_tokens_seen": 49080330, + "step": 2269, + "time_per_iteration": 2.6764814853668213 + }, + { + "auxiliary_loss_clip": 0.0113465, + "auxiliary_loss_mlp": 0.01049702, + "balance_loss_clip": 1.04941857, + "balance_loss_mlp": 1.02844727, + "epoch": 0.1364797835562904, + "flos": 20478095700480.0, + "grad_norm": 2.1475090354855473, + "language_loss": 0.81328762, + "learning_rate": 3.882239957086477e-06, + "loss": 0.83513117, + "num_input_tokens_seen": 49097035, + "step": 2270, + "time_per_iteration": 2.6801655292510986 + }, + { + "auxiliary_loss_clip": 0.01142111, + "auxiliary_loss_mlp": 0.010594, + "balance_loss_clip": 1.04989171, + "balance_loss_mlp": 1.03773928, + "epoch": 0.13653990680895836, + "flos": 13078343802240.0, + "grad_norm": 3.2227070482893976, + "language_loss": 0.75812757, + "learning_rate": 3.882108255017295e-06, + "loss": 0.78014266, + "num_input_tokens_seen": 49113945, + "step": 2271, + "time_per_iteration": 4.197805166244507 + }, + { + "auxiliary_loss_clip": 0.01156913, + "auxiliary_loss_mlp": 0.01061846, + "balance_loss_clip": 1.05097795, + "balance_loss_mlp": 1.03921962, + "epoch": 0.13660003006162633, + "flos": 16946712961920.0, + "grad_norm": 2.2800716885469754, + "language_loss": 0.80251753, + "learning_rate": 3.881976481578379e-06, + "loss": 0.82470512, + "num_input_tokens_seen": 49132855, + "step": 2272, + "time_per_iteration": 4.1461029052734375 + }, + { + "auxiliary_loss_clip": 0.01055091, + "auxiliary_loss_mlp": 0.01042701, + "balance_loss_clip": 1.02539539, + "balance_loss_mlp": 1.04001904, + "epoch": 0.1366601533142943, + "flos": 68682749892480.0, + "grad_norm": 0.7097054685047118, + "language_loss": 0.60739923, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.62837708, + "num_input_tokens_seen": 49198310, + "step": 2273, + "time_per_iteration": 4.731219530105591 + }, + { + "auxiliary_loss_clip": 0.01165514, + "auxiliary_loss_mlp": 0.00780474, + "balance_loss_clip": 1.0523783, + "balance_loss_mlp": 1.00008452, + "epoch": 0.13672027656696228, + "flos": 19244241567360.0, + "grad_norm": 2.4844725334882583, + "language_loss": 0.77506429, + "learning_rate": 3.881712720611336e-06, + "loss": 0.79452413, + "num_input_tokens_seen": 49217250, + "step": 2274, + "time_per_iteration": 2.7122738361358643 + }, + { + "auxiliary_loss_clip": 0.01154937, + "auxiliary_loss_mlp": 0.01054542, + "balance_loss_clip": 1.05082417, + "balance_loss_mlp": 1.03271496, + "epoch": 0.13678039981963025, + "flos": 24534924543360.0, + "grad_norm": 2.391437383339344, + "language_loss": 0.78256011, + "learning_rate": 3.881580733093211e-06, + "loss": 0.8046549, + "num_input_tokens_seen": 49236615, + "step": 2275, + "time_per_iteration": 2.6674444675445557 + }, + { + "auxiliary_loss_clip": 0.01154585, + "auxiliary_loss_mlp": 0.01044634, + "balance_loss_clip": 1.05220842, + "balance_loss_mlp": 1.02449977, + "epoch": 0.13684052307229821, + "flos": 15669334523520.0, + "grad_norm": 2.271072834476717, + "language_loss": 0.81682789, + "learning_rate": 3.881448674225356e-06, + "loss": 0.83882004, + "num_input_tokens_seen": 49253935, + "step": 2276, + "time_per_iteration": 4.202202558517456 + }, + { + "auxiliary_loss_clip": 0.01164941, + "auxiliary_loss_mlp": 0.01060078, + "balance_loss_clip": 1.05228245, + "balance_loss_mlp": 1.03604531, + "epoch": 0.13690064632496618, + "flos": 28364689560960.0, + "grad_norm": 5.063053962589045, + "language_loss": 0.69948691, + "learning_rate": 3.881316544012779e-06, + "loss": 0.72173715, + "num_input_tokens_seen": 49273605, + "step": 2277, + "time_per_iteration": 2.708591938018799 + }, + { + "auxiliary_loss_clip": 0.01160044, + "auxiliary_loss_mlp": 0.00780297, + "balance_loss_clip": 1.05169702, + "balance_loss_mlp": 1.00017083, + "epoch": 0.13696076957763414, + "flos": 23404779953280.0, + "grad_norm": 2.062701620585305, + "language_loss": 0.80197465, + "learning_rate": 3.88118434246049e-06, + "loss": 0.82137805, + "num_input_tokens_seen": 49291785, + "step": 2278, + "time_per_iteration": 2.6916158199310303 + }, + { + "auxiliary_loss_clip": 0.01159146, + "auxiliary_loss_mlp": 0.01060686, + "balance_loss_clip": 1.05954766, + "balance_loss_mlp": 1.03925228, + "epoch": 0.1370208928303021, + "flos": 37196595601920.0, + "grad_norm": 7.088344486179519, + "language_loss": 0.75048816, + "learning_rate": 3.881052069573502e-06, + "loss": 0.77268648, + "num_input_tokens_seen": 49311405, + "step": 2279, + "time_per_iteration": 2.7316977977752686 + }, + { + "auxiliary_loss_clip": 0.01101952, + "auxiliary_loss_mlp": 0.01066685, + "balance_loss_clip": 1.04605758, + "balance_loss_mlp": 1.04485774, + "epoch": 0.13708101608297008, + "flos": 26976311118720.0, + "grad_norm": 2.5293116992138223, + "language_loss": 0.76743513, + "learning_rate": 3.880919725356831e-06, + "loss": 0.78912151, + "num_input_tokens_seen": 49331835, + "step": 2280, + "time_per_iteration": 2.813720941543579 + }, + { + "auxiliary_loss_clip": 0.01108594, + "auxiliary_loss_mlp": 0.01060805, + "balance_loss_clip": 1.04457331, + "balance_loss_mlp": 1.04022956, + "epoch": 0.13714113933563807, + "flos": 32556864850560.0, + "grad_norm": 2.0597640944890325, + "language_loss": 0.79657966, + "learning_rate": 3.880787309815496e-06, + "loss": 0.81827366, + "num_input_tokens_seen": 49352290, + "step": 2281, + "time_per_iteration": 2.8325345516204834 + }, + { + "auxiliary_loss_clip": 0.0117656, + "auxiliary_loss_mlp": 0.0107773, + "balance_loss_clip": 1.05715084, + "balance_loss_mlp": 1.05671358, + "epoch": 0.13720126258830603, + "flos": 16101267569280.0, + "grad_norm": 2.0769142230572877, + "language_loss": 0.83383757, + "learning_rate": 3.880654822954518e-06, + "loss": 0.85638046, + "num_input_tokens_seen": 49370285, + "step": 2282, + "time_per_iteration": 2.5988755226135254 + }, + { + "auxiliary_loss_clip": 0.01142098, + "auxiliary_loss_mlp": 0.01075909, + "balance_loss_clip": 1.04898703, + "balance_loss_mlp": 1.05583453, + "epoch": 0.137261385840974, + "flos": 18953544798720.0, + "grad_norm": 1.5269487193470777, + "language_loss": 0.73526621, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.75744629, + "num_input_tokens_seen": 49389610, + "step": 2283, + "time_per_iteration": 2.7099714279174805 + }, + { + "auxiliary_loss_clip": 0.01160178, + "auxiliary_loss_mlp": 0.01062577, + "balance_loss_clip": 1.05577087, + "balance_loss_mlp": 1.04173923, + "epoch": 0.13732150909364196, + "flos": 23295360147840.0, + "grad_norm": 2.2306012559941455, + "language_loss": 0.83934438, + "learning_rate": 3.880389635293729e-06, + "loss": 0.86157191, + "num_input_tokens_seen": 49408390, + "step": 2284, + "time_per_iteration": 2.7315831184387207 + }, + { + "auxiliary_loss_clip": 0.01151427, + "auxiliary_loss_mlp": 0.01070288, + "balance_loss_clip": 1.05204272, + "balance_loss_mlp": 1.04779351, + "epoch": 0.13738163234630993, + "flos": 29351263489920.0, + "grad_norm": 2.0900141273659223, + "language_loss": 0.7557056, + "learning_rate": 3.880256934503974e-06, + "loss": 0.77792281, + "num_input_tokens_seen": 49427725, + "step": 2285, + "time_per_iteration": 2.7257747650146484 + }, + { + "auxiliary_loss_clip": 0.01144078, + "auxiliary_loss_mlp": 0.01064539, + "balance_loss_clip": 1.05233073, + "balance_loss_mlp": 1.04392731, + "epoch": 0.1374417555989779, + "flos": 26651319840000.0, + "grad_norm": 2.727019945657865, + "language_loss": 0.74521589, + "learning_rate": 3.880124162414689e-06, + "loss": 0.76730204, + "num_input_tokens_seen": 49449000, + "step": 2286, + "time_per_iteration": 2.742582082748413 + }, + { + "auxiliary_loss_clip": 0.0112541, + "auxiliary_loss_mlp": 0.01059198, + "balance_loss_clip": 1.04906356, + "balance_loss_mlp": 1.03659606, + "epoch": 0.1375018788516459, + "flos": 28403401443840.0, + "grad_norm": 2.2168449035378357, + "language_loss": 0.86683542, + "learning_rate": 3.879991319030908e-06, + "loss": 0.88868147, + "num_input_tokens_seen": 49468360, + "step": 2287, + "time_per_iteration": 2.802088499069214 + }, + { + "auxiliary_loss_clip": 0.01124712, + "auxiliary_loss_mlp": 0.01064517, + "balance_loss_clip": 1.04803944, + "balance_loss_mlp": 1.04207003, + "epoch": 0.13756200210431385, + "flos": 37413783187200.0, + "grad_norm": 2.0592152854463106, + "language_loss": 0.68410838, + "learning_rate": 3.879858404357666e-06, + "loss": 0.70600063, + "num_input_tokens_seen": 49493450, + "step": 2288, + "time_per_iteration": 2.861175537109375 + }, + { + "auxiliary_loss_clip": 0.01112106, + "auxiliary_loss_mlp": 0.01071262, + "balance_loss_clip": 1.05062151, + "balance_loss_mlp": 1.04666936, + "epoch": 0.13762212535698182, + "flos": 22711021695360.0, + "grad_norm": 2.3933568244149357, + "language_loss": 0.87090456, + "learning_rate": 3.879725418400005e-06, + "loss": 0.89273822, + "num_input_tokens_seen": 49511220, + "step": 2289, + "time_per_iteration": 2.7185773849487305 + }, + { + "auxiliary_loss_clip": 0.01130193, + "auxiliary_loss_mlp": 0.00781167, + "balance_loss_clip": 1.0480957, + "balance_loss_mlp": 1.00019848, + "epoch": 0.13768224860964978, + "flos": 23952130375680.0, + "grad_norm": 1.8106848287624444, + "language_loss": 0.74668044, + "learning_rate": 3.879592361162969e-06, + "loss": 0.76579404, + "num_input_tokens_seen": 49529820, + "step": 2290, + "time_per_iteration": 2.6751222610473633 + }, + { + "auxiliary_loss_clip": 0.01039657, + "auxiliary_loss_mlp": 0.01081332, + "balance_loss_clip": 1.03094769, + "balance_loss_mlp": 1.07881641, + "epoch": 0.13774237186231775, + "flos": 63590438753280.0, + "grad_norm": 0.7179159366671727, + "language_loss": 0.51597112, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.53718102, + "num_input_tokens_seen": 49595325, + "step": 2291, + "time_per_iteration": 3.2823359966278076 + }, + { + "auxiliary_loss_clip": 0.01157406, + "auxiliary_loss_mlp": 0.01052846, + "balance_loss_clip": 1.05224037, + "balance_loss_mlp": 1.03123331, + "epoch": 0.1378024951149857, + "flos": 24279456038400.0, + "grad_norm": 1.9326408617769533, + "language_loss": 0.71273667, + "learning_rate": 3.879326032870952e-06, + "loss": 0.7348392, + "num_input_tokens_seen": 49615850, + "step": 2292, + "time_per_iteration": 2.74045729637146 + }, + { + "auxiliary_loss_clip": 0.01156871, + "auxiliary_loss_mlp": 0.01049315, + "balance_loss_clip": 1.05427122, + "balance_loss_mlp": 1.02931166, + "epoch": 0.13786261836765368, + "flos": 14021537080320.0, + "grad_norm": 6.592759889378346, + "language_loss": 0.8047784, + "learning_rate": 3.879192761826071e-06, + "loss": 0.82684022, + "num_input_tokens_seen": 49631860, + "step": 2293, + "time_per_iteration": 2.587576389312744 + }, + { + "auxiliary_loss_clip": 0.0115787, + "auxiliary_loss_mlp": 0.0104972, + "balance_loss_clip": 1.0554558, + "balance_loss_mlp": 1.02921653, + "epoch": 0.13792274162032167, + "flos": 28878679226880.0, + "grad_norm": 1.9082895606463517, + "language_loss": 0.78440171, + "learning_rate": 3.879059419522011e-06, + "loss": 0.80647767, + "num_input_tokens_seen": 49652145, + "step": 2294, + "time_per_iteration": 2.7152793407440186 + }, + { + "auxiliary_loss_clip": 0.01126374, + "auxiliary_loss_mlp": 0.01050648, + "balance_loss_clip": 1.05281758, + "balance_loss_mlp": 1.03104973, + "epoch": 0.13798286487298964, + "flos": 21141150808320.0, + "grad_norm": 1.991103290125302, + "language_loss": 0.80339509, + "learning_rate": 3.878926005963831e-06, + "loss": 0.82516527, + "num_input_tokens_seen": 49669880, + "step": 2295, + "time_per_iteration": 2.7026021480560303 + }, + { + "auxiliary_loss_clip": 0.01154693, + "auxiliary_loss_mlp": 0.01052186, + "balance_loss_clip": 1.05239046, + "balance_loss_mlp": 1.03102624, + "epoch": 0.1380429881256576, + "flos": 22487477402880.0, + "grad_norm": 1.7450624966187134, + "language_loss": 0.78661883, + "learning_rate": 3.878792521156588e-06, + "loss": 0.80868757, + "num_input_tokens_seen": 49687255, + "step": 2296, + "time_per_iteration": 2.566929340362549 + }, + { + "auxiliary_loss_clip": 0.01153425, + "auxiliary_loss_mlp": 0.01069343, + "balance_loss_clip": 1.05437231, + "balance_loss_mlp": 1.04811132, + "epoch": 0.13810311137832557, + "flos": 21393674398080.0, + "grad_norm": 1.7434096141785573, + "language_loss": 0.78663194, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.80885959, + "num_input_tokens_seen": 49706650, + "step": 2297, + "time_per_iteration": 2.6254489421844482 + }, + { + "auxiliary_loss_clip": 0.01110905, + "auxiliary_loss_mlp": 0.01059754, + "balance_loss_clip": 1.05296302, + "balance_loss_mlp": 1.03871369, + "epoch": 0.13816323463099353, + "flos": 25989844930560.0, + "grad_norm": 1.929043788877404, + "language_loss": 0.69199705, + "learning_rate": 3.878525337815164e-06, + "loss": 0.71370363, + "num_input_tokens_seen": 49725715, + "step": 2298, + "time_per_iteration": 2.791301965713501 + }, + { + "auxiliary_loss_clip": 0.01137772, + "auxiliary_loss_mlp": 0.01061768, + "balance_loss_clip": 1.0517292, + "balance_loss_mlp": 1.04059684, + "epoch": 0.1382233578836615, + "flos": 19244313394560.0, + "grad_norm": 1.7910922430646712, + "language_loss": 0.86382294, + "learning_rate": 3.878391639291116e-06, + "loss": 0.88581836, + "num_input_tokens_seen": 49744710, + "step": 2299, + "time_per_iteration": 2.6075453758239746 + }, + { + "auxiliary_loss_clip": 0.01166817, + "auxiliary_loss_mlp": 0.01054863, + "balance_loss_clip": 1.05378175, + "balance_loss_mlp": 1.03292871, + "epoch": 0.1382834811363295, + "flos": 25666290195840.0, + "grad_norm": 2.2378660690879606, + "language_loss": 0.75468475, + "learning_rate": 3.878257869538267e-06, + "loss": 0.77690154, + "num_input_tokens_seen": 49764300, + "step": 2300, + "time_per_iteration": 2.663328170776367 + }, + { + "auxiliary_loss_clip": 0.01130608, + "auxiliary_loss_mlp": 0.01047248, + "balance_loss_clip": 1.05274105, + "balance_loss_mlp": 1.02664876, + "epoch": 0.13834360438899745, + "flos": 19784193788160.0, + "grad_norm": 2.5571861214345963, + "language_loss": 0.82463622, + "learning_rate": 3.878124028561692e-06, + "loss": 0.8464148, + "num_input_tokens_seen": 49778380, + "step": 2301, + "time_per_iteration": 2.6705129146575928 + }, + { + "auxiliary_loss_clip": 0.0113862, + "auxiliary_loss_mlp": 0.00777879, + "balance_loss_clip": 1.05323792, + "balance_loss_mlp": 1.00021625, + "epoch": 0.13840372764166542, + "flos": 26651858544000.0, + "grad_norm": 1.9612043619218924, + "language_loss": 0.85957694, + "learning_rate": 3.877990116366466e-06, + "loss": 0.87874192, + "num_input_tokens_seen": 49797460, + "step": 2302, + "time_per_iteration": 2.679797410964966 + }, + { + "auxiliary_loss_clip": 0.01059341, + "auxiliary_loss_mlp": 0.01025212, + "balance_loss_clip": 1.03226125, + "balance_loss_mlp": 1.02244604, + "epoch": 0.13846385089433338, + "flos": 70510998286080.0, + "grad_norm": 0.7598813547967705, + "language_loss": 0.65591633, + "learning_rate": 3.877856132957667e-06, + "loss": 0.67676187, + "num_input_tokens_seen": 49868005, + "step": 2303, + "time_per_iteration": 3.3249399662017822 + }, + { + "auxiliary_loss_clip": 0.01151443, + "auxiliary_loss_mlp": 0.01046478, + "balance_loss_clip": 1.05337632, + "balance_loss_mlp": 1.02655792, + "epoch": 0.13852397414700135, + "flos": 17348732956800.0, + "grad_norm": 3.141207945865242, + "language_loss": 0.78663635, + "learning_rate": 3.877722078340374e-06, + "loss": 0.80861557, + "num_input_tokens_seen": 49885825, + "step": 2304, + "time_per_iteration": 2.7364001274108887 + }, + { + "auxiliary_loss_clip": 0.01157514, + "auxiliary_loss_mlp": 0.01043253, + "balance_loss_clip": 1.05566275, + "balance_loss_mlp": 1.02385736, + "epoch": 0.13858409739966931, + "flos": 21543781334400.0, + "grad_norm": 1.7487365854034607, + "language_loss": 0.77559888, + "learning_rate": 3.877587952519672e-06, + "loss": 0.79760659, + "num_input_tokens_seen": 49905975, + "step": 2305, + "time_per_iteration": 2.7814202308654785 + }, + { + "auxiliary_loss_clip": 0.01074766, + "auxiliary_loss_mlp": 0.01055718, + "balance_loss_clip": 1.04160607, + "balance_loss_mlp": 1.03473723, + "epoch": 0.13864422065233728, + "flos": 21579907438080.0, + "grad_norm": 1.8207477060355044, + "language_loss": 0.87737936, + "learning_rate": 3.877453755500647e-06, + "loss": 0.89868426, + "num_input_tokens_seen": 49925800, + "step": 2306, + "time_per_iteration": 2.917616605758667 + }, + { + "auxiliary_loss_clip": 0.01064826, + "auxiliary_loss_mlp": 0.0101208, + "balance_loss_clip": 1.02692199, + "balance_loss_mlp": 1.0094099, + "epoch": 0.13870434390500527, + "flos": 53371156872960.0, + "grad_norm": 0.8728538231155298, + "language_loss": 0.59008431, + "learning_rate": 3.877319487288387e-06, + "loss": 0.61085337, + "num_input_tokens_seen": 49977620, + "step": 2307, + "time_per_iteration": 3.4345149993896484 + }, + { + "auxiliary_loss_clip": 0.01169624, + "auxiliary_loss_mlp": 0.00778134, + "balance_loss_clip": 1.05528641, + "balance_loss_mlp": 1.00021303, + "epoch": 0.13876446715767324, + "flos": 22565906749440.0, + "grad_norm": 1.8467673932802395, + "language_loss": 0.79483795, + "learning_rate": 3.877185147887984e-06, + "loss": 0.81431556, + "num_input_tokens_seen": 49996650, + "step": 2308, + "time_per_iteration": 2.7137296199798584 + }, + { + "auxiliary_loss_clip": 0.01131024, + "auxiliary_loss_mlp": 0.01050332, + "balance_loss_clip": 1.05118585, + "balance_loss_mlp": 1.03054297, + "epoch": 0.1388245904103412, + "flos": 20705231352960.0, + "grad_norm": 2.352128383160346, + "language_loss": 0.78101134, + "learning_rate": 3.877050737304533e-06, + "loss": 0.80282485, + "num_input_tokens_seen": 50015640, + "step": 2309, + "time_per_iteration": 2.9259471893310547 + }, + { + "auxiliary_loss_clip": 0.01128109, + "auxiliary_loss_mlp": 0.01057348, + "balance_loss_clip": 1.04979932, + "balance_loss_mlp": 1.03620028, + "epoch": 0.13888471366300917, + "flos": 20554729367040.0, + "grad_norm": 3.914796791761399, + "language_loss": 0.68133545, + "learning_rate": 3.876916255543129e-06, + "loss": 0.70318997, + "num_input_tokens_seen": 50033500, + "step": 2310, + "time_per_iteration": 4.27877140045166 + }, + { + "auxiliary_loss_clip": 0.01164985, + "auxiliary_loss_mlp": 0.01062516, + "balance_loss_clip": 1.05356944, + "balance_loss_mlp": 1.04021168, + "epoch": 0.13894483691567713, + "flos": 13838033473920.0, + "grad_norm": 1.934954545600412, + "language_loss": 0.84295756, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.86523259, + "num_input_tokens_seen": 50050075, + "step": 2311, + "time_per_iteration": 2.5612359046936035 + }, + { + "auxiliary_loss_clip": 0.01173749, + "auxiliary_loss_mlp": 0.01055474, + "balance_loss_clip": 1.05752683, + "balance_loss_mlp": 1.0350771, + "epoch": 0.1390049601683451, + "flos": 28031186759040.0, + "grad_norm": 2.9213009430481143, + "language_loss": 0.82358992, + "learning_rate": 3.876647078506866e-06, + "loss": 0.84588212, + "num_input_tokens_seen": 50070080, + "step": 2312, + "time_per_iteration": 5.737139701843262 + }, + { + "auxiliary_loss_clip": 0.01129781, + "auxiliary_loss_mlp": 0.00778347, + "balance_loss_clip": 1.05464363, + "balance_loss_mlp": 1.00023031, + "epoch": 0.13906508342101306, + "flos": 26756860976640.0, + "grad_norm": 2.109799495913242, + "language_loss": 0.86732674, + "learning_rate": 3.876512383242215e-06, + "loss": 0.88640809, + "num_input_tokens_seen": 50090040, + "step": 2313, + "time_per_iteration": 2.8402304649353027 + }, + { + "auxiliary_loss_clip": 0.01168088, + "auxiliary_loss_mlp": 0.01061738, + "balance_loss_clip": 1.05670547, + "balance_loss_mlp": 1.04115057, + "epoch": 0.13912520667368106, + "flos": 24535104111360.0, + "grad_norm": 1.784990717237318, + "language_loss": 0.79935932, + "learning_rate": 3.876377616820024e-06, + "loss": 0.8216576, + "num_input_tokens_seen": 50110595, + "step": 2314, + "time_per_iteration": 2.683448076248169 + }, + { + "auxiliary_loss_clip": 0.01124732, + "auxiliary_loss_mlp": 0.01061041, + "balance_loss_clip": 1.04845023, + "balance_loss_mlp": 1.04103708, + "epoch": 0.13918532992634902, + "flos": 19383215287680.0, + "grad_norm": 2.585875079553688, + "language_loss": 0.85367405, + "learning_rate": 3.876242779245409e-06, + "loss": 0.87553179, + "num_input_tokens_seen": 50125430, + "step": 2315, + "time_per_iteration": 4.332594394683838 + }, + { + "auxiliary_loss_clip": 0.01156122, + "auxiliary_loss_mlp": 0.01058532, + "balance_loss_clip": 1.05397022, + "balance_loss_mlp": 1.0372889, + "epoch": 0.139245453179017, + "flos": 21323756574720.0, + "grad_norm": 2.333331492160627, + "language_loss": 0.77170396, + "learning_rate": 3.876107870523477e-06, + "loss": 0.79385042, + "num_input_tokens_seen": 50144120, + "step": 2316, + "time_per_iteration": 2.654604911804199 + }, + { + "auxiliary_loss_clip": 0.01163967, + "auxiliary_loss_mlp": 0.00780027, + "balance_loss_clip": 1.05353916, + "balance_loss_mlp": 1.00024533, + "epoch": 0.13930557643168495, + "flos": 19500607912320.0, + "grad_norm": 2.1485284032262086, + "language_loss": 0.76820493, + "learning_rate": 3.875972890659349e-06, + "loss": 0.78764486, + "num_input_tokens_seen": 50162500, + "step": 2317, + "time_per_iteration": 2.6501235961914062 + }, + { + "auxiliary_loss_clip": 0.01144052, + "auxiliary_loss_mlp": 0.01061042, + "balance_loss_clip": 1.05156648, + "balance_loss_mlp": 1.04074025, + "epoch": 0.13936569968435292, + "flos": 25410821690880.0, + "grad_norm": 1.7797832869421444, + "language_loss": 0.80185997, + "learning_rate": 3.875837839658139e-06, + "loss": 0.82391089, + "num_input_tokens_seen": 50182415, + "step": 2318, + "time_per_iteration": 2.7097995281219482 + }, + { + "auxiliary_loss_clip": 0.01049096, + "auxiliary_loss_mlp": 0.01048478, + "balance_loss_clip": 1.03358936, + "balance_loss_mlp": 1.04518783, + "epoch": 0.13942582293702088, + "flos": 70771063731840.0, + "grad_norm": 0.854553938374386, + "language_loss": 0.59004617, + "learning_rate": 3.87570271752497e-06, + "loss": 0.61102188, + "num_input_tokens_seen": 50245160, + "step": 2319, + "time_per_iteration": 3.2631640434265137 + }, + { + "auxiliary_loss_clip": 0.0111484, + "auxiliary_loss_mlp": 0.01055367, + "balance_loss_clip": 1.04508984, + "balance_loss_mlp": 1.03437412, + "epoch": 0.13948594618968888, + "flos": 35590885920000.0, + "grad_norm": 2.3313836691947722, + "language_loss": 0.64993447, + "learning_rate": 3.875567524264967e-06, + "loss": 0.67163646, + "num_input_tokens_seen": 50268215, + "step": 2320, + "time_per_iteration": 2.8668782711029053 + }, + { + "auxiliary_loss_clip": 0.01096421, + "auxiliary_loss_mlp": 0.01056652, + "balance_loss_clip": 1.04400086, + "balance_loss_mlp": 1.03521848, + "epoch": 0.13954606944235684, + "flos": 21105204272640.0, + "grad_norm": 2.285151015895421, + "language_loss": 0.70708811, + "learning_rate": 3.875432259883256e-06, + "loss": 0.72861886, + "num_input_tokens_seen": 50288575, + "step": 2321, + "time_per_iteration": 2.8273603916168213 + }, + { + "auxiliary_loss_clip": 0.01117698, + "auxiliary_loss_mlp": 0.01061754, + "balance_loss_clip": 1.04603076, + "balance_loss_mlp": 1.03698206, + "epoch": 0.1396061926950248, + "flos": 25044425009280.0, + "grad_norm": 1.7926270181208543, + "language_loss": 0.85931206, + "learning_rate": 3.875296924384965e-06, + "loss": 0.88110662, + "num_input_tokens_seen": 50308735, + "step": 2322, + "time_per_iteration": 2.833807945251465 + }, + { + "auxiliary_loss_clip": 0.01120545, + "auxiliary_loss_mlp": 0.01055036, + "balance_loss_clip": 1.04616976, + "balance_loss_mlp": 1.03568828, + "epoch": 0.13966631594769277, + "flos": 37634023428480.0, + "grad_norm": 1.5963293576391182, + "language_loss": 0.67159557, + "learning_rate": 3.875161517775226e-06, + "loss": 0.69335139, + "num_input_tokens_seen": 50331025, + "step": 2323, + "time_per_iteration": 2.875265121459961 + }, + { + "auxiliary_loss_clip": 0.01127992, + "auxiliary_loss_mlp": 0.01055173, + "balance_loss_clip": 1.04900301, + "balance_loss_mlp": 1.03432369, + "epoch": 0.13972643920036074, + "flos": 16690993061760.0, + "grad_norm": 2.0757452253793485, + "language_loss": 0.88878977, + "learning_rate": 3.875026040059175e-06, + "loss": 0.9106214, + "num_input_tokens_seen": 50349725, + "step": 2324, + "time_per_iteration": 2.6841063499450684 + }, + { + "auxiliary_loss_clip": 0.01154799, + "auxiliary_loss_mlp": 0.01056834, + "balance_loss_clip": 1.05145955, + "balance_loss_mlp": 1.03541231, + "epoch": 0.1397865624530287, + "flos": 23331055288320.0, + "grad_norm": 2.8450589371660526, + "language_loss": 0.70621002, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.72832638, + "num_input_tokens_seen": 50367965, + "step": 2325, + "time_per_iteration": 2.694218397140503 + }, + { + "auxiliary_loss_clip": 0.01134393, + "auxiliary_loss_mlp": 0.00778751, + "balance_loss_clip": 1.05273592, + "balance_loss_mlp": 1.00028229, + "epoch": 0.13984668570569667, + "flos": 22778317825920.0, + "grad_norm": 2.230299294128946, + "language_loss": 0.81657004, + "learning_rate": 3.874754871328688e-06, + "loss": 0.83570141, + "num_input_tokens_seen": 50385605, + "step": 2326, + "time_per_iteration": 2.715306282043457 + }, + { + "auxiliary_loss_clip": 0.01151297, + "auxiliary_loss_mlp": 0.01045813, + "balance_loss_clip": 1.05490732, + "balance_loss_mlp": 1.02745473, + "epoch": 0.13990680895836466, + "flos": 19464553635840.0, + "grad_norm": 1.729713540462037, + "language_loss": 0.89241689, + "learning_rate": 3.874619180324534e-06, + "loss": 0.91438794, + "num_input_tokens_seen": 50403985, + "step": 2327, + "time_per_iteration": 2.679626941680908 + }, + { + "auxiliary_loss_clip": 0.01119996, + "auxiliary_loss_mlp": 0.01057397, + "balance_loss_clip": 1.04873121, + "balance_loss_mlp": 1.0352242, + "epoch": 0.13996693221103262, + "flos": 20303283185280.0, + "grad_norm": 2.9217951598838363, + "language_loss": 0.84760427, + "learning_rate": 3.874483418234632e-06, + "loss": 0.86937821, + "num_input_tokens_seen": 50421590, + "step": 2328, + "time_per_iteration": 2.7277352809906006 + }, + { + "auxiliary_loss_clip": 0.01151775, + "auxiliary_loss_mlp": 0.0104443, + "balance_loss_clip": 1.05300856, + "balance_loss_mlp": 1.02421176, + "epoch": 0.1400270554637006, + "flos": 26617707688320.0, + "grad_norm": 1.6116398320348613, + "language_loss": 0.73835862, + "learning_rate": 3.874347585064131e-06, + "loss": 0.76032066, + "num_input_tokens_seen": 50443945, + "step": 2329, + "time_per_iteration": 2.6911025047302246 + }, + { + "auxiliary_loss_clip": 0.01153137, + "auxiliary_loss_mlp": 0.01046755, + "balance_loss_clip": 1.05254042, + "balance_loss_mlp": 1.02644169, + "epoch": 0.14008717871636855, + "flos": 19391475415680.0, + "grad_norm": 2.565670250114109, + "language_loss": 0.78373277, + "learning_rate": 3.874211680818183e-06, + "loss": 0.80573165, + "num_input_tokens_seen": 50462065, + "step": 2330, + "time_per_iteration": 2.703225612640381 + }, + { + "auxiliary_loss_clip": 0.01144455, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_clip": 1.05247569, + "balance_loss_mlp": 1.02692819, + "epoch": 0.14014730196903652, + "flos": 15304266645120.0, + "grad_norm": 2.2215524337864143, + "language_loss": 0.72115719, + "learning_rate": 3.87407570550194e-06, + "loss": 0.74306256, + "num_input_tokens_seen": 50479565, + "step": 2331, + "time_per_iteration": 2.7044217586517334 + }, + { + "auxiliary_loss_clip": 0.01159691, + "auxiliary_loss_mlp": 0.01051771, + "balance_loss_clip": 1.0558939, + "balance_loss_mlp": 1.03234017, + "epoch": 0.14020742522170448, + "flos": 14939701557120.0, + "grad_norm": 1.5806705357110964, + "language_loss": 0.72634697, + "learning_rate": 3.873939659120557e-06, + "loss": 0.7484616, + "num_input_tokens_seen": 50497305, + "step": 2332, + "time_per_iteration": 2.647564649581909 + }, + { + "auxiliary_loss_clip": 0.01063058, + "auxiliary_loss_mlp": 0.01022564, + "balance_loss_clip": 1.03391051, + "balance_loss_mlp": 1.01944101, + "epoch": 0.14026754847437245, + "flos": 48824580044160.0, + "grad_norm": 0.8445516092095569, + "language_loss": 0.56185365, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58270991, + "num_input_tokens_seen": 50549735, + "step": 2333, + "time_per_iteration": 3.038390636444092 + }, + { + "auxiliary_loss_clip": 0.01127793, + "auxiliary_loss_mlp": 0.01045888, + "balance_loss_clip": 1.05246043, + "balance_loss_mlp": 1.02587318, + "epoch": 0.14032767172704044, + "flos": 25773267876480.0, + "grad_norm": 1.7702774265545234, + "language_loss": 0.82728767, + "learning_rate": 3.873667353183016e-06, + "loss": 0.84902453, + "num_input_tokens_seen": 50570100, + "step": 2334, + "time_per_iteration": 2.7205803394317627 + }, + { + "auxiliary_loss_clip": 0.01129244, + "auxiliary_loss_mlp": 0.01044663, + "balance_loss_clip": 1.05110407, + "balance_loss_mlp": 1.02593565, + "epoch": 0.1403877949797084, + "flos": 21216312017280.0, + "grad_norm": 1.7790720657464538, + "language_loss": 0.80958998, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.83132899, + "num_input_tokens_seen": 50589185, + "step": 2335, + "time_per_iteration": 2.7844314575195312 + }, + { + "auxiliary_loss_clip": 0.01108373, + "auxiliary_loss_mlp": 0.0104374, + "balance_loss_clip": 1.04802513, + "balance_loss_mlp": 1.02160311, + "epoch": 0.14044791823237637, + "flos": 22747973811840.0, + "grad_norm": 1.739505291070366, + "language_loss": 0.81987065, + "learning_rate": 3.873394763046862e-06, + "loss": 0.84139174, + "num_input_tokens_seen": 50609645, + "step": 2336, + "time_per_iteration": 2.7787351608276367 + }, + { + "auxiliary_loss_clip": 0.01150445, + "auxiliary_loss_mlp": 0.01046319, + "balance_loss_clip": 1.05603921, + "balance_loss_mlp": 1.02709103, + "epoch": 0.14050804148504434, + "flos": 22964443125120.0, + "grad_norm": 1.7584048007565314, + "language_loss": 0.80606967, + "learning_rate": 3.873258361417225e-06, + "loss": 0.82803738, + "num_input_tokens_seen": 50628385, + "step": 2337, + "time_per_iteration": 2.6119275093078613 + }, + { + "auxiliary_loss_clip": 0.01150898, + "auxiliary_loss_mlp": 0.01051074, + "balance_loss_clip": 1.05363941, + "balance_loss_mlp": 1.03202438, + "epoch": 0.1405681647377123, + "flos": 22200336080640.0, + "grad_norm": 2.383737065589604, + "language_loss": 0.78994334, + "learning_rate": 3.873121888753442e-06, + "loss": 0.81196302, + "num_input_tokens_seen": 50647260, + "step": 2338, + "time_per_iteration": 2.672427177429199 + }, + { + "auxiliary_loss_clip": 0.01158377, + "auxiliary_loss_mlp": 0.01050168, + "balance_loss_clip": 1.05894089, + "balance_loss_mlp": 1.02919865, + "epoch": 0.14062828799038027, + "flos": 23732787974400.0, + "grad_norm": 2.117725014058833, + "language_loss": 0.79766536, + "learning_rate": 3.87298534506069e-06, + "loss": 0.81975079, + "num_input_tokens_seen": 50666130, + "step": 2339, + "time_per_iteration": 2.68635892868042 + }, + { + "auxiliary_loss_clip": 0.01097095, + "auxiliary_loss_mlp": 0.01065327, + "balance_loss_clip": 1.04686952, + "balance_loss_mlp": 1.04463232, + "epoch": 0.14068841124304826, + "flos": 39202493685120.0, + "grad_norm": 2.0269377249156793, + "language_loss": 0.65632963, + "learning_rate": 3.872848730344146e-06, + "loss": 0.67795384, + "num_input_tokens_seen": 50687440, + "step": 2340, + "time_per_iteration": 2.9426286220550537 + }, + { + "auxiliary_loss_clip": 0.0114865, + "auxiliary_loss_mlp": 0.01050723, + "balance_loss_clip": 1.05418086, + "balance_loss_mlp": 1.0310297, + "epoch": 0.14074853449571623, + "flos": 20192283181440.0, + "grad_norm": 2.8518792803213917, + "language_loss": 0.78760445, + "learning_rate": 3.87271204460899e-06, + "loss": 0.80959821, + "num_input_tokens_seen": 50704030, + "step": 2341, + "time_per_iteration": 2.8814899921417236 + }, + { + "auxiliary_loss_clip": 0.01162758, + "auxiliary_loss_mlp": 0.01057334, + "balance_loss_clip": 1.0554986, + "balance_loss_mlp": 1.03876162, + "epoch": 0.1408086577483842, + "flos": 18405871153920.0, + "grad_norm": 2.2693198584224454, + "language_loss": 0.80322361, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.82542449, + "num_input_tokens_seen": 50723305, + "step": 2342, + "time_per_iteration": 2.604814291000366 + }, + { + "auxiliary_loss_clip": 0.01152048, + "auxiliary_loss_mlp": 0.01056552, + "balance_loss_clip": 1.05776191, + "balance_loss_mlp": 1.03858757, + "epoch": 0.14086878100105216, + "flos": 25264593423360.0, + "grad_norm": 2.4727499245104343, + "language_loss": 0.77686632, + "learning_rate": 3.87243846010358e-06, + "loss": 0.79895234, + "num_input_tokens_seen": 50743270, + "step": 2343, + "time_per_iteration": 2.676823854446411 + }, + { + "auxiliary_loss_clip": 0.0105659, + "auxiliary_loss_mlp": 0.01037584, + "balance_loss_clip": 1.03650093, + "balance_loss_mlp": 1.03438878, + "epoch": 0.14092890425372012, + "flos": 65978388869760.0, + "grad_norm": 0.8521752699932517, + "language_loss": 0.61553669, + "learning_rate": 3.872301561343699e-06, + "loss": 0.63647842, + "num_input_tokens_seen": 50802710, + "step": 2344, + "time_per_iteration": 3.156792402267456 + }, + { + "auxiliary_loss_clip": 0.01147637, + "auxiliary_loss_mlp": 0.01049362, + "balance_loss_clip": 1.05167484, + "balance_loss_mlp": 1.03121877, + "epoch": 0.1409890275063881, + "flos": 23694973931520.0, + "grad_norm": 1.558783678159347, + "language_loss": 0.64331692, + "learning_rate": 3.872164591585956e-06, + "loss": 0.6652869, + "num_input_tokens_seen": 50822625, + "step": 2345, + "time_per_iteration": 2.654100179672241 + }, + { + "auxiliary_loss_clip": 0.01154879, + "auxiliary_loss_mlp": 0.0104633, + "balance_loss_clip": 1.05009735, + "balance_loss_mlp": 1.02562308, + "epoch": 0.14104915075905605, + "flos": 23623152687360.0, + "grad_norm": 2.26337760563351, + "language_loss": 0.73892581, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.76093793, + "num_input_tokens_seen": 50842330, + "step": 2346, + "time_per_iteration": 2.7032830715179443 + }, + { + "auxiliary_loss_clip": 0.0115447, + "auxiliary_loss_mlp": 0.01048793, + "balance_loss_clip": 1.0572027, + "balance_loss_mlp": 1.02929008, + "epoch": 0.14110927401172405, + "flos": 20595165102720.0, + "grad_norm": 1.7675181118684058, + "language_loss": 0.7727294, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.79476202, + "num_input_tokens_seen": 50861035, + "step": 2347, + "time_per_iteration": 2.678647518157959 + }, + { + "auxiliary_loss_clip": 0.01164131, + "auxiliary_loss_mlp": 0.01052088, + "balance_loss_clip": 1.05490732, + "balance_loss_mlp": 1.03370619, + "epoch": 0.141169397264392, + "flos": 28548049512960.0, + "grad_norm": 2.592464695784388, + "language_loss": 0.76753062, + "learning_rate": 3.8717532563775e-06, + "loss": 0.78969282, + "num_input_tokens_seen": 50880105, + "step": 2348, + "time_per_iteration": 2.7450597286224365 + }, + { + "auxiliary_loss_clip": 0.01147264, + "auxiliary_loss_mlp": 0.01042525, + "balance_loss_clip": 1.05267334, + "balance_loss_mlp": 1.02295136, + "epoch": 0.14122952051705998, + "flos": 17092258871040.0, + "grad_norm": 1.8617784303344698, + "language_loss": 0.86794335, + "learning_rate": 3.871616002680272e-06, + "loss": 0.8898412, + "num_input_tokens_seen": 50897720, + "step": 2349, + "time_per_iteration": 2.662508964538574 + }, + { + "auxiliary_loss_clip": 0.01150971, + "auxiliary_loss_mlp": 0.01048616, + "balance_loss_clip": 1.05632985, + "balance_loss_mlp": 1.02897048, + "epoch": 0.14128964376972794, + "flos": 28946801370240.0, + "grad_norm": 2.650060051711467, + "language_loss": 0.88758218, + "learning_rate": 3.871478678011177e-06, + "loss": 0.90957808, + "num_input_tokens_seen": 50918385, + "step": 2350, + "time_per_iteration": 4.1697962284088135 + }, + { + "auxiliary_loss_clip": 0.01142704, + "auxiliary_loss_mlp": 0.01045134, + "balance_loss_clip": 1.05369377, + "balance_loss_mlp": 1.02442729, + "epoch": 0.1413497670223959, + "flos": 18989778643200.0, + "grad_norm": 1.801090232061166, + "language_loss": 0.8094542, + "learning_rate": 3.871341282375423e-06, + "loss": 0.83133256, + "num_input_tokens_seen": 50938270, + "step": 2351, + "time_per_iteration": 2.6769907474517822 + }, + { + "auxiliary_loss_clip": 0.01149546, + "auxiliary_loss_mlp": 0.01040141, + "balance_loss_clip": 1.05100775, + "balance_loss_mlp": 1.02096045, + "epoch": 0.14140989027506387, + "flos": 29862236413440.0, + "grad_norm": 2.590933181784672, + "language_loss": 0.82796198, + "learning_rate": 3.871203815778219e-06, + "loss": 0.84985888, + "num_input_tokens_seen": 50958155, + "step": 2352, + "time_per_iteration": 5.713203430175781 + }, + { + "auxiliary_loss_clip": 0.01063742, + "auxiliary_loss_mlp": 0.01009803, + "balance_loss_clip": 1.03462291, + "balance_loss_mlp": 1.0060122, + "epoch": 0.14147001352773186, + "flos": 62079532041600.0, + "grad_norm": 0.9118003008214054, + "language_loss": 0.61876011, + "learning_rate": 3.87106627822478e-06, + "loss": 0.63949555, + "num_input_tokens_seen": 51020705, + "step": 2353, + "time_per_iteration": 3.1698319911956787 + }, + { + "auxiliary_loss_clip": 0.01134069, + "auxiliary_loss_mlp": 0.01049094, + "balance_loss_clip": 1.0536828, + "balance_loss_mlp": 1.03039002, + "epoch": 0.14153013678039983, + "flos": 22017514832640.0, + "grad_norm": 1.5909284402791886, + "language_loss": 0.87075388, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.89258552, + "num_input_tokens_seen": 51039995, + "step": 2354, + "time_per_iteration": 2.6781272888183594 + }, + { + "auxiliary_loss_clip": 0.01124592, + "auxiliary_loss_mlp": 0.0104583, + "balance_loss_clip": 1.0527302, + "balance_loss_mlp": 1.02562428, + "epoch": 0.1415902600330678, + "flos": 19720093968000.0, + "grad_norm": 2.035812967878614, + "language_loss": 0.74701214, + "learning_rate": 3.870790990270057e-06, + "loss": 0.76871634, + "num_input_tokens_seen": 51059075, + "step": 2355, + "time_per_iteration": 4.464852571487427 + }, + { + "auxiliary_loss_clip": 0.01062228, + "auxiliary_loss_mlp": 0.01003337, + "balance_loss_clip": 1.03320074, + "balance_loss_mlp": 0.99947417, + "epoch": 0.14165038328573576, + "flos": 65900929190400.0, + "grad_norm": 0.6801443738216844, + "language_loss": 0.51819825, + "learning_rate": 3.870653239879212e-06, + "loss": 0.53885388, + "num_input_tokens_seen": 51120380, + "step": 2356, + "time_per_iteration": 3.094026803970337 + }, + { + "auxiliary_loss_clip": 0.01165635, + "auxiliary_loss_mlp": 0.01057535, + "balance_loss_clip": 1.05662966, + "balance_loss_mlp": 1.0379492, + "epoch": 0.14171050653840372, + "flos": 12130158533760.0, + "grad_norm": 1.9928903491175036, + "language_loss": 0.70598352, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.72821522, + "num_input_tokens_seen": 51136950, + "step": 2357, + "time_per_iteration": 2.569486141204834 + }, + { + "auxiliary_loss_clip": 0.01117022, + "auxiliary_loss_mlp": 0.01054948, + "balance_loss_clip": 1.04706419, + "balance_loss_mlp": 1.0355413, + "epoch": 0.1417706297910717, + "flos": 20412487509120.0, + "grad_norm": 2.1046358800035234, + "language_loss": 0.82020235, + "learning_rate": 3.870377526296674e-06, + "loss": 0.84192204, + "num_input_tokens_seen": 51155175, + "step": 2358, + "time_per_iteration": 2.719344139099121 + }, + { + "auxiliary_loss_clip": 0.01145283, + "auxiliary_loss_mlp": 0.01050239, + "balance_loss_clip": 1.05257189, + "balance_loss_mlp": 1.02932954, + "epoch": 0.14183075304373965, + "flos": 22380607463040.0, + "grad_norm": 2.2336131404929787, + "language_loss": 0.71575904, + "learning_rate": 3.870239563115436e-06, + "loss": 0.73771417, + "num_input_tokens_seen": 51174500, + "step": 2359, + "time_per_iteration": 2.6914820671081543 + }, + { + "auxiliary_loss_clip": 0.0111529, + "auxiliary_loss_mlp": 0.007787, + "balance_loss_clip": 1.0526464, + "balance_loss_mlp": 1.00033379, + "epoch": 0.14189087629640765, + "flos": 21580913018880.0, + "grad_norm": 2.4314273775499906, + "language_loss": 0.7541784, + "learning_rate": 3.870101529014526e-06, + "loss": 0.77311832, + "num_input_tokens_seen": 51194270, + "step": 2360, + "time_per_iteration": 2.803493022918701 + }, + { + "auxiliary_loss_clip": 0.01108644, + "auxiliary_loss_mlp": 0.01053684, + "balance_loss_clip": 1.0491271, + "balance_loss_mlp": 1.03136814, + "epoch": 0.1419509995490756, + "flos": 20008564093440.0, + "grad_norm": 2.374719540518049, + "language_loss": 0.81920552, + "learning_rate": 3.869963423999178e-06, + "loss": 0.84082878, + "num_input_tokens_seen": 51211850, + "step": 2361, + "time_per_iteration": 2.8039920330047607 + }, + { + "auxiliary_loss_clip": 0.0115065, + "auxiliary_loss_mlp": 0.01057946, + "balance_loss_clip": 1.05230403, + "balance_loss_mlp": 1.03802609, + "epoch": 0.14201112280174358, + "flos": 31941464112000.0, + "grad_norm": 1.9397979109407166, + "language_loss": 0.74081504, + "learning_rate": 3.86982524807463e-06, + "loss": 0.76290095, + "num_input_tokens_seen": 51233545, + "step": 2362, + "time_per_iteration": 2.7272114753723145 + }, + { + "auxiliary_loss_clip": 0.0115354, + "auxiliary_loss_mlp": 0.01048321, + "balance_loss_clip": 1.05355787, + "balance_loss_mlp": 1.02861547, + "epoch": 0.14207124605441154, + "flos": 41464147582080.0, + "grad_norm": 1.7489521991344694, + "language_loss": 0.74221587, + "learning_rate": 3.869687001246122e-06, + "loss": 0.76423442, + "num_input_tokens_seen": 51257615, + "step": 2363, + "time_per_iteration": 2.789802312850952 + }, + { + "auxiliary_loss_clip": 0.01128802, + "auxiliary_loss_mlp": 0.0105205, + "balance_loss_clip": 1.04769099, + "balance_loss_mlp": 1.03180885, + "epoch": 0.1421313693070795, + "flos": 31905086613120.0, + "grad_norm": 1.7832713632097879, + "language_loss": 0.73034167, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.75215018, + "num_input_tokens_seen": 51279645, + "step": 2364, + "time_per_iteration": 2.8508312702178955 + }, + { + "auxiliary_loss_clip": 0.01142769, + "auxiliary_loss_mlp": 0.01049829, + "balance_loss_clip": 1.05160844, + "balance_loss_mlp": 1.03207827, + "epoch": 0.14219149255974747, + "flos": 26871165031680.0, + "grad_norm": 1.875477198706701, + "language_loss": 0.90395916, + "learning_rate": 3.869410294898195e-06, + "loss": 0.92588514, + "num_input_tokens_seen": 51299775, + "step": 2365, + "time_per_iteration": 2.6807806491851807 + }, + { + "auxiliary_loss_clip": 0.01127252, + "auxiliary_loss_mlp": 0.01054912, + "balance_loss_clip": 1.04759967, + "balance_loss_mlp": 1.03394318, + "epoch": 0.14225161581241544, + "flos": 27454426076160.0, + "grad_norm": 1.719218863067841, + "language_loss": 0.65305161, + "learning_rate": 3.869271835389268e-06, + "loss": 0.67487329, + "num_input_tokens_seen": 51319430, + "step": 2366, + "time_per_iteration": 2.7293641567230225 + }, + { + "auxiliary_loss_clip": 0.01143576, + "auxiliary_loss_mlp": 0.01051629, + "balance_loss_clip": 1.05218709, + "balance_loss_mlp": 1.03058839, + "epoch": 0.14231173906508343, + "flos": 10561436881920.0, + "grad_norm": 2.3740196514966256, + "language_loss": 0.80331928, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.82527137, + "num_input_tokens_seen": 51336045, + "step": 2367, + "time_per_iteration": 2.67529296875 + }, + { + "auxiliary_loss_clip": 0.01138517, + "auxiliary_loss_mlp": 0.01062653, + "balance_loss_clip": 1.05117869, + "balance_loss_mlp": 1.0402534, + "epoch": 0.1423718623177514, + "flos": 28360882719360.0, + "grad_norm": 2.0081973718426283, + "language_loss": 0.82346755, + "learning_rate": 3.868994703727742e-06, + "loss": 0.84547925, + "num_input_tokens_seen": 51357030, + "step": 2368, + "time_per_iteration": 2.7447288036346436 + }, + { + "auxiliary_loss_clip": 0.01122755, + "auxiliary_loss_mlp": 0.01052229, + "balance_loss_clip": 1.05180073, + "balance_loss_mlp": 1.03065228, + "epoch": 0.14243198557041936, + "flos": 19354235990400.0, + "grad_norm": 2.6586279461428144, + "language_loss": 0.8711772, + "learning_rate": 3.868856031585652e-06, + "loss": 0.89292705, + "num_input_tokens_seen": 51374890, + "step": 2369, + "time_per_iteration": 2.736872673034668 + }, + { + "auxiliary_loss_clip": 0.01127301, + "auxiliary_loss_mlp": 0.0104182, + "balance_loss_clip": 1.05011857, + "balance_loss_mlp": 1.02170992, + "epoch": 0.14249210882308733, + "flos": 28806857982720.0, + "grad_norm": 1.7900856007188275, + "language_loss": 0.75828248, + "learning_rate": 3.868717288576354e-06, + "loss": 0.77997375, + "num_input_tokens_seen": 51398100, + "step": 2370, + "time_per_iteration": 2.762603998184204 + }, + { + "auxiliary_loss_clip": 0.01158195, + "auxiliary_loss_mlp": 0.00781098, + "balance_loss_clip": 1.05268764, + "balance_loss_mlp": 1.00028419, + "epoch": 0.1425522320757553, + "flos": 21835016807040.0, + "grad_norm": 1.7770434161065212, + "language_loss": 0.82934797, + "learning_rate": 3.868578474705109e-06, + "loss": 0.84874088, + "num_input_tokens_seen": 51418745, + "step": 2371, + "time_per_iteration": 2.6224656105041504 + }, + { + "auxiliary_loss_clip": 0.01173447, + "auxiliary_loss_mlp": 0.0105718, + "balance_loss_clip": 1.05837953, + "balance_loss_mlp": 1.03638947, + "epoch": 0.14261235532842326, + "flos": 17311457617920.0, + "grad_norm": 2.0431625041319825, + "language_loss": 0.82982123, + "learning_rate": 3.868439589977181e-06, + "loss": 0.85212755, + "num_input_tokens_seen": 51437455, + "step": 2372, + "time_per_iteration": 2.575690269470215 + }, + { + "auxiliary_loss_clip": 0.01172196, + "auxiliary_loss_mlp": 0.0105022, + "balance_loss_clip": 1.0581125, + "balance_loss_mlp": 1.0285356, + "epoch": 0.14267247858109125, + "flos": 18806741913600.0, + "grad_norm": 3.3704326167450582, + "language_loss": 0.8438468, + "learning_rate": 3.868300634397836e-06, + "loss": 0.86607099, + "num_input_tokens_seen": 51455710, + "step": 2373, + "time_per_iteration": 2.7160356044769287 + }, + { + "auxiliary_loss_clip": 0.01141742, + "auxiliary_loss_mlp": 0.01055295, + "balance_loss_clip": 1.05160809, + "balance_loss_mlp": 1.03598261, + "epoch": 0.14273260183375922, + "flos": 11358904682880.0, + "grad_norm": 3.5035356392631836, + "language_loss": 0.86027539, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.88224572, + "num_input_tokens_seen": 51471270, + "step": 2374, + "time_per_iteration": 2.6845595836639404 + }, + { + "auxiliary_loss_clip": 0.01164623, + "auxiliary_loss_mlp": 0.01061957, + "balance_loss_clip": 1.05515146, + "balance_loss_mlp": 1.03996301, + "epoch": 0.14279272508642718, + "flos": 27567688636800.0, + "grad_norm": 1.6059368749673757, + "language_loss": 0.79169822, + "learning_rate": 3.868022510705977e-06, + "loss": 0.81396401, + "num_input_tokens_seen": 51492705, + "step": 2375, + "time_per_iteration": 2.738156795501709 + }, + { + "auxiliary_loss_clip": 0.01163115, + "auxiliary_loss_mlp": 0.01058224, + "balance_loss_clip": 1.05641222, + "balance_loss_mlp": 1.0368259, + "epoch": 0.14285284833909515, + "flos": 16252559654400.0, + "grad_norm": 2.559097553272684, + "language_loss": 0.76907504, + "learning_rate": 3.867883342604009e-06, + "loss": 0.79128844, + "num_input_tokens_seen": 51510780, + "step": 2376, + "time_per_iteration": 2.751178741455078 + }, + { + "auxiliary_loss_clip": 0.01160115, + "auxiliary_loss_mlp": 0.0105168, + "balance_loss_clip": 1.054515, + "balance_loss_mlp": 1.03040111, + "epoch": 0.1429129715917631, + "flos": 19755609540480.0, + "grad_norm": 2.7331999261828592, + "language_loss": 0.92795181, + "learning_rate": 3.867744103671717e-06, + "loss": 0.95006979, + "num_input_tokens_seen": 51531400, + "step": 2377, + "time_per_iteration": 2.6584725379943848 + }, + { + "auxiliary_loss_clip": 0.01147246, + "auxiliary_loss_mlp": 0.01061419, + "balance_loss_clip": 1.05362535, + "balance_loss_mlp": 1.03793442, + "epoch": 0.14297309484443108, + "flos": 21137092571520.0, + "grad_norm": 2.9252003733204894, + "language_loss": 0.91754365, + "learning_rate": 3.867604793914382e-06, + "loss": 0.93963027, + "num_input_tokens_seen": 51548215, + "step": 2378, + "time_per_iteration": 2.8107075691223145 + }, + { + "auxiliary_loss_clip": 0.01164153, + "auxiliary_loss_mlp": 0.0105303, + "balance_loss_clip": 1.05712187, + "balance_loss_mlp": 1.03092849, + "epoch": 0.14303321809709904, + "flos": 23586667447680.0, + "grad_norm": 2.1292902842232966, + "language_loss": 0.73961306, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.76178491, + "num_input_tokens_seen": 51566820, + "step": 2379, + "time_per_iteration": 2.7029881477355957 + }, + { + "auxiliary_loss_clip": 0.01137551, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_clip": 1.05204058, + "balance_loss_mlp": 1.0330174, + "epoch": 0.14309334134976703, + "flos": 15888281875200.0, + "grad_norm": 2.1898245228218784, + "language_loss": 0.78818595, + "learning_rate": 3.867325961945714e-06, + "loss": 0.81011152, + "num_input_tokens_seen": 51585075, + "step": 2380, + "time_per_iteration": 2.7213294506073 + }, + { + "auxiliary_loss_clip": 0.01126442, + "auxiliary_loss_mlp": 0.01057409, + "balance_loss_clip": 1.05457354, + "balance_loss_mlp": 1.03580785, + "epoch": 0.143153464602435, + "flos": 16325601960960.0, + "grad_norm": 4.699041640805274, + "language_loss": 0.87895483, + "learning_rate": 3.867186439744955e-06, + "loss": 0.90079331, + "num_input_tokens_seen": 51603185, + "step": 2381, + "time_per_iteration": 2.7144110202789307 + }, + { + "auxiliary_loss_clip": 0.01141327, + "auxiliary_loss_mlp": 0.01052708, + "balance_loss_clip": 1.05200005, + "balance_loss_mlp": 1.03088117, + "epoch": 0.14321358785510296, + "flos": 17092079303040.0, + "grad_norm": 2.47508592106904, + "language_loss": 0.76396096, + "learning_rate": 3.867046846740299e-06, + "loss": 0.78590137, + "num_input_tokens_seen": 51620880, + "step": 2382, + "time_per_iteration": 2.6185953617095947 + }, + { + "auxiliary_loss_clip": 0.01132222, + "auxiliary_loss_mlp": 0.01054019, + "balance_loss_clip": 1.05162048, + "balance_loss_mlp": 1.03319359, + "epoch": 0.14327371110777093, + "flos": 26322916769280.0, + "grad_norm": 4.3017095308344375, + "language_loss": 0.76636785, + "learning_rate": 3.866907182937039e-06, + "loss": 0.7882303, + "num_input_tokens_seen": 51640170, + "step": 2383, + "time_per_iteration": 2.7408525943756104 + }, + { + "auxiliary_loss_clip": 0.01139698, + "auxiliary_loss_mlp": 0.01052888, + "balance_loss_clip": 1.05078864, + "balance_loss_mlp": 1.02926064, + "epoch": 0.1433338343604389, + "flos": 18076462502400.0, + "grad_norm": 2.3526544982502284, + "language_loss": 0.87649417, + "learning_rate": 3.866767448340471e-06, + "loss": 0.8984201, + "num_input_tokens_seen": 51656580, + "step": 2384, + "time_per_iteration": 2.6798789501190186 + }, + { + "auxiliary_loss_clip": 0.01164805, + "auxiliary_loss_mlp": 0.01053206, + "balance_loss_clip": 1.05644679, + "balance_loss_mlp": 1.02985239, + "epoch": 0.14339395761310686, + "flos": 15522783033600.0, + "grad_norm": 2.6134761315069284, + "language_loss": 0.79340684, + "learning_rate": 3.866627642955895e-06, + "loss": 0.81558692, + "num_input_tokens_seen": 51674645, + "step": 2385, + "time_per_iteration": 2.5856544971466064 + }, + { + "auxiliary_loss_clip": 0.01156607, + "auxiliary_loss_mlp": 0.01042784, + "balance_loss_clip": 1.05148256, + "balance_loss_mlp": 1.02182722, + "epoch": 0.14345408086577485, + "flos": 28548767784960.0, + "grad_norm": 2.6990187663653247, + "language_loss": 0.74960196, + "learning_rate": 3.866487766788612e-06, + "loss": 0.77159584, + "num_input_tokens_seen": 51695770, + "step": 2386, + "time_per_iteration": 2.6670751571655273 + }, + { + "auxiliary_loss_clip": 0.01171639, + "auxiliary_loss_mlp": 0.01048096, + "balance_loss_clip": 1.05699563, + "balance_loss_mlp": 1.02733016, + "epoch": 0.14351420411844282, + "flos": 20230061310720.0, + "grad_norm": 2.299870083842227, + "language_loss": 0.78659731, + "learning_rate": 3.866347819843925e-06, + "loss": 0.80879462, + "num_input_tokens_seen": 51714165, + "step": 2387, + "time_per_iteration": 2.5805532932281494 + }, + { + "auxiliary_loss_clip": 0.01140581, + "auxiliary_loss_mlp": 0.01055299, + "balance_loss_clip": 1.05355716, + "balance_loss_mlp": 1.03317428, + "epoch": 0.14357432737111078, + "flos": 19865029345920.0, + "grad_norm": 6.554164509194222, + "language_loss": 0.82492924, + "learning_rate": 3.866207802127143e-06, + "loss": 0.84688807, + "num_input_tokens_seen": 51734440, + "step": 2388, + "time_per_iteration": 2.656609058380127 + }, + { + "auxiliary_loss_clip": 0.01155007, + "auxiliary_loss_mlp": 0.01047154, + "balance_loss_clip": 1.0537287, + "balance_loss_mlp": 1.02674508, + "epoch": 0.14363445062377875, + "flos": 28256814040320.0, + "grad_norm": 2.5973624291758655, + "language_loss": 0.82025754, + "learning_rate": 3.866067713643573e-06, + "loss": 0.84227914, + "num_input_tokens_seen": 51753730, + "step": 2389, + "time_per_iteration": 4.21793794631958 + }, + { + "auxiliary_loss_clip": 0.01145665, + "auxiliary_loss_mlp": 0.01046852, + "balance_loss_clip": 1.05107975, + "balance_loss_mlp": 1.02513266, + "epoch": 0.1436945738764467, + "flos": 18186672407040.0, + "grad_norm": 3.7970835440683097, + "language_loss": 0.83056784, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.85249299, + "num_input_tokens_seen": 51771195, + "step": 2390, + "time_per_iteration": 2.6859514713287354 + }, + { + "auxiliary_loss_clip": 0.01152608, + "auxiliary_loss_mlp": 0.01054404, + "balance_loss_clip": 1.05400729, + "balance_loss_mlp": 1.0334475, + "epoch": 0.14375469712911468, + "flos": 27307910499840.0, + "grad_norm": 1.8176612067028404, + "language_loss": 0.75018179, + "learning_rate": 3.865787324397324e-06, + "loss": 0.77225184, + "num_input_tokens_seen": 51792290, + "step": 2391, + "time_per_iteration": 5.726900577545166 + }, + { + "auxiliary_loss_clip": 0.01045505, + "auxiliary_loss_mlp": 0.01033342, + "balance_loss_clip": 1.03226101, + "balance_loss_mlp": 1.0303973, + "epoch": 0.14381482038178264, + "flos": 56891445287040.0, + "grad_norm": 0.8787809928903102, + "language_loss": 0.61848003, + "learning_rate": 3.865647023645277e-06, + "loss": 0.63926852, + "num_input_tokens_seen": 51843675, + "step": 2392, + "time_per_iteration": 3.113558053970337 + }, + { + "auxiliary_loss_clip": 0.01158698, + "auxiliary_loss_mlp": 0.01058807, + "balance_loss_clip": 1.05467868, + "balance_loss_mlp": 1.03608608, + "epoch": 0.14387494363445064, + "flos": 14282177143680.0, + "grad_norm": 2.718376715006273, + "language_loss": 0.77346605, + "learning_rate": 3.865506652147709e-06, + "loss": 0.79564106, + "num_input_tokens_seen": 51860285, + "step": 2393, + "time_per_iteration": 2.6578521728515625 + }, + { + "auxiliary_loss_clip": 0.0116951, + "auxiliary_loss_mlp": 0.01052986, + "balance_loss_clip": 1.05671048, + "balance_loss_mlp": 1.03287578, + "epoch": 0.1439350668871186, + "flos": 26761493831040.0, + "grad_norm": 5.715284956255472, + "language_loss": 0.76301813, + "learning_rate": 3.865366209909941e-06, + "loss": 0.78524309, + "num_input_tokens_seen": 51880105, + "step": 2394, + "time_per_iteration": 4.345217943191528 + }, + { + "auxiliary_loss_clip": 0.01165266, + "auxiliary_loss_mlp": 0.01053501, + "balance_loss_clip": 1.05325842, + "balance_loss_mlp": 1.03365326, + "epoch": 0.14399519013978657, + "flos": 40700040537600.0, + "grad_norm": 2.2496244390836893, + "language_loss": 0.85859704, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.88078463, + "num_input_tokens_seen": 51905175, + "step": 2395, + "time_per_iteration": 2.739717483520508 + }, + { + "auxiliary_loss_clip": 0.0112523, + "auxiliary_loss_mlp": 0.01051092, + "balance_loss_clip": 1.04946184, + "balance_loss_mlp": 1.028669, + "epoch": 0.14405531339245453, + "flos": 20557530627840.0, + "grad_norm": 4.117082508421602, + "language_loss": 0.82894099, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85070425, + "num_input_tokens_seen": 51924490, + "step": 2396, + "time_per_iteration": 2.686732053756714 + }, + { + "auxiliary_loss_clip": 0.01126754, + "auxiliary_loss_mlp": 0.00779833, + "balance_loss_clip": 1.04752374, + "balance_loss_mlp": 1.00036597, + "epoch": 0.1441154366451225, + "flos": 19572931946880.0, + "grad_norm": 6.956399779275871, + "language_loss": 0.82801461, + "learning_rate": 3.864944458808712e-06, + "loss": 0.84708053, + "num_input_tokens_seen": 51940490, + "step": 2397, + "time_per_iteration": 2.742809534072876 + }, + { + "auxiliary_loss_clip": 0.01168871, + "auxiliary_loss_mlp": 0.0104994, + "balance_loss_clip": 1.05485702, + "balance_loss_mlp": 1.02892387, + "epoch": 0.14417555989779046, + "flos": 18515721922560.0, + "grad_norm": 8.355198005975433, + "language_loss": 0.8001197, + "learning_rate": 3.86480373366343e-06, + "loss": 0.82230783, + "num_input_tokens_seen": 51957910, + "step": 2398, + "time_per_iteration": 2.573267936706543 + }, + { + "auxiliary_loss_clip": 0.01152449, + "auxiliary_loss_mlp": 0.01053407, + "balance_loss_clip": 1.05287588, + "balance_loss_mlp": 1.03336823, + "epoch": 0.14423568315045843, + "flos": 26031681296640.0, + "grad_norm": 3.294581575970509, + "language_loss": 0.64690518, + "learning_rate": 3.864662937804603e-06, + "loss": 0.66896379, + "num_input_tokens_seen": 51978010, + "step": 2399, + "time_per_iteration": 2.6831774711608887 + }, + { + "auxiliary_loss_clip": 0.01134916, + "auxiliary_loss_mlp": 0.01052493, + "balance_loss_clip": 1.04998159, + "balance_loss_mlp": 1.03119016, + "epoch": 0.14429580640312642, + "flos": 21288743792640.0, + "grad_norm": 3.586256880371596, + "language_loss": 0.82207137, + "learning_rate": 3.864522071237571e-06, + "loss": 0.84394544, + "num_input_tokens_seen": 51998515, + "step": 2400, + "time_per_iteration": 2.6812663078308105 + }, + { + "auxiliary_loss_clip": 0.01149983, + "auxiliary_loss_mlp": 0.01051884, + "balance_loss_clip": 1.0567503, + "balance_loss_mlp": 1.02954376, + "epoch": 0.14435592965579438, + "flos": 25627865621760.0, + "grad_norm": 2.3908005596579165, + "language_loss": 0.74217784, + "learning_rate": 3.864381133967676e-06, + "loss": 0.76419652, + "num_input_tokens_seen": 52019270, + "step": 2401, + "time_per_iteration": 2.773838520050049 + }, + { + "auxiliary_loss_clip": 0.01137207, + "auxiliary_loss_mlp": 0.01047592, + "balance_loss_clip": 1.05065656, + "balance_loss_mlp": 1.02671885, + "epoch": 0.14441605290846235, + "flos": 22965053656320.0, + "grad_norm": 2.616063077702737, + "language_loss": 0.80771816, + "learning_rate": 3.86424012600026e-06, + "loss": 0.82956612, + "num_input_tokens_seen": 52039315, + "step": 2402, + "time_per_iteration": 2.786031723022461 + }, + { + "auxiliary_loss_clip": 0.01120897, + "auxiliary_loss_mlp": 0.01052115, + "balance_loss_clip": 1.04718328, + "balance_loss_mlp": 1.02988231, + "epoch": 0.14447617616113032, + "flos": 17347655548800.0, + "grad_norm": 2.397935571801219, + "language_loss": 0.84159613, + "learning_rate": 3.864099047340673e-06, + "loss": 0.86332625, + "num_input_tokens_seen": 52056555, + "step": 2403, + "time_per_iteration": 2.8113911151885986 + }, + { + "auxiliary_loss_clip": 0.01129082, + "auxiliary_loss_mlp": 0.00783127, + "balance_loss_clip": 1.04854488, + "balance_loss_mlp": 1.00030184, + "epoch": 0.14453629941379828, + "flos": 24060185464320.0, + "grad_norm": 2.224282169770823, + "language_loss": 0.70142806, + "learning_rate": 3.863957897994262e-06, + "loss": 0.72055018, + "num_input_tokens_seen": 52075800, + "step": 2404, + "time_per_iteration": 2.7748003005981445 + }, + { + "auxiliary_loss_clip": 0.01144289, + "auxiliary_loss_mlp": 0.01051404, + "balance_loss_clip": 1.05279732, + "balance_loss_mlp": 1.03099549, + "epoch": 0.14459642266646625, + "flos": 14429554646400.0, + "grad_norm": 2.429117427076043, + "language_loss": 0.73179376, + "learning_rate": 3.863816677966381e-06, + "loss": 0.75375068, + "num_input_tokens_seen": 52092585, + "step": 2405, + "time_per_iteration": 2.7927868366241455 + }, + { + "auxiliary_loss_clip": 0.01108387, + "auxiliary_loss_mlp": 0.01054584, + "balance_loss_clip": 1.04661417, + "balance_loss_mlp": 1.0326612, + "epoch": 0.14465654591913424, + "flos": 9867032179200.0, + "grad_norm": 7.089523066959408, + "language_loss": 0.73039794, + "learning_rate": 3.863675387262386e-06, + "loss": 0.75202763, + "num_input_tokens_seen": 52108990, + "step": 2406, + "time_per_iteration": 2.742253303527832 + }, + { + "auxiliary_loss_clip": 0.01157268, + "auxiliary_loss_mlp": 0.01054465, + "balance_loss_clip": 1.05420268, + "balance_loss_mlp": 1.03198171, + "epoch": 0.1447166691718022, + "flos": 24972926987520.0, + "grad_norm": 5.383630788916188, + "language_loss": 0.75570732, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.77782464, + "num_input_tokens_seen": 52125385, + "step": 2407, + "time_per_iteration": 2.654636859893799 + }, + { + "auxiliary_loss_clip": 0.0116674, + "auxiliary_loss_mlp": 0.01054642, + "balance_loss_clip": 1.05440819, + "balance_loss_mlp": 1.03392315, + "epoch": 0.14477679242447017, + "flos": 21908023200000.0, + "grad_norm": 2.0240540465866146, + "language_loss": 0.79426706, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.81648088, + "num_input_tokens_seen": 52144985, + "step": 2408, + "time_per_iteration": 2.663611650466919 + }, + { + "auxiliary_loss_clip": 0.01155332, + "auxiliary_loss_mlp": 0.01053557, + "balance_loss_clip": 1.05411625, + "balance_loss_mlp": 1.03107429, + "epoch": 0.14483691567713813, + "flos": 20740746925440.0, + "grad_norm": 2.249858190268702, + "language_loss": 0.82188261, + "learning_rate": 3.863251091147299e-06, + "loss": 0.84397143, + "num_input_tokens_seen": 52163885, + "step": 2409, + "time_per_iteration": 2.6218342781066895 + }, + { + "auxiliary_loss_clip": 0.01116852, + "auxiliary_loss_mlp": 0.01065498, + "balance_loss_clip": 1.04859877, + "balance_loss_mlp": 1.04340839, + "epoch": 0.1448970389298061, + "flos": 35407705536000.0, + "grad_norm": 3.918408886138166, + "language_loss": 0.74477464, + "learning_rate": 3.863109517792446e-06, + "loss": 0.76659817, + "num_input_tokens_seen": 52184325, + "step": 2410, + "time_per_iteration": 2.8525002002716064 + }, + { + "auxiliary_loss_clip": 0.01166422, + "auxiliary_loss_mlp": 0.0105028, + "balance_loss_clip": 1.05447876, + "balance_loss_mlp": 1.0300622, + "epoch": 0.14495716218247406, + "flos": 15414368808960.0, + "grad_norm": 2.976325973684052, + "language_loss": 0.81616414, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.8383311, + "num_input_tokens_seen": 52202740, + "step": 2411, + "time_per_iteration": 2.580059051513672 + }, + { + "auxiliary_loss_clip": 0.01143671, + "auxiliary_loss_mlp": 0.01055066, + "balance_loss_clip": 1.05553794, + "balance_loss_mlp": 1.03366852, + "epoch": 0.14501728543514203, + "flos": 33693222493440.0, + "grad_norm": 2.049708152728223, + "language_loss": 0.69947547, + "learning_rate": 3.862826159140214e-06, + "loss": 0.72146285, + "num_input_tokens_seen": 52223100, + "step": 2412, + "time_per_iteration": 2.792389392852783 + }, + { + "auxiliary_loss_clip": 0.01153861, + "auxiliary_loss_mlp": 0.01047504, + "balance_loss_clip": 1.05600309, + "balance_loss_mlp": 1.02669024, + "epoch": 0.14507740868781002, + "flos": 15596112648960.0, + "grad_norm": 1.9741671649406984, + "language_loss": 0.76655865, + "learning_rate": 3.862684373853579e-06, + "loss": 0.78857231, + "num_input_tokens_seen": 52239690, + "step": 2413, + "time_per_iteration": 2.6535370349884033 + }, + { + "auxiliary_loss_clip": 0.01072879, + "auxiliary_loss_mlp": 0.01028499, + "balance_loss_clip": 1.04041791, + "balance_loss_mlp": 1.0252564, + "epoch": 0.145137531940478, + "flos": 66675343438080.0, + "grad_norm": 0.9047547971056389, + "language_loss": 0.58883119, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.60984492, + "num_input_tokens_seen": 52296705, + "step": 2414, + "time_per_iteration": 3.1230342388153076 + }, + { + "auxiliary_loss_clip": 0.01059489, + "auxiliary_loss_mlp": 0.01009718, + "balance_loss_clip": 1.03874373, + "balance_loss_mlp": 1.00692892, + "epoch": 0.14519765519314595, + "flos": 67521578929920.0, + "grad_norm": 0.8422279258983576, + "language_loss": 0.62171185, + "learning_rate": 3.862400591386154e-06, + "loss": 0.64240396, + "num_input_tokens_seen": 52361830, + "step": 2415, + "time_per_iteration": 3.1932270526885986 + }, + { + "auxiliary_loss_clip": 0.01151643, + "auxiliary_loss_mlp": 0.01046675, + "balance_loss_clip": 1.05383611, + "balance_loss_mlp": 1.02500319, + "epoch": 0.14525777844581392, + "flos": 17198913329280.0, + "grad_norm": 2.2913061581681036, + "language_loss": 0.71468806, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.73667121, + "num_input_tokens_seen": 52379420, + "step": 2416, + "time_per_iteration": 2.5892374515533447 + }, + { + "auxiliary_loss_clip": 0.01050816, + "auxiliary_loss_mlp": 0.010049, + "balance_loss_clip": 1.03675056, + "balance_loss_mlp": 1.00211036, + "epoch": 0.14531790169848188, + "flos": 65404609015680.0, + "grad_norm": 0.7147623603004897, + "language_loss": 0.6037569, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62431407, + "num_input_tokens_seen": 52446290, + "step": 2417, + "time_per_iteration": 3.3065359592437744 + }, + { + "auxiliary_loss_clip": 0.01168766, + "auxiliary_loss_mlp": 0.01053548, + "balance_loss_clip": 1.05357766, + "balance_loss_mlp": 1.03275824, + "epoch": 0.14537802495114985, + "flos": 32562467372160.0, + "grad_norm": 3.7032433533234346, + "language_loss": 0.78014368, + "learning_rate": 3.861974388030356e-06, + "loss": 0.80236679, + "num_input_tokens_seen": 52467295, + "step": 2418, + "time_per_iteration": 2.887986183166504 + }, + { + "auxiliary_loss_clip": 0.01114137, + "auxiliary_loss_mlp": 0.01049779, + "balance_loss_clip": 1.04354823, + "balance_loss_mlp": 1.02911985, + "epoch": 0.1454381482038178, + "flos": 20226685432320.0, + "grad_norm": 2.096300480609688, + "language_loss": 0.71208847, + "learning_rate": 3.861832179025394e-06, + "loss": 0.73372757, + "num_input_tokens_seen": 52487295, + "step": 2419, + "time_per_iteration": 2.764268636703491 + }, + { + "auxiliary_loss_clip": 0.01142427, + "auxiliary_loss_mlp": 0.01054976, + "balance_loss_clip": 1.05351484, + "balance_loss_mlp": 1.03300607, + "epoch": 0.1454982714564858, + "flos": 22893124671360.0, + "grad_norm": 2.414673655978061, + "language_loss": 0.89847761, + "learning_rate": 3.861689899419569e-06, + "loss": 0.92045164, + "num_input_tokens_seen": 52504220, + "step": 2420, + "time_per_iteration": 2.7500016689300537 + }, + { + "auxiliary_loss_clip": 0.01155004, + "auxiliary_loss_mlp": 0.01060929, + "balance_loss_clip": 1.05202007, + "balance_loss_mlp": 1.04072309, + "epoch": 0.14555839470915377, + "flos": 20229845829120.0, + "grad_norm": 2.0953123539002383, + "language_loss": 0.82278717, + "learning_rate": 3.861547549218276e-06, + "loss": 0.8449465, + "num_input_tokens_seen": 52521900, + "step": 2421, + "time_per_iteration": 2.672722816467285 + }, + { + "auxiliary_loss_clip": 0.01099277, + "auxiliary_loss_mlp": 0.01056793, + "balance_loss_clip": 1.04282439, + "balance_loss_mlp": 1.03507352, + "epoch": 0.14561851796182174, + "flos": 22236282616320.0, + "grad_norm": 1.667429152986229, + "language_loss": 0.81741488, + "learning_rate": 3.861405128426914e-06, + "loss": 0.83897555, + "num_input_tokens_seen": 52540495, + "step": 2422, + "time_per_iteration": 2.739992141723633 + }, + { + "auxiliary_loss_clip": 0.01031842, + "auxiliary_loss_mlp": 0.00760413, + "balance_loss_clip": 1.0271318, + "balance_loss_mlp": 1.00019872, + "epoch": 0.1456786412144897, + "flos": 52636786289280.0, + "grad_norm": 0.9102961670465963, + "language_loss": 0.63342595, + "learning_rate": 3.861262637050883e-06, + "loss": 0.65134847, + "num_input_tokens_seen": 52603305, + "step": 2423, + "time_per_iteration": 3.2704036235809326 + }, + { + "auxiliary_loss_clip": 0.01112855, + "auxiliary_loss_mlp": 0.00780065, + "balance_loss_clip": 1.05457556, + "balance_loss_mlp": 1.00038898, + "epoch": 0.14573876446715767, + "flos": 23221671396480.0, + "grad_norm": 2.2239460229896206, + "language_loss": 0.82163274, + "learning_rate": 3.861120075095585e-06, + "loss": 0.84056193, + "num_input_tokens_seen": 52623435, + "step": 2424, + "time_per_iteration": 2.7993249893188477 + }, + { + "auxiliary_loss_clip": 0.01141208, + "auxiliary_loss_mlp": 0.01069468, + "balance_loss_clip": 1.0535512, + "balance_loss_mlp": 1.0496788, + "epoch": 0.14579888771982563, + "flos": 18114384286080.0, + "grad_norm": 2.769336045727131, + "language_loss": 0.78602695, + "learning_rate": 3.860977442566429e-06, + "loss": 0.80813372, + "num_input_tokens_seen": 52642255, + "step": 2425, + "time_per_iteration": 2.698594093322754 + }, + { + "auxiliary_loss_clip": 0.01156078, + "auxiliary_loss_mlp": 0.01062133, + "balance_loss_clip": 1.05603778, + "balance_loss_mlp": 1.04148602, + "epoch": 0.14585901097249362, + "flos": 23001107932800.0, + "grad_norm": 50.77412231982301, + "language_loss": 0.83184898, + "learning_rate": 3.860834739468821e-06, + "loss": 0.85403109, + "num_input_tokens_seen": 52658700, + "step": 2426, + "time_per_iteration": 2.6948676109313965 + }, + { + "auxiliary_loss_clip": 0.01166642, + "auxiliary_loss_mlp": 0.01060596, + "balance_loss_clip": 1.05706, + "balance_loss_mlp": 1.04040194, + "epoch": 0.1459191342251616, + "flos": 21908669644800.0, + "grad_norm": 3.7420612082917475, + "language_loss": 0.87215799, + "learning_rate": 3.860691965808173e-06, + "loss": 0.8944304, + "num_input_tokens_seen": 52678140, + "step": 2427, + "time_per_iteration": 2.6479666233062744 + }, + { + "auxiliary_loss_clip": 0.01128634, + "auxiliary_loss_mlp": 0.01064346, + "balance_loss_clip": 1.04835391, + "balance_loss_mlp": 1.0405997, + "epoch": 0.14597925747782955, + "flos": 14975504438400.0, + "grad_norm": 1.9221483903926033, + "language_loss": 0.66815829, + "learning_rate": 3.8605491215899e-06, + "loss": 0.69008809, + "num_input_tokens_seen": 52696825, + "step": 2428, + "time_per_iteration": 2.6971306800842285 + }, + { + "auxiliary_loss_clip": 0.01155557, + "auxiliary_loss_mlp": 0.01059343, + "balance_loss_clip": 1.05335426, + "balance_loss_mlp": 1.03842235, + "epoch": 0.14603938073049752, + "flos": 21068898600960.0, + "grad_norm": 2.0918238083564242, + "language_loss": 0.83231717, + "learning_rate": 3.860406206819417e-06, + "loss": 0.8544662, + "num_input_tokens_seen": 52715125, + "step": 2429, + "time_per_iteration": 4.283279895782471 + }, + { + "auxiliary_loss_clip": 0.01120809, + "auxiliary_loss_mlp": 0.01053505, + "balance_loss_clip": 1.04625869, + "balance_loss_mlp": 1.03446746, + "epoch": 0.14609950398316549, + "flos": 19864777950720.0, + "grad_norm": 2.4559042296603746, + "language_loss": 0.79087842, + "learning_rate": 3.860263221502145e-06, + "loss": 0.81262159, + "num_input_tokens_seen": 52734015, + "step": 2430, + "time_per_iteration": 4.197890758514404 + }, + { + "auxiliary_loss_clip": 0.01170782, + "auxiliary_loss_mlp": 0.01061965, + "balance_loss_clip": 1.05820751, + "balance_loss_mlp": 1.04179525, + "epoch": 0.14615962723583345, + "flos": 22418852469120.0, + "grad_norm": 2.4376691278662506, + "language_loss": 0.82910693, + "learning_rate": 3.860120165643504e-06, + "loss": 0.85143435, + "num_input_tokens_seen": 52753025, + "step": 2431, + "time_per_iteration": 4.162708282470703 + }, + { + "auxiliary_loss_clip": 0.011607, + "auxiliary_loss_mlp": 0.01060112, + "balance_loss_clip": 1.05553937, + "balance_loss_mlp": 1.03853524, + "epoch": 0.14621975048850142, + "flos": 22346241125760.0, + "grad_norm": 2.881661839068268, + "language_loss": 0.78330141, + "learning_rate": 3.859977039248921e-06, + "loss": 0.80550951, + "num_input_tokens_seen": 52773420, + "step": 2432, + "time_per_iteration": 2.6907777786254883 + }, + { + "auxiliary_loss_clip": 0.01165399, + "auxiliary_loss_mlp": 0.00782861, + "balance_loss_clip": 1.05517077, + "balance_loss_mlp": 1.00040507, + "epoch": 0.1462798737411694, + "flos": 24389163152640.0, + "grad_norm": 2.3488382544651887, + "language_loss": 0.79515982, + "learning_rate": 3.859833842323822e-06, + "loss": 0.81464243, + "num_input_tokens_seen": 52792870, + "step": 2433, + "time_per_iteration": 2.719841241836548 + }, + { + "auxiliary_loss_clip": 0.01124303, + "auxiliary_loss_mlp": 0.01055776, + "balance_loss_clip": 1.05385411, + "balance_loss_mlp": 1.03484273, + "epoch": 0.14633999699383737, + "flos": 19244672530560.0, + "grad_norm": 2.0782880949269926, + "language_loss": 0.77905983, + "learning_rate": 3.859690574873638e-06, + "loss": 0.80086064, + "num_input_tokens_seen": 52811615, + "step": 2434, + "time_per_iteration": 4.371506929397583 + }, + { + "auxiliary_loss_clip": 0.01066282, + "auxiliary_loss_mlp": 0.01033141, + "balance_loss_clip": 1.05327988, + "balance_loss_mlp": 1.03022039, + "epoch": 0.14640012024650534, + "flos": 62660638270080.0, + "grad_norm": 0.8566726319617045, + "language_loss": 0.58453119, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60552537, + "num_input_tokens_seen": 52873230, + "step": 2435, + "time_per_iteration": 3.229882001876831 + }, + { + "auxiliary_loss_clip": 0.01160087, + "auxiliary_loss_mlp": 0.01045043, + "balance_loss_clip": 1.05263698, + "balance_loss_mlp": 1.0257076, + "epoch": 0.1464602434991733, + "flos": 12276243146880.0, + "grad_norm": 3.775553645712452, + "language_loss": 0.88436592, + "learning_rate": 3.859403828419744e-06, + "loss": 0.90641725, + "num_input_tokens_seen": 52889325, + "step": 2436, + "time_per_iteration": 2.568624973297119 + }, + { + "auxiliary_loss_clip": 0.011561, + "auxiliary_loss_mlp": 0.00780257, + "balance_loss_clip": 1.05587268, + "balance_loss_mlp": 1.00041819, + "epoch": 0.14652036675184127, + "flos": 20922311197440.0, + "grad_norm": 2.028718201913856, + "language_loss": 0.74904168, + "learning_rate": 3.85926034942691e-06, + "loss": 0.7684052, + "num_input_tokens_seen": 52909705, + "step": 2437, + "time_per_iteration": 2.6361188888549805 + }, + { + "auxiliary_loss_clip": 0.01165187, + "auxiliary_loss_mlp": 0.01050068, + "balance_loss_clip": 1.05295086, + "balance_loss_mlp": 1.02729869, + "epoch": 0.14658049000450923, + "flos": 27703681528320.0, + "grad_norm": 3.0822234004311033, + "language_loss": 0.73914421, + "learning_rate": 3.859116799930736e-06, + "loss": 0.76129669, + "num_input_tokens_seen": 52930300, + "step": 2438, + "time_per_iteration": 2.7590928077697754 + }, + { + "auxiliary_loss_clip": 0.01154571, + "auxiliary_loss_mlp": 0.01046509, + "balance_loss_clip": 1.05747688, + "balance_loss_mlp": 1.02708936, + "epoch": 0.14664061325717723, + "flos": 24936513575040.0, + "grad_norm": 4.476318678757457, + "language_loss": 0.74410725, + "learning_rate": 3.858973179936668e-06, + "loss": 0.76611805, + "num_input_tokens_seen": 52949955, + "step": 2439, + "time_per_iteration": 2.627037763595581 + }, + { + "auxiliary_loss_clip": 0.01152452, + "auxiliary_loss_mlp": 0.01051294, + "balance_loss_clip": 1.05477583, + "balance_loss_mlp": 1.0309453, + "epoch": 0.1467007365098452, + "flos": 40297661406720.0, + "grad_norm": 2.1583973700525343, + "language_loss": 0.74123728, + "learning_rate": 3.85882948945015e-06, + "loss": 0.76327467, + "num_input_tokens_seen": 52972905, + "step": 2440, + "time_per_iteration": 2.79715633392334 + }, + { + "auxiliary_loss_clip": 0.01160843, + "auxiliary_loss_mlp": 0.01044034, + "balance_loss_clip": 1.05471611, + "balance_loss_mlp": 1.02493691, + "epoch": 0.14676085976251316, + "flos": 26541074021760.0, + "grad_norm": 1.9756103236146798, + "language_loss": 0.82730794, + "learning_rate": 3.85868572847663e-06, + "loss": 0.84935671, + "num_input_tokens_seen": 52994850, + "step": 2441, + "time_per_iteration": 2.6505653858184814 + }, + { + "auxiliary_loss_clip": 0.01152605, + "auxiliary_loss_mlp": 0.01049175, + "balance_loss_clip": 1.05408478, + "balance_loss_mlp": 1.02796757, + "epoch": 0.14682098301518112, + "flos": 23550110380800.0, + "grad_norm": 2.582118236216862, + "language_loss": 0.71455544, + "learning_rate": 3.858541897021563e-06, + "loss": 0.73657322, + "num_input_tokens_seen": 53014740, + "step": 2442, + "time_per_iteration": 2.772648572921753 + }, + { + "auxiliary_loss_clip": 0.0113053, + "auxiliary_loss_mlp": 0.0104246, + "balance_loss_clip": 1.05283213, + "balance_loss_mlp": 1.02224207, + "epoch": 0.1468811062678491, + "flos": 11651073909120.0, + "grad_norm": 3.6780587187273155, + "language_loss": 0.81992352, + "learning_rate": 3.8583979950904e-06, + "loss": 0.84165335, + "num_input_tokens_seen": 53029780, + "step": 2443, + "time_per_iteration": 2.6979780197143555 + }, + { + "auxiliary_loss_clip": 0.01147138, + "auxiliary_loss_mlp": 0.0105693, + "balance_loss_clip": 1.05402422, + "balance_loss_mlp": 1.03474557, + "epoch": 0.14694122952051705, + "flos": 23002616304000.0, + "grad_norm": 3.190851099873364, + "language_loss": 0.83093917, + "learning_rate": 3.858254022688599e-06, + "loss": 0.85297978, + "num_input_tokens_seen": 53048620, + "step": 2444, + "time_per_iteration": 2.7177255153656006 + }, + { + "auxiliary_loss_clip": 0.01134628, + "auxiliary_loss_mlp": 0.01051986, + "balance_loss_clip": 1.05385137, + "balance_loss_mlp": 1.03213811, + "epoch": 0.14700135277318502, + "flos": 26502972670080.0, + "grad_norm": 3.1425569240832414, + "language_loss": 0.71183646, + "learning_rate": 3.85810997982162e-06, + "loss": 0.7337026, + "num_input_tokens_seen": 53070055, + "step": 2445, + "time_per_iteration": 2.735361099243164 + }, + { + "auxiliary_loss_clip": 0.01095177, + "auxiliary_loss_mlp": 0.01023118, + "balance_loss_clip": 1.05335557, + "balance_loss_mlp": 1.01999438, + "epoch": 0.147061476025853, + "flos": 59449434387840.0, + "grad_norm": 0.824990401786658, + "language_loss": 0.63083708, + "learning_rate": 3.857965866494923e-06, + "loss": 0.65202004, + "num_input_tokens_seen": 53126945, + "step": 2446, + "time_per_iteration": 3.0853025913238525 + }, + { + "auxiliary_loss_clip": 0.01120664, + "auxiliary_loss_mlp": 0.01045249, + "balance_loss_clip": 1.05621576, + "balance_loss_mlp": 1.02491164, + "epoch": 0.14712159927852098, + "flos": 28330897841280.0, + "grad_norm": 2.813052009295296, + "language_loss": 0.74895924, + "learning_rate": 3.857821682713975e-06, + "loss": 0.77061838, + "num_input_tokens_seen": 53149130, + "step": 2447, + "time_per_iteration": 2.858643054962158 + }, + { + "auxiliary_loss_clip": 0.01168929, + "auxiliary_loss_mlp": 0.01042907, + "balance_loss_clip": 1.0604012, + "balance_loss_mlp": 1.02383327, + "epoch": 0.14718172253118894, + "flos": 27089825074560.0, + "grad_norm": 2.2427639286159367, + "language_loss": 0.8528471, + "learning_rate": 3.857677428484242e-06, + "loss": 0.87496543, + "num_input_tokens_seen": 53167120, + "step": 2448, + "time_per_iteration": 2.699781894683838 + }, + { + "auxiliary_loss_clip": 0.01092169, + "auxiliary_loss_mlp": 0.01019616, + "balance_loss_clip": 1.05051064, + "balance_loss_mlp": 1.01654005, + "epoch": 0.1472418457838569, + "flos": 66706764860160.0, + "grad_norm": 0.7683837313264128, + "language_loss": 0.56829578, + "learning_rate": 3.857533103811195e-06, + "loss": 0.58941364, + "num_input_tokens_seen": 53227945, + "step": 2449, + "time_per_iteration": 3.1478211879730225 + }, + { + "auxiliary_loss_clip": 0.01135016, + "auxiliary_loss_mlp": 0.01050801, + "balance_loss_clip": 1.05464292, + "balance_loss_mlp": 1.03023791, + "epoch": 0.14730196903652487, + "flos": 19573578391680.0, + "grad_norm": 1.9048653074507311, + "language_loss": 0.85067344, + "learning_rate": 3.857388708700307e-06, + "loss": 0.87253165, + "num_input_tokens_seen": 53244615, + "step": 2450, + "time_per_iteration": 2.726008653640747 + }, + { + "auxiliary_loss_clip": 0.01158708, + "auxiliary_loss_mlp": 0.01049735, + "balance_loss_clip": 1.05984712, + "balance_loss_mlp": 1.02994645, + "epoch": 0.14736209228919284, + "flos": 16071031296000.0, + "grad_norm": 2.306043539040143, + "language_loss": 0.74523091, + "learning_rate": 3.857244243157052e-06, + "loss": 0.76731533, + "num_input_tokens_seen": 53262205, + "step": 2451, + "time_per_iteration": 2.641082286834717 + }, + { + "auxiliary_loss_clip": 0.01133915, + "auxiliary_loss_mlp": 0.01038458, + "balance_loss_clip": 1.05399728, + "balance_loss_mlp": 1.02031422, + "epoch": 0.1474222155418608, + "flos": 23039460679680.0, + "grad_norm": 1.8026547738986978, + "language_loss": 0.82384264, + "learning_rate": 3.85709970718691e-06, + "loss": 0.84556639, + "num_input_tokens_seen": 53282445, + "step": 2452, + "time_per_iteration": 2.7810096740722656 + }, + { + "auxiliary_loss_clip": 0.01101553, + "auxiliary_loss_mlp": 0.01041864, + "balance_loss_clip": 1.05924153, + "balance_loss_mlp": 1.0238874, + "epoch": 0.1474823387945288, + "flos": 17018641946880.0, + "grad_norm": 1.6675065143572472, + "language_loss": 0.74075705, + "learning_rate": 3.856955100795361e-06, + "loss": 0.76219124, + "num_input_tokens_seen": 53299060, + "step": 2453, + "time_per_iteration": 2.7913167476654053 + }, + { + "auxiliary_loss_clip": 0.01141798, + "auxiliary_loss_mlp": 0.0104607, + "balance_loss_clip": 1.05557632, + "balance_loss_mlp": 1.026353, + "epoch": 0.14754246204719676, + "flos": 17895041884800.0, + "grad_norm": 1.9958141581621542, + "language_loss": 0.7558704, + "learning_rate": 3.856810423987889e-06, + "loss": 0.77774906, + "num_input_tokens_seen": 53315970, + "step": 2454, + "time_per_iteration": 2.7199089527130127 + }, + { + "auxiliary_loss_clip": 0.01147348, + "auxiliary_loss_mlp": 0.01038134, + "balance_loss_clip": 1.05733335, + "balance_loss_mlp": 1.01864362, + "epoch": 0.14760258529986472, + "flos": 13079097987840.0, + "grad_norm": 2.0858167958418674, + "language_loss": 0.83077228, + "learning_rate": 3.856665676769979e-06, + "loss": 0.85262716, + "num_input_tokens_seen": 53332940, + "step": 2455, + "time_per_iteration": 2.75616192817688 + }, + { + "auxiliary_loss_clip": 0.01130504, + "auxiliary_loss_mlp": 0.01042951, + "balance_loss_clip": 1.05704689, + "balance_loss_mlp": 1.02452159, + "epoch": 0.1476627085525327, + "flos": 30806399358720.0, + "grad_norm": 2.3702229998953976, + "language_loss": 0.83881497, + "learning_rate": 3.85652085914712e-06, + "loss": 0.86054951, + "num_input_tokens_seen": 53353295, + "step": 2456, + "time_per_iteration": 2.7914254665374756 + }, + { + "auxiliary_loss_clip": 0.01154014, + "auxiliary_loss_mlp": 0.01043715, + "balance_loss_clip": 1.05863023, + "balance_loss_mlp": 1.02514231, + "epoch": 0.14772283180520066, + "flos": 21689434984320.0, + "grad_norm": 2.4172359629848996, + "language_loss": 0.84154665, + "learning_rate": 3.856375971124805e-06, + "loss": 0.86352402, + "num_input_tokens_seen": 53373410, + "step": 2457, + "time_per_iteration": 2.688265323638916 + }, + { + "auxiliary_loss_clip": 0.01155788, + "auxiliary_loss_mlp": 0.01042903, + "balance_loss_clip": 1.06250155, + "balance_loss_mlp": 1.02529585, + "epoch": 0.14778295505786862, + "flos": 18770400328320.0, + "grad_norm": 6.310680797376285, + "language_loss": 0.75692672, + "learning_rate": 3.856231012708527e-06, + "loss": 0.77891362, + "num_input_tokens_seen": 53391430, + "step": 2458, + "time_per_iteration": 2.698697805404663 + }, + { + "auxiliary_loss_clip": 0.01117404, + "auxiliary_loss_mlp": 0.01047753, + "balance_loss_clip": 1.05451179, + "balance_loss_mlp": 1.02718902, + "epoch": 0.1478430783105366, + "flos": 22893555634560.0, + "grad_norm": 3.1268711361266393, + "language_loss": 0.83348328, + "learning_rate": 3.856085983903782e-06, + "loss": 0.85513484, + "num_input_tokens_seen": 53409960, + "step": 2459, + "time_per_iteration": 2.790552854537964 + }, + { + "auxiliary_loss_clip": 0.01126767, + "auxiliary_loss_mlp": 0.01042293, + "balance_loss_clip": 1.05070424, + "balance_loss_mlp": 1.02435231, + "epoch": 0.14790320156320458, + "flos": 15085319293440.0, + "grad_norm": 3.1203941208753534, + "language_loss": 0.7554391, + "learning_rate": 3.855940884716071e-06, + "loss": 0.77712965, + "num_input_tokens_seen": 53426160, + "step": 2460, + "time_per_iteration": 2.815455675125122 + }, + { + "auxiliary_loss_clip": 0.01134117, + "auxiliary_loss_mlp": 0.01056838, + "balance_loss_clip": 1.05845904, + "balance_loss_mlp": 1.03770471, + "epoch": 0.14796332481587254, + "flos": 26504768350080.0, + "grad_norm": 3.59241393994, + "language_loss": 0.81227219, + "learning_rate": 3.855795715150896e-06, + "loss": 0.83418173, + "num_input_tokens_seen": 53448530, + "step": 2461, + "time_per_iteration": 2.785569190979004 + }, + { + "auxiliary_loss_clip": 0.01156748, + "auxiliary_loss_mlp": 0.01051178, + "balance_loss_clip": 1.05812359, + "balance_loss_mlp": 1.03044713, + "epoch": 0.1480234480685405, + "flos": 17563191108480.0, + "grad_norm": 3.2910626990147183, + "language_loss": 0.66117477, + "learning_rate": 3.855650475213761e-06, + "loss": 0.683254, + "num_input_tokens_seen": 53465915, + "step": 2462, + "time_per_iteration": 2.7222983837127686 + }, + { + "auxiliary_loss_clip": 0.01136035, + "auxiliary_loss_mlp": 0.01049537, + "balance_loss_clip": 1.05622339, + "balance_loss_mlp": 1.02965331, + "epoch": 0.14808357132120847, + "flos": 53582203232640.0, + "grad_norm": 1.8120706772856114, + "language_loss": 0.67226064, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.69411635, + "num_input_tokens_seen": 53496055, + "step": 2463, + "time_per_iteration": 3.0344398021698 + }, + { + "auxiliary_loss_clip": 0.01153077, + "auxiliary_loss_mlp": 0.01050435, + "balance_loss_clip": 1.05550933, + "balance_loss_mlp": 1.0307889, + "epoch": 0.14814369457387644, + "flos": 19829190551040.0, + "grad_norm": 1.9881580745750587, + "language_loss": 0.76870739, + "learning_rate": 3.855359784245646e-06, + "loss": 0.79074258, + "num_input_tokens_seen": 53513790, + "step": 2464, + "time_per_iteration": 2.69480037689209 + }, + { + "auxiliary_loss_clip": 0.01133748, + "auxiliary_loss_mlp": 0.01057139, + "balance_loss_clip": 1.05392432, + "balance_loss_mlp": 1.03769565, + "epoch": 0.1482038178265444, + "flos": 23914962777600.0, + "grad_norm": 1.8401367705559406, + "language_loss": 0.79628456, + "learning_rate": 3.855214333225688e-06, + "loss": 0.81819344, + "num_input_tokens_seen": 53533410, + "step": 2465, + "time_per_iteration": 2.6989939212799072 + }, + { + "auxiliary_loss_clip": 0.01170385, + "auxiliary_loss_mlp": 0.01054925, + "balance_loss_clip": 1.06119514, + "balance_loss_mlp": 1.03568494, + "epoch": 0.1482639410792124, + "flos": 24170503109760.0, + "grad_norm": 2.005541134809237, + "language_loss": 0.76272273, + "learning_rate": 3.855068811855817e-06, + "loss": 0.78497583, + "num_input_tokens_seen": 53554775, + "step": 2466, + "time_per_iteration": 2.646245002746582 + }, + { + "auxiliary_loss_clip": 0.01018939, + "auxiliary_loss_mlp": 0.0114331, + "balance_loss_clip": 1.03313899, + "balance_loss_mlp": 1.14004362, + "epoch": 0.14832406433188036, + "flos": 66191051341440.0, + "grad_norm": 0.8320983618395327, + "language_loss": 0.6004858, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62210834, + "num_input_tokens_seen": 53609675, + "step": 2467, + "time_per_iteration": 3.33776593208313 + }, + { + "auxiliary_loss_clip": 0.01141854, + "auxiliary_loss_mlp": 0.01044026, + "balance_loss_clip": 1.05437851, + "balance_loss_mlp": 1.02509522, + "epoch": 0.14838418758454833, + "flos": 25411252654080.0, + "grad_norm": 2.92694776694492, + "language_loss": 0.87666196, + "learning_rate": 3.85477755808841e-06, + "loss": 0.89852077, + "num_input_tokens_seen": 53626950, + "step": 2468, + "time_per_iteration": 4.266207456588745 + }, + { + "auxiliary_loss_clip": 0.01130189, + "auxiliary_loss_mlp": 0.01048186, + "balance_loss_clip": 1.05255163, + "balance_loss_mlp": 1.02782488, + "epoch": 0.1484443108372163, + "flos": 23289901280640.0, + "grad_norm": 2.2284173124426223, + "language_loss": 0.7598694, + "learning_rate": 3.854631825701919e-06, + "loss": 0.78165317, + "num_input_tokens_seen": 53644200, + "step": 2469, + "time_per_iteration": 4.217481851577759 + }, + { + "auxiliary_loss_clip": 0.01126269, + "auxiliary_loss_mlp": 0.0104139, + "balance_loss_clip": 1.05208421, + "balance_loss_mlp": 1.02251911, + "epoch": 0.14850443408988426, + "flos": 14647675985280.0, + "grad_norm": 6.591244267451795, + "language_loss": 0.75895017, + "learning_rate": 3.854486022987603e-06, + "loss": 0.78062677, + "num_input_tokens_seen": 53659650, + "step": 2470, + "time_per_iteration": 2.7157187461853027 + }, + { + "auxiliary_loss_clip": 0.01161157, + "auxiliary_loss_mlp": 0.01044729, + "balance_loss_clip": 1.05831027, + "balance_loss_mlp": 1.02571499, + "epoch": 0.14856455734255222, + "flos": 23548314700800.0, + "grad_norm": 1.8610043660805562, + "language_loss": 0.7215873, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.74364614, + "num_input_tokens_seen": 53680275, + "step": 2471, + "time_per_iteration": 4.162387132644653 + }, + { + "auxiliary_loss_clip": 0.01135244, + "auxiliary_loss_mlp": 0.01047611, + "balance_loss_clip": 1.05438995, + "balance_loss_mlp": 1.02717888, + "epoch": 0.1486246805952202, + "flos": 18077288515200.0, + "grad_norm": 1.979025280241548, + "language_loss": 0.89558828, + "learning_rate": 3.854194206597615e-06, + "loss": 0.91741687, + "num_input_tokens_seen": 53698270, + "step": 2472, + "time_per_iteration": 2.739457607269287 + }, + { + "auxiliary_loss_clip": 0.01134625, + "auxiliary_loss_mlp": 0.01049109, + "balance_loss_clip": 1.06334805, + "balance_loss_mlp": 1.02964163, + "epoch": 0.14868480384788818, + "flos": 19353625459200.0, + "grad_norm": 2.6029609251362764, + "language_loss": 0.80801564, + "learning_rate": 3.854048192933008e-06, + "loss": 0.82985294, + "num_input_tokens_seen": 53716845, + "step": 2473, + "time_per_iteration": 4.412883758544922 + }, + { + "auxiliary_loss_clip": 0.01161034, + "auxiliary_loss_mlp": 0.01051306, + "balance_loss_clip": 1.0626657, + "balance_loss_mlp": 1.03267312, + "epoch": 0.14874492710055615, + "flos": 22200192426240.0, + "grad_norm": 3.426519274325147, + "language_loss": 0.77372944, + "learning_rate": 3.853902108962709e-06, + "loss": 0.79585278, + "num_input_tokens_seen": 53734970, + "step": 2474, + "time_per_iteration": 2.6879520416259766 + }, + { + "auxiliary_loss_clip": 0.01124216, + "auxiliary_loss_mlp": 0.01059785, + "balance_loss_clip": 1.05597806, + "balance_loss_mlp": 1.04041362, + "epoch": 0.1488050503532241, + "flos": 21103444506240.0, + "grad_norm": 2.4771626433268734, + "language_loss": 0.82151824, + "learning_rate": 3.853755954692255e-06, + "loss": 0.84335828, + "num_input_tokens_seen": 53753415, + "step": 2475, + "time_per_iteration": 2.7828469276428223 + }, + { + "auxiliary_loss_clip": 0.01115855, + "auxiliary_loss_mlp": 0.01052322, + "balance_loss_clip": 1.0614953, + "balance_loss_mlp": 1.03341544, + "epoch": 0.14886517360589208, + "flos": 12786569625600.0, + "grad_norm": 1.9349243252831771, + "language_loss": 0.80917645, + "learning_rate": 3.85360973012719e-06, + "loss": 0.83085823, + "num_input_tokens_seen": 53770305, + "step": 2476, + "time_per_iteration": 2.7227590084075928 + }, + { + "auxiliary_loss_clip": 0.01156019, + "auxiliary_loss_mlp": 0.0105036, + "balance_loss_clip": 1.06338036, + "balance_loss_mlp": 1.03216898, + "epoch": 0.14892529685856004, + "flos": 29022860419200.0, + "grad_norm": 2.0032169897498346, + "language_loss": 0.77659523, + "learning_rate": 3.853463435273058e-06, + "loss": 0.79865897, + "num_input_tokens_seen": 53788895, + "step": 2477, + "time_per_iteration": 2.740241765975952 + }, + { + "auxiliary_loss_clip": 0.0110234, + "auxiliary_loss_mlp": 0.01092005, + "balance_loss_clip": 1.07879949, + "balance_loss_mlp": 1.08730817, + "epoch": 0.148985420111228, + "flos": 61926121054080.0, + "grad_norm": 0.8188153224748298, + "language_loss": 0.60153681, + "learning_rate": 3.853317070135407e-06, + "loss": 0.62348026, + "num_input_tokens_seen": 53850260, + "step": 2478, + "time_per_iteration": 3.2467947006225586 + }, + { + "auxiliary_loss_clip": 0.01107417, + "auxiliary_loss_mlp": 0.01048452, + "balance_loss_clip": 1.0516423, + "balance_loss_mlp": 1.03041577, + "epoch": 0.149045543363896, + "flos": 23915106432000.0, + "grad_norm": 2.666109649137694, + "language_loss": 0.7139731, + "learning_rate": 3.853170634719787e-06, + "loss": 0.73553181, + "num_input_tokens_seen": 53867520, + "step": 2479, + "time_per_iteration": 2.7973475456237793 + }, + { + "auxiliary_loss_clip": 0.01140551, + "auxiliary_loss_mlp": 0.01043104, + "balance_loss_clip": 1.05563831, + "balance_loss_mlp": 1.02407789, + "epoch": 0.14910566661656396, + "flos": 23654394541440.0, + "grad_norm": 1.7687137634424535, + "language_loss": 0.80758464, + "learning_rate": 3.853024129031751e-06, + "loss": 0.82942122, + "num_input_tokens_seen": 53886620, + "step": 2480, + "time_per_iteration": 2.7238829135894775 + }, + { + "auxiliary_loss_clip": 0.01138106, + "auxiliary_loss_mlp": 0.0104537, + "balance_loss_clip": 1.0584991, + "balance_loss_mlp": 1.02627277, + "epoch": 0.14916578986923193, + "flos": 20515299212160.0, + "grad_norm": 4.65741826395702, + "language_loss": 0.84375542, + "learning_rate": 3.852877553076854e-06, + "loss": 0.86559021, + "num_input_tokens_seen": 53902230, + "step": 2481, + "time_per_iteration": 2.791550874710083 + }, + { + "auxiliary_loss_clip": 0.01149484, + "auxiliary_loss_mlp": 0.01050268, + "balance_loss_clip": 1.05772805, + "balance_loss_mlp": 1.02948999, + "epoch": 0.1492259131218999, + "flos": 22491822948480.0, + "grad_norm": 8.035113387353048, + "language_loss": 0.77703977, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.79903734, + "num_input_tokens_seen": 53919475, + "step": 2482, + "time_per_iteration": 2.7310593128204346 + }, + { + "auxiliary_loss_clip": 0.01133163, + "auxiliary_loss_mlp": 0.01040426, + "balance_loss_clip": 1.05452228, + "balance_loss_mlp": 1.02032781, + "epoch": 0.14928603637456786, + "flos": 23185868515200.0, + "grad_norm": 2.207731010812049, + "language_loss": 0.78967929, + "learning_rate": 3.852584190388713e-06, + "loss": 0.81141514, + "num_input_tokens_seen": 53939150, + "step": 2483, + "time_per_iteration": 2.749671220779419 + }, + { + "auxiliary_loss_clip": 0.01154122, + "auxiliary_loss_mlp": 0.00776708, + "balance_loss_clip": 1.06144214, + "balance_loss_mlp": 1.00029397, + "epoch": 0.14934615962723582, + "flos": 21653237053440.0, + "grad_norm": 2.020127706544282, + "language_loss": 0.70361555, + "learning_rate": 3.852437403666595e-06, + "loss": 0.72292387, + "num_input_tokens_seen": 53958735, + "step": 2484, + "time_per_iteration": 2.737781524658203 + }, + { + "auxiliary_loss_clip": 0.01141919, + "auxiliary_loss_mlp": 0.00778215, + "balance_loss_clip": 1.05718136, + "balance_loss_mlp": 1.00030363, + "epoch": 0.1494062828799038, + "flos": 27010066924800.0, + "grad_norm": 2.165877689982274, + "language_loss": 0.84666765, + "learning_rate": 3.852290546699863e-06, + "loss": 0.86586899, + "num_input_tokens_seen": 53975065, + "step": 2485, + "time_per_iteration": 2.697976589202881 + }, + { + "auxiliary_loss_clip": 0.01145272, + "auxiliary_loss_mlp": 0.0104224, + "balance_loss_clip": 1.05639958, + "balance_loss_mlp": 1.02257001, + "epoch": 0.14946640613257178, + "flos": 21214947300480.0, + "grad_norm": 2.5229241908443023, + "language_loss": 0.8476423, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.86951739, + "num_input_tokens_seen": 53993330, + "step": 2486, + "time_per_iteration": 2.6799628734588623 + }, + { + "auxiliary_loss_clip": 0.01149031, + "auxiliary_loss_mlp": 0.01039312, + "balance_loss_clip": 1.05667424, + "balance_loss_mlp": 1.0230875, + "epoch": 0.14952652938523975, + "flos": 13370872164480.0, + "grad_norm": 2.1822908802725203, + "language_loss": 0.74762607, + "learning_rate": 3.851996622054842e-06, + "loss": 0.76950949, + "num_input_tokens_seen": 54010515, + "step": 2487, + "time_per_iteration": 2.8037290573120117 + }, + { + "auxiliary_loss_clip": 0.01153097, + "auxiliary_loss_mlp": 0.01044274, + "balance_loss_clip": 1.05934322, + "balance_loss_mlp": 1.02611899, + "epoch": 0.1495866526379077, + "flos": 35517699959040.0, + "grad_norm": 16.320028017118723, + "language_loss": 0.72210175, + "learning_rate": 3.8518495543877e-06, + "loss": 0.74407548, + "num_input_tokens_seen": 54031315, + "step": 2488, + "time_per_iteration": 2.8031094074249268 + }, + { + "auxiliary_loss_clip": 0.01137536, + "auxiliary_loss_mlp": 0.01054916, + "balance_loss_clip": 1.05569518, + "balance_loss_mlp": 1.03636682, + "epoch": 0.14964677589057568, + "flos": 17632749795840.0, + "grad_norm": 3.2458980886023143, + "language_loss": 0.71352434, + "learning_rate": 3.851702416498235e-06, + "loss": 0.73544884, + "num_input_tokens_seen": 54045965, + "step": 2489, + "time_per_iteration": 2.648883819580078 + }, + { + "auxiliary_loss_clip": 0.0113767, + "auxiliary_loss_mlp": 0.01052603, + "balance_loss_clip": 1.05376494, + "balance_loss_mlp": 1.03357768, + "epoch": 0.14970689914324364, + "flos": 20185280029440.0, + "grad_norm": 3.893198448080141, + "language_loss": 0.81559736, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.8375001, + "num_input_tokens_seen": 54059960, + "step": 2490, + "time_per_iteration": 2.702808380126953 + }, + { + "auxiliary_loss_clip": 0.01125097, + "auxiliary_loss_mlp": 0.01055928, + "balance_loss_clip": 1.05606139, + "balance_loss_mlp": 1.03803492, + "epoch": 0.1497670223959116, + "flos": 37228699382400.0, + "grad_norm": 1.9071281232744548, + "language_loss": 0.80057055, + "learning_rate": 3.851407930074666e-06, + "loss": 0.82238084, + "num_input_tokens_seen": 54079330, + "step": 2491, + "time_per_iteration": 2.833272933959961 + }, + { + "auxiliary_loss_clip": 0.01143407, + "auxiliary_loss_mlp": 0.01052558, + "balance_loss_clip": 1.05301452, + "balance_loss_mlp": 1.03195894, + "epoch": 0.1498271456485796, + "flos": 24455848752000.0, + "grad_norm": 2.3105790695512294, + "language_loss": 0.90820229, + "learning_rate": 3.851260581551727e-06, + "loss": 0.93016195, + "num_input_tokens_seen": 54097555, + "step": 2492, + "time_per_iteration": 2.684178352355957 + }, + { + "auxiliary_loss_clip": 0.01152331, + "auxiliary_loss_mlp": 0.01063543, + "balance_loss_clip": 1.05835843, + "balance_loss_mlp": 1.04508913, + "epoch": 0.14988726890124757, + "flos": 16253601148800.0, + "grad_norm": 6.881290297472923, + "language_loss": 0.79406559, + "learning_rate": 3.851113162828802e-06, + "loss": 0.81622434, + "num_input_tokens_seen": 54115600, + "step": 2493, + "time_per_iteration": 2.6558918952941895 + }, + { + "auxiliary_loss_clip": 0.0114858, + "auxiliary_loss_mlp": 0.01052018, + "balance_loss_clip": 1.05345511, + "balance_loss_mlp": 1.03258693, + "epoch": 0.14994739215391553, + "flos": 20666555383680.0, + "grad_norm": 2.3431247769189967, + "language_loss": 0.79894584, + "learning_rate": 3.85096567391148e-06, + "loss": 0.82095182, + "num_input_tokens_seen": 54135220, + "step": 2494, + "time_per_iteration": 2.6774168014526367 + }, + { + "auxiliary_loss_clip": 0.01137216, + "auxiliary_loss_mlp": 0.01050857, + "balance_loss_clip": 1.05474579, + "balance_loss_mlp": 1.03212965, + "epoch": 0.1500075154065835, + "flos": 70652375239680.0, + "grad_norm": 1.928941284350508, + "language_loss": 0.66480517, + "learning_rate": 3.850818114805354e-06, + "loss": 0.68668592, + "num_input_tokens_seen": 54161065, + "step": 2495, + "time_per_iteration": 3.1090729236602783 + }, + { + "auxiliary_loss_clip": 0.01103374, + "auxiliary_loss_mlp": 0.01038654, + "balance_loss_clip": 1.06896818, + "balance_loss_mlp": 1.03560257, + "epoch": 0.15006763865925146, + "flos": 68011937447040.0, + "grad_norm": 0.9030283421527312, + "language_loss": 0.59524739, + "learning_rate": 3.850670485516019e-06, + "loss": 0.61666763, + "num_input_tokens_seen": 54225095, + "step": 2496, + "time_per_iteration": 3.2250726222991943 + }, + { + "auxiliary_loss_clip": 0.01163934, + "auxiliary_loss_mlp": 0.01055725, + "balance_loss_clip": 1.05690169, + "balance_loss_mlp": 1.0360074, + "epoch": 0.15012776191191943, + "flos": 18916269459840.0, + "grad_norm": 3.063784198565679, + "language_loss": 0.65276247, + "learning_rate": 3.850522786049075e-06, + "loss": 0.67495906, + "num_input_tokens_seen": 54243750, + "step": 2497, + "time_per_iteration": 2.619946002960205 + }, + { + "auxiliary_loss_clip": 0.01125657, + "auxiliary_loss_mlp": 0.01054091, + "balance_loss_clip": 1.05308235, + "balance_loss_mlp": 1.03316998, + "epoch": 0.1501878851645874, + "flos": 23701330638720.0, + "grad_norm": 1.5552670947231086, + "language_loss": 0.75182658, + "learning_rate": 3.850375016410121e-06, + "loss": 0.77362406, + "num_input_tokens_seen": 54266185, + "step": 2498, + "time_per_iteration": 2.778163433074951 + }, + { + "auxiliary_loss_clip": 0.01132738, + "auxiliary_loss_mlp": 0.01046919, + "balance_loss_clip": 1.05919099, + "balance_loss_mlp": 1.02701163, + "epoch": 0.15024800841725539, + "flos": 20412523422720.0, + "grad_norm": 3.357364003851319, + "language_loss": 0.71821117, + "learning_rate": 3.850227176604761e-06, + "loss": 0.74000776, + "num_input_tokens_seen": 54283940, + "step": 2499, + "time_per_iteration": 2.6929259300231934 + }, + { + "auxiliary_loss_clip": 0.01134239, + "auxiliary_loss_mlp": 0.01051817, + "balance_loss_clip": 1.0547812, + "balance_loss_mlp": 1.03236222, + "epoch": 0.15030813166992335, + "flos": 31831002812160.0, + "grad_norm": 2.1406696998963652, + "language_loss": 0.7206136, + "learning_rate": 3.850079266638601e-06, + "loss": 0.7424742, + "num_input_tokens_seen": 54304830, + "step": 2500, + "time_per_iteration": 2.769988536834717 + }, + { + "auxiliary_loss_clip": 0.01134021, + "auxiliary_loss_mlp": 0.0105021, + "balance_loss_clip": 1.06063724, + "balance_loss_mlp": 1.03181624, + "epoch": 0.15036825492259132, + "flos": 35657822914560.0, + "grad_norm": 2.0251881980439306, + "language_loss": 0.65127194, + "learning_rate": 3.849931286517249e-06, + "loss": 0.6731143, + "num_input_tokens_seen": 54325595, + "step": 2501, + "time_per_iteration": 2.810945510864258 + }, + { + "auxiliary_loss_clip": 0.01137877, + "auxiliary_loss_mlp": 0.01055223, + "balance_loss_clip": 1.0541079, + "balance_loss_mlp": 1.03511274, + "epoch": 0.15042837817525928, + "flos": 18838163335680.0, + "grad_norm": 2.209666371186328, + "language_loss": 0.83401144, + "learning_rate": 3.849783236246318e-06, + "loss": 0.85594243, + "num_input_tokens_seen": 54342180, + "step": 2502, + "time_per_iteration": 2.6780545711517334 + }, + { + "auxiliary_loss_clip": 0.01122961, + "auxiliary_loss_mlp": 0.01049887, + "balance_loss_clip": 1.05318308, + "balance_loss_mlp": 1.0323875, + "epoch": 0.15048850142792725, + "flos": 19535548867200.0, + "grad_norm": 2.0319272128830947, + "language_loss": 0.77134645, + "learning_rate": 3.849635115831421e-06, + "loss": 0.79307491, + "num_input_tokens_seen": 54360255, + "step": 2503, + "time_per_iteration": 2.7579123973846436 + }, + { + "auxiliary_loss_clip": 0.01159116, + "auxiliary_loss_mlp": 0.01044094, + "balance_loss_clip": 1.05766046, + "balance_loss_mlp": 1.02692807, + "epoch": 0.1505486246805952, + "flos": 22017550746240.0, + "grad_norm": 1.9852139459946199, + "language_loss": 0.85514295, + "learning_rate": 3.849486925278176e-06, + "loss": 0.87717503, + "num_input_tokens_seen": 54378260, + "step": 2504, + "time_per_iteration": 2.631882905960083 + }, + { + "auxiliary_loss_clip": 0.01146113, + "auxiliary_loss_mlp": 0.01048035, + "balance_loss_clip": 1.05622697, + "balance_loss_mlp": 1.03098798, + "epoch": 0.15060874793326318, + "flos": 20743153136640.0, + "grad_norm": 1.8222645508164372, + "language_loss": 0.83178544, + "learning_rate": 3.8493386645922e-06, + "loss": 0.85372692, + "num_input_tokens_seen": 54399745, + "step": 2505, + "time_per_iteration": 2.7706007957458496 + }, + { + "auxiliary_loss_clip": 0.01125699, + "auxiliary_loss_mlp": 0.01053819, + "balance_loss_clip": 1.05586648, + "balance_loss_mlp": 1.03590202, + "epoch": 0.15066887118593117, + "flos": 16471902055680.0, + "grad_norm": 2.0148067518000445, + "language_loss": 0.76044405, + "learning_rate": 3.849190333779117e-06, + "loss": 0.7822392, + "num_input_tokens_seen": 54417105, + "step": 2506, + "time_per_iteration": 2.70989990234375 + }, + { + "auxiliary_loss_clip": 0.01165314, + "auxiliary_loss_mlp": 0.01041911, + "balance_loss_clip": 1.05785728, + "balance_loss_mlp": 1.02305174, + "epoch": 0.15072899443859913, + "flos": 19859319083520.0, + "grad_norm": 2.823460856599666, + "language_loss": 0.76220375, + "learning_rate": 3.849041932844552e-06, + "loss": 0.78427601, + "num_input_tokens_seen": 54433920, + "step": 2507, + "time_per_iteration": 2.5367634296417236 + }, + { + "auxiliary_loss_clip": 0.01144479, + "auxiliary_loss_mlp": 0.01041094, + "balance_loss_clip": 1.05261898, + "balance_loss_mlp": 1.02306986, + "epoch": 0.1507891176912671, + "flos": 20776226584320.0, + "grad_norm": 2.5197772895304906, + "language_loss": 0.68633789, + "learning_rate": 3.848893461794131e-06, + "loss": 0.70819366, + "num_input_tokens_seen": 54451540, + "step": 2508, + "time_per_iteration": 4.303388833999634 + }, + { + "auxiliary_loss_clip": 0.01130299, + "auxiliary_loss_mlp": 0.01046507, + "balance_loss_clip": 1.05477214, + "balance_loss_mlp": 1.02835178, + "epoch": 0.15084924094393506, + "flos": 23586631534080.0, + "grad_norm": 2.840517748098311, + "language_loss": 0.77994299, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.80171108, + "num_input_tokens_seen": 54470800, + "step": 2509, + "time_per_iteration": 4.380200147628784 + }, + { + "auxiliary_loss_clip": 0.01141335, + "auxiliary_loss_mlp": 0.00776843, + "balance_loss_clip": 1.05463386, + "balance_loss_mlp": 1.00027037, + "epoch": 0.15090936419660303, + "flos": 18911313383040.0, + "grad_norm": 2.53406994590866, + "language_loss": 0.79959804, + "learning_rate": 3.848596309368246e-06, + "loss": 0.81877983, + "num_input_tokens_seen": 54486525, + "step": 2510, + "time_per_iteration": 4.219487428665161 + }, + { + "auxiliary_loss_clip": 0.01150641, + "auxiliary_loss_mlp": 0.01047345, + "balance_loss_clip": 1.05529225, + "balance_loss_mlp": 1.02794981, + "epoch": 0.150969487449271, + "flos": 17928223073280.0, + "grad_norm": 1.8628702139594306, + "language_loss": 0.73398602, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.75596589, + "num_input_tokens_seen": 54503795, + "step": 2511, + "time_per_iteration": 2.62237811088562 + }, + { + "auxiliary_loss_clip": 0.01094269, + "auxiliary_loss_mlp": 0.0104236, + "balance_loss_clip": 1.04747009, + "balance_loss_mlp": 1.02365553, + "epoch": 0.151029610701939, + "flos": 24243078539520.0, + "grad_norm": 2.20399257021602, + "language_loss": 0.68716824, + "learning_rate": 3.848298876546534e-06, + "loss": 0.70853454, + "num_input_tokens_seen": 54523025, + "step": 2512, + "time_per_iteration": 2.823359489440918 + }, + { + "auxiliary_loss_clip": 0.01149398, + "auxiliary_loss_mlp": 0.01043296, + "balance_loss_clip": 1.05574036, + "balance_loss_mlp": 1.02615356, + "epoch": 0.15108973395460695, + "flos": 30262496641920.0, + "grad_norm": 2.6278607305338877, + "language_loss": 0.73833561, + "learning_rate": 3.84815005500134e-06, + "loss": 0.76026255, + "num_input_tokens_seen": 54545025, + "step": 2513, + "time_per_iteration": 4.386258602142334 + }, + { + "auxiliary_loss_clip": 0.01059691, + "auxiliary_loss_mlp": 0.01109321, + "balance_loss_clip": 1.0685482, + "balance_loss_mlp": 1.10529137, + "epoch": 0.15114985720727492, + "flos": 60437624428800.0, + "grad_norm": 0.9017688875456507, + "language_loss": 0.64720047, + "learning_rate": 3.84800116337411e-06, + "loss": 0.6688906, + "num_input_tokens_seen": 54604545, + "step": 2514, + "time_per_iteration": 3.254983425140381 + }, + { + "auxiliary_loss_clip": 0.01146323, + "auxiliary_loss_mlp": 0.0104352, + "balance_loss_clip": 1.05674648, + "balance_loss_mlp": 1.02584124, + "epoch": 0.15120998045994288, + "flos": 20521691832960.0, + "grad_norm": 3.178381755435586, + "language_loss": 0.72995645, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.7518549, + "num_input_tokens_seen": 54620590, + "step": 2515, + "time_per_iteration": 2.67921781539917 + }, + { + "auxiliary_loss_clip": 0.01133382, + "auxiliary_loss_mlp": 0.01040315, + "balance_loss_clip": 1.05675673, + "balance_loss_mlp": 1.02120531, + "epoch": 0.15127010371261085, + "flos": 21178893024000.0, + "grad_norm": 2.0712989062813243, + "language_loss": 0.7773214, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.79905832, + "num_input_tokens_seen": 54640410, + "step": 2516, + "time_per_iteration": 2.763467788696289 + }, + { + "auxiliary_loss_clip": 0.01087601, + "auxiliary_loss_mlp": 0.01004779, + "balance_loss_clip": 1.05344796, + "balance_loss_mlp": 1.00160813, + "epoch": 0.1513302269652788, + "flos": 65320648974720.0, + "grad_norm": 0.7270407819118658, + "language_loss": 0.54622567, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.56714946, + "num_input_tokens_seen": 54701430, + "step": 2517, + "time_per_iteration": 3.2293660640716553 + }, + { + "auxiliary_loss_clip": 0.01110142, + "auxiliary_loss_mlp": 0.0104362, + "balance_loss_clip": 1.04499209, + "balance_loss_mlp": 1.02427244, + "epoch": 0.15139035021794678, + "flos": 19135827342720.0, + "grad_norm": 3.035771526476276, + "language_loss": 0.78264821, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.80418587, + "num_input_tokens_seen": 54720845, + "step": 2518, + "time_per_iteration": 2.8154754638671875 + }, + { + "auxiliary_loss_clip": 0.01147342, + "auxiliary_loss_mlp": 0.01056368, + "balance_loss_clip": 1.05279088, + "balance_loss_mlp": 1.03681803, + "epoch": 0.15145047347061477, + "flos": 26578564842240.0, + "grad_norm": 2.1881526177791097, + "language_loss": 0.70480245, + "learning_rate": 3.847255654205137e-06, + "loss": 0.72683954, + "num_input_tokens_seen": 54740495, + "step": 2519, + "time_per_iteration": 2.7098515033721924 + }, + { + "auxiliary_loss_clip": 0.01152463, + "auxiliary_loss_mlp": 0.01056975, + "balance_loss_clip": 1.05683672, + "balance_loss_mlp": 1.03802037, + "epoch": 0.15151059672328274, + "flos": 20302959962880.0, + "grad_norm": 1.9048594994100874, + "language_loss": 0.78681207, + "learning_rate": 3.847106342204354e-06, + "loss": 0.80890644, + "num_input_tokens_seen": 54758415, + "step": 2520, + "time_per_iteration": 2.664187431335449 + }, + { + "auxiliary_loss_clip": 0.01140573, + "auxiliary_loss_mlp": 0.01071607, + "balance_loss_clip": 1.05435348, + "balance_loss_mlp": 1.05244994, + "epoch": 0.1515707199759507, + "flos": 27228367831680.0, + "grad_norm": 3.950911503454746, + "language_loss": 0.74849677, + "learning_rate": 3.846956960161114e-06, + "loss": 0.77061862, + "num_input_tokens_seen": 54779355, + "step": 2521, + "time_per_iteration": 2.7900772094726562 + }, + { + "auxiliary_loss_clip": 0.01132038, + "auxiliary_loss_mlp": 0.01055874, + "balance_loss_clip": 1.05052209, + "balance_loss_mlp": 1.0360136, + "epoch": 0.15163084322861867, + "flos": 23587349806080.0, + "grad_norm": 4.620979243079986, + "language_loss": 0.8253814, + "learning_rate": 3.84680750808108e-06, + "loss": 0.84726053, + "num_input_tokens_seen": 54799465, + "step": 2522, + "time_per_iteration": 2.7216525077819824 + }, + { + "auxiliary_loss_clip": 0.01051858, + "auxiliary_loss_mlp": 0.01048797, + "balance_loss_clip": 1.05645704, + "balance_loss_mlp": 1.04595995, + "epoch": 0.15169096648128663, + "flos": 66889622021760.0, + "grad_norm": 0.8362305181264502, + "language_loss": 0.57885599, + "learning_rate": 3.846657985969922e-06, + "loss": 0.59986252, + "num_input_tokens_seen": 54857665, + "step": 2523, + "time_per_iteration": 3.2375056743621826 + }, + { + "auxiliary_loss_clip": 0.0114147, + "auxiliary_loss_mlp": 0.01057964, + "balance_loss_clip": 1.05213499, + "balance_loss_mlp": 1.0368042, + "epoch": 0.1517510897339546, + "flos": 29095435848960.0, + "grad_norm": 1.8054087157705183, + "language_loss": 0.74795163, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.76994598, + "num_input_tokens_seen": 54879895, + "step": 2524, + "time_per_iteration": 2.711557388305664 + }, + { + "auxiliary_loss_clip": 0.01138185, + "auxiliary_loss_mlp": 0.01057236, + "balance_loss_clip": 1.05304718, + "balance_loss_mlp": 1.03865099, + "epoch": 0.1518112129866226, + "flos": 18406553512320.0, + "grad_norm": 1.8255227790100423, + "language_loss": 0.74631184, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.76826608, + "num_input_tokens_seen": 54898245, + "step": 2525, + "time_per_iteration": 2.6936984062194824 + }, + { + "auxiliary_loss_clip": 0.01144047, + "auxiliary_loss_mlp": 0.01057009, + "balance_loss_clip": 1.05403006, + "balance_loss_mlp": 1.03747034, + "epoch": 0.15187133623929056, + "flos": 19425410789760.0, + "grad_norm": 1.8907352833287865, + "language_loss": 0.79600316, + "learning_rate": 3.846208999506402e-06, + "loss": 0.81801373, + "num_input_tokens_seen": 54917060, + "step": 2526, + "time_per_iteration": 2.651494264602661 + }, + { + "auxiliary_loss_clip": 0.01135228, + "auxiliary_loss_mlp": 0.01047798, + "balance_loss_clip": 1.05538774, + "balance_loss_mlp": 1.03056002, + "epoch": 0.15193145949195852, + "flos": 17566207850880.0, + "grad_norm": 1.7677336965262924, + "language_loss": 0.8443349, + "learning_rate": 3.846059197327466e-06, + "loss": 0.86616516, + "num_input_tokens_seen": 54936365, + "step": 2527, + "time_per_iteration": 2.702683448791504 + }, + { + "auxiliary_loss_clip": 0.01124925, + "auxiliary_loss_mlp": 0.01049207, + "balance_loss_clip": 1.04976487, + "balance_loss_mlp": 1.02985954, + "epoch": 0.15199158274462649, + "flos": 36176265866880.0, + "grad_norm": 1.85678489681458, + "language_loss": 0.69361663, + "learning_rate": 3.845909325145779e-06, + "loss": 0.7153579, + "num_input_tokens_seen": 54961365, + "step": 2528, + "time_per_iteration": 2.9250690937042236 + }, + { + "auxiliary_loss_clip": 0.01134092, + "auxiliary_loss_mlp": 0.01055056, + "balance_loss_clip": 1.05266535, + "balance_loss_mlp": 1.03587484, + "epoch": 0.15205170599729445, + "flos": 23074042498560.0, + "grad_norm": 2.004144148858156, + "language_loss": 0.86482549, + "learning_rate": 3.845759382967026e-06, + "loss": 0.88671696, + "num_input_tokens_seen": 54980750, + "step": 2529, + "time_per_iteration": 2.7277863025665283 + }, + { + "auxiliary_loss_clip": 0.01124798, + "auxiliary_loss_mlp": 0.01041651, + "balance_loss_clip": 1.05046487, + "balance_loss_mlp": 1.02297091, + "epoch": 0.15211182924996242, + "flos": 21908382336000.0, + "grad_norm": 2.544775548600603, + "language_loss": 0.83399373, + "learning_rate": 3.845609370796893e-06, + "loss": 0.85565823, + "num_input_tokens_seen": 54999675, + "step": 2530, + "time_per_iteration": 2.8717291355133057 + }, + { + "auxiliary_loss_clip": 0.01125761, + "auxiliary_loss_mlp": 0.01048121, + "balance_loss_clip": 1.05035281, + "balance_loss_mlp": 1.02940559, + "epoch": 0.15217195250263038, + "flos": 13881521865600.0, + "grad_norm": 2.1410437006568723, + "language_loss": 0.80404246, + "learning_rate": 3.845459288641066e-06, + "loss": 0.82578129, + "num_input_tokens_seen": 55018295, + "step": 2531, + "time_per_iteration": 2.8444995880126953 + }, + { + "auxiliary_loss_clip": 0.01143114, + "auxiliary_loss_mlp": 0.01043494, + "balance_loss_clip": 1.05216551, + "balance_loss_mlp": 1.02613723, + "epoch": 0.15223207575529837, + "flos": 24535319592960.0, + "grad_norm": 1.7922494378130023, + "language_loss": 0.78874445, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.81061059, + "num_input_tokens_seen": 55037975, + "step": 2532, + "time_per_iteration": 2.9122390747070312 + }, + { + "auxiliary_loss_clip": 0.01149502, + "auxiliary_loss_mlp": 0.0104596, + "balance_loss_clip": 1.05737543, + "balance_loss_mlp": 1.02676702, + "epoch": 0.15229219900796634, + "flos": 25556798563200.0, + "grad_norm": 1.9533698136575197, + "language_loss": 0.87679356, + "learning_rate": 3.845158914395105e-06, + "loss": 0.89874816, + "num_input_tokens_seen": 55057135, + "step": 2533, + "time_per_iteration": 2.7987985610961914 + }, + { + "auxiliary_loss_clip": 0.01117955, + "auxiliary_loss_mlp": 0.01048672, + "balance_loss_clip": 1.05235386, + "balance_loss_mlp": 1.02983665, + "epoch": 0.1523523222606343, + "flos": 18217806520320.0, + "grad_norm": 2.391026063452041, + "language_loss": 0.78886449, + "learning_rate": 3.84500862231636e-06, + "loss": 0.81053078, + "num_input_tokens_seen": 55075525, + "step": 2534, + "time_per_iteration": 2.7587406635284424 + }, + { + "auxiliary_loss_clip": 0.01164218, + "auxiliary_loss_mlp": 0.0104722, + "balance_loss_clip": 1.05609345, + "balance_loss_mlp": 1.0270381, + "epoch": 0.15241244551330227, + "flos": 13260087642240.0, + "grad_norm": 2.689732363294508, + "language_loss": 0.76809752, + "learning_rate": 3.844858260274702e-06, + "loss": 0.79021192, + "num_input_tokens_seen": 55090845, + "step": 2535, + "time_per_iteration": 2.7494406700134277 + }, + { + "auxiliary_loss_clip": 0.01142628, + "auxiliary_loss_mlp": 0.01042905, + "balance_loss_clip": 1.05345285, + "balance_loss_mlp": 1.02401042, + "epoch": 0.15247256876597023, + "flos": 19715568854400.0, + "grad_norm": 2.2235871255319446, + "language_loss": 0.78301942, + "learning_rate": 3.844707828275835e-06, + "loss": 0.80487478, + "num_input_tokens_seen": 55108750, + "step": 2536, + "time_per_iteration": 2.738638401031494 + }, + { + "auxiliary_loss_clip": 0.01128919, + "auxiliary_loss_mlp": 0.0105368, + "balance_loss_clip": 1.05349088, + "balance_loss_mlp": 1.03497589, + "epoch": 0.1525326920186382, + "flos": 20375858615040.0, + "grad_norm": 2.311649941233105, + "language_loss": 0.75824189, + "learning_rate": 3.844557326325461e-06, + "loss": 0.78006792, + "num_input_tokens_seen": 55126750, + "step": 2537, + "time_per_iteration": 2.632373809814453 + }, + { + "auxiliary_loss_clip": 0.0114911, + "auxiliary_loss_mlp": 0.01041421, + "balance_loss_clip": 1.05675745, + "balance_loss_mlp": 1.02331281, + "epoch": 0.15259281527130616, + "flos": 13589963170560.0, + "grad_norm": 2.193148723631548, + "language_loss": 0.77737647, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.79928178, + "num_input_tokens_seen": 55144690, + "step": 2538, + "time_per_iteration": 2.6835639476776123 + }, + { + "auxiliary_loss_clip": 0.01109367, + "auxiliary_loss_mlp": 0.01042256, + "balance_loss_clip": 1.05477905, + "balance_loss_mlp": 1.02480412, + "epoch": 0.15265293852397416, + "flos": 22860374446080.0, + "grad_norm": 2.951423477379744, + "language_loss": 0.89502335, + "learning_rate": 3.844256112593029e-06, + "loss": 0.91653961, + "num_input_tokens_seen": 55166055, + "step": 2539, + "time_per_iteration": 2.7825794219970703 + }, + { + "auxiliary_loss_clip": 0.01142581, + "auxiliary_loss_mlp": 0.01045856, + "balance_loss_clip": 1.05367279, + "balance_loss_mlp": 1.02721143, + "epoch": 0.15271306177664212, + "flos": 29238108670080.0, + "grad_norm": 2.1073423273657044, + "language_loss": 0.93423879, + "learning_rate": 3.844105400822391e-06, + "loss": 0.95612311, + "num_input_tokens_seen": 55186285, + "step": 2540, + "time_per_iteration": 2.717541456222534 + }, + { + "auxiliary_loss_clip": 0.01131603, + "auxiliary_loss_mlp": 0.01041863, + "balance_loss_clip": 1.05122495, + "balance_loss_mlp": 1.0240885, + "epoch": 0.1527731850293101, + "flos": 31246269310080.0, + "grad_norm": 2.084754505375857, + "language_loss": 0.75217843, + "learning_rate": 3.843954619123092e-06, + "loss": 0.77391309, + "num_input_tokens_seen": 55207915, + "step": 2541, + "time_per_iteration": 2.8376123905181885 + }, + { + "auxiliary_loss_clip": 0.01116303, + "auxiliary_loss_mlp": 0.01045227, + "balance_loss_clip": 1.04877007, + "balance_loss_mlp": 1.0268805, + "epoch": 0.15283330828197805, + "flos": 22382079920640.0, + "grad_norm": 2.037290364787748, + "language_loss": 0.80996066, + "learning_rate": 3.84380376750085e-06, + "loss": 0.83157599, + "num_input_tokens_seen": 55227860, + "step": 2542, + "time_per_iteration": 2.7110376358032227 + }, + { + "auxiliary_loss_clip": 0.01160331, + "auxiliary_loss_mlp": 0.01048661, + "balance_loss_clip": 1.0566076, + "balance_loss_mlp": 1.02992105, + "epoch": 0.15289343153464602, + "flos": 25520133755520.0, + "grad_norm": 3.2152362880248857, + "language_loss": 0.77796149, + "learning_rate": 3.843652845961383e-06, + "loss": 0.80005145, + "num_input_tokens_seen": 55247330, + "step": 2543, + "time_per_iteration": 2.674131155014038 + }, + { + "auxiliary_loss_clip": 0.01145565, + "auxiliary_loss_mlp": 0.01042133, + "balance_loss_clip": 1.05380869, + "balance_loss_mlp": 1.02388239, + "epoch": 0.15295355478731398, + "flos": 22710016114560.0, + "grad_norm": 2.4890924021550918, + "language_loss": 0.85898137, + "learning_rate": 3.843501854510416e-06, + "loss": 0.88085836, + "num_input_tokens_seen": 55266195, + "step": 2544, + "time_per_iteration": 2.685840606689453 + }, + { + "auxiliary_loss_clip": 0.01149904, + "auxiliary_loss_mlp": 0.01051141, + "balance_loss_clip": 1.05162692, + "balance_loss_mlp": 1.03061318, + "epoch": 0.15301367803998198, + "flos": 23251907669760.0, + "grad_norm": 1.9817931887295275, + "language_loss": 0.83159137, + "learning_rate": 3.843350793153673e-06, + "loss": 0.85360181, + "num_input_tokens_seen": 55283305, + "step": 2545, + "time_per_iteration": 2.7415812015533447 + }, + { + "auxiliary_loss_clip": 0.01158976, + "auxiliary_loss_mlp": 0.01040888, + "balance_loss_clip": 1.05556524, + "balance_loss_mlp": 1.02257705, + "epoch": 0.15307380129264994, + "flos": 25886279041920.0, + "grad_norm": 6.0131413628182, + "language_loss": 0.71669161, + "learning_rate": 3.843199661896884e-06, + "loss": 0.73869026, + "num_input_tokens_seen": 55303035, + "step": 2546, + "time_per_iteration": 2.6626265048980713 + }, + { + "auxiliary_loss_clip": 0.01130357, + "auxiliary_loss_mlp": 0.01047635, + "balance_loss_clip": 1.05013335, + "balance_loss_mlp": 1.02688098, + "epoch": 0.1531339245453179, + "flos": 46973239205760.0, + "grad_norm": 1.6563553629779504, + "language_loss": 0.77438712, + "learning_rate": 3.843048460745779e-06, + "loss": 0.79616702, + "num_input_tokens_seen": 55327570, + "step": 2547, + "time_per_iteration": 4.451423168182373 + }, + { + "auxiliary_loss_clip": 0.01107553, + "auxiliary_loss_mlp": 0.01044692, + "balance_loss_clip": 1.04845536, + "balance_loss_mlp": 1.02517736, + "epoch": 0.15319404779798587, + "flos": 35882049565440.0, + "grad_norm": 2.3544675813743834, + "language_loss": 0.74357474, + "learning_rate": 3.842897189706092e-06, + "loss": 0.7650972, + "num_input_tokens_seen": 55351090, + "step": 2548, + "time_per_iteration": 2.846991539001465 + }, + { + "auxiliary_loss_clip": 0.01138346, + "auxiliary_loss_mlp": 0.0105294, + "balance_loss_clip": 1.05340147, + "balance_loss_mlp": 1.03304434, + "epoch": 0.15325417105065384, + "flos": 25664638170240.0, + "grad_norm": 1.446042531021912, + "language_loss": 0.80296385, + "learning_rate": 3.842745848783558e-06, + "loss": 0.82487667, + "num_input_tokens_seen": 55371050, + "step": 2549, + "time_per_iteration": 5.8849101066589355 + }, + { + "auxiliary_loss_clip": 0.01144858, + "auxiliary_loss_mlp": 0.01041292, + "balance_loss_clip": 1.05108786, + "balance_loss_mlp": 1.02255249, + "epoch": 0.1533142943033218, + "flos": 18770831291520.0, + "grad_norm": 1.6149920159034452, + "language_loss": 0.74602014, + "learning_rate": 3.842594437983917e-06, + "loss": 0.76788169, + "num_input_tokens_seen": 55390375, + "step": 2550, + "time_per_iteration": 2.684868812561035 + }, + { + "auxiliary_loss_clip": 0.01149823, + "auxiliary_loss_mlp": 0.01040743, + "balance_loss_clip": 1.05212283, + "balance_loss_mlp": 1.02129996, + "epoch": 0.15337441755598977, + "flos": 23107367341440.0, + "grad_norm": 2.33086854575276, + "language_loss": 0.76910275, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.79100841, + "num_input_tokens_seen": 55408890, + "step": 2551, + "time_per_iteration": 4.415414333343506 + }, + { + "auxiliary_loss_clip": 0.01086721, + "auxiliary_loss_mlp": 0.01054065, + "balance_loss_clip": 1.05333817, + "balance_loss_mlp": 1.05116868, + "epoch": 0.15343454080865776, + "flos": 59861079227520.0, + "grad_norm": 0.9493148205555214, + "language_loss": 0.5665558, + "learning_rate": 3.842291406776283e-06, + "loss": 0.5879637, + "num_input_tokens_seen": 55463815, + "step": 2552, + "time_per_iteration": 3.1105730533599854 + }, + { + "auxiliary_loss_clip": 0.011128, + "auxiliary_loss_mlp": 0.01039619, + "balance_loss_clip": 1.05131924, + "balance_loss_mlp": 1.0204618, + "epoch": 0.15349466406132573, + "flos": 11910887959680.0, + "grad_norm": 2.183188616823757, + "language_loss": 0.88550794, + "learning_rate": 3.84213978637978e-06, + "loss": 0.90703207, + "num_input_tokens_seen": 55481050, + "step": 2553, + "time_per_iteration": 2.748298406600952 + }, + { + "auxiliary_loss_clip": 0.01147024, + "auxiliary_loss_mlp": 0.01042929, + "balance_loss_clip": 1.05247378, + "balance_loss_mlp": 1.0232954, + "epoch": 0.1535547873139937, + "flos": 24096922099200.0, + "grad_norm": 1.8094820084348213, + "language_loss": 0.7800495, + "learning_rate": 3.841988096129152e-06, + "loss": 0.80194902, + "num_input_tokens_seen": 55500050, + "step": 2554, + "time_per_iteration": 2.6555569171905518 + }, + { + "auxiliary_loss_clip": 0.01094445, + "auxiliary_loss_mlp": 0.01053684, + "balance_loss_clip": 1.04876757, + "balance_loss_mlp": 1.03291798, + "epoch": 0.15361491056666166, + "flos": 17566459246080.0, + "grad_norm": 2.372022486587551, + "language_loss": 0.77472258, + "learning_rate": 3.841836336030151e-06, + "loss": 0.79620385, + "num_input_tokens_seen": 55518125, + "step": 2555, + "time_per_iteration": 2.7507212162017822 + }, + { + "auxiliary_loss_clip": 0.01129555, + "auxiliary_loss_mlp": 0.01046723, + "balance_loss_clip": 1.05400753, + "balance_loss_mlp": 1.02873409, + "epoch": 0.15367503381932962, + "flos": 25046041121280.0, + "grad_norm": 1.5517643759455655, + "language_loss": 0.77453947, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.79630232, + "num_input_tokens_seen": 55540960, + "step": 2556, + "time_per_iteration": 2.7947654724121094 + }, + { + "auxiliary_loss_clip": 0.01140725, + "auxiliary_loss_mlp": 0.0077646, + "balance_loss_clip": 1.05336452, + "balance_loss_mlp": 1.00054574, + "epoch": 0.15373515707199759, + "flos": 21507332008320.0, + "grad_norm": 1.8786460244833383, + "language_loss": 0.90098578, + "learning_rate": 3.84153260631005e-06, + "loss": 0.92015761, + "num_input_tokens_seen": 55559210, + "step": 2557, + "time_per_iteration": 2.702029228210449 + }, + { + "auxiliary_loss_clip": 0.01137441, + "auxiliary_loss_mlp": 0.01048546, + "balance_loss_clip": 1.05146766, + "balance_loss_mlp": 1.02862656, + "epoch": 0.15379528032466555, + "flos": 25994729180160.0, + "grad_norm": 2.4046585493240102, + "language_loss": 0.7092281, + "learning_rate": 3.841380636700468e-06, + "loss": 0.73108798, + "num_input_tokens_seen": 55578925, + "step": 2558, + "time_per_iteration": 2.815653085708618 + }, + { + "auxiliary_loss_clip": 0.01131603, + "auxiliary_loss_mlp": 0.01045983, + "balance_loss_clip": 1.04937947, + "balance_loss_mlp": 1.02659965, + "epoch": 0.15385540357733354, + "flos": 19277315015040.0, + "grad_norm": 2.1050139676488535, + "language_loss": 0.92165422, + "learning_rate": 3.841228597265548e-06, + "loss": 0.94343007, + "num_input_tokens_seen": 55597255, + "step": 2559, + "time_per_iteration": 2.7363967895507812 + }, + { + "auxiliary_loss_clip": 0.011375, + "auxiliary_loss_mlp": 0.01057878, + "balance_loss_clip": 1.05492043, + "balance_loss_mlp": 1.03711152, + "epoch": 0.1539155268300015, + "flos": 28549126920960.0, + "grad_norm": 2.149412909113977, + "language_loss": 0.63330692, + "learning_rate": 3.841076488011055e-06, + "loss": 0.65526068, + "num_input_tokens_seen": 55619515, + "step": 2560, + "time_per_iteration": 2.811800003051758 + }, + { + "auxiliary_loss_clip": 0.01132154, + "auxiliary_loss_mlp": 0.01043974, + "balance_loss_clip": 1.04914606, + "balance_loss_mlp": 1.02416182, + "epoch": 0.15397565008266947, + "flos": 23547883737600.0, + "grad_norm": 2.066473237183783, + "language_loss": 0.88155699, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.90331829, + "num_input_tokens_seen": 55640050, + "step": 2561, + "time_per_iteration": 2.7991089820861816 + }, + { + "auxiliary_loss_clip": 0.0114054, + "auxiliary_loss_mlp": 0.01041879, + "balance_loss_clip": 1.05085099, + "balance_loss_mlp": 1.02380693, + "epoch": 0.15403577333533744, + "flos": 17129821518720.0, + "grad_norm": 1.906051405357337, + "language_loss": 0.83117974, + "learning_rate": 3.840772060066425e-06, + "loss": 0.85300398, + "num_input_tokens_seen": 55658695, + "step": 2562, + "time_per_iteration": 2.6410810947418213 + }, + { + "auxiliary_loss_clip": 0.01128756, + "auxiliary_loss_mlp": 0.00778205, + "balance_loss_clip": 1.04988563, + "balance_loss_mlp": 1.00058532, + "epoch": 0.1540958965880054, + "flos": 17894503180800.0, + "grad_norm": 2.3547297997270906, + "language_loss": 0.74647415, + "learning_rate": 3.840619741387832e-06, + "loss": 0.76554382, + "num_input_tokens_seen": 55676340, + "step": 2563, + "time_per_iteration": 2.6813745498657227 + }, + { + "auxiliary_loss_clip": 0.01116857, + "auxiliary_loss_mlp": 0.0104411, + "balance_loss_clip": 1.05126941, + "balance_loss_mlp": 1.02444029, + "epoch": 0.15415601984067337, + "flos": 32161057908480.0, + "grad_norm": 2.842824767177756, + "language_loss": 0.7609179, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.78252757, + "num_input_tokens_seen": 55698890, + "step": 2564, + "time_per_iteration": 2.832885265350342 + }, + { + "auxiliary_loss_clip": 0.01133461, + "auxiliary_loss_mlp": 0.01052887, + "balance_loss_clip": 1.05174518, + "balance_loss_mlp": 1.03443313, + "epoch": 0.15421614309334136, + "flos": 24024418496640.0, + "grad_norm": 2.0125869911748575, + "language_loss": 0.70960921, + "learning_rate": 3.840314894646969e-06, + "loss": 0.73147273, + "num_input_tokens_seen": 55718535, + "step": 2565, + "time_per_iteration": 2.7352514266967773 + }, + { + "auxiliary_loss_clip": 0.01137766, + "auxiliary_loss_mlp": 0.01046908, + "balance_loss_clip": 1.04731965, + "balance_loss_mlp": 1.02787066, + "epoch": 0.15427626634600933, + "flos": 24386290064640.0, + "grad_norm": 2.1021891280826965, + "language_loss": 0.71605748, + "learning_rate": 3.840162366596259e-06, + "loss": 0.73790431, + "num_input_tokens_seen": 55738970, + "step": 2566, + "time_per_iteration": 2.681710720062256 + }, + { + "auxiliary_loss_clip": 0.01150619, + "auxiliary_loss_mlp": 0.01040725, + "balance_loss_clip": 1.04834008, + "balance_loss_mlp": 1.02271223, + "epoch": 0.1543363895986773, + "flos": 23331522165120.0, + "grad_norm": 1.7167104030167524, + "language_loss": 0.84746087, + "learning_rate": 3.840009768766408e-06, + "loss": 0.86937428, + "num_input_tokens_seen": 55759585, + "step": 2567, + "time_per_iteration": 2.6413686275482178 + }, + { + "auxiliary_loss_clip": 0.01104646, + "auxiliary_loss_mlp": 0.01050344, + "balance_loss_clip": 1.04447246, + "balance_loss_mlp": 1.03164053, + "epoch": 0.15439651285134526, + "flos": 24274284480000.0, + "grad_norm": 2.9101336164483014, + "language_loss": 0.78074998, + "learning_rate": 3.839857101163202e-06, + "loss": 0.80229992, + "num_input_tokens_seen": 55779250, + "step": 2568, + "time_per_iteration": 2.7385261058807373 + }, + { + "auxiliary_loss_clip": 0.01121993, + "auxiliary_loss_mlp": 0.01037084, + "balance_loss_clip": 1.04715753, + "balance_loss_mlp": 1.01684201, + "epoch": 0.15445663610401322, + "flos": 22456163721600.0, + "grad_norm": 1.852436867559063, + "language_loss": 0.6991998, + "learning_rate": 3.83970436379243e-06, + "loss": 0.72079051, + "num_input_tokens_seen": 55800470, + "step": 2569, + "time_per_iteration": 2.746974229812622 + }, + { + "auxiliary_loss_clip": 0.01124209, + "auxiliary_loss_mlp": 0.01040299, + "balance_loss_clip": 1.04695952, + "balance_loss_mlp": 1.02178574, + "epoch": 0.1545167593566812, + "flos": 22049510872320.0, + "grad_norm": 1.7212875994527412, + "language_loss": 0.76482332, + "learning_rate": 3.839551556659884e-06, + "loss": 0.78646845, + "num_input_tokens_seen": 55817795, + "step": 2570, + "time_per_iteration": 2.7470619678497314 + }, + { + "auxiliary_loss_clip": 0.01137702, + "auxiliary_loss_mlp": 0.01038561, + "balance_loss_clip": 1.04993737, + "balance_loss_mlp": 1.0192852, + "epoch": 0.15457688260934915, + "flos": 19318253541120.0, + "grad_norm": 2.5033166184578066, + "language_loss": 0.77997506, + "learning_rate": 3.839398679771359e-06, + "loss": 0.80173767, + "num_input_tokens_seen": 55836125, + "step": 2571, + "time_per_iteration": 2.692863702774048 + }, + { + "auxiliary_loss_clip": 0.0113208, + "auxiliary_loss_mlp": 0.0104519, + "balance_loss_clip": 1.0498451, + "balance_loss_mlp": 1.02704597, + "epoch": 0.15463700586201715, + "flos": 24133981956480.0, + "grad_norm": 4.3242380509309015, + "language_loss": 0.82932413, + "learning_rate": 3.839245733132652e-06, + "loss": 0.85109681, + "num_input_tokens_seen": 55855280, + "step": 2572, + "time_per_iteration": 2.8341822624206543 + }, + { + "auxiliary_loss_clip": 0.01156188, + "auxiliary_loss_mlp": 0.01042592, + "balance_loss_clip": 1.05181205, + "balance_loss_mlp": 1.02383995, + "epoch": 0.1546971291146851, + "flos": 22420935457920.0, + "grad_norm": 1.5874704718869805, + "language_loss": 0.90373385, + "learning_rate": 3.839092716749563e-06, + "loss": 0.92572165, + "num_input_tokens_seen": 55875695, + "step": 2573, + "time_per_iteration": 2.740121364593506 + }, + { + "auxiliary_loss_clip": 0.01088424, + "auxiliary_loss_mlp": 0.01049893, + "balance_loss_clip": 1.04328668, + "balance_loss_mlp": 1.03003311, + "epoch": 0.15475725236735308, + "flos": 17530225401600.0, + "grad_norm": 1.596795561637076, + "language_loss": 0.70298707, + "learning_rate": 3.838939630627893e-06, + "loss": 0.72437024, + "num_input_tokens_seen": 55894575, + "step": 2574, + "time_per_iteration": 2.7629144191741943 + }, + { + "auxiliary_loss_clip": 0.01127537, + "auxiliary_loss_mlp": 0.01045732, + "balance_loss_clip": 1.04714394, + "balance_loss_mlp": 1.02509642, + "epoch": 0.15481737562002104, + "flos": 22561740771840.0, + "grad_norm": 6.018921028505516, + "language_loss": 0.82426423, + "learning_rate": 3.838786474773448e-06, + "loss": 0.84599686, + "num_input_tokens_seen": 55912855, + "step": 2575, + "time_per_iteration": 2.656783103942871 + }, + { + "auxiliary_loss_clip": 0.01127415, + "auxiliary_loss_mlp": 0.01043354, + "balance_loss_clip": 1.04681587, + "balance_loss_mlp": 1.02584219, + "epoch": 0.154877498872689, + "flos": 24900567039360.0, + "grad_norm": 1.8376318938002576, + "language_loss": 0.85038638, + "learning_rate": 3.838633249192036e-06, + "loss": 0.87209404, + "num_input_tokens_seen": 55932375, + "step": 2576, + "time_per_iteration": 2.648484230041504 + }, + { + "auxiliary_loss_clip": 0.01152547, + "auxiliary_loss_mlp": 0.01043401, + "balance_loss_clip": 1.04872847, + "balance_loss_mlp": 1.02499545, + "epoch": 0.15493762212535697, + "flos": 28147501975680.0, + "grad_norm": 1.8027999188827728, + "language_loss": 0.82271254, + "learning_rate": 3.838479953889465e-06, + "loss": 0.84467208, + "num_input_tokens_seen": 55953970, + "step": 2577, + "time_per_iteration": 2.6355643272399902 + }, + { + "auxiliary_loss_clip": 0.01126009, + "auxiliary_loss_mlp": 0.01049018, + "balance_loss_clip": 1.05147958, + "balance_loss_mlp": 1.02984881, + "epoch": 0.15499774537802496, + "flos": 25411073086080.0, + "grad_norm": 2.1677069711314463, + "language_loss": 0.76556361, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.78731394, + "num_input_tokens_seen": 55973120, + "step": 2578, + "time_per_iteration": 2.649043560028076 + }, + { + "auxiliary_loss_clip": 0.01123677, + "auxiliary_loss_mlp": 0.01044461, + "balance_loss_clip": 1.05155993, + "balance_loss_mlp": 1.0253042, + "epoch": 0.15505786863069293, + "flos": 22091562720000.0, + "grad_norm": 1.9614380224881987, + "language_loss": 0.82443559, + "learning_rate": 3.83817315414411e-06, + "loss": 0.8461169, + "num_input_tokens_seen": 55993260, + "step": 2579, + "time_per_iteration": 2.62631893157959 + }, + { + "auxiliary_loss_clip": 0.01143904, + "auxiliary_loss_mlp": 0.01044324, + "balance_loss_clip": 1.05856657, + "balance_loss_mlp": 1.02556014, + "epoch": 0.1551179918833609, + "flos": 18917131386240.0, + "grad_norm": 2.610374735790095, + "language_loss": 0.80465376, + "learning_rate": 3.838019649712958e-06, + "loss": 0.82653606, + "num_input_tokens_seen": 56012130, + "step": 2580, + "time_per_iteration": 2.6512253284454346 + }, + { + "auxiliary_loss_clip": 0.0107737, + "auxiliary_loss_mlp": 0.01006304, + "balance_loss_clip": 1.04551053, + "balance_loss_mlp": 1.00360954, + "epoch": 0.15517811513602886, + "flos": 66239172587520.0, + "grad_norm": 0.842131683094019, + "language_loss": 0.58823448, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.60907125, + "num_input_tokens_seen": 56079045, + "step": 2581, + "time_per_iteration": 3.357855796813965 + }, + { + "auxiliary_loss_clip": 0.01108206, + "auxiliary_loss_mlp": 0.01047031, + "balance_loss_clip": 1.04392648, + "balance_loss_mlp": 1.0249418, + "epoch": 0.15523823838869683, + "flos": 24021078531840.0, + "grad_norm": 1.9584677228939371, + "language_loss": 0.84773678, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.86928916, + "num_input_tokens_seen": 56098745, + "step": 2582, + "time_per_iteration": 2.727062702178955 + }, + { + "auxiliary_loss_clip": 0.01144131, + "auxiliary_loss_mlp": 0.01051911, + "balance_loss_clip": 1.05233002, + "balance_loss_mlp": 1.03175235, + "epoch": 0.1552983616413648, + "flos": 20485062938880.0, + "grad_norm": 2.466663791870015, + "language_loss": 0.79050052, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.81246096, + "num_input_tokens_seen": 56117655, + "step": 2583, + "time_per_iteration": 2.664794683456421 + }, + { + "auxiliary_loss_clip": 0.01139818, + "auxiliary_loss_mlp": 0.01054771, + "balance_loss_clip": 1.04957032, + "balance_loss_mlp": 1.03252697, + "epoch": 0.15535848489403276, + "flos": 32123710742400.0, + "grad_norm": 1.8743170599575527, + "language_loss": 0.76320136, + "learning_rate": 3.837404935067705e-06, + "loss": 0.78514719, + "num_input_tokens_seen": 56141960, + "step": 2584, + "time_per_iteration": 2.757392168045044 + }, + { + "auxiliary_loss_clip": 0.01137324, + "auxiliary_loss_mlp": 0.01042496, + "balance_loss_clip": 1.04884958, + "balance_loss_mlp": 1.02302885, + "epoch": 0.15541860814670075, + "flos": 19098444263040.0, + "grad_norm": 1.6493041410587026, + "language_loss": 0.75269651, + "learning_rate": 3.837251082205368e-06, + "loss": 0.77449471, + "num_input_tokens_seen": 56161430, + "step": 2585, + "time_per_iteration": 2.6497461795806885 + }, + { + "auxiliary_loss_clip": 0.01116144, + "auxiliary_loss_mlp": 0.01042356, + "balance_loss_clip": 1.04862189, + "balance_loss_mlp": 1.02321053, + "epoch": 0.1554787313993687, + "flos": 19172097100800.0, + "grad_norm": 2.068989677221064, + "language_loss": 0.61187196, + "learning_rate": 3.837097159674286e-06, + "loss": 0.63345695, + "num_input_tokens_seen": 56179390, + "step": 2586, + "time_per_iteration": 2.697852373123169 + }, + { + "auxiliary_loss_clip": 0.01129408, + "auxiliary_loss_mlp": 0.01042187, + "balance_loss_clip": 1.04842281, + "balance_loss_mlp": 1.02341127, + "epoch": 0.15553885465203668, + "flos": 16143822207360.0, + "grad_norm": 1.8484108176722505, + "language_loss": 0.81318939, + "learning_rate": 3.836943167480296e-06, + "loss": 0.83490539, + "num_input_tokens_seen": 56198020, + "step": 2587, + "time_per_iteration": 4.212551593780518 + }, + { + "auxiliary_loss_clip": 0.01160891, + "auxiliary_loss_mlp": 0.01054822, + "balance_loss_clip": 1.05309868, + "balance_loss_mlp": 1.03325701, + "epoch": 0.15559897790470464, + "flos": 25337779384320.0, + "grad_norm": 1.866779523391448, + "language_loss": 0.88716942, + "learning_rate": 3.836789105629236e-06, + "loss": 0.90932655, + "num_input_tokens_seen": 56218165, + "step": 2588, + "time_per_iteration": 4.192267894744873 + }, + { + "auxiliary_loss_clip": 0.01094981, + "auxiliary_loss_mlp": 0.01052123, + "balance_loss_clip": 1.04558384, + "balance_loss_mlp": 1.03164268, + "epoch": 0.1556591011573726, + "flos": 23148772744320.0, + "grad_norm": 2.018423224363699, + "language_loss": 0.64624381, + "learning_rate": 3.83663497412695e-06, + "loss": 0.66771483, + "num_input_tokens_seen": 56237160, + "step": 2589, + "time_per_iteration": 4.303871154785156 + }, + { + "auxiliary_loss_clip": 0.01104407, + "auxiliary_loss_mlp": 0.01041976, + "balance_loss_clip": 1.04520249, + "balance_loss_mlp": 1.02123344, + "epoch": 0.15571922441004057, + "flos": 25370888745600.0, + "grad_norm": 1.784618480549341, + "language_loss": 0.82832813, + "learning_rate": 3.836480772979281e-06, + "loss": 0.84979194, + "num_input_tokens_seen": 56257610, + "step": 2590, + "time_per_iteration": 4.460350751876831 + }, + { + "auxiliary_loss_clip": 0.011248, + "auxiliary_loss_mlp": 0.01047287, + "balance_loss_clip": 1.05032134, + "balance_loss_mlp": 1.02694952, + "epoch": 0.15577934766270854, + "flos": 14501375890560.0, + "grad_norm": 2.6687659077907484, + "language_loss": 0.78766, + "learning_rate": 3.836326502192077e-06, + "loss": 0.80938083, + "num_input_tokens_seen": 56275215, + "step": 2591, + "time_per_iteration": 2.73305606842041 + }, + { + "auxiliary_loss_clip": 0.01143879, + "auxiliary_loss_mlp": 0.01049015, + "balance_loss_clip": 1.05174232, + "balance_loss_mlp": 1.03137255, + "epoch": 0.15583947091537653, + "flos": 37414537372800.0, + "grad_norm": 2.0331558547393054, + "language_loss": 0.65025747, + "learning_rate": 3.836172161771189e-06, + "loss": 0.67218637, + "num_input_tokens_seen": 56297130, + "step": 2592, + "time_per_iteration": 2.8582632541656494 + }, + { + "auxiliary_loss_clip": 0.01136043, + "auxiliary_loss_mlp": 0.01052096, + "balance_loss_clip": 1.05417228, + "balance_loss_mlp": 1.0322001, + "epoch": 0.1558995941680445, + "flos": 21834729498240.0, + "grad_norm": 2.311634250072179, + "language_loss": 0.82506329, + "learning_rate": 3.836017751722467e-06, + "loss": 0.84694475, + "num_input_tokens_seen": 56314995, + "step": 2593, + "time_per_iteration": 2.7230453491210938 + }, + { + "auxiliary_loss_clip": 0.01142565, + "auxiliary_loss_mlp": 0.01046037, + "balance_loss_clip": 1.05237365, + "balance_loss_mlp": 1.02676034, + "epoch": 0.15595971742071246, + "flos": 19792633484160.0, + "grad_norm": 2.778410683125911, + "language_loss": 0.73220694, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.75409293, + "num_input_tokens_seen": 56334005, + "step": 2594, + "time_per_iteration": 2.708063840866089 + }, + { + "auxiliary_loss_clip": 0.01117989, + "auxiliary_loss_mlp": 0.01040106, + "balance_loss_clip": 1.0453043, + "balance_loss_mlp": 1.02077007, + "epoch": 0.15601984067338043, + "flos": 26722135503360.0, + "grad_norm": 2.1444704922101105, + "language_loss": 0.81569934, + "learning_rate": 3.835708722764952e-06, + "loss": 0.83728027, + "num_input_tokens_seen": 56353795, + "step": 2595, + "time_per_iteration": 2.716334581375122 + }, + { + "auxiliary_loss_clip": 0.01155359, + "auxiliary_loss_mlp": 0.01043269, + "balance_loss_clip": 1.05093551, + "balance_loss_mlp": 1.0238502, + "epoch": 0.1560799639260484, + "flos": 18369278173440.0, + "grad_norm": 1.8943501893042642, + "language_loss": 0.86674929, + "learning_rate": 3.835554103867876e-06, + "loss": 0.88873553, + "num_input_tokens_seen": 56373195, + "step": 2596, + "time_per_iteration": 2.5947446823120117 + }, + { + "auxiliary_loss_clip": 0.01144729, + "auxiliary_loss_mlp": 0.01042109, + "balance_loss_clip": 1.05225515, + "balance_loss_mlp": 1.02360725, + "epoch": 0.15614008717871636, + "flos": 22598980197120.0, + "grad_norm": 1.8059460934517404, + "language_loss": 0.68772388, + "learning_rate": 3.835399415366404e-06, + "loss": 0.70959222, + "num_input_tokens_seen": 56391525, + "step": 2597, + "time_per_iteration": 2.8101041316986084 + }, + { + "auxiliary_loss_clip": 0.01130069, + "auxiliary_loss_mlp": 0.01050835, + "balance_loss_clip": 1.05409336, + "balance_loss_mlp": 1.03165436, + "epoch": 0.15620021043138435, + "flos": 22746860490240.0, + "grad_norm": 1.9103744906429732, + "language_loss": 0.79860938, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.82041842, + "num_input_tokens_seen": 56410715, + "step": 2598, + "time_per_iteration": 2.695117950439453 + }, + { + "auxiliary_loss_clip": 0.0112861, + "auxiliary_loss_mlp": 0.00776118, + "balance_loss_clip": 1.04750216, + "balance_loss_mlp": 1.0006249, + "epoch": 0.15626033368405232, + "flos": 13114936782720.0, + "grad_norm": 3.1104681024188827, + "language_loss": 0.83092594, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.84997326, + "num_input_tokens_seen": 56429170, + "step": 2599, + "time_per_iteration": 2.665703773498535 + }, + { + "auxiliary_loss_clip": 0.01160593, + "auxiliary_loss_mlp": 0.0105002, + "balance_loss_clip": 1.05274248, + "balance_loss_mlp": 1.02924192, + "epoch": 0.15632045693672028, + "flos": 16472297105280.0, + "grad_norm": 2.2910683048406266, + "language_loss": 0.81530893, + "learning_rate": 3.834934932294287e-06, + "loss": 0.83741504, + "num_input_tokens_seen": 56445685, + "step": 2600, + "time_per_iteration": 2.615651845932007 + }, + { + "auxiliary_loss_clip": 0.01161023, + "auxiliary_loss_mlp": 0.00776671, + "balance_loss_clip": 1.05562234, + "balance_loss_mlp": 1.00063944, + "epoch": 0.15638058018938825, + "flos": 20850346298880.0, + "grad_norm": 1.7832591469657297, + "language_loss": 0.88511437, + "learning_rate": 3.834779965433917e-06, + "loss": 0.90449131, + "num_input_tokens_seen": 56465900, + "step": 2601, + "time_per_iteration": 2.6833529472351074 + }, + { + "auxiliary_loss_clip": 0.0116257, + "auxiliary_loss_mlp": 0.0106307, + "balance_loss_clip": 1.05569744, + "balance_loss_mlp": 1.04120743, + "epoch": 0.1564407034420562, + "flos": 21872220318720.0, + "grad_norm": 1.9421054688538308, + "language_loss": 0.78707534, + "learning_rate": 3.834624928998508e-06, + "loss": 0.80933177, + "num_input_tokens_seen": 56485020, + "step": 2602, + "time_per_iteration": 2.6296608448028564 + }, + { + "auxiliary_loss_clip": 0.01126653, + "auxiliary_loss_mlp": 0.01043676, + "balance_loss_clip": 1.05035329, + "balance_loss_mlp": 1.02419758, + "epoch": 0.15650082669472418, + "flos": 21834549930240.0, + "grad_norm": 1.8230718276715763, + "language_loss": 0.74029547, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.76199877, + "num_input_tokens_seen": 56505205, + "step": 2603, + "time_per_iteration": 2.744508743286133 + }, + { + "auxiliary_loss_clip": 0.01143305, + "auxiliary_loss_mlp": 0.01051047, + "balance_loss_clip": 1.04820418, + "balance_loss_mlp": 1.03112721, + "epoch": 0.15656094994739214, + "flos": 13800542653440.0, + "grad_norm": 4.041164356714064, + "language_loss": 0.87723601, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.89917958, + "num_input_tokens_seen": 56521495, + "step": 2604, + "time_per_iteration": 2.682457447052002 + }, + { + "auxiliary_loss_clip": 0.01145351, + "auxiliary_loss_mlp": 0.01044759, + "balance_loss_clip": 1.04976749, + "balance_loss_mlp": 1.0256021, + "epoch": 0.15662107320006013, + "flos": 27308197808640.0, + "grad_norm": 2.260429022209425, + "language_loss": 0.8573193, + "learning_rate": 3.834159402300841e-06, + "loss": 0.87922043, + "num_input_tokens_seen": 56540665, + "step": 2605, + "time_per_iteration": 2.7724974155426025 + }, + { + "auxiliary_loss_clip": 0.0115108, + "auxiliary_loss_mlp": 0.01047256, + "balance_loss_clip": 1.05181313, + "balance_loss_mlp": 1.02676356, + "epoch": 0.1566811964527281, + "flos": 26685075646080.0, + "grad_norm": 1.7309636492693905, + "language_loss": 0.73101914, + "learning_rate": 3.834004087624087e-06, + "loss": 0.75300246, + "num_input_tokens_seen": 56560805, + "step": 2606, + "time_per_iteration": 2.7490081787109375 + }, + { + "auxiliary_loss_clip": 0.01158388, + "auxiliary_loss_mlp": 0.01049752, + "balance_loss_clip": 1.0552665, + "balance_loss_mlp": 1.03165627, + "epoch": 0.15674131970539606, + "flos": 16103422385280.0, + "grad_norm": 2.968092109370304, + "language_loss": 0.76497948, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.78706092, + "num_input_tokens_seen": 56576335, + "step": 2607, + "time_per_iteration": 2.6597230434417725 + }, + { + "auxiliary_loss_clip": 0.01120645, + "auxiliary_loss_mlp": 0.01047174, + "balance_loss_clip": 1.05131412, + "balance_loss_mlp": 1.0284934, + "epoch": 0.15680144295806403, + "flos": 19169690889600.0, + "grad_norm": 1.7981763092074996, + "language_loss": 0.82107675, + "learning_rate": 3.833693249639615e-06, + "loss": 0.84275496, + "num_input_tokens_seen": 56595880, + "step": 2608, + "time_per_iteration": 2.7072103023529053 + }, + { + "auxiliary_loss_clip": 0.0112834, + "auxiliary_loss_mlp": 0.01045106, + "balance_loss_clip": 1.04685056, + "balance_loss_mlp": 1.02436399, + "epoch": 0.156861566210732, + "flos": 20813430096000.0, + "grad_norm": 1.6817301031159713, + "language_loss": 0.72335941, + "learning_rate": 3.833537726343684e-06, + "loss": 0.74509382, + "num_input_tokens_seen": 56615130, + "step": 2609, + "time_per_iteration": 2.690690755844116 + }, + { + "auxiliary_loss_clip": 0.01143972, + "auxiliary_loss_mlp": 0.01036718, + "balance_loss_clip": 1.04901087, + "balance_loss_mlp": 1.01756072, + "epoch": 0.15692168946339996, + "flos": 20047922421120.0, + "grad_norm": 5.132438477880424, + "language_loss": 0.72317064, + "learning_rate": 3.833382133519818e-06, + "loss": 0.74497753, + "num_input_tokens_seen": 56634005, + "step": 2610, + "time_per_iteration": 2.6515614986419678 + }, + { + "auxiliary_loss_clip": 0.01159588, + "auxiliary_loss_mlp": 0.01051513, + "balance_loss_clip": 1.05216432, + "balance_loss_mlp": 1.03063977, + "epoch": 0.15698181271606793, + "flos": 21398019943680.0, + "grad_norm": 2.0600295188113935, + "language_loss": 0.72915608, + "learning_rate": 3.833226471173919e-06, + "loss": 0.75126708, + "num_input_tokens_seen": 56653480, + "step": 2611, + "time_per_iteration": 2.630988359451294 + }, + { + "auxiliary_loss_clip": 0.01141924, + "auxiliary_loss_mlp": 0.01042538, + "balance_loss_clip": 1.04917872, + "balance_loss_mlp": 1.0231905, + "epoch": 0.15704193596873592, + "flos": 20845785271680.0, + "grad_norm": 2.0339762399532186, + "language_loss": 0.70766544, + "learning_rate": 3.833070739311887e-06, + "loss": 0.72951007, + "num_input_tokens_seen": 56672270, + "step": 2612, + "time_per_iteration": 2.6569461822509766 + }, + { + "auxiliary_loss_clip": 0.01116284, + "auxiliary_loss_mlp": 0.01051299, + "balance_loss_clip": 1.04844582, + "balance_loss_mlp": 1.03221321, + "epoch": 0.15710205922140388, + "flos": 21762908254080.0, + "grad_norm": 1.9704781930994688, + "language_loss": 0.76294881, + "learning_rate": 3.83291493793963e-06, + "loss": 0.78462464, + "num_input_tokens_seen": 56691510, + "step": 2613, + "time_per_iteration": 2.7188539505004883 + }, + { + "auxiliary_loss_clip": 0.01115155, + "auxiliary_loss_mlp": 0.01049301, + "balance_loss_clip": 1.04504919, + "balance_loss_mlp": 1.02956033, + "epoch": 0.15716218247407185, + "flos": 25007760201600.0, + "grad_norm": 2.137998057111896, + "language_loss": 0.65944499, + "learning_rate": 3.832759067063055e-06, + "loss": 0.68108952, + "num_input_tokens_seen": 56712230, + "step": 2614, + "time_per_iteration": 2.7550084590911865 + }, + { + "auxiliary_loss_clip": 0.01151987, + "auxiliary_loss_mlp": 0.01044173, + "balance_loss_clip": 1.05387104, + "balance_loss_mlp": 1.02374101, + "epoch": 0.1572223057267398, + "flos": 20191780391040.0, + "grad_norm": 2.2755662506820915, + "language_loss": 0.75204211, + "learning_rate": 3.832603126688072e-06, + "loss": 0.77400374, + "num_input_tokens_seen": 56727490, + "step": 2615, + "time_per_iteration": 2.683225154876709 + }, + { + "auxiliary_loss_clip": 0.01138545, + "auxiliary_loss_mlp": 0.01050891, + "balance_loss_clip": 1.05209839, + "balance_loss_mlp": 1.03078008, + "epoch": 0.15728242897940778, + "flos": 20959514709120.0, + "grad_norm": 2.581872009488739, + "language_loss": 0.73064095, + "learning_rate": 3.832447116820594e-06, + "loss": 0.75253528, + "num_input_tokens_seen": 56747385, + "step": 2616, + "time_per_iteration": 2.6660919189453125 + }, + { + "auxiliary_loss_clip": 0.01130717, + "auxiliary_loss_mlp": 0.01047511, + "balance_loss_clip": 1.04999971, + "balance_loss_mlp": 1.02794933, + "epoch": 0.15734255223207574, + "flos": 23038275530880.0, + "grad_norm": 2.813587490853999, + "language_loss": 0.72425079, + "learning_rate": 3.832291037466539e-06, + "loss": 0.74603307, + "num_input_tokens_seen": 56768055, + "step": 2617, + "time_per_iteration": 2.768561363220215 + }, + { + "auxiliary_loss_clip": 0.01138315, + "auxiliary_loss_mlp": 0.0104637, + "balance_loss_clip": 1.04947805, + "balance_loss_mlp": 1.02548432, + "epoch": 0.15740267548474374, + "flos": 20551281661440.0, + "grad_norm": 2.3222819484870016, + "language_loss": 0.74358094, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.76542777, + "num_input_tokens_seen": 56785110, + "step": 2618, + "time_per_iteration": 2.66121768951416 + }, + { + "auxiliary_loss_clip": 0.01162954, + "auxiliary_loss_mlp": 0.01046178, + "balance_loss_clip": 1.05417252, + "balance_loss_mlp": 1.02526867, + "epoch": 0.1574627987374117, + "flos": 22666922772480.0, + "grad_norm": 1.8808629075569874, + "language_loss": 0.78896272, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.81105405, + "num_input_tokens_seen": 56804975, + "step": 2619, + "time_per_iteration": 2.6743338108062744 + }, + { + "auxiliary_loss_clip": 0.01126081, + "auxiliary_loss_mlp": 0.01055551, + "balance_loss_clip": 1.05046356, + "balance_loss_mlp": 1.03576207, + "epoch": 0.15752292199007967, + "flos": 16800664262400.0, + "grad_norm": 1.9082963728737496, + "language_loss": 0.76517296, + "learning_rate": 3.831822382544101e-06, + "loss": 0.78698927, + "num_input_tokens_seen": 56822470, + "step": 2620, + "time_per_iteration": 2.6481080055236816 + }, + { + "auxiliary_loss_clip": 0.01136128, + "auxiliary_loss_mlp": 0.0104575, + "balance_loss_clip": 1.05097985, + "balance_loss_mlp": 1.02488887, + "epoch": 0.15758304524274763, + "flos": 29826002568960.0, + "grad_norm": 1.6603432400664486, + "language_loss": 0.7136035, + "learning_rate": 3.831666025302944e-06, + "loss": 0.73542225, + "num_input_tokens_seen": 56842100, + "step": 2621, + "time_per_iteration": 2.70985746383667 + }, + { + "auxiliary_loss_clip": 0.01103274, + "auxiliary_loss_mlp": 0.01052522, + "balance_loss_clip": 1.04624665, + "balance_loss_mlp": 1.02921629, + "epoch": 0.1576431684954156, + "flos": 53577426723840.0, + "grad_norm": 2.1843515622778624, + "language_loss": 0.72136736, + "learning_rate": 3.831509598604828e-06, + "loss": 0.74292529, + "num_input_tokens_seen": 56865920, + "step": 2622, + "time_per_iteration": 3.024561643600464 + }, + { + "auxiliary_loss_clip": 0.01095163, + "auxiliary_loss_mlp": 0.01043948, + "balance_loss_clip": 1.04474711, + "balance_loss_mlp": 1.02464843, + "epoch": 0.15770329174808356, + "flos": 20813609664000.0, + "grad_norm": 1.6586715789846178, + "language_loss": 0.87637675, + "learning_rate": 3.831353102455684e-06, + "loss": 0.8977679, + "num_input_tokens_seen": 56885265, + "step": 2623, + "time_per_iteration": 2.9600114822387695 + }, + { + "auxiliary_loss_clip": 0.01158714, + "auxiliary_loss_mlp": 0.01044337, + "balance_loss_clip": 1.05476475, + "balance_loss_mlp": 1.02564478, + "epoch": 0.15776341500075153, + "flos": 24974004395520.0, + "grad_norm": 1.6915331173398198, + "language_loss": 0.81600082, + "learning_rate": 3.831196536861448e-06, + "loss": 0.83803129, + "num_input_tokens_seen": 56906710, + "step": 2624, + "time_per_iteration": 2.6621103286743164 + }, + { + "auxiliary_loss_clip": 0.01122344, + "auxiliary_loss_mlp": 0.01049423, + "balance_loss_clip": 1.04776418, + "balance_loss_mlp": 1.02990842, + "epoch": 0.15782353825341952, + "flos": 21907915459200.0, + "grad_norm": 2.879465237309773, + "language_loss": 0.79977828, + "learning_rate": 3.831039901828054e-06, + "loss": 0.82149595, + "num_input_tokens_seen": 56924275, + "step": 2625, + "time_per_iteration": 2.7291064262390137 + }, + { + "auxiliary_loss_clip": 0.01157938, + "auxiliary_loss_mlp": 0.01046203, + "balance_loss_clip": 1.05403268, + "balance_loss_mlp": 1.02857196, + "epoch": 0.15788366150608749, + "flos": 26177191292160.0, + "grad_norm": 2.133783972400447, + "language_loss": 0.80332482, + "learning_rate": 3.830883197361445e-06, + "loss": 0.8253662, + "num_input_tokens_seen": 56941525, + "step": 2626, + "time_per_iteration": 4.252760171890259 + }, + { + "auxiliary_loss_clip": 0.01102762, + "auxiliary_loss_mlp": 0.01057658, + "balance_loss_clip": 1.05214024, + "balance_loss_mlp": 1.03512752, + "epoch": 0.15794378475875545, + "flos": 27709822753920.0, + "grad_norm": 3.9802810067864045, + "language_loss": 0.73636395, + "learning_rate": 3.830726423467561e-06, + "loss": 0.75796819, + "num_input_tokens_seen": 56962145, + "step": 2627, + "time_per_iteration": 4.328871250152588 + }, + { + "auxiliary_loss_clip": 0.01117433, + "auxiliary_loss_mlp": 0.01055032, + "balance_loss_clip": 1.0503006, + "balance_loss_mlp": 1.0351001, + "epoch": 0.15800390801142342, + "flos": 12130158533760.0, + "grad_norm": 2.0211273696228216, + "language_loss": 0.84589541, + "learning_rate": 3.830569580152348e-06, + "loss": 0.86762005, + "num_input_tokens_seen": 56977505, + "step": 2628, + "time_per_iteration": 2.6785013675689697 + }, + { + "auxiliary_loss_clip": 0.01129476, + "auxiliary_loss_mlp": 0.01040858, + "balance_loss_clip": 1.05065978, + "balance_loss_mlp": 1.02308416, + "epoch": 0.15806403126409138, + "flos": 20704728562560.0, + "grad_norm": 1.897214582222077, + "language_loss": 0.76437485, + "learning_rate": 3.830412667421752e-06, + "loss": 0.78607821, + "num_input_tokens_seen": 56996770, + "step": 2629, + "time_per_iteration": 4.2878499031066895 + }, + { + "auxiliary_loss_clip": 0.01143973, + "auxiliary_loss_mlp": 0.01046449, + "balance_loss_clip": 1.0529623, + "balance_loss_mlp": 1.02675569, + "epoch": 0.15812415451675935, + "flos": 17821712269440.0, + "grad_norm": 2.252423233454998, + "language_loss": 0.73337436, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.75527859, + "num_input_tokens_seen": 57014970, + "step": 2630, + "time_per_iteration": 4.253108263015747 + }, + { + "auxiliary_loss_clip": 0.01156261, + "auxiliary_loss_mlp": 0.01045602, + "balance_loss_clip": 1.05644512, + "balance_loss_mlp": 1.02615929, + "epoch": 0.15818427776942734, + "flos": 20084048524800.0, + "grad_norm": 2.390369083551665, + "language_loss": 0.83678091, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.85879952, + "num_input_tokens_seen": 57034045, + "step": 2631, + "time_per_iteration": 2.6145882606506348 + }, + { + "auxiliary_loss_clip": 0.01159092, + "auxiliary_loss_mlp": 0.01045772, + "balance_loss_clip": 1.05313432, + "balance_loss_mlp": 1.02746117, + "epoch": 0.1582444010220953, + "flos": 21214911386880.0, + "grad_norm": 1.8755653224160422, + "language_loss": 0.78415525, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.80620384, + "num_input_tokens_seen": 57053695, + "step": 2632, + "time_per_iteration": 2.656691551208496 + }, + { + "auxiliary_loss_clip": 0.01151481, + "auxiliary_loss_mlp": 0.01057283, + "balance_loss_clip": 1.05574381, + "balance_loss_mlp": 1.03769732, + "epoch": 0.15830452427476327, + "flos": 17858341163520.0, + "grad_norm": 2.079450153413421, + "language_loss": 0.8301838, + "learning_rate": 3.829784322464594e-06, + "loss": 0.85227144, + "num_input_tokens_seen": 57071290, + "step": 2633, + "time_per_iteration": 2.622725248336792 + }, + { + "auxiliary_loss_clip": 0.01165069, + "auxiliary_loss_mlp": 0.01041545, + "balance_loss_clip": 1.05761647, + "balance_loss_mlp": 1.02223265, + "epoch": 0.15836464752743123, + "flos": 24534960456960.0, + "grad_norm": 2.1719104392782813, + "language_loss": 0.77448404, + "learning_rate": 3.829627062746394e-06, + "loss": 0.79655015, + "num_input_tokens_seen": 57091465, + "step": 2634, + "time_per_iteration": 2.6383235454559326 + }, + { + "auxiliary_loss_clip": 0.01127407, + "auxiliary_loss_mlp": 0.00777775, + "balance_loss_clip": 1.05277348, + "balance_loss_mlp": 1.00136137, + "epoch": 0.1584247707800992, + "flos": 20120821073280.0, + "grad_norm": 3.5133527254089087, + "language_loss": 0.88479185, + "learning_rate": 3.829469733648552e-06, + "loss": 0.90384364, + "num_input_tokens_seen": 57110075, + "step": 2635, + "time_per_iteration": 2.725924491882324 + }, + { + "auxiliary_loss_clip": 0.01096223, + "auxiliary_loss_mlp": 0.01058885, + "balance_loss_clip": 1.04816198, + "balance_loss_mlp": 1.03847599, + "epoch": 0.15848489403276717, + "flos": 20375966355840.0, + "grad_norm": 2.8627721083207627, + "language_loss": 0.75762677, + "learning_rate": 3.829312335177034e-06, + "loss": 0.77917778, + "num_input_tokens_seen": 57128945, + "step": 2636, + "time_per_iteration": 2.775310516357422 + }, + { + "auxiliary_loss_clip": 0.01120174, + "auxiliary_loss_mlp": 0.01043834, + "balance_loss_clip": 1.05117822, + "balance_loss_mlp": 1.02350879, + "epoch": 0.15854501728543513, + "flos": 39346890359040.0, + "grad_norm": 2.388418559522659, + "language_loss": 0.71977961, + "learning_rate": 3.82915486733781e-06, + "loss": 0.74141967, + "num_input_tokens_seen": 57152385, + "step": 2637, + "time_per_iteration": 2.8375279903411865 + }, + { + "auxiliary_loss_clip": 0.0115052, + "auxiliary_loss_mlp": 0.01044842, + "balance_loss_clip": 1.05661607, + "balance_loss_mlp": 1.02640057, + "epoch": 0.15860514053810312, + "flos": 24864225454080.0, + "grad_norm": 2.1640345554565057, + "language_loss": 0.78352648, + "learning_rate": 3.82899733013685e-06, + "loss": 0.80548006, + "num_input_tokens_seen": 57172620, + "step": 2638, + "time_per_iteration": 2.7298176288604736 + }, + { + "auxiliary_loss_clip": 0.01129706, + "auxiliary_loss_mlp": 0.01057375, + "balance_loss_clip": 1.05311394, + "balance_loss_mlp": 1.03715718, + "epoch": 0.1586652637907711, + "flos": 26177694082560.0, + "grad_norm": 2.325769963269074, + "language_loss": 0.75845039, + "learning_rate": 3.828839723580128e-06, + "loss": 0.78032124, + "num_input_tokens_seen": 57194680, + "step": 2639, + "time_per_iteration": 2.7731449604034424 + }, + { + "auxiliary_loss_clip": 0.01104856, + "auxiliary_loss_mlp": 0.01057283, + "balance_loss_clip": 1.05350864, + "balance_loss_mlp": 1.03772068, + "epoch": 0.15872538704343905, + "flos": 19792058866560.0, + "grad_norm": 2.173238447343554, + "language_loss": 0.81319505, + "learning_rate": 3.82868204767362e-06, + "loss": 0.83481646, + "num_input_tokens_seen": 57214675, + "step": 2640, + "time_per_iteration": 2.8024139404296875 + }, + { + "auxiliary_loss_clip": 0.01135166, + "auxiliary_loss_mlp": 0.01054673, + "balance_loss_clip": 1.05492401, + "balance_loss_mlp": 1.03426492, + "epoch": 0.15878551029610702, + "flos": 28475366342400.0, + "grad_norm": 2.013499020988034, + "language_loss": 0.66893363, + "learning_rate": 3.828524302423306e-06, + "loss": 0.69083202, + "num_input_tokens_seen": 57235830, + "step": 2641, + "time_per_iteration": 2.7519116401672363 + }, + { + "auxiliary_loss_clip": 0.01149448, + "auxiliary_loss_mlp": 0.01051949, + "balance_loss_clip": 1.05758858, + "balance_loss_mlp": 1.0326376, + "epoch": 0.15884563354877498, + "flos": 24206701040640.0, + "grad_norm": 2.139760259286454, + "language_loss": 0.7552591, + "learning_rate": 3.828366487835167e-06, + "loss": 0.77727306, + "num_input_tokens_seen": 57255970, + "step": 2642, + "time_per_iteration": 2.706136465072632 + }, + { + "auxiliary_loss_clip": 0.01156917, + "auxiliary_loss_mlp": 0.01042142, + "balance_loss_clip": 1.06263423, + "balance_loss_mlp": 1.02323556, + "epoch": 0.15890575680144295, + "flos": 23949795991680.0, + "grad_norm": 1.9419610036505286, + "language_loss": 0.70564604, + "learning_rate": 3.828208603915186e-06, + "loss": 0.72763658, + "num_input_tokens_seen": 57274435, + "step": 2643, + "time_per_iteration": 2.682015895843506 + }, + { + "auxiliary_loss_clip": 0.01161783, + "auxiliary_loss_mlp": 0.01041643, + "balance_loss_clip": 1.05891204, + "balance_loss_mlp": 1.02389312, + "epoch": 0.15896588005411091, + "flos": 21215019127680.0, + "grad_norm": 1.846517711414915, + "language_loss": 0.78057045, + "learning_rate": 3.828050650669353e-06, + "loss": 0.80260473, + "num_input_tokens_seen": 57293115, + "step": 2644, + "time_per_iteration": 2.683790922164917 + }, + { + "auxiliary_loss_clip": 0.01151239, + "auxiliary_loss_mlp": 0.01050105, + "balance_loss_clip": 1.05701637, + "balance_loss_mlp": 1.03154373, + "epoch": 0.1590260033067789, + "flos": 24352390604160.0, + "grad_norm": 3.757920662841351, + "language_loss": 0.81961924, + "learning_rate": 3.827892628103657e-06, + "loss": 0.84163266, + "num_input_tokens_seen": 57312565, + "step": 2645, + "time_per_iteration": 2.698085069656372 + }, + { + "auxiliary_loss_clip": 0.01162748, + "auxiliary_loss_mlp": 0.01048492, + "balance_loss_clip": 1.05487716, + "balance_loss_mlp": 1.02854836, + "epoch": 0.15908612655944687, + "flos": 32048944583040.0, + "grad_norm": 2.056693785790565, + "language_loss": 0.69412929, + "learning_rate": 3.827734536224087e-06, + "loss": 0.71624172, + "num_input_tokens_seen": 57333360, + "step": 2646, + "time_per_iteration": 2.7166528701782227 + }, + { + "auxiliary_loss_clip": 0.01135067, + "auxiliary_loss_mlp": 0.01040314, + "balance_loss_clip": 1.05435526, + "balance_loss_mlp": 1.02223015, + "epoch": 0.15914624981211484, + "flos": 17785370684160.0, + "grad_norm": 2.5975497323405055, + "language_loss": 0.62932581, + "learning_rate": 3.827576375036642e-06, + "loss": 0.65107965, + "num_input_tokens_seen": 57350575, + "step": 2647, + "time_per_iteration": 2.7405354976654053 + }, + { + "auxiliary_loss_clip": 0.01160144, + "auxiliary_loss_mlp": 0.01047955, + "balance_loss_clip": 1.05654776, + "balance_loss_mlp": 1.02896523, + "epoch": 0.1592063730647828, + "flos": 17712507945600.0, + "grad_norm": 2.2161421076431025, + "language_loss": 0.89490473, + "learning_rate": 3.827418144547318e-06, + "loss": 0.91698575, + "num_input_tokens_seen": 57367570, + "step": 2648, + "time_per_iteration": 2.6193346977233887 + }, + { + "auxiliary_loss_clip": 0.01158791, + "auxiliary_loss_mlp": 0.01048086, + "balance_loss_clip": 1.05630398, + "balance_loss_mlp": 1.03072906, + "epoch": 0.15926649631745077, + "flos": 18803545603200.0, + "grad_norm": 1.9960039108301237, + "language_loss": 0.91307199, + "learning_rate": 3.827259844762114e-06, + "loss": 0.93514073, + "num_input_tokens_seen": 57383980, + "step": 2649, + "time_per_iteration": 2.6137378215789795 + }, + { + "auxiliary_loss_clip": 0.01099661, + "auxiliary_loss_mlp": 0.01044384, + "balance_loss_clip": 1.05474401, + "balance_loss_mlp": 1.02439272, + "epoch": 0.15932661957011873, + "flos": 17566243764480.0, + "grad_norm": 2.3504548368335767, + "language_loss": 0.71782613, + "learning_rate": 3.827101475687033e-06, + "loss": 0.73926663, + "num_input_tokens_seen": 57400840, + "step": 2650, + "time_per_iteration": 2.8883376121520996 + }, + { + "auxiliary_loss_clip": 0.01146809, + "auxiliary_loss_mlp": 0.01041815, + "balance_loss_clip": 1.05386841, + "balance_loss_mlp": 1.02476835, + "epoch": 0.15938674282278673, + "flos": 13334351011200.0, + "grad_norm": 1.8238326955956992, + "language_loss": 0.71427429, + "learning_rate": 3.826943037328082e-06, + "loss": 0.73616046, + "num_input_tokens_seen": 57419230, + "step": 2651, + "time_per_iteration": 2.607879638671875 + }, + { + "auxiliary_loss_clip": 0.01118842, + "auxiliary_loss_mlp": 0.00777496, + "balance_loss_clip": 1.05154157, + "balance_loss_mlp": 1.00132799, + "epoch": 0.1594468660754547, + "flos": 22488842119680.0, + "grad_norm": 1.8928974850955373, + "language_loss": 0.80185902, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.82082248, + "num_input_tokens_seen": 57439315, + "step": 2652, + "time_per_iteration": 2.718695640563965 + }, + { + "auxiliary_loss_clip": 0.01138048, + "auxiliary_loss_mlp": 0.00775, + "balance_loss_clip": 1.0567826, + "balance_loss_mlp": 1.00124729, + "epoch": 0.15950698932812266, + "flos": 15007320910080.0, + "grad_norm": 2.6116065834427387, + "language_loss": 0.69539076, + "learning_rate": 3.826625952782601e-06, + "loss": 0.71452117, + "num_input_tokens_seen": 57454635, + "step": 2653, + "time_per_iteration": 2.7088639736175537 + }, + { + "auxiliary_loss_clip": 0.01144826, + "auxiliary_loss_mlp": 0.01038735, + "balance_loss_clip": 1.05257821, + "balance_loss_mlp": 1.02050805, + "epoch": 0.15956711258079062, + "flos": 30155052084480.0, + "grad_norm": 2.1937273620657307, + "language_loss": 0.76670635, + "learning_rate": 3.826467306608095e-06, + "loss": 0.78854191, + "num_input_tokens_seen": 57476805, + "step": 2654, + "time_per_iteration": 2.79425048828125 + }, + { + "auxiliary_loss_clip": 0.01114313, + "auxiliary_loss_mlp": 0.01041134, + "balance_loss_clip": 1.04714727, + "balance_loss_mlp": 1.02248931, + "epoch": 0.1596272358334586, + "flos": 21032700670080.0, + "grad_norm": 2.0572535633716247, + "language_loss": 0.81873977, + "learning_rate": 3.826308591173765e-06, + "loss": 0.84029424, + "num_input_tokens_seen": 57496400, + "step": 2655, + "time_per_iteration": 2.6990878582000732 + }, + { + "auxiliary_loss_clip": 0.01112525, + "auxiliary_loss_mlp": 0.01046346, + "balance_loss_clip": 1.04670715, + "balance_loss_mlp": 1.02849984, + "epoch": 0.15968735908612655, + "flos": 15268032800640.0, + "grad_norm": 2.0964800101687486, + "language_loss": 0.73768878, + "learning_rate": 3.826149806485631e-06, + "loss": 0.75927746, + "num_input_tokens_seen": 57513700, + "step": 2656, + "time_per_iteration": 2.7409873008728027 + }, + { + "auxiliary_loss_clip": 0.01111218, + "auxiliary_loss_mlp": 0.01039948, + "balance_loss_clip": 1.04749918, + "balance_loss_mlp": 1.02220988, + "epoch": 0.15974748233879452, + "flos": 52665726695040.0, + "grad_norm": 2.516351978408242, + "language_loss": 0.77637637, + "learning_rate": 3.825990952549713e-06, + "loss": 0.79788804, + "num_input_tokens_seen": 57536180, + "step": 2657, + "time_per_iteration": 2.984161376953125 + }, + { + "auxiliary_loss_clip": 0.01142397, + "auxiliary_loss_mlp": 0.01048058, + "balance_loss_clip": 1.05276513, + "balance_loss_mlp": 1.02984321, + "epoch": 0.1598076055914625, + "flos": 18733232730240.0, + "grad_norm": 2.1741432296797303, + "language_loss": 0.74654955, + "learning_rate": 3.825832029372035e-06, + "loss": 0.76845407, + "num_input_tokens_seen": 57555025, + "step": 2658, + "time_per_iteration": 2.6795172691345215 + }, + { + "auxiliary_loss_clip": 0.01137294, + "auxiliary_loss_mlp": 0.01047097, + "balance_loss_clip": 1.05887127, + "balance_loss_mlp": 1.02581763, + "epoch": 0.15986772884413047, + "flos": 34349238535680.0, + "grad_norm": 2.2676743120149916, + "language_loss": 0.75164986, + "learning_rate": 3.825673036958624e-06, + "loss": 0.77349377, + "num_input_tokens_seen": 57577660, + "step": 2659, + "time_per_iteration": 2.885744094848633 + }, + { + "auxiliary_loss_clip": 0.01122752, + "auxiliary_loss_mlp": 0.0105323, + "balance_loss_clip": 1.0512991, + "balance_loss_mlp": 1.0334295, + "epoch": 0.15992785209679844, + "flos": 22054969739520.0, + "grad_norm": 2.181311046841435, + "language_loss": 0.90998709, + "learning_rate": 3.825513975315508e-06, + "loss": 0.93174696, + "num_input_tokens_seen": 57596335, + "step": 2660, + "time_per_iteration": 2.7562267780303955 + }, + { + "auxiliary_loss_clip": 0.01114547, + "auxiliary_loss_mlp": 0.01058378, + "balance_loss_clip": 1.05538487, + "balance_loss_mlp": 1.03590751, + "epoch": 0.1599879753494664, + "flos": 33066652625280.0, + "grad_norm": 1.746468400789071, + "language_loss": 0.77724659, + "learning_rate": 3.82535484444872e-06, + "loss": 0.79897583, + "num_input_tokens_seen": 57616830, + "step": 2661, + "time_per_iteration": 2.9896914958953857 + }, + { + "auxiliary_loss_clip": 0.0113781, + "auxiliary_loss_mlp": 0.00777461, + "balance_loss_clip": 1.05382478, + "balance_loss_mlp": 1.00132632, + "epoch": 0.16004809860213437, + "flos": 28038010343040.0, + "grad_norm": 2.0483033922540086, + "language_loss": 0.74442393, + "learning_rate": 3.825195644364292e-06, + "loss": 0.76357663, + "num_input_tokens_seen": 57635515, + "step": 2662, + "time_per_iteration": 2.7993714809417725 + }, + { + "auxiliary_loss_clip": 0.01135674, + "auxiliary_loss_mlp": 0.00780783, + "balance_loss_clip": 1.05392313, + "balance_loss_mlp": 1.0016191, + "epoch": 0.16010822185480234, + "flos": 22780113505920.0, + "grad_norm": 2.9903694104875984, + "language_loss": 0.82515085, + "learning_rate": 3.825036375068263e-06, + "loss": 0.84431541, + "num_input_tokens_seen": 57654250, + "step": 2663, + "time_per_iteration": 2.678490161895752 + }, + { + "auxiliary_loss_clip": 0.01112205, + "auxiliary_loss_mlp": 0.01044917, + "balance_loss_clip": 1.05182636, + "balance_loss_mlp": 1.02574801, + "epoch": 0.16016834510747033, + "flos": 20084012611200.0, + "grad_norm": 2.06786422122115, + "language_loss": 0.7951405, + "learning_rate": 3.824877036566672e-06, + "loss": 0.81671166, + "num_input_tokens_seen": 57672645, + "step": 2664, + "time_per_iteration": 2.819880962371826 + }, + { + "auxiliary_loss_clip": 0.01151449, + "auxiliary_loss_mlp": 0.01048023, + "balance_loss_clip": 1.05374622, + "balance_loss_mlp": 1.02886605, + "epoch": 0.1602284683601383, + "flos": 21173829206400.0, + "grad_norm": 1.6697703441146605, + "language_loss": 0.93748474, + "learning_rate": 3.824717628865561e-06, + "loss": 0.95947945, + "num_input_tokens_seen": 57691055, + "step": 2665, + "time_per_iteration": 2.697660446166992 + }, + { + "auxiliary_loss_clip": 0.01127607, + "auxiliary_loss_mlp": 0.01047415, + "balance_loss_clip": 1.05185676, + "balance_loss_mlp": 1.02774525, + "epoch": 0.16028859161280626, + "flos": 14647568244480.0, + "grad_norm": 2.9655602739253095, + "language_loss": 0.85237324, + "learning_rate": 3.824558151970974e-06, + "loss": 0.87412339, + "num_input_tokens_seen": 57707235, + "step": 2666, + "time_per_iteration": 4.282273530960083 + }, + { + "auxiliary_loss_clip": 0.01129818, + "auxiliary_loss_mlp": 0.00777125, + "balance_loss_clip": 1.05257225, + "balance_loss_mlp": 1.00145936, + "epoch": 0.16034871486547422, + "flos": 20990325600000.0, + "grad_norm": 1.8366839898970433, + "language_loss": 0.81284773, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.83191717, + "num_input_tokens_seen": 57724190, + "step": 2667, + "time_per_iteration": 2.69508695602417 + }, + { + "auxiliary_loss_clip": 0.0116556, + "auxiliary_loss_mlp": 0.01046526, + "balance_loss_clip": 1.06089485, + "balance_loss_mlp": 1.02643883, + "epoch": 0.1604088381181422, + "flos": 21397732634880.0, + "grad_norm": 1.958935842080623, + "language_loss": 0.74031079, + "learning_rate": 3.824238990625567e-06, + "loss": 0.76243162, + "num_input_tokens_seen": 57743620, + "step": 2668, + "time_per_iteration": 4.2559425830841064 + }, + { + "auxiliary_loss_clip": 0.01148853, + "auxiliary_loss_mlp": 0.01051992, + "balance_loss_clip": 1.05547619, + "balance_loss_mlp": 1.03240585, + "epoch": 0.16046896137081015, + "flos": 23877040993920.0, + "grad_norm": 1.7737626564305047, + "language_loss": 0.77495629, + "learning_rate": 3.824079306186848e-06, + "loss": 0.7969647, + "num_input_tokens_seen": 57764810, + "step": 2669, + "time_per_iteration": 2.6424050331115723 + }, + { + "auxiliary_loss_clip": 0.01097339, + "auxiliary_loss_mlp": 0.01012737, + "balance_loss_clip": 1.06351233, + "balance_loss_mlp": 1.00986385, + "epoch": 0.16052908462347812, + "flos": 59806709015040.0, + "grad_norm": 0.8041290684345284, + "language_loss": 0.5549804, + "learning_rate": 3.823919552578861e-06, + "loss": 0.57608116, + "num_input_tokens_seen": 57824390, + "step": 2670, + "time_per_iteration": 4.765664100646973 + }, + { + "auxiliary_loss_clip": 0.01149639, + "auxiliary_loss_mlp": 0.01043383, + "balance_loss_clip": 1.05322218, + "balance_loss_mlp": 1.02430916, + "epoch": 0.1605892078761461, + "flos": 18296559089280.0, + "grad_norm": 2.6306224128650464, + "language_loss": 0.77778888, + "learning_rate": 3.82375972980766e-06, + "loss": 0.7997191, + "num_input_tokens_seen": 57843665, + "step": 2671, + "time_per_iteration": 2.6876416206359863 + }, + { + "auxiliary_loss_clip": 0.01151164, + "auxiliary_loss_mlp": 0.01043962, + "balance_loss_clip": 1.05529547, + "balance_loss_mlp": 1.02503204, + "epoch": 0.16064933112881408, + "flos": 32160734686080.0, + "grad_norm": 1.9167251889277674, + "language_loss": 0.64766788, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.66961908, + "num_input_tokens_seen": 57863305, + "step": 2672, + "time_per_iteration": 2.7102553844451904 + }, + { + "auxiliary_loss_clip": 0.01150206, + "auxiliary_loss_mlp": 0.01046785, + "balance_loss_clip": 1.05674481, + "balance_loss_mlp": 1.02554154, + "epoch": 0.16070945438148204, + "flos": 19828795501440.0, + "grad_norm": 2.045175098484539, + "language_loss": 0.85708207, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.87905198, + "num_input_tokens_seen": 57883025, + "step": 2673, + "time_per_iteration": 2.656360626220703 + }, + { + "auxiliary_loss_clip": 0.01125542, + "auxiliary_loss_mlp": 0.01055838, + "balance_loss_clip": 1.05366015, + "balance_loss_mlp": 1.03716969, + "epoch": 0.16076957763415, + "flos": 18913144976640.0, + "grad_norm": 2.339006860757087, + "language_loss": 0.7289716, + "learning_rate": 3.823279846575403e-06, + "loss": 0.75078535, + "num_input_tokens_seen": 57901430, + "step": 2674, + "time_per_iteration": 2.7122414112091064 + }, + { + "auxiliary_loss_clip": 0.01150063, + "auxiliary_loss_mlp": 0.01045468, + "balance_loss_clip": 1.05416465, + "balance_loss_mlp": 1.02464211, + "epoch": 0.16082970088681797, + "flos": 16764358590720.0, + "grad_norm": 1.9341682597436423, + "language_loss": 0.84438515, + "learning_rate": 3.823119747211986e-06, + "loss": 0.86634052, + "num_input_tokens_seen": 57919550, + "step": 2675, + "time_per_iteration": 2.6646435260772705 + }, + { + "auxiliary_loss_clip": 0.01116221, + "auxiliary_loss_mlp": 0.01049343, + "balance_loss_clip": 1.05220723, + "balance_loss_mlp": 1.02823126, + "epoch": 0.16088982413948594, + "flos": 35150261783040.0, + "grad_norm": 1.871909119220515, + "language_loss": 0.82216591, + "learning_rate": 3.822959578715685e-06, + "loss": 0.84382153, + "num_input_tokens_seen": 57939890, + "step": 2676, + "time_per_iteration": 2.8457534313201904 + }, + { + "auxiliary_loss_clip": 0.01151157, + "auxiliary_loss_mlp": 0.01049874, + "balance_loss_clip": 1.05746996, + "balance_loss_mlp": 1.03162253, + "epoch": 0.1609499473921539, + "flos": 18625105814400.0, + "grad_norm": 2.1166154816193923, + "language_loss": 0.73485494, + "learning_rate": 3.822799341092573e-06, + "loss": 0.75686526, + "num_input_tokens_seen": 57957410, + "step": 2677, + "time_per_iteration": 2.65387225151062 + }, + { + "auxiliary_loss_clip": 0.01138188, + "auxiliary_loss_mlp": 0.01044363, + "balance_loss_clip": 1.05438483, + "balance_loss_mlp": 1.02537322, + "epoch": 0.1610100706448219, + "flos": 33145728416640.0, + "grad_norm": 3.229282061984371, + "language_loss": 0.76305777, + "learning_rate": 3.822639034348728e-06, + "loss": 0.78488332, + "num_input_tokens_seen": 57977900, + "step": 2678, + "time_per_iteration": 2.836071014404297 + }, + { + "auxiliary_loss_clip": 0.01148252, + "auxiliary_loss_mlp": 0.01047887, + "balance_loss_clip": 1.05379987, + "balance_loss_mlp": 1.02789569, + "epoch": 0.16107019389748986, + "flos": 34676707852800.0, + "grad_norm": 8.295814069484678, + "language_loss": 0.70340431, + "learning_rate": 3.822478658490228e-06, + "loss": 0.7253657, + "num_input_tokens_seen": 57998210, + "step": 2679, + "time_per_iteration": 2.771185874938965 + }, + { + "auxiliary_loss_clip": 0.01059502, + "auxiliary_loss_mlp": 0.00758644, + "balance_loss_clip": 1.04695845, + "balance_loss_mlp": 1.00150955, + "epoch": 0.16113031715015783, + "flos": 65713403260800.0, + "grad_norm": 0.7819629653273137, + "language_loss": 0.51843339, + "learning_rate": 3.822318213523154e-06, + "loss": 0.53661484, + "num_input_tokens_seen": 58059420, + "step": 2680, + "time_per_iteration": 3.3107378482818604 + }, + { + "auxiliary_loss_clip": 0.01144342, + "auxiliary_loss_mlp": 0.01047358, + "balance_loss_clip": 1.05360317, + "balance_loss_mlp": 1.02632904, + "epoch": 0.1611904404028258, + "flos": 20810413353600.0, + "grad_norm": 1.6718368455031125, + "language_loss": 0.8028667, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.82478368, + "num_input_tokens_seen": 58078370, + "step": 2681, + "time_per_iteration": 2.6986513137817383 + }, + { + "auxiliary_loss_clip": 0.01139192, + "auxiliary_loss_mlp": 0.01055518, + "balance_loss_clip": 1.05603266, + "balance_loss_mlp": 1.03602743, + "epoch": 0.16125056365549376, + "flos": 27013335062400.0, + "grad_norm": 2.154781054673542, + "language_loss": 0.68957973, + "learning_rate": 3.821997116287627e-06, + "loss": 0.71152687, + "num_input_tokens_seen": 58097395, + "step": 2682, + "time_per_iteration": 2.794686794281006 + }, + { + "auxiliary_loss_clip": 0.01139216, + "auxiliary_loss_mlp": 0.01052349, + "balance_loss_clip": 1.05670619, + "balance_loss_mlp": 1.03195262, + "epoch": 0.16131068690816172, + "flos": 19276524915840.0, + "grad_norm": 1.9802191055590168, + "language_loss": 0.87362224, + "learning_rate": 3.821836464031348e-06, + "loss": 0.89553785, + "num_input_tokens_seen": 58115630, + "step": 2683, + "time_per_iteration": 2.703634262084961 + }, + { + "auxiliary_loss_clip": 0.01165497, + "auxiliary_loss_mlp": 0.0105575, + "balance_loss_clip": 1.05714059, + "balance_loss_mlp": 1.03491259, + "epoch": 0.16137081016082971, + "flos": 35337931367040.0, + "grad_norm": 1.939499216066865, + "language_loss": 0.74143028, + "learning_rate": 3.821675742690849e-06, + "loss": 0.76364273, + "num_input_tokens_seen": 58138655, + "step": 2684, + "time_per_iteration": 2.7890264987945557 + }, + { + "auxiliary_loss_clip": 0.01136683, + "auxiliary_loss_mlp": 0.00778989, + "balance_loss_clip": 1.05435085, + "balance_loss_mlp": 1.00176883, + "epoch": 0.16143093341349768, + "flos": 34235257703040.0, + "grad_norm": 1.9009911635557044, + "language_loss": 0.70506597, + "learning_rate": 3.821514952272223e-06, + "loss": 0.72422272, + "num_input_tokens_seen": 58157440, + "step": 2685, + "time_per_iteration": 2.803942918777466 + }, + { + "auxiliary_loss_clip": 0.01116315, + "auxiliary_loss_mlp": 0.01059092, + "balance_loss_clip": 1.05291295, + "balance_loss_mlp": 1.03757524, + "epoch": 0.16149105666616564, + "flos": 27999262546560.0, + "grad_norm": 2.295686008167468, + "language_loss": 0.72060591, + "learning_rate": 3.821354092781567e-06, + "loss": 0.74236, + "num_input_tokens_seen": 58176660, + "step": 2686, + "time_per_iteration": 2.850309133529663 + }, + { + "auxiliary_loss_clip": 0.01153803, + "auxiliary_loss_mlp": 0.01048887, + "balance_loss_clip": 1.05603862, + "balance_loss_mlp": 1.02922952, + "epoch": 0.1615511799188336, + "flos": 19422214479360.0, + "grad_norm": 2.056921120199424, + "language_loss": 0.81720114, + "learning_rate": 3.821193164224981e-06, + "loss": 0.83922803, + "num_input_tokens_seen": 58195085, + "step": 2687, + "time_per_iteration": 2.7077832221984863 + }, + { + "auxiliary_loss_clip": 0.01154388, + "auxiliary_loss_mlp": 0.01050682, + "balance_loss_clip": 1.05335689, + "balance_loss_mlp": 1.02910483, + "epoch": 0.16161130317150157, + "flos": 22854915578880.0, + "grad_norm": 1.6747986106054085, + "language_loss": 0.71680355, + "learning_rate": 3.821032166608568e-06, + "loss": 0.73885429, + "num_input_tokens_seen": 58213540, + "step": 2688, + "time_per_iteration": 2.700073480606079 + }, + { + "auxiliary_loss_clip": 0.0112226, + "auxiliary_loss_mlp": 0.0105252, + "balance_loss_clip": 1.0517168, + "balance_loss_mlp": 1.03330338, + "epoch": 0.16167142642416954, + "flos": 26110577520000.0, + "grad_norm": 2.2887064413695253, + "language_loss": 0.76168394, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.78343177, + "num_input_tokens_seen": 58236995, + "step": 2689, + "time_per_iteration": 2.846964120864868 + }, + { + "auxiliary_loss_clip": 0.01166324, + "auxiliary_loss_mlp": 0.01052979, + "balance_loss_clip": 1.05979431, + "balance_loss_mlp": 1.03308284, + "epoch": 0.1617315496768375, + "flos": 22779646629120.0, + "grad_norm": 2.045037041298705, + "language_loss": 0.87211925, + "learning_rate": 3.820709964220683e-06, + "loss": 0.89431226, + "num_input_tokens_seen": 58257230, + "step": 2690, + "time_per_iteration": 2.704497814178467 + }, + { + "auxiliary_loss_clip": 0.01143898, + "auxiliary_loss_mlp": 0.01046571, + "balance_loss_clip": 1.05318451, + "balance_loss_mlp": 1.02890396, + "epoch": 0.1617916729295055, + "flos": 22017299351040.0, + "grad_norm": 1.7518031225399346, + "language_loss": 0.87899524, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.90089989, + "num_input_tokens_seen": 58277080, + "step": 2691, + "time_per_iteration": 2.6763153076171875 + }, + { + "auxiliary_loss_clip": 0.01150265, + "auxiliary_loss_mlp": 0.01053114, + "balance_loss_clip": 1.05237532, + "balance_loss_mlp": 1.03142977, + "epoch": 0.16185179618217346, + "flos": 23438248450560.0, + "grad_norm": 2.1723450057475313, + "language_loss": 0.81989783, + "learning_rate": 3.820387485666784e-06, + "loss": 0.84193164, + "num_input_tokens_seen": 58294815, + "step": 2692, + "time_per_iteration": 2.6381001472473145 + }, + { + "auxiliary_loss_clip": 0.01167881, + "auxiliary_loss_mlp": 0.0104606, + "balance_loss_clip": 1.05555534, + "balance_loss_mlp": 1.02499604, + "epoch": 0.16191191943484143, + "flos": 25666110627840.0, + "grad_norm": 2.194958172554253, + "language_loss": 0.81381011, + "learning_rate": 3.820226142842862e-06, + "loss": 0.83594954, + "num_input_tokens_seen": 58313215, + "step": 2693, + "time_per_iteration": 2.6366944313049316 + }, + { + "auxiliary_loss_clip": 0.01164466, + "auxiliary_loss_mlp": 0.01058298, + "balance_loss_clip": 1.0587461, + "balance_loss_mlp": 1.03991616, + "epoch": 0.1619720426875094, + "flos": 23477355383040.0, + "grad_norm": 2.778189532536263, + "language_loss": 0.83837044, + "learning_rate": 3.820064730995783e-06, + "loss": 0.86059809, + "num_input_tokens_seen": 58333215, + "step": 2694, + "time_per_iteration": 2.7802140712738037 + }, + { + "auxiliary_loss_clip": 0.01116209, + "auxiliary_loss_mlp": 0.0105764, + "balance_loss_clip": 1.04927421, + "balance_loss_mlp": 1.0366354, + "epoch": 0.16203216594017736, + "flos": 24133658734080.0, + "grad_norm": 1.8201511645490482, + "language_loss": 0.69709098, + "learning_rate": 3.819903250131667e-06, + "loss": 0.71882945, + "num_input_tokens_seen": 58351160, + "step": 2695, + "time_per_iteration": 2.756904125213623 + }, + { + "auxiliary_loss_clip": 0.01155526, + "auxiliary_loss_mlp": 0.01050837, + "balance_loss_clip": 1.05799723, + "balance_loss_mlp": 1.03026128, + "epoch": 0.16209228919284532, + "flos": 22340889999360.0, + "grad_norm": 2.1550523064219487, + "language_loss": 0.82986331, + "learning_rate": 3.819741700256637e-06, + "loss": 0.85192692, + "num_input_tokens_seen": 58368505, + "step": 2696, + "time_per_iteration": 2.651510238647461 + }, + { + "auxiliary_loss_clip": 0.01174193, + "auxiliary_loss_mlp": 0.01052819, + "balance_loss_clip": 1.05826569, + "balance_loss_mlp": 1.03095615, + "epoch": 0.1621524124455133, + "flos": 15815131827840.0, + "grad_norm": 2.9267990143146503, + "language_loss": 0.8862049, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.90847504, + "num_input_tokens_seen": 58385085, + "step": 2697, + "time_per_iteration": 2.5935380458831787 + }, + { + "auxiliary_loss_clip": 0.01158945, + "auxiliary_loss_mlp": 0.01045471, + "balance_loss_clip": 1.0552485, + "balance_loss_mlp": 1.02719641, + "epoch": 0.16221253569818128, + "flos": 30186688988160.0, + "grad_norm": 1.7480298293719791, + "language_loss": 0.80844599, + "learning_rate": 3.819418393498343e-06, + "loss": 0.83049017, + "num_input_tokens_seen": 58406985, + "step": 2698, + "time_per_iteration": 2.6685965061187744 + }, + { + "auxiliary_loss_clip": 0.01151678, + "auxiliary_loss_mlp": 0.01050084, + "balance_loss_clip": 1.05785704, + "balance_loss_mlp": 1.03060579, + "epoch": 0.16227265895084925, + "flos": 24605991601920.0, + "grad_norm": 1.590231062064763, + "language_loss": 0.77499473, + "learning_rate": 3.819256636627339e-06, + "loss": 0.79701245, + "num_input_tokens_seen": 58426205, + "step": 2699, + "time_per_iteration": 2.7206287384033203 + }, + { + "auxiliary_loss_clip": 0.01134482, + "auxiliary_loss_mlp": 0.01043888, + "balance_loss_clip": 1.0504272, + "balance_loss_mlp": 1.02510071, + "epoch": 0.1623327822035172, + "flos": 19573326996480.0, + "grad_norm": 2.299083669251571, + "language_loss": 0.85903585, + "learning_rate": 3.81909481076994e-06, + "loss": 0.88081944, + "num_input_tokens_seen": 58443830, + "step": 2700, + "time_per_iteration": 2.6440224647521973 + }, + { + "auxiliary_loss_clip": 0.01150266, + "auxiliary_loss_mlp": 0.00778348, + "balance_loss_clip": 1.05360484, + "balance_loss_mlp": 1.00180686, + "epoch": 0.16239290545618518, + "flos": 26468462678400.0, + "grad_norm": 1.7679372116400307, + "language_loss": 0.80424523, + "learning_rate": 3.818932915932284e-06, + "loss": 0.82353133, + "num_input_tokens_seen": 58464405, + "step": 2701, + "time_per_iteration": 2.6943976879119873 + }, + { + "auxiliary_loss_clip": 0.01144477, + "auxiliary_loss_mlp": 0.01046291, + "balance_loss_clip": 1.05771017, + "balance_loss_mlp": 1.02664542, + "epoch": 0.16245302870885314, + "flos": 15851940289920.0, + "grad_norm": 1.6539412057050027, + "language_loss": 0.72777367, + "learning_rate": 3.818770952120511e-06, + "loss": 0.74968135, + "num_input_tokens_seen": 58483295, + "step": 2702, + "time_per_iteration": 2.6914141178131104 + }, + { + "auxiliary_loss_clip": 0.01156069, + "auxiliary_loss_mlp": 0.01050141, + "balance_loss_clip": 1.05802381, + "balance_loss_mlp": 1.02896905, + "epoch": 0.1625131519615211, + "flos": 14756521173120.0, + "grad_norm": 1.8265391375227176, + "language_loss": 0.7273894, + "learning_rate": 3.81860891934076e-06, + "loss": 0.74945152, + "num_input_tokens_seen": 58501205, + "step": 2703, + "time_per_iteration": 2.6301820278167725 + }, + { + "auxiliary_loss_clip": 0.01165642, + "auxiliary_loss_mlp": 0.01050857, + "balance_loss_clip": 1.0553968, + "balance_loss_mlp": 1.02942359, + "epoch": 0.1625732752141891, + "flos": 28220508368640.0, + "grad_norm": 3.0329584489902666, + "language_loss": 0.70018482, + "learning_rate": 3.818446817599176e-06, + "loss": 0.72234988, + "num_input_tokens_seen": 58522315, + "step": 2704, + "time_per_iteration": 2.6667227745056152 + }, + { + "auxiliary_loss_clip": 0.01034679, + "auxiliary_loss_mlp": 0.01001657, + "balance_loss_clip": 1.03343439, + "balance_loss_mlp": 0.99865305, + "epoch": 0.16263339846685707, + "flos": 67327947688320.0, + "grad_norm": 0.7801109588151329, + "language_loss": 0.5336051, + "learning_rate": 3.818284646901907e-06, + "loss": 0.55396849, + "num_input_tokens_seen": 58586695, + "step": 2705, + "time_per_iteration": 4.808594465255737 + }, + { + "auxiliary_loss_clip": 0.01138628, + "auxiliary_loss_mlp": 0.00781324, + "balance_loss_clip": 1.0539608, + "balance_loss_mlp": 1.00171995, + "epoch": 0.16269352171952503, + "flos": 14319165173760.0, + "grad_norm": 2.3827832530074455, + "language_loss": 0.7536028, + "learning_rate": 3.818122407255102e-06, + "loss": 0.77280229, + "num_input_tokens_seen": 58602435, + "step": 2706, + "time_per_iteration": 4.126614570617676 + }, + { + "auxiliary_loss_clip": 0.01130684, + "auxiliary_loss_mlp": 0.01047489, + "balance_loss_clip": 1.0523324, + "balance_loss_mlp": 1.02859437, + "epoch": 0.162753644972193, + "flos": 28361205941760.0, + "grad_norm": 2.2272392184651038, + "language_loss": 0.72203928, + "learning_rate": 3.817960098664914e-06, + "loss": 0.74382102, + "num_input_tokens_seen": 58621275, + "step": 2707, + "time_per_iteration": 4.2739410400390625 + }, + { + "auxiliary_loss_clip": 0.01142142, + "auxiliary_loss_mlp": 0.01047652, + "balance_loss_clip": 1.05433679, + "balance_loss_mlp": 1.02898431, + "epoch": 0.16281376822486096, + "flos": 19937856170880.0, + "grad_norm": 3.192481802987827, + "language_loss": 0.83481139, + "learning_rate": 3.817797721137495e-06, + "loss": 0.85670936, + "num_input_tokens_seen": 58637550, + "step": 2708, + "time_per_iteration": 2.7163965702056885 + }, + { + "auxiliary_loss_clip": 0.01101561, + "auxiliary_loss_mlp": 0.00781217, + "balance_loss_clip": 1.04896522, + "balance_loss_mlp": 1.00177419, + "epoch": 0.16287389147752893, + "flos": 21251719848960.0, + "grad_norm": 2.2850459718507654, + "language_loss": 0.86162847, + "learning_rate": 3.817635274679006e-06, + "loss": 0.88045627, + "num_input_tokens_seen": 58654135, + "step": 2709, + "time_per_iteration": 4.474989652633667 + }, + { + "auxiliary_loss_clip": 0.0114031, + "auxiliary_loss_mlp": 0.00777602, + "balance_loss_clip": 1.05267572, + "balance_loss_mlp": 1.00172114, + "epoch": 0.1629340147301969, + "flos": 19244672530560.0, + "grad_norm": 2.581053296112052, + "language_loss": 0.91410124, + "learning_rate": 3.817472759295605e-06, + "loss": 0.93328035, + "num_input_tokens_seen": 58674320, + "step": 2710, + "time_per_iteration": 2.6951892375946045 + }, + { + "auxiliary_loss_clip": 0.01118597, + "auxiliary_loss_mlp": 0.01054854, + "balance_loss_clip": 1.05254805, + "balance_loss_mlp": 1.03451669, + "epoch": 0.16299413798286488, + "flos": 21249816428160.0, + "grad_norm": 2.4322540773438437, + "language_loss": 0.81690979, + "learning_rate": 3.817310174993453e-06, + "loss": 0.83864427, + "num_input_tokens_seen": 58691000, + "step": 2711, + "time_per_iteration": 2.7854437828063965 + }, + { + "auxiliary_loss_clip": 0.01146056, + "auxiliary_loss_mlp": 0.01040648, + "balance_loss_clip": 1.04954815, + "balance_loss_mlp": 1.02107334, + "epoch": 0.16305426123553285, + "flos": 18770579896320.0, + "grad_norm": 3.73256798888747, + "language_loss": 0.8091476, + "learning_rate": 3.817147521778719e-06, + "loss": 0.83101463, + "num_input_tokens_seen": 58710230, + "step": 2712, + "time_per_iteration": 2.834291458129883 + }, + { + "auxiliary_loss_clip": 0.01171211, + "auxiliary_loss_mlp": 0.01053015, + "balance_loss_clip": 1.0590024, + "balance_loss_mlp": 1.03273714, + "epoch": 0.16311438448820081, + "flos": 22087648137600.0, + "grad_norm": 2.3460895846171996, + "language_loss": 0.7681579, + "learning_rate": 3.816984799657568e-06, + "loss": 0.79040015, + "num_input_tokens_seen": 58728610, + "step": 2713, + "time_per_iteration": 2.6188278198242188 + }, + { + "auxiliary_loss_clip": 0.01156539, + "auxiliary_loss_mlp": 0.0105792, + "balance_loss_clip": 1.06240916, + "balance_loss_mlp": 1.03832221, + "epoch": 0.16317450774086878, + "flos": 16467700164480.0, + "grad_norm": 2.543173325075216, + "language_loss": 0.79012156, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.81226611, + "num_input_tokens_seen": 58744385, + "step": 2714, + "time_per_iteration": 2.6534018516540527 + }, + { + "auxiliary_loss_clip": 0.01149567, + "auxiliary_loss_mlp": 0.01056152, + "balance_loss_clip": 1.05467987, + "balance_loss_mlp": 1.03724504, + "epoch": 0.16323463099353674, + "flos": 24352929308160.0, + "grad_norm": 1.614702766215493, + "language_loss": 0.77693665, + "learning_rate": 3.816659148720702e-06, + "loss": 0.79899377, + "num_input_tokens_seen": 58763905, + "step": 2715, + "time_per_iteration": 2.856006383895874 + }, + { + "auxiliary_loss_clip": 0.01129437, + "auxiliary_loss_mlp": 0.01044046, + "balance_loss_clip": 1.04810584, + "balance_loss_mlp": 1.02525854, + "epoch": 0.1632947542462047, + "flos": 24900782520960.0, + "grad_norm": 2.374975046722651, + "language_loss": 0.81513858, + "learning_rate": 3.816496219917336e-06, + "loss": 0.83687335, + "num_input_tokens_seen": 58785580, + "step": 2716, + "time_per_iteration": 2.6750845909118652 + }, + { + "auxiliary_loss_clip": 0.01144393, + "auxiliary_loss_mlp": 0.01055927, + "balance_loss_clip": 1.05851114, + "balance_loss_mlp": 1.03703237, + "epoch": 0.1633548774988727, + "flos": 24900279730560.0, + "grad_norm": 1.8186679286330678, + "language_loss": 0.86522418, + "learning_rate": 3.816333222232251e-06, + "loss": 0.88722742, + "num_input_tokens_seen": 58806075, + "step": 2717, + "time_per_iteration": 2.761622428894043 + }, + { + "auxiliary_loss_clip": 0.01135377, + "auxiliary_loss_mlp": 0.01045964, + "balance_loss_clip": 1.05334044, + "balance_loss_mlp": 1.0274632, + "epoch": 0.16341500075154067, + "flos": 30441798357120.0, + "grad_norm": 1.8799656187942837, + "language_loss": 0.76924133, + "learning_rate": 3.816170155671629e-06, + "loss": 0.79105473, + "num_input_tokens_seen": 58827405, + "step": 2718, + "time_per_iteration": 2.7946770191192627 + }, + { + "auxiliary_loss_clip": 0.01145146, + "auxiliary_loss_mlp": 0.01043682, + "balance_loss_clip": 1.05553615, + "balance_loss_mlp": 1.02566922, + "epoch": 0.16347512400420863, + "flos": 22784530878720.0, + "grad_norm": 2.2449478392049906, + "language_loss": 0.73827291, + "learning_rate": 3.816007020241652e-06, + "loss": 0.76016116, + "num_input_tokens_seen": 58847205, + "step": 2719, + "time_per_iteration": 2.719980478286743 + }, + { + "auxiliary_loss_clip": 0.01128361, + "auxiliary_loss_mlp": 0.01045887, + "balance_loss_clip": 1.04900515, + "balance_loss_mlp": 1.02732563, + "epoch": 0.1635352472568766, + "flos": 22633274707200.0, + "grad_norm": 1.7092252575708884, + "language_loss": 0.72267497, + "learning_rate": 3.815843815948507e-06, + "loss": 0.74441749, + "num_input_tokens_seen": 58866865, + "step": 2720, + "time_per_iteration": 2.8737292289733887 + }, + { + "auxiliary_loss_clip": 0.01109456, + "auxiliary_loss_mlp": 0.01049703, + "balance_loss_clip": 1.05004287, + "balance_loss_mlp": 1.02840054, + "epoch": 0.16359537050954456, + "flos": 15522998515200.0, + "grad_norm": 2.1621365878543153, + "language_loss": 0.75120997, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.77280164, + "num_input_tokens_seen": 58885200, + "step": 2721, + "time_per_iteration": 2.785296678543091 + }, + { + "auxiliary_loss_clip": 0.01110342, + "auxiliary_loss_mlp": 0.01059955, + "balance_loss_clip": 1.04597676, + "balance_loss_mlp": 1.03734064, + "epoch": 0.16365549376221253, + "flos": 22090162089600.0, + "grad_norm": 1.9032438792006017, + "language_loss": 0.79073942, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.81244236, + "num_input_tokens_seen": 58906385, + "step": 2722, + "time_per_iteration": 2.7850708961486816 + }, + { + "auxiliary_loss_clip": 0.01149809, + "auxiliary_loss_mlp": 0.00778798, + "balance_loss_clip": 1.05395257, + "balance_loss_mlp": 1.00171757, + "epoch": 0.1637156170148805, + "flos": 24060400945920.0, + "grad_norm": 2.3019049903761215, + "language_loss": 0.84954333, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.86882937, + "num_input_tokens_seen": 58925040, + "step": 2723, + "time_per_iteration": 2.7268764972686768 + }, + { + "auxiliary_loss_clip": 0.01108328, + "auxiliary_loss_mlp": 0.01044851, + "balance_loss_clip": 1.04805517, + "balance_loss_mlp": 1.02493143, + "epoch": 0.1637757402675485, + "flos": 26685362954880.0, + "grad_norm": 1.8985615531712963, + "language_loss": 0.71018666, + "learning_rate": 3.815190310268058e-06, + "loss": 0.73171842, + "num_input_tokens_seen": 58944790, + "step": 2724, + "time_per_iteration": 2.7691783905029297 + }, + { + "auxiliary_loss_clip": 0.01118053, + "auxiliary_loss_mlp": 0.01041883, + "balance_loss_clip": 1.05226958, + "balance_loss_mlp": 1.02364373, + "epoch": 0.16383586352021645, + "flos": 16106941918080.0, + "grad_norm": 2.1059770262776136, + "language_loss": 0.70552838, + "learning_rate": 3.815026761751955e-06, + "loss": 0.72712779, + "num_input_tokens_seen": 58962500, + "step": 2725, + "time_per_iteration": 2.6936957836151123 + }, + { + "auxiliary_loss_clip": 0.01112368, + "auxiliary_loss_mlp": 0.01046594, + "balance_loss_clip": 1.04912174, + "balance_loss_mlp": 1.028391, + "epoch": 0.16389598677288442, + "flos": 19165991788800.0, + "grad_norm": 2.27810298992254, + "language_loss": 0.88491893, + "learning_rate": 3.814863144409855e-06, + "loss": 0.90650856, + "num_input_tokens_seen": 58980355, + "step": 2726, + "time_per_iteration": 2.7967143058776855 + }, + { + "auxiliary_loss_clip": 0.01157668, + "auxiliary_loss_mlp": 0.0105068, + "balance_loss_clip": 1.06062055, + "balance_loss_mlp": 1.03099847, + "epoch": 0.16395611002555238, + "flos": 21507008785920.0, + "grad_norm": 2.0584475237926303, + "language_loss": 0.7469939, + "learning_rate": 3.814699458247963e-06, + "loss": 0.7690773, + "num_input_tokens_seen": 58999505, + "step": 2727, + "time_per_iteration": 2.6818623542785645 + }, + { + "auxiliary_loss_clip": 0.01150971, + "auxiliary_loss_mlp": 0.01052077, + "balance_loss_clip": 1.0570507, + "balance_loss_mlp": 1.03527999, + "epoch": 0.16401623327822035, + "flos": 21470918595840.0, + "grad_norm": 1.6112579442237729, + "language_loss": 0.83097756, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.85300803, + "num_input_tokens_seen": 59017930, + "step": 2728, + "time_per_iteration": 2.675360918045044 + }, + { + "auxiliary_loss_clip": 0.01156153, + "auxiliary_loss_mlp": 0.01045609, + "balance_loss_clip": 1.05826735, + "balance_loss_mlp": 1.02602315, + "epoch": 0.1640763565308883, + "flos": 13626232928640.0, + "grad_norm": 2.5738755626941106, + "language_loss": 0.84892929, + "learning_rate": 3.814371879489633e-06, + "loss": 0.87094688, + "num_input_tokens_seen": 59035130, + "step": 2729, + "time_per_iteration": 2.7004599571228027 + }, + { + "auxiliary_loss_clip": 0.01167293, + "auxiliary_loss_mlp": 0.01048461, + "balance_loss_clip": 1.0591594, + "balance_loss_mlp": 1.03053224, + "epoch": 0.16413647978355628, + "flos": 15451464579840.0, + "grad_norm": 1.9897225699042427, + "language_loss": 0.72895479, + "learning_rate": 3.814207986905616e-06, + "loss": 0.75111228, + "num_input_tokens_seen": 59053080, + "step": 2730, + "time_per_iteration": 2.593179702758789 + }, + { + "auxiliary_loss_clip": 0.01142509, + "auxiliary_loss_mlp": 0.01050071, + "balance_loss_clip": 1.05208349, + "balance_loss_mlp": 1.02908981, + "epoch": 0.16419660303622427, + "flos": 45878682015360.0, + "grad_norm": 1.6754501336017709, + "language_loss": 0.74384654, + "learning_rate": 3.814044025526651e-06, + "loss": 0.76577234, + "num_input_tokens_seen": 59075610, + "step": 2731, + "time_per_iteration": 2.8702962398529053 + }, + { + "auxiliary_loss_clip": 0.01122791, + "auxiliary_loss_mlp": 0.01047176, + "balance_loss_clip": 1.05006754, + "balance_loss_mlp": 1.02650499, + "epoch": 0.16425672628889224, + "flos": 18952826526720.0, + "grad_norm": 2.031351475505915, + "language_loss": 0.79190683, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.8136065, + "num_input_tokens_seen": 59094555, + "step": 2732, + "time_per_iteration": 2.734529972076416 + }, + { + "auxiliary_loss_clip": 0.01141118, + "auxiliary_loss_mlp": 0.01047385, + "balance_loss_clip": 1.05340672, + "balance_loss_mlp": 1.02796555, + "epoch": 0.1643168495415602, + "flos": 24312996362880.0, + "grad_norm": 2.250003976384769, + "language_loss": 0.69526887, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.71715385, + "num_input_tokens_seen": 59113515, + "step": 2733, + "time_per_iteration": 2.672377109527588 + }, + { + "auxiliary_loss_clip": 0.01143332, + "auxiliary_loss_mlp": 0.01053232, + "balance_loss_clip": 1.05603123, + "balance_loss_mlp": 1.0325135, + "epoch": 0.16437697279422817, + "flos": 26428421992320.0, + "grad_norm": 2.000873580428856, + "language_loss": 0.80976766, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.83173329, + "num_input_tokens_seen": 59133275, + "step": 2734, + "time_per_iteration": 2.710293769836426 + }, + { + "auxiliary_loss_clip": 0.01135758, + "auxiliary_loss_mlp": 0.01056722, + "balance_loss_clip": 1.05488348, + "balance_loss_mlp": 1.03470409, + "epoch": 0.16443709604689613, + "flos": 34532239351680.0, + "grad_norm": 2.100664117201308, + "language_loss": 0.81810421, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.840029, + "num_input_tokens_seen": 59154095, + "step": 2735, + "time_per_iteration": 2.8074140548706055 + }, + { + "auxiliary_loss_clip": 0.01070875, + "auxiliary_loss_mlp": 0.01044313, + "balance_loss_clip": 1.04323888, + "balance_loss_mlp": 1.02508426, + "epoch": 0.1644972192995641, + "flos": 23258048895360.0, + "grad_norm": 2.405088987017839, + "language_loss": 0.78515649, + "learning_rate": 3.813223186925296e-06, + "loss": 0.80630839, + "num_input_tokens_seen": 59173795, + "step": 2736, + "time_per_iteration": 2.839087963104248 + }, + { + "auxiliary_loss_clip": 0.01147998, + "auxiliary_loss_mlp": 0.01054659, + "balance_loss_clip": 1.05859447, + "balance_loss_mlp": 1.03513288, + "epoch": 0.1645573425522321, + "flos": 26979543342720.0, + "grad_norm": 1.9462182296456145, + "language_loss": 0.81052899, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.83255553, + "num_input_tokens_seen": 59191610, + "step": 2737, + "time_per_iteration": 2.7328996658325195 + }, + { + "auxiliary_loss_clip": 0.01150424, + "auxiliary_loss_mlp": 0.01052207, + "balance_loss_clip": 1.0559026, + "balance_loss_mlp": 1.03065443, + "epoch": 0.16461746580490005, + "flos": 28731768600960.0, + "grad_norm": 1.8596348168124566, + "language_loss": 0.87449318, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.89651948, + "num_input_tokens_seen": 59213000, + "step": 2738, + "time_per_iteration": 2.7345526218414307 + }, + { + "auxiliary_loss_clip": 0.01139154, + "auxiliary_loss_mlp": 0.0106055, + "balance_loss_clip": 1.05534518, + "balance_loss_mlp": 1.04079759, + "epoch": 0.16467758905756802, + "flos": 24930156867840.0, + "grad_norm": 1.728421510231393, + "language_loss": 0.71997833, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.74197543, + "num_input_tokens_seen": 59232340, + "step": 2739, + "time_per_iteration": 2.7091422080993652 + }, + { + "auxiliary_loss_clip": 0.01154419, + "auxiliary_loss_mlp": 0.0105106, + "balance_loss_clip": 1.05673754, + "balance_loss_mlp": 1.0312835, + "epoch": 0.16473771231023598, + "flos": 24826519152000.0, + "grad_norm": 1.8559436932352185, + "language_loss": 0.81645715, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.83851194, + "num_input_tokens_seen": 59253950, + "step": 2740, + "time_per_iteration": 2.712658166885376 + }, + { + "auxiliary_loss_clip": 0.01114061, + "auxiliary_loss_mlp": 0.01068725, + "balance_loss_clip": 1.04991829, + "balance_loss_mlp": 1.04307163, + "epoch": 0.16479783556290395, + "flos": 39896072375040.0, + "grad_norm": 2.0528021789830837, + "language_loss": 0.69467485, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.71650267, + "num_input_tokens_seen": 59275545, + "step": 2741, + "time_per_iteration": 2.8629493713378906 + }, + { + "auxiliary_loss_clip": 0.01167543, + "auxiliary_loss_mlp": 0.01048721, + "balance_loss_clip": 1.05907226, + "balance_loss_mlp": 1.02906334, + "epoch": 0.16485795881557191, + "flos": 19897061299200.0, + "grad_norm": 1.7765193730452222, + "language_loss": 0.79811072, + "learning_rate": 3.812235911671472e-06, + "loss": 0.8202734, + "num_input_tokens_seen": 59293480, + "step": 2742, + "time_per_iteration": 2.626775026321411 + }, + { + "auxiliary_loss_clip": 0.01141681, + "auxiliary_loss_mlp": 0.01055663, + "balance_loss_clip": 1.05664062, + "balance_loss_mlp": 1.03477716, + "epoch": 0.16491808206823988, + "flos": 20556129997440.0, + "grad_norm": 1.91797408289014, + "language_loss": 0.8499459, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.87191939, + "num_input_tokens_seen": 59313435, + "step": 2743, + "time_per_iteration": 2.8218302726745605 + }, + { + "auxiliary_loss_clip": 0.01162447, + "auxiliary_loss_mlp": 0.01051969, + "balance_loss_clip": 1.05743837, + "balance_loss_mlp": 1.03196514, + "epoch": 0.16497820532090787, + "flos": 23800802376960.0, + "grad_norm": 1.4425200129075006, + "language_loss": 0.85558498, + "learning_rate": 3.811906270092265e-06, + "loss": 0.87772918, + "num_input_tokens_seen": 59331535, + "step": 2744, + "time_per_iteration": 4.206263542175293 + }, + { + "auxiliary_loss_clip": 0.01131671, + "auxiliary_loss_mlp": 0.0104676, + "balance_loss_clip": 1.05206287, + "balance_loss_mlp": 1.02812767, + "epoch": 0.16503832857357584, + "flos": 25482642935040.0, + "grad_norm": 1.6285200980820358, + "language_loss": 0.82770813, + "learning_rate": 3.811741346238036e-06, + "loss": 0.84949243, + "num_input_tokens_seen": 59350680, + "step": 2745, + "time_per_iteration": 4.331594467163086 + }, + { + "auxiliary_loss_clip": 0.011344, + "auxiliary_loss_mlp": 0.01057242, + "balance_loss_clip": 1.05874014, + "balance_loss_mlp": 1.03825223, + "epoch": 0.1650984518262438, + "flos": 17676058619520.0, + "grad_norm": 6.766690288332402, + "language_loss": 0.76811314, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.79002959, + "num_input_tokens_seen": 59367020, + "step": 2746, + "time_per_iteration": 4.225586414337158 + }, + { + "auxiliary_loss_clip": 0.01164296, + "auxiliary_loss_mlp": 0.01055636, + "balance_loss_clip": 1.05781221, + "balance_loss_mlp": 1.03533494, + "epoch": 0.16515857507891177, + "flos": 18698327688960.0, + "grad_norm": 1.9760186874049024, + "language_loss": 0.80818808, + "learning_rate": 3.811411292431592e-06, + "loss": 0.83038735, + "num_input_tokens_seen": 59386075, + "step": 2747, + "time_per_iteration": 2.6862480640411377 + }, + { + "auxiliary_loss_clip": 0.01157975, + "auxiliary_loss_mlp": 0.0104673, + "balance_loss_clip": 1.05990267, + "balance_loss_mlp": 1.02664328, + "epoch": 0.16521869833157973, + "flos": 15010481306880.0, + "grad_norm": 2.0608482379031337, + "language_loss": 0.69433749, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.71638453, + "num_input_tokens_seen": 59402690, + "step": 2748, + "time_per_iteration": 2.6520986557006836 + }, + { + "auxiliary_loss_clip": 0.01169692, + "auxiliary_loss_mlp": 0.00778195, + "balance_loss_clip": 1.06237423, + "balance_loss_mlp": 1.00173104, + "epoch": 0.1652788215842477, + "flos": 22121152548480.0, + "grad_norm": 2.259215537482641, + "language_loss": 0.88012803, + "learning_rate": 3.811080963869561e-06, + "loss": 0.89960694, + "num_input_tokens_seen": 59421130, + "step": 2749, + "time_per_iteration": 4.260679244995117 + }, + { + "auxiliary_loss_clip": 0.01154179, + "auxiliary_loss_mlp": 0.01045617, + "balance_loss_clip": 1.05586052, + "balance_loss_mlp": 1.02542281, + "epoch": 0.16533894483691566, + "flos": 18333080242560.0, + "grad_norm": 2.0880864906339864, + "language_loss": 0.79240286, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.81440079, + "num_input_tokens_seen": 59438970, + "step": 2750, + "time_per_iteration": 2.6335251331329346 + }, + { + "auxiliary_loss_clip": 0.01153343, + "auxiliary_loss_mlp": 0.0104591, + "balance_loss_clip": 1.0579437, + "balance_loss_mlp": 1.02602625, + "epoch": 0.16539906808958366, + "flos": 22382115834240.0, + "grad_norm": 1.6952801391084946, + "language_loss": 0.94854712, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.97053963, + "num_input_tokens_seen": 59458510, + "step": 2751, + "time_per_iteration": 2.697174310684204 + }, + { + "auxiliary_loss_clip": 0.0106803, + "auxiliary_loss_mlp": 0.0105236, + "balance_loss_clip": 1.04625726, + "balance_loss_mlp": 1.03247619, + "epoch": 0.16545919134225162, + "flos": 22711093522560.0, + "grad_norm": 2.614588592950962, + "language_loss": 0.71231711, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.73352098, + "num_input_tokens_seen": 59477110, + "step": 2752, + "time_per_iteration": 2.7780745029449463 + }, + { + "auxiliary_loss_clip": 0.01090521, + "auxiliary_loss_mlp": 0.01022104, + "balance_loss_clip": 1.05741131, + "balance_loss_mlp": 1.01941013, + "epoch": 0.1655193145949196, + "flos": 67802974076160.0, + "grad_norm": 0.7721529651221379, + "language_loss": 0.54058975, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56171602, + "num_input_tokens_seen": 59541155, + "step": 2753, + "time_per_iteration": 3.3371469974517822 + }, + { + "auxiliary_loss_clip": 0.01163808, + "auxiliary_loss_mlp": 0.00778536, + "balance_loss_clip": 1.05587018, + "balance_loss_mlp": 1.00172091, + "epoch": 0.16557943784758755, + "flos": 24280389792000.0, + "grad_norm": 1.6411537728312637, + "language_loss": 0.75436741, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.7737909, + "num_input_tokens_seen": 59561155, + "step": 2754, + "time_per_iteration": 2.6382133960723877 + }, + { + "auxiliary_loss_clip": 0.01139421, + "auxiliary_loss_mlp": 0.01060584, + "balance_loss_clip": 1.05406713, + "balance_loss_mlp": 1.03768396, + "epoch": 0.16563956110025552, + "flos": 20083617561600.0, + "grad_norm": 2.4067479946694137, + "language_loss": 0.86654639, + "learning_rate": 3.810088330151188e-06, + "loss": 0.88854647, + "num_input_tokens_seen": 59580460, + "step": 2755, + "time_per_iteration": 2.6590075492858887 + }, + { + "auxiliary_loss_clip": 0.01122817, + "auxiliary_loss_mlp": 0.01053169, + "balance_loss_clip": 1.04948378, + "balance_loss_mlp": 1.03293943, + "epoch": 0.16569968435292348, + "flos": 28034454896640.0, + "grad_norm": 1.7268487777137649, + "language_loss": 0.73350251, + "learning_rate": 3.80992265092595e-06, + "loss": 0.75526237, + "num_input_tokens_seen": 59600025, + "step": 2756, + "time_per_iteration": 2.771820545196533 + }, + { + "auxiliary_loss_clip": 0.01128662, + "auxiliary_loss_mlp": 0.01049666, + "balance_loss_clip": 1.05550277, + "balance_loss_mlp": 1.02969813, + "epoch": 0.16575980760559147, + "flos": 26250233598720.0, + "grad_norm": 1.5540667033085804, + "language_loss": 0.75308084, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.77486414, + "num_input_tokens_seen": 59620600, + "step": 2757, + "time_per_iteration": 2.8106157779693604 + }, + { + "auxiliary_loss_clip": 0.01143608, + "auxiliary_loss_mlp": 0.01054064, + "balance_loss_clip": 1.057634, + "balance_loss_mlp": 1.03390563, + "epoch": 0.16581993085825944, + "flos": 26943955943040.0, + "grad_norm": 1.8675154897424497, + "language_loss": 0.84604371, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.86802036, + "num_input_tokens_seen": 59641385, + "step": 2758, + "time_per_iteration": 2.8663368225097656 + }, + { + "auxiliary_loss_clip": 0.01168186, + "auxiliary_loss_mlp": 0.01058337, + "balance_loss_clip": 1.06166434, + "balance_loss_mlp": 1.03952527, + "epoch": 0.1658800541109274, + "flos": 21653632103040.0, + "grad_norm": 2.0824774555850243, + "language_loss": 0.78848934, + "learning_rate": 3.809425201480689e-06, + "loss": 0.81075454, + "num_input_tokens_seen": 59659865, + "step": 2759, + "time_per_iteration": 2.655371904373169 + }, + { + "auxiliary_loss_clip": 0.01098973, + "auxiliary_loss_mlp": 0.0104879, + "balance_loss_clip": 1.0491066, + "balance_loss_mlp": 1.02846527, + "epoch": 0.16594017736359537, + "flos": 16435488643200.0, + "grad_norm": 2.4005603702739613, + "language_loss": 0.75130272, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.77278036, + "num_input_tokens_seen": 59678780, + "step": 2760, + "time_per_iteration": 2.767866611480713 + }, + { + "auxiliary_loss_clip": 0.01117278, + "auxiliary_loss_mlp": 0.0104823, + "balance_loss_clip": 1.05129814, + "balance_loss_mlp": 1.02867997, + "epoch": 0.16600030061626334, + "flos": 22637297030400.0, + "grad_norm": 1.5792623632565632, + "language_loss": 0.73425764, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.75591272, + "num_input_tokens_seen": 59698795, + "step": 2761, + "time_per_iteration": 2.762836456298828 + }, + { + "auxiliary_loss_clip": 0.0113507, + "auxiliary_loss_mlp": 0.01050415, + "balance_loss_clip": 1.05250192, + "balance_loss_mlp": 1.03018475, + "epoch": 0.1660604238689313, + "flos": 26396569607040.0, + "grad_norm": 2.9515424803015033, + "language_loss": 0.88832974, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.91018462, + "num_input_tokens_seen": 59718795, + "step": 2762, + "time_per_iteration": 2.766324281692505 + }, + { + "auxiliary_loss_clip": 0.01115163, + "auxiliary_loss_mlp": 0.01050144, + "balance_loss_clip": 1.05208707, + "balance_loss_mlp": 1.03080845, + "epoch": 0.16612054712159927, + "flos": 23039999383680.0, + "grad_norm": 1.84507980271118, + "language_loss": 0.87992418, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.90157735, + "num_input_tokens_seen": 59737555, + "step": 2763, + "time_per_iteration": 2.7734055519104004 + }, + { + "auxiliary_loss_clip": 0.01086152, + "auxiliary_loss_mlp": 0.01013622, + "balance_loss_clip": 1.0448606, + "balance_loss_mlp": 1.01065338, + "epoch": 0.16618067037426726, + "flos": 59241225202560.0, + "grad_norm": 0.7790832079967882, + "language_loss": 0.59799927, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.61899698, + "num_input_tokens_seen": 59800915, + "step": 2764, + "time_per_iteration": 3.1728692054748535 + }, + { + "auxiliary_loss_clip": 0.01152232, + "auxiliary_loss_mlp": 0.01053607, + "balance_loss_clip": 1.05467176, + "balance_loss_mlp": 1.03254318, + "epoch": 0.16624079362693522, + "flos": 27198813916800.0, + "grad_norm": 1.7436496772383425, + "language_loss": 0.82260036, + "learning_rate": 3.808428450193401e-06, + "loss": 0.84465873, + "num_input_tokens_seen": 59822910, + "step": 2765, + "time_per_iteration": 2.72440767288208 + }, + { + "auxiliary_loss_clip": 0.01171844, + "auxiliary_loss_mlp": 0.01049085, + "balance_loss_clip": 1.05882454, + "balance_loss_mlp": 1.02746069, + "epoch": 0.1663009168796032, + "flos": 10925068216320.0, + "grad_norm": 2.128015994498251, + "language_loss": 0.69980019, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.72200948, + "num_input_tokens_seen": 59838805, + "step": 2766, + "time_per_iteration": 2.5810647010803223 + }, + { + "auxiliary_loss_clip": 0.0115036, + "auxiliary_loss_mlp": 0.01047665, + "balance_loss_clip": 1.05772817, + "balance_loss_mlp": 1.02792454, + "epoch": 0.16636104013227115, + "flos": 17894431353600.0, + "grad_norm": 2.107381123394178, + "language_loss": 0.8845337, + "learning_rate": 3.808095651090769e-06, + "loss": 0.90651393, + "num_input_tokens_seen": 59855345, + "step": 2767, + "time_per_iteration": 2.659240245819092 + }, + { + "auxiliary_loss_clip": 0.01077283, + "auxiliary_loss_mlp": 0.01002999, + "balance_loss_clip": 1.046556, + "balance_loss_mlp": 1.00020981, + "epoch": 0.16642116338493912, + "flos": 66726050463360.0, + "grad_norm": 0.6403612433239105, + "language_loss": 0.5289067, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.54970956, + "num_input_tokens_seen": 59917710, + "step": 2768, + "time_per_iteration": 3.28488826751709 + }, + { + "auxiliary_loss_clip": 0.01137637, + "auxiliary_loss_mlp": 0.01051692, + "balance_loss_clip": 1.05451822, + "balance_loss_mlp": 1.03034163, + "epoch": 0.16648128663760708, + "flos": 19026048401280.0, + "grad_norm": 2.4342686570828267, + "language_loss": 0.84962058, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.87151396, + "num_input_tokens_seen": 59935105, + "step": 2769, + "time_per_iteration": 2.753257989883423 + }, + { + "auxiliary_loss_clip": 0.01068987, + "auxiliary_loss_mlp": 0.0100573, + "balance_loss_clip": 1.04678345, + "balance_loss_mlp": 1.00316668, + "epoch": 0.16654140989027508, + "flos": 70134976759680.0, + "grad_norm": 0.8107434108728753, + "language_loss": 0.57455683, + "learning_rate": 3.80759593822885e-06, + "loss": 0.59530401, + "num_input_tokens_seen": 59984085, + "step": 2770, + "time_per_iteration": 3.2202906608581543 + }, + { + "auxiliary_loss_clip": 0.01054548, + "auxiliary_loss_mlp": 0.01003676, + "balance_loss_clip": 1.04637623, + "balance_loss_mlp": 1.00086308, + "epoch": 0.16660153314294304, + "flos": 70272406195200.0, + "grad_norm": 0.8940719168038874, + "language_loss": 0.56241393, + "learning_rate": 3.807429230178015e-06, + "loss": 0.58299619, + "num_input_tokens_seen": 60043470, + "step": 2771, + "time_per_iteration": 3.3302085399627686 + }, + { + "auxiliary_loss_clip": 0.01110714, + "auxiliary_loss_mlp": 0.01053994, + "balance_loss_clip": 1.04819679, + "balance_loss_mlp": 1.03316772, + "epoch": 0.166661656395611, + "flos": 23075048079360.0, + "grad_norm": 2.9137693497887778, + "language_loss": 0.70419657, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.72584367, + "num_input_tokens_seen": 60063045, + "step": 2772, + "time_per_iteration": 2.845414161682129 + }, + { + "auxiliary_loss_clip": 0.0114592, + "auxiliary_loss_mlp": 0.01049708, + "balance_loss_clip": 1.05082583, + "balance_loss_mlp": 1.02923954, + "epoch": 0.16672177964827897, + "flos": 28366341586560.0, + "grad_norm": 2.20945076195277, + "language_loss": 0.86324167, + "learning_rate": 3.807095608468975e-06, + "loss": 0.88519788, + "num_input_tokens_seen": 60081945, + "step": 2773, + "time_per_iteration": 2.669412851333618 + }, + { + "auxiliary_loss_clip": 0.01095425, + "auxiliary_loss_mlp": 0.01049097, + "balance_loss_clip": 1.04436934, + "balance_loss_mlp": 1.0300827, + "epoch": 0.16678190290094694, + "flos": 19091010147840.0, + "grad_norm": 2.0211952616678937, + "language_loss": 0.82141376, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.84285897, + "num_input_tokens_seen": 60096820, + "step": 2774, + "time_per_iteration": 2.7111308574676514 + }, + { + "auxiliary_loss_clip": 0.01123493, + "auxiliary_loss_mlp": 0.01045144, + "balance_loss_clip": 1.05252421, + "balance_loss_mlp": 1.02446127, + "epoch": 0.1668420261536149, + "flos": 21799106184960.0, + "grad_norm": 3.3781068524499, + "language_loss": 0.8298822, + "learning_rate": 3.806761712658952e-06, + "loss": 0.85156858, + "num_input_tokens_seen": 60116140, + "step": 2775, + "time_per_iteration": 2.7367632389068604 + }, + { + "auxiliary_loss_clip": 0.01150495, + "auxiliary_loss_mlp": 0.01051475, + "balance_loss_clip": 1.05761933, + "balance_loss_mlp": 1.03264022, + "epoch": 0.16690214940628287, + "flos": 19062533640960.0, + "grad_norm": 1.8115651629444076, + "language_loss": 0.80919641, + "learning_rate": 3.806594661981897e-06, + "loss": 0.8312161, + "num_input_tokens_seen": 60134235, + "step": 2776, + "time_per_iteration": 2.651723623275757 + }, + { + "auxiliary_loss_clip": 0.0113775, + "auxiliary_loss_mlp": 0.01054199, + "balance_loss_clip": 1.05518723, + "balance_loss_mlp": 1.0346483, + "epoch": 0.16696227265895086, + "flos": 18588548747520.0, + "grad_norm": 2.7510345221850336, + "language_loss": 0.80203485, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.82395434, + "num_input_tokens_seen": 60153275, + "step": 2777, + "time_per_iteration": 2.6380929946899414 + }, + { + "auxiliary_loss_clip": 0.01147967, + "auxiliary_loss_mlp": 0.01045166, + "balance_loss_clip": 1.05270481, + "balance_loss_mlp": 1.02640271, + "epoch": 0.16702239591161883, + "flos": 23294139085440.0, + "grad_norm": 1.6179722336290305, + "language_loss": 0.85384095, + "learning_rate": 3.806260355115371e-06, + "loss": 0.87577224, + "num_input_tokens_seen": 60173215, + "step": 2778, + "time_per_iteration": 2.754652500152588 + }, + { + "auxiliary_loss_clip": 0.01136802, + "auxiliary_loss_mlp": 0.01040643, + "balance_loss_clip": 1.0531714, + "balance_loss_mlp": 1.02148652, + "epoch": 0.1670825191642868, + "flos": 24425648392320.0, + "grad_norm": 3.2091470007324414, + "language_loss": 0.74180603, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.76358056, + "num_input_tokens_seen": 60190515, + "step": 2779, + "time_per_iteration": 2.777193784713745 + }, + { + "auxiliary_loss_clip": 0.01112683, + "auxiliary_loss_mlp": 0.00777451, + "balance_loss_clip": 1.04981184, + "balance_loss_mlp": 1.0015173, + "epoch": 0.16714264241695476, + "flos": 26797512193920.0, + "grad_norm": 2.127789274190337, + "language_loss": 0.6557346, + "learning_rate": 3.805925774274554e-06, + "loss": 0.67463589, + "num_input_tokens_seen": 60211655, + "step": 2780, + "time_per_iteration": 2.896976947784424 + }, + { + "auxiliary_loss_clip": 0.01120921, + "auxiliary_loss_mlp": 0.01045506, + "balance_loss_clip": 1.04843462, + "balance_loss_mlp": 1.02547836, + "epoch": 0.16720276566962272, + "flos": 21835304115840.0, + "grad_norm": 2.46647860258999, + "language_loss": 0.78422606, + "learning_rate": 3.805758381129643e-06, + "loss": 0.80589032, + "num_input_tokens_seen": 60230860, + "step": 2781, + "time_per_iteration": 2.725782632827759 + }, + { + "auxiliary_loss_clip": 0.01094692, + "auxiliary_loss_mlp": 0.01050104, + "balance_loss_clip": 1.04439843, + "balance_loss_mlp": 1.03056526, + "epoch": 0.1672628889222907, + "flos": 21470415805440.0, + "grad_norm": 26.23767952829368, + "language_loss": 0.75119764, + "learning_rate": 3.805590919510193e-06, + "loss": 0.77264553, + "num_input_tokens_seen": 60250535, + "step": 2782, + "time_per_iteration": 2.7064197063446045 + }, + { + "auxiliary_loss_clip": 0.01129162, + "auxiliary_loss_mlp": 0.01047612, + "balance_loss_clip": 1.05152631, + "balance_loss_mlp": 1.02764392, + "epoch": 0.16732301217495865, + "flos": 30774008269440.0, + "grad_norm": 2.116531296279042, + "language_loss": 0.67398441, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.69575214, + "num_input_tokens_seen": 60269530, + "step": 2783, + "time_per_iteration": 2.7901556491851807 + }, + { + "auxiliary_loss_clip": 0.01158882, + "auxiliary_loss_mlp": 0.0105166, + "balance_loss_clip": 1.05460215, + "balance_loss_mlp": 1.03271747, + "epoch": 0.16738313542762664, + "flos": 23474625949440.0, + "grad_norm": 1.7768362036873409, + "language_loss": 0.69919086, + "learning_rate": 3.805255790873081e-06, + "loss": 0.72129631, + "num_input_tokens_seen": 60289900, + "step": 2784, + "time_per_iteration": 5.714844226837158 + }, + { + "auxiliary_loss_clip": 0.01137618, + "auxiliary_loss_mlp": 0.01056022, + "balance_loss_clip": 1.05217624, + "balance_loss_mlp": 1.03539932, + "epoch": 0.1674432586802946, + "flos": 29789086366080.0, + "grad_norm": 4.741795209709136, + "language_loss": 0.60970068, + "learning_rate": 3.805088123868126e-06, + "loss": 0.6316371, + "num_input_tokens_seen": 60310025, + "step": 2785, + "time_per_iteration": 4.219547510147095 + }, + { + "auxiliary_loss_clip": 0.01057886, + "auxiliary_loss_mlp": 0.0100398, + "balance_loss_clip": 1.03758883, + "balance_loss_mlp": 1.00141752, + "epoch": 0.16750338193296258, + "flos": 66136073575680.0, + "grad_norm": 0.773077721474628, + "language_loss": 0.58780885, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.60842752, + "num_input_tokens_seen": 60377800, + "step": 2786, + "time_per_iteration": 3.2306320667266846 + }, + { + "auxiliary_loss_clip": 0.0113927, + "auxiliary_loss_mlp": 0.01044966, + "balance_loss_clip": 1.0496738, + "balance_loss_mlp": 1.02589226, + "epoch": 0.16756350518563054, + "flos": 25696777864320.0, + "grad_norm": 1.7333132735183339, + "language_loss": 0.76308596, + "learning_rate": 3.80475258451721e-06, + "loss": 0.78492826, + "num_input_tokens_seen": 60398215, + "step": 2787, + "time_per_iteration": 2.6434125900268555 + }, + { + "auxiliary_loss_clip": 0.01146924, + "auxiliary_loss_mlp": 0.01043386, + "balance_loss_clip": 1.0529089, + "balance_loss_mlp": 1.02544546, + "epoch": 0.1676236284382985, + "flos": 23836102467840.0, + "grad_norm": 1.7210472408736244, + "language_loss": 0.7717936, + "learning_rate": 3.804584712183972e-06, + "loss": 0.79369676, + "num_input_tokens_seen": 60416910, + "step": 2788, + "time_per_iteration": 4.359618425369263 + }, + { + "auxiliary_loss_clip": 0.01054629, + "auxiliary_loss_mlp": 0.00999991, + "balance_loss_clip": 1.03482509, + "balance_loss_mlp": 0.99746382, + "epoch": 0.16768375169096647, + "flos": 59874902985600.0, + "grad_norm": 0.8596744797543817, + "language_loss": 0.59331679, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61386299, + "num_input_tokens_seen": 60468660, + "step": 2789, + "time_per_iteration": 3.0742650032043457 + }, + { + "auxiliary_loss_clip": 0.01148272, + "auxiliary_loss_mlp": 0.01053856, + "balance_loss_clip": 1.05450928, + "balance_loss_mlp": 1.03428209, + "epoch": 0.16774387494363446, + "flos": 38435657207040.0, + "grad_norm": 1.689036486923415, + "language_loss": 0.7012763, + "learning_rate": 3.804248762233765e-06, + "loss": 0.7232976, + "num_input_tokens_seen": 60492370, + "step": 2790, + "time_per_iteration": 2.872232437133789 + }, + { + "auxiliary_loss_clip": 0.0112492, + "auxiliary_loss_mlp": 0.01051622, + "balance_loss_clip": 1.0497216, + "balance_loss_mlp": 1.0334661, + "epoch": 0.16780399819630243, + "flos": 22637620252800.0, + "grad_norm": 1.864386369112868, + "language_loss": 0.79464513, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.81641054, + "num_input_tokens_seen": 60512655, + "step": 2791, + "time_per_iteration": 2.7180140018463135 + }, + { + "auxiliary_loss_clip": 0.01122456, + "auxiliary_loss_mlp": 0.01050939, + "balance_loss_clip": 1.04977369, + "balance_loss_mlp": 1.03106701, + "epoch": 0.1678641214489704, + "flos": 32891516887680.0, + "grad_norm": 1.705849915566178, + "language_loss": 0.71547955, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.73721349, + "num_input_tokens_seen": 60533090, + "step": 2792, + "time_per_iteration": 2.9221818447113037 + }, + { + "auxiliary_loss_clip": 0.01131469, + "auxiliary_loss_mlp": 0.01044061, + "balance_loss_clip": 1.05479562, + "balance_loss_mlp": 1.02551246, + "epoch": 0.16792424470163836, + "flos": 19974916028160.0, + "grad_norm": 1.9301593564774673, + "language_loss": 0.71581644, + "learning_rate": 3.803744324194691e-06, + "loss": 0.73757172, + "num_input_tokens_seen": 60553190, + "step": 2793, + "time_per_iteration": 2.75104022026062 + }, + { + "auxiliary_loss_clip": 0.01143072, + "auxiliary_loss_mlp": 0.01053231, + "balance_loss_clip": 1.05276942, + "balance_loss_mlp": 1.03452659, + "epoch": 0.16798436795430632, + "flos": 19719878486400.0, + "grad_norm": 2.3859650274226833, + "language_loss": 0.7717455, + "learning_rate": 3.803576041376831e-06, + "loss": 0.79370856, + "num_input_tokens_seen": 60571995, + "step": 2794, + "time_per_iteration": 2.6007745265960693 + }, + { + "auxiliary_loss_clip": 0.01137828, + "auxiliary_loss_mlp": 0.0104987, + "balance_loss_clip": 1.05250025, + "balance_loss_mlp": 1.03010476, + "epoch": 0.1680444912069743, + "flos": 28104839596800.0, + "grad_norm": 2.7692472240964747, + "language_loss": 0.71609265, + "learning_rate": 3.803407690167187e-06, + "loss": 0.73796958, + "num_input_tokens_seen": 60591275, + "step": 2795, + "time_per_iteration": 2.693826198577881 + }, + { + "auxiliary_loss_clip": 0.01131865, + "auxiliary_loss_mlp": 0.01041012, + "balance_loss_clip": 1.04973865, + "balance_loss_mlp": 1.02302384, + "epoch": 0.16810461445964225, + "flos": 18075205526400.0, + "grad_norm": 1.990096863808903, + "language_loss": 0.84230494, + "learning_rate": 3.803239270572142e-06, + "loss": 0.8640337, + "num_input_tokens_seen": 60609235, + "step": 2796, + "time_per_iteration": 2.697253465652466 + }, + { + "auxiliary_loss_clip": 0.01101634, + "auxiliary_loss_mlp": 0.01045196, + "balance_loss_clip": 1.04877055, + "balance_loss_mlp": 1.0262773, + "epoch": 0.16816473771231025, + "flos": 23878657105920.0, + "grad_norm": 1.9272276676322646, + "language_loss": 0.81609607, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.83756441, + "num_input_tokens_seen": 60629880, + "step": 2797, + "time_per_iteration": 2.8784244060516357 + }, + { + "auxiliary_loss_clip": 0.0114057, + "auxiliary_loss_mlp": 0.01041282, + "balance_loss_clip": 1.05136061, + "balance_loss_mlp": 1.02448523, + "epoch": 0.1682248609649782, + "flos": 22783597125120.0, + "grad_norm": 1.7015769336052518, + "language_loss": 0.74811113, + "learning_rate": 3.802902226251401e-06, + "loss": 0.76992965, + "num_input_tokens_seen": 60651175, + "step": 2798, + "time_per_iteration": 2.700727939605713 + }, + { + "auxiliary_loss_clip": 0.01161342, + "auxiliary_loss_mlp": 0.01048462, + "balance_loss_clip": 1.05728281, + "balance_loss_mlp": 1.03075945, + "epoch": 0.16828498421764618, + "flos": 20705123612160.0, + "grad_norm": 1.5964091182578661, + "language_loss": 0.79693568, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.81903368, + "num_input_tokens_seen": 60670210, + "step": 2799, + "time_per_iteration": 2.6582021713256836 + }, + { + "auxiliary_loss_clip": 0.01077177, + "auxiliary_loss_mlp": 0.01045216, + "balance_loss_clip": 1.04514158, + "balance_loss_mlp": 1.02374637, + "epoch": 0.16834510747031414, + "flos": 29420606695680.0, + "grad_norm": 4.227726163531211, + "language_loss": 0.70963746, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.73086143, + "num_input_tokens_seen": 60690895, + "step": 2800, + "time_per_iteration": 2.8856699466705322 + }, + { + "auxiliary_loss_clip": 0.01108822, + "auxiliary_loss_mlp": 0.00777078, + "balance_loss_clip": 1.04776788, + "balance_loss_mlp": 1.00161195, + "epoch": 0.1684052307229821, + "flos": 18145374744960.0, + "grad_norm": 1.9902029671619985, + "language_loss": 0.83663505, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.85549408, + "num_input_tokens_seen": 60708280, + "step": 2801, + "time_per_iteration": 2.6917035579681396 + }, + { + "auxiliary_loss_clip": 0.01128148, + "auxiliary_loss_mlp": 0.01049324, + "balance_loss_clip": 1.05011535, + "balance_loss_mlp": 1.03084683, + "epoch": 0.16846535397565007, + "flos": 16574929240320.0, + "grad_norm": 2.4052305427948735, + "language_loss": 0.82509923, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.84687394, + "num_input_tokens_seen": 60724150, + "step": 2802, + "time_per_iteration": 2.882611036300659 + }, + { + "auxiliary_loss_clip": 0.01150156, + "auxiliary_loss_mlp": 0.01048717, + "balance_loss_clip": 1.05517435, + "balance_loss_mlp": 1.02885723, + "epoch": 0.16852547722831807, + "flos": 30408868563840.0, + "grad_norm": 3.107584498439891, + "language_loss": 0.80643189, + "learning_rate": 3.802058419152413e-06, + "loss": 0.8284207, + "num_input_tokens_seen": 60746485, + "step": 2803, + "time_per_iteration": 2.7886922359466553 + }, + { + "auxiliary_loss_clip": 0.01148107, + "auxiliary_loss_mlp": 0.01047852, + "balance_loss_clip": 1.0556829, + "balance_loss_mlp": 1.02918339, + "epoch": 0.16858560048098603, + "flos": 33507420416640.0, + "grad_norm": 2.2127389669880713, + "language_loss": 0.76168799, + "learning_rate": 3.801889452704297e-06, + "loss": 0.7836476, + "num_input_tokens_seen": 60762875, + "step": 2804, + "time_per_iteration": 2.7588601112365723 + }, + { + "auxiliary_loss_clip": 0.01045171, + "auxiliary_loss_mlp": 0.01013955, + "balance_loss_clip": 1.03581083, + "balance_loss_mlp": 1.01078367, + "epoch": 0.168645723733654, + "flos": 67370502326400.0, + "grad_norm": 0.8536034833258724, + "language_loss": 0.55464876, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57524002, + "num_input_tokens_seen": 60825510, + "step": 2805, + "time_per_iteration": 3.2089412212371826 + }, + { + "auxiliary_loss_clip": 0.01138275, + "auxiliary_loss_mlp": 0.0103974, + "balance_loss_clip": 1.05013156, + "balance_loss_mlp": 1.02239537, + "epoch": 0.16870584698632196, + "flos": 21324618501120.0, + "grad_norm": 2.2836767274778427, + "language_loss": 0.73090243, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.75268269, + "num_input_tokens_seen": 60844440, + "step": 2806, + "time_per_iteration": 2.643596649169922 + }, + { + "auxiliary_loss_clip": 0.01117063, + "auxiliary_loss_mlp": 0.01045402, + "balance_loss_clip": 1.05330753, + "balance_loss_mlp": 1.02766335, + "epoch": 0.16876597023898993, + "flos": 20740746925440.0, + "grad_norm": 1.8406859431587912, + "language_loss": 0.69773197, + "learning_rate": 3.80138214341862e-06, + "loss": 0.71935666, + "num_input_tokens_seen": 60863210, + "step": 2807, + "time_per_iteration": 2.6946568489074707 + }, + { + "auxiliary_loss_clip": 0.01130702, + "auxiliary_loss_mlp": 0.01047199, + "balance_loss_clip": 1.04842246, + "balance_loss_mlp": 1.02794707, + "epoch": 0.1688260934916579, + "flos": 20303498666880.0, + "grad_norm": 3.042021842274248, + "language_loss": 0.70280695, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.72458601, + "num_input_tokens_seen": 60882510, + "step": 2808, + "time_per_iteration": 2.6656088829040527 + }, + { + "auxiliary_loss_clip": 0.01119025, + "auxiliary_loss_mlp": 0.01041739, + "balance_loss_clip": 1.05019665, + "balance_loss_mlp": 1.02164018, + "epoch": 0.16888621674432586, + "flos": 20340702178560.0, + "grad_norm": 2.0835789337145965, + "language_loss": 0.79903001, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.8206377, + "num_input_tokens_seen": 60901105, + "step": 2809, + "time_per_iteration": 2.7665679454803467 + }, + { + "auxiliary_loss_clip": 0.01155146, + "auxiliary_loss_mlp": 0.01042018, + "balance_loss_clip": 1.0557605, + "balance_loss_mlp": 1.02252758, + "epoch": 0.16894633999699385, + "flos": 16244802316800.0, + "grad_norm": 2.0672093223845245, + "language_loss": 0.88076419, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.90273583, + "num_input_tokens_seen": 60915340, + "step": 2810, + "time_per_iteration": 2.6186363697052 + }, + { + "auxiliary_loss_clip": 0.01149997, + "auxiliary_loss_mlp": 0.0104631, + "balance_loss_clip": 1.05503082, + "balance_loss_mlp": 1.02715337, + "epoch": 0.16900646324966181, + "flos": 19610171372160.0, + "grad_norm": 1.8921026809528976, + "language_loss": 0.92376304, + "learning_rate": 3.800704774747416e-06, + "loss": 0.9457261, + "num_input_tokens_seen": 60933735, + "step": 2811, + "time_per_iteration": 2.6567442417144775 + }, + { + "auxiliary_loss_clip": 0.01140053, + "auxiliary_loss_mlp": 0.01049063, + "balance_loss_clip": 1.05383325, + "balance_loss_mlp": 1.03039432, + "epoch": 0.16906658650232978, + "flos": 22018089450240.0, + "grad_norm": 2.116573413654177, + "language_loss": 0.78582352, + "learning_rate": 3.800535261856291e-06, + "loss": 0.8077147, + "num_input_tokens_seen": 60953105, + "step": 2812, + "time_per_iteration": 2.6796023845672607 + }, + { + "auxiliary_loss_clip": 0.01147895, + "auxiliary_loss_mlp": 0.01043917, + "balance_loss_clip": 1.05772316, + "balance_loss_mlp": 1.02653646, + "epoch": 0.16912670975499774, + "flos": 11763690024960.0, + "grad_norm": 2.5483899062625093, + "language_loss": 0.75195068, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.7738688, + "num_input_tokens_seen": 60969150, + "step": 2813, + "time_per_iteration": 2.621772050857544 + }, + { + "auxiliary_loss_clip": 0.01136313, + "auxiliary_loss_mlp": 0.01045037, + "balance_loss_clip": 1.05311871, + "balance_loss_mlp": 1.02599943, + "epoch": 0.1691868330076657, + "flos": 17161386595200.0, + "grad_norm": 3.0041182480764554, + "language_loss": 0.69118392, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.7129975, + "num_input_tokens_seen": 60982825, + "step": 2814, + "time_per_iteration": 2.837264060974121 + }, + { + "auxiliary_loss_clip": 0.01163835, + "auxiliary_loss_mlp": 0.01039837, + "balance_loss_clip": 1.05900145, + "balance_loss_mlp": 1.02134776, + "epoch": 0.16924695626033368, + "flos": 22416553998720.0, + "grad_norm": 3.1079956206415833, + "language_loss": 0.61439502, + "learning_rate": 3.800026313549776e-06, + "loss": 0.63643175, + "num_input_tokens_seen": 61000875, + "step": 2815, + "time_per_iteration": 2.6967194080352783 + }, + { + "auxiliary_loss_clip": 0.01129827, + "auxiliary_loss_mlp": 0.01042692, + "balance_loss_clip": 1.05139673, + "balance_loss_mlp": 1.02382088, + "epoch": 0.16930707951300164, + "flos": 25739655724800.0, + "grad_norm": 1.7930623183302479, + "language_loss": 0.82490849, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.84663367, + "num_input_tokens_seen": 61021940, + "step": 2816, + "time_per_iteration": 2.7227163314819336 + }, + { + "auxiliary_loss_clip": 0.01133129, + "auxiliary_loss_mlp": 0.01047914, + "balance_loss_clip": 1.05375743, + "balance_loss_mlp": 1.02853012, + "epoch": 0.16936720276566963, + "flos": 22747040058240.0, + "grad_norm": 3.083808689594852, + "language_loss": 0.87322289, + "learning_rate": 3.799686673382153e-06, + "loss": 0.89503324, + "num_input_tokens_seen": 61040285, + "step": 2817, + "time_per_iteration": 2.733180522918701 + }, + { + "auxiliary_loss_clip": 0.01141455, + "auxiliary_loss_mlp": 0.01052753, + "balance_loss_clip": 1.05800366, + "balance_loss_mlp": 1.03352427, + "epoch": 0.1694273260183376, + "flos": 19573973441280.0, + "grad_norm": 1.8594303503608436, + "language_loss": 0.81247765, + "learning_rate": 3.799516750928672e-06, + "loss": 0.83441973, + "num_input_tokens_seen": 61059020, + "step": 2818, + "time_per_iteration": 2.7384097576141357 + }, + { + "auxiliary_loss_clip": 0.01160132, + "auxiliary_loss_mlp": 0.01044196, + "balance_loss_clip": 1.05699944, + "balance_loss_mlp": 1.02496791, + "epoch": 0.16948744927100556, + "flos": 12457843332480.0, + "grad_norm": 2.739998367204505, + "language_loss": 0.80788404, + "learning_rate": 3.799346760237336e-06, + "loss": 0.82992733, + "num_input_tokens_seen": 61074245, + "step": 2819, + "time_per_iteration": 2.609870672225952 + }, + { + "auxiliary_loss_clip": 0.01069019, + "auxiliary_loss_mlp": 0.01015301, + "balance_loss_clip": 1.0485003, + "balance_loss_mlp": 1.0125947, + "epoch": 0.16954757252367353, + "flos": 71291694435840.0, + "grad_norm": 0.9309223426502673, + "language_loss": 0.61031163, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.63115478, + "num_input_tokens_seen": 61127080, + "step": 2820, + "time_per_iteration": 3.161051034927368 + }, + { + "auxiliary_loss_clip": 0.01125604, + "auxiliary_loss_mlp": 0.0105036, + "balance_loss_clip": 1.05106986, + "balance_loss_mlp": 1.03207326, + "epoch": 0.1696076957763415, + "flos": 29606516513280.0, + "grad_norm": 1.8682266790688726, + "language_loss": 0.78265435, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.80441403, + "num_input_tokens_seen": 61146955, + "step": 2821, + "time_per_iteration": 2.838730573654175 + }, + { + "auxiliary_loss_clip": 0.0113863, + "auxiliary_loss_mlp": 0.01055528, + "balance_loss_clip": 1.05282724, + "balance_loss_mlp": 1.03494084, + "epoch": 0.16966781902900946, + "flos": 24388588535040.0, + "grad_norm": 2.1667405259997516, + "language_loss": 0.78521514, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.80715668, + "num_input_tokens_seen": 61166605, + "step": 2822, + "time_per_iteration": 2.783385753631592 + }, + { + "auxiliary_loss_clip": 0.01143597, + "auxiliary_loss_mlp": 0.00777154, + "balance_loss_clip": 1.05367076, + "balance_loss_mlp": 1.00129986, + "epoch": 0.16972794228167745, + "flos": 23038814234880.0, + "grad_norm": 1.8038457392731222, + "language_loss": 0.74939907, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.76860654, + "num_input_tokens_seen": 61186535, + "step": 2823, + "time_per_iteration": 4.329328298568726 + }, + { + "auxiliary_loss_clip": 0.01129469, + "auxiliary_loss_mlp": 0.0105385, + "balance_loss_clip": 1.05166912, + "balance_loss_mlp": 1.03496754, + "epoch": 0.16978806553434542, + "flos": 35228691129600.0, + "grad_norm": 3.336653609493179, + "language_loss": 0.60266119, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.62449437, + "num_input_tokens_seen": 61208965, + "step": 2824, + "time_per_iteration": 5.892346620559692 + }, + { + "auxiliary_loss_clip": 0.01138249, + "auxiliary_loss_mlp": 0.01042322, + "balance_loss_clip": 1.05565047, + "balance_loss_mlp": 1.02287912, + "epoch": 0.16984818878701338, + "flos": 32014290936960.0, + "grad_norm": 2.152838804074104, + "language_loss": 0.73322558, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.75503135, + "num_input_tokens_seen": 61230670, + "step": 2825, + "time_per_iteration": 2.834482431411743 + }, + { + "auxiliary_loss_clip": 0.01161467, + "auxiliary_loss_mlp": 0.01047701, + "balance_loss_clip": 1.05502653, + "balance_loss_mlp": 1.02762675, + "epoch": 0.16990831203968135, + "flos": 22818609907200.0, + "grad_norm": 2.05671259677731, + "language_loss": 0.85638934, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.87848103, + "num_input_tokens_seen": 61249510, + "step": 2826, + "time_per_iteration": 2.6443135738372803 + }, + { + "auxiliary_loss_clip": 0.01139368, + "auxiliary_loss_mlp": 0.01047749, + "balance_loss_clip": 1.05266595, + "balance_loss_mlp": 1.02856779, + "epoch": 0.1699684352923493, + "flos": 23039604334080.0, + "grad_norm": 1.9562557148441426, + "language_loss": 0.82465482, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.84652597, + "num_input_tokens_seen": 61269440, + "step": 2827, + "time_per_iteration": 2.7683157920837402 + }, + { + "auxiliary_loss_clip": 0.01131885, + "auxiliary_loss_mlp": 0.0104561, + "balance_loss_clip": 1.05320346, + "balance_loss_mlp": 1.02536786, + "epoch": 0.17002855854501728, + "flos": 21434110133760.0, + "grad_norm": 1.7386401818136152, + "language_loss": 0.73704529, + "learning_rate": 3.797813774376267e-06, + "loss": 0.75882024, + "num_input_tokens_seen": 61288195, + "step": 2828, + "time_per_iteration": 4.465311288833618 + }, + { + "auxiliary_loss_clip": 0.01061458, + "auxiliary_loss_mlp": 0.01009538, + "balance_loss_clip": 1.04764342, + "balance_loss_mlp": 1.00620067, + "epoch": 0.17008868179768524, + "flos": 71453509205760.0, + "grad_norm": 0.7670168832041738, + "language_loss": 0.56426483, + "learning_rate": 3.797643101661336e-06, + "loss": 0.58497471, + "num_input_tokens_seen": 61350850, + "step": 2829, + "time_per_iteration": 3.3114631175994873 + }, + { + "auxiliary_loss_clip": 0.01111753, + "auxiliary_loss_mlp": 0.01051557, + "balance_loss_clip": 1.04527223, + "balance_loss_mlp": 1.03088641, + "epoch": 0.17014880505035324, + "flos": 24900315644160.0, + "grad_norm": 1.7961285206560338, + "language_loss": 0.83465374, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.85628688, + "num_input_tokens_seen": 61370765, + "step": 2830, + "time_per_iteration": 2.795253038406372 + }, + { + "auxiliary_loss_clip": 0.01121533, + "auxiliary_loss_mlp": 0.0104408, + "balance_loss_clip": 1.04901659, + "balance_loss_mlp": 1.02442193, + "epoch": 0.1702089283030212, + "flos": 29862415981440.0, + "grad_norm": 2.4873654173451727, + "language_loss": 0.78360993, + "learning_rate": 3.797301551737529e-06, + "loss": 0.80526608, + "num_input_tokens_seen": 61388935, + "step": 2831, + "time_per_iteration": 2.7864232063293457 + }, + { + "auxiliary_loss_clip": 0.01123612, + "auxiliary_loss_mlp": 0.01051154, + "balance_loss_clip": 1.05275893, + "balance_loss_mlp": 1.0311985, + "epoch": 0.17026905155568917, + "flos": 17744180762880.0, + "grad_norm": 2.532473263441992, + "language_loss": 0.79668158, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.81842923, + "num_input_tokens_seen": 61407350, + "step": 2832, + "time_per_iteration": 2.842217206954956 + }, + { + "auxiliary_loss_clip": 0.01127135, + "auxiliary_loss_mlp": 0.01048966, + "balance_loss_clip": 1.05029321, + "balance_loss_mlp": 1.02984488, + "epoch": 0.17032917480835713, + "flos": 23148665003520.0, + "grad_norm": 1.8387196201649116, + "language_loss": 0.88638175, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.90814275, + "num_input_tokens_seen": 61429010, + "step": 2833, + "time_per_iteration": 2.75942325592041 + }, + { + "auxiliary_loss_clip": 0.01158799, + "auxiliary_loss_mlp": 0.01046883, + "balance_loss_clip": 1.05633831, + "balance_loss_mlp": 1.02842951, + "epoch": 0.1703892980610251, + "flos": 39202565512320.0, + "grad_norm": 2.49094605220443, + "language_loss": 0.71924698, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.74130386, + "num_input_tokens_seen": 61450040, + "step": 2834, + "time_per_iteration": 2.9035184383392334 + }, + { + "auxiliary_loss_clip": 0.01119873, + "auxiliary_loss_mlp": 0.01052215, + "balance_loss_clip": 1.05165124, + "balance_loss_mlp": 1.03428626, + "epoch": 0.17044942131369306, + "flos": 23039101543680.0, + "grad_norm": 1.9093816511111852, + "language_loss": 0.86831236, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.89003325, + "num_input_tokens_seen": 61468585, + "step": 2835, + "time_per_iteration": 2.7627484798431396 + }, + { + "auxiliary_loss_clip": 0.01149332, + "auxiliary_loss_mlp": 0.01049844, + "balance_loss_clip": 1.0536654, + "balance_loss_mlp": 1.02887547, + "epoch": 0.17050954456636103, + "flos": 17054983532160.0, + "grad_norm": 2.1227367002258153, + "language_loss": 0.74483943, + "learning_rate": 3.796446484348989e-06, + "loss": 0.76683116, + "num_input_tokens_seen": 61486330, + "step": 2836, + "time_per_iteration": 2.6748619079589844 + }, + { + "auxiliary_loss_clip": 0.01102249, + "auxiliary_loss_mlp": 0.01049533, + "balance_loss_clip": 1.04775679, + "balance_loss_mlp": 1.02790809, + "epoch": 0.17056966781902902, + "flos": 16836969934080.0, + "grad_norm": 2.1718385109372824, + "language_loss": 0.79959226, + "learning_rate": 3.796275266481036e-06, + "loss": 0.82111007, + "num_input_tokens_seen": 61503950, + "step": 2837, + "time_per_iteration": 2.757340908050537 + }, + { + "auxiliary_loss_clip": 0.01144378, + "auxiliary_loss_mlp": 0.01044803, + "balance_loss_clip": 1.05493581, + "balance_loss_mlp": 1.02644491, + "epoch": 0.17062979107169698, + "flos": 17712543859200.0, + "grad_norm": 1.6825251002952497, + "language_loss": 0.83258498, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.85447681, + "num_input_tokens_seen": 61523550, + "step": 2838, + "time_per_iteration": 2.705357551574707 + }, + { + "auxiliary_loss_clip": 0.0110604, + "auxiliary_loss_mlp": 0.01044889, + "balance_loss_clip": 1.05217135, + "balance_loss_mlp": 1.02685261, + "epoch": 0.17068991432436495, + "flos": 22525040050560.0, + "grad_norm": 1.7789799751303759, + "language_loss": 0.93788463, + "learning_rate": 3.795932626406812e-06, + "loss": 0.95939398, + "num_input_tokens_seen": 61542720, + "step": 2839, + "time_per_iteration": 2.7881791591644287 + }, + { + "auxiliary_loss_clip": 0.01126465, + "auxiliary_loss_mlp": 0.01045617, + "balance_loss_clip": 1.05183244, + "balance_loss_mlp": 1.0250175, + "epoch": 0.17075003757703291, + "flos": 25882939077120.0, + "grad_norm": 2.3337760403585435, + "language_loss": 0.83974946, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.86147022, + "num_input_tokens_seen": 61563040, + "step": 2840, + "time_per_iteration": 2.7564892768859863 + }, + { + "auxiliary_loss_clip": 0.01151834, + "auxiliary_loss_mlp": 0.01044417, + "balance_loss_clip": 1.05555129, + "balance_loss_mlp": 1.02449679, + "epoch": 0.17081016082970088, + "flos": 20120713332480.0, + "grad_norm": 1.9037435592597944, + "language_loss": 0.76307738, + "learning_rate": 3.79558971392481e-06, + "loss": 0.7850399, + "num_input_tokens_seen": 61581890, + "step": 2841, + "time_per_iteration": 2.695525646209717 + }, + { + "auxiliary_loss_clip": 0.01136217, + "auxiliary_loss_mlp": 0.01045847, + "balance_loss_clip": 1.0527097, + "balance_loss_mlp": 1.02744126, + "epoch": 0.17087028408236885, + "flos": 24936477661440.0, + "grad_norm": 1.7844240011089845, + "language_loss": 0.77076876, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.79258937, + "num_input_tokens_seen": 61602095, + "step": 2842, + "time_per_iteration": 2.773792266845703 + }, + { + "auxiliary_loss_clip": 0.01155915, + "auxiliary_loss_mlp": 0.01043896, + "balance_loss_clip": 1.05616069, + "balance_loss_mlp": 1.02503705, + "epoch": 0.17093040733503684, + "flos": 19057864872960.0, + "grad_norm": 1.8430349199993477, + "language_loss": 0.85694385, + "learning_rate": 3.795246529087043e-06, + "loss": 0.87894201, + "num_input_tokens_seen": 61620400, + "step": 2843, + "time_per_iteration": 2.5860671997070312 + }, + { + "auxiliary_loss_clip": 0.01154742, + "auxiliary_loss_mlp": 0.01044059, + "balance_loss_clip": 1.05549574, + "balance_loss_mlp": 1.02608204, + "epoch": 0.1709905305877048, + "flos": 13078954333440.0, + "grad_norm": 2.0353470349004485, + "language_loss": 0.68646181, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.70844984, + "num_input_tokens_seen": 61637680, + "step": 2844, + "time_per_iteration": 2.5961523056030273 + }, + { + "auxiliary_loss_clip": 0.01133396, + "auxiliary_loss_mlp": 0.00778162, + "balance_loss_clip": 1.05117011, + "balance_loss_mlp": 1.00112617, + "epoch": 0.17105065384037277, + "flos": 19209336526080.0, + "grad_norm": 2.027694794878894, + "language_loss": 0.78771943, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.806835, + "num_input_tokens_seen": 61655630, + "step": 2845, + "time_per_iteration": 2.720193386077881 + }, + { + "auxiliary_loss_clip": 0.01145033, + "auxiliary_loss_mlp": 0.01047407, + "balance_loss_clip": 1.05443549, + "balance_loss_mlp": 1.02914453, + "epoch": 0.17111077709304073, + "flos": 18515183218560.0, + "grad_norm": 2.2586144454646306, + "language_loss": 0.7811147, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.80303913, + "num_input_tokens_seen": 61673475, + "step": 2846, + "time_per_iteration": 2.691033363342285 + }, + { + "auxiliary_loss_clip": 0.01143809, + "auxiliary_loss_mlp": 0.0104645, + "balance_loss_clip": 1.05425262, + "balance_loss_mlp": 1.02865243, + "epoch": 0.1711709003457087, + "flos": 25082670015360.0, + "grad_norm": 2.2208975060456426, + "language_loss": 0.79762948, + "learning_rate": 3.794559342552472e-06, + "loss": 0.8195321, + "num_input_tokens_seen": 61693370, + "step": 2847, + "time_per_iteration": 2.7504522800445557 + }, + { + "auxiliary_loss_clip": 0.01142651, + "auxiliary_loss_mlp": 0.01045695, + "balance_loss_clip": 1.05101562, + "balance_loss_mlp": 1.02668071, + "epoch": 0.17123102359837666, + "flos": 17566387418880.0, + "grad_norm": 2.4457083156230017, + "language_loss": 0.8665086, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.88839209, + "num_input_tokens_seen": 61710820, + "step": 2848, + "time_per_iteration": 2.642946720123291 + }, + { + "auxiliary_loss_clip": 0.0111167, + "auxiliary_loss_mlp": 0.01044479, + "balance_loss_clip": 1.04839015, + "balance_loss_mlp": 1.02559662, + "epoch": 0.17129114685104463, + "flos": 26173635845760.0, + "grad_norm": 3.6033710399461856, + "language_loss": 0.75238276, + "learning_rate": 3.794215340959902e-06, + "loss": 0.77394426, + "num_input_tokens_seen": 61729855, + "step": 2849, + "time_per_iteration": 2.7511017322540283 + }, + { + "auxiliary_loss_clip": 0.0103263, + "auxiliary_loss_mlp": 0.01006833, + "balance_loss_clip": 1.02775574, + "balance_loss_mlp": 1.00413883, + "epoch": 0.17135127010371262, + "flos": 69269710037760.0, + "grad_norm": 0.7881928427119427, + "language_loss": 0.57514679, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59554148, + "num_input_tokens_seen": 61790290, + "step": 2850, + "time_per_iteration": 3.234609603881836 + }, + { + "auxiliary_loss_clip": 0.01115021, + "auxiliary_loss_mlp": 0.01044381, + "balance_loss_clip": 1.05049884, + "balance_loss_mlp": 1.02661848, + "epoch": 0.1714113933563806, + "flos": 23550110380800.0, + "grad_norm": 2.962731712990184, + "language_loss": 0.81328994, + "learning_rate": 3.793871067220031e-06, + "loss": 0.83488399, + "num_input_tokens_seen": 61809265, + "step": 2851, + "time_per_iteration": 2.78957200050354 + }, + { + "auxiliary_loss_clip": 0.01114419, + "auxiliary_loss_mlp": 0.01043587, + "balance_loss_clip": 1.05193233, + "balance_loss_mlp": 1.02592039, + "epoch": 0.17147151660904855, + "flos": 21142443697920.0, + "grad_norm": 2.049906502724323, + "language_loss": 0.93085313, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.95243311, + "num_input_tokens_seen": 61828980, + "step": 2852, + "time_per_iteration": 2.8247029781341553 + }, + { + "auxiliary_loss_clip": 0.01123258, + "auxiliary_loss_mlp": 0.01048953, + "balance_loss_clip": 1.04961288, + "balance_loss_mlp": 1.03045225, + "epoch": 0.17153163986171652, + "flos": 18624890332800.0, + "grad_norm": 1.8770741979814063, + "language_loss": 0.69465554, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.71637762, + "num_input_tokens_seen": 61847915, + "step": 2853, + "time_per_iteration": 2.814162492752075 + }, + { + "auxiliary_loss_clip": 0.01120856, + "auxiliary_loss_mlp": 0.0104692, + "balance_loss_clip": 1.05593121, + "balance_loss_mlp": 1.02899122, + "epoch": 0.17159176311438448, + "flos": 18223265387520.0, + "grad_norm": 2.5884803351111705, + "language_loss": 0.66611075, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.68778855, + "num_input_tokens_seen": 61865570, + "step": 2854, + "time_per_iteration": 2.7968995571136475 + }, + { + "auxiliary_loss_clip": 0.01120742, + "auxiliary_loss_mlp": 0.01052217, + "balance_loss_clip": 1.04853106, + "balance_loss_mlp": 1.0349679, + "epoch": 0.17165188636705245, + "flos": 20738987159040.0, + "grad_norm": 1.705510390491261, + "language_loss": 0.8929621, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.91469175, + "num_input_tokens_seen": 61883340, + "step": 2855, + "time_per_iteration": 2.7045016288757324 + }, + { + "auxiliary_loss_clip": 0.01157319, + "auxiliary_loss_mlp": 0.01043813, + "balance_loss_clip": 1.05505848, + "balance_loss_mlp": 1.02662265, + "epoch": 0.17171200961972044, + "flos": 24899884680960.0, + "grad_norm": 2.117219134143716, + "language_loss": 0.83963835, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.86164963, + "num_input_tokens_seen": 61900610, + "step": 2856, + "time_per_iteration": 2.7349936962127686 + }, + { + "auxiliary_loss_clip": 0.01150108, + "auxiliary_loss_mlp": 0.0104615, + "balance_loss_clip": 1.05812418, + "balance_loss_mlp": 1.02783966, + "epoch": 0.1717721328723884, + "flos": 20157234485760.0, + "grad_norm": 2.234025867710235, + "language_loss": 0.86309886, + "learning_rate": 3.792836613639026e-06, + "loss": 0.88506144, + "num_input_tokens_seen": 61916795, + "step": 2857, + "time_per_iteration": 2.749356746673584 + }, + { + "auxiliary_loss_clip": 0.01144467, + "auxiliary_loss_mlp": 0.0105057, + "balance_loss_clip": 1.05469525, + "balance_loss_mlp": 1.0324626, + "epoch": 0.17183225612505637, + "flos": 23361650697600.0, + "grad_norm": 2.069122070501307, + "language_loss": 0.78334701, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.80529737, + "num_input_tokens_seen": 61936665, + "step": 2858, + "time_per_iteration": 2.6673583984375 + }, + { + "auxiliary_loss_clip": 0.01147374, + "auxiliary_loss_mlp": 0.0105371, + "balance_loss_clip": 1.05591416, + "balance_loss_mlp": 1.03263378, + "epoch": 0.17189237937772434, + "flos": 18114240631680.0, + "grad_norm": 2.1629422323642453, + "language_loss": 0.77565676, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.79766762, + "num_input_tokens_seen": 61954415, + "step": 2859, + "time_per_iteration": 2.646648645401001 + }, + { + "auxiliary_loss_clip": 0.0110879, + "auxiliary_loss_mlp": 0.01047481, + "balance_loss_clip": 1.05317724, + "balance_loss_mlp": 1.02887201, + "epoch": 0.1719525026303923, + "flos": 23258408031360.0, + "grad_norm": 2.088627069497316, + "language_loss": 0.77088714, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.79244983, + "num_input_tokens_seen": 61973940, + "step": 2860, + "time_per_iteration": 2.7671573162078857 + }, + { + "auxiliary_loss_clip": 0.01145562, + "auxiliary_loss_mlp": 0.01042048, + "balance_loss_clip": 1.05316472, + "balance_loss_mlp": 1.02416611, + "epoch": 0.17201262588306027, + "flos": 20810413353600.0, + "grad_norm": 2.1608688480628304, + "language_loss": 0.81384242, + "learning_rate": 3.792145618140317e-06, + "loss": 0.83571851, + "num_input_tokens_seen": 61991845, + "step": 2861, + "time_per_iteration": 2.6492061614990234 + }, + { + "auxiliary_loss_clip": 0.011306, + "auxiliary_loss_mlp": 0.01051558, + "balance_loss_clip": 1.05280077, + "balance_loss_mlp": 1.0335927, + "epoch": 0.17207274913572823, + "flos": 20375858615040.0, + "grad_norm": 2.0128324416816192, + "language_loss": 0.85691392, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.87873554, + "num_input_tokens_seen": 62009395, + "step": 2862, + "time_per_iteration": 4.290126323699951 + }, + { + "auxiliary_loss_clip": 0.01116765, + "auxiliary_loss_mlp": 0.01043444, + "balance_loss_clip": 1.05126834, + "balance_loss_mlp": 1.02655208, + "epoch": 0.17213287238839622, + "flos": 26797727675520.0, + "grad_norm": 3.7047120479299993, + "language_loss": 0.78047049, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.80207253, + "num_input_tokens_seen": 62029005, + "step": 2863, + "time_per_iteration": 4.275500774383545 + }, + { + "auxiliary_loss_clip": 0.01122315, + "auxiliary_loss_mlp": 0.00776596, + "balance_loss_clip": 1.05132961, + "balance_loss_mlp": 1.00090909, + "epoch": 0.1721929956410642, + "flos": 26030819370240.0, + "grad_norm": 1.7350128683820358, + "language_loss": 0.72135127, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.74034035, + "num_input_tokens_seen": 62048730, + "step": 2864, + "time_per_iteration": 4.414710998535156 + }, + { + "auxiliary_loss_clip": 0.01121488, + "auxiliary_loss_mlp": 0.01049611, + "balance_loss_clip": 1.05114079, + "balance_loss_mlp": 1.03099, + "epoch": 0.17225311889373215, + "flos": 22273091078400.0, + "grad_norm": 1.9270646210248614, + "language_loss": 0.73002023, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.75173128, + "num_input_tokens_seen": 62069000, + "step": 2865, + "time_per_iteration": 2.7463715076446533 + }, + { + "auxiliary_loss_clip": 0.01145037, + "auxiliary_loss_mlp": 0.0077644, + "balance_loss_clip": 1.05669165, + "balance_loss_mlp": 1.00120521, + "epoch": 0.17231324214640012, + "flos": 21287774125440.0, + "grad_norm": 2.669585642962841, + "language_loss": 0.78357804, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.80279285, + "num_input_tokens_seen": 62086750, + "step": 2866, + "time_per_iteration": 2.785146713256836 + }, + { + "auxiliary_loss_clip": 0.01157272, + "auxiliary_loss_mlp": 0.01044358, + "balance_loss_clip": 1.05600274, + "balance_loss_mlp": 1.02536821, + "epoch": 0.17237336539906808, + "flos": 19680735640320.0, + "grad_norm": 2.551277931358127, + "language_loss": 0.79755104, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.81956732, + "num_input_tokens_seen": 62106240, + "step": 2867, + "time_per_iteration": 4.3145318031311035 + }, + { + "auxiliary_loss_clip": 0.01132297, + "auxiliary_loss_mlp": 0.01041396, + "balance_loss_clip": 1.0529356, + "balance_loss_mlp": 1.02274013, + "epoch": 0.17243348865173605, + "flos": 17529650784000.0, + "grad_norm": 1.8689780270661371, + "language_loss": 0.79206991, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.81380683, + "num_input_tokens_seen": 62124895, + "step": 2868, + "time_per_iteration": 2.7683827877044678 + }, + { + "auxiliary_loss_clip": 0.01111702, + "auxiliary_loss_mlp": 0.01041717, + "balance_loss_clip": 1.05331647, + "balance_loss_mlp": 1.02427697, + "epoch": 0.17249361190440402, + "flos": 18259858368000.0, + "grad_norm": 2.0344588273772923, + "language_loss": 0.84221756, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.86375177, + "num_input_tokens_seen": 62143510, + "step": 2869, + "time_per_iteration": 2.729156970977783 + }, + { + "auxiliary_loss_clip": 0.01132999, + "auxiliary_loss_mlp": 0.01048405, + "balance_loss_clip": 1.0535363, + "balance_loss_mlp": 1.02955842, + "epoch": 0.172553735157072, + "flos": 21174367910400.0, + "grad_norm": 1.8935704627114847, + "language_loss": 0.77299273, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.79480684, + "num_input_tokens_seen": 62162285, + "step": 2870, + "time_per_iteration": 2.752739191055298 + }, + { + "auxiliary_loss_clip": 0.0115398, + "auxiliary_loss_mlp": 0.01037809, + "balance_loss_clip": 1.05671024, + "balance_loss_mlp": 1.02110744, + "epoch": 0.17261385840973997, + "flos": 22273270646400.0, + "grad_norm": 2.0115587398764396, + "language_loss": 0.77409238, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.79601026, + "num_input_tokens_seen": 62180970, + "step": 2871, + "time_per_iteration": 2.660627603530884 + }, + { + "auxiliary_loss_clip": 0.01132474, + "auxiliary_loss_mlp": 0.01041073, + "balance_loss_clip": 1.05313993, + "balance_loss_mlp": 1.0222379, + "epoch": 0.17267398166240794, + "flos": 27922233830400.0, + "grad_norm": 2.203011669690562, + "language_loss": 0.74197829, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.76371384, + "num_input_tokens_seen": 62198965, + "step": 2872, + "time_per_iteration": 2.6959900856018066 + }, + { + "auxiliary_loss_clip": 0.01150773, + "auxiliary_loss_mlp": 0.01041508, + "balance_loss_clip": 1.05359554, + "balance_loss_mlp": 1.02362645, + "epoch": 0.1727341049150759, + "flos": 21945118970880.0, + "grad_norm": 1.7914171074077658, + "language_loss": 0.82336062, + "learning_rate": 3.790066109323988e-06, + "loss": 0.84528345, + "num_input_tokens_seen": 62219890, + "step": 2873, + "time_per_iteration": 2.603564977645874 + }, + { + "auxiliary_loss_clip": 0.01108819, + "auxiliary_loss_mlp": 0.01044995, + "balance_loss_clip": 1.04744792, + "balance_loss_mlp": 1.02522969, + "epoch": 0.17279422816774387, + "flos": 18107883924480.0, + "grad_norm": 3.7341652608759297, + "language_loss": 0.75355422, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.77509236, + "num_input_tokens_seen": 62237140, + "step": 2874, + "time_per_iteration": 2.8438260555267334 + }, + { + "auxiliary_loss_clip": 0.01159322, + "auxiliary_loss_mlp": 0.01044415, + "balance_loss_clip": 1.05658269, + "balance_loss_mlp": 1.02404249, + "epoch": 0.17285435142041183, + "flos": 21835447770240.0, + "grad_norm": 2.7053876793207037, + "language_loss": 0.80239916, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.82443655, + "num_input_tokens_seen": 62255405, + "step": 2875, + "time_per_iteration": 2.625183343887329 + }, + { + "auxiliary_loss_clip": 0.01135727, + "auxiliary_loss_mlp": 0.0105273, + "balance_loss_clip": 1.0535475, + "balance_loss_mlp": 1.03297722, + "epoch": 0.17291447467307983, + "flos": 18368452160640.0, + "grad_norm": 3.840653645811056, + "language_loss": 0.87621164, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.8980962, + "num_input_tokens_seen": 62271280, + "step": 2876, + "time_per_iteration": 2.6782751083374023 + }, + { + "auxiliary_loss_clip": 0.01136898, + "auxiliary_loss_mlp": 0.01044228, + "balance_loss_clip": 1.05730534, + "balance_loss_mlp": 1.02559566, + "epoch": 0.1729745979257478, + "flos": 18624638937600.0, + "grad_norm": 1.8931416121171032, + "language_loss": 0.84386718, + "learning_rate": 3.789370767013681e-06, + "loss": 0.86567843, + "num_input_tokens_seen": 62289140, + "step": 2877, + "time_per_iteration": 2.681131362915039 + }, + { + "auxiliary_loss_clip": 0.01120759, + "auxiliary_loss_mlp": 0.01043962, + "balance_loss_clip": 1.05222571, + "balance_loss_mlp": 1.02499604, + "epoch": 0.17303472117841576, + "flos": 22998234844800.0, + "grad_norm": 2.106635210245156, + "language_loss": 0.79660022, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.81824744, + "num_input_tokens_seen": 62307490, + "step": 2878, + "time_per_iteration": 2.8118834495544434 + }, + { + "auxiliary_loss_clip": 0.01136112, + "auxiliary_loss_mlp": 0.01047222, + "balance_loss_clip": 1.05593777, + "balance_loss_mlp": 1.02953172, + "epoch": 0.17309484443108372, + "flos": 25664386775040.0, + "grad_norm": 1.9675557254753375, + "language_loss": 0.70236337, + "learning_rate": 3.78902268871344e-06, + "loss": 0.72419673, + "num_input_tokens_seen": 62328570, + "step": 2879, + "time_per_iteration": 2.7998502254486084 + }, + { + "auxiliary_loss_clip": 0.01130517, + "auxiliary_loss_mlp": 0.01051722, + "balance_loss_clip": 1.05183411, + "balance_loss_mlp": 1.03337598, + "epoch": 0.1731549676837517, + "flos": 13552903313280.0, + "grad_norm": 2.0545155253910163, + "language_loss": 0.82884222, + "learning_rate": 3.78884854780014e-06, + "loss": 0.85066462, + "num_input_tokens_seen": 62345735, + "step": 2880, + "time_per_iteration": 2.6707684993743896 + }, + { + "auxiliary_loss_clip": 0.01110706, + "auxiliary_loss_mlp": 0.01054327, + "balance_loss_clip": 1.05214918, + "balance_loss_mlp": 1.03303647, + "epoch": 0.17321509093641965, + "flos": 22857070394880.0, + "grad_norm": 1.9029231217608267, + "language_loss": 0.80879176, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.83044201, + "num_input_tokens_seen": 62365525, + "step": 2881, + "time_per_iteration": 2.7851576805114746 + }, + { + "auxiliary_loss_clip": 0.01135983, + "auxiliary_loss_mlp": 0.01046895, + "balance_loss_clip": 1.05544055, + "balance_loss_mlp": 1.02921653, + "epoch": 0.17327521418908762, + "flos": 24352785653760.0, + "grad_norm": 2.753231520615002, + "language_loss": 0.77268815, + "learning_rate": 3.788500062480197e-06, + "loss": 0.79451692, + "num_input_tokens_seen": 62385160, + "step": 2882, + "time_per_iteration": 2.7785212993621826 + }, + { + "auxiliary_loss_clip": 0.01124099, + "auxiliary_loss_mlp": 0.01047516, + "balance_loss_clip": 1.0633558, + "balance_loss_mlp": 1.02947998, + "epoch": 0.1733353374417556, + "flos": 33105651816960.0, + "grad_norm": 2.096311926604511, + "language_loss": 0.76714236, + "learning_rate": 3.788325718086769e-06, + "loss": 0.78885853, + "num_input_tokens_seen": 62405280, + "step": 2883, + "time_per_iteration": 2.838848352432251 + }, + { + "auxiliary_loss_clip": 0.01110924, + "auxiliary_loss_mlp": 0.0104619, + "balance_loss_clip": 1.04929209, + "balance_loss_mlp": 1.02821302, + "epoch": 0.17339546069442358, + "flos": 24388947671040.0, + "grad_norm": 2.1194201700326873, + "language_loss": 0.8555252, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.87709635, + "num_input_tokens_seen": 62423665, + "step": 2884, + "time_per_iteration": 2.829376220703125 + }, + { + "auxiliary_loss_clip": 0.01133962, + "auxiliary_loss_mlp": 0.00775817, + "balance_loss_clip": 1.05472779, + "balance_loss_mlp": 1.00088096, + "epoch": 0.17345558394709154, + "flos": 27454174680960.0, + "grad_norm": 1.7131036779262108, + "language_loss": 0.74756771, + "learning_rate": 3.787976825866055e-06, + "loss": 0.76666546, + "num_input_tokens_seen": 62445170, + "step": 2885, + "time_per_iteration": 2.8710989952087402 + }, + { + "auxiliary_loss_clip": 0.01128977, + "auxiliary_loss_mlp": 0.01044901, + "balance_loss_clip": 1.05498922, + "balance_loss_mlp": 1.0280925, + "epoch": 0.1735157071997595, + "flos": 24682158391680.0, + "grad_norm": 2.374438581614022, + "language_loss": 0.7107017, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.73244053, + "num_input_tokens_seen": 62466135, + "step": 2886, + "time_per_iteration": 2.726621150970459 + }, + { + "auxiliary_loss_clip": 0.01142411, + "auxiliary_loss_mlp": 0.01041857, + "balance_loss_clip": 1.05233932, + "balance_loss_mlp": 1.02408338, + "epoch": 0.17357583045242747, + "flos": 21688932193920.0, + "grad_norm": 2.0566537172661747, + "language_loss": 0.69906294, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.72090566, + "num_input_tokens_seen": 62483910, + "step": 2887, + "time_per_iteration": 2.7066688537597656 + }, + { + "auxiliary_loss_clip": 0.01116425, + "auxiliary_loss_mlp": 0.01045383, + "balance_loss_clip": 1.05328536, + "balance_loss_mlp": 1.02728677, + "epoch": 0.17363595370509544, + "flos": 15375728753280.0, + "grad_norm": 2.038016964464323, + "language_loss": 0.85257947, + "learning_rate": 3.787452979049585e-06, + "loss": 0.87419748, + "num_input_tokens_seen": 62501530, + "step": 2888, + "time_per_iteration": 2.7514970302581787 + }, + { + "auxiliary_loss_clip": 0.01095063, + "auxiliary_loss_mlp": 0.01049413, + "balance_loss_clip": 1.05020595, + "balance_loss_mlp": 1.02822983, + "epoch": 0.1736960769577634, + "flos": 23440941970560.0, + "grad_norm": 2.196318077733749, + "language_loss": 0.78491282, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.80635762, + "num_input_tokens_seen": 62521295, + "step": 2889, + "time_per_iteration": 2.8221559524536133 + }, + { + "auxiliary_loss_clip": 0.01112139, + "auxiliary_loss_mlp": 0.0077601, + "balance_loss_clip": 1.05236733, + "balance_loss_mlp": 1.00114667, + "epoch": 0.1737562002104314, + "flos": 18587830475520.0, + "grad_norm": 2.333227367674716, + "language_loss": 0.84076989, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.85965133, + "num_input_tokens_seen": 62539615, + "step": 2890, + "time_per_iteration": 2.7213382720947266 + }, + { + "auxiliary_loss_clip": 0.01142218, + "auxiliary_loss_mlp": 0.01054918, + "balance_loss_clip": 1.05530691, + "balance_loss_mlp": 1.03752589, + "epoch": 0.17381632346309936, + "flos": 15998060816640.0, + "grad_norm": 2.7278091568285596, + "language_loss": 0.82205319, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.84402454, + "num_input_tokens_seen": 62556820, + "step": 2891, + "time_per_iteration": 2.625162363052368 + }, + { + "auxiliary_loss_clip": 0.01097361, + "auxiliary_loss_mlp": 0.01050012, + "balance_loss_clip": 1.04281187, + "balance_loss_mlp": 1.02876878, + "epoch": 0.17387644671576732, + "flos": 13369830670080.0, + "grad_norm": 1.9017653264876209, + "language_loss": 0.81200826, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.83348203, + "num_input_tokens_seen": 62572450, + "step": 2892, + "time_per_iteration": 2.7682459354400635 + }, + { + "auxiliary_loss_clip": 0.01148834, + "auxiliary_loss_mlp": 0.0105551, + "balance_loss_clip": 1.05707812, + "balance_loss_mlp": 1.03631687, + "epoch": 0.1739365699684353, + "flos": 26615516958720.0, + "grad_norm": 2.0056711213447436, + "language_loss": 0.73950225, + "learning_rate": 3.786578545502627e-06, + "loss": 0.76154572, + "num_input_tokens_seen": 62592580, + "step": 2893, + "time_per_iteration": 2.8463022708892822 + }, + { + "auxiliary_loss_clip": 0.01132474, + "auxiliary_loss_mlp": 0.01043509, + "balance_loss_clip": 1.05198765, + "balance_loss_mlp": 1.02443516, + "epoch": 0.17399669322110325, + "flos": 23367971491200.0, + "grad_norm": 4.010773627073901, + "language_loss": 0.82507658, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.84683645, + "num_input_tokens_seen": 62611220, + "step": 2894, + "time_per_iteration": 2.719564914703369 + }, + { + "auxiliary_loss_clip": 0.01113951, + "auxiliary_loss_mlp": 0.01046249, + "balance_loss_clip": 1.0506922, + "balance_loss_mlp": 1.02463603, + "epoch": 0.17405681647377122, + "flos": 22054107813120.0, + "grad_norm": 2.3322053123967574, + "language_loss": 0.73826683, + "learning_rate": 3.786228297806741e-06, + "loss": 0.7598688, + "num_input_tokens_seen": 62629185, + "step": 2895, + "time_per_iteration": 2.743992805480957 + }, + { + "auxiliary_loss_clip": 0.01037578, + "auxiliary_loss_mlp": 0.01011099, + "balance_loss_clip": 1.0404408, + "balance_loss_mlp": 1.00788069, + "epoch": 0.1741169397264392, + "flos": 61457559114240.0, + "grad_norm": 0.8765647158253519, + "language_loss": 0.62754023, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.64802706, + "num_input_tokens_seen": 62691895, + "step": 2896, + "time_per_iteration": 3.345099687576294 + }, + { + "auxiliary_loss_clip": 0.0113101, + "auxiliary_loss_mlp": 0.00776588, + "balance_loss_clip": 1.05246758, + "balance_loss_mlp": 1.00102258, + "epoch": 0.17417706297910718, + "flos": 27017680608000.0, + "grad_norm": 1.7338863964520728, + "language_loss": 0.75822324, + "learning_rate": 3.785877779175034e-06, + "loss": 0.77729923, + "num_input_tokens_seen": 62713790, + "step": 2897, + "time_per_iteration": 2.772292137145996 + }, + { + "auxiliary_loss_clip": 0.01141357, + "auxiliary_loss_mlp": 0.01042983, + "balance_loss_clip": 1.0545547, + "balance_loss_mlp": 1.02512598, + "epoch": 0.17423718623177514, + "flos": 33508856960640.0, + "grad_norm": 1.944569306659421, + "language_loss": 0.6883949, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.71023834, + "num_input_tokens_seen": 62736285, + "step": 2898, + "time_per_iteration": 2.7278554439544678 + }, + { + "auxiliary_loss_clip": 0.01128715, + "auxiliary_loss_mlp": 0.01044216, + "balance_loss_clip": 1.05251193, + "balance_loss_mlp": 1.02504694, + "epoch": 0.1742973094844431, + "flos": 27198634348800.0, + "grad_norm": 2.99011081330885, + "language_loss": 0.76445562, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.78618491, + "num_input_tokens_seen": 62756240, + "step": 2899, + "time_per_iteration": 2.8052010536193848 + }, + { + "auxiliary_loss_clip": 0.01095069, + "auxiliary_loss_mlp": 0.01045896, + "balance_loss_clip": 1.04680347, + "balance_loss_mlp": 1.02632213, + "epoch": 0.17435743273711107, + "flos": 22710734386560.0, + "grad_norm": 3.2965812335226357, + "language_loss": 0.72860038, + "learning_rate": 3.785351493339121e-06, + "loss": 0.75001007, + "num_input_tokens_seen": 62775910, + "step": 2900, + "time_per_iteration": 2.868218421936035 + }, + { + "auxiliary_loss_clip": 0.01110522, + "auxiliary_loss_mlp": 0.00776698, + "balance_loss_clip": 1.05202782, + "balance_loss_mlp": 1.000983, + "epoch": 0.17441755598977904, + "flos": 41646466039680.0, + "grad_norm": 1.5488662608930523, + "language_loss": 0.69946706, + "learning_rate": 3.785175929316863e-06, + "loss": 0.71833932, + "num_input_tokens_seen": 62799385, + "step": 2901, + "time_per_iteration": 4.407040596008301 + }, + { + "auxiliary_loss_clip": 0.01129098, + "auxiliary_loss_mlp": 0.01045525, + "balance_loss_clip": 1.05246592, + "balance_loss_mlp": 1.02764344, + "epoch": 0.174477679242447, + "flos": 26287077974400.0, + "grad_norm": 2.1785959913748965, + "language_loss": 0.76588804, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.78763425, + "num_input_tokens_seen": 62819380, + "step": 2902, + "time_per_iteration": 4.2244462966918945 + }, + { + "auxiliary_loss_clip": 0.01145685, + "auxiliary_loss_mlp": 0.0104382, + "balance_loss_clip": 1.0531354, + "balance_loss_mlp": 1.02567625, + "epoch": 0.174537802495115, + "flos": 17858412990720.0, + "grad_norm": 2.2508699895191073, + "language_loss": 0.81588745, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.83778256, + "num_input_tokens_seen": 62836205, + "step": 2903, + "time_per_iteration": 4.132925271987915 + }, + { + "auxiliary_loss_clip": 0.01126443, + "auxiliary_loss_mlp": 0.0103942, + "balance_loss_clip": 1.05449986, + "balance_loss_mlp": 1.02135992, + "epoch": 0.17459792574778296, + "flos": 16940715390720.0, + "grad_norm": 2.4085694554154187, + "language_loss": 0.73316491, + "learning_rate": 3.784648831112429e-06, + "loss": 0.75482351, + "num_input_tokens_seen": 62854045, + "step": 2904, + "time_per_iteration": 2.7033374309539795 + }, + { + "auxiliary_loss_clip": 0.01105192, + "auxiliary_loss_mlp": 0.0104577, + "balance_loss_clip": 1.05250716, + "balance_loss_mlp": 1.02822256, + "epoch": 0.17465804900045093, + "flos": 25520026014720.0, + "grad_norm": 1.8783326609306377, + "language_loss": 0.64233291, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.66384256, + "num_input_tokens_seen": 62873075, + "step": 2905, + "time_per_iteration": 2.8325791358947754 + }, + { + "auxiliary_loss_clip": 0.01135256, + "auxiliary_loss_mlp": 0.01053006, + "balance_loss_clip": 1.05869055, + "balance_loss_mlp": 1.03370619, + "epoch": 0.1747181722531189, + "flos": 24129708238080.0, + "grad_norm": 2.820817719352069, + "language_loss": 0.79504299, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.81692564, + "num_input_tokens_seen": 62892675, + "step": 2906, + "time_per_iteration": 4.491498231887817 + }, + { + "auxiliary_loss_clip": 0.01146195, + "auxiliary_loss_mlp": 0.01050729, + "balance_loss_clip": 1.05623174, + "balance_loss_mlp": 1.03258538, + "epoch": 0.17477829550578686, + "flos": 17748813617280.0, + "grad_norm": 2.262709441571415, + "language_loss": 0.81318873, + "learning_rate": 3.784121123841449e-06, + "loss": 0.83515799, + "num_input_tokens_seen": 62910675, + "step": 2907, + "time_per_iteration": 2.6855854988098145 + }, + { + "auxiliary_loss_clip": 0.01143202, + "auxiliary_loss_mlp": 0.01043315, + "balance_loss_clip": 1.05374384, + "balance_loss_mlp": 1.0253861, + "epoch": 0.17483841875845482, + "flos": 15377344865280.0, + "grad_norm": 2.068635027461873, + "language_loss": 0.81342787, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.83529305, + "num_input_tokens_seen": 62928130, + "step": 2908, + "time_per_iteration": 2.6449570655822754 + }, + { + "auxiliary_loss_clip": 0.01127136, + "auxiliary_loss_mlp": 0.01050925, + "balance_loss_clip": 1.05178046, + "balance_loss_mlp": 1.03163743, + "epoch": 0.17489854201112282, + "flos": 17163254102400.0, + "grad_norm": 3.147433356867123, + "language_loss": 0.80020624, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.82198691, + "num_input_tokens_seen": 62944290, + "step": 2909, + "time_per_iteration": 2.6820569038391113 + }, + { + "auxiliary_loss_clip": 0.0109059, + "auxiliary_loss_mlp": 0.01052625, + "balance_loss_clip": 1.05020881, + "balance_loss_mlp": 1.0310595, + "epoch": 0.17495866526379078, + "flos": 19755286318080.0, + "grad_norm": 1.6978440546881337, + "language_loss": 0.76742244, + "learning_rate": 3.783592807684017e-06, + "loss": 0.7888546, + "num_input_tokens_seen": 62963505, + "step": 2910, + "time_per_iteration": 2.6980416774749756 + }, + { + "auxiliary_loss_clip": 0.01158552, + "auxiliary_loss_mlp": 0.01049407, + "balance_loss_clip": 1.05618358, + "balance_loss_mlp": 1.03059566, + "epoch": 0.17501878851645875, + "flos": 28511133310080.0, + "grad_norm": 1.9812610358315632, + "language_loss": 0.8698765, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.89195609, + "num_input_tokens_seen": 62985020, + "step": 2911, + "time_per_iteration": 2.692662477493286 + }, + { + "auxiliary_loss_clip": 0.01154744, + "auxiliary_loss_mlp": 0.00777232, + "balance_loss_clip": 1.05323184, + "balance_loss_mlp": 1.00110698, + "epoch": 0.1750789117691267, + "flos": 17931203902080.0, + "grad_norm": 3.030740090796483, + "language_loss": 0.89883876, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.91815847, + "num_input_tokens_seen": 63001745, + "step": 2912, + "time_per_iteration": 2.600738763809204 + }, + { + "auxiliary_loss_clip": 0.01146165, + "auxiliary_loss_mlp": 0.01045616, + "balance_loss_clip": 1.0538094, + "balance_loss_mlp": 1.02655411, + "epoch": 0.17513903502179468, + "flos": 18259427404800.0, + "grad_norm": 2.03479884577424, + "language_loss": 0.72818935, + "learning_rate": 3.783063882820439e-06, + "loss": 0.75010711, + "num_input_tokens_seen": 63019750, + "step": 2913, + "time_per_iteration": 2.623342275619507 + }, + { + "auxiliary_loss_clip": 0.01140074, + "auxiliary_loss_mlp": 0.01043928, + "balance_loss_clip": 1.05781865, + "balance_loss_mlp": 1.02557003, + "epoch": 0.17519915827446264, + "flos": 20704728562560.0, + "grad_norm": 2.137073079496124, + "language_loss": 0.6891731, + "learning_rate": 3.782887439295741e-06, + "loss": 0.71101314, + "num_input_tokens_seen": 63039500, + "step": 2914, + "time_per_iteration": 2.7065770626068115 + }, + { + "auxiliary_loss_clip": 0.01142434, + "auxiliary_loss_mlp": 0.01045043, + "balance_loss_clip": 1.05532789, + "balance_loss_mlp": 1.02649403, + "epoch": 0.1752592815271306, + "flos": 20523415685760.0, + "grad_norm": 2.051329837479214, + "language_loss": 0.93125081, + "learning_rate": 3.782710928163772e-06, + "loss": 0.9531256, + "num_input_tokens_seen": 63059785, + "step": 2915, + "time_per_iteration": 2.659029245376587 + }, + { + "auxiliary_loss_clip": 0.01114731, + "auxiliary_loss_mlp": 0.01040999, + "balance_loss_clip": 1.04957223, + "balance_loss_mlp": 1.02243853, + "epoch": 0.1753194047797986, + "flos": 21799178012160.0, + "grad_norm": 1.604344576738792, + "language_loss": 0.81092978, + "learning_rate": 3.782534349431226e-06, + "loss": 0.83248705, + "num_input_tokens_seen": 63079385, + "step": 2916, + "time_per_iteration": 2.7099549770355225 + }, + { + "auxiliary_loss_clip": 0.0114211, + "auxiliary_loss_mlp": 0.01046221, + "balance_loss_clip": 1.05090034, + "balance_loss_mlp": 1.02780342, + "epoch": 0.17537952803246656, + "flos": 20668351063680.0, + "grad_norm": 3.7582760939418716, + "language_loss": 0.73829222, + "learning_rate": 3.782357703104799e-06, + "loss": 0.76017547, + "num_input_tokens_seen": 63098970, + "step": 2917, + "time_per_iteration": 2.666717767715454 + }, + { + "auxiliary_loss_clip": 0.01133449, + "auxiliary_loss_mlp": 0.01047353, + "balance_loss_clip": 1.05319786, + "balance_loss_mlp": 1.02821994, + "epoch": 0.17543965128513453, + "flos": 23295072839040.0, + "grad_norm": 1.813699779869167, + "language_loss": 0.76739681, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.78920484, + "num_input_tokens_seen": 63118750, + "step": 2918, + "time_per_iteration": 2.647634744644165 + }, + { + "auxiliary_loss_clip": 0.01093958, + "auxiliary_loss_mlp": 0.01045643, + "balance_loss_clip": 1.0476644, + "balance_loss_mlp": 1.02425694, + "epoch": 0.1754997745378025, + "flos": 29095615416960.0, + "grad_norm": 2.436739755969174, + "language_loss": 0.73624814, + "learning_rate": 3.782004207697098e-06, + "loss": 0.75764406, + "num_input_tokens_seen": 63136865, + "step": 2919, + "time_per_iteration": 2.7904632091522217 + }, + { + "auxiliary_loss_clip": 0.0112465, + "auxiliary_loss_mlp": 0.01046524, + "balance_loss_clip": 1.04938293, + "balance_loss_mlp": 1.02805829, + "epoch": 0.17555989779047046, + "flos": 30371844620160.0, + "grad_norm": 2.5113730227003814, + "language_loss": 0.74840331, + "learning_rate": 3.781827358629228e-06, + "loss": 0.77011508, + "num_input_tokens_seen": 63158325, + "step": 2920, + "time_per_iteration": 2.727890968322754 + }, + { + "auxiliary_loss_clip": 0.01117257, + "auxiliary_loss_mlp": 0.01042893, + "balance_loss_clip": 1.0462867, + "balance_loss_mlp": 1.02371216, + "epoch": 0.17562002104313842, + "flos": 23287746464640.0, + "grad_norm": 3.6617213109535536, + "language_loss": 0.79731411, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.81891561, + "num_input_tokens_seen": 63173115, + "step": 2921, + "time_per_iteration": 2.753817558288574 + }, + { + "auxiliary_loss_clip": 0.01121718, + "auxiliary_loss_mlp": 0.01046234, + "balance_loss_clip": 1.05232286, + "balance_loss_mlp": 1.02679133, + "epoch": 0.1756801442958064, + "flos": 24790500789120.0, + "grad_norm": 2.6301689129577546, + "language_loss": 0.87826073, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.89994025, + "num_input_tokens_seen": 63192880, + "step": 2922, + "time_per_iteration": 2.7411837577819824 + }, + { + "auxiliary_loss_clip": 0.01144004, + "auxiliary_loss_mlp": 0.01047403, + "balance_loss_clip": 1.05196273, + "balance_loss_mlp": 1.02778149, + "epoch": 0.17574026754847438, + "flos": 25771651764480.0, + "grad_norm": 4.4893841411537085, + "language_loss": 0.62347209, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.64538622, + "num_input_tokens_seen": 63214395, + "step": 2923, + "time_per_iteration": 2.7666683197021484 + }, + { + "auxiliary_loss_clip": 0.01134872, + "auxiliary_loss_mlp": 0.01048692, + "balance_loss_clip": 1.05887377, + "balance_loss_mlp": 1.02847457, + "epoch": 0.17580039080114235, + "flos": 17456608477440.0, + "grad_norm": 2.8552131957437914, + "language_loss": 0.80392253, + "learning_rate": 3.78111928675413e-06, + "loss": 0.82575822, + "num_input_tokens_seen": 63231020, + "step": 2924, + "time_per_iteration": 2.729403257369995 + }, + { + "auxiliary_loss_clip": 0.01132783, + "auxiliary_loss_mlp": 0.01051456, + "balance_loss_clip": 1.05193377, + "balance_loss_mlp": 1.03082108, + "epoch": 0.1758605140538103, + "flos": 14864648088960.0, + "grad_norm": 5.080042666316876, + "language_loss": 0.71374178, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.73558426, + "num_input_tokens_seen": 63246245, + "step": 2925, + "time_per_iteration": 2.9538233280181885 + }, + { + "auxiliary_loss_clip": 0.01117196, + "auxiliary_loss_mlp": 0.01045706, + "balance_loss_clip": 1.05052948, + "balance_loss_mlp": 1.02744341, + "epoch": 0.17592063730647828, + "flos": 23004268329600.0, + "grad_norm": 1.6620026542608322, + "language_loss": 0.71931666, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.74094564, + "num_input_tokens_seen": 63267790, + "step": 2926, + "time_per_iteration": 2.7738964557647705 + }, + { + "auxiliary_loss_clip": 0.01105944, + "auxiliary_loss_mlp": 0.01045732, + "balance_loss_clip": 1.04915071, + "balance_loss_mlp": 1.02253425, + "epoch": 0.17598076055914624, + "flos": 20741501111040.0, + "grad_norm": 2.6318732447225837, + "language_loss": 0.84724289, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.86875963, + "num_input_tokens_seen": 63286830, + "step": 2927, + "time_per_iteration": 2.704437494277954 + }, + { + "auxiliary_loss_clip": 0.01100437, + "auxiliary_loss_mlp": 0.01046684, + "balance_loss_clip": 1.05039644, + "balance_loss_mlp": 1.02887452, + "epoch": 0.1760408838118142, + "flos": 34092441227520.0, + "grad_norm": 1.9547597089289632, + "language_loss": 0.72147644, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.74294758, + "num_input_tokens_seen": 63308870, + "step": 2928, + "time_per_iteration": 2.793802261352539 + }, + { + "auxiliary_loss_clip": 0.01120251, + "auxiliary_loss_mlp": 0.01045623, + "balance_loss_clip": 1.0516876, + "balance_loss_mlp": 1.02679992, + "epoch": 0.1761010070644822, + "flos": 24168384207360.0, + "grad_norm": 1.8474008440192304, + "language_loss": 0.83097279, + "learning_rate": 3.780232677305744e-06, + "loss": 0.85263157, + "num_input_tokens_seen": 63329005, + "step": 2929, + "time_per_iteration": 2.733339786529541 + }, + { + "auxiliary_loss_clip": 0.01124127, + "auxiliary_loss_mlp": 0.01042521, + "balance_loss_clip": 1.04853475, + "balance_loss_mlp": 1.02479422, + "epoch": 0.17616113031715017, + "flos": 26576697335040.0, + "grad_norm": 2.4427170552109163, + "language_loss": 0.79211783, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.81378424, + "num_input_tokens_seen": 63349390, + "step": 2930, + "time_per_iteration": 2.748080015182495 + }, + { + "auxiliary_loss_clip": 0.01160654, + "auxiliary_loss_mlp": 0.01047281, + "balance_loss_clip": 1.05925918, + "balance_loss_mlp": 1.02758813, + "epoch": 0.17622125356981813, + "flos": 25666685245440.0, + "grad_norm": 2.504124366499191, + "language_loss": 0.76502466, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.78710401, + "num_input_tokens_seen": 63368835, + "step": 2931, + "time_per_iteration": 2.6691603660583496 + }, + { + "auxiliary_loss_clip": 0.01076453, + "auxiliary_loss_mlp": 0.01043586, + "balance_loss_clip": 1.04577017, + "balance_loss_mlp": 1.02478647, + "epoch": 0.1762813768224861, + "flos": 16508530949760.0, + "grad_norm": 2.941321746162514, + "language_loss": 0.76070881, + "learning_rate": 3.779699901503696e-06, + "loss": 0.78190923, + "num_input_tokens_seen": 63385220, + "step": 2932, + "time_per_iteration": 2.809630870819092 + }, + { + "auxiliary_loss_clip": 0.01148627, + "auxiliary_loss_mlp": 0.01043149, + "balance_loss_clip": 1.05284405, + "balance_loss_mlp": 1.0229789, + "epoch": 0.17634150007515406, + "flos": 11211850402560.0, + "grad_norm": 5.168612276821382, + "language_loss": 0.90027422, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.92219198, + "num_input_tokens_seen": 63400865, + "step": 2933, + "time_per_iteration": 2.6665337085723877 + }, + { + "auxiliary_loss_clip": 0.01154114, + "auxiliary_loss_mlp": 0.01055985, + "balance_loss_clip": 1.05539656, + "balance_loss_mlp": 1.03766203, + "epoch": 0.17640162332782203, + "flos": 23659925235840.0, + "grad_norm": 2.009210784374188, + "language_loss": 0.88323247, + "learning_rate": 3.779344380192448e-06, + "loss": 0.90533352, + "num_input_tokens_seen": 63421390, + "step": 2934, + "time_per_iteration": 2.6649580001831055 + }, + { + "auxiliary_loss_clip": 0.01128495, + "auxiliary_loss_mlp": 0.01048067, + "balance_loss_clip": 1.05581188, + "balance_loss_mlp": 1.03028131, + "epoch": 0.17646174658049, + "flos": 53796984606720.0, + "grad_norm": 1.6302121247923247, + "language_loss": 0.70403945, + "learning_rate": 3.779166518324077e-06, + "loss": 0.72580504, + "num_input_tokens_seen": 63444715, + "step": 2935, + "time_per_iteration": 3.006019115447998 + }, + { + "auxiliary_loss_clip": 0.01126189, + "auxiliary_loss_mlp": 0.01040034, + "balance_loss_clip": 1.05360174, + "balance_loss_mlp": 1.02135396, + "epoch": 0.17652186983315798, + "flos": 24243868638720.0, + "grad_norm": 2.5931578566124807, + "language_loss": 0.69721985, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.71888208, + "num_input_tokens_seen": 63465525, + "step": 2936, + "time_per_iteration": 2.7517428398132324 + }, + { + "auxiliary_loss_clip": 0.01105644, + "auxiliary_loss_mlp": 0.01045896, + "balance_loss_clip": 1.05023837, + "balance_loss_mlp": 1.02737129, + "epoch": 0.17658199308582595, + "flos": 27454282421760.0, + "grad_norm": 1.9170676229980566, + "language_loss": 0.71288073, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.73439616, + "num_input_tokens_seen": 63485815, + "step": 2937, + "time_per_iteration": 2.837181329727173 + }, + { + "auxiliary_loss_clip": 0.01141008, + "auxiliary_loss_mlp": 0.01046843, + "balance_loss_clip": 1.05945122, + "balance_loss_mlp": 1.02674472, + "epoch": 0.17664211633849392, + "flos": 22418672901120.0, + "grad_norm": 2.267148270780071, + "language_loss": 0.75439745, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.77627593, + "num_input_tokens_seen": 63503905, + "step": 2938, + "time_per_iteration": 2.883162021636963 + }, + { + "auxiliary_loss_clip": 0.01147345, + "auxiliary_loss_mlp": 0.01043976, + "balance_loss_clip": 1.05576169, + "balance_loss_mlp": 1.02553487, + "epoch": 0.17670223959116188, + "flos": 24715124098560.0, + "grad_norm": 2.921726967662053, + "language_loss": 0.71015209, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.73206532, + "num_input_tokens_seen": 63521985, + "step": 2939, + "time_per_iteration": 2.6938419342041016 + }, + { + "auxiliary_loss_clip": 0.01160437, + "auxiliary_loss_mlp": 0.01046921, + "balance_loss_clip": 1.05818558, + "balance_loss_mlp": 1.02794337, + "epoch": 0.17676236284382985, + "flos": 22527051212160.0, + "grad_norm": 3.114901170192376, + "language_loss": 0.73513985, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.75721341, + "num_input_tokens_seen": 63539830, + "step": 2940, + "time_per_iteration": 4.145469665527344 + }, + { + "auxiliary_loss_clip": 0.0112582, + "auxiliary_loss_mlp": 0.01046611, + "balance_loss_clip": 1.05631542, + "balance_loss_mlp": 1.02731109, + "epoch": 0.1768224860964978, + "flos": 12385160161920.0, + "grad_norm": 3.071469776016301, + "language_loss": 0.85375023, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.87547457, + "num_input_tokens_seen": 63555495, + "step": 2941, + "time_per_iteration": 4.279599666595459 + }, + { + "auxiliary_loss_clip": 0.01161068, + "auxiliary_loss_mlp": 0.01045254, + "balance_loss_clip": 1.05717027, + "balance_loss_mlp": 1.0257628, + "epoch": 0.1768826093491658, + "flos": 24353360271360.0, + "grad_norm": 2.434766510066968, + "language_loss": 0.76885259, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.79091585, + "num_input_tokens_seen": 63575290, + "step": 2942, + "time_per_iteration": 4.2280871868133545 + }, + { + "auxiliary_loss_clip": 0.01106234, + "auxiliary_loss_mlp": 0.00780676, + "balance_loss_clip": 1.04992843, + "balance_loss_mlp": 1.00087166, + "epoch": 0.17694273260183377, + "flos": 23587062497280.0, + "grad_norm": 3.301743041114179, + "language_loss": 0.8024286, + "learning_rate": 3.77774119516197e-06, + "loss": 0.82129776, + "num_input_tokens_seen": 63594670, + "step": 2943, + "time_per_iteration": 2.8921029567718506 + }, + { + "auxiliary_loss_clip": 0.01132848, + "auxiliary_loss_mlp": 0.01052225, + "balance_loss_clip": 1.05352235, + "balance_loss_mlp": 1.03124392, + "epoch": 0.17700285585450173, + "flos": 26760991040640.0, + "grad_norm": 5.7613375603973465, + "language_loss": 0.80809408, + "learning_rate": 3.777562726341155e-06, + "loss": 0.82994485, + "num_input_tokens_seen": 63614780, + "step": 2944, + "time_per_iteration": 2.692831039428711 + }, + { + "auxiliary_loss_clip": 0.01161854, + "auxiliary_loss_mlp": 0.01056825, + "balance_loss_clip": 1.05807233, + "balance_loss_mlp": 1.03796625, + "epoch": 0.1770629791071697, + "flos": 42776323320960.0, + "grad_norm": 2.4257754996125227, + "language_loss": 0.73812854, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.7603153, + "num_input_tokens_seen": 63637190, + "step": 2945, + "time_per_iteration": 2.782910108566284 + }, + { + "auxiliary_loss_clip": 0.011481, + "auxiliary_loss_mlp": 0.01047361, + "balance_loss_clip": 1.05756998, + "balance_loss_mlp": 1.02862108, + "epoch": 0.17712310235983766, + "flos": 17345572560000.0, + "grad_norm": 2.8106797532110637, + "language_loss": 0.7793628, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.80131739, + "num_input_tokens_seen": 63652140, + "step": 2946, + "time_per_iteration": 4.278741121292114 + }, + { + "auxiliary_loss_clip": 0.01109059, + "auxiliary_loss_mlp": 0.01052842, + "balance_loss_clip": 1.04997015, + "balance_loss_mlp": 1.03341079, + "epoch": 0.17718322561250563, + "flos": 23878477537920.0, + "grad_norm": 2.172386857191393, + "language_loss": 0.76068008, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.7822991, + "num_input_tokens_seen": 63671700, + "step": 2947, + "time_per_iteration": 2.7949914932250977 + }, + { + "auxiliary_loss_clip": 0.0114934, + "auxiliary_loss_mlp": 0.01044342, + "balance_loss_clip": 1.05480659, + "balance_loss_mlp": 1.025388, + "epoch": 0.1772433488651736, + "flos": 36466352104320.0, + "grad_norm": 2.6793588646204745, + "language_loss": 0.72557831, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.74751514, + "num_input_tokens_seen": 63691685, + "step": 2948, + "time_per_iteration": 2.901662826538086 + }, + { + "auxiliary_loss_clip": 0.01151572, + "auxiliary_loss_mlp": 0.01050692, + "balance_loss_clip": 1.05921662, + "balance_loss_mlp": 1.03236949, + "epoch": 0.1773034721178416, + "flos": 26684716510080.0, + "grad_norm": 1.8296543316983853, + "language_loss": 0.81782824, + "learning_rate": 3.776669371292171e-06, + "loss": 0.8398509, + "num_input_tokens_seen": 63711720, + "step": 2949, + "time_per_iteration": 2.7284891605377197 + }, + { + "auxiliary_loss_clip": 0.01080853, + "auxiliary_loss_mlp": 0.0100651, + "balance_loss_clip": 1.04975748, + "balance_loss_mlp": 1.00226629, + "epoch": 0.17736359537050955, + "flos": 57117467617920.0, + "grad_norm": 0.768126622018234, + "language_loss": 0.64989161, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.67076528, + "num_input_tokens_seen": 63776280, + "step": 2950, + "time_per_iteration": 3.2761552333831787 + }, + { + "auxiliary_loss_clip": 0.01121454, + "auxiliary_loss_mlp": 0.01045861, + "balance_loss_clip": 1.05373287, + "balance_loss_mlp": 1.02743077, + "epoch": 0.17742371862317752, + "flos": 27198203385600.0, + "grad_norm": 2.9882590699755927, + "language_loss": 0.83619881, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.85787189, + "num_input_tokens_seen": 63797535, + "step": 2951, + "time_per_iteration": 2.7637627124786377 + }, + { + "auxiliary_loss_clip": 0.01125929, + "auxiliary_loss_mlp": 0.01046039, + "balance_loss_clip": 1.05109882, + "balance_loss_mlp": 1.02682269, + "epoch": 0.17748384187584548, + "flos": 20959694277120.0, + "grad_norm": 2.3133151959471796, + "language_loss": 0.80395055, + "learning_rate": 3.776132549750806e-06, + "loss": 0.82567012, + "num_input_tokens_seen": 63817045, + "step": 2952, + "time_per_iteration": 2.7605957984924316 + }, + { + "auxiliary_loss_clip": 0.01162679, + "auxiliary_loss_mlp": 0.01044862, + "balance_loss_clip": 1.05858529, + "balance_loss_mlp": 1.02513337, + "epoch": 0.17754396512851345, + "flos": 25009986844800.0, + "grad_norm": 2.8185319653472116, + "language_loss": 0.79273909, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.81481451, + "num_input_tokens_seen": 63837665, + "step": 2953, + "time_per_iteration": 2.798912525177002 + }, + { + "auxiliary_loss_clip": 0.0112399, + "auxiliary_loss_mlp": 0.01043314, + "balance_loss_clip": 1.05482125, + "balance_loss_mlp": 1.02470589, + "epoch": 0.1776040883811814, + "flos": 32051566275840.0, + "grad_norm": 2.017710353628998, + "language_loss": 0.87963271, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.90130568, + "num_input_tokens_seen": 63858455, + "step": 2954, + "time_per_iteration": 2.838931083679199 + }, + { + "auxiliary_loss_clip": 0.01144028, + "auxiliary_loss_mlp": 0.01052958, + "balance_loss_clip": 1.06043494, + "balance_loss_mlp": 1.03296697, + "epoch": 0.17766421163384938, + "flos": 21574125348480.0, + "grad_norm": 1.9130853947826985, + "language_loss": 0.85313326, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.87510312, + "num_input_tokens_seen": 63876935, + "step": 2955, + "time_per_iteration": 2.7965714931488037 + }, + { + "auxiliary_loss_clip": 0.01127677, + "auxiliary_loss_mlp": 0.01047004, + "balance_loss_clip": 1.05093336, + "balance_loss_mlp": 1.02660692, + "epoch": 0.17772433488651737, + "flos": 22419319345920.0, + "grad_norm": 18.24238703278013, + "language_loss": 0.71152055, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.73326737, + "num_input_tokens_seen": 63896815, + "step": 2956, + "time_per_iteration": 2.8358442783355713 + }, + { + "auxiliary_loss_clip": 0.01150063, + "auxiliary_loss_mlp": 0.010506, + "balance_loss_clip": 1.05813813, + "balance_loss_mlp": 1.03156281, + "epoch": 0.17778445813918534, + "flos": 25629445820160.0, + "grad_norm": 2.981126112172262, + "language_loss": 0.82881534, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.85082197, + "num_input_tokens_seen": 63916140, + "step": 2957, + "time_per_iteration": 2.7034976482391357 + }, + { + "auxiliary_loss_clip": 0.01100452, + "auxiliary_loss_mlp": 0.01047239, + "balance_loss_clip": 1.04976833, + "balance_loss_mlp": 1.02789164, + "epoch": 0.1778445813918533, + "flos": 25628871202560.0, + "grad_norm": 2.7180995933425622, + "language_loss": 0.75164193, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.77311885, + "num_input_tokens_seen": 63935220, + "step": 2958, + "time_per_iteration": 2.8312718868255615 + }, + { + "auxiliary_loss_clip": 0.01146025, + "auxiliary_loss_mlp": 0.01043359, + "balance_loss_clip": 1.06117964, + "balance_loss_mlp": 1.02502513, + "epoch": 0.17790470464452127, + "flos": 22345522853760.0, + "grad_norm": 9.439636088267013, + "language_loss": 0.80363399, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.82552785, + "num_input_tokens_seen": 63954550, + "step": 2959, + "time_per_iteration": 2.722102642059326 + }, + { + "auxiliary_loss_clip": 0.01164621, + "auxiliary_loss_mlp": 0.01049069, + "balance_loss_clip": 1.05812871, + "balance_loss_mlp": 1.02938771, + "epoch": 0.17796482789718923, + "flos": 18765875214720.0, + "grad_norm": 2.62580469975692, + "language_loss": 0.51511085, + "learning_rate": 3.774698062689362e-06, + "loss": 0.53724772, + "num_input_tokens_seen": 63972425, + "step": 2960, + "time_per_iteration": 2.6222047805786133 + }, + { + "auxiliary_loss_clip": 0.01111843, + "auxiliary_loss_mlp": 0.01052801, + "balance_loss_clip": 1.05275989, + "balance_loss_mlp": 1.03228474, + "epoch": 0.1780249511498572, + "flos": 23440941970560.0, + "grad_norm": 1.7626913000215665, + "language_loss": 0.88908094, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.91072738, + "num_input_tokens_seen": 63992165, + "step": 2961, + "time_per_iteration": 2.8088786602020264 + }, + { + "auxiliary_loss_clip": 0.01116231, + "auxiliary_loss_mlp": 0.01054867, + "balance_loss_clip": 1.05181062, + "balance_loss_mlp": 1.03385067, + "epoch": 0.1780850744025252, + "flos": 23367468700800.0, + "grad_norm": 1.716412227369414, + "language_loss": 0.79170465, + "learning_rate": 3.774338767820631e-06, + "loss": 0.81341565, + "num_input_tokens_seen": 64013470, + "step": 2962, + "time_per_iteration": 2.7546913623809814 + }, + { + "auxiliary_loss_clip": 0.01145526, + "auxiliary_loss_mlp": 0.01052794, + "balance_loss_clip": 1.05649889, + "balance_loss_mlp": 1.03104997, + "epoch": 0.17814519765519315, + "flos": 13771994319360.0, + "grad_norm": 2.3241756501763446, + "language_loss": 0.74910223, + "learning_rate": 3.774159019458203e-06, + "loss": 0.77108544, + "num_input_tokens_seen": 64030975, + "step": 2963, + "time_per_iteration": 2.680356979370117 + }, + { + "auxiliary_loss_clip": 0.01140656, + "auxiliary_loss_mlp": 0.01043225, + "balance_loss_clip": 1.05769885, + "balance_loss_mlp": 1.02347231, + "epoch": 0.17820532090786112, + "flos": 21976396738560.0, + "grad_norm": 1.747536927551571, + "language_loss": 0.78837025, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.81020904, + "num_input_tokens_seen": 64050075, + "step": 2964, + "time_per_iteration": 2.748398780822754 + }, + { + "auxiliary_loss_clip": 0.01151685, + "auxiliary_loss_mlp": 0.00776982, + "balance_loss_clip": 1.05950594, + "balance_loss_mlp": 1.00098181, + "epoch": 0.17826544416052909, + "flos": 24790752184320.0, + "grad_norm": 3.046027397796258, + "language_loss": 0.81160808, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.83089471, + "num_input_tokens_seen": 64071920, + "step": 2965, + "time_per_iteration": 2.8090012073516846 + }, + { + "auxiliary_loss_clip": 0.01151658, + "auxiliary_loss_mlp": 0.01047086, + "balance_loss_clip": 1.06002402, + "balance_loss_mlp": 1.02916884, + "epoch": 0.17832556741319705, + "flos": 13879582531200.0, + "grad_norm": 2.554359630612449, + "language_loss": 0.95307338, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.97506082, + "num_input_tokens_seen": 64086835, + "step": 2966, + "time_per_iteration": 2.7159550189971924 + }, + { + "auxiliary_loss_clip": 0.01112928, + "auxiliary_loss_mlp": 0.00777395, + "balance_loss_clip": 1.05336046, + "balance_loss_mlp": 1.00083637, + "epoch": 0.17838569066586502, + "flos": 36641703323520.0, + "grad_norm": 7.5683867487642065, + "language_loss": 0.72833109, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.74723434, + "num_input_tokens_seen": 64107360, + "step": 2967, + "time_per_iteration": 2.9540669918060303 + }, + { + "auxiliary_loss_clip": 0.01129124, + "auxiliary_loss_mlp": 0.01046817, + "balance_loss_clip": 1.05574143, + "balance_loss_mlp": 1.02775562, + "epoch": 0.17844581391853298, + "flos": 18727271072640.0, + "grad_norm": 2.1617023205672523, + "language_loss": 0.76897681, + "learning_rate": 3.773259268638157e-06, + "loss": 0.7907362, + "num_input_tokens_seen": 64124690, + "step": 2968, + "time_per_iteration": 2.752717971801758 + }, + { + "auxiliary_loss_clip": 0.01085006, + "auxiliary_loss_mlp": 0.01044958, + "balance_loss_clip": 1.04640651, + "balance_loss_mlp": 1.02559829, + "epoch": 0.17850593717120097, + "flos": 27378259286400.0, + "grad_norm": 2.039560504387258, + "language_loss": 0.75839806, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.77969772, + "num_input_tokens_seen": 64146315, + "step": 2969, + "time_per_iteration": 2.9161994457244873 + }, + { + "auxiliary_loss_clip": 0.01075271, + "auxiliary_loss_mlp": 0.01013071, + "balance_loss_clip": 1.06177902, + "balance_loss_mlp": 1.00932813, + "epoch": 0.17856606042386894, + "flos": 66996025084800.0, + "grad_norm": 0.8520394227890811, + "language_loss": 0.69012916, + "learning_rate": 3.772898897567171e-06, + "loss": 0.7110126, + "num_input_tokens_seen": 64210875, + "step": 2970, + "time_per_iteration": 3.3269262313842773 + }, + { + "auxiliary_loss_clip": 0.011313, + "auxiliary_loss_mlp": 0.01044166, + "balance_loss_clip": 1.05561864, + "balance_loss_mlp": 1.02493763, + "epoch": 0.1786261836765369, + "flos": 36977001805440.0, + "grad_norm": 1.9951166568015506, + "language_loss": 0.67617297, + "learning_rate": 3.772718611185505e-06, + "loss": 0.69792765, + "num_input_tokens_seen": 64230740, + "step": 2971, + "time_per_iteration": 2.8691961765289307 + }, + { + "auxiliary_loss_clip": 0.01110831, + "auxiliary_loss_mlp": 0.01052779, + "balance_loss_clip": 1.05309939, + "balance_loss_mlp": 1.03266823, + "epoch": 0.17868630692920487, + "flos": 24825441744000.0, + "grad_norm": 1.5664358375440484, + "language_loss": 0.8971802, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.91881633, + "num_input_tokens_seen": 64252300, + "step": 2972, + "time_per_iteration": 2.893923759460449 + }, + { + "auxiliary_loss_clip": 0.01124705, + "auxiliary_loss_mlp": 0.01055871, + "balance_loss_clip": 1.05635929, + "balance_loss_mlp": 1.03466403, + "epoch": 0.17874643018187283, + "flos": 16981977139200.0, + "grad_norm": 2.4611679901229153, + "language_loss": 0.88593906, + "learning_rate": 3.77235783676401e-06, + "loss": 0.90774482, + "num_input_tokens_seen": 64270105, + "step": 2973, + "time_per_iteration": 2.7340333461761475 + }, + { + "auxiliary_loss_clip": 0.01164127, + "auxiliary_loss_mlp": 0.01047073, + "balance_loss_clip": 1.06285155, + "balance_loss_mlp": 1.0283215, + "epoch": 0.1788065534345408, + "flos": 21032233793280.0, + "grad_norm": 3.4039298885336557, + "language_loss": 0.7668556, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.78896761, + "num_input_tokens_seen": 64287250, + "step": 2974, + "time_per_iteration": 2.632495403289795 + }, + { + "auxiliary_loss_clip": 0.0114187, + "auxiliary_loss_mlp": 0.01053, + "balance_loss_clip": 1.06101942, + "balance_loss_mlp": 1.03390288, + "epoch": 0.17886667668720876, + "flos": 23987717775360.0, + "grad_norm": 2.484949778027245, + "language_loss": 0.74701655, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.76896524, + "num_input_tokens_seen": 64307140, + "step": 2975, + "time_per_iteration": 2.704012870788574 + }, + { + "auxiliary_loss_clip": 0.01149026, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_clip": 1.05678535, + "balance_loss_mlp": 1.03004813, + "epoch": 0.17892679993987676, + "flos": 25739476156800.0, + "grad_norm": 1.518747487377626, + "language_loss": 0.73032069, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.75229883, + "num_input_tokens_seen": 64328760, + "step": 2976, + "time_per_iteration": 2.7357017993927 + }, + { + "auxiliary_loss_clip": 0.01150398, + "auxiliary_loss_mlp": 0.01038685, + "balance_loss_clip": 1.06239033, + "balance_loss_mlp": 1.0229373, + "epoch": 0.17898692319254472, + "flos": 25699686865920.0, + "grad_norm": 1.4579507247258654, + "language_loss": 0.770594, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.79248488, + "num_input_tokens_seen": 64348800, + "step": 2977, + "time_per_iteration": 2.727318286895752 + }, + { + "auxiliary_loss_clip": 0.01131521, + "auxiliary_loss_mlp": 0.01045834, + "balance_loss_clip": 1.06618452, + "balance_loss_mlp": 1.02841735, + "epoch": 0.1790470464452127, + "flos": 19317786664320.0, + "grad_norm": 2.7286854986191282, + "language_loss": 0.80235189, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.82412547, + "num_input_tokens_seen": 64367955, + "step": 2978, + "time_per_iteration": 2.8178791999816895 + }, + { + "auxiliary_loss_clip": 0.0114307, + "auxiliary_loss_mlp": 0.01052978, + "balance_loss_clip": 1.05818772, + "balance_loss_mlp": 1.03330874, + "epoch": 0.17910716969788065, + "flos": 30044267562240.0, + "grad_norm": 1.4967765935497133, + "language_loss": 0.76192784, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.7838884, + "num_input_tokens_seen": 64389805, + "step": 2979, + "time_per_iteration": 4.241487741470337 + }, + { + "auxiliary_loss_clip": 0.01122958, + "auxiliary_loss_mlp": 0.0104457, + "balance_loss_clip": 1.0590893, + "balance_loss_mlp": 1.02660525, + "epoch": 0.17916729295054862, + "flos": 19427709260160.0, + "grad_norm": 1.9491816848203256, + "language_loss": 0.68945503, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.71113026, + "num_input_tokens_seen": 64408220, + "step": 2980, + "time_per_iteration": 2.6817352771759033 + }, + { + "auxiliary_loss_clip": 0.01152986, + "auxiliary_loss_mlp": 0.0104519, + "balance_loss_clip": 1.0588038, + "balance_loss_mlp": 1.02497244, + "epoch": 0.17922741620321658, + "flos": 14611549881600.0, + "grad_norm": 1.9134992191513662, + "language_loss": 0.70793843, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.72992027, + "num_input_tokens_seen": 64426380, + "step": 2981, + "time_per_iteration": 4.310532331466675 + }, + { + "auxiliary_loss_clip": 0.01137747, + "auxiliary_loss_mlp": 0.01056086, + "balance_loss_clip": 1.06083858, + "balance_loss_mlp": 1.03686976, + "epoch": 0.17928753945588458, + "flos": 17165301177600.0, + "grad_norm": 2.529665562311581, + "language_loss": 0.8190546, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.84099293, + "num_input_tokens_seen": 64444355, + "step": 2982, + "time_per_iteration": 2.710726261138916 + }, + { + "auxiliary_loss_clip": 0.01162978, + "auxiliary_loss_mlp": 0.01041014, + "balance_loss_clip": 1.06181359, + "balance_loss_mlp": 1.02306128, + "epoch": 0.17934766270855254, + "flos": 31395622060800.0, + "grad_norm": 1.6440716861921114, + "language_loss": 0.83123535, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.85327524, + "num_input_tokens_seen": 64467800, + "step": 2983, + "time_per_iteration": 2.700378656387329 + }, + { + "auxiliary_loss_clip": 0.01153001, + "auxiliary_loss_mlp": 0.01048341, + "balance_loss_clip": 1.05694914, + "balance_loss_mlp": 1.02932739, + "epoch": 0.1794077859612205, + "flos": 20814184281600.0, + "grad_norm": 1.6703280507743268, + "language_loss": 0.85149562, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.87350899, + "num_input_tokens_seen": 64487230, + "step": 2984, + "time_per_iteration": 2.6529407501220703 + }, + { + "auxiliary_loss_clip": 0.01126981, + "auxiliary_loss_mlp": 0.01043442, + "balance_loss_clip": 1.05520201, + "balance_loss_mlp": 1.02424896, + "epoch": 0.17946790921388847, + "flos": 28986447006720.0, + "grad_norm": 2.4609160562432053, + "language_loss": 0.8935222, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.9152264, + "num_input_tokens_seen": 64509165, + "step": 2985, + "time_per_iteration": 4.528426170349121 + }, + { + "auxiliary_loss_clip": 0.01160091, + "auxiliary_loss_mlp": 0.01040749, + "balance_loss_clip": 1.06142831, + "balance_loss_mlp": 1.02434587, + "epoch": 0.17952803246655644, + "flos": 20737406960640.0, + "grad_norm": 2.095497349072142, + "language_loss": 0.69538593, + "learning_rate": 3.770006252694922e-06, + "loss": 0.71739429, + "num_input_tokens_seen": 64527940, + "step": 2986, + "time_per_iteration": 2.6890172958374023 + }, + { + "auxiliary_loss_clip": 0.01158556, + "auxiliary_loss_mlp": 0.00776, + "balance_loss_clip": 1.05752599, + "balance_loss_mlp": 1.00081134, + "epoch": 0.1795881557192244, + "flos": 28255988027520.0, + "grad_norm": 2.4599229747435123, + "language_loss": 0.77855188, + "learning_rate": 3.769824891588688e-06, + "loss": 0.79789746, + "num_input_tokens_seen": 64545230, + "step": 2987, + "time_per_iteration": 2.650761842727661 + }, + { + "auxiliary_loss_clip": 0.0116216, + "auxiliary_loss_mlp": 0.01043775, + "balance_loss_clip": 1.05775642, + "balance_loss_mlp": 1.02441502, + "epoch": 0.17964827897189237, + "flos": 18552027594240.0, + "grad_norm": 2.0190394876224467, + "language_loss": 0.77958816, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.80164748, + "num_input_tokens_seen": 64563820, + "step": 2988, + "time_per_iteration": 2.6151437759399414 + }, + { + "auxiliary_loss_clip": 0.01059513, + "auxiliary_loss_mlp": 0.00756906, + "balance_loss_clip": 1.07071137, + "balance_loss_mlp": 1.00131369, + "epoch": 0.17970840222456036, + "flos": 58165088711040.0, + "grad_norm": 0.7650122273387262, + "language_loss": 0.62709254, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.64525676, + "num_input_tokens_seen": 64621315, + "step": 2989, + "time_per_iteration": 3.1990275382995605 + }, + { + "auxiliary_loss_clip": 0.01137168, + "auxiliary_loss_mlp": 0.01038826, + "balance_loss_clip": 1.05553865, + "balance_loss_mlp": 1.02128983, + "epoch": 0.17976852547722832, + "flos": 20300805146880.0, + "grad_norm": 2.3566032567209483, + "language_loss": 0.71070904, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.73246896, + "num_input_tokens_seen": 64639885, + "step": 2990, + "time_per_iteration": 2.7275335788726807 + }, + { + "auxiliary_loss_clip": 0.01135847, + "auxiliary_loss_mlp": 0.01044966, + "balance_loss_clip": 1.05398035, + "balance_loss_mlp": 1.02639365, + "epoch": 0.1798286487298963, + "flos": 39669367685760.0, + "grad_norm": 1.8035266350414116, + "language_loss": 0.68888462, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.7106927, + "num_input_tokens_seen": 64661220, + "step": 2991, + "time_per_iteration": 2.8237311840057373 + }, + { + "auxiliary_loss_clip": 0.01104375, + "auxiliary_loss_mlp": 0.01046061, + "balance_loss_clip": 1.05156851, + "balance_loss_mlp": 1.02663028, + "epoch": 0.17988877198256426, + "flos": 25520313323520.0, + "grad_norm": 1.6063564491400402, + "language_loss": 0.82933879, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.85084313, + "num_input_tokens_seen": 64682530, + "step": 2992, + "time_per_iteration": 2.8303778171539307 + }, + { + "auxiliary_loss_clip": 0.01140805, + "auxiliary_loss_mlp": 0.01035603, + "balance_loss_clip": 1.05302262, + "balance_loss_mlp": 1.0187583, + "epoch": 0.17994889523523222, + "flos": 18807496099200.0, + "grad_norm": 2.076285453641059, + "language_loss": 0.82228035, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.84404445, + "num_input_tokens_seen": 64701025, + "step": 2993, + "time_per_iteration": 2.710369110107422 + }, + { + "auxiliary_loss_clip": 0.01135151, + "auxiliary_loss_mlp": 0.01040493, + "balance_loss_clip": 1.05135202, + "balance_loss_mlp": 1.02236176, + "epoch": 0.18000901848790019, + "flos": 21104450087040.0, + "grad_norm": 1.7027458997386926, + "language_loss": 0.78129464, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.80305111, + "num_input_tokens_seen": 64719570, + "step": 2994, + "time_per_iteration": 2.6666738986968994 + }, + { + "auxiliary_loss_clip": 0.01158877, + "auxiliary_loss_mlp": 0.01045455, + "balance_loss_clip": 1.05657315, + "balance_loss_mlp": 1.02819359, + "epoch": 0.18006914174056818, + "flos": 19646441130240.0, + "grad_norm": 2.4198973911698434, + "language_loss": 0.81139499, + "learning_rate": 3.768371587287296e-06, + "loss": 0.83343828, + "num_input_tokens_seen": 64738110, + "step": 2995, + "time_per_iteration": 2.699521541595459 + }, + { + "auxiliary_loss_clip": 0.01142902, + "auxiliary_loss_mlp": 0.01047606, + "balance_loss_clip": 1.05350447, + "balance_loss_mlp": 1.0310601, + "epoch": 0.18012926499323614, + "flos": 19499889640320.0, + "grad_norm": 1.8607496799697536, + "language_loss": 0.84162772, + "learning_rate": 3.768189622421512e-06, + "loss": 0.86353278, + "num_input_tokens_seen": 64756345, + "step": 2996, + "time_per_iteration": 2.696723461151123 + }, + { + "auxiliary_loss_clip": 0.01127214, + "auxiliary_loss_mlp": 0.01039953, + "balance_loss_clip": 1.06094205, + "balance_loss_mlp": 1.02273917, + "epoch": 0.1801893882459041, + "flos": 19464553635840.0, + "grad_norm": 2.1291201116421283, + "language_loss": 0.88189137, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.90356302, + "num_input_tokens_seen": 64776375, + "step": 2997, + "time_per_iteration": 2.785522699356079 + }, + { + "auxiliary_loss_clip": 0.01134376, + "auxiliary_loss_mlp": 0.01045962, + "balance_loss_clip": 1.04949927, + "balance_loss_mlp": 1.02753246, + "epoch": 0.18024951149857207, + "flos": 26870590414080.0, + "grad_norm": 1.7579499924576911, + "language_loss": 0.85068727, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.87249064, + "num_input_tokens_seen": 64796210, + "step": 2998, + "time_per_iteration": 2.6912384033203125 + }, + { + "auxiliary_loss_clip": 0.01159537, + "auxiliary_loss_mlp": 0.01044427, + "balance_loss_clip": 1.06019807, + "balance_loss_mlp": 1.02641416, + "epoch": 0.18030963475124004, + "flos": 30226621933440.0, + "grad_norm": 1.8075624565441775, + "language_loss": 0.84176779, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.86380744, + "num_input_tokens_seen": 64818590, + "step": 2999, + "time_per_iteration": 2.722447395324707 + }, + { + "auxiliary_loss_clip": 0.01143605, + "auxiliary_loss_mlp": 0.01047321, + "balance_loss_clip": 1.05324686, + "balance_loss_mlp": 1.02870023, + "epoch": 0.180369758003908, + "flos": 22307493329280.0, + "grad_norm": 1.8789697336390492, + "language_loss": 0.75206578, + "learning_rate": 3.76746109252814e-06, + "loss": 0.77397501, + "num_input_tokens_seen": 64838350, + "step": 3000, + "time_per_iteration": 2.669875144958496 + }, + { + "auxiliary_loss_clip": 0.01130052, + "auxiliary_loss_mlp": 0.00775745, + "balance_loss_clip": 1.0526886, + "balance_loss_mlp": 1.00060582, + "epoch": 0.18042988125657597, + "flos": 23732033788800.0, + "grad_norm": 2.1714361871851704, + "language_loss": 0.71088028, + "learning_rate": 3.76727879248177e-06, + "loss": 0.72993821, + "num_input_tokens_seen": 64858065, + "step": 3001, + "time_per_iteration": 2.7207603454589844 + }, + { + "auxiliary_loss_clip": 0.01150091, + "auxiliary_loss_mlp": 0.01044695, + "balance_loss_clip": 1.05701649, + "balance_loss_mlp": 1.02605033, + "epoch": 0.18049000450924396, + "flos": 24093582134400.0, + "grad_norm": 2.218812983953599, + "language_loss": 0.8849982, + "learning_rate": 3.767096425420011e-06, + "loss": 0.90694606, + "num_input_tokens_seen": 64877305, + "step": 3002, + "time_per_iteration": 2.6577625274658203 + }, + { + "auxiliary_loss_clip": 0.01157827, + "auxiliary_loss_mlp": 0.01048268, + "balance_loss_clip": 1.05624068, + "balance_loss_mlp": 1.03076851, + "epoch": 0.18055012776191193, + "flos": 22163168482560.0, + "grad_norm": 1.6287780165264572, + "language_loss": 0.80328667, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.8253476, + "num_input_tokens_seen": 64896955, + "step": 3003, + "time_per_iteration": 2.6274783611297607 + }, + { + "auxiliary_loss_clip": 0.01158367, + "auxiliary_loss_mlp": 0.01043654, + "balance_loss_clip": 1.05622995, + "balance_loss_mlp": 1.02596307, + "epoch": 0.1806102510145799, + "flos": 28913512440960.0, + "grad_norm": 2.3308952017896956, + "language_loss": 0.67250973, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.69452989, + "num_input_tokens_seen": 64917080, + "step": 3004, + "time_per_iteration": 2.6652631759643555 + }, + { + "auxiliary_loss_clip": 0.01147517, + "auxiliary_loss_mlp": 0.01054518, + "balance_loss_clip": 1.05606318, + "balance_loss_mlp": 1.03528929, + "epoch": 0.18067037426724786, + "flos": 19025689265280.0, + "grad_norm": 2.592432277036083, + "language_loss": 0.85111535, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.87313569, + "num_input_tokens_seen": 64935215, + "step": 3005, + "time_per_iteration": 2.654977560043335 + }, + { + "auxiliary_loss_clip": 0.0114499, + "auxiliary_loss_mlp": 0.01041993, + "balance_loss_clip": 1.05690646, + "balance_loss_mlp": 1.02489829, + "epoch": 0.18073049751991582, + "flos": 27453635976960.0, + "grad_norm": 1.5217876402754629, + "language_loss": 0.83215338, + "learning_rate": 3.766366287157432e-06, + "loss": 0.85402322, + "num_input_tokens_seen": 64956275, + "step": 3006, + "time_per_iteration": 2.7118306159973145 + }, + { + "auxiliary_loss_clip": 0.01127168, + "auxiliary_loss_mlp": 0.01050084, + "balance_loss_clip": 1.05063033, + "balance_loss_mlp": 1.03105807, + "epoch": 0.1807906207725838, + "flos": 28729039167360.0, + "grad_norm": 1.6327495611050657, + "language_loss": 0.77377248, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.79554498, + "num_input_tokens_seen": 64979390, + "step": 3007, + "time_per_iteration": 2.7996537685394287 + }, + { + "auxiliary_loss_clip": 0.01070026, + "auxiliary_loss_mlp": 0.01030441, + "balance_loss_clip": 1.04936945, + "balance_loss_mlp": 1.02712655, + "epoch": 0.18085074402525175, + "flos": 64466515468800.0, + "grad_norm": 0.801982400183398, + "language_loss": 0.56987137, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.5908761, + "num_input_tokens_seen": 65043135, + "step": 3008, + "time_per_iteration": 3.4269092082977295 + }, + { + "auxiliary_loss_clip": 0.01130838, + "auxiliary_loss_mlp": 0.01047085, + "balance_loss_clip": 1.05308366, + "balance_loss_mlp": 1.02686691, + "epoch": 0.18091086727791975, + "flos": 23476960333440.0, + "grad_norm": 1.8424126412451678, + "language_loss": 0.67248082, + "learning_rate": 3.765817980138021e-06, + "loss": 0.69426012, + "num_input_tokens_seen": 65062845, + "step": 3009, + "time_per_iteration": 2.7875866889953613 + }, + { + "auxiliary_loss_clip": 0.01161719, + "auxiliary_loss_mlp": 0.01044187, + "balance_loss_clip": 1.0595516, + "balance_loss_mlp": 1.02673507, + "epoch": 0.1809709905305877, + "flos": 24170467196160.0, + "grad_norm": 2.4429360498363986, + "language_loss": 0.75690198, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.778961, + "num_input_tokens_seen": 65082110, + "step": 3010, + "time_per_iteration": 2.6060268878936768 + }, + { + "auxiliary_loss_clip": 0.01127916, + "auxiliary_loss_mlp": 0.01037817, + "balance_loss_clip": 1.05715132, + "balance_loss_mlp": 1.02063942, + "epoch": 0.18103111378325568, + "flos": 21650902669440.0, + "grad_norm": 1.6324915654296899, + "language_loss": 0.67356348, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.69522083, + "num_input_tokens_seen": 65101985, + "step": 3011, + "time_per_iteration": 2.763596534729004 + }, + { + "auxiliary_loss_clip": 0.01105034, + "auxiliary_loss_mlp": 0.00777475, + "balance_loss_clip": 1.04540467, + "balance_loss_mlp": 1.00078559, + "epoch": 0.18109123703592364, + "flos": 53686918356480.0, + "grad_norm": 1.551526807882757, + "language_loss": 0.71288514, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.73171026, + "num_input_tokens_seen": 65129295, + "step": 3012, + "time_per_iteration": 3.037775993347168 + }, + { + "auxiliary_loss_clip": 0.01132189, + "auxiliary_loss_mlp": 0.01052085, + "balance_loss_clip": 1.05564284, + "balance_loss_mlp": 1.03348863, + "epoch": 0.1811513602885916, + "flos": 35845564325760.0, + "grad_norm": 2.095737131475866, + "language_loss": 0.62309992, + "learning_rate": 3.765085966704609e-06, + "loss": 0.64494264, + "num_input_tokens_seen": 65150625, + "step": 3013, + "time_per_iteration": 2.7692227363586426 + }, + { + "auxiliary_loss_clip": 0.01131323, + "auxiliary_loss_mlp": 0.0105253, + "balance_loss_clip": 1.05343401, + "balance_loss_mlp": 1.03486276, + "epoch": 0.18121148354125957, + "flos": 23732572492800.0, + "grad_norm": 1.6679267545988328, + "language_loss": 0.76147234, + "learning_rate": 3.764902795998309e-06, + "loss": 0.78331089, + "num_input_tokens_seen": 65170880, + "step": 3014, + "time_per_iteration": 2.7296786308288574 + }, + { + "auxiliary_loss_clip": 0.01163543, + "auxiliary_loss_mlp": 0.01050053, + "balance_loss_clip": 1.05964816, + "balance_loss_mlp": 1.02987087, + "epoch": 0.18127160679392756, + "flos": 28728320895360.0, + "grad_norm": 2.1234423596691796, + "language_loss": 0.66310829, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.6852442, + "num_input_tokens_seen": 65192530, + "step": 3015, + "time_per_iteration": 2.7575571537017822 + }, + { + "auxiliary_loss_clip": 0.0113004, + "auxiliary_loss_mlp": 0.00776613, + "balance_loss_clip": 1.05429327, + "balance_loss_mlp": 1.00067461, + "epoch": 0.18133173004659553, + "flos": 20485062938880.0, + "grad_norm": 1.7837261279259933, + "language_loss": 0.78152305, + "learning_rate": 3.764536253816785e-06, + "loss": 0.80058956, + "num_input_tokens_seen": 65211675, + "step": 3016, + "time_per_iteration": 2.6718828678131104 + }, + { + "auxiliary_loss_clip": 0.01145073, + "auxiliary_loss_mlp": 0.01049504, + "balance_loss_clip": 1.05684161, + "balance_loss_mlp": 1.03068125, + "epoch": 0.1813918532992635, + "flos": 22852078404480.0, + "grad_norm": 1.7248072345223011, + "language_loss": 0.8351965, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.85714233, + "num_input_tokens_seen": 65231185, + "step": 3017, + "time_per_iteration": 2.6879045963287354 + }, + { + "auxiliary_loss_clip": 0.0114091, + "auxiliary_loss_mlp": 0.01042994, + "balance_loss_clip": 1.05404854, + "balance_loss_mlp": 1.02539897, + "epoch": 0.18145197655193146, + "flos": 36065122208640.0, + "grad_norm": 2.2664795482488924, + "language_loss": 0.6769017, + "learning_rate": 3.764169443989697e-06, + "loss": 0.69874066, + "num_input_tokens_seen": 65251645, + "step": 3018, + "time_per_iteration": 4.31333327293396 + }, + { + "auxiliary_loss_clip": 0.01147629, + "auxiliary_loss_mlp": 0.00776661, + "balance_loss_clip": 1.05706179, + "balance_loss_mlp": 1.00074184, + "epoch": 0.18151209980459942, + "flos": 24023951619840.0, + "grad_norm": 1.8935259017451227, + "language_loss": 0.76396847, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.78321135, + "num_input_tokens_seen": 65271125, + "step": 3019, + "time_per_iteration": 2.7667160034179688 + }, + { + "auxiliary_loss_clip": 0.01121465, + "auxiliary_loss_mlp": 0.01046742, + "balance_loss_clip": 1.05550635, + "balance_loss_mlp": 1.02722728, + "epoch": 0.1815722230572674, + "flos": 23951627585280.0, + "grad_norm": 2.042490471678265, + "language_loss": 0.81550395, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.83718598, + "num_input_tokens_seen": 65290600, + "step": 3020, + "time_per_iteration": 4.3900346755981445 + }, + { + "auxiliary_loss_clip": 0.01136424, + "auxiliary_loss_mlp": 0.01046217, + "balance_loss_clip": 1.05758023, + "balance_loss_mlp": 1.02567708, + "epoch": 0.18163234630993536, + "flos": 24386469632640.0, + "grad_norm": 1.9628186536024828, + "language_loss": 0.7757082, + "learning_rate": 3.763618727535352e-06, + "loss": 0.79753458, + "num_input_tokens_seen": 65311040, + "step": 3021, + "time_per_iteration": 4.3029396533966064 + }, + { + "auxiliary_loss_clip": 0.01143245, + "auxiliary_loss_mlp": 0.01047278, + "balance_loss_clip": 1.05453348, + "balance_loss_mlp": 1.02907431, + "epoch": 0.18169246956260335, + "flos": 24681332378880.0, + "grad_norm": 1.725306643191844, + "language_loss": 0.84863859, + "learning_rate": 3.763435021621422e-06, + "loss": 0.87054378, + "num_input_tokens_seen": 65332115, + "step": 3022, + "time_per_iteration": 2.7353312969207764 + }, + { + "auxiliary_loss_clip": 0.01132435, + "auxiliary_loss_mlp": 0.01042747, + "balance_loss_clip": 1.05769348, + "balance_loss_mlp": 1.0235188, + "epoch": 0.1817525928152713, + "flos": 24243294021120.0, + "grad_norm": 2.230341519134859, + "language_loss": 0.69367266, + "learning_rate": 3.763251248837859e-06, + "loss": 0.71542448, + "num_input_tokens_seen": 65352210, + "step": 3023, + "time_per_iteration": 2.775200605392456 + }, + { + "auxiliary_loss_clip": 0.01127605, + "auxiliary_loss_mlp": 0.01043947, + "balance_loss_clip": 1.04900002, + "balance_loss_mlp": 1.02556491, + "epoch": 0.18181271606793928, + "flos": 16472081623680.0, + "grad_norm": 2.150764188548567, + "language_loss": 0.74107385, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.76278937, + "num_input_tokens_seen": 65370600, + "step": 3024, + "time_per_iteration": 2.7364041805267334 + }, + { + "auxiliary_loss_clip": 0.01145205, + "auxiliary_loss_mlp": 0.01046837, + "balance_loss_clip": 1.05719447, + "balance_loss_mlp": 1.02900314, + "epoch": 0.18187283932060724, + "flos": 18581042805120.0, + "grad_norm": 2.148591016046099, + "language_loss": 0.8835662, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.90548658, + "num_input_tokens_seen": 65387270, + "step": 3025, + "time_per_iteration": 4.274658679962158 + }, + { + "auxiliary_loss_clip": 0.01133667, + "auxiliary_loss_mlp": 0.01050575, + "balance_loss_clip": 1.05470932, + "balance_loss_mlp": 1.03137028, + "epoch": 0.1819329625732752, + "flos": 20266833859200.0, + "grad_norm": 3.6399614210311206, + "language_loss": 0.79041791, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.81226033, + "num_input_tokens_seen": 65406550, + "step": 3026, + "time_per_iteration": 2.7589778900146484 + }, + { + "auxiliary_loss_clip": 0.01132736, + "auxiliary_loss_mlp": 0.01055367, + "balance_loss_clip": 1.05774415, + "balance_loss_mlp": 1.03679442, + "epoch": 0.18199308582594317, + "flos": 25915186512000.0, + "grad_norm": 1.6980721374313217, + "language_loss": 0.759978, + "learning_rate": 3.762515489146692e-06, + "loss": 0.78185904, + "num_input_tokens_seen": 65425955, + "step": 3027, + "time_per_iteration": 2.7347826957702637 + }, + { + "auxiliary_loss_clip": 0.01163558, + "auxiliary_loss_mlp": 0.01053369, + "balance_loss_clip": 1.05835891, + "balance_loss_mlp": 1.03378284, + "epoch": 0.18205320907861114, + "flos": 15377524433280.0, + "grad_norm": 2.2893837743041368, + "language_loss": 0.85592651, + "learning_rate": 3.762331382119546e-06, + "loss": 0.87809575, + "num_input_tokens_seen": 65442820, + "step": 3028, + "time_per_iteration": 2.598905563354492 + }, + { + "auxiliary_loss_clip": 0.01156921, + "auxiliary_loss_mlp": 0.0104449, + "balance_loss_clip": 1.0578618, + "balance_loss_mlp": 1.0260129, + "epoch": 0.18211333233127913, + "flos": 25624310175360.0, + "grad_norm": 1.8897570500397638, + "language_loss": 0.82807779, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.85009193, + "num_input_tokens_seen": 65461825, + "step": 3029, + "time_per_iteration": 2.677332639694214 + }, + { + "auxiliary_loss_clip": 0.01114993, + "auxiliary_loss_mlp": 0.01050232, + "balance_loss_clip": 1.05223596, + "balance_loss_mlp": 1.02931094, + "epoch": 0.1821734555839471, + "flos": 14976007228800.0, + "grad_norm": 10.840079090220346, + "language_loss": 0.78091359, + "learning_rate": 3.761962967588891e-06, + "loss": 0.80256593, + "num_input_tokens_seen": 65479480, + "step": 3030, + "time_per_iteration": 2.6865499019622803 + }, + { + "auxiliary_loss_clip": 0.01139676, + "auxiliary_loss_mlp": 0.01043273, + "balance_loss_clip": 1.05401075, + "balance_loss_mlp": 1.0240562, + "epoch": 0.18223357883661506, + "flos": 20194007034240.0, + "grad_norm": 2.05958060196279, + "language_loss": 0.85162055, + "learning_rate": 3.761778660099352e-06, + "loss": 0.87345004, + "num_input_tokens_seen": 65497775, + "step": 3031, + "time_per_iteration": 2.6336488723754883 + }, + { + "auxiliary_loss_clip": 0.01116657, + "auxiliary_loss_mlp": 0.00776186, + "balance_loss_clip": 1.0497843, + "balance_loss_mlp": 1.00052071, + "epoch": 0.18229370208928303, + "flos": 15231978524160.0, + "grad_norm": 1.83501853384953, + "language_loss": 0.79992211, + "learning_rate": 3.76159428580299e-06, + "loss": 0.81885058, + "num_input_tokens_seen": 65516505, + "step": 3032, + "time_per_iteration": 2.6879780292510986 + }, + { + "auxiliary_loss_clip": 0.01166412, + "auxiliary_loss_mlp": 0.01048902, + "balance_loss_clip": 1.06163025, + "balance_loss_mlp": 1.03038836, + "epoch": 0.182353825341951, + "flos": 23840483927040.0, + "grad_norm": 1.8132660189598853, + "language_loss": 0.81316388, + "learning_rate": 3.761409844706795e-06, + "loss": 0.83531702, + "num_input_tokens_seen": 65536160, + "step": 3033, + "time_per_iteration": 2.628100872039795 + }, + { + "auxiliary_loss_clip": 0.01048591, + "auxiliary_loss_mlp": 0.0100128, + "balance_loss_clip": 1.05392861, + "balance_loss_mlp": 0.99850291, + "epoch": 0.18241394859461896, + "flos": 61190957393280.0, + "grad_norm": 0.8825814513625035, + "language_loss": 0.63439631, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.65489495, + "num_input_tokens_seen": 65589375, + "step": 3034, + "time_per_iteration": 3.2329187393188477 + }, + { + "auxiliary_loss_clip": 0.0112853, + "auxiliary_loss_mlp": 0.01041043, + "balance_loss_clip": 1.05698252, + "balance_loss_mlp": 1.02384114, + "epoch": 0.18247407184728695, + "flos": 18471694826880.0, + "grad_norm": 3.107937736318082, + "language_loss": 0.79893476, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.82063049, + "num_input_tokens_seen": 65606720, + "step": 3035, + "time_per_iteration": 2.7644357681274414 + }, + { + "auxiliary_loss_clip": 0.01134115, + "auxiliary_loss_mlp": 0.01046396, + "balance_loss_clip": 1.05675578, + "balance_loss_mlp": 1.02906322, + "epoch": 0.18253419509995492, + "flos": 21795191602560.0, + "grad_norm": 1.870086430131469, + "language_loss": 0.85076666, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.87257177, + "num_input_tokens_seen": 65625495, + "step": 3036, + "time_per_iteration": 2.7102303504943848 + }, + { + "auxiliary_loss_clip": 0.01140083, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.05572963, + "balance_loss_mlp": 1.02192414, + "epoch": 0.18259431835262288, + "flos": 20149764456960.0, + "grad_norm": 2.1821496235124727, + "language_loss": 0.80254716, + "learning_rate": 3.760671412463617e-06, + "loss": 0.82433879, + "num_input_tokens_seen": 65643515, + "step": 3037, + "time_per_iteration": 2.6703832149505615 + }, + { + "auxiliary_loss_clip": 0.01139652, + "auxiliary_loss_mlp": 0.00776941, + "balance_loss_clip": 1.05986989, + "balance_loss_mlp": 1.00062871, + "epoch": 0.18265444160529085, + "flos": 16981653916800.0, + "grad_norm": 3.0764011293768023, + "language_loss": 0.7950514, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.81421733, + "num_input_tokens_seen": 65658155, + "step": 3038, + "time_per_iteration": 2.7410895824432373 + }, + { + "auxiliary_loss_clip": 0.01125628, + "auxiliary_loss_mlp": 0.01044597, + "balance_loss_clip": 1.05254972, + "balance_loss_mlp": 1.02551126, + "epoch": 0.1827145648579588, + "flos": 34423250509440.0, + "grad_norm": 1.9524772610579864, + "language_loss": 0.67722493, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.69892722, + "num_input_tokens_seen": 65679310, + "step": 3039, + "time_per_iteration": 2.756833076477051 + }, + { + "auxiliary_loss_clip": 0.0113051, + "auxiliary_loss_mlp": 0.01051065, + "balance_loss_clip": 1.053087, + "balance_loss_mlp": 1.03304029, + "epoch": 0.18277468811062678, + "flos": 53287017264000.0, + "grad_norm": 1.8757227718998248, + "language_loss": 0.73394251, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.75575823, + "num_input_tokens_seen": 65705235, + "step": 3040, + "time_per_iteration": 3.026679039001465 + }, + { + "auxiliary_loss_clip": 0.01143558, + "auxiliary_loss_mlp": 0.01042261, + "balance_loss_clip": 1.05585194, + "balance_loss_mlp": 1.02373624, + "epoch": 0.18283481136329474, + "flos": 31650659602560.0, + "grad_norm": 2.017308993436446, + "language_loss": 0.60348576, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.62534392, + "num_input_tokens_seen": 65727575, + "step": 3041, + "time_per_iteration": 2.738554000854492 + }, + { + "auxiliary_loss_clip": 0.01116972, + "auxiliary_loss_mlp": 0.01053827, + "balance_loss_clip": 1.05058599, + "balance_loss_mlp": 1.03544497, + "epoch": 0.18289493461596273, + "flos": 53137664513280.0, + "grad_norm": 2.3558133433802104, + "language_loss": 0.59825706, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.61996508, + "num_input_tokens_seen": 65751370, + "step": 3042, + "time_per_iteration": 3.0009193420410156 + }, + { + "auxiliary_loss_clip": 0.0112422, + "auxiliary_loss_mlp": 0.01046569, + "balance_loss_clip": 1.05319464, + "balance_loss_mlp": 1.02917695, + "epoch": 0.1829550578686307, + "flos": 25589369220480.0, + "grad_norm": 1.5313119565207096, + "language_loss": 0.8757726, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.89748049, + "num_input_tokens_seen": 65771040, + "step": 3043, + "time_per_iteration": 2.7406487464904785 + }, + { + "auxiliary_loss_clip": 0.01056788, + "auxiliary_loss_mlp": 0.01056357, + "balance_loss_clip": 1.04592645, + "balance_loss_mlp": 1.03712869, + "epoch": 0.18301518112129866, + "flos": 22601422321920.0, + "grad_norm": 2.144378235575635, + "language_loss": 0.70980251, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.73093396, + "num_input_tokens_seen": 65789345, + "step": 3044, + "time_per_iteration": 2.785931348800659 + }, + { + "auxiliary_loss_clip": 0.01105073, + "auxiliary_loss_mlp": 0.01059118, + "balance_loss_clip": 1.05111921, + "balance_loss_mlp": 1.0381608, + "epoch": 0.18307530437396663, + "flos": 34020799551360.0, + "grad_norm": 3.097061979225562, + "language_loss": 0.64460731, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.66624922, + "num_input_tokens_seen": 65810990, + "step": 3045, + "time_per_iteration": 2.8085720539093018 + }, + { + "auxiliary_loss_clip": 0.01155246, + "auxiliary_loss_mlp": 0.01044973, + "balance_loss_clip": 1.05604315, + "balance_loss_mlp": 1.02780676, + "epoch": 0.1831354276266346, + "flos": 21279765392640.0, + "grad_norm": 11.455833434854163, + "language_loss": 0.78461385, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.80661607, + "num_input_tokens_seen": 65827230, + "step": 3046, + "time_per_iteration": 2.603299140930176 + }, + { + "auxiliary_loss_clip": 0.01118725, + "auxiliary_loss_mlp": 0.01042864, + "balance_loss_clip": 1.04837, + "balance_loss_mlp": 1.0240643, + "epoch": 0.18319555087930256, + "flos": 21032952065280.0, + "grad_norm": 1.9889932097770582, + "language_loss": 0.78733194, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.8089478, + "num_input_tokens_seen": 65845900, + "step": 3047, + "time_per_iteration": 2.7109453678131104 + }, + { + "auxiliary_loss_clip": 0.01144516, + "auxiliary_loss_mlp": 0.01042422, + "balance_loss_clip": 1.05723858, + "balance_loss_mlp": 1.0254705, + "epoch": 0.18325567413197055, + "flos": 34382958428160.0, + "grad_norm": 1.5191744259185578, + "language_loss": 0.80704039, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.82890975, + "num_input_tokens_seen": 65868730, + "step": 3048, + "time_per_iteration": 2.7485053539276123 + }, + { + "auxiliary_loss_clip": 0.01139433, + "auxiliary_loss_mlp": 0.01046004, + "balance_loss_clip": 1.05405188, + "balance_loss_mlp": 1.02552414, + "epoch": 0.18331579738463852, + "flos": 20558464381440.0, + "grad_norm": 2.1437824577601354, + "language_loss": 0.86579728, + "learning_rate": 3.758449708105424e-06, + "loss": 0.88765168, + "num_input_tokens_seen": 65888420, + "step": 3049, + "time_per_iteration": 2.6876962184906006 + }, + { + "auxiliary_loss_clip": 0.01143881, + "auxiliary_loss_mlp": 0.01045208, + "balance_loss_clip": 1.05379057, + "balance_loss_mlp": 1.02544308, + "epoch": 0.18337592063730648, + "flos": 19607872901760.0, + "grad_norm": 2.616661567020713, + "language_loss": 0.77827966, + "learning_rate": 3.75826413248424e-06, + "loss": 0.80017054, + "num_input_tokens_seen": 65905840, + "step": 3050, + "time_per_iteration": 2.5814058780670166 + }, + { + "auxiliary_loss_clip": 0.01126116, + "auxiliary_loss_mlp": 0.01041302, + "balance_loss_clip": 1.04954183, + "balance_loss_mlp": 1.0238502, + "epoch": 0.18343604388997445, + "flos": 20850885002880.0, + "grad_norm": 2.3686375880611656, + "language_loss": 0.99064422, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.01231837, + "num_input_tokens_seen": 65922845, + "step": 3051, + "time_per_iteration": 2.701848268508911 + }, + { + "auxiliary_loss_clip": 0.01125492, + "auxiliary_loss_mlp": 0.010397, + "balance_loss_clip": 1.05189931, + "balance_loss_mlp": 1.02078128, + "epoch": 0.1834961671426424, + "flos": 24394370624640.0, + "grad_norm": 2.0338529701436237, + "language_loss": 0.8607648, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.88241673, + "num_input_tokens_seen": 65945555, + "step": 3052, + "time_per_iteration": 2.7252042293548584 + }, + { + "auxiliary_loss_clip": 0.01152967, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_clip": 1.05449986, + "balance_loss_mlp": 1.02737474, + "epoch": 0.18355629039531038, + "flos": 21251612108160.0, + "grad_norm": 1.8649432496703628, + "language_loss": 0.73393309, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.7559092, + "num_input_tokens_seen": 65963965, + "step": 3053, + "time_per_iteration": 2.6331369876861572 + }, + { + "auxiliary_loss_clip": 0.01158728, + "auxiliary_loss_mlp": 0.01044052, + "balance_loss_clip": 1.05783379, + "balance_loss_mlp": 1.02565801, + "epoch": 0.18361641364797834, + "flos": 28656499651200.0, + "grad_norm": 1.5358769917973574, + "language_loss": 0.61891186, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.64093965, + "num_input_tokens_seen": 65985965, + "step": 3054, + "time_per_iteration": 2.6792421340942383 + }, + { + "auxiliary_loss_clip": 0.01108826, + "auxiliary_loss_mlp": 0.01042654, + "balance_loss_clip": 1.05558836, + "balance_loss_mlp": 1.02502322, + "epoch": 0.18367653690064634, + "flos": 20918827578240.0, + "grad_norm": 2.2474279661883667, + "language_loss": 0.78218341, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.80369824, + "num_input_tokens_seen": 66005645, + "step": 3055, + "time_per_iteration": 2.778691053390503 + }, + { + "auxiliary_loss_clip": 0.01096638, + "auxiliary_loss_mlp": 0.01050677, + "balance_loss_clip": 1.05003095, + "balance_loss_mlp": 1.03211594, + "epoch": 0.1837366601533143, + "flos": 28765596234240.0, + "grad_norm": 1.8043720478204575, + "language_loss": 0.7022509, + "learning_rate": 3.757149278859014e-06, + "loss": 0.72372401, + "num_input_tokens_seen": 66025675, + "step": 3056, + "time_per_iteration": 2.794254779815674 + }, + { + "auxiliary_loss_clip": 0.01140367, + "auxiliary_loss_mlp": 0.01038358, + "balance_loss_clip": 1.05211461, + "balance_loss_mlp": 1.02181149, + "epoch": 0.18379678340598227, + "flos": 21251432540160.0, + "grad_norm": 1.8709784760841586, + "language_loss": 0.80357504, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.82536227, + "num_input_tokens_seen": 66046125, + "step": 3057, + "time_per_iteration": 2.644728899002075 + }, + { + "auxiliary_loss_clip": 0.01150041, + "auxiliary_loss_mlp": 0.01043781, + "balance_loss_clip": 1.05482352, + "balance_loss_mlp": 1.02332497, + "epoch": 0.18385690665865023, + "flos": 20449619193600.0, + "grad_norm": 7.225766788646501, + "language_loss": 0.82570755, + "learning_rate": 3.756777127858533e-06, + "loss": 0.84764576, + "num_input_tokens_seen": 66064375, + "step": 3058, + "time_per_iteration": 4.136845588684082 + }, + { + "auxiliary_loss_clip": 0.01119139, + "auxiliary_loss_mlp": 0.00776668, + "balance_loss_clip": 1.04992914, + "balance_loss_mlp": 1.00066566, + "epoch": 0.1839170299113182, + "flos": 26140562398080.0, + "grad_norm": 2.277694088171661, + "language_loss": 0.85071868, + "learning_rate": 3.756590952429017e-06, + "loss": 0.86967677, + "num_input_tokens_seen": 66084590, + "step": 3059, + "time_per_iteration": 2.745020866394043 + }, + { + "auxiliary_loss_clip": 0.01151831, + "auxiliary_loss_mlp": 0.00775088, + "balance_loss_clip": 1.05359423, + "balance_loss_mlp": 1.00077426, + "epoch": 0.18397715316398616, + "flos": 31758032332800.0, + "grad_norm": 2.3540516696336216, + "language_loss": 0.72983348, + "learning_rate": 3.756404710389396e-06, + "loss": 0.74910271, + "num_input_tokens_seen": 66107105, + "step": 3060, + "time_per_iteration": 5.792214393615723 + }, + { + "auxiliary_loss_clip": 0.01149482, + "auxiliary_loss_mlp": 0.01041417, + "balance_loss_clip": 1.05812132, + "balance_loss_mlp": 1.02266574, + "epoch": 0.18403727641665413, + "flos": 24611989173120.0, + "grad_norm": 1.5810457302838978, + "language_loss": 0.73126459, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.75317359, + "num_input_tokens_seen": 66129295, + "step": 3061, + "time_per_iteration": 2.754167318344116 + }, + { + "auxiliary_loss_clip": 0.01138281, + "auxiliary_loss_mlp": 0.01043599, + "balance_loss_clip": 1.05435956, + "balance_loss_mlp": 1.02379823, + "epoch": 0.18409739966932212, + "flos": 23439900476160.0, + "grad_norm": 1.8413104246803462, + "language_loss": 0.81937188, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.8411907, + "num_input_tokens_seen": 66146910, + "step": 3062, + "time_per_iteration": 2.7545394897460938 + }, + { + "auxiliary_loss_clip": 0.01144664, + "auxiliary_loss_mlp": 0.01040639, + "balance_loss_clip": 1.05668104, + "balance_loss_mlp": 1.02259111, + "epoch": 0.18415752292199009, + "flos": 21872112577920.0, + "grad_norm": 2.011374259171591, + "language_loss": 0.72994816, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.75180125, + "num_input_tokens_seen": 66165370, + "step": 3063, + "time_per_iteration": 2.738293170928955 + }, + { + "auxiliary_loss_clip": 0.01133824, + "auxiliary_loss_mlp": 0.01040987, + "balance_loss_clip": 1.05164194, + "balance_loss_mlp": 1.02490544, + "epoch": 0.18421764617465805, + "flos": 25410678036480.0, + "grad_norm": 2.2975785147287953, + "language_loss": 0.65614092, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.67788899, + "num_input_tokens_seen": 66186210, + "step": 3064, + "time_per_iteration": 4.404583930969238 + }, + { + "auxiliary_loss_clip": 0.01141547, + "auxiliary_loss_mlp": 0.01042996, + "balance_loss_clip": 1.05395937, + "balance_loss_mlp": 1.02498376, + "epoch": 0.18427776942732602, + "flos": 27198131558400.0, + "grad_norm": 2.1874829734431898, + "language_loss": 0.68347883, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.70532429, + "num_input_tokens_seen": 66204800, + "step": 3065, + "time_per_iteration": 2.7149577140808105 + }, + { + "auxiliary_loss_clip": 0.01136969, + "auxiliary_loss_mlp": 0.01045319, + "balance_loss_clip": 1.05518305, + "balance_loss_mlp": 1.02674615, + "epoch": 0.18433789267999398, + "flos": 27852351920640.0, + "grad_norm": 2.2758854533642925, + "language_loss": 0.73142231, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.75324523, + "num_input_tokens_seen": 66222195, + "step": 3066, + "time_per_iteration": 2.672675609588623 + }, + { + "auxiliary_loss_clip": 0.01125186, + "auxiliary_loss_mlp": 0.01043389, + "balance_loss_clip": 1.04947495, + "balance_loss_mlp": 1.0256983, + "epoch": 0.18439801593266195, + "flos": 17856940533120.0, + "grad_norm": 2.1067167513095444, + "language_loss": 0.82191038, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.8435961, + "num_input_tokens_seen": 66239505, + "step": 3067, + "time_per_iteration": 2.697768211364746 + }, + { + "auxiliary_loss_clip": 0.01082345, + "auxiliary_loss_mlp": 0.00756782, + "balance_loss_clip": 1.04466891, + "balance_loss_mlp": 1.00113225, + "epoch": 0.18445813918532994, + "flos": 56389522590720.0, + "grad_norm": 0.7960107429271657, + "language_loss": 0.59750569, + "learning_rate": 3.754912376956657e-06, + "loss": 0.61589694, + "num_input_tokens_seen": 66295695, + "step": 3068, + "time_per_iteration": 3.0305213928222656 + }, + { + "auxiliary_loss_clip": 0.01127048, + "auxiliary_loss_mlp": 0.01041294, + "balance_loss_clip": 1.05452299, + "balance_loss_mlp": 1.02356791, + "epoch": 0.1845182624379979, + "flos": 20957180325120.0, + "grad_norm": 3.7299324256794244, + "language_loss": 0.76434112, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.78602457, + "num_input_tokens_seen": 66315315, + "step": 3069, + "time_per_iteration": 2.6757962703704834 + }, + { + "auxiliary_loss_clip": 0.01146412, + "auxiliary_loss_mlp": 0.010456, + "balance_loss_clip": 1.05468106, + "balance_loss_mlp": 1.02798057, + "epoch": 0.18457838569066587, + "flos": 20485170679680.0, + "grad_norm": 1.9225240149566294, + "language_loss": 0.8491416, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.87106168, + "num_input_tokens_seen": 66333675, + "step": 3070, + "time_per_iteration": 2.617023229598999 + }, + { + "auxiliary_loss_clip": 0.01127789, + "auxiliary_loss_mlp": 0.01043452, + "balance_loss_clip": 1.0553112, + "balance_loss_mlp": 1.02510571, + "epoch": 0.18463850894333383, + "flos": 25010022758400.0, + "grad_norm": 6.700503585098448, + "language_loss": 0.77807182, + "learning_rate": 3.754351653708265e-06, + "loss": 0.79978424, + "num_input_tokens_seen": 66354075, + "step": 3071, + "time_per_iteration": 2.847329616546631 + }, + { + "auxiliary_loss_clip": 0.01109458, + "auxiliary_loss_mlp": 0.01049978, + "balance_loss_clip": 1.05054557, + "balance_loss_mlp": 1.03154778, + "epoch": 0.1846986321960018, + "flos": 16800628348800.0, + "grad_norm": 2.0836336776071565, + "language_loss": 0.77414191, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.79573631, + "num_input_tokens_seen": 66372520, + "step": 3072, + "time_per_iteration": 2.780921220779419 + }, + { + "auxiliary_loss_clip": 0.01138997, + "auxiliary_loss_mlp": 0.01043594, + "balance_loss_clip": 1.05106127, + "balance_loss_mlp": 1.02465141, + "epoch": 0.18475875544866976, + "flos": 20814327936000.0, + "grad_norm": 4.959080593148226, + "language_loss": 0.86546457, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.88729048, + "num_input_tokens_seen": 66390745, + "step": 3073, + "time_per_iteration": 2.631913661956787 + }, + { + "auxiliary_loss_clip": 0.01158717, + "auxiliary_loss_mlp": 0.0104013, + "balance_loss_clip": 1.05862749, + "balance_loss_mlp": 1.02366686, + "epoch": 0.18481887870133773, + "flos": 22601422321920.0, + "grad_norm": 2.162700927804164, + "language_loss": 0.91831195, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.94030046, + "num_input_tokens_seen": 66410525, + "step": 3074, + "time_per_iteration": 2.6152567863464355 + }, + { + "auxiliary_loss_clip": 0.01104968, + "auxiliary_loss_mlp": 0.01047718, + "balance_loss_clip": 1.04757643, + "balance_loss_mlp": 1.02763104, + "epoch": 0.18487900195400572, + "flos": 29458815788160.0, + "grad_norm": 1.9967983521568784, + "language_loss": 0.64783108, + "learning_rate": 3.75360309139087e-06, + "loss": 0.66935796, + "num_input_tokens_seen": 66432535, + "step": 3075, + "time_per_iteration": 2.763559103012085 + }, + { + "auxiliary_loss_clip": 0.01135247, + "auxiliary_loss_mlp": 0.01046601, + "balance_loss_clip": 1.05689573, + "balance_loss_mlp": 1.02913702, + "epoch": 0.1849391252066737, + "flos": 20628777254400.0, + "grad_norm": 1.8996898495981898, + "language_loss": 0.72803432, + "learning_rate": 3.753415784551761e-06, + "loss": 0.74985278, + "num_input_tokens_seen": 66450620, + "step": 3076, + "time_per_iteration": 2.76629376411438 + }, + { + "auxiliary_loss_clip": 0.01124833, + "auxiliary_loss_mlp": 0.01042344, + "balance_loss_clip": 1.0584389, + "balance_loss_mlp": 1.0249157, + "epoch": 0.18499924845934165, + "flos": 14428549065600.0, + "grad_norm": 2.4862024108169556, + "language_loss": 0.80772626, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.82939804, + "num_input_tokens_seen": 66467865, + "step": 3077, + "time_per_iteration": 2.7296142578125 + }, + { + "auxiliary_loss_clip": 0.01128471, + "auxiliary_loss_mlp": 0.01041495, + "balance_loss_clip": 1.05401397, + "balance_loss_mlp": 1.02428079, + "epoch": 0.18505937171200962, + "flos": 23727652329600.0, + "grad_norm": 1.8214336253769514, + "language_loss": 0.78693211, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.80863178, + "num_input_tokens_seen": 66486245, + "step": 3078, + "time_per_iteration": 2.715838670730591 + }, + { + "auxiliary_loss_clip": 0.01154963, + "auxiliary_loss_mlp": 0.01043373, + "balance_loss_clip": 1.05546641, + "balance_loss_mlp": 1.02655268, + "epoch": 0.18511949496467758, + "flos": 25957489754880.0, + "grad_norm": 1.7455066055145632, + "language_loss": 0.77326959, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.79525292, + "num_input_tokens_seen": 66506510, + "step": 3079, + "time_per_iteration": 2.674128770828247 + }, + { + "auxiliary_loss_clip": 0.01119079, + "auxiliary_loss_mlp": 0.01041512, + "balance_loss_clip": 1.04717147, + "balance_loss_mlp": 1.02328515, + "epoch": 0.18517961821734555, + "flos": 42413553912960.0, + "grad_norm": 1.885086933557342, + "language_loss": 0.82143807, + "learning_rate": 3.752665892369369e-06, + "loss": 0.84304404, + "num_input_tokens_seen": 66530960, + "step": 3080, + "time_per_iteration": 2.906940460205078 + }, + { + "auxiliary_loss_clip": 0.01123637, + "auxiliary_loss_mlp": 0.01044031, + "balance_loss_clip": 1.05894399, + "balance_loss_mlp": 1.02563691, + "epoch": 0.18523974147001354, + "flos": 24097568544000.0, + "grad_norm": 2.065822240576764, + "language_loss": 0.73973286, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.76140958, + "num_input_tokens_seen": 66550275, + "step": 3081, + "time_per_iteration": 2.7960739135742188 + }, + { + "auxiliary_loss_clip": 0.01126977, + "auxiliary_loss_mlp": 0.01051674, + "balance_loss_clip": 1.05360913, + "balance_loss_mlp": 1.03286242, + "epoch": 0.1852998647226815, + "flos": 27375278457600.0, + "grad_norm": 1.9854893879184425, + "language_loss": 0.71991849, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.74170506, + "num_input_tokens_seen": 66569040, + "step": 3082, + "time_per_iteration": 2.6965079307556152 + }, + { + "auxiliary_loss_clip": 0.01124933, + "auxiliary_loss_mlp": 0.01046296, + "balance_loss_clip": 1.05649543, + "balance_loss_mlp": 1.02694798, + "epoch": 0.18535998797534947, + "flos": 18332757020160.0, + "grad_norm": 2.0424653419479886, + "language_loss": 0.69580144, + "learning_rate": 3.752102775364407e-06, + "loss": 0.71751374, + "num_input_tokens_seen": 66587775, + "step": 3083, + "time_per_iteration": 2.727252721786499 + }, + { + "auxiliary_loss_clip": 0.01122388, + "auxiliary_loss_mlp": 0.01046999, + "balance_loss_clip": 1.05204451, + "balance_loss_mlp": 1.02964258, + "epoch": 0.18542011122801744, + "flos": 37845859887360.0, + "grad_norm": 2.185713468975319, + "language_loss": 0.68965334, + "learning_rate": 3.751914936806767e-06, + "loss": 0.71134722, + "num_input_tokens_seen": 66610800, + "step": 3084, + "time_per_iteration": 2.95849871635437 + }, + { + "auxiliary_loss_clip": 0.01155184, + "auxiliary_loss_mlp": 0.01043029, + "balance_loss_clip": 1.05578482, + "balance_loss_mlp": 1.0257436, + "epoch": 0.1854802344806854, + "flos": 25186128163200.0, + "grad_norm": 1.6859724806626923, + "language_loss": 0.77390355, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.79588568, + "num_input_tokens_seen": 66630960, + "step": 3085, + "time_per_iteration": 2.68961501121521 + }, + { + "auxiliary_loss_clip": 0.01152089, + "auxiliary_loss_mlp": 0.01049004, + "balance_loss_clip": 1.05316019, + "balance_loss_mlp": 1.03142118, + "epoch": 0.18554035773335337, + "flos": 26684788337280.0, + "grad_norm": 1.993169596996871, + "language_loss": 0.73752379, + "learning_rate": 3.751539060400244e-06, + "loss": 0.75953472, + "num_input_tokens_seen": 66650585, + "step": 3086, + "time_per_iteration": 2.652475595474243 + }, + { + "auxiliary_loss_clip": 0.01142754, + "auxiliary_loss_mlp": 0.01049865, + "balance_loss_clip": 1.05530787, + "balance_loss_mlp": 1.03134012, + "epoch": 0.18560048098602133, + "flos": 22346887570560.0, + "grad_norm": 7.927127736744579, + "language_loss": 0.69762361, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.71954978, + "num_input_tokens_seen": 66670045, + "step": 3087, + "time_per_iteration": 2.668849229812622 + }, + { + "auxiliary_loss_clip": 0.01119022, + "auxiliary_loss_mlp": 0.01055302, + "balance_loss_clip": 1.05543649, + "balance_loss_mlp": 1.03546548, + "epoch": 0.18566060423868933, + "flos": 17748526308480.0, + "grad_norm": 2.1117122734340263, + "language_loss": 0.72513628, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.74687952, + "num_input_tokens_seen": 66688790, + "step": 3088, + "time_per_iteration": 2.7150719165802 + }, + { + "auxiliary_loss_clip": 0.0112638, + "auxiliary_loss_mlp": 0.01044188, + "balance_loss_clip": 1.04933047, + "balance_loss_mlp": 1.02616334, + "epoch": 0.1857207274913573, + "flos": 24677274142080.0, + "grad_norm": 2.112009927874319, + "language_loss": 0.91859758, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.94030321, + "num_input_tokens_seen": 66708090, + "step": 3089, + "time_per_iteration": 2.7239248752593994 + }, + { + "auxiliary_loss_clip": 0.01104754, + "auxiliary_loss_mlp": 0.01046981, + "balance_loss_clip": 1.0494597, + "balance_loss_mlp": 1.02919531, + "epoch": 0.18578085074402526, + "flos": 28147825198080.0, + "grad_norm": 2.490831087537115, + "language_loss": 0.57275403, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.59427136, + "num_input_tokens_seen": 66727320, + "step": 3090, + "time_per_iteration": 2.8263309001922607 + }, + { + "auxiliary_loss_clip": 0.01125877, + "auxiliary_loss_mlp": 0.0104478, + "balance_loss_clip": 1.04981184, + "balance_loss_mlp": 1.02636242, + "epoch": 0.18584097399669322, + "flos": 23951878980480.0, + "grad_norm": 1.7797305478565062, + "language_loss": 0.81704801, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.83875453, + "num_input_tokens_seen": 66747505, + "step": 3091, + "time_per_iteration": 2.697525978088379 + }, + { + "auxiliary_loss_clip": 0.01101743, + "auxiliary_loss_mlp": 0.01050837, + "balance_loss_clip": 1.04999971, + "balance_loss_mlp": 1.03277707, + "epoch": 0.18590109724936119, + "flos": 17201678676480.0, + "grad_norm": 2.0826959244757832, + "language_loss": 0.83704746, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.8585732, + "num_input_tokens_seen": 66766425, + "step": 3092, + "time_per_iteration": 2.8379435539245605 + }, + { + "auxiliary_loss_clip": 0.01136846, + "auxiliary_loss_mlp": 0.01048758, + "balance_loss_clip": 1.05389428, + "balance_loss_mlp": 1.03036356, + "epoch": 0.18596122050202915, + "flos": 17234644383360.0, + "grad_norm": 5.439917179387958, + "language_loss": 0.93443698, + "learning_rate": 3.750221401168038e-06, + "loss": 0.95629299, + "num_input_tokens_seen": 66781130, + "step": 3093, + "time_per_iteration": 2.8053483963012695 + }, + { + "auxiliary_loss_clip": 0.01130362, + "auxiliary_loss_mlp": 0.01042367, + "balance_loss_clip": 1.05440521, + "balance_loss_mlp": 1.02464092, + "epoch": 0.18602134375469712, + "flos": 19020733188480.0, + "grad_norm": 1.7318887555782294, + "language_loss": 0.77516603, + "learning_rate": 3.750032898603443e-06, + "loss": 0.7968933, + "num_input_tokens_seen": 66797535, + "step": 3094, + "time_per_iteration": 2.7402310371398926 + }, + { + "auxiliary_loss_clip": 0.0109741, + "auxiliary_loss_mlp": 0.01049219, + "balance_loss_clip": 1.0519228, + "balance_loss_mlp": 1.0323391, + "epoch": 0.1860814670073651, + "flos": 50950094417280.0, + "grad_norm": 1.7033453736007413, + "language_loss": 0.69854707, + "learning_rate": 3.749844329677425e-06, + "loss": 0.72001338, + "num_input_tokens_seen": 66821720, + "step": 3095, + "time_per_iteration": 3.133192777633667 + }, + { + "auxiliary_loss_clip": 0.01113224, + "auxiliary_loss_mlp": 0.010546, + "balance_loss_clip": 1.0511899, + "balance_loss_mlp": 1.03415525, + "epoch": 0.18614159026003307, + "flos": 19390972625280.0, + "grad_norm": 2.2828801406167307, + "language_loss": 0.81214821, + "learning_rate": 3.749655694397135e-06, + "loss": 0.83382642, + "num_input_tokens_seen": 66839060, + "step": 3096, + "time_per_iteration": 2.7599101066589355 + }, + { + "auxiliary_loss_clip": 0.01147399, + "auxiliary_loss_mlp": 0.0104683, + "balance_loss_clip": 1.05678356, + "balance_loss_mlp": 1.02810192, + "epoch": 0.18620171351270104, + "flos": 21798782962560.0, + "grad_norm": 2.430947734084612, + "language_loss": 0.75326216, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.77520448, + "num_input_tokens_seen": 66857760, + "step": 3097, + "time_per_iteration": 4.255983114242554 + }, + { + "auxiliary_loss_clip": 0.01133757, + "auxiliary_loss_mlp": 0.01050365, + "balance_loss_clip": 1.05756521, + "balance_loss_mlp": 1.03228104, + "epoch": 0.186261836765369, + "flos": 16362877299840.0, + "grad_norm": 2.553895603581972, + "language_loss": 0.66602015, + "learning_rate": 3.749278224802352e-06, + "loss": 0.68786132, + "num_input_tokens_seen": 66876460, + "step": 3098, + "time_per_iteration": 2.723567247390747 + }, + { + "auxiliary_loss_clip": 0.01163461, + "auxiliary_loss_mlp": 0.01052357, + "balance_loss_clip": 1.05991709, + "balance_loss_mlp": 1.03212702, + "epoch": 0.18632196001803697, + "flos": 23370054480000.0, + "grad_norm": 1.6168121451860142, + "language_loss": 0.69838905, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.7205472, + "num_input_tokens_seen": 66897960, + "step": 3099, + "time_per_iteration": 5.687380075454712 + }, + { + "auxiliary_loss_clip": 0.01148363, + "auxiliary_loss_mlp": 0.01051556, + "balance_loss_clip": 1.05713868, + "balance_loss_mlp": 1.03243458, + "epoch": 0.18638208327070493, + "flos": 22492002516480.0, + "grad_norm": 1.7060244708994476, + "language_loss": 0.71840072, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.74039996, + "num_input_tokens_seen": 66917675, + "step": 3100, + "time_per_iteration": 2.6711015701293945 + }, + { + "auxiliary_loss_clip": 0.01138377, + "auxiliary_loss_mlp": 0.01050667, + "balance_loss_clip": 1.05749035, + "balance_loss_mlp": 1.03133154, + "epoch": 0.18644220652337293, + "flos": 29165245931520.0, + "grad_norm": 1.9639279354826686, + "language_loss": 0.80343997, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.82533038, + "num_input_tokens_seen": 66936000, + "step": 3101, + "time_per_iteration": 2.6996583938598633 + }, + { + "auxiliary_loss_clip": 0.01112778, + "auxiliary_loss_mlp": 0.01042097, + "balance_loss_clip": 1.05307627, + "balance_loss_mlp": 1.02478826, + "epoch": 0.1865023297760409, + "flos": 24243796811520.0, + "grad_norm": 1.8804860702941575, + "language_loss": 0.77053607, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.79208481, + "num_input_tokens_seen": 66955700, + "step": 3102, + "time_per_iteration": 2.726146936416626 + }, + { + "auxiliary_loss_clip": 0.01150817, + "auxiliary_loss_mlp": 0.01039303, + "balance_loss_clip": 1.057688, + "balance_loss_mlp": 1.0213027, + "epoch": 0.18656245302870886, + "flos": 19128716449920.0, + "grad_norm": 2.314682178811096, + "language_loss": 0.76689744, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.78879869, + "num_input_tokens_seen": 66972815, + "step": 3103, + "time_per_iteration": 4.374122619628906 + }, + { + "auxiliary_loss_clip": 0.01132531, + "auxiliary_loss_mlp": 0.0104481, + "balance_loss_clip": 1.05477643, + "balance_loss_mlp": 1.02671361, + "epoch": 0.18662257628137682, + "flos": 17786088956160.0, + "grad_norm": 1.6956506235876265, + "language_loss": 0.79252636, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.8142997, + "num_input_tokens_seen": 66992280, + "step": 3104, + "time_per_iteration": 2.695012092590332 + }, + { + "auxiliary_loss_clip": 0.01106786, + "auxiliary_loss_mlp": 0.01050273, + "balance_loss_clip": 1.05117702, + "balance_loss_mlp": 1.03096056, + "epoch": 0.1866826995340448, + "flos": 24024382583040.0, + "grad_norm": 2.065624302338532, + "language_loss": 0.8496474, + "learning_rate": 3.747954992113354e-06, + "loss": 0.87121809, + "num_input_tokens_seen": 67012220, + "step": 3105, + "time_per_iteration": 2.761521816253662 + }, + { + "auxiliary_loss_clip": 0.0112324, + "auxiliary_loss_mlp": 0.01043689, + "balance_loss_clip": 1.05166531, + "balance_loss_mlp": 1.02407932, + "epoch": 0.18674282278671275, + "flos": 26141244756480.0, + "grad_norm": 1.8352441384571676, + "language_loss": 0.86880243, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.8904717, + "num_input_tokens_seen": 67032030, + "step": 3106, + "time_per_iteration": 2.785738706588745 + }, + { + "auxiliary_loss_clip": 0.01150222, + "auxiliary_loss_mlp": 0.01040973, + "balance_loss_clip": 1.0566026, + "balance_loss_mlp": 1.02281737, + "epoch": 0.18680294603938072, + "flos": 19201938324480.0, + "grad_norm": 2.128833658771433, + "language_loss": 0.78226906, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.80418098, + "num_input_tokens_seen": 67048920, + "step": 3107, + "time_per_iteration": 2.693995237350464 + }, + { + "auxiliary_loss_clip": 0.01153763, + "auxiliary_loss_mlp": 0.01053056, + "balance_loss_clip": 1.05873394, + "balance_loss_mlp": 1.03341043, + "epoch": 0.1868630692920487, + "flos": 28544889116160.0, + "grad_norm": 3.0927798335187506, + "language_loss": 0.74159014, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.7636584, + "num_input_tokens_seen": 67068645, + "step": 3108, + "time_per_iteration": 2.795715570449829 + }, + { + "auxiliary_loss_clip": 0.01107582, + "auxiliary_loss_mlp": 0.01042714, + "balance_loss_clip": 1.05207491, + "balance_loss_mlp": 1.02451098, + "epoch": 0.18692319254471668, + "flos": 17238020261760.0, + "grad_norm": 1.6837485322309411, + "language_loss": 0.74348569, + "learning_rate": 3.747197400772658e-06, + "loss": 0.76498872, + "num_input_tokens_seen": 67087075, + "step": 3109, + "time_per_iteration": 2.7627830505371094 + }, + { + "auxiliary_loss_clip": 0.01145572, + "auxiliary_loss_mlp": 0.01044117, + "balance_loss_clip": 1.05631042, + "balance_loss_mlp": 1.02526462, + "epoch": 0.18698331579738464, + "flos": 23185186156800.0, + "grad_norm": 1.499459601293056, + "language_loss": 0.84250218, + "learning_rate": 3.747007837284772e-06, + "loss": 0.86439908, + "num_input_tokens_seen": 67108040, + "step": 3110, + "time_per_iteration": 2.7665328979492188 + }, + { + "auxiliary_loss_clip": 0.01147578, + "auxiliary_loss_mlp": 0.01042389, + "balance_loss_clip": 1.05929494, + "balance_loss_mlp": 1.02381575, + "epoch": 0.1870434390500526, + "flos": 25516721963520.0, + "grad_norm": 1.9108380391903876, + "language_loss": 0.84738445, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.86928415, + "num_input_tokens_seen": 67127605, + "step": 3111, + "time_per_iteration": 2.729233741760254 + }, + { + "auxiliary_loss_clip": 0.01128, + "auxiliary_loss_mlp": 0.01044544, + "balance_loss_clip": 1.05348754, + "balance_loss_mlp": 1.02635229, + "epoch": 0.18710356230272057, + "flos": 19500823393920.0, + "grad_norm": 1.8704338434966796, + "language_loss": 0.76875687, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.79048228, + "num_input_tokens_seen": 67145785, + "step": 3112, + "time_per_iteration": 2.7392494678497314 + }, + { + "auxiliary_loss_clip": 0.0114846, + "auxiliary_loss_mlp": 0.0104709, + "balance_loss_clip": 1.05636978, + "balance_loss_mlp": 1.02913654, + "epoch": 0.18716368555538854, + "flos": 26760847386240.0, + "grad_norm": 1.8996972204761096, + "language_loss": 0.64466536, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.66662085, + "num_input_tokens_seen": 67165930, + "step": 3113, + "time_per_iteration": 2.7393765449523926 + }, + { + "auxiliary_loss_clip": 0.01153807, + "auxiliary_loss_mlp": 0.01048748, + "balance_loss_clip": 1.05685568, + "balance_loss_mlp": 1.02900672, + "epoch": 0.1872238088080565, + "flos": 25189827264000.0, + "grad_norm": 6.483287708452815, + "language_loss": 0.817972, + "learning_rate": 3.746248920938024e-06, + "loss": 0.83999759, + "num_input_tokens_seen": 67185830, + "step": 3114, + "time_per_iteration": 2.740229368209839 + }, + { + "auxiliary_loss_clip": 0.01104278, + "auxiliary_loss_mlp": 0.01050738, + "balance_loss_clip": 1.04921412, + "balance_loss_mlp": 1.03024614, + "epoch": 0.1872839320607245, + "flos": 24134305178880.0, + "grad_norm": 2.3064843449079175, + "language_loss": 0.57413173, + "learning_rate": 3.74605902628851e-06, + "loss": 0.59568191, + "num_input_tokens_seen": 67206930, + "step": 3115, + "time_per_iteration": 2.811549663543701 + }, + { + "auxiliary_loss_clip": 0.01123025, + "auxiliary_loss_mlp": 0.01052226, + "balance_loss_clip": 1.05446446, + "balance_loss_mlp": 1.03241396, + "epoch": 0.18734405531339246, + "flos": 21173793292800.0, + "grad_norm": 2.577640519639585, + "language_loss": 0.70842528, + "learning_rate": 3.745869065428261e-06, + "loss": 0.73017788, + "num_input_tokens_seen": 67226290, + "step": 3116, + "time_per_iteration": 2.8053951263427734 + }, + { + "auxiliary_loss_clip": 0.0115042, + "auxiliary_loss_mlp": 0.01035569, + "balance_loss_clip": 1.05196476, + "balance_loss_mlp": 1.01787841, + "epoch": 0.18740417856606043, + "flos": 17237697039360.0, + "grad_norm": 3.010261965906642, + "language_loss": 0.78994375, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.81180358, + "num_input_tokens_seen": 67244410, + "step": 3117, + "time_per_iteration": 2.819415330886841 + }, + { + "auxiliary_loss_clip": 0.01132901, + "auxiliary_loss_mlp": 0.01049724, + "balance_loss_clip": 1.05260777, + "balance_loss_mlp": 1.03047204, + "epoch": 0.1874643018187284, + "flos": 32558049999360.0, + "grad_norm": 2.2828109389679865, + "language_loss": 0.83903432, + "learning_rate": 3.745488945104381e-06, + "loss": 0.86086059, + "num_input_tokens_seen": 67264470, + "step": 3118, + "time_per_iteration": 2.783804416656494 + }, + { + "auxiliary_loss_clip": 0.01144867, + "auxiliary_loss_mlp": 0.0104452, + "balance_loss_clip": 1.05412436, + "balance_loss_mlp": 1.02688873, + "epoch": 0.18752442507139636, + "flos": 23258156636160.0, + "grad_norm": 3.566737352043019, + "language_loss": 0.76283264, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.78472656, + "num_input_tokens_seen": 67284315, + "step": 3119, + "time_per_iteration": 2.6872506141662598 + }, + { + "auxiliary_loss_clip": 0.01156835, + "auxiliary_loss_mlp": 0.01046653, + "balance_loss_clip": 1.05519438, + "balance_loss_mlp": 1.02899814, + "epoch": 0.18758454832406432, + "flos": 21760933006080.0, + "grad_norm": 1.7224942549361077, + "language_loss": 0.82017547, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.84221041, + "num_input_tokens_seen": 67302780, + "step": 3120, + "time_per_iteration": 2.637505292892456 + }, + { + "auxiliary_loss_clip": 0.0113033, + "auxiliary_loss_mlp": 0.01035538, + "balance_loss_clip": 1.05060756, + "balance_loss_mlp": 1.01828837, + "epoch": 0.1876446715767323, + "flos": 29570210841600.0, + "grad_norm": 2.5027223446471982, + "language_loss": 0.84992659, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.87158525, + "num_input_tokens_seen": 67323405, + "step": 3121, + "time_per_iteration": 2.788353681564331 + }, + { + "auxiliary_loss_clip": 0.01096681, + "auxiliary_loss_mlp": 0.0104429, + "balance_loss_clip": 1.045645, + "balance_loss_mlp": 1.02599168, + "epoch": 0.18770479482940028, + "flos": 30339992234880.0, + "grad_norm": 2.1738591443482362, + "language_loss": 0.70032287, + "learning_rate": 3.744727910244937e-06, + "loss": 0.72173256, + "num_input_tokens_seen": 67345800, + "step": 3122, + "time_per_iteration": 3.0225250720977783 + }, + { + "auxiliary_loss_clip": 0.01153439, + "auxiliary_loss_mlp": 0.01042355, + "balance_loss_clip": 1.05445123, + "balance_loss_mlp": 1.02288795, + "epoch": 0.18776491808206824, + "flos": 14465357527680.0, + "grad_norm": 4.839579375412361, + "language_loss": 0.70661515, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.72857308, + "num_input_tokens_seen": 67363575, + "step": 3123, + "time_per_iteration": 2.779904365539551 + }, + { + "auxiliary_loss_clip": 0.01142265, + "auxiliary_loss_mlp": 0.01041425, + "balance_loss_clip": 1.05286181, + "balance_loss_mlp": 1.02454507, + "epoch": 0.1878250413347362, + "flos": 24498547044480.0, + "grad_norm": 2.057520579072589, + "language_loss": 0.74103826, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.76287514, + "num_input_tokens_seen": 67381765, + "step": 3124, + "time_per_iteration": 2.6336071491241455 + }, + { + "auxiliary_loss_clip": 0.01157579, + "auxiliary_loss_mlp": 0.01052509, + "balance_loss_clip": 1.05653572, + "balance_loss_mlp": 1.03333998, + "epoch": 0.18788516458740417, + "flos": 39786185692800.0, + "grad_norm": 3.0670363966795096, + "language_loss": 0.80654436, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.82864523, + "num_input_tokens_seen": 67405000, + "step": 3125, + "time_per_iteration": 2.7224199771881104 + }, + { + "auxiliary_loss_clip": 0.01046615, + "auxiliary_loss_mlp": 0.01006504, + "balance_loss_clip": 1.04444218, + "balance_loss_mlp": 1.00435853, + "epoch": 0.18794528784007214, + "flos": 64699250664960.0, + "grad_norm": 0.9424570711133922, + "language_loss": 0.63647306, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.65700436, + "num_input_tokens_seen": 67467140, + "step": 3126, + "time_per_iteration": 3.313321113586426 + }, + { + "auxiliary_loss_clip": 0.01128308, + "auxiliary_loss_mlp": 0.01040458, + "balance_loss_clip": 1.05377257, + "balance_loss_mlp": 1.02236164, + "epoch": 0.1880054110927401, + "flos": 28622061486720.0, + "grad_norm": 1.8734163453478039, + "language_loss": 0.81308508, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.83477271, + "num_input_tokens_seen": 67487980, + "step": 3127, + "time_per_iteration": 2.7137866020202637 + }, + { + "auxiliary_loss_clip": 0.01088267, + "auxiliary_loss_mlp": 0.0101138, + "balance_loss_clip": 1.04814553, + "balance_loss_mlp": 1.00912714, + "epoch": 0.1880655343454081, + "flos": 64488958490880.0, + "grad_norm": 0.7699217277386954, + "language_loss": 0.61922526, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.64022171, + "num_input_tokens_seen": 67552500, + "step": 3128, + "time_per_iteration": 3.264270782470703 + }, + { + "auxiliary_loss_clip": 0.01108205, + "auxiliary_loss_mlp": 0.01049422, + "balance_loss_clip": 1.04763842, + "balance_loss_mlp": 1.02907288, + "epoch": 0.18812565759807606, + "flos": 32124464928000.0, + "grad_norm": 2.4867495334212175, + "language_loss": 0.70985162, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.73142785, + "num_input_tokens_seen": 67573295, + "step": 3129, + "time_per_iteration": 2.79929256439209 + }, + { + "auxiliary_loss_clip": 0.01158485, + "auxiliary_loss_mlp": 0.01050611, + "balance_loss_clip": 1.05767536, + "balance_loss_mlp": 1.03109634, + "epoch": 0.18818578085074403, + "flos": 20624539449600.0, + "grad_norm": 2.4831518001798676, + "language_loss": 0.85035253, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.87244344, + "num_input_tokens_seen": 67590010, + "step": 3130, + "time_per_iteration": 2.60624361038208 + }, + { + "auxiliary_loss_clip": 0.01107202, + "auxiliary_loss_mlp": 0.01049966, + "balance_loss_clip": 1.04649067, + "balance_loss_mlp": 1.03023696, + "epoch": 0.188245904103412, + "flos": 28840506048000.0, + "grad_norm": 9.096753382647533, + "language_loss": 0.7643525, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.7859242, + "num_input_tokens_seen": 67611110, + "step": 3131, + "time_per_iteration": 2.759230136871338 + }, + { + "auxiliary_loss_clip": 0.0112329, + "auxiliary_loss_mlp": 0.01049221, + "balance_loss_clip": 1.05344164, + "balance_loss_mlp": 1.03014708, + "epoch": 0.18830602735607996, + "flos": 29420319386880.0, + "grad_norm": 2.109252219381847, + "language_loss": 0.80713749, + "learning_rate": 3.74282069289017e-06, + "loss": 0.82886261, + "num_input_tokens_seen": 67631990, + "step": 3132, + "time_per_iteration": 2.773817777633667 + }, + { + "auxiliary_loss_clip": 0.01093588, + "auxiliary_loss_mlp": 0.00779094, + "balance_loss_clip": 1.04652429, + "balance_loss_mlp": 1.00091529, + "epoch": 0.18836615060874792, + "flos": 28872933050880.0, + "grad_norm": 2.092242478448591, + "language_loss": 0.79653811, + "learning_rate": 3.742629607551614e-06, + "loss": 0.81526494, + "num_input_tokens_seen": 67650490, + "step": 3133, + "time_per_iteration": 2.7873754501342773 + }, + { + "auxiliary_loss_clip": 0.01119878, + "auxiliary_loss_mlp": 0.01059381, + "balance_loss_clip": 1.05341148, + "balance_loss_mlp": 1.03921056, + "epoch": 0.18842627386141592, + "flos": 22601673717120.0, + "grad_norm": 1.9069857551930867, + "language_loss": 0.83001804, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.85181063, + "num_input_tokens_seen": 67668860, + "step": 3134, + "time_per_iteration": 2.9284298419952393 + }, + { + "auxiliary_loss_clip": 0.01131578, + "auxiliary_loss_mlp": 0.01046681, + "balance_loss_clip": 1.05168402, + "balance_loss_mlp": 1.02802503, + "epoch": 0.18848639711408388, + "flos": 24573600512640.0, + "grad_norm": 2.0376543711114152, + "language_loss": 0.82859468, + "learning_rate": 3.742247238639684e-06, + "loss": 0.85037726, + "num_input_tokens_seen": 67690220, + "step": 3135, + "time_per_iteration": 2.8006811141967773 + }, + { + "auxiliary_loss_clip": 0.01143148, + "auxiliary_loss_mlp": 0.01050197, + "balance_loss_clip": 1.05505157, + "balance_loss_mlp": 1.03146911, + "epoch": 0.18854652036675185, + "flos": 34166920078080.0, + "grad_norm": 1.9728388324049713, + "language_loss": 0.78658557, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.80851901, + "num_input_tokens_seen": 67709820, + "step": 3136, + "time_per_iteration": 4.256143569946289 + }, + { + "auxiliary_loss_clip": 0.01135545, + "auxiliary_loss_mlp": 0.01048618, + "balance_loss_clip": 1.05388892, + "balance_loss_mlp": 1.03006911, + "epoch": 0.1886066436194198, + "flos": 24200236592640.0, + "grad_norm": 1.7483697887361769, + "language_loss": 0.80820233, + "learning_rate": 3.741864605462996e-06, + "loss": 0.83004391, + "num_input_tokens_seen": 67729490, + "step": 3137, + "time_per_iteration": 2.7538130283355713 + }, + { + "auxiliary_loss_clip": 0.01159054, + "auxiliary_loss_mlp": 0.01048373, + "balance_loss_clip": 1.05827475, + "balance_loss_mlp": 1.03107548, + "epoch": 0.18866676687208778, + "flos": 21251109317760.0, + "grad_norm": 1.9799764624272802, + "language_loss": 0.81274408, + "learning_rate": 3.741673189793504e-06, + "loss": 0.83481836, + "num_input_tokens_seen": 67749665, + "step": 3138, + "time_per_iteration": 4.143909931182861 + }, + { + "auxiliary_loss_clip": 0.01150082, + "auxiliary_loss_mlp": 0.01056444, + "balance_loss_clip": 1.05626798, + "balance_loss_mlp": 1.03713167, + "epoch": 0.18872689012475574, + "flos": 37308673013760.0, + "grad_norm": 2.326218248348143, + "language_loss": 0.63655496, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.65862024, + "num_input_tokens_seen": 67776230, + "step": 3139, + "time_per_iteration": 4.30991268157959 + }, + { + "auxiliary_loss_clip": 0.0115289, + "auxiliary_loss_mlp": 0.01043021, + "balance_loss_clip": 1.05286491, + "balance_loss_mlp": 1.02356625, + "epoch": 0.1887870133774237, + "flos": 21652303299840.0, + "grad_norm": 2.1185902638296525, + "language_loss": 0.7148211, + "learning_rate": 3.741290160328514e-06, + "loss": 0.73678017, + "num_input_tokens_seen": 67795080, + "step": 3140, + "time_per_iteration": 2.6880578994750977 + }, + { + "auxiliary_loss_clip": 0.01154738, + "auxiliary_loss_mlp": 0.01043099, + "balance_loss_clip": 1.05349982, + "balance_loss_mlp": 1.02382278, + "epoch": 0.1888471366300917, + "flos": 15924659374080.0, + "grad_norm": 2.6250212982316574, + "language_loss": 0.87069929, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.89267766, + "num_input_tokens_seen": 67813110, + "step": 3141, + "time_per_iteration": 2.6677181720733643 + }, + { + "auxiliary_loss_clip": 0.01130655, + "auxiliary_loss_mlp": 0.01052882, + "balance_loss_clip": 1.0507834, + "balance_loss_mlp": 1.03243756, + "epoch": 0.18890725988275966, + "flos": 18551955767040.0, + "grad_norm": 1.873404502116747, + "language_loss": 0.7744689, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.79630429, + "num_input_tokens_seen": 67831070, + "step": 3142, + "time_per_iteration": 2.63077449798584 + }, + { + "auxiliary_loss_clip": 0.01128192, + "auxiliary_loss_mlp": 0.01038074, + "balance_loss_clip": 1.05298221, + "balance_loss_mlp": 1.02132463, + "epoch": 0.18896738313542763, + "flos": 28840865184000.0, + "grad_norm": 1.6611052928231447, + "language_loss": 0.78867507, + "learning_rate": 3.740715120924971e-06, + "loss": 0.81033778, + "num_input_tokens_seen": 67852170, + "step": 3143, + "time_per_iteration": 4.417406797409058 + }, + { + "auxiliary_loss_clip": 0.0111986, + "auxiliary_loss_mlp": 0.01048019, + "balance_loss_clip": 1.05024099, + "balance_loss_mlp": 1.02821851, + "epoch": 0.1890275063880956, + "flos": 22412747157120.0, + "grad_norm": 2.855732191409361, + "language_loss": 0.71476078, + "learning_rate": 3.740523309097912e-06, + "loss": 0.73643959, + "num_input_tokens_seen": 67869945, + "step": 3144, + "time_per_iteration": 2.8104894161224365 + }, + { + "auxiliary_loss_clip": 0.01125398, + "auxiliary_loss_mlp": 0.01044816, + "balance_loss_clip": 1.05102479, + "balance_loss_mlp": 1.02492023, + "epoch": 0.18908762964076356, + "flos": 24243904552320.0, + "grad_norm": 2.5973078221757144, + "language_loss": 0.73390597, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.75560808, + "num_input_tokens_seen": 67890240, + "step": 3145, + "time_per_iteration": 2.715609312057495 + }, + { + "auxiliary_loss_clip": 0.01110308, + "auxiliary_loss_mlp": 0.01042542, + "balance_loss_clip": 1.04543984, + "balance_loss_mlp": 1.02446938, + "epoch": 0.18914775289343153, + "flos": 16982910892800.0, + "grad_norm": 2.915733862437625, + "language_loss": 0.76263785, + "learning_rate": 3.740139487448616e-06, + "loss": 0.78416634, + "num_input_tokens_seen": 67907825, + "step": 3146, + "time_per_iteration": 2.777221202850342 + }, + { + "auxiliary_loss_clip": 0.01092807, + "auxiliary_loss_mlp": 0.01049336, + "balance_loss_clip": 1.04319823, + "balance_loss_mlp": 1.02829611, + "epoch": 0.1892078761460995, + "flos": 21543781334400.0, + "grad_norm": 1.988128972125699, + "language_loss": 0.7837925, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.80521393, + "num_input_tokens_seen": 67926670, + "step": 3147, + "time_per_iteration": 2.8039205074310303 + }, + { + "auxiliary_loss_clip": 0.01143577, + "auxiliary_loss_mlp": 0.01042953, + "balance_loss_clip": 1.0548687, + "balance_loss_mlp": 1.02454758, + "epoch": 0.18926799939876748, + "flos": 23001538896000.0, + "grad_norm": 3.932544798883504, + "language_loss": 0.67477876, + "learning_rate": 3.739755401854267e-06, + "loss": 0.69664401, + "num_input_tokens_seen": 67943645, + "step": 3148, + "time_per_iteration": 2.7273359298706055 + }, + { + "auxiliary_loss_clip": 0.01112331, + "auxiliary_loss_mlp": 0.01039139, + "balance_loss_clip": 1.04617155, + "balance_loss_mlp": 1.02014899, + "epoch": 0.18932812265143545, + "flos": 22273019251200.0, + "grad_norm": 2.9848849244070315, + "language_loss": 0.76207471, + "learning_rate": 3.739563260095902e-06, + "loss": 0.78358936, + "num_input_tokens_seen": 67962345, + "step": 3149, + "time_per_iteration": 2.8031978607177734 + }, + { + "auxiliary_loss_clip": 0.01130375, + "auxiliary_loss_mlp": 0.01045773, + "balance_loss_clip": 1.05438852, + "balance_loss_mlp": 1.02797484, + "epoch": 0.1893882459041034, + "flos": 18624423456000.0, + "grad_norm": 2.3661599820320136, + "language_loss": 0.80378366, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.82554519, + "num_input_tokens_seen": 67979760, + "step": 3150, + "time_per_iteration": 2.7836129665374756 + }, + { + "auxiliary_loss_clip": 0.01137112, + "auxiliary_loss_mlp": 0.0104876, + "balance_loss_clip": 1.0528239, + "balance_loss_mlp": 1.03019929, + "epoch": 0.18944836915677138, + "flos": 22892981016960.0, + "grad_norm": 2.0711129864945956, + "language_loss": 0.85251844, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.87437713, + "num_input_tokens_seen": 67996895, + "step": 3151, + "time_per_iteration": 2.7782201766967773 + }, + { + "auxiliary_loss_clip": 0.01121267, + "auxiliary_loss_mlp": 0.01046776, + "balance_loss_clip": 1.05223882, + "balance_loss_mlp": 1.02839363, + "epoch": 0.18950849240943934, + "flos": 26796542526720.0, + "grad_norm": 2.1337439707996673, + "language_loss": 0.74114192, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.76282233, + "num_input_tokens_seen": 68018365, + "step": 3152, + "time_per_iteration": 2.8767755031585693 + }, + { + "auxiliary_loss_clip": 0.01120312, + "auxiliary_loss_mlp": 0.0104438, + "balance_loss_clip": 1.05119991, + "balance_loss_mlp": 1.02463925, + "epoch": 0.1895686156621073, + "flos": 24971239048320.0, + "grad_norm": 1.9471461777193173, + "language_loss": 0.75520492, + "learning_rate": 3.738794033491209e-06, + "loss": 0.77685189, + "num_input_tokens_seen": 68037985, + "step": 3153, + "time_per_iteration": 2.7722980976104736 + }, + { + "auxiliary_loss_clip": 0.01158287, + "auxiliary_loss_mlp": 0.01049678, + "balance_loss_clip": 1.0559293, + "balance_loss_mlp": 1.03102183, + "epoch": 0.1896287389147753, + "flos": 21944544353280.0, + "grad_norm": 2.099749434473157, + "language_loss": 0.79984629, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.82192594, + "num_input_tokens_seen": 68057975, + "step": 3154, + "time_per_iteration": 2.6530587673187256 + }, + { + "auxiliary_loss_clip": 0.01117992, + "auxiliary_loss_mlp": 0.01056707, + "balance_loss_clip": 1.04851115, + "balance_loss_mlp": 1.03536844, + "epoch": 0.18968886216744327, + "flos": 18179058723840.0, + "grad_norm": 3.210440214164498, + "language_loss": 0.73046303, + "learning_rate": 3.738409024548223e-06, + "loss": 0.75220996, + "num_input_tokens_seen": 68074175, + "step": 3155, + "time_per_iteration": 2.729832410812378 + }, + { + "auxiliary_loss_clip": 0.01126019, + "auxiliary_loss_mlp": 0.01045659, + "balance_loss_clip": 1.05104291, + "balance_loss_mlp": 1.02626419, + "epoch": 0.18974898542011123, + "flos": 20412487509120.0, + "grad_norm": 1.8299076145086866, + "language_loss": 0.73869717, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.76041389, + "num_input_tokens_seen": 68095230, + "step": 3156, + "time_per_iteration": 2.6747231483459473 + }, + { + "auxiliary_loss_clip": 0.01156549, + "auxiliary_loss_mlp": 0.0104418, + "balance_loss_clip": 1.05489409, + "balance_loss_mlp": 1.02645326, + "epoch": 0.1898091086727792, + "flos": 23985024255360.0, + "grad_norm": 1.9629652277148564, + "language_loss": 0.68053937, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.70254672, + "num_input_tokens_seen": 68113805, + "step": 3157, + "time_per_iteration": 2.7092478275299072 + }, + { + "auxiliary_loss_clip": 0.01114914, + "auxiliary_loss_mlp": 0.01044181, + "balance_loss_clip": 1.04805827, + "balance_loss_mlp": 1.02533436, + "epoch": 0.18986923192544716, + "flos": 27637067756160.0, + "grad_norm": 1.7829025355963362, + "language_loss": 0.79893303, + "learning_rate": 3.737831016747176e-06, + "loss": 0.82052404, + "num_input_tokens_seen": 68133190, + "step": 3158, + "time_per_iteration": 2.7921364307403564 + }, + { + "auxiliary_loss_clip": 0.01163231, + "auxiliary_loss_mlp": 0.01049502, + "balance_loss_clip": 1.05787683, + "balance_loss_mlp": 1.02923679, + "epoch": 0.18992935517811513, + "flos": 25484151306240.0, + "grad_norm": 1.856283461980025, + "language_loss": 0.72348613, + "learning_rate": 3.737638215672964e-06, + "loss": 0.74561346, + "num_input_tokens_seen": 68152330, + "step": 3159, + "time_per_iteration": 2.6111273765563965 + }, + { + "auxiliary_loss_clip": 0.01149613, + "auxiliary_loss_mlp": 0.01053808, + "balance_loss_clip": 1.05840325, + "balance_loss_mlp": 1.03386414, + "epoch": 0.1899894784307831, + "flos": 17420805596160.0, + "grad_norm": 2.2573250756933647, + "language_loss": 0.84977192, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.87180614, + "num_input_tokens_seen": 68170185, + "step": 3160, + "time_per_iteration": 2.659259796142578 + }, + { + "auxiliary_loss_clip": 0.01129342, + "auxiliary_loss_mlp": 0.01049909, + "balance_loss_clip": 1.05297387, + "balance_loss_mlp": 1.03289795, + "epoch": 0.19004960168345109, + "flos": 27492240119040.0, + "grad_norm": 2.752358611011079, + "language_loss": 0.73407793, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.7558704, + "num_input_tokens_seen": 68191665, + "step": 3161, + "time_per_iteration": 2.784040689468384 + }, + { + "auxiliary_loss_clip": 0.01139858, + "auxiliary_loss_mlp": 0.0105519, + "balance_loss_clip": 1.05456805, + "balance_loss_mlp": 1.03476942, + "epoch": 0.19010972493611905, + "flos": 38654676385920.0, + "grad_norm": 1.6629026055958476, + "language_loss": 0.8115741, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.83352458, + "num_input_tokens_seen": 68214635, + "step": 3162, + "time_per_iteration": 2.9375386238098145 + }, + { + "auxiliary_loss_clip": 0.01157449, + "auxiliary_loss_mlp": 0.01040035, + "balance_loss_clip": 1.05625844, + "balance_loss_mlp": 1.02062798, + "epoch": 0.19016984818878702, + "flos": 19244744357760.0, + "grad_norm": 2.448016750033594, + "language_loss": 0.75615001, + "learning_rate": 3.73686635253511e-06, + "loss": 0.77812481, + "num_input_tokens_seen": 68232150, + "step": 3163, + "time_per_iteration": 2.7344541549682617 + }, + { + "auxiliary_loss_clip": 0.0110099, + "auxiliary_loss_mlp": 0.01050093, + "balance_loss_clip": 1.050578, + "balance_loss_mlp": 1.02880192, + "epoch": 0.19022997144145498, + "flos": 37596891744000.0, + "grad_norm": 2.2644227245470514, + "language_loss": 0.74093997, + "learning_rate": 3.736673222076982e-06, + "loss": 0.76245081, + "num_input_tokens_seen": 68253370, + "step": 3164, + "time_per_iteration": 2.9165730476379395 + }, + { + "auxiliary_loss_clip": 0.01141317, + "auxiliary_loss_mlp": 0.01038043, + "balance_loss_clip": 1.05518687, + "balance_loss_mlp": 1.0195303, + "epoch": 0.19029009469412295, + "flos": 61530921665280.0, + "grad_norm": 1.5484522746055986, + "language_loss": 0.66844344, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.69023699, + "num_input_tokens_seen": 68278895, + "step": 3165, + "time_per_iteration": 3.006096124649048 + }, + { + "auxiliary_loss_clip": 0.01146225, + "auxiliary_loss_mlp": 0.0104856, + "balance_loss_clip": 1.05512285, + "balance_loss_mlp": 1.02848506, + "epoch": 0.1903502179467909, + "flos": 13954851480960.0, + "grad_norm": 2.8598536292657144, + "language_loss": 0.74239767, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.76434553, + "num_input_tokens_seen": 68294880, + "step": 3166, + "time_per_iteration": 2.678844928741455 + }, + { + "auxiliary_loss_clip": 0.01050093, + "auxiliary_loss_mlp": 0.01014959, + "balance_loss_clip": 1.04342103, + "balance_loss_mlp": 1.01201403, + "epoch": 0.1904103411994589, + "flos": 66899641916160.0, + "grad_norm": 0.7754190343967906, + "language_loss": 0.50311053, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52376103, + "num_input_tokens_seen": 68359665, + "step": 3167, + "time_per_iteration": 3.277529239654541 + }, + { + "auxiliary_loss_clip": 0.01138483, + "auxiliary_loss_mlp": 0.01051348, + "balance_loss_clip": 1.05485487, + "balance_loss_mlp": 1.03293037, + "epoch": 0.19047046445212687, + "flos": 21908741472000.0, + "grad_norm": 2.3487387451986192, + "language_loss": 0.74504036, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.76693863, + "num_input_tokens_seen": 68378950, + "step": 3168, + "time_per_iteration": 2.690995216369629 + }, + { + "auxiliary_loss_clip": 0.01040165, + "auxiliary_loss_mlp": 0.01023518, + "balance_loss_clip": 1.03869283, + "balance_loss_mlp": 1.02085996, + "epoch": 0.19053058770479483, + "flos": 59255156701440.0, + "grad_norm": 0.8605055473788603, + "language_loss": 0.60079956, + "learning_rate": 3.73570658211056e-06, + "loss": 0.62143636, + "num_input_tokens_seen": 68434235, + "step": 3169, + "time_per_iteration": 3.2108101844787598 + }, + { + "auxiliary_loss_clip": 0.01103792, + "auxiliary_loss_mlp": 0.01056606, + "balance_loss_clip": 1.05267787, + "balance_loss_mlp": 1.03741288, + "epoch": 0.1905907109574628, + "flos": 23951304362880.0, + "grad_norm": 1.5575975614891868, + "language_loss": 0.78179795, + "learning_rate": 3.735513056633436e-06, + "loss": 0.80340189, + "num_input_tokens_seen": 68453830, + "step": 3170, + "time_per_iteration": 2.832043409347534 + }, + { + "auxiliary_loss_clip": 0.01142047, + "auxiliary_loss_mlp": 0.01045041, + "balance_loss_clip": 1.05325115, + "balance_loss_mlp": 1.02605128, + "epoch": 0.19065083421013077, + "flos": 20812316774400.0, + "grad_norm": 1.7671932984988854, + "language_loss": 0.78177166, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.80364257, + "num_input_tokens_seen": 68473005, + "step": 3171, + "time_per_iteration": 2.7823612689971924 + }, + { + "auxiliary_loss_clip": 0.01158227, + "auxiliary_loss_mlp": 0.01047345, + "balance_loss_clip": 1.05499291, + "balance_loss_mlp": 1.0285697, + "epoch": 0.19071095746279873, + "flos": 31284981192960.0, + "grad_norm": 2.1976685633770905, + "language_loss": 0.77953529, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.80159104, + "num_input_tokens_seen": 68493470, + "step": 3172, + "time_per_iteration": 2.746279001235962 + }, + { + "auxiliary_loss_clip": 0.01145112, + "auxiliary_loss_mlp": 0.01055334, + "balance_loss_clip": 1.05438328, + "balance_loss_mlp": 1.03703523, + "epoch": 0.1907710807154667, + "flos": 14356117290240.0, + "grad_norm": 1.5258786569967644, + "language_loss": 0.80223799, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.82424247, + "num_input_tokens_seen": 68511290, + "step": 3173, + "time_per_iteration": 2.7396810054779053 + }, + { + "auxiliary_loss_clip": 0.01113266, + "auxiliary_loss_mlp": 0.00778142, + "balance_loss_clip": 1.04967713, + "balance_loss_mlp": 1.00094676, + "epoch": 0.1908312039681347, + "flos": 26907039740160.0, + "grad_norm": 1.5341307852526682, + "language_loss": 0.78495061, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.80386466, + "num_input_tokens_seen": 68532575, + "step": 3174, + "time_per_iteration": 2.8579304218292236 + }, + { + "auxiliary_loss_clip": 0.01106714, + "auxiliary_loss_mlp": 0.01047557, + "balance_loss_clip": 1.04928994, + "balance_loss_mlp": 1.02838778, + "epoch": 0.19089132722080265, + "flos": 14494695960960.0, + "grad_norm": 1.8075853216546063, + "language_loss": 0.81067109, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.83221382, + "num_input_tokens_seen": 68548760, + "step": 3175, + "time_per_iteration": 2.718254804611206 + }, + { + "auxiliary_loss_clip": 0.01080497, + "auxiliary_loss_mlp": 0.01053652, + "balance_loss_clip": 1.04361629, + "balance_loss_mlp": 1.0342685, + "epoch": 0.19095145047347062, + "flos": 13952876232960.0, + "grad_norm": 2.2545261224105873, + "language_loss": 0.85529047, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.87663192, + "num_input_tokens_seen": 68563100, + "step": 3176, + "time_per_iteration": 4.2962729930877686 + }, + { + "auxiliary_loss_clip": 0.0113361, + "auxiliary_loss_mlp": 0.01059849, + "balance_loss_clip": 1.05418086, + "balance_loss_mlp": 1.03928506, + "epoch": 0.19101157372613858, + "flos": 25301832848640.0, + "grad_norm": 2.0896270593066832, + "language_loss": 0.813025, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.83495957, + "num_input_tokens_seen": 68581650, + "step": 3177, + "time_per_iteration": 2.815127372741699 + }, + { + "auxiliary_loss_clip": 0.01122377, + "auxiliary_loss_mlp": 0.01044946, + "balance_loss_clip": 1.0482533, + "balance_loss_mlp": 1.0265398, + "epoch": 0.19107169697880655, + "flos": 20558212986240.0, + "grad_norm": 2.67963335978105, + "language_loss": 0.7530241, + "learning_rate": 3.73396248424356e-06, + "loss": 0.7746973, + "num_input_tokens_seen": 68600360, + "step": 3178, + "time_per_iteration": 4.351228475570679 + }, + { + "auxiliary_loss_clip": 0.01146729, + "auxiliary_loss_mlp": 0.01042476, + "balance_loss_clip": 1.05574143, + "balance_loss_mlp": 1.02458286, + "epoch": 0.19113182023147451, + "flos": 22163204396160.0, + "grad_norm": 4.753014277211421, + "language_loss": 0.81381619, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.83570826, + "num_input_tokens_seen": 68617885, + "step": 3179, + "time_per_iteration": 4.259284019470215 + }, + { + "auxiliary_loss_clip": 0.01147837, + "auxiliary_loss_mlp": 0.01048144, + "balance_loss_clip": 1.05645823, + "balance_loss_mlp": 1.0291661, + "epoch": 0.19119194348414248, + "flos": 18581796990720.0, + "grad_norm": 2.753081884541086, + "language_loss": 0.79384613, + "learning_rate": 3.733574183478691e-06, + "loss": 0.81580591, + "num_input_tokens_seen": 68634550, + "step": 3180, + "time_per_iteration": 2.6609203815460205 + }, + { + "auxiliary_loss_clip": 0.01129361, + "auxiliary_loss_mlp": 0.0105402, + "balance_loss_clip": 1.05249727, + "balance_loss_mlp": 1.03445804, + "epoch": 0.19125206673681047, + "flos": 19026623018880.0, + "grad_norm": 2.660238694189741, + "language_loss": 0.79517245, + "learning_rate": 3.733379934486615e-06, + "loss": 0.81700623, + "num_input_tokens_seen": 68651895, + "step": 3181, + "time_per_iteration": 2.6877176761627197 + }, + { + "auxiliary_loss_clip": 0.0114301, + "auxiliary_loss_mlp": 0.01053621, + "balance_loss_clip": 1.05339336, + "balance_loss_mlp": 1.03527462, + "epoch": 0.19131218998947844, + "flos": 21690153256320.0, + "grad_norm": 2.2179888965480243, + "language_loss": 0.74570775, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.76767409, + "num_input_tokens_seen": 68671500, + "step": 3182, + "time_per_iteration": 4.2829508781433105 + }, + { + "auxiliary_loss_clip": 0.01128679, + "auxiliary_loss_mlp": 0.01044063, + "balance_loss_clip": 1.05578041, + "balance_loss_mlp": 1.02575254, + "epoch": 0.1913723132421464, + "flos": 18442500048000.0, + "grad_norm": 1.7534728284311585, + "language_loss": 0.64618582, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.66791326, + "num_input_tokens_seen": 68690570, + "step": 3183, + "time_per_iteration": 2.7652854919433594 + }, + { + "auxiliary_loss_clip": 0.01132257, + "auxiliary_loss_mlp": 0.01050867, + "balance_loss_clip": 1.0512805, + "balance_loss_mlp": 1.0311259, + "epoch": 0.19143243649481437, + "flos": 27160102033920.0, + "grad_norm": 1.555926798692704, + "language_loss": 0.73459226, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.75642347, + "num_input_tokens_seen": 68709735, + "step": 3184, + "time_per_iteration": 2.6929056644439697 + }, + { + "auxiliary_loss_clip": 0.01122578, + "auxiliary_loss_mlp": 0.01054123, + "balance_loss_clip": 1.05015373, + "balance_loss_mlp": 1.03347623, + "epoch": 0.19149255974748233, + "flos": 21718952985600.0, + "grad_norm": 2.0989643169058514, + "language_loss": 0.87983418, + "learning_rate": 3.732602281292598e-06, + "loss": 0.9016012, + "num_input_tokens_seen": 68727565, + "step": 3185, + "time_per_iteration": 2.6859230995178223 + }, + { + "auxiliary_loss_clip": 0.01153787, + "auxiliary_loss_mlp": 0.01044436, + "balance_loss_clip": 1.05334914, + "balance_loss_mlp": 1.02505302, + "epoch": 0.1915526830001503, + "flos": 22963293889920.0, + "grad_norm": 2.4520480945942587, + "language_loss": 0.73240852, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.75439072, + "num_input_tokens_seen": 68748110, + "step": 3186, + "time_per_iteration": 2.6398978233337402 + }, + { + "auxiliary_loss_clip": 0.01132874, + "auxiliary_loss_mlp": 0.01044989, + "balance_loss_clip": 1.05609488, + "balance_loss_mlp": 1.02379346, + "epoch": 0.1916128062528183, + "flos": 26140741966080.0, + "grad_norm": 2.739457234253781, + "language_loss": 0.83550584, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.85728443, + "num_input_tokens_seen": 68769765, + "step": 3187, + "time_per_iteration": 2.7476372718811035 + }, + { + "auxiliary_loss_clip": 0.01076264, + "auxiliary_loss_mlp": 0.01021317, + "balance_loss_clip": 1.04604995, + "balance_loss_mlp": 1.01892138, + "epoch": 0.19167292950548626, + "flos": 54925767457920.0, + "grad_norm": 0.8659386797819415, + "language_loss": 0.55824959, + "learning_rate": 3.732018351516544e-06, + "loss": 0.57922542, + "num_input_tokens_seen": 68826815, + "step": 3188, + "time_per_iteration": 3.2144031524658203 + }, + { + "auxiliary_loss_clip": 0.01139007, + "auxiliary_loss_mlp": 0.01054399, + "balance_loss_clip": 1.054564, + "balance_loss_mlp": 1.03537333, + "epoch": 0.19173305275815422, + "flos": 29935601942400.0, + "grad_norm": 2.2897904709915573, + "language_loss": 0.69839454, + "learning_rate": 3.731823576891397e-06, + "loss": 0.72032857, + "num_input_tokens_seen": 68847585, + "step": 3189, + "time_per_iteration": 2.7998950481414795 + }, + { + "auxiliary_loss_clip": 0.01118438, + "auxiliary_loss_mlp": 0.01038566, + "balance_loss_clip": 1.04930174, + "balance_loss_mlp": 1.02116132, + "epoch": 0.1917931760108222, + "flos": 24752471264640.0, + "grad_norm": 2.362312815249866, + "language_loss": 0.74320328, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.76477331, + "num_input_tokens_seen": 68866620, + "step": 3190, + "time_per_iteration": 2.7386670112609863 + }, + { + "auxiliary_loss_clip": 0.01111071, + "auxiliary_loss_mlp": 0.0106718, + "balance_loss_clip": 1.04946983, + "balance_loss_mlp": 1.04702199, + "epoch": 0.19185329926349015, + "flos": 18843550375680.0, + "grad_norm": 3.545467698458187, + "language_loss": 0.8444041, + "learning_rate": 3.73143383063572e-06, + "loss": 0.8661865, + "num_input_tokens_seen": 68885515, + "step": 3191, + "time_per_iteration": 2.7025794982910156 + }, + { + "auxiliary_loss_clip": 0.01127894, + "auxiliary_loss_mlp": 0.01039849, + "balance_loss_clip": 1.05251908, + "balance_loss_mlp": 1.02231336, + "epoch": 0.19191342251615812, + "flos": 22086858038400.0, + "grad_norm": 2.0663841109071526, + "language_loss": 0.89985192, + "learning_rate": 3.73123885901997e-06, + "loss": 0.92152941, + "num_input_tokens_seen": 68903225, + "step": 3192, + "time_per_iteration": 2.802852153778076 + }, + { + "auxiliary_loss_clip": 0.01130336, + "auxiliary_loss_mlp": 0.01054766, + "balance_loss_clip": 1.05716372, + "balance_loss_mlp": 1.03509688, + "epoch": 0.19197354576882608, + "flos": 22199115018240.0, + "grad_norm": 2.3467564445058775, + "language_loss": 0.75159264, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.77344358, + "num_input_tokens_seen": 68922860, + "step": 3193, + "time_per_iteration": 2.7680914402008057 + }, + { + "auxiliary_loss_clip": 0.01128303, + "auxiliary_loss_mlp": 0.00777332, + "balance_loss_clip": 1.05222785, + "balance_loss_mlp": 1.00071752, + "epoch": 0.19203366902149407, + "flos": 24896185580160.0, + "grad_norm": 2.078743387775855, + "language_loss": 0.75189757, + "learning_rate": 3.730848718849612e-06, + "loss": 0.77095383, + "num_input_tokens_seen": 68943000, + "step": 3194, + "time_per_iteration": 2.7537553310394287 + }, + { + "auxiliary_loss_clip": 0.01068142, + "auxiliary_loss_mlp": 0.01004387, + "balance_loss_clip": 1.03910232, + "balance_loss_mlp": 1.00182378, + "epoch": 0.19209379227416204, + "flos": 68416722789120.0, + "grad_norm": 0.7955224937316553, + "language_loss": 0.68507159, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.70579696, + "num_input_tokens_seen": 69000255, + "step": 3195, + "time_per_iteration": 3.117191791534424 + }, + { + "auxiliary_loss_clip": 0.01116081, + "auxiliary_loss_mlp": 0.01052392, + "balance_loss_clip": 1.05205238, + "balance_loss_mlp": 1.0320189, + "epoch": 0.19215391552683, + "flos": 22055185221120.0, + "grad_norm": 2.6559439291645757, + "language_loss": 0.73141015, + "learning_rate": 3.730458316143429e-06, + "loss": 0.75309479, + "num_input_tokens_seen": 69019665, + "step": 3196, + "time_per_iteration": 2.7234303951263428 + }, + { + "auxiliary_loss_clip": 0.01139018, + "auxiliary_loss_mlp": 0.01044947, + "balance_loss_clip": 1.06151462, + "balance_loss_mlp": 1.02596927, + "epoch": 0.19221403877949797, + "flos": 20302959962880.0, + "grad_norm": 3.0997718824135734, + "language_loss": 0.83654135, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.85838103, + "num_input_tokens_seen": 69039055, + "step": 3197, + "time_per_iteration": 2.72575306892395 + }, + { + "auxiliary_loss_clip": 0.01086216, + "auxiliary_loss_mlp": 0.01055059, + "balance_loss_clip": 1.04615641, + "balance_loss_mlp": 1.03320754, + "epoch": 0.19227416203216594, + "flos": 23185329811200.0, + "grad_norm": 2.2465298420006383, + "language_loss": 0.80656433, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.82797706, + "num_input_tokens_seen": 69056370, + "step": 3198, + "time_per_iteration": 2.741678237915039 + }, + { + "auxiliary_loss_clip": 0.01135487, + "auxiliary_loss_mlp": 0.01056572, + "balance_loss_clip": 1.05502987, + "balance_loss_mlp": 1.03655636, + "epoch": 0.1923342852848339, + "flos": 25776607841280.0, + "grad_norm": 1.9205907836873994, + "language_loss": 0.78993976, + "learning_rate": 3.729872219959029e-06, + "loss": 0.81186032, + "num_input_tokens_seen": 69075915, + "step": 3199, + "time_per_iteration": 2.7821297645568848 + }, + { + "auxiliary_loss_clip": 0.01116808, + "auxiliary_loss_mlp": 0.01056964, + "balance_loss_clip": 1.05010581, + "balance_loss_mlp": 1.036412, + "epoch": 0.19239440853750187, + "flos": 17128349061120.0, + "grad_norm": 3.662083840248298, + "language_loss": 0.83574522, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.85748297, + "num_input_tokens_seen": 69094145, + "step": 3200, + "time_per_iteration": 2.7095022201538086 + }, + { + "auxiliary_loss_clip": 0.01159025, + "auxiliary_loss_mlp": 0.01048823, + "balance_loss_clip": 1.05997193, + "balance_loss_mlp": 1.03060746, + "epoch": 0.19245453179016986, + "flos": 16435093593600.0, + "grad_norm": 1.9278966392289572, + "language_loss": 0.79092836, + "learning_rate": 3.729481161172443e-06, + "loss": 0.81300688, + "num_input_tokens_seen": 69111110, + "step": 3201, + "time_per_iteration": 2.684979200363159 + }, + { + "auxiliary_loss_clip": 0.01103349, + "auxiliary_loss_mlp": 0.01053366, + "balance_loss_clip": 1.04825675, + "balance_loss_mlp": 1.03418541, + "epoch": 0.19251465504283782, + "flos": 20230276792320.0, + "grad_norm": 2.4062417134527645, + "language_loss": 0.69276404, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.71433127, + "num_input_tokens_seen": 69130280, + "step": 3202, + "time_per_iteration": 2.8284943103790283 + }, + { + "auxiliary_loss_clip": 0.01132334, + "auxiliary_loss_mlp": 0.01041011, + "balance_loss_clip": 1.05389905, + "balance_loss_mlp": 1.02256894, + "epoch": 0.1925747782955058, + "flos": 19464374067840.0, + "grad_norm": 1.9491265782204168, + "language_loss": 0.91396749, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.93570089, + "num_input_tokens_seen": 69149570, + "step": 3203, + "time_per_iteration": 2.802433729171753 + }, + { + "auxiliary_loss_clip": 0.0114953, + "auxiliary_loss_mlp": 0.01049732, + "balance_loss_clip": 1.05674863, + "balance_loss_mlp": 1.02959776, + "epoch": 0.19263490154817375, + "flos": 17785586165760.0, + "grad_norm": 5.05881669068558, + "language_loss": 0.81689429, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.83888692, + "num_input_tokens_seen": 69168190, + "step": 3204, + "time_per_iteration": 2.671285629272461 + }, + { + "auxiliary_loss_clip": 0.01116988, + "auxiliary_loss_mlp": 0.01048941, + "balance_loss_clip": 1.04950142, + "balance_loss_mlp": 1.0298202, + "epoch": 0.19269502480084172, + "flos": 17457075354240.0, + "grad_norm": 2.296941025186916, + "language_loss": 0.76167846, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.78333771, + "num_input_tokens_seen": 69186950, + "step": 3205, + "time_per_iteration": 2.8654470443725586 + }, + { + "auxiliary_loss_clip": 0.01140852, + "auxiliary_loss_mlp": 0.01046651, + "balance_loss_clip": 1.05839586, + "balance_loss_mlp": 1.02749407, + "epoch": 0.19275514805350968, + "flos": 21506901045120.0, + "grad_norm": 3.761768843322395, + "language_loss": 0.83394569, + "learning_rate": 3.728502366649107e-06, + "loss": 0.85582072, + "num_input_tokens_seen": 69204850, + "step": 3206, + "time_per_iteration": 2.8610613346099854 + }, + { + "auxiliary_loss_clip": 0.0105715, + "auxiliary_loss_mlp": 0.01004055, + "balance_loss_clip": 1.03779244, + "balance_loss_mlp": 1.00174224, + "epoch": 0.19281527130617768, + "flos": 47695979738880.0, + "grad_norm": 0.8644529519848262, + "language_loss": 0.60561717, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62622917, + "num_input_tokens_seen": 69259200, + "step": 3207, + "time_per_iteration": 3.126537322998047 + }, + { + "auxiliary_loss_clip": 0.01120285, + "auxiliary_loss_mlp": 0.01045527, + "balance_loss_clip": 1.05201781, + "balance_loss_mlp": 1.02678764, + "epoch": 0.19287539455884564, + "flos": 11801252672640.0, + "grad_norm": 2.296187182186814, + "language_loss": 0.75463599, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.77629405, + "num_input_tokens_seen": 69275835, + "step": 3208, + "time_per_iteration": 2.6978750228881836 + }, + { + "auxiliary_loss_clip": 0.01150534, + "auxiliary_loss_mlp": 0.00777875, + "balance_loss_clip": 1.05520236, + "balance_loss_mlp": 1.00063884, + "epoch": 0.1929355178115136, + "flos": 20631434860800.0, + "grad_norm": 1.9483983315924505, + "language_loss": 0.60869855, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.62798262, + "num_input_tokens_seen": 69294810, + "step": 3209, + "time_per_iteration": 2.699798107147217 + }, + { + "auxiliary_loss_clip": 0.01158758, + "auxiliary_loss_mlp": 0.01053815, + "balance_loss_clip": 1.05472994, + "balance_loss_mlp": 1.03261995, + "epoch": 0.19299564106418157, + "flos": 40807916058240.0, + "grad_norm": 1.9992177661428934, + "language_loss": 0.80025005, + "learning_rate": 3.727718151176243e-06, + "loss": 0.82237577, + "num_input_tokens_seen": 69316065, + "step": 3210, + "time_per_iteration": 2.832665205001831 + }, + { + "auxiliary_loss_clip": 0.01118997, + "auxiliary_loss_mlp": 0.01047494, + "balance_loss_clip": 1.05044246, + "balance_loss_mlp": 1.02920699, + "epoch": 0.19305576431684954, + "flos": 11361418634880.0, + "grad_norm": 2.515510367397107, + "language_loss": 0.82571948, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.84738445, + "num_input_tokens_seen": 69332900, + "step": 3211, + "time_per_iteration": 2.7664191722869873 + }, + { + "auxiliary_loss_clip": 0.01073663, + "auxiliary_loss_mlp": 0.01002544, + "balance_loss_clip": 1.03501034, + "balance_loss_mlp": 1.00021982, + "epoch": 0.1931158875695175, + "flos": 54511895975040.0, + "grad_norm": 0.9633495631759209, + "language_loss": 0.63641912, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.6571812, + "num_input_tokens_seen": 69382535, + "step": 3212, + "time_per_iteration": 2.974940299987793 + }, + { + "auxiliary_loss_clip": 0.01131742, + "auxiliary_loss_mlp": 0.01044059, + "balance_loss_clip": 1.05586314, + "balance_loss_mlp": 1.02565336, + "epoch": 0.19317601082218547, + "flos": 19828436365440.0, + "grad_norm": 1.7209148950717332, + "language_loss": 0.76375663, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.78551459, + "num_input_tokens_seen": 69400600, + "step": 3213, + "time_per_iteration": 2.7898454666137695 + }, + { + "auxiliary_loss_clip": 0.01123196, + "auxiliary_loss_mlp": 0.0105066, + "balance_loss_clip": 1.05261111, + "balance_loss_mlp": 1.03116894, + "epoch": 0.19323613407485346, + "flos": 13152068467200.0, + "grad_norm": 2.349758973823363, + "language_loss": 0.70871878, + "learning_rate": 3.726932887459503e-06, + "loss": 0.73045731, + "num_input_tokens_seen": 69417350, + "step": 3214, + "time_per_iteration": 2.8155152797698975 + }, + { + "auxiliary_loss_clip": 0.01155585, + "auxiliary_loss_mlp": 0.01047831, + "balance_loss_clip": 1.05412841, + "balance_loss_mlp": 1.02807808, + "epoch": 0.19329625732752143, + "flos": 14027247342720.0, + "grad_norm": 2.190607045917922, + "language_loss": 0.75067955, + "learning_rate": 3.72673640779803e-06, + "loss": 0.77271378, + "num_input_tokens_seen": 69431845, + "step": 3215, + "time_per_iteration": 4.111938238143921 + }, + { + "auxiliary_loss_clip": 0.01112217, + "auxiliary_loss_mlp": 0.01049964, + "balance_loss_clip": 1.04928339, + "balance_loss_mlp": 1.0323447, + "epoch": 0.1933563805801894, + "flos": 23441732069760.0, + "grad_norm": 1.7842520268521305, + "language_loss": 0.88426638, + "learning_rate": 3.72653986265854e-06, + "loss": 0.9058882, + "num_input_tokens_seen": 69453275, + "step": 3216, + "time_per_iteration": 2.7699615955352783 + }, + { + "auxiliary_loss_clip": 0.01153806, + "auxiliary_loss_mlp": 0.01052131, + "balance_loss_clip": 1.05435801, + "balance_loss_mlp": 1.03442836, + "epoch": 0.19341650383285736, + "flos": 20485314334080.0, + "grad_norm": 1.6996051239972392, + "language_loss": 0.7974773, + "learning_rate": 3.726343252048485e-06, + "loss": 0.81953669, + "num_input_tokens_seen": 69471830, + "step": 3217, + "time_per_iteration": 2.6788718700408936 + }, + { + "auxiliary_loss_clip": 0.01143281, + "auxiliary_loss_mlp": 0.0104914, + "balance_loss_clip": 1.05695105, + "balance_loss_mlp": 1.02864754, + "epoch": 0.19347662708552532, + "flos": 17858484817920.0, + "grad_norm": 4.708784796317305, + "language_loss": 0.6161437, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.6380679, + "num_input_tokens_seen": 69489320, + "step": 3218, + "time_per_iteration": 4.352849960327148 + }, + { + "auxiliary_loss_clip": 0.01157355, + "auxiliary_loss_mlp": 0.01047211, + "balance_loss_clip": 1.05723107, + "balance_loss_mlp": 1.02873373, + "epoch": 0.1935367503381933, + "flos": 18187247024640.0, + "grad_norm": 1.9724785552136583, + "language_loss": 0.80345452, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.82550013, + "num_input_tokens_seen": 69506665, + "step": 3219, + "time_per_iteration": 4.1739161014556885 + }, + { + "auxiliary_loss_clip": 0.01104687, + "auxiliary_loss_mlp": 0.01047672, + "balance_loss_clip": 1.05145359, + "balance_loss_mlp": 1.02819324, + "epoch": 0.19359687359086128, + "flos": 15957122290560.0, + "grad_norm": 2.7508533279024077, + "language_loss": 0.85693008, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.87845367, + "num_input_tokens_seen": 69523835, + "step": 3220, + "time_per_iteration": 2.777284622192383 + }, + { + "auxiliary_loss_clip": 0.01149581, + "auxiliary_loss_mlp": 0.01041747, + "balance_loss_clip": 1.05441856, + "balance_loss_mlp": 1.02511764, + "epoch": 0.19365699684352924, + "flos": 21215198695680.0, + "grad_norm": 2.05545450883527, + "language_loss": 0.84637755, + "learning_rate": 3.725556155051766e-06, + "loss": 0.86829084, + "num_input_tokens_seen": 69542620, + "step": 3221, + "time_per_iteration": 4.224115371704102 + }, + { + "auxiliary_loss_clip": 0.01143661, + "auxiliary_loss_mlp": 0.01044558, + "balance_loss_clip": 1.05466259, + "balance_loss_mlp": 1.02730846, + "epoch": 0.1937171200961972, + "flos": 17311098481920.0, + "grad_norm": 2.658004231066563, + "language_loss": 0.86087942, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.8827616, + "num_input_tokens_seen": 69561130, + "step": 3222, + "time_per_iteration": 2.6400530338287354 + }, + { + "auxiliary_loss_clip": 0.01069453, + "auxiliary_loss_mlp": 0.01045281, + "balance_loss_clip": 1.04206085, + "balance_loss_mlp": 1.02599275, + "epoch": 0.19377724334886517, + "flos": 22635968227200.0, + "grad_norm": 1.8604116943694204, + "language_loss": 0.78510809, + "learning_rate": 3.72516221392398e-06, + "loss": 0.8062554, + "num_input_tokens_seen": 69580425, + "step": 3223, + "time_per_iteration": 2.9685652256011963 + }, + { + "auxiliary_loss_clip": 0.01146062, + "auxiliary_loss_mlp": 0.01046815, + "balance_loss_clip": 1.05697751, + "balance_loss_mlp": 1.02819431, + "epoch": 0.19383736660153314, + "flos": 15077813351040.0, + "grad_norm": 1.8958208586464897, + "language_loss": 0.75391948, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.77584827, + "num_input_tokens_seen": 69597085, + "step": 3224, + "time_per_iteration": 2.665294885635376 + }, + { + "auxiliary_loss_clip": 0.01102293, + "auxiliary_loss_mlp": 0.01050181, + "balance_loss_clip": 1.04728186, + "balance_loss_mlp": 1.02927208, + "epoch": 0.1938974898542011, + "flos": 47119934350080.0, + "grad_norm": 3.358076005999295, + "language_loss": 0.71180636, + "learning_rate": 3.7247680111229e-06, + "loss": 0.73333108, + "num_input_tokens_seen": 69618885, + "step": 3225, + "time_per_iteration": 2.997511863708496 + }, + { + "auxiliary_loss_clip": 0.0112035, + "auxiliary_loss_mlp": 0.01053167, + "balance_loss_clip": 1.0519309, + "balance_loss_mlp": 1.03480864, + "epoch": 0.19395761310686907, + "flos": 25812554376960.0, + "grad_norm": 2.42331686427639, + "language_loss": 0.69379079, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.71552593, + "num_input_tokens_seen": 69638200, + "step": 3226, + "time_per_iteration": 2.746338129043579 + }, + { + "auxiliary_loss_clip": 0.01126783, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.05692983, + "balance_loss_mlp": 1.02264214, + "epoch": 0.19401773635953706, + "flos": 23039604334080.0, + "grad_norm": 2.1006513764454864, + "language_loss": 0.76236808, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.78405869, + "num_input_tokens_seen": 69657550, + "step": 3227, + "time_per_iteration": 2.760087728500366 + }, + { + "auxiliary_loss_clip": 0.01117794, + "auxiliary_loss_mlp": 0.010438, + "balance_loss_clip": 1.05304587, + "balance_loss_mlp": 1.0256561, + "epoch": 0.19407785961220503, + "flos": 15920780705280.0, + "grad_norm": 2.8268368707906397, + "language_loss": 0.69577461, + "learning_rate": 3.724176216414662e-06, + "loss": 0.71739054, + "num_input_tokens_seen": 69675005, + "step": 3228, + "time_per_iteration": 2.6779348850250244 + }, + { + "auxiliary_loss_clip": 0.01148199, + "auxiliary_loss_mlp": 0.01042315, + "balance_loss_clip": 1.05775642, + "balance_loss_mlp": 1.02445757, + "epoch": 0.194137982864873, + "flos": 25921722787200.0, + "grad_norm": 1.7694943420266864, + "language_loss": 0.74160898, + "learning_rate": 3.72397882074007e-06, + "loss": 0.76351416, + "num_input_tokens_seen": 69696455, + "step": 3229, + "time_per_iteration": 2.7229623794555664 + }, + { + "auxiliary_loss_clip": 0.01119678, + "auxiliary_loss_mlp": 0.01044155, + "balance_loss_clip": 1.05435359, + "balance_loss_mlp": 1.0262022, + "epoch": 0.19419810611754096, + "flos": 13261344618240.0, + "grad_norm": 1.9766126324167548, + "language_loss": 0.65722096, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.67885935, + "num_input_tokens_seen": 69714245, + "step": 3230, + "time_per_iteration": 2.740324020385742 + }, + { + "auxiliary_loss_clip": 0.01124671, + "auxiliary_loss_mlp": 0.00776003, + "balance_loss_clip": 1.05223823, + "balance_loss_mlp": 1.00081468, + "epoch": 0.19425822937020892, + "flos": 15705568368000.0, + "grad_norm": 1.9307338208311895, + "language_loss": 0.82042694, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.83943367, + "num_input_tokens_seen": 69731515, + "step": 3231, + "time_per_iteration": 2.7453513145446777 + }, + { + "auxiliary_loss_clip": 0.0113141, + "auxiliary_loss_mlp": 0.01042332, + "balance_loss_clip": 1.05393946, + "balance_loss_mlp": 1.02220988, + "epoch": 0.1943183526228769, + "flos": 23105392093440.0, + "grad_norm": 10.866686758212083, + "language_loss": 0.87038374, + "learning_rate": 3.72338624150555e-06, + "loss": 0.89212114, + "num_input_tokens_seen": 69748885, + "step": 3232, + "time_per_iteration": 2.7575178146362305 + }, + { + "auxiliary_loss_clip": 0.01100451, + "auxiliary_loss_mlp": 0.01050878, + "balance_loss_clip": 1.05029583, + "balance_loss_mlp": 1.03102958, + "epoch": 0.19437847587554485, + "flos": 24712610146560.0, + "grad_norm": 2.531838729905544, + "language_loss": 0.85189134, + "learning_rate": 3.723188584382096e-06, + "loss": 0.87340462, + "num_input_tokens_seen": 69767540, + "step": 3233, + "time_per_iteration": 2.8617444038391113 + }, + { + "auxiliary_loss_clip": 0.01149478, + "auxiliary_loss_mlp": 0.01054519, + "balance_loss_clip": 1.0574832, + "balance_loss_mlp": 1.0357672, + "epoch": 0.19443859912821285, + "flos": 23116130259840.0, + "grad_norm": 1.7408859410354203, + "language_loss": 0.89099532, + "learning_rate": 3.722990861915158e-06, + "loss": 0.91303527, + "num_input_tokens_seen": 69789340, + "step": 3234, + "time_per_iteration": 2.7648239135742188 + }, + { + "auxiliary_loss_clip": 0.01135157, + "auxiliary_loss_mlp": 0.01044708, + "balance_loss_clip": 1.05003643, + "balance_loss_mlp": 1.02544403, + "epoch": 0.1944987223808808, + "flos": 15084385539840.0, + "grad_norm": 2.4074482975555926, + "language_loss": 0.78673434, + "learning_rate": 3.722793074112234e-06, + "loss": 0.80853301, + "num_input_tokens_seen": 69806470, + "step": 3235, + "time_per_iteration": 2.76930832862854 + }, + { + "auxiliary_loss_clip": 0.01136497, + "auxiliary_loss_mlp": 0.01046749, + "balance_loss_clip": 1.0580672, + "balance_loss_mlp": 1.0293448, + "epoch": 0.19455884563354878, + "flos": 17126876603520.0, + "grad_norm": 2.2511193258734354, + "language_loss": 0.79391634, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.81574875, + "num_input_tokens_seen": 69822655, + "step": 3236, + "time_per_iteration": 2.7060179710388184 + }, + { + "auxiliary_loss_clip": 0.01156991, + "auxiliary_loss_mlp": 0.01044638, + "balance_loss_clip": 1.05862045, + "balance_loss_mlp": 1.02482522, + "epoch": 0.19461896888621674, + "flos": 20193396503040.0, + "grad_norm": 2.1553329609131713, + "language_loss": 0.76224017, + "learning_rate": 3.72239730252843e-06, + "loss": 0.78425646, + "num_input_tokens_seen": 69841895, + "step": 3237, + "time_per_iteration": 2.642235040664673 + }, + { + "auxiliary_loss_clip": 0.01158804, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_clip": 1.05648041, + "balance_loss_mlp": 1.03289127, + "epoch": 0.1946790921388847, + "flos": 25301365971840.0, + "grad_norm": 1.5204653275468003, + "language_loss": 0.74828202, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.77038062, + "num_input_tokens_seen": 69862220, + "step": 3238, + "time_per_iteration": 2.6618688106536865 + }, + { + "auxiliary_loss_clip": 0.01108331, + "auxiliary_loss_mlp": 0.01046572, + "balance_loss_clip": 1.04992437, + "balance_loss_mlp": 1.02791595, + "epoch": 0.19473921539155267, + "flos": 20193396503040.0, + "grad_norm": 3.1324225641798518, + "language_loss": 0.734164, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.75571299, + "num_input_tokens_seen": 69881830, + "step": 3239, + "time_per_iteration": 2.7637152671813965 + }, + { + "auxiliary_loss_clip": 0.01132567, + "auxiliary_loss_mlp": 0.01047988, + "balance_loss_clip": 1.05458641, + "balance_loss_mlp": 1.02947509, + "epoch": 0.19479933864422067, + "flos": 20887549810560.0, + "grad_norm": 2.155392951393246, + "language_loss": 0.73291272, + "learning_rate": 3.721803155320412e-06, + "loss": 0.7547183, + "num_input_tokens_seen": 69900515, + "step": 3240, + "time_per_iteration": 2.6980888843536377 + }, + { + "auxiliary_loss_clip": 0.01131601, + "auxiliary_loss_mlp": 0.0103943, + "balance_loss_clip": 1.05846488, + "balance_loss_mlp": 1.02208555, + "epoch": 0.19485946189688863, + "flos": 23295072839040.0, + "grad_norm": 5.847648280625993, + "language_loss": 0.65809447, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.6798048, + "num_input_tokens_seen": 69920060, + "step": 3241, + "time_per_iteration": 2.659707546234131 + }, + { + "auxiliary_loss_clip": 0.01128971, + "auxiliary_loss_mlp": 0.01048707, + "balance_loss_clip": 1.05226684, + "balance_loss_mlp": 1.03039646, + "epoch": 0.1949195851495566, + "flos": 23295036925440.0, + "grad_norm": 1.4408225707306088, + "language_loss": 0.82747853, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.84925532, + "num_input_tokens_seen": 69939820, + "step": 3242, + "time_per_iteration": 2.7137632369995117 + }, + { + "auxiliary_loss_clip": 0.01077632, + "auxiliary_loss_mlp": 0.01014225, + "balance_loss_clip": 1.04083347, + "balance_loss_mlp": 1.01131678, + "epoch": 0.19497970840222456, + "flos": 64962871557120.0, + "grad_norm": 0.853263603243422, + "language_loss": 0.57500821, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59592682, + "num_input_tokens_seen": 70002145, + "step": 3243, + "time_per_iteration": 3.1446309089660645 + }, + { + "auxiliary_loss_clip": 0.01138548, + "auxiliary_loss_mlp": 0.01050428, + "balance_loss_clip": 1.05331421, + "balance_loss_mlp": 1.02988815, + "epoch": 0.19503983165489253, + "flos": 19644717277440.0, + "grad_norm": 7.2345723863132, + "language_loss": 0.83789021, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.85977995, + "num_input_tokens_seen": 70020510, + "step": 3244, + "time_per_iteration": 2.6194229125976562 + }, + { + "auxiliary_loss_clip": 0.01143261, + "auxiliary_loss_mlp": 0.01046223, + "balance_loss_clip": 1.05732584, + "balance_loss_mlp": 1.02869976, + "epoch": 0.1950999549075605, + "flos": 21141976821120.0, + "grad_norm": 2.0710390949438837, + "language_loss": 0.7739507, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.79584551, + "num_input_tokens_seen": 70040760, + "step": 3245, + "time_per_iteration": 2.6684374809265137 + }, + { + "auxiliary_loss_clip": 0.01142874, + "auxiliary_loss_mlp": 0.01043114, + "balance_loss_clip": 1.05566645, + "balance_loss_mlp": 1.02431464, + "epoch": 0.19516007816022846, + "flos": 20884820376960.0, + "grad_norm": 2.1010289547443133, + "language_loss": 0.83988321, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.86174309, + "num_input_tokens_seen": 70058720, + "step": 3246, + "time_per_iteration": 2.6595354080200195 + }, + { + "auxiliary_loss_clip": 0.0114599, + "auxiliary_loss_mlp": 0.00776442, + "balance_loss_clip": 1.05517101, + "balance_loss_mlp": 1.00080454, + "epoch": 0.19522020141289645, + "flos": 16910515031040.0, + "grad_norm": 3.3581015873305438, + "language_loss": 0.76840878, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.78763306, + "num_input_tokens_seen": 70076470, + "step": 3247, + "time_per_iteration": 2.7777793407440186 + }, + { + "auxiliary_loss_clip": 0.01121778, + "auxiliary_loss_mlp": 0.01043977, + "balance_loss_clip": 1.05689096, + "balance_loss_mlp": 1.02651262, + "epoch": 0.19528032466556441, + "flos": 26724829023360.0, + "grad_norm": 1.8981807103962522, + "language_loss": 0.75459039, + "learning_rate": 3.720215890515421e-06, + "loss": 0.77624786, + "num_input_tokens_seen": 70096220, + "step": 3248, + "time_per_iteration": 2.8088901042938232 + }, + { + "auxiliary_loss_clip": 0.01156017, + "auxiliary_loss_mlp": 0.01048303, + "balance_loss_clip": 1.05548215, + "balance_loss_mlp": 1.03008783, + "epoch": 0.19534044791823238, + "flos": 21032808410880.0, + "grad_norm": 2.7209722336942135, + "language_loss": 0.77774823, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.79979146, + "num_input_tokens_seen": 70114800, + "step": 3249, + "time_per_iteration": 2.610877752304077 + }, + { + "auxiliary_loss_clip": 0.01148434, + "auxiliary_loss_mlp": 0.01050332, + "balance_loss_clip": 1.05689144, + "balance_loss_mlp": 1.03299928, + "epoch": 0.19540057117090034, + "flos": 22344050396160.0, + "grad_norm": 1.5551573885822045, + "language_loss": 0.73118901, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.75317669, + "num_input_tokens_seen": 70134930, + "step": 3250, + "time_per_iteration": 2.5901567935943604 + }, + { + "auxiliary_loss_clip": 0.01101628, + "auxiliary_loss_mlp": 0.01046467, + "balance_loss_clip": 1.05080378, + "balance_loss_mlp": 1.02876413, + "epoch": 0.1954606944235683, + "flos": 20301631159680.0, + "grad_norm": 2.030501302548557, + "language_loss": 0.79203367, + "learning_rate": 3.719619589699017e-06, + "loss": 0.81351459, + "num_input_tokens_seen": 70152045, + "step": 3251, + "time_per_iteration": 2.6619749069213867 + }, + { + "auxiliary_loss_clip": 0.0115825, + "auxiliary_loss_mlp": 0.01044132, + "balance_loss_clip": 1.05741858, + "balance_loss_mlp": 1.02606022, + "epoch": 0.19552081767623627, + "flos": 17346865449600.0, + "grad_norm": 7.451515078679223, + "language_loss": 0.83871722, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.86074108, + "num_input_tokens_seen": 70169240, + "step": 3252, + "time_per_iteration": 2.5029656887054443 + }, + { + "auxiliary_loss_clip": 0.01142752, + "auxiliary_loss_mlp": 0.01057294, + "balance_loss_clip": 1.05278862, + "balance_loss_mlp": 1.03518057, + "epoch": 0.19558094092890424, + "flos": 31977626129280.0, + "grad_norm": 1.7140417843701068, + "language_loss": 0.73995864, + "learning_rate": 3.719221729768117e-06, + "loss": 0.76195908, + "num_input_tokens_seen": 70192690, + "step": 3253, + "time_per_iteration": 2.609117269515991 + }, + { + "auxiliary_loss_clip": 0.01102675, + "auxiliary_loss_mlp": 0.01046707, + "balance_loss_clip": 1.04759037, + "balance_loss_mlp": 1.02782381, + "epoch": 0.19564106418157223, + "flos": 22268889187200.0, + "grad_norm": 2.1302159220485675, + "language_loss": 0.76167047, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.78316426, + "num_input_tokens_seen": 70209685, + "step": 3254, + "time_per_iteration": 4.174965858459473 + }, + { + "auxiliary_loss_clip": 0.01043127, + "auxiliary_loss_mlp": 0.01006966, + "balance_loss_clip": 1.04737842, + "balance_loss_mlp": 1.0036757, + "epoch": 0.1957011874342402, + "flos": 54364554385920.0, + "grad_norm": 0.84452007287803, + "language_loss": 0.55275303, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.57325399, + "num_input_tokens_seen": 70265050, + "step": 3255, + "time_per_iteration": 3.2241716384887695 + }, + { + "auxiliary_loss_clip": 0.01133721, + "auxiliary_loss_mlp": 0.01041696, + "balance_loss_clip": 1.0557251, + "balance_loss_mlp": 1.02349281, + "epoch": 0.19576131068690816, + "flos": 16506699356160.0, + "grad_norm": 2.6103802859468392, + "language_loss": 0.70870697, + "learning_rate": 3.718624450942688e-06, + "loss": 0.73046112, + "num_input_tokens_seen": 70281830, + "step": 3256, + "time_per_iteration": 2.641296148300171 + }, + { + "auxiliary_loss_clip": 0.01152768, + "auxiliary_loss_mlp": 0.01042867, + "balance_loss_clip": 1.0544858, + "balance_loss_mlp": 1.02523613, + "epoch": 0.19582143393957613, + "flos": 14719676797440.0, + "grad_norm": 2.649319646209249, + "language_loss": 0.80722409, + "learning_rate": 3.718425227649987e-06, + "loss": 0.82918048, + "num_input_tokens_seen": 70297420, + "step": 3257, + "time_per_iteration": 4.258259057998657 + }, + { + "auxiliary_loss_clip": 0.01106644, + "auxiliary_loss_mlp": 0.01043385, + "balance_loss_clip": 1.05470431, + "balance_loss_mlp": 1.02601588, + "epoch": 0.1958815571922441, + "flos": 24425504737920.0, + "grad_norm": 6.015808523610408, + "language_loss": 0.75124931, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.77274966, + "num_input_tokens_seen": 70319210, + "step": 3258, + "time_per_iteration": 4.386433362960815 + }, + { + "auxiliary_loss_clip": 0.01082287, + "auxiliary_loss_mlp": 0.01044148, + "balance_loss_clip": 1.04533339, + "balance_loss_mlp": 1.0237875, + "epoch": 0.19594168044491206, + "flos": 24900279730560.0, + "grad_norm": 1.8034996675319444, + "language_loss": 0.73872411, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.75998843, + "num_input_tokens_seen": 70339045, + "step": 3259, + "time_per_iteration": 2.815469264984131 + }, + { + "auxiliary_loss_clip": 0.01131793, + "auxiliary_loss_mlp": 0.01043364, + "balance_loss_clip": 1.05167735, + "balance_loss_mlp": 1.02392125, + "epoch": 0.19600180369758005, + "flos": 12057008486400.0, + "grad_norm": 2.2096667980592, + "language_loss": 0.77053022, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.79228187, + "num_input_tokens_seen": 70356505, + "step": 3260, + "time_per_iteration": 4.2817702293396 + }, + { + "auxiliary_loss_clip": 0.01148118, + "auxiliary_loss_mlp": 0.01043761, + "balance_loss_clip": 1.0551343, + "balance_loss_mlp": 1.0248661, + "epoch": 0.19606192695024802, + "flos": 20850202644480.0, + "grad_norm": 5.605178759176999, + "language_loss": 0.82261205, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.84453082, + "num_input_tokens_seen": 70375410, + "step": 3261, + "time_per_iteration": 2.673092842102051 + }, + { + "auxiliary_loss_clip": 0.01121379, + "auxiliary_loss_mlp": 0.01044043, + "balance_loss_clip": 1.0550617, + "balance_loss_mlp": 1.02488637, + "epoch": 0.19612205020291598, + "flos": 28475509996800.0, + "grad_norm": 1.8492209450679535, + "language_loss": 0.76671481, + "learning_rate": 3.717428133894807e-06, + "loss": 0.78836906, + "num_input_tokens_seen": 70396315, + "step": 3262, + "time_per_iteration": 2.803938150405884 + }, + { + "auxiliary_loss_clip": 0.01148893, + "auxiliary_loss_mlp": 0.01047259, + "balance_loss_clip": 1.05960584, + "balance_loss_mlp": 1.02950907, + "epoch": 0.19618217345558395, + "flos": 25556618995200.0, + "grad_norm": 1.7278621785184562, + "language_loss": 0.8668195, + "learning_rate": 3.71722851973837e-06, + "loss": 0.88878107, + "num_input_tokens_seen": 70417945, + "step": 3263, + "time_per_iteration": 2.6677918434143066 + }, + { + "auxiliary_loss_clip": 0.0113123, + "auxiliary_loss_mlp": 0.01042546, + "balance_loss_clip": 1.05328059, + "balance_loss_mlp": 1.02505815, + "epoch": 0.1962422967082519, + "flos": 25264413855360.0, + "grad_norm": 3.447639973868791, + "language_loss": 0.73775035, + "learning_rate": 3.717028840464455e-06, + "loss": 0.75948811, + "num_input_tokens_seen": 70438690, + "step": 3264, + "time_per_iteration": 2.6973094940185547 + }, + { + "auxiliary_loss_clip": 0.01144053, + "auxiliary_loss_mlp": 0.01049918, + "balance_loss_clip": 1.05736756, + "balance_loss_mlp": 1.03223944, + "epoch": 0.19630241996091988, + "flos": 18807352444800.0, + "grad_norm": 2.4424358562200927, + "language_loss": 0.78513813, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.80707777, + "num_input_tokens_seen": 70455385, + "step": 3265, + "time_per_iteration": 2.625739336013794 + }, + { + "auxiliary_loss_clip": 0.01031434, + "auxiliary_loss_mlp": 0.01002481, + "balance_loss_clip": 1.03386986, + "balance_loss_mlp": 0.99983466, + "epoch": 0.19636254321358784, + "flos": 62321137896960.0, + "grad_norm": 0.7932330660809486, + "language_loss": 0.53389955, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55423868, + "num_input_tokens_seen": 70514280, + "step": 3266, + "time_per_iteration": 3.2586586475372314 + }, + { + "auxiliary_loss_clip": 0.01124628, + "auxiliary_loss_mlp": 0.00776501, + "balance_loss_clip": 1.04957044, + "balance_loss_mlp": 1.00080895, + "epoch": 0.19642266646625584, + "flos": 21069329564160.0, + "grad_norm": 2.0008611208986133, + "language_loss": 0.80109024, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.8201015, + "num_input_tokens_seen": 70531800, + "step": 3267, + "time_per_iteration": 2.678537368774414 + }, + { + "auxiliary_loss_clip": 0.01130982, + "auxiliary_loss_mlp": 0.01043983, + "balance_loss_clip": 1.05263019, + "balance_loss_mlp": 1.02660179, + "epoch": 0.1964827897189238, + "flos": 14538651229440.0, + "grad_norm": 1.9909459598185588, + "language_loss": 0.86758262, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.88933229, + "num_input_tokens_seen": 70550615, + "step": 3268, + "time_per_iteration": 2.6949849128723145 + }, + { + "auxiliary_loss_clip": 0.01099432, + "auxiliary_loss_mlp": 0.01041621, + "balance_loss_clip": 1.04954004, + "balance_loss_mlp": 1.02408528, + "epoch": 0.19654291297159177, + "flos": 19244636616960.0, + "grad_norm": 2.2632495429204127, + "language_loss": 0.68785441, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.70926493, + "num_input_tokens_seen": 70568690, + "step": 3269, + "time_per_iteration": 2.770078182220459 + }, + { + "auxiliary_loss_clip": 0.01116538, + "auxiliary_loss_mlp": 0.01052319, + "balance_loss_clip": 1.05113554, + "balance_loss_mlp": 1.03330541, + "epoch": 0.19660303622425973, + "flos": 25775710001280.0, + "grad_norm": 7.1863103423452355, + "language_loss": 0.80241841, + "learning_rate": 3.715829397778135e-06, + "loss": 0.82410699, + "num_input_tokens_seen": 70588665, + "step": 3270, + "time_per_iteration": 2.7294864654541016 + }, + { + "auxiliary_loss_clip": 0.01139501, + "auxiliary_loss_mlp": 0.01045694, + "balance_loss_clip": 1.05189824, + "balance_loss_mlp": 1.02833724, + "epoch": 0.1966631594769277, + "flos": 20595093275520.0, + "grad_norm": 1.9668649321541274, + "language_loss": 0.83912349, + "learning_rate": 3.715629262894028e-06, + "loss": 0.86097538, + "num_input_tokens_seen": 70606900, + "step": 3271, + "time_per_iteration": 2.640235662460327 + }, + { + "auxiliary_loss_clip": 0.01139368, + "auxiliary_loss_mlp": 0.01051303, + "balance_loss_clip": 1.05468225, + "balance_loss_mlp": 1.0332067, + "epoch": 0.19672328272959566, + "flos": 23623188600960.0, + "grad_norm": 1.9968416702279483, + "language_loss": 0.79902714, + "learning_rate": 3.715429062953087e-06, + "loss": 0.82093388, + "num_input_tokens_seen": 70625955, + "step": 3272, + "time_per_iteration": 2.636629343032837 + }, + { + "auxiliary_loss_clip": 0.01124328, + "auxiliary_loss_mlp": 0.01058493, + "balance_loss_clip": 1.05192566, + "balance_loss_mlp": 1.03715479, + "epoch": 0.19678340598226365, + "flos": 23110922787840.0, + "grad_norm": 1.7302013075823783, + "language_loss": 0.80942369, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.83125186, + "num_input_tokens_seen": 70646090, + "step": 3273, + "time_per_iteration": 2.6967809200286865 + }, + { + "auxiliary_loss_clip": 0.01144024, + "auxiliary_loss_mlp": 0.01054564, + "balance_loss_clip": 1.05456042, + "balance_loss_mlp": 1.03655195, + "epoch": 0.19684352923493162, + "flos": 24534852716160.0, + "grad_norm": 2.225126358921887, + "language_loss": 0.77984649, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.80183232, + "num_input_tokens_seen": 70666065, + "step": 3274, + "time_per_iteration": 2.6808643341064453 + }, + { + "auxiliary_loss_clip": 0.01141267, + "auxiliary_loss_mlp": 0.01046445, + "balance_loss_clip": 1.05480242, + "balance_loss_mlp": 1.02840877, + "epoch": 0.19690365248759958, + "flos": 21796448578560.0, + "grad_norm": 2.318697297640889, + "language_loss": 0.81433225, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.8362093, + "num_input_tokens_seen": 70681580, + "step": 3275, + "time_per_iteration": 2.672672986984253 + }, + { + "auxiliary_loss_clip": 0.01115756, + "auxiliary_loss_mlp": 0.01045314, + "balance_loss_clip": 1.05148947, + "balance_loss_mlp": 1.02686024, + "epoch": 0.19696377574026755, + "flos": 19056643810560.0, + "grad_norm": 2.4665004531377166, + "language_loss": 0.80909657, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.83070731, + "num_input_tokens_seen": 70697745, + "step": 3276, + "time_per_iteration": 2.726970672607422 + }, + { + "auxiliary_loss_clip": 0.01142619, + "auxiliary_loss_mlp": 0.01043042, + "balance_loss_clip": 1.05443609, + "balance_loss_mlp": 1.02491045, + "epoch": 0.19702389899293551, + "flos": 22820656982400.0, + "grad_norm": 2.17541075016206, + "language_loss": 0.89113599, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.9129926, + "num_input_tokens_seen": 70715110, + "step": 3277, + "time_per_iteration": 2.6738827228546143 + }, + { + "auxiliary_loss_clip": 0.01103709, + "auxiliary_loss_mlp": 0.01048433, + "balance_loss_clip": 1.04638815, + "balance_loss_mlp": 1.02864444, + "epoch": 0.19708402224560348, + "flos": 22894237992960.0, + "grad_norm": 2.640727897616601, + "language_loss": 0.62070847, + "learning_rate": 3.714226497539239e-06, + "loss": 0.64222991, + "num_input_tokens_seen": 70734715, + "step": 3278, + "time_per_iteration": 2.7382938861846924 + }, + { + "auxiliary_loss_clip": 0.01115303, + "auxiliary_loss_mlp": 0.0105759, + "balance_loss_clip": 1.05033016, + "balance_loss_mlp": 1.03793263, + "epoch": 0.19714414549827144, + "flos": 25662519267840.0, + "grad_norm": 1.930104581155035, + "language_loss": 0.73606467, + "learning_rate": 3.714025842413166e-06, + "loss": 0.75779366, + "num_input_tokens_seen": 70752650, + "step": 3279, + "time_per_iteration": 2.8123648166656494 + }, + { + "auxiliary_loss_clip": 0.0114648, + "auxiliary_loss_mlp": 0.01042853, + "balance_loss_clip": 1.05422091, + "balance_loss_mlp": 1.02567458, + "epoch": 0.19720426875093944, + "flos": 23915824704000.0, + "grad_norm": 1.7034036878345749, + "language_loss": 0.82685816, + "learning_rate": 3.713825122291061e-06, + "loss": 0.84875143, + "num_input_tokens_seen": 70772365, + "step": 3280, + "time_per_iteration": 2.7000861167907715 + }, + { + "auxiliary_loss_clip": 0.01106655, + "auxiliary_loss_mlp": 0.01048884, + "balance_loss_clip": 1.04887283, + "balance_loss_mlp": 1.03071654, + "epoch": 0.1972643920036074, + "flos": 13881952828800.0, + "grad_norm": 2.435959864664923, + "language_loss": 0.78173983, + "learning_rate": 3.713624337180536e-06, + "loss": 0.80329525, + "num_input_tokens_seen": 70790340, + "step": 3281, + "time_per_iteration": 2.7017247676849365 + }, + { + "auxiliary_loss_clip": 0.01125353, + "auxiliary_loss_mlp": 0.0104135, + "balance_loss_clip": 1.05461836, + "balance_loss_mlp": 1.02519727, + "epoch": 0.19732451525627537, + "flos": 19863592801920.0, + "grad_norm": 1.7390973872526612, + "language_loss": 0.79777479, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.8194418, + "num_input_tokens_seen": 70809295, + "step": 3282, + "time_per_iteration": 2.7064146995544434 + }, + { + "auxiliary_loss_clip": 0.01112073, + "auxiliary_loss_mlp": 0.01043047, + "balance_loss_clip": 1.05485284, + "balance_loss_mlp": 1.02538049, + "epoch": 0.19738463850894333, + "flos": 24973429777920.0, + "grad_norm": 2.512566515566025, + "language_loss": 0.7192747, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.74082589, + "num_input_tokens_seen": 70828765, + "step": 3283, + "time_per_iteration": 2.775297164916992 + }, + { + "auxiliary_loss_clip": 0.01137498, + "auxiliary_loss_mlp": 0.01043438, + "balance_loss_clip": 1.05320621, + "balance_loss_mlp": 1.02665281, + "epoch": 0.1974447617616113, + "flos": 18368883123840.0, + "grad_norm": 1.8864815757917637, + "language_loss": 0.78981179, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.81162113, + "num_input_tokens_seen": 70846805, + "step": 3284, + "time_per_iteration": 2.6344916820526123 + }, + { + "auxiliary_loss_clip": 0.01126512, + "auxiliary_loss_mlp": 0.00776821, + "balance_loss_clip": 1.05065584, + "balance_loss_mlp": 1.00114048, + "epoch": 0.19750488501427926, + "flos": 22892945103360.0, + "grad_norm": 2.1903874509936982, + "language_loss": 0.86317503, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.88220835, + "num_input_tokens_seen": 70863805, + "step": 3285, + "time_per_iteration": 2.725186586380005 + }, + { + "auxiliary_loss_clip": 0.01115791, + "auxiliary_loss_mlp": 0.01044707, + "balance_loss_clip": 1.05167055, + "balance_loss_mlp": 1.02658761, + "epoch": 0.19756500826694723, + "flos": 21871502046720.0, + "grad_norm": 2.208260347555195, + "language_loss": 0.88770825, + "learning_rate": 3.712619437068174e-06, + "loss": 0.90931326, + "num_input_tokens_seen": 70882660, + "step": 3286, + "time_per_iteration": 2.6819698810577393 + }, + { + "auxiliary_loss_clip": 0.01118742, + "auxiliary_loss_mlp": 0.01052526, + "balance_loss_clip": 1.05227792, + "balance_loss_mlp": 1.03016233, + "epoch": 0.19762513151961522, + "flos": 15158972131200.0, + "grad_norm": 2.0768117117784874, + "language_loss": 0.77941382, + "learning_rate": 3.712418262187102e-06, + "loss": 0.80112648, + "num_input_tokens_seen": 70898765, + "step": 3287, + "time_per_iteration": 2.641193389892578 + }, + { + "auxiliary_loss_clip": 0.01127955, + "auxiliary_loss_mlp": 0.01047337, + "balance_loss_clip": 1.0526104, + "balance_loss_mlp": 1.02849019, + "epoch": 0.1976852547722832, + "flos": 16979175878400.0, + "grad_norm": 2.061421898899755, + "language_loss": 0.80853081, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.83028376, + "num_input_tokens_seen": 70916370, + "step": 3288, + "time_per_iteration": 2.625068426132202 + }, + { + "auxiliary_loss_clip": 0.01132408, + "auxiliary_loss_mlp": 0.01048194, + "balance_loss_clip": 1.05143857, + "balance_loss_mlp": 1.03045535, + "epoch": 0.19774537802495115, + "flos": 20302924049280.0, + "grad_norm": 2.345717890688315, + "language_loss": 0.7317158, + "learning_rate": 3.712015717627374e-06, + "loss": 0.75352174, + "num_input_tokens_seen": 70934870, + "step": 3289, + "time_per_iteration": 2.6319406032562256 + }, + { + "auxiliary_loss_clip": 0.01133413, + "auxiliary_loss_mlp": 0.01045224, + "balance_loss_clip": 1.05575252, + "balance_loss_mlp": 1.02678204, + "epoch": 0.19780550127761912, + "flos": 27235478724480.0, + "grad_norm": 1.9087552003653308, + "language_loss": 0.79608113, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.81786746, + "num_input_tokens_seen": 70955140, + "step": 3290, + "time_per_iteration": 2.706570863723755 + }, + { + "auxiliary_loss_clip": 0.01049926, + "auxiliary_loss_mlp": 0.0101105, + "balance_loss_clip": 1.0327636, + "balance_loss_mlp": 1.00853467, + "epoch": 0.19786562453028708, + "flos": 63550972684800.0, + "grad_norm": 0.8952067644857119, + "language_loss": 0.60318571, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62379545, + "num_input_tokens_seen": 71012005, + "step": 3291, + "time_per_iteration": 3.2849009037017822 + }, + { + "auxiliary_loss_clip": 0.01158891, + "auxiliary_loss_mlp": 0.01040785, + "balance_loss_clip": 1.05417156, + "balance_loss_mlp": 1.02088892, + "epoch": 0.19792574778295505, + "flos": 26286647011200.0, + "grad_norm": 1.932789926440358, + "language_loss": 0.81595641, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.83795315, + "num_input_tokens_seen": 71031140, + "step": 3292, + "time_per_iteration": 2.6751551628112793 + }, + { + "auxiliary_loss_clip": 0.01119797, + "auxiliary_loss_mlp": 0.00778082, + "balance_loss_clip": 1.05296063, + "balance_loss_mlp": 1.00086236, + "epoch": 0.19798587103562304, + "flos": 19938107566080.0, + "grad_norm": 2.409042629875397, + "language_loss": 0.81013, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.82910883, + "num_input_tokens_seen": 71050250, + "step": 3293, + "time_per_iteration": 4.3039703369140625 + }, + { + "auxiliary_loss_clip": 0.01137316, + "auxiliary_loss_mlp": 0.01052434, + "balance_loss_clip": 1.05370128, + "balance_loss_mlp": 1.03277683, + "epoch": 0.198045994288291, + "flos": 20120282369280.0, + "grad_norm": 1.8764131105986912, + "language_loss": 0.61480314, + "learning_rate": 3.711008220265093e-06, + "loss": 0.63670063, + "num_input_tokens_seen": 71068665, + "step": 3294, + "time_per_iteration": 2.671241044998169 + }, + { + "auxiliary_loss_clip": 0.01132208, + "auxiliary_loss_mlp": 0.01039978, + "balance_loss_clip": 1.05456376, + "balance_loss_mlp": 1.02201271, + "epoch": 0.19810611754095897, + "flos": 17967653228160.0, + "grad_norm": 2.0334748560156393, + "language_loss": 0.87313825, + "learning_rate": 3.710806526117251e-06, + "loss": 0.89486015, + "num_input_tokens_seen": 71085320, + "step": 3295, + "time_per_iteration": 2.659680128097534 + }, + { + "auxiliary_loss_clip": 0.01113106, + "auxiliary_loss_mlp": 0.01050184, + "balance_loss_clip": 1.05079484, + "balance_loss_mlp": 1.03256536, + "epoch": 0.19816624079362694, + "flos": 15084996071040.0, + "grad_norm": 2.5215255479345067, + "language_loss": 0.80839241, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.83002532, + "num_input_tokens_seen": 71102020, + "step": 3296, + "time_per_iteration": 4.299339294433594 + }, + { + "auxiliary_loss_clip": 0.01123906, + "auxiliary_loss_mlp": 0.01045438, + "balance_loss_clip": 1.05233586, + "balance_loss_mlp": 1.02522039, + "epoch": 0.1982263640462949, + "flos": 24900315644160.0, + "grad_norm": 2.528943220563754, + "language_loss": 0.68126047, + "learning_rate": 3.710402943207354e-06, + "loss": 0.70295388, + "num_input_tokens_seen": 71123390, + "step": 3297, + "time_per_iteration": 4.258284091949463 + }, + { + "auxiliary_loss_clip": 0.01153129, + "auxiliary_loss_mlp": 0.01037574, + "balance_loss_clip": 1.05660713, + "balance_loss_mlp": 1.02031219, + "epoch": 0.19828648729896287, + "flos": 20376181837440.0, + "grad_norm": 1.9083451106828888, + "language_loss": 0.81310993, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.83501697, + "num_input_tokens_seen": 71141800, + "step": 3298, + "time_per_iteration": 2.6156656742095947 + }, + { + "auxiliary_loss_clip": 0.01137409, + "auxiliary_loss_mlp": 0.01042227, + "balance_loss_clip": 1.0573976, + "balance_loss_mlp": 1.02159238, + "epoch": 0.19834661055163083, + "flos": 18880035615360.0, + "grad_norm": 1.8996943203321497, + "language_loss": 0.85154539, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.87334174, + "num_input_tokens_seen": 71159505, + "step": 3299, + "time_per_iteration": 2.6749041080474854 + }, + { + "auxiliary_loss_clip": 0.01036953, + "auxiliary_loss_mlp": 0.01013935, + "balance_loss_clip": 1.02875936, + "balance_loss_mlp": 1.01106215, + "epoch": 0.19840673380429882, + "flos": 60259184640000.0, + "grad_norm": 0.82907550606663, + "language_loss": 0.53206414, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55257303, + "num_input_tokens_seen": 71223265, + "step": 3300, + "time_per_iteration": 4.83857798576355 + }, + { + "auxiliary_loss_clip": 0.01105122, + "auxiliary_loss_mlp": 0.01064471, + "balance_loss_clip": 1.04748702, + "balance_loss_mlp": 1.0410459, + "epoch": 0.1984668570569668, + "flos": 19902017376000.0, + "grad_norm": 316.1702389408657, + "language_loss": 0.73014295, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.75183886, + "num_input_tokens_seen": 71242385, + "step": 3301, + "time_per_iteration": 2.700654983520508 + }, + { + "auxiliary_loss_clip": 0.01118926, + "auxiliary_loss_mlp": 0.01044315, + "balance_loss_clip": 1.05295372, + "balance_loss_mlp": 1.02619529, + "epoch": 0.19852698030963475, + "flos": 15630766295040.0, + "grad_norm": 2.410718710355122, + "language_loss": 0.88264418, + "learning_rate": 3.709392851040235e-06, + "loss": 0.90427655, + "num_input_tokens_seen": 71258990, + "step": 3302, + "time_per_iteration": 2.7190146446228027 + }, + { + "auxiliary_loss_clip": 0.01118067, + "auxiliary_loss_mlp": 0.01045078, + "balance_loss_clip": 1.05155802, + "balance_loss_mlp": 1.02661204, + "epoch": 0.19858710356230272, + "flos": 43143007311360.0, + "grad_norm": 2.210364764996701, + "language_loss": 0.73592931, + "learning_rate": 3.709190638115111e-06, + "loss": 0.75756073, + "num_input_tokens_seen": 71282770, + "step": 3303, + "time_per_iteration": 2.9379186630249023 + }, + { + "auxiliary_loss_clip": 0.01143275, + "auxiliary_loss_mlp": 0.01048515, + "balance_loss_clip": 1.05491257, + "balance_loss_mlp": 1.03002524, + "epoch": 0.19864722681497068, + "flos": 35144084643840.0, + "grad_norm": 1.9482807590384623, + "language_loss": 0.75103521, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.77295315, + "num_input_tokens_seen": 71301410, + "step": 3304, + "time_per_iteration": 2.743474245071411 + }, + { + "auxiliary_loss_clip": 0.01133571, + "auxiliary_loss_mlp": 0.01034983, + "balance_loss_clip": 1.05309725, + "balance_loss_mlp": 1.01710188, + "epoch": 0.19870735006763865, + "flos": 19426200888960.0, + "grad_norm": 1.8722016114425952, + "language_loss": 0.8628391, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.8845247, + "num_input_tokens_seen": 71319670, + "step": 3305, + "time_per_iteration": 2.7129390239715576 + }, + { + "auxiliary_loss_clip": 0.01128329, + "auxiliary_loss_mlp": 0.01044081, + "balance_loss_clip": 1.04770195, + "balance_loss_mlp": 1.02603281, + "epoch": 0.19876747332030664, + "flos": 23547380947200.0, + "grad_norm": 2.9829227362861106, + "language_loss": 0.68476367, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.70648777, + "num_input_tokens_seen": 71339850, + "step": 3306, + "time_per_iteration": 2.7083208560943604 + }, + { + "auxiliary_loss_clip": 0.01119386, + "auxiliary_loss_mlp": 0.01038782, + "balance_loss_clip": 1.04822719, + "balance_loss_mlp": 1.02168787, + "epoch": 0.1988275965729746, + "flos": 19829406032640.0, + "grad_norm": 1.683647244561179, + "language_loss": 0.76433122, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.78591287, + "num_input_tokens_seen": 71359795, + "step": 3307, + "time_per_iteration": 2.728661298751831 + }, + { + "auxiliary_loss_clip": 0.01157548, + "auxiliary_loss_mlp": 0.01044665, + "balance_loss_clip": 1.05895782, + "balance_loss_mlp": 1.02714145, + "epoch": 0.19888771982564257, + "flos": 23513625141120.0, + "grad_norm": 2.438172575069382, + "language_loss": 0.75991976, + "learning_rate": 3.708178601452737e-06, + "loss": 0.78194201, + "num_input_tokens_seen": 71378885, + "step": 3308, + "time_per_iteration": 2.6580557823181152 + }, + { + "auxiliary_loss_clip": 0.01107283, + "auxiliary_loss_mlp": 0.01041656, + "balance_loss_clip": 1.05453563, + "balance_loss_mlp": 1.02307141, + "epoch": 0.19894784307831054, + "flos": 18150510389760.0, + "grad_norm": 1.928689575161362, + "language_loss": 0.76043576, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.7819252, + "num_input_tokens_seen": 71397285, + "step": 3309, + "time_per_iteration": 2.77226185798645 + }, + { + "auxiliary_loss_clip": 0.0114115, + "auxiliary_loss_mlp": 0.01045061, + "balance_loss_clip": 1.05222607, + "balance_loss_mlp": 1.02592754, + "epoch": 0.1990079663309785, + "flos": 24276044246400.0, + "grad_norm": 75.17312936609292, + "language_loss": 0.87855697, + "learning_rate": 3.707773333313917e-06, + "loss": 0.90041906, + "num_input_tokens_seen": 71415775, + "step": 3310, + "time_per_iteration": 2.6789662837982178 + }, + { + "auxiliary_loss_clip": 0.01153037, + "auxiliary_loss_mlp": 0.01039864, + "balance_loss_clip": 1.05415869, + "balance_loss_mlp": 1.02139854, + "epoch": 0.19906808958364647, + "flos": 34897666366080.0, + "grad_norm": 2.3155756588664342, + "language_loss": 0.63650048, + "learning_rate": 3.70757060210226e-06, + "loss": 0.6584295, + "num_input_tokens_seen": 71437315, + "step": 3311, + "time_per_iteration": 2.7604620456695557 + }, + { + "auxiliary_loss_clip": 0.01115133, + "auxiliary_loss_mlp": 0.01043871, + "balance_loss_clip": 1.04763019, + "balance_loss_mlp": 1.02501202, + "epoch": 0.19912821283631443, + "flos": 24024885373440.0, + "grad_norm": 3.8064295514597717, + "language_loss": 0.74542546, + "learning_rate": 3.707367806139355e-06, + "loss": 0.76701546, + "num_input_tokens_seen": 71456320, + "step": 3312, + "time_per_iteration": 2.796475410461426 + }, + { + "auxiliary_loss_clip": 0.01141587, + "auxiliary_loss_mlp": 0.01037435, + "balance_loss_clip": 1.05358124, + "balance_loss_mlp": 1.02017355, + "epoch": 0.19918833608898243, + "flos": 19859031774720.0, + "grad_norm": 2.2312990164825943, + "language_loss": 0.84033173, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.86212194, + "num_input_tokens_seen": 71475360, + "step": 3313, + "time_per_iteration": 2.6044952869415283 + }, + { + "auxiliary_loss_clip": 0.01146797, + "auxiliary_loss_mlp": 0.01042166, + "balance_loss_clip": 1.05695391, + "balance_loss_mlp": 1.02422476, + "epoch": 0.1992484593416504, + "flos": 29095794984960.0, + "grad_norm": 3.856678450124864, + "language_loss": 0.810305, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.83219463, + "num_input_tokens_seen": 71496155, + "step": 3314, + "time_per_iteration": 2.68841814994812 + }, + { + "auxiliary_loss_clip": 0.01112846, + "auxiliary_loss_mlp": 0.01043677, + "balance_loss_clip": 1.04617178, + "balance_loss_mlp": 1.02643955, + "epoch": 0.19930858259431836, + "flos": 23295001011840.0, + "grad_norm": 1.4822079401394097, + "language_loss": 0.87391549, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.89548075, + "num_input_tokens_seen": 71517295, + "step": 3315, + "time_per_iteration": 2.720093011856079 + }, + { + "auxiliary_loss_clip": 0.0111589, + "auxiliary_loss_mlp": 0.00777002, + "balance_loss_clip": 1.04992676, + "balance_loss_mlp": 1.00093687, + "epoch": 0.19936870584698632, + "flos": 25378825651200.0, + "grad_norm": 1.7805516248937883, + "language_loss": 0.70957202, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.72850096, + "num_input_tokens_seen": 71540000, + "step": 3316, + "time_per_iteration": 2.850100517272949 + }, + { + "auxiliary_loss_clip": 0.01019745, + "auxiliary_loss_mlp": 0.01012504, + "balance_loss_clip": 1.03032303, + "balance_loss_mlp": 1.01003671, + "epoch": 0.1994288290996543, + "flos": 62168053109760.0, + "grad_norm": 0.8326978726055106, + "language_loss": 0.66287398, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68319643, + "num_input_tokens_seen": 71607880, + "step": 3317, + "time_per_iteration": 3.425114870071411 + }, + { + "auxiliary_loss_clip": 0.01148059, + "auxiliary_loss_mlp": 0.01048913, + "balance_loss_clip": 1.05397809, + "balance_loss_mlp": 1.02964854, + "epoch": 0.19948895235232225, + "flos": 19025832919680.0, + "grad_norm": 2.282515690517884, + "language_loss": 0.74494618, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.76691592, + "num_input_tokens_seen": 71625695, + "step": 3318, + "time_per_iteration": 2.6815896034240723 + }, + { + "auxiliary_loss_clip": 0.01114942, + "auxiliary_loss_mlp": 0.01044681, + "balance_loss_clip": 1.04767084, + "balance_loss_mlp": 1.02786088, + "epoch": 0.19954907560499022, + "flos": 37815803182080.0, + "grad_norm": 1.8966456913695608, + "language_loss": 0.78894758, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.81054389, + "num_input_tokens_seen": 71648520, + "step": 3319, + "time_per_iteration": 2.847911834716797 + }, + { + "auxiliary_loss_clip": 0.01134557, + "auxiliary_loss_mlp": 0.01042988, + "balance_loss_clip": 1.05354095, + "balance_loss_mlp": 1.02312756, + "epoch": 0.1996091988576582, + "flos": 49565199594240.0, + "grad_norm": 2.1348540211051197, + "language_loss": 0.76006937, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.78184479, + "num_input_tokens_seen": 71672185, + "step": 3320, + "time_per_iteration": 2.9324615001678467 + }, + { + "auxiliary_loss_clip": 0.01120226, + "auxiliary_loss_mlp": 0.01042998, + "balance_loss_clip": 1.05083311, + "balance_loss_mlp": 1.02496171, + "epoch": 0.19966932211032618, + "flos": 22635788659200.0, + "grad_norm": 2.2436863685702546, + "language_loss": 0.80077857, + "learning_rate": 3.705539729936701e-06, + "loss": 0.82241082, + "num_input_tokens_seen": 71692890, + "step": 3321, + "time_per_iteration": 2.7534186840057373 + }, + { + "auxiliary_loss_clip": 0.01033096, + "auxiliary_loss_mlp": 0.01011167, + "balance_loss_clip": 1.02391553, + "balance_loss_mlp": 1.00828266, + "epoch": 0.19972944536299414, + "flos": 54082117745280.0, + "grad_norm": 0.874673110280983, + "language_loss": 0.65145189, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.67189455, + "num_input_tokens_seen": 71745815, + "step": 3322, + "time_per_iteration": 3.0398683547973633 + }, + { + "auxiliary_loss_clip": 0.01039999, + "auxiliary_loss_mlp": 0.01007775, + "balance_loss_clip": 1.02971482, + "balance_loss_mlp": 1.00479472, + "epoch": 0.1997895686156621, + "flos": 69355031817600.0, + "grad_norm": 0.7915334307535052, + "language_loss": 0.56919783, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.58967561, + "num_input_tokens_seen": 71806915, + "step": 3323, + "time_per_iteration": 3.2814581394195557 + }, + { + "auxiliary_loss_clip": 0.01131487, + "auxiliary_loss_mlp": 0.00776139, + "balance_loss_clip": 1.05244064, + "balance_loss_mlp": 1.00085235, + "epoch": 0.19984969186833007, + "flos": 18552063507840.0, + "grad_norm": 1.8766856730809967, + "language_loss": 0.80573648, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.82481277, + "num_input_tokens_seen": 71824645, + "step": 3324, + "time_per_iteration": 2.66456937789917 + }, + { + "auxiliary_loss_clip": 0.01132572, + "auxiliary_loss_mlp": 0.01050254, + "balance_loss_clip": 1.04625165, + "balance_loss_mlp": 1.03027487, + "epoch": 0.19990981512099804, + "flos": 26429678968320.0, + "grad_norm": 2.4535669107623486, + "language_loss": 0.53931105, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.56113935, + "num_input_tokens_seen": 71845125, + "step": 3325, + "time_per_iteration": 2.696556329727173 + }, + { + "auxiliary_loss_clip": 0.01130165, + "auxiliary_loss_mlp": 0.01050725, + "balance_loss_clip": 1.05065942, + "balance_loss_mlp": 1.03328443, + "epoch": 0.19996993837366603, + "flos": 16325997010560.0, + "grad_norm": 2.1570763946475187, + "language_loss": 0.86074936, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.88255823, + "num_input_tokens_seen": 71863500, + "step": 3326, + "time_per_iteration": 2.7167885303497314 + }, + { + "auxiliary_loss_clip": 0.0115173, + "auxiliary_loss_mlp": 0.01042065, + "balance_loss_clip": 1.05427039, + "balance_loss_mlp": 1.02511311, + "epoch": 0.200030061626334, + "flos": 20844169159680.0, + "grad_norm": 2.0419576492150395, + "language_loss": 0.71793801, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.73987597, + "num_input_tokens_seen": 71881845, + "step": 3327, + "time_per_iteration": 2.6097662448883057 + }, + { + "auxiliary_loss_clip": 0.01131035, + "auxiliary_loss_mlp": 0.01052756, + "balance_loss_clip": 1.05146813, + "balance_loss_mlp": 1.03290796, + "epoch": 0.20009018487900196, + "flos": 23762629198080.0, + "grad_norm": 1.8948781463857982, + "language_loss": 0.7668376, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.78867549, + "num_input_tokens_seen": 71900940, + "step": 3328, + "time_per_iteration": 2.6869349479675293 + }, + { + "auxiliary_loss_clip": 0.01118681, + "auxiliary_loss_mlp": 0.01044603, + "balance_loss_clip": 1.04693103, + "balance_loss_mlp": 1.02799726, + "epoch": 0.20015030813166992, + "flos": 28111555440000.0, + "grad_norm": 2.0833377369651984, + "language_loss": 0.69400644, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.71563935, + "num_input_tokens_seen": 71921925, + "step": 3329, + "time_per_iteration": 2.844280481338501 + }, + { + "auxiliary_loss_clip": 0.01107384, + "auxiliary_loss_mlp": 0.01069575, + "balance_loss_clip": 1.04727411, + "balance_loss_mlp": 1.04641271, + "epoch": 0.2002104313843379, + "flos": 26067160955520.0, + "grad_norm": 3.099532194576676, + "language_loss": 0.81395614, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.83572567, + "num_input_tokens_seen": 71941855, + "step": 3330, + "time_per_iteration": 2.841885566711426 + }, + { + "auxiliary_loss_clip": 0.01137825, + "auxiliary_loss_mlp": 0.01048123, + "balance_loss_clip": 1.05147684, + "balance_loss_mlp": 1.02977705, + "epoch": 0.20027055463700585, + "flos": 22966633854720.0, + "grad_norm": 2.224132696455658, + "language_loss": 0.76606882, + "learning_rate": 3.703502390349417e-06, + "loss": 0.78792834, + "num_input_tokens_seen": 71960915, + "step": 3331, + "time_per_iteration": 2.7007360458374023 + }, + { + "auxiliary_loss_clip": 0.01093521, + "auxiliary_loss_mlp": 0.01069739, + "balance_loss_clip": 1.04292202, + "balance_loss_mlp": 1.04851985, + "epoch": 0.20033067788967382, + "flos": 17165660313600.0, + "grad_norm": 2.044808670508971, + "language_loss": 0.79330826, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.81494087, + "num_input_tokens_seen": 71979220, + "step": 3332, + "time_per_iteration": 4.518973112106323 + }, + { + "auxiliary_loss_clip": 0.01046467, + "auxiliary_loss_mlp": 0.010754, + "balance_loss_clip": 1.02134657, + "balance_loss_mlp": 1.07303989, + "epoch": 0.2003908011423418, + "flos": 60825566292480.0, + "grad_norm": 0.9607431077817938, + "language_loss": 0.61968678, + "learning_rate": 3.703094147020776e-06, + "loss": 0.64090544, + "num_input_tokens_seen": 72033950, + "step": 3333, + "time_per_iteration": 3.074782371520996 + }, + { + "auxiliary_loss_clip": 0.01112058, + "auxiliary_loss_mlp": 0.00777645, + "balance_loss_clip": 1.04686844, + "balance_loss_mlp": 1.00099933, + "epoch": 0.20045092439500978, + "flos": 24206234163840.0, + "grad_norm": 2.9954165903614447, + "language_loss": 0.81385547, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.83275253, + "num_input_tokens_seen": 72051395, + "step": 3334, + "time_per_iteration": 4.270732641220093 + }, + { + "auxiliary_loss_clip": 0.01096467, + "auxiliary_loss_mlp": 0.01058699, + "balance_loss_clip": 1.04709518, + "balance_loss_mlp": 1.03889799, + "epoch": 0.20051104764767774, + "flos": 29387605075200.0, + "grad_norm": 2.9016061168315703, + "language_loss": 0.74238038, + "learning_rate": 3.702685645366134e-06, + "loss": 0.76393211, + "num_input_tokens_seen": 72071305, + "step": 3335, + "time_per_iteration": 4.376626491546631 + }, + { + "auxiliary_loss_clip": 0.01149242, + "auxiliary_loss_mlp": 0.01059851, + "balance_loss_clip": 1.05611062, + "balance_loss_mlp": 1.04120684, + "epoch": 0.2005711709003457, + "flos": 23513804709120.0, + "grad_norm": 1.700795836589561, + "language_loss": 0.79981416, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.82190514, + "num_input_tokens_seen": 72090165, + "step": 3336, + "time_per_iteration": 2.7031586170196533 + }, + { + "auxiliary_loss_clip": 0.01116655, + "auxiliary_loss_mlp": 0.01048065, + "balance_loss_clip": 1.04808092, + "balance_loss_mlp": 1.0272038, + "epoch": 0.20063129415301367, + "flos": 22523388024960.0, + "grad_norm": 2.0182523905302157, + "language_loss": 0.7761423, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.79778945, + "num_input_tokens_seen": 72107210, + "step": 3337, + "time_per_iteration": 2.6990835666656494 + }, + { + "auxiliary_loss_clip": 0.01158617, + "auxiliary_loss_mlp": 0.01045618, + "balance_loss_clip": 1.05752003, + "balance_loss_mlp": 1.02631783, + "epoch": 0.20069141740568164, + "flos": 25958243940480.0, + "grad_norm": 2.232061800350416, + "language_loss": 0.69108742, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.71312982, + "num_input_tokens_seen": 72126315, + "step": 3338, + "time_per_iteration": 2.6827659606933594 + }, + { + "auxiliary_loss_clip": 0.01117671, + "auxiliary_loss_mlp": 0.01053755, + "balance_loss_clip": 1.04930723, + "balance_loss_mlp": 1.03543282, + "epoch": 0.2007515406583496, + "flos": 24790608529920.0, + "grad_norm": 2.685005372503905, + "language_loss": 0.68898237, + "learning_rate": 3.701867867326735e-06, + "loss": 0.71069658, + "num_input_tokens_seen": 72146470, + "step": 3339, + "time_per_iteration": 4.430418014526367 + }, + { + "auxiliary_loss_clip": 0.01123098, + "auxiliary_loss_mlp": 0.01041763, + "balance_loss_clip": 1.05656064, + "balance_loss_mlp": 1.02408433, + "epoch": 0.2008116639110176, + "flos": 37925582123520.0, + "grad_norm": 2.0597617887640607, + "language_loss": 0.66606021, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.6877088, + "num_input_tokens_seen": 72166600, + "step": 3340, + "time_per_iteration": 3.0020461082458496 + }, + { + "auxiliary_loss_clip": 0.01145166, + "auxiliary_loss_mlp": 0.01036815, + "balance_loss_clip": 1.05326021, + "balance_loss_mlp": 1.01712155, + "epoch": 0.20087178716368556, + "flos": 20740531443840.0, + "grad_norm": 6.669810478748975, + "language_loss": 0.74554622, + "learning_rate": 3.701458591066019e-06, + "loss": 0.76736599, + "num_input_tokens_seen": 72185160, + "step": 3341, + "time_per_iteration": 2.762573480606079 + }, + { + "auxiliary_loss_clip": 0.01110242, + "auxiliary_loss_mlp": 0.01044424, + "balance_loss_clip": 1.04981375, + "balance_loss_mlp": 1.02595794, + "epoch": 0.20093191041635353, + "flos": 23842279607040.0, + "grad_norm": 7.177474445031109, + "language_loss": 0.71779013, + "learning_rate": 3.70125385615256e-06, + "loss": 0.73933673, + "num_input_tokens_seen": 72205160, + "step": 3342, + "time_per_iteration": 2.7128167152404785 + }, + { + "auxiliary_loss_clip": 0.01114025, + "auxiliary_loss_mlp": 0.01045057, + "balance_loss_clip": 1.05036438, + "balance_loss_mlp": 1.02749765, + "epoch": 0.2009920336690215, + "flos": 21792067119360.0, + "grad_norm": 2.3652416151608873, + "language_loss": 0.72892809, + "learning_rate": 3.701049056727384e-06, + "loss": 0.75051892, + "num_input_tokens_seen": 72223555, + "step": 3343, + "time_per_iteration": 2.8155410289764404 + }, + { + "auxiliary_loss_clip": 0.01113341, + "auxiliary_loss_mlp": 0.01046556, + "balance_loss_clip": 1.04568779, + "balance_loss_mlp": 1.02762532, + "epoch": 0.20105215692168946, + "flos": 26359222440960.0, + "grad_norm": 2.2972411099560195, + "language_loss": 0.80645263, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.82805163, + "num_input_tokens_seen": 72242465, + "step": 3344, + "time_per_iteration": 2.780198335647583 + }, + { + "auxiliary_loss_clip": 0.01155099, + "auxiliary_loss_mlp": 0.01045938, + "balance_loss_clip": 1.05386972, + "balance_loss_mlp": 1.02773499, + "epoch": 0.20111228017435742, + "flos": 18807280617600.0, + "grad_norm": 2.2640230255386125, + "language_loss": 0.83114576, + "learning_rate": 3.700639264372948e-06, + "loss": 0.85315621, + "num_input_tokens_seen": 72260655, + "step": 3345, + "time_per_iteration": 2.6209781169891357 + }, + { + "auxiliary_loss_clip": 0.01093716, + "auxiliary_loss_mlp": 0.01041329, + "balance_loss_clip": 1.04619193, + "balance_loss_mlp": 1.02492619, + "epoch": 0.20117240342702541, + "flos": 19975059682560.0, + "grad_norm": 1.7610524328763844, + "language_loss": 0.67947632, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.70082676, + "num_input_tokens_seen": 72279055, + "step": 3346, + "time_per_iteration": 2.692222833633423 + }, + { + "auxiliary_loss_clip": 0.01114086, + "auxiliary_loss_mlp": 0.01048128, + "balance_loss_clip": 1.04710329, + "balance_loss_mlp": 1.03028262, + "epoch": 0.20123252667969338, + "flos": 23142703345920.0, + "grad_norm": 2.3067659385334958, + "language_loss": 0.72993439, + "learning_rate": 3.70022921406487e-06, + "loss": 0.75155658, + "num_input_tokens_seen": 72297895, + "step": 3347, + "time_per_iteration": 2.7501564025878906 + }, + { + "auxiliary_loss_clip": 0.01142236, + "auxiliary_loss_mlp": 0.01047715, + "balance_loss_clip": 1.05465829, + "balance_loss_mlp": 1.03122878, + "epoch": 0.20129264993236134, + "flos": 23221671396480.0, + "grad_norm": 1.5798788242702444, + "language_loss": 0.86869538, + "learning_rate": 3.70002409219765e-06, + "loss": 0.8905949, + "num_input_tokens_seen": 72318385, + "step": 3348, + "time_per_iteration": 2.688606023788452 + }, + { + "auxiliary_loss_clip": 0.01099793, + "auxiliary_loss_mlp": 0.01045183, + "balance_loss_clip": 1.04737949, + "balance_loss_mlp": 1.02587092, + "epoch": 0.2013527731850293, + "flos": 21871466133120.0, + "grad_norm": 1.8024729376762028, + "language_loss": 0.71082795, + "learning_rate": 3.699818905865346e-06, + "loss": 0.73227775, + "num_input_tokens_seen": 72338235, + "step": 3349, + "time_per_iteration": 2.8423163890838623 + }, + { + "auxiliary_loss_clip": 0.01119982, + "auxiliary_loss_mlp": 0.01044662, + "balance_loss_clip": 1.0504061, + "balance_loss_mlp": 1.02520752, + "epoch": 0.20141289643769728, + "flos": 18040803275520.0, + "grad_norm": 1.7324672298731074, + "language_loss": 0.71324664, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.73489314, + "num_input_tokens_seen": 72357825, + "step": 3350, + "time_per_iteration": 2.7691454887390137 + }, + { + "auxiliary_loss_clip": 0.01126392, + "auxiliary_loss_mlp": 0.01043835, + "balance_loss_clip": 1.0497458, + "balance_loss_mlp": 1.02312887, + "epoch": 0.20147301969036524, + "flos": 23951412103680.0, + "grad_norm": 2.3965463087123107, + "language_loss": 0.76391226, + "learning_rate": 3.69940833983661e-06, + "loss": 0.78561449, + "num_input_tokens_seen": 72376335, + "step": 3351, + "time_per_iteration": 2.701244592666626 + }, + { + "auxiliary_loss_clip": 0.01134085, + "auxiliary_loss_mlp": 0.01047695, + "balance_loss_clip": 1.05303741, + "balance_loss_mlp": 1.02840734, + "epoch": 0.2015331429430332, + "flos": 25588471380480.0, + "grad_norm": 1.5574195085232978, + "language_loss": 0.80808926, + "learning_rate": 3.699202960155748e-06, + "loss": 0.82990712, + "num_input_tokens_seen": 72395440, + "step": 3352, + "time_per_iteration": 2.707792043685913 + }, + { + "auxiliary_loss_clip": 0.011457, + "auxiliary_loss_mlp": 0.01042883, + "balance_loss_clip": 1.05415952, + "balance_loss_mlp": 1.0244298, + "epoch": 0.2015932661957012, + "flos": 26724972677760.0, + "grad_norm": 1.9831574274346238, + "language_loss": 0.80594563, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.82783151, + "num_input_tokens_seen": 72414670, + "step": 3353, + "time_per_iteration": 2.675960063934326 + }, + { + "auxiliary_loss_clip": 0.01126272, + "auxiliary_loss_mlp": 0.01045978, + "balance_loss_clip": 1.05195928, + "balance_loss_mlp": 1.02787042, + "epoch": 0.20165338944836916, + "flos": 15633136592640.0, + "grad_norm": 2.0684163707657763, + "language_loss": 0.90046668, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.92218912, + "num_input_tokens_seen": 72432210, + "step": 3354, + "time_per_iteration": 2.6648361682891846 + }, + { + "auxiliary_loss_clip": 0.0104514, + "auxiliary_loss_mlp": 0.0075774, + "balance_loss_clip": 1.0285337, + "balance_loss_mlp": 1.00170481, + "epoch": 0.20171351270103713, + "flos": 57912529207680.0, + "grad_norm": 0.8264169258847935, + "language_loss": 0.55863291, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.57666171, + "num_input_tokens_seen": 72489225, + "step": 3355, + "time_per_iteration": 3.155352830886841 + }, + { + "auxiliary_loss_clip": 0.01127799, + "auxiliary_loss_mlp": 0.00776255, + "balance_loss_clip": 1.05133796, + "balance_loss_mlp": 1.00109434, + "epoch": 0.2017736359537051, + "flos": 20814363849600.0, + "grad_norm": 1.8367443502770229, + "language_loss": 0.84333616, + "learning_rate": 3.698380797170751e-06, + "loss": 0.86237669, + "num_input_tokens_seen": 72508715, + "step": 3356, + "time_per_iteration": 2.754645586013794 + }, + { + "auxiliary_loss_clip": 0.01127514, + "auxiliary_loss_mlp": 0.01052066, + "balance_loss_clip": 1.04904747, + "balance_loss_mlp": 1.02811635, + "epoch": 0.20183375920637306, + "flos": 17092043389440.0, + "grad_norm": 3.2349249330618504, + "language_loss": 0.70046175, + "learning_rate": 3.698175095398085e-06, + "loss": 0.72225749, + "num_input_tokens_seen": 72525135, + "step": 3357, + "time_per_iteration": 2.6905863285064697 + }, + { + "auxiliary_loss_clip": 0.0113535, + "auxiliary_loss_mlp": 0.01044956, + "balance_loss_clip": 1.05209541, + "balance_loss_mlp": 1.02590632, + "epoch": 0.20189388245904102, + "flos": 18661339658880.0, + "grad_norm": 2.41944886120848, + "language_loss": 0.7169627, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.73876572, + "num_input_tokens_seen": 72543690, + "step": 3358, + "time_per_iteration": 2.696295738220215 + }, + { + "auxiliary_loss_clip": 0.01139673, + "auxiliary_loss_mlp": 0.01052145, + "balance_loss_clip": 1.05050206, + "balance_loss_mlp": 1.03496706, + "epoch": 0.20195400571170902, + "flos": 16797539779200.0, + "grad_norm": 2.6870341127491675, + "language_loss": 0.83242267, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.85434085, + "num_input_tokens_seen": 72560725, + "step": 3359, + "time_per_iteration": 2.6779677867889404 + }, + { + "auxiliary_loss_clip": 0.01052166, + "auxiliary_loss_mlp": 0.01026452, + "balance_loss_clip": 1.02534354, + "balance_loss_mlp": 1.02345943, + "epoch": 0.20201412896437698, + "flos": 67174716268800.0, + "grad_norm": 0.8259567660078829, + "language_loss": 0.58980465, + "learning_rate": 3.697557603741482e-06, + "loss": 0.61059082, + "num_input_tokens_seen": 72621940, + "step": 3360, + "time_per_iteration": 3.1175289154052734 + }, + { + "auxiliary_loss_clip": 0.01096543, + "auxiliary_loss_mlp": 0.01051237, + "balance_loss_clip": 1.05081403, + "balance_loss_mlp": 1.03154337, + "epoch": 0.20207425221704495, + "flos": 21325013550720.0, + "grad_norm": 2.668010943284884, + "language_loss": 0.63219774, + "learning_rate": 3.697351644435763e-06, + "loss": 0.65367556, + "num_input_tokens_seen": 72639135, + "step": 3361, + "time_per_iteration": 2.7732017040252686 + }, + { + "auxiliary_loss_clip": 0.01119862, + "auxiliary_loss_mlp": 0.01069748, + "balance_loss_clip": 1.04988885, + "balance_loss_mlp": 1.05035317, + "epoch": 0.2021343754697129, + "flos": 22527158952960.0, + "grad_norm": 1.9150118782569074, + "language_loss": 0.75946522, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.78136134, + "num_input_tokens_seen": 72658525, + "step": 3362, + "time_per_iteration": 2.755686044692993 + }, + { + "auxiliary_loss_clip": 0.01139499, + "auxiliary_loss_mlp": 0.00777827, + "balance_loss_clip": 1.05068207, + "balance_loss_mlp": 1.0011797, + "epoch": 0.20219449872238088, + "flos": 19062785036160.0, + "grad_norm": 2.043450343479612, + "language_loss": 0.76542944, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.78460264, + "num_input_tokens_seen": 72678085, + "step": 3363, + "time_per_iteration": 2.788773775100708 + }, + { + "auxiliary_loss_clip": 0.01143235, + "auxiliary_loss_mlp": 0.01068217, + "balance_loss_clip": 1.05241406, + "balance_loss_mlp": 1.0511229, + "epoch": 0.20225462197504884, + "flos": 24717027519360.0, + "grad_norm": 1.8380065969237507, + "language_loss": 0.75088942, + "learning_rate": 3.696733380367391e-06, + "loss": 0.773004, + "num_input_tokens_seen": 72698695, + "step": 3364, + "time_per_iteration": 2.7484803199768066 + }, + { + "auxiliary_loss_clip": 0.01111683, + "auxiliary_loss_mlp": 0.01065374, + "balance_loss_clip": 1.05202723, + "balance_loss_mlp": 1.04583549, + "epoch": 0.2023147452277168, + "flos": 22018304931840.0, + "grad_norm": 2.1478979049108395, + "language_loss": 0.71917796, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.7409485, + "num_input_tokens_seen": 72717880, + "step": 3365, + "time_per_iteration": 2.770939350128174 + }, + { + "auxiliary_loss_clip": 0.01110149, + "auxiliary_loss_mlp": 0.01064133, + "balance_loss_clip": 1.04989934, + "balance_loss_mlp": 1.04559648, + "epoch": 0.2023748684803848, + "flos": 17745365911680.0, + "grad_norm": 2.2136098995040228, + "language_loss": 0.85318875, + "learning_rate": 3.696320882607286e-06, + "loss": 0.87493157, + "num_input_tokens_seen": 72736410, + "step": 3366, + "time_per_iteration": 2.717759609222412 + }, + { + "auxiliary_loss_clip": 0.01116913, + "auxiliary_loss_mlp": 0.0106476, + "balance_loss_clip": 1.050488, + "balance_loss_mlp": 1.04605615, + "epoch": 0.20243499173305277, + "flos": 31138932493440.0, + "grad_norm": 2.048733189447585, + "language_loss": 0.69766563, + "learning_rate": 3.696114537236335e-06, + "loss": 0.71948242, + "num_input_tokens_seen": 72758295, + "step": 3367, + "time_per_iteration": 2.788444995880127 + }, + { + "auxiliary_loss_clip": 0.01144949, + "auxiliary_loss_mlp": 0.01060722, + "balance_loss_clip": 1.04997301, + "balance_loss_mlp": 1.03857303, + "epoch": 0.20249511498572073, + "flos": 33839235279360.0, + "grad_norm": 1.942153338299175, + "language_loss": 0.68162113, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.70367789, + "num_input_tokens_seen": 72782495, + "step": 3368, + "time_per_iteration": 2.7339746952056885 + }, + { + "auxiliary_loss_clip": 0.01123527, + "auxiliary_loss_mlp": 0.01063426, + "balance_loss_clip": 1.0543493, + "balance_loss_mlp": 1.04405439, + "epoch": 0.2025552382383887, + "flos": 21215629658880.0, + "grad_norm": 1.8860162071579365, + "language_loss": 0.77298439, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.79485393, + "num_input_tokens_seen": 72801885, + "step": 3369, + "time_per_iteration": 2.739088535308838 + }, + { + "auxiliary_loss_clip": 0.01136965, + "auxiliary_loss_mlp": 0.01071822, + "balance_loss_clip": 1.05140853, + "balance_loss_mlp": 1.05315351, + "epoch": 0.20261536149105666, + "flos": 14647388676480.0, + "grad_norm": 2.9806431283259354, + "language_loss": 0.65055734, + "learning_rate": 3.695495115253795e-06, + "loss": 0.67264521, + "num_input_tokens_seen": 72816990, + "step": 3370, + "time_per_iteration": 2.7082977294921875 + }, + { + "auxiliary_loss_clip": 0.0105828, + "auxiliary_loss_mlp": 0.01019528, + "balance_loss_clip": 1.03235602, + "balance_loss_mlp": 1.01690567, + "epoch": 0.20267548474372463, + "flos": 66783649921920.0, + "grad_norm": 0.678414814309544, + "language_loss": 0.58126765, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.60204571, + "num_input_tokens_seen": 72879240, + "step": 3371, + "time_per_iteration": 4.805691242218018 + }, + { + "auxiliary_loss_clip": 0.01117624, + "auxiliary_loss_mlp": 0.01050757, + "balance_loss_clip": 1.04833245, + "balance_loss_mlp": 1.0329231, + "epoch": 0.2027356079963926, + "flos": 24680793674880.0, + "grad_norm": 2.167047343870177, + "language_loss": 0.91830015, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.9399839, + "num_input_tokens_seen": 72899030, + "step": 3372, + "time_per_iteration": 4.306687831878662 + }, + { + "auxiliary_loss_clip": 0.01137734, + "auxiliary_loss_mlp": 0.01057192, + "balance_loss_clip": 1.05065978, + "balance_loss_mlp": 1.03598428, + "epoch": 0.20279573124906058, + "flos": 26392762765440.0, + "grad_norm": 2.1240220719821195, + "language_loss": 0.78505349, + "learning_rate": 3.694875114631167e-06, + "loss": 0.80700278, + "num_input_tokens_seen": 72919190, + "step": 3373, + "time_per_iteration": 4.223219394683838 + }, + { + "auxiliary_loss_clip": 0.01091396, + "auxiliary_loss_mlp": 0.01058555, + "balance_loss_clip": 1.04464257, + "balance_loss_mlp": 1.03719246, + "epoch": 0.20285585450172855, + "flos": 33799984692480.0, + "grad_norm": 2.5403716567908745, + "language_loss": 0.71275264, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.7342521, + "num_input_tokens_seen": 72939720, + "step": 3374, + "time_per_iteration": 2.853079319000244 + }, + { + "auxiliary_loss_clip": 0.01042818, + "auxiliary_loss_mlp": 0.01010518, + "balance_loss_clip": 1.02580416, + "balance_loss_mlp": 1.00797904, + "epoch": 0.20291597775439651, + "flos": 71164823598720.0, + "grad_norm": 0.9711663240936556, + "language_loss": 0.62466931, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64520264, + "num_input_tokens_seen": 73000015, + "step": 3375, + "time_per_iteration": 3.2016799449920654 + }, + { + "auxiliary_loss_clip": 0.01153133, + "auxiliary_loss_mlp": 0.01048539, + "balance_loss_clip": 1.05278802, + "balance_loss_mlp": 1.03021622, + "epoch": 0.20297610100706448, + "flos": 19494287118720.0, + "grad_norm": 1.613636998778186, + "language_loss": 0.82316196, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.84517872, + "num_input_tokens_seen": 73017675, + "step": 3376, + "time_per_iteration": 2.6073458194732666 + }, + { + "auxiliary_loss_clip": 0.01142412, + "auxiliary_loss_mlp": 0.01038523, + "balance_loss_clip": 1.0506475, + "balance_loss_mlp": 1.01912737, + "epoch": 0.20303622425973245, + "flos": 25044245441280.0, + "grad_norm": 2.0454517065820026, + "language_loss": 0.81243992, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.83424926, + "num_input_tokens_seen": 73036135, + "step": 3377, + "time_per_iteration": 2.6802914142608643 + }, + { + "auxiliary_loss_clip": 0.01127133, + "auxiliary_loss_mlp": 0.01049784, + "balance_loss_clip": 1.05416846, + "balance_loss_mlp": 1.03053212, + "epoch": 0.2030963475124004, + "flos": 21979988098560.0, + "grad_norm": 1.9719049052811064, + "language_loss": 0.76726258, + "learning_rate": 3.69384049496805e-06, + "loss": 0.78903174, + "num_input_tokens_seen": 73054075, + "step": 3378, + "time_per_iteration": 2.7052531242370605 + }, + { + "auxiliary_loss_clip": 0.01087342, + "auxiliary_loss_mlp": 0.01049115, + "balance_loss_clip": 1.04531622, + "balance_loss_mlp": 1.02726364, + "epoch": 0.2031564707650684, + "flos": 19500392430720.0, + "grad_norm": 2.0079998756584017, + "language_loss": 0.7982831, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.81964767, + "num_input_tokens_seen": 73073530, + "step": 3379, + "time_per_iteration": 4.379331588745117 + }, + { + "auxiliary_loss_clip": 0.01139431, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.05384874, + "balance_loss_mlp": 1.02164412, + "epoch": 0.20321659401773637, + "flos": 22747075971840.0, + "grad_norm": 1.5868581768713355, + "language_loss": 0.86639273, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.88817787, + "num_input_tokens_seen": 73092820, + "step": 3380, + "time_per_iteration": 2.7405402660369873 + }, + { + "auxiliary_loss_clip": 0.01156702, + "auxiliary_loss_mlp": 0.01053775, + "balance_loss_clip": 1.05730438, + "balance_loss_mlp": 1.03507149, + "epoch": 0.20327671727040433, + "flos": 22455840499200.0, + "grad_norm": 2.063467458189152, + "language_loss": 0.74637043, + "learning_rate": 3.693218952340186e-06, + "loss": 0.76847517, + "num_input_tokens_seen": 73113385, + "step": 3381, + "time_per_iteration": 2.6237549781799316 + }, + { + "auxiliary_loss_clip": 0.01118794, + "auxiliary_loss_mlp": 0.01042351, + "balance_loss_clip": 1.04590273, + "balance_loss_mlp": 1.02289653, + "epoch": 0.2033368405230723, + "flos": 19535010163200.0, + "grad_norm": 1.6994666268173182, + "language_loss": 0.79167414, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.81328559, + "num_input_tokens_seen": 73131195, + "step": 3382, + "time_per_iteration": 2.6707420349121094 + }, + { + "auxiliary_loss_clip": 0.01113758, + "auxiliary_loss_mlp": 0.00779415, + "balance_loss_clip": 1.0459373, + "balance_loss_mlp": 1.00091934, + "epoch": 0.20339696377574026, + "flos": 13809233744640.0, + "grad_norm": 1.9483404178521286, + "language_loss": 0.8042953, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.82322699, + "num_input_tokens_seen": 73148850, + "step": 3383, + "time_per_iteration": 2.7859487533569336 + }, + { + "auxiliary_loss_clip": 0.01100731, + "auxiliary_loss_mlp": 0.01046151, + "balance_loss_clip": 1.04473877, + "balance_loss_mlp": 1.02621913, + "epoch": 0.20345708702840823, + "flos": 20339409288960.0, + "grad_norm": 3.0507793260875693, + "language_loss": 0.74539214, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.76686096, + "num_input_tokens_seen": 73166775, + "step": 3384, + "time_per_iteration": 2.802645206451416 + }, + { + "auxiliary_loss_clip": 0.0114772, + "auxiliary_loss_mlp": 0.01042851, + "balance_loss_clip": 1.05207324, + "balance_loss_mlp": 1.02232289, + "epoch": 0.2035172102810762, + "flos": 20333950421760.0, + "grad_norm": 7.661095363155204, + "language_loss": 0.76801658, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.7899223, + "num_input_tokens_seen": 73183215, + "step": 3385, + "time_per_iteration": 2.823343515396118 + }, + { + "auxiliary_loss_clip": 0.01107407, + "auxiliary_loss_mlp": 0.01063941, + "balance_loss_clip": 1.04730904, + "balance_loss_mlp": 1.04331779, + "epoch": 0.2035773335337442, + "flos": 23330983461120.0, + "grad_norm": 41.05937457193927, + "language_loss": 0.68458641, + "learning_rate": 3.692181763924639e-06, + "loss": 0.70629984, + "num_input_tokens_seen": 73203290, + "step": 3386, + "time_per_iteration": 2.830810546875 + }, + { + "auxiliary_loss_clip": 0.01104248, + "auxiliary_loss_mlp": 0.01064893, + "balance_loss_clip": 1.04774165, + "balance_loss_mlp": 1.04379284, + "epoch": 0.20363745678641215, + "flos": 28330287310080.0, + "grad_norm": 3.4161658794101384, + "language_loss": 0.80985248, + "learning_rate": 3.691974133706947e-06, + "loss": 0.83154386, + "num_input_tokens_seen": 73226185, + "step": 3387, + "time_per_iteration": 2.8204662799835205 + }, + { + "auxiliary_loss_clip": 0.0112504, + "auxiliary_loss_mlp": 0.01049361, + "balance_loss_clip": 1.05224109, + "balance_loss_mlp": 1.03000104, + "epoch": 0.20369758003908012, + "flos": 18915658928640.0, + "grad_norm": 2.703878094865874, + "language_loss": 0.7988956, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.82063961, + "num_input_tokens_seen": 73243300, + "step": 3388, + "time_per_iteration": 2.687053918838501 + }, + { + "auxiliary_loss_clip": 0.01157403, + "auxiliary_loss_mlp": 0.01048089, + "balance_loss_clip": 1.05471182, + "balance_loss_mlp": 1.0281812, + "epoch": 0.20375770329174808, + "flos": 19206499351680.0, + "grad_norm": 1.8133180655285324, + "language_loss": 0.7184962, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.74055111, + "num_input_tokens_seen": 73261490, + "step": 3389, + "time_per_iteration": 2.614321708679199 + }, + { + "auxiliary_loss_clip": 0.01141855, + "auxiliary_loss_mlp": 0.01054311, + "balance_loss_clip": 1.05387521, + "balance_loss_mlp": 1.0351541, + "epoch": 0.20381782654441605, + "flos": 19391008538880.0, + "grad_norm": 1.8982692343761227, + "language_loss": 0.87280858, + "learning_rate": 3.691350858126404e-06, + "loss": 0.89477026, + "num_input_tokens_seen": 73280180, + "step": 3390, + "time_per_iteration": 2.6770312786102295 + }, + { + "auxiliary_loss_clip": 0.01125093, + "auxiliary_loss_mlp": 0.01052498, + "balance_loss_clip": 1.05142403, + "balance_loss_mlp": 1.03129053, + "epoch": 0.203877949797084, + "flos": 24827704300800.0, + "grad_norm": 2.3308941901233355, + "language_loss": 0.71194077, + "learning_rate": 3.691142971316662e-06, + "loss": 0.73371667, + "num_input_tokens_seen": 73300680, + "step": 3391, + "time_per_iteration": 2.7198221683502197 + }, + { + "auxiliary_loss_clip": 0.01120121, + "auxiliary_loss_mlp": 0.01051383, + "balance_loss_clip": 1.05222178, + "balance_loss_mlp": 1.0318923, + "epoch": 0.20393807304975198, + "flos": 18003707504640.0, + "grad_norm": 2.4765720957839217, + "language_loss": 0.86745828, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.88917333, + "num_input_tokens_seen": 73316760, + "step": 3392, + "time_per_iteration": 2.6961052417755127 + }, + { + "auxiliary_loss_clip": 0.01145712, + "auxiliary_loss_mlp": 0.01051212, + "balance_loss_clip": 1.05204964, + "balance_loss_mlp": 1.03236461, + "epoch": 0.20399819630241997, + "flos": 24206988349440.0, + "grad_norm": 1.665333238668028, + "language_loss": 0.80659354, + "learning_rate": 3.69072700532013e-06, + "loss": 0.82856286, + "num_input_tokens_seen": 73339385, + "step": 3393, + "time_per_iteration": 2.6883490085601807 + }, + { + "auxiliary_loss_clip": 0.01123025, + "auxiliary_loss_mlp": 0.010424, + "balance_loss_clip": 1.04751348, + "balance_loss_mlp": 1.02385163, + "epoch": 0.20405831955508794, + "flos": 20777124424320.0, + "grad_norm": 1.8745864895680615, + "language_loss": 0.86126244, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.88291663, + "num_input_tokens_seen": 73357235, + "step": 3394, + "time_per_iteration": 2.758887767791748 + }, + { + "auxiliary_loss_clip": 0.0114219, + "auxiliary_loss_mlp": 0.01049288, + "balance_loss_clip": 1.05699492, + "balance_loss_mlp": 1.03088212, + "epoch": 0.2041184428077559, + "flos": 15486908325120.0, + "grad_norm": 2.5133342949273416, + "language_loss": 0.83761692, + "learning_rate": 3.69031078287345e-06, + "loss": 0.85953164, + "num_input_tokens_seen": 73374435, + "step": 3395, + "time_per_iteration": 2.6468729972839355 + }, + { + "auxiliary_loss_clip": 0.01145796, + "auxiliary_loss_mlp": 0.01039804, + "balance_loss_clip": 1.05311751, + "balance_loss_mlp": 1.0200156, + "epoch": 0.20417856606042387, + "flos": 15588463052160.0, + "grad_norm": 2.8477422591662376, + "language_loss": 0.83736277, + "learning_rate": 3.690102575501033e-06, + "loss": 0.85921878, + "num_input_tokens_seen": 73391025, + "step": 3396, + "time_per_iteration": 2.6296958923339844 + }, + { + "auxiliary_loss_clip": 0.01112843, + "auxiliary_loss_mlp": 0.01045334, + "balance_loss_clip": 1.04787922, + "balance_loss_mlp": 1.02616525, + "epoch": 0.20423868931309183, + "flos": 24279348297600.0, + "grad_norm": 2.1192113228666303, + "language_loss": 0.77199841, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.79358017, + "num_input_tokens_seen": 73409270, + "step": 3397, + "time_per_iteration": 2.776784896850586 + }, + { + "auxiliary_loss_clip": 0.01128614, + "auxiliary_loss_mlp": 0.01050131, + "balance_loss_clip": 1.05143905, + "balance_loss_mlp": 1.03264332, + "epoch": 0.2042988125657598, + "flos": 18614870438400.0, + "grad_norm": 3.16091809956727, + "language_loss": 0.8791461, + "learning_rate": 3.689685968497518e-06, + "loss": 0.9009335, + "num_input_tokens_seen": 73425225, + "step": 3398, + "time_per_iteration": 2.6866374015808105 + }, + { + "auxiliary_loss_clip": 0.01126796, + "auxiliary_loss_mlp": 0.01052169, + "balance_loss_clip": 1.05476117, + "balance_loss_mlp": 1.03316689, + "epoch": 0.2043589358184278, + "flos": 17851230270720.0, + "grad_norm": 2.139785862197821, + "language_loss": 0.78045064, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.80224031, + "num_input_tokens_seen": 73440940, + "step": 3399, + "time_per_iteration": 2.6545825004577637 + }, + { + "auxiliary_loss_clip": 0.01144155, + "auxiliary_loss_mlp": 0.01042424, + "balance_loss_clip": 1.05252838, + "balance_loss_mlp": 1.02299261, + "epoch": 0.20441905907109575, + "flos": 21435223455360.0, + "grad_norm": 3.6374157446104802, + "language_loss": 0.76563728, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.787503, + "num_input_tokens_seen": 73458805, + "step": 3400, + "time_per_iteration": 2.7279481887817383 + }, + { + "auxiliary_loss_clip": 0.01121071, + "auxiliary_loss_mlp": 0.00776799, + "balance_loss_clip": 1.05304742, + "balance_loss_mlp": 1.00072634, + "epoch": 0.20447918232376372, + "flos": 27707703851520.0, + "grad_norm": 1.8758513970592474, + "language_loss": 0.79382575, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.81280446, + "num_input_tokens_seen": 73479380, + "step": 3401, + "time_per_iteration": 2.7918031215667725 + }, + { + "auxiliary_loss_clip": 0.01131319, + "auxiliary_loss_mlp": 0.01044892, + "balance_loss_clip": 1.0484674, + "balance_loss_mlp": 1.02540183, + "epoch": 0.20453930557643168, + "flos": 30524214113280.0, + "grad_norm": 2.2159471948141034, + "language_loss": 0.69798994, + "learning_rate": 3.688851985676991e-06, + "loss": 0.71975207, + "num_input_tokens_seen": 73505105, + "step": 3402, + "time_per_iteration": 2.79670786857605 + }, + { + "auxiliary_loss_clip": 0.01120554, + "auxiliary_loss_mlp": 0.01043946, + "balance_loss_clip": 1.05060196, + "balance_loss_mlp": 1.02439535, + "epoch": 0.20459942882909965, + "flos": 18987767481600.0, + "grad_norm": 1.7908768446457861, + "language_loss": 0.81114817, + "learning_rate": 3.688643329848496e-06, + "loss": 0.83279312, + "num_input_tokens_seen": 73523700, + "step": 3403, + "time_per_iteration": 2.70182728767395 + }, + { + "auxiliary_loss_clip": 0.01144248, + "auxiliary_loss_mlp": 0.01041199, + "balance_loss_clip": 1.05348516, + "balance_loss_mlp": 1.02295971, + "epoch": 0.20465955208176762, + "flos": 20339050152960.0, + "grad_norm": 2.511955552730785, + "language_loss": 0.83403814, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.8558926, + "num_input_tokens_seen": 73542625, + "step": 3404, + "time_per_iteration": 2.630807399749756 + }, + { + "auxiliary_loss_clip": 0.01138937, + "auxiliary_loss_mlp": 0.01048101, + "balance_loss_clip": 1.04838705, + "balance_loss_mlp": 1.0292058, + "epoch": 0.20471967533443558, + "flos": 21251288885760.0, + "grad_norm": 1.7149716538767368, + "language_loss": 0.86209136, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.88396174, + "num_input_tokens_seen": 73561450, + "step": 3405, + "time_per_iteration": 2.6076929569244385 + }, + { + "auxiliary_loss_clip": 0.01116224, + "auxiliary_loss_mlp": 0.0104429, + "balance_loss_clip": 1.05039132, + "balance_loss_mlp": 1.02621806, + "epoch": 0.20477979858710357, + "flos": 14501555458560.0, + "grad_norm": 2.1633598971137435, + "language_loss": 0.84356105, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.86516619, + "num_input_tokens_seen": 73577155, + "step": 3406, + "time_per_iteration": 2.768890142440796 + }, + { + "auxiliary_loss_clip": 0.01152751, + "auxiliary_loss_mlp": 0.01039548, + "balance_loss_clip": 1.0542599, + "balance_loss_mlp": 1.02191663, + "epoch": 0.20483992183977154, + "flos": 11400310085760.0, + "grad_norm": 2.4892039461455675, + "language_loss": 0.67453218, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.69645512, + "num_input_tokens_seen": 73594900, + "step": 3407, + "time_per_iteration": 2.5661377906799316 + }, + { + "auxiliary_loss_clip": 0.0115175, + "auxiliary_loss_mlp": 0.01050505, + "balance_loss_clip": 1.05328465, + "balance_loss_mlp": 1.03294516, + "epoch": 0.2049000450924395, + "flos": 19060271084160.0, + "grad_norm": 2.4363182538361285, + "language_loss": 0.84214294, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.86416554, + "num_input_tokens_seen": 73613810, + "step": 3408, + "time_per_iteration": 2.585186004638672 + }, + { + "auxiliary_loss_clip": 0.01154901, + "auxiliary_loss_mlp": 0.01042295, + "balance_loss_clip": 1.0536257, + "balance_loss_mlp": 1.02471161, + "epoch": 0.20496016834510747, + "flos": 14574561851520.0, + "grad_norm": 2.317815935455145, + "language_loss": 0.63898516, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.6609571, + "num_input_tokens_seen": 73631495, + "step": 3409, + "time_per_iteration": 2.5877959728240967 + }, + { + "auxiliary_loss_clip": 0.0113795, + "auxiliary_loss_mlp": 0.01042481, + "balance_loss_clip": 1.04903567, + "balance_loss_mlp": 1.02409852, + "epoch": 0.20502029159777543, + "flos": 22126647329280.0, + "grad_norm": 1.3925959707869588, + "language_loss": 0.80547982, + "learning_rate": 3.687180946553745e-06, + "loss": 0.8272841, + "num_input_tokens_seen": 73652840, + "step": 3410, + "time_per_iteration": 4.1697752475738525 + }, + { + "auxiliary_loss_clip": 0.01099823, + "auxiliary_loss_mlp": 0.01046015, + "balance_loss_clip": 1.05186486, + "balance_loss_mlp": 1.02820492, + "epoch": 0.2050804148504434, + "flos": 25367907916800.0, + "grad_norm": 2.407452066099965, + "language_loss": 0.75804615, + "learning_rate": 3.686971778678803e-06, + "loss": 0.77950454, + "num_input_tokens_seen": 73672150, + "step": 3411, + "time_per_iteration": 2.8072102069854736 + }, + { + "auxiliary_loss_clip": 0.0113879, + "auxiliary_loss_mlp": 0.01046868, + "balance_loss_clip": 1.05501246, + "balance_loss_mlp": 1.02887905, + "epoch": 0.2051405381031114, + "flos": 23620171858560.0, + "grad_norm": 2.4936494073109445, + "language_loss": 0.73356283, + "learning_rate": 3.686762546833722e-06, + "loss": 0.75541937, + "num_input_tokens_seen": 73691940, + "step": 3412, + "time_per_iteration": 5.778446912765503 + }, + { + "auxiliary_loss_clip": 0.01127692, + "auxiliary_loss_mlp": 0.01057937, + "balance_loss_clip": 1.04926813, + "balance_loss_mlp": 1.03748107, + "epoch": 0.20520066135577936, + "flos": 19565533745280.0, + "grad_norm": 2.3541654180764353, + "language_loss": 0.77958596, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.80144227, + "num_input_tokens_seen": 73709080, + "step": 3413, + "time_per_iteration": 2.6457245349884033 + }, + { + "auxiliary_loss_clip": 0.0110869, + "auxiliary_loss_mlp": 0.01047866, + "balance_loss_clip": 1.04991519, + "balance_loss_mlp": 1.02862608, + "epoch": 0.20526078460844732, + "flos": 17676345928320.0, + "grad_norm": 2.4834314093653673, + "language_loss": 0.85112405, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.8726896, + "num_input_tokens_seen": 73727670, + "step": 3414, + "time_per_iteration": 2.7343668937683105 + }, + { + "auxiliary_loss_clip": 0.01140219, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.05012155, + "balance_loss_mlp": 1.02118468, + "epoch": 0.2053209078611153, + "flos": 21500328856320.0, + "grad_norm": 2.0410772094937433, + "language_loss": 0.80372798, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.82552463, + "num_input_tokens_seen": 73747170, + "step": 3415, + "time_per_iteration": 2.6669082641601562 + }, + { + "auxiliary_loss_clip": 0.01087022, + "auxiliary_loss_mlp": 0.01042771, + "balance_loss_clip": 1.04786301, + "balance_loss_mlp": 1.02643943, + "epoch": 0.20538103111378325, + "flos": 25663524848640.0, + "grad_norm": 1.941742032659622, + "language_loss": 0.72958827, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.75088626, + "num_input_tokens_seen": 73767690, + "step": 3416, + "time_per_iteration": 2.892782211303711 + }, + { + "auxiliary_loss_clip": 0.01145149, + "auxiliary_loss_mlp": 0.01044328, + "balance_loss_clip": 1.05453372, + "balance_loss_mlp": 1.02577877, + "epoch": 0.20544115436645122, + "flos": 23148952312320.0, + "grad_norm": 2.508583707985938, + "language_loss": 0.78741407, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.80930889, + "num_input_tokens_seen": 73786900, + "step": 3417, + "time_per_iteration": 2.7298929691314697 + }, + { + "auxiliary_loss_clip": 0.01145459, + "auxiliary_loss_mlp": 0.0104683, + "balance_loss_clip": 1.0536468, + "balance_loss_mlp": 1.02819777, + "epoch": 0.20550127761911918, + "flos": 19390433921280.0, + "grad_norm": 2.4305498920504043, + "language_loss": 0.8729043, + "learning_rate": 3.685505812834798e-06, + "loss": 0.89482725, + "num_input_tokens_seen": 73804515, + "step": 3418, + "time_per_iteration": 4.382033109664917 + }, + { + "auxiliary_loss_clip": 0.01140182, + "auxiliary_loss_mlp": 0.01046543, + "balance_loss_clip": 1.05682349, + "balance_loss_mlp": 1.02776778, + "epoch": 0.20556140087178718, + "flos": 22893124671360.0, + "grad_norm": 14.690715253896212, + "language_loss": 0.62538671, + "learning_rate": 3.685296133421035e-06, + "loss": 0.64725399, + "num_input_tokens_seen": 73822910, + "step": 3419, + "time_per_iteration": 2.7318668365478516 + }, + { + "auxiliary_loss_clip": 0.01139691, + "auxiliary_loss_mlp": 0.01046928, + "balance_loss_clip": 1.05550981, + "balance_loss_mlp": 1.02651954, + "epoch": 0.20562152412445514, + "flos": 19789652655360.0, + "grad_norm": 1.8153871521224594, + "language_loss": 0.86339438, + "learning_rate": 3.685086390100674e-06, + "loss": 0.88526058, + "num_input_tokens_seen": 73841160, + "step": 3420, + "time_per_iteration": 2.723606824874878 + }, + { + "auxiliary_loss_clip": 0.01104401, + "auxiliary_loss_mlp": 0.00780617, + "balance_loss_clip": 1.04621911, + "balance_loss_mlp": 1.00071514, + "epoch": 0.2056816473771231, + "flos": 31501989210240.0, + "grad_norm": 2.3982854973621954, + "language_loss": 0.7127136, + "learning_rate": 3.684876582881668e-06, + "loss": 0.73156381, + "num_input_tokens_seen": 73862795, + "step": 3421, + "time_per_iteration": 2.8138315677642822 + }, + { + "auxiliary_loss_clip": 0.01153254, + "auxiliary_loss_mlp": 0.01039984, + "balance_loss_clip": 1.05382609, + "balance_loss_mlp": 1.02160168, + "epoch": 0.20574177062979107, + "flos": 23258372117760.0, + "grad_norm": 6.231519820465981, + "language_loss": 0.70559299, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.72752541, + "num_input_tokens_seen": 73881525, + "step": 3422, + "time_per_iteration": 2.6411848068237305 + }, + { + "auxiliary_loss_clip": 0.01062123, + "auxiliary_loss_mlp": 0.01005097, + "balance_loss_clip": 1.03459418, + "balance_loss_mlp": 1.00220013, + "epoch": 0.20580189388245904, + "flos": 70312518708480.0, + "grad_norm": 0.740118932422812, + "language_loss": 0.55461621, + "learning_rate": 3.684456776779548e-06, + "loss": 0.57528841, + "num_input_tokens_seen": 73937775, + "step": 3423, + "time_per_iteration": 3.259685516357422 + }, + { + "auxiliary_loss_clip": 0.01104389, + "auxiliary_loss_mlp": 0.01039296, + "balance_loss_clip": 1.04975653, + "balance_loss_mlp": 1.02089024, + "epoch": 0.205862017135127, + "flos": 30737846252160.0, + "grad_norm": 1.9242047681435088, + "language_loss": 0.71910381, + "learning_rate": 3.684246777912353e-06, + "loss": 0.74054068, + "num_input_tokens_seen": 73958250, + "step": 3424, + "time_per_iteration": 2.800283432006836 + }, + { + "auxiliary_loss_clip": 0.01125916, + "auxiliary_loss_mlp": 0.00777945, + "balance_loss_clip": 1.05704927, + "balance_loss_mlp": 1.00086677, + "epoch": 0.20592214038779497, + "flos": 21324546673920.0, + "grad_norm": 1.6235965502825092, + "language_loss": 0.74980927, + "learning_rate": 3.684036715178351e-06, + "loss": 0.76884782, + "num_input_tokens_seen": 73977775, + "step": 3425, + "time_per_iteration": 2.751030206680298 + }, + { + "auxiliary_loss_clip": 0.01104665, + "auxiliary_loss_mlp": 0.01058685, + "balance_loss_clip": 1.05047321, + "balance_loss_mlp": 1.03983784, + "epoch": 0.20598226364046296, + "flos": 22891652213760.0, + "grad_norm": 1.7765616723027935, + "language_loss": 0.87936616, + "learning_rate": 3.683826588585508e-06, + "loss": 0.90099961, + "num_input_tokens_seen": 73996590, + "step": 3426, + "time_per_iteration": 2.8539180755615234 + }, + { + "auxiliary_loss_clip": 0.01144422, + "auxiliary_loss_mlp": 0.01045493, + "balance_loss_clip": 1.05773449, + "balance_loss_mlp": 1.0281601, + "epoch": 0.20604238689313092, + "flos": 23878549365120.0, + "grad_norm": 1.836530467647624, + "language_loss": 0.76435733, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.78625643, + "num_input_tokens_seen": 74015935, + "step": 3427, + "time_per_iteration": 2.7024967670440674 + }, + { + "auxiliary_loss_clip": 0.01159387, + "auxiliary_loss_mlp": 0.01050023, + "balance_loss_clip": 1.0577209, + "balance_loss_mlp": 1.03185558, + "epoch": 0.2061025101457989, + "flos": 22491535639680.0, + "grad_norm": 2.7350574840199964, + "language_loss": 0.74176943, + "learning_rate": 3.683406143855174e-06, + "loss": 0.76386356, + "num_input_tokens_seen": 74036575, + "step": 3428, + "time_per_iteration": 2.593151569366455 + }, + { + "auxiliary_loss_clip": 0.01132797, + "auxiliary_loss_mlp": 0.01046534, + "balance_loss_clip": 1.05232322, + "balance_loss_mlp": 1.0274843, + "epoch": 0.20616263339846685, + "flos": 22778928357120.0, + "grad_norm": 3.829070534376961, + "language_loss": 0.73316109, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.75495446, + "num_input_tokens_seen": 74055365, + "step": 3429, + "time_per_iteration": 2.7357261180877686 + }, + { + "auxiliary_loss_clip": 0.01144108, + "auxiliary_loss_mlp": 0.01049081, + "balance_loss_clip": 1.05838966, + "balance_loss_mlp": 1.03030515, + "epoch": 0.20622275665113482, + "flos": 20882198684160.0, + "grad_norm": 2.201354934958512, + "language_loss": 0.85586745, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.87779927, + "num_input_tokens_seen": 74074875, + "step": 3430, + "time_per_iteration": 2.658486843109131 + }, + { + "auxiliary_loss_clip": 0.01088509, + "auxiliary_loss_mlp": 0.01053254, + "balance_loss_clip": 1.04814601, + "balance_loss_mlp": 1.03387105, + "epoch": 0.20628287990380278, + "flos": 19354415558400.0, + "grad_norm": 1.8292569880077065, + "language_loss": 0.68859613, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.71001375, + "num_input_tokens_seen": 74094505, + "step": 3431, + "time_per_iteration": 2.811061143875122 + }, + { + "auxiliary_loss_clip": 0.01027012, + "auxiliary_loss_mlp": 0.01012446, + "balance_loss_clip": 1.03099978, + "balance_loss_mlp": 1.00976419, + "epoch": 0.20634300315647078, + "flos": 71517932248320.0, + "grad_norm": 0.8066063325789609, + "language_loss": 0.60172188, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.62211645, + "num_input_tokens_seen": 74158500, + "step": 3432, + "time_per_iteration": 3.415828227996826 + }, + { + "auxiliary_loss_clip": 0.01146488, + "auxiliary_loss_mlp": 0.01044703, + "balance_loss_clip": 1.0583806, + "balance_loss_mlp": 1.02669072, + "epoch": 0.20640312640913874, + "flos": 21723944976000.0, + "grad_norm": 2.5535613418278116, + "language_loss": 0.72622889, + "learning_rate": 3.682353915057679e-06, + "loss": 0.74814081, + "num_input_tokens_seen": 74176685, + "step": 3433, + "time_per_iteration": 2.715195655822754 + }, + { + "auxiliary_loss_clip": 0.0109694, + "auxiliary_loss_mlp": 0.01050867, + "balance_loss_clip": 1.04781306, + "balance_loss_mlp": 1.03019655, + "epoch": 0.2064632496618067, + "flos": 20554621626240.0, + "grad_norm": 2.096486283687917, + "language_loss": 0.87233114, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.8938092, + "num_input_tokens_seen": 74194935, + "step": 3434, + "time_per_iteration": 2.7781460285186768 + }, + { + "auxiliary_loss_clip": 0.01151381, + "auxiliary_loss_mlp": 0.01045497, + "balance_loss_clip": 1.05561388, + "balance_loss_mlp": 1.02719867, + "epoch": 0.20652337291447467, + "flos": 29823273135360.0, + "grad_norm": 1.7621185839090663, + "language_loss": 0.69533503, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.71730381, + "num_input_tokens_seen": 74215400, + "step": 3435, + "time_per_iteration": 2.7425992488861084 + }, + { + "auxiliary_loss_clip": 0.01127853, + "auxiliary_loss_mlp": 0.01045604, + "balance_loss_clip": 1.05583, + "balance_loss_mlp": 1.02672172, + "epoch": 0.20658349616714264, + "flos": 26213640618240.0, + "grad_norm": 30.077934868422773, + "language_loss": 0.89116997, + "learning_rate": 3.681721812174988e-06, + "loss": 0.91290456, + "num_input_tokens_seen": 74234090, + "step": 3436, + "time_per_iteration": 2.7460577487945557 + }, + { + "auxiliary_loss_clip": 0.01118033, + "auxiliary_loss_mlp": 0.01041557, + "balance_loss_clip": 1.05178559, + "balance_loss_mlp": 1.02168477, + "epoch": 0.2066436194198106, + "flos": 25994370044160.0, + "grad_norm": 1.7370712778981523, + "language_loss": 0.77330887, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.79490477, + "num_input_tokens_seen": 74253345, + "step": 3437, + "time_per_iteration": 2.7507588863372803 + }, + { + "auxiliary_loss_clip": 0.01144607, + "auxiliary_loss_mlp": 0.01040376, + "balance_loss_clip": 1.05298507, + "balance_loss_mlp": 1.02323389, + "epoch": 0.20670374267247857, + "flos": 21361067827200.0, + "grad_norm": 1.8326742989814773, + "language_loss": 0.77813125, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.799981, + "num_input_tokens_seen": 74271615, + "step": 3438, + "time_per_iteration": 2.7624385356903076 + }, + { + "auxiliary_loss_clip": 0.01063811, + "auxiliary_loss_mlp": 0.01002308, + "balance_loss_clip": 1.03603387, + "balance_loss_mlp": 0.9995541, + "epoch": 0.20676386592514656, + "flos": 66383281952640.0, + "grad_norm": 0.8298524953876073, + "language_loss": 0.67093015, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.69159138, + "num_input_tokens_seen": 74331390, + "step": 3439, + "time_per_iteration": 3.2026216983795166 + }, + { + "auxiliary_loss_clip": 0.01148913, + "auxiliary_loss_mlp": 0.01041213, + "balance_loss_clip": 1.05590546, + "balance_loss_mlp": 1.02299786, + "epoch": 0.20682398917781453, + "flos": 17274577328640.0, + "grad_norm": 1.9537104709510729, + "language_loss": 0.83907467, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.86097592, + "num_input_tokens_seen": 74347335, + "step": 3440, + "time_per_iteration": 2.6949758529663086 + }, + { + "auxiliary_loss_clip": 0.01147739, + "auxiliary_loss_mlp": 0.01041939, + "balance_loss_clip": 1.05509627, + "balance_loss_mlp": 1.02458239, + "epoch": 0.2068841124304825, + "flos": 18077288515200.0, + "grad_norm": 1.8008884636634683, + "language_loss": 0.84828413, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.8701809, + "num_input_tokens_seen": 74366310, + "step": 3441, + "time_per_iteration": 2.6440463066101074 + }, + { + "auxiliary_loss_clip": 0.01110175, + "auxiliary_loss_mlp": 0.01048552, + "balance_loss_clip": 1.05599904, + "balance_loss_mlp": 1.03050399, + "epoch": 0.20694423568315046, + "flos": 27347017432320.0, + "grad_norm": 1.7415147413468661, + "language_loss": 0.85854685, + "learning_rate": 3.680455884806959e-06, + "loss": 0.88013411, + "num_input_tokens_seen": 74387100, + "step": 3442, + "time_per_iteration": 2.8222689628601074 + }, + { + "auxiliary_loss_clip": 0.01078025, + "auxiliary_loss_mlp": 0.01050799, + "balance_loss_clip": 1.05186844, + "balance_loss_mlp": 1.03095019, + "epoch": 0.20700435893581842, + "flos": 20229845829120.0, + "grad_norm": 1.9775081815037283, + "language_loss": 0.73038852, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.75167674, + "num_input_tokens_seen": 74404460, + "step": 3443, + "time_per_iteration": 2.8044140338897705 + }, + { + "auxiliary_loss_clip": 0.01127625, + "auxiliary_loss_mlp": 0.00776303, + "balance_loss_clip": 1.05408895, + "balance_loss_mlp": 1.00079513, + "epoch": 0.2070644821884864, + "flos": 20631111638400.0, + "grad_norm": 1.84636320729986, + "language_loss": 0.85586846, + "learning_rate": 3.680033399147797e-06, + "loss": 0.87490773, + "num_input_tokens_seen": 74423790, + "step": 3444, + "time_per_iteration": 2.7582647800445557 + }, + { + "auxiliary_loss_clip": 0.01036759, + "auxiliary_loss_mlp": 0.01007145, + "balance_loss_clip": 1.03905272, + "balance_loss_mlp": 1.0042963, + "epoch": 0.20712460544115438, + "flos": 65941077617280.0, + "grad_norm": 0.6999396122177431, + "language_loss": 0.57092249, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.59136152, + "num_input_tokens_seen": 74488130, + "step": 3445, + "time_per_iteration": 3.249602794647217 + }, + { + "auxiliary_loss_clip": 0.01152738, + "auxiliary_loss_mlp": 0.00776634, + "balance_loss_clip": 1.0538106, + "balance_loss_mlp": 1.00088191, + "epoch": 0.20718472869382235, + "flos": 19425734012160.0, + "grad_norm": 1.6453630130444594, + "language_loss": 0.78469276, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.80398649, + "num_input_tokens_seen": 74506720, + "step": 3446, + "time_per_iteration": 2.6341898441314697 + }, + { + "auxiliary_loss_clip": 0.01151445, + "auxiliary_loss_mlp": 0.01043774, + "balance_loss_clip": 1.05439711, + "balance_loss_mlp": 1.02297151, + "epoch": 0.2072448519464903, + "flos": 24499049834880.0, + "grad_norm": 2.013256457797304, + "language_loss": 0.63031304, + "learning_rate": 3.679399192876334e-06, + "loss": 0.65226525, + "num_input_tokens_seen": 74525330, + "step": 3447, + "time_per_iteration": 2.6912922859191895 + }, + { + "auxiliary_loss_clip": 0.01103828, + "auxiliary_loss_mlp": 0.01058453, + "balance_loss_clip": 1.04668319, + "balance_loss_mlp": 1.03828287, + "epoch": 0.20730497519915828, + "flos": 23075694524160.0, + "grad_norm": 1.7423220349735584, + "language_loss": 0.86291325, + "learning_rate": 3.679187663409184e-06, + "loss": 0.88453603, + "num_input_tokens_seen": 74544535, + "step": 3448, + "time_per_iteration": 2.787576675415039 + }, + { + "auxiliary_loss_clip": 0.01128629, + "auxiliary_loss_mlp": 0.01045151, + "balance_loss_clip": 1.049932, + "balance_loss_mlp": 1.02556467, + "epoch": 0.20736509845182624, + "flos": 21069042255360.0, + "grad_norm": 3.8253504349982044, + "language_loss": 0.75264204, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.77437979, + "num_input_tokens_seen": 74562300, + "step": 3449, + "time_per_iteration": 4.354467391967773 + }, + { + "auxiliary_loss_clip": 0.01141162, + "auxiliary_loss_mlp": 0.01050212, + "balance_loss_clip": 1.0534308, + "balance_loss_mlp": 1.03073323, + "epoch": 0.2074252217044942, + "flos": 17633288499840.0, + "grad_norm": 2.156163289660715, + "language_loss": 0.76558924, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.787503, + "num_input_tokens_seen": 74580080, + "step": 3450, + "time_per_iteration": 2.7020533084869385 + }, + { + "auxiliary_loss_clip": 0.01128554, + "auxiliary_loss_mlp": 0.01044182, + "balance_loss_clip": 1.05234683, + "balance_loss_mlp": 1.02522802, + "epoch": 0.20748534495716217, + "flos": 23546985897600.0, + "grad_norm": 1.6446708221415856, + "language_loss": 0.82074821, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.84247565, + "num_input_tokens_seen": 74598980, + "step": 3451, + "time_per_iteration": 2.7753186225891113 + }, + { + "auxiliary_loss_clip": 0.01064426, + "auxiliary_loss_mlp": 0.01003577, + "balance_loss_clip": 1.02722275, + "balance_loss_mlp": 1.00099015, + "epoch": 0.20754546820983016, + "flos": 52252935598080.0, + "grad_norm": 0.793594031040259, + "language_loss": 0.56562752, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.58630753, + "num_input_tokens_seen": 74655275, + "step": 3452, + "time_per_iteration": 6.257205963134766 + }, + { + "auxiliary_loss_clip": 0.01124123, + "auxiliary_loss_mlp": 0.00776806, + "balance_loss_clip": 1.05206704, + "balance_loss_mlp": 1.0008918, + "epoch": 0.20760559146249813, + "flos": 20412379768320.0, + "grad_norm": 2.245823129763223, + "language_loss": 0.88341558, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.90242493, + "num_input_tokens_seen": 74674560, + "step": 3453, + "time_per_iteration": 2.7009050846099854 + }, + { + "auxiliary_loss_clip": 0.01146287, + "auxiliary_loss_mlp": 0.01044217, + "balance_loss_clip": 1.05471313, + "balance_loss_mlp": 1.02521539, + "epoch": 0.2076657147151661, + "flos": 23186012169600.0, + "grad_norm": 2.2325669459725574, + "language_loss": 0.79920429, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.82110935, + "num_input_tokens_seen": 74694500, + "step": 3454, + "time_per_iteration": 2.7080893516540527 + }, + { + "auxiliary_loss_clip": 0.01104984, + "auxiliary_loss_mlp": 0.00777717, + "balance_loss_clip": 1.04356718, + "balance_loss_mlp": 1.0007751, + "epoch": 0.20772583796783406, + "flos": 18293219124480.0, + "grad_norm": 3.601668384502942, + "language_loss": 0.76601356, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.78484058, + "num_input_tokens_seen": 74710485, + "step": 3455, + "time_per_iteration": 2.6733248233795166 + }, + { + "auxiliary_loss_clip": 0.01115407, + "auxiliary_loss_mlp": 0.01050321, + "balance_loss_clip": 1.04759336, + "balance_loss_mlp": 1.0326066, + "epoch": 0.20778596122050202, + "flos": 17602800831360.0, + "grad_norm": 1.908671081537558, + "language_loss": 0.80200219, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.82365942, + "num_input_tokens_seen": 74727450, + "step": 3456, + "time_per_iteration": 2.6950278282165527 + }, + { + "auxiliary_loss_clip": 0.01112832, + "auxiliary_loss_mlp": 0.00777675, + "balance_loss_clip": 1.05166578, + "balance_loss_mlp": 1.00099969, + "epoch": 0.20784608447317, + "flos": 23805578885760.0, + "grad_norm": 2.135320694722552, + "language_loss": 0.78070557, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.79961067, + "num_input_tokens_seen": 74746725, + "step": 3457, + "time_per_iteration": 4.381137132644653 + }, + { + "auxiliary_loss_clip": 0.01082177, + "auxiliary_loss_mlp": 0.01058291, + "balance_loss_clip": 1.04310393, + "balance_loss_mlp": 1.03651094, + "epoch": 0.20790620772583795, + "flos": 17639286071040.0, + "grad_norm": 1.7652855773158553, + "language_loss": 0.8360287, + "learning_rate": 3.677068867939333e-06, + "loss": 0.85743344, + "num_input_tokens_seen": 74765255, + "step": 3458, + "time_per_iteration": 2.7332653999328613 + }, + { + "auxiliary_loss_clip": 0.01140275, + "auxiliary_loss_mlp": 0.0077698, + "balance_loss_clip": 1.05156302, + "balance_loss_mlp": 1.00095606, + "epoch": 0.20796633097850595, + "flos": 27673481168640.0, + "grad_norm": 11.883071119862361, + "language_loss": 0.75769317, + "learning_rate": 3.676856638489272e-06, + "loss": 0.77686572, + "num_input_tokens_seen": 74785710, + "step": 3459, + "time_per_iteration": 2.705026626586914 + }, + { + "auxiliary_loss_clip": 0.01089168, + "auxiliary_loss_mlp": 0.01038825, + "balance_loss_clip": 1.04769015, + "balance_loss_mlp": 1.02081251, + "epoch": 0.2080264542311739, + "flos": 19245606284160.0, + "grad_norm": 2.1071303009051428, + "language_loss": 0.77105331, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.79233319, + "num_input_tokens_seen": 74804490, + "step": 3460, + "time_per_iteration": 2.749965190887451 + }, + { + "auxiliary_loss_clip": 0.0109477, + "auxiliary_loss_mlp": 0.01047592, + "balance_loss_clip": 1.04938984, + "balance_loss_mlp": 1.02838707, + "epoch": 0.20808657748384188, + "flos": 27525924097920.0, + "grad_norm": 9.5480036120023, + "language_loss": 0.75802225, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.77944589, + "num_input_tokens_seen": 74826340, + "step": 3461, + "time_per_iteration": 2.7929086685180664 + }, + { + "auxiliary_loss_clip": 0.01124748, + "auxiliary_loss_mlp": 0.01041543, + "balance_loss_clip": 1.04610133, + "balance_loss_mlp": 1.02203989, + "epoch": 0.20814670073650984, + "flos": 26906931999360.0, + "grad_norm": 2.001927586001653, + "language_loss": 0.8848443, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.90650725, + "num_input_tokens_seen": 74844960, + "step": 3462, + "time_per_iteration": 2.7031619548797607 + }, + { + "auxiliary_loss_clip": 0.01023861, + "auxiliary_loss_mlp": 0.00757905, + "balance_loss_clip": 1.02540636, + "balance_loss_mlp": 1.00168896, + "epoch": 0.2082068239891778, + "flos": 70175735717760.0, + "grad_norm": 0.7622558664505636, + "language_loss": 0.59010452, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.6079222, + "num_input_tokens_seen": 74909075, + "step": 3463, + "time_per_iteration": 3.4111485481262207 + }, + { + "auxiliary_loss_clip": 0.01132553, + "auxiliary_loss_mlp": 0.01047591, + "balance_loss_clip": 1.04893148, + "balance_loss_mlp": 1.02866018, + "epoch": 0.20826694724184577, + "flos": 24608074590720.0, + "grad_norm": 2.6002828602708283, + "language_loss": 0.66744608, + "learning_rate": 3.675794537601429e-06, + "loss": 0.68924749, + "num_input_tokens_seen": 74928125, + "step": 3464, + "time_per_iteration": 2.718229293823242 + }, + { + "auxiliary_loss_clip": 0.0112374, + "auxiliary_loss_mlp": 0.0104712, + "balance_loss_clip": 1.05101657, + "balance_loss_mlp": 1.02755797, + "epoch": 0.20832707049451377, + "flos": 12892829034240.0, + "grad_norm": 2.9384916482598205, + "language_loss": 0.84044278, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.86215138, + "num_input_tokens_seen": 74945090, + "step": 3465, + "time_per_iteration": 2.732109546661377 + }, + { + "auxiliary_loss_clip": 0.01096712, + "auxiliary_loss_mlp": 0.01040605, + "balance_loss_clip": 1.04373813, + "balance_loss_mlp": 1.02221096, + "epoch": 0.20838719374718173, + "flos": 22198827709440.0, + "grad_norm": 2.576139197384499, + "language_loss": 0.81923312, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.84060633, + "num_input_tokens_seen": 74963630, + "step": 3466, + "time_per_iteration": 2.7758567333221436 + }, + { + "auxiliary_loss_clip": 0.01140158, + "auxiliary_loss_mlp": 0.01044188, + "balance_loss_clip": 1.05322194, + "balance_loss_mlp": 1.02787983, + "epoch": 0.2084473169998497, + "flos": 15158648908800.0, + "grad_norm": 4.780862188541671, + "language_loss": 0.82008922, + "learning_rate": 3.675156514448716e-06, + "loss": 0.84193271, + "num_input_tokens_seen": 74981875, + "step": 3467, + "time_per_iteration": 2.5788159370422363 + }, + { + "auxiliary_loss_clip": 0.01149826, + "auxiliary_loss_mlp": 0.01040027, + "balance_loss_clip": 1.05362797, + "balance_loss_mlp": 1.02265835, + "epoch": 0.20850744025251766, + "flos": 17456788045440.0, + "grad_norm": 2.009157691583003, + "language_loss": 0.82178962, + "learning_rate": 3.674943713009518e-06, + "loss": 0.84368813, + "num_input_tokens_seen": 74999155, + "step": 3468, + "time_per_iteration": 2.5874218940734863 + }, + { + "auxiliary_loss_clip": 0.01143942, + "auxiliary_loss_mlp": 0.01048537, + "balance_loss_clip": 1.05300629, + "balance_loss_mlp": 1.02774715, + "epoch": 0.20856756350518563, + "flos": 25698968593920.0, + "grad_norm": 2.0793964386868584, + "language_loss": 0.90328556, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.92521036, + "num_input_tokens_seen": 75017850, + "step": 3469, + "time_per_iteration": 2.6595447063446045 + }, + { + "auxiliary_loss_clip": 0.01125181, + "auxiliary_loss_mlp": 0.0104984, + "balance_loss_clip": 1.05548537, + "balance_loss_mlp": 1.03175592, + "epoch": 0.2086276867578536, + "flos": 37889060970240.0, + "grad_norm": 1.9058635967771913, + "language_loss": 0.76809812, + "learning_rate": 3.674517919597092e-06, + "loss": 0.78984833, + "num_input_tokens_seen": 75039270, + "step": 3470, + "time_per_iteration": 2.908046245574951 + }, + { + "auxiliary_loss_clip": 0.01133446, + "auxiliary_loss_mlp": 0.01047618, + "balance_loss_clip": 1.0551517, + "balance_loss_mlp": 1.02942634, + "epoch": 0.20868781001052156, + "flos": 25557049958400.0, + "grad_norm": 2.301093296435647, + "language_loss": 0.75801277, + "learning_rate": 3.674304927640011e-06, + "loss": 0.77982342, + "num_input_tokens_seen": 75059350, + "step": 3471, + "time_per_iteration": 2.713533401489258 + }, + { + "auxiliary_loss_clip": 0.01123818, + "auxiliary_loss_mlp": 0.01053513, + "balance_loss_clip": 1.04961812, + "balance_loss_mlp": 1.03384328, + "epoch": 0.20874793326318955, + "flos": 27529192235520.0, + "grad_norm": 2.366290140730035, + "language_loss": 0.75703716, + "learning_rate": 3.67409187219312e-06, + "loss": 0.77881044, + "num_input_tokens_seen": 75080150, + "step": 3472, + "time_per_iteration": 2.785034656524658 + }, + { + "auxiliary_loss_clip": 0.01140589, + "auxiliary_loss_mlp": 0.01046494, + "balance_loss_clip": 1.05084538, + "balance_loss_mlp": 1.02854145, + "epoch": 0.20880805651585752, + "flos": 18548795370240.0, + "grad_norm": 7.277377921302429, + "language_loss": 0.84276807, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.86463886, + "num_input_tokens_seen": 75097920, + "step": 3473, + "time_per_iteration": 2.6236281394958496 + }, + { + "auxiliary_loss_clip": 0.01057043, + "auxiliary_loss_mlp": 0.01037704, + "balance_loss_clip": 1.05363917, + "balance_loss_mlp": 1.03434241, + "epoch": 0.20886817976852548, + "flos": 65946644225280.0, + "grad_norm": 0.9045809123115837, + "language_loss": 0.63652557, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.65747303, + "num_input_tokens_seen": 75152410, + "step": 3474, + "time_per_iteration": 3.1946537494659424 + }, + { + "auxiliary_loss_clip": 0.0113535, + "auxiliary_loss_mlp": 0.01045984, + "balance_loss_clip": 1.05276895, + "balance_loss_mlp": 1.02782845, + "epoch": 0.20892830302119345, + "flos": 36539178929280.0, + "grad_norm": 3.2311626254468795, + "language_loss": 0.69970965, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.72152305, + "num_input_tokens_seen": 75173265, + "step": 3475, + "time_per_iteration": 2.7967529296875 + }, + { + "auxiliary_loss_clip": 0.01158022, + "auxiliary_loss_mlp": 0.01046944, + "balance_loss_clip": 1.05606794, + "balance_loss_mlp": 1.02862167, + "epoch": 0.2089884262738614, + "flos": 20956749361920.0, + "grad_norm": 1.9789108228051473, + "language_loss": 0.70372891, + "learning_rate": 3.673239015669065e-06, + "loss": 0.72577858, + "num_input_tokens_seen": 75193640, + "step": 3476, + "time_per_iteration": 2.629687786102295 + }, + { + "auxiliary_loss_clip": 0.01131765, + "auxiliary_loss_mlp": 0.01045236, + "balance_loss_clip": 1.05439556, + "balance_loss_mlp": 1.02722347, + "epoch": 0.20904854952652938, + "flos": 22784028088320.0, + "grad_norm": 2.3868812434184603, + "language_loss": 0.89227062, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.91404068, + "num_input_tokens_seen": 75212545, + "step": 3477, + "time_per_iteration": 2.7574357986450195 + }, + { + "auxiliary_loss_clip": 0.01092922, + "auxiliary_loss_mlp": 0.01046119, + "balance_loss_clip": 1.045825, + "balance_loss_mlp": 1.02737951, + "epoch": 0.20910867277919734, + "flos": 27303277645440.0, + "grad_norm": 2.6092415644893814, + "language_loss": 0.67816859, + "learning_rate": 3.672812206678344e-06, + "loss": 0.69955903, + "num_input_tokens_seen": 75230865, + "step": 3478, + "time_per_iteration": 2.7929017543792725 + }, + { + "auxiliary_loss_clip": 0.01094689, + "auxiliary_loss_mlp": 0.01042766, + "balance_loss_clip": 1.04024661, + "balance_loss_mlp": 1.02308464, + "epoch": 0.20916879603186533, + "flos": 14319237000960.0, + "grad_norm": 4.056245481336458, + "language_loss": 0.84239435, + "learning_rate": 3.672598707029127e-06, + "loss": 0.86376888, + "num_input_tokens_seen": 75248285, + "step": 3479, + "time_per_iteration": 2.743544816970825 + }, + { + "auxiliary_loss_clip": 0.01111533, + "auxiliary_loss_mlp": 0.01050991, + "balance_loss_clip": 1.04863191, + "balance_loss_mlp": 1.03028417, + "epoch": 0.2092289192845333, + "flos": 22273019251200.0, + "grad_norm": 9.599906344578406, + "language_loss": 0.74294043, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.76456571, + "num_input_tokens_seen": 75266310, + "step": 3480, + "time_per_iteration": 2.7278034687042236 + }, + { + "auxiliary_loss_clip": 0.01107791, + "auxiliary_loss_mlp": 0.01038756, + "balance_loss_clip": 1.04748154, + "balance_loss_mlp": 1.02226901, + "epoch": 0.20928904253720126, + "flos": 14830712714880.0, + "grad_norm": 2.178942595840573, + "language_loss": 0.75664043, + "learning_rate": 3.67217151746346e-06, + "loss": 0.77810597, + "num_input_tokens_seen": 75284175, + "step": 3481, + "time_per_iteration": 2.71073842048645 + }, + { + "auxiliary_loss_clip": 0.01090021, + "auxiliary_loss_mlp": 0.01046234, + "balance_loss_clip": 1.04561555, + "balance_loss_mlp": 1.02727938, + "epoch": 0.20934916578986923, + "flos": 23259162216960.0, + "grad_norm": 1.816378391984801, + "language_loss": 0.8517971, + "learning_rate": 3.671957827563209e-06, + "loss": 0.87315965, + "num_input_tokens_seen": 75303465, + "step": 3482, + "time_per_iteration": 2.8777174949645996 + }, + { + "auxiliary_loss_clip": 0.01099298, + "auxiliary_loss_mlp": 0.01046228, + "balance_loss_clip": 1.05039477, + "balance_loss_mlp": 1.02817941, + "epoch": 0.2094092890425372, + "flos": 32014398677760.0, + "grad_norm": 1.802490425012806, + "language_loss": 0.70550174, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.72695696, + "num_input_tokens_seen": 75325290, + "step": 3483, + "time_per_iteration": 2.8599836826324463 + }, + { + "auxiliary_loss_clip": 0.01127333, + "auxiliary_loss_mlp": 0.01048954, + "balance_loss_clip": 1.05204535, + "balance_loss_mlp": 1.03082263, + "epoch": 0.20946941229520516, + "flos": 20010647082240.0, + "grad_norm": 1.9649551735344426, + "language_loss": 0.74867833, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.77044123, + "num_input_tokens_seen": 75343895, + "step": 3484, + "time_per_iteration": 2.655538320541382 + }, + { + "auxiliary_loss_clip": 0.01117623, + "auxiliary_loss_mlp": 0.01046902, + "balance_loss_clip": 1.0514648, + "balance_loss_mlp": 1.0274353, + "epoch": 0.20952953554787315, + "flos": 30740072895360.0, + "grad_norm": 1.6308141537991403, + "language_loss": 0.70815694, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.72980225, + "num_input_tokens_seen": 75367100, + "step": 3485, + "time_per_iteration": 2.744417667388916 + }, + { + "auxiliary_loss_clip": 0.01083098, + "auxiliary_loss_mlp": 0.00777163, + "balance_loss_clip": 1.0433619, + "balance_loss_mlp": 1.00097859, + "epoch": 0.20958965880054112, + "flos": 27049209770880.0, + "grad_norm": 2.030771632516388, + "language_loss": 0.83274543, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.85134804, + "num_input_tokens_seen": 75389925, + "step": 3486, + "time_per_iteration": 2.742042303085327 + }, + { + "auxiliary_loss_clip": 0.01140212, + "auxiliary_loss_mlp": 0.01048337, + "balance_loss_clip": 1.05242062, + "balance_loss_mlp": 1.03115916, + "epoch": 0.20964978205320908, + "flos": 34204123589760.0, + "grad_norm": 1.6926372989653347, + "language_loss": 0.87134725, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.89323276, + "num_input_tokens_seen": 75408575, + "step": 3487, + "time_per_iteration": 2.708331346511841 + }, + { + "auxiliary_loss_clip": 0.01112214, + "auxiliary_loss_mlp": 0.01041678, + "balance_loss_clip": 1.04791641, + "balance_loss_mlp": 1.0228194, + "epoch": 0.20970990530587705, + "flos": 23477391296640.0, + "grad_norm": 4.471143750410675, + "language_loss": 0.72291327, + "learning_rate": 3.670674357028504e-06, + "loss": 0.74445224, + "num_input_tokens_seen": 75427155, + "step": 3488, + "time_per_iteration": 4.250715970993042 + }, + { + "auxiliary_loss_clip": 0.01121403, + "auxiliary_loss_mlp": 0.01037296, + "balance_loss_clip": 1.05096245, + "balance_loss_mlp": 1.02014148, + "epoch": 0.209770028558545, + "flos": 18551452976640.0, + "grad_norm": 2.6694226497987437, + "language_loss": 0.79665899, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.81824595, + "num_input_tokens_seen": 75444450, + "step": 3489, + "time_per_iteration": 2.6926958560943604 + }, + { + "auxiliary_loss_clip": 0.01152639, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_clip": 1.05325401, + "balance_loss_mlp": 1.02875018, + "epoch": 0.20983015181121298, + "flos": 21617003208960.0, + "grad_norm": 2.022409198347131, + "language_loss": 0.72505707, + "learning_rate": 3.670246026613266e-06, + "loss": 0.74704129, + "num_input_tokens_seen": 75462625, + "step": 3490, + "time_per_iteration": 4.133761644363403 + }, + { + "auxiliary_loss_clip": 0.01122247, + "auxiliary_loss_mlp": 0.01050283, + "balance_loss_clip": 1.0509479, + "balance_loss_mlp": 1.03402328, + "epoch": 0.20989027506388094, + "flos": 16614718531200.0, + "grad_norm": 1.8035978449536252, + "language_loss": 0.70332754, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.72505283, + "num_input_tokens_seen": 75480640, + "step": 3491, + "time_per_iteration": 2.667243003845215 + }, + { + "auxiliary_loss_clip": 0.0113848, + "auxiliary_loss_mlp": 0.0077627, + "balance_loss_clip": 1.05017376, + "balance_loss_mlp": 1.00098944, + "epoch": 0.20995039831654894, + "flos": 23216823060480.0, + "grad_norm": 2.379943808529104, + "language_loss": 0.79751909, + "learning_rate": 3.669817442854444e-06, + "loss": 0.81666666, + "num_input_tokens_seen": 75494900, + "step": 3492, + "time_per_iteration": 4.270704984664917 + }, + { + "auxiliary_loss_clip": 0.01138825, + "auxiliary_loss_mlp": 0.00776339, + "balance_loss_clip": 1.05182219, + "balance_loss_mlp": 1.00108409, + "epoch": 0.2100105215692169, + "flos": 18147493647360.0, + "grad_norm": 2.2783194747149906, + "language_loss": 0.86987948, + "learning_rate": 3.669603055991502e-06, + "loss": 0.88903111, + "num_input_tokens_seen": 75513370, + "step": 3493, + "time_per_iteration": 2.7830448150634766 + }, + { + "auxiliary_loss_clip": 0.01110786, + "auxiliary_loss_mlp": 0.01037681, + "balance_loss_clip": 1.04520118, + "balance_loss_mlp": 1.02105093, + "epoch": 0.21007064482188487, + "flos": 15961611490560.0, + "grad_norm": 6.813030650079402, + "language_loss": 0.68622243, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.70770705, + "num_input_tokens_seen": 75532480, + "step": 3494, + "time_per_iteration": 2.8479061126708984 + }, + { + "auxiliary_loss_clip": 0.01145467, + "auxiliary_loss_mlp": 0.01037272, + "balance_loss_clip": 1.05302739, + "balance_loss_mlp": 1.01998639, + "epoch": 0.21013076807455283, + "flos": 32234315696640.0, + "grad_norm": 1.7516454579581615, + "language_loss": 0.78848761, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.81031501, + "num_input_tokens_seen": 75552745, + "step": 3495, + "time_per_iteration": 2.9313197135925293 + }, + { + "auxiliary_loss_clip": 0.01119614, + "auxiliary_loss_mlp": 0.01045108, + "balance_loss_clip": 1.04760814, + "balance_loss_mlp": 1.02708316, + "epoch": 0.2101908913272208, + "flos": 23696625957120.0, + "grad_norm": 2.1492916784611844, + "language_loss": 0.77302933, + "learning_rate": 3.668959515566116e-06, + "loss": 0.79467654, + "num_input_tokens_seen": 75574355, + "step": 3496, + "time_per_iteration": 4.467881441116333 + }, + { + "auxiliary_loss_clip": 0.01135202, + "auxiliary_loss_mlp": 0.01046618, + "balance_loss_clip": 1.05169654, + "balance_loss_mlp": 1.02839065, + "epoch": 0.21025101457988876, + "flos": 20375786787840.0, + "grad_norm": 2.146148958862047, + "language_loss": 0.82076812, + "learning_rate": 3.668744875505915e-06, + "loss": 0.8425864, + "num_input_tokens_seen": 75592215, + "step": 3497, + "time_per_iteration": 2.683037281036377 + }, + { + "auxiliary_loss_clip": 0.01144559, + "auxiliary_loss_mlp": 0.01047188, + "balance_loss_clip": 1.05445957, + "balance_loss_mlp": 1.02967596, + "epoch": 0.21031113783255675, + "flos": 25775638174080.0, + "grad_norm": 1.732381679276629, + "language_loss": 0.67239833, + "learning_rate": 3.668530172166741e-06, + "loss": 0.69431579, + "num_input_tokens_seen": 75610740, + "step": 3498, + "time_per_iteration": 2.685481548309326 + }, + { + "auxiliary_loss_clip": 0.01121255, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_clip": 1.04974794, + "balance_loss_mlp": 1.02611172, + "epoch": 0.21037126108522472, + "flos": 22018197191040.0, + "grad_norm": 1.7892967196850054, + "language_loss": 0.80832362, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.82998168, + "num_input_tokens_seen": 75631005, + "step": 3499, + "time_per_iteration": 2.744995355606079 + }, + { + "auxiliary_loss_clip": 0.01139753, + "auxiliary_loss_mlp": 0.01039729, + "balance_loss_clip": 1.05226696, + "balance_loss_mlp": 1.02312946, + "epoch": 0.21043138433789269, + "flos": 25334403505920.0, + "grad_norm": 1.6464696881852638, + "language_loss": 0.77983701, + "learning_rate": 3.668100575684043e-06, + "loss": 0.80163181, + "num_input_tokens_seen": 75650655, + "step": 3500, + "time_per_iteration": 2.7704038619995117 + }, + { + "auxiliary_loss_clip": 0.01129369, + "auxiliary_loss_mlp": 0.01042187, + "balance_loss_clip": 1.05095315, + "balance_loss_mlp": 1.02390063, + "epoch": 0.21049150759056065, + "flos": 25556654908800.0, + "grad_norm": 1.5981262394728393, + "language_loss": 0.74450207, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.76621759, + "num_input_tokens_seen": 75669895, + "step": 3501, + "time_per_iteration": 2.7066893577575684 + }, + { + "auxiliary_loss_clip": 0.01134924, + "auxiliary_loss_mlp": 0.01039556, + "balance_loss_clip": 1.04989994, + "balance_loss_mlp": 1.02227044, + "epoch": 0.21055163084322862, + "flos": 24495602129280.0, + "grad_norm": 1.6188770382514572, + "language_loss": 0.75278366, + "learning_rate": 3.667670726183183e-06, + "loss": 0.77452844, + "num_input_tokens_seen": 75689535, + "step": 3502, + "time_per_iteration": 2.724635124206543 + }, + { + "auxiliary_loss_clip": 0.01098479, + "auxiliary_loss_mlp": 0.01040924, + "balance_loss_clip": 1.04576206, + "balance_loss_mlp": 1.02248216, + "epoch": 0.21061175409589658, + "flos": 25739045193600.0, + "grad_norm": 1.9441266701933382, + "language_loss": 0.77188909, + "learning_rate": 3.667455706571316e-06, + "loss": 0.7932831, + "num_input_tokens_seen": 75709265, + "step": 3503, + "time_per_iteration": 2.7545289993286133 + }, + { + "auxiliary_loss_clip": 0.010957, + "auxiliary_loss_mlp": 0.01045911, + "balance_loss_clip": 1.04817343, + "balance_loss_mlp": 1.02478695, + "epoch": 0.21067187734856455, + "flos": 18989168112000.0, + "grad_norm": 2.256374081289255, + "language_loss": 0.78297234, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.8043884, + "num_input_tokens_seen": 75727050, + "step": 3504, + "time_per_iteration": 2.7454304695129395 + }, + { + "auxiliary_loss_clip": 0.01117408, + "auxiliary_loss_mlp": 0.01049815, + "balance_loss_clip": 1.0488404, + "balance_loss_mlp": 1.03152788, + "epoch": 0.21073200060123254, + "flos": 24681368292480.0, + "grad_norm": 1.5753219052286964, + "language_loss": 0.76731002, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.78898227, + "num_input_tokens_seen": 75747175, + "step": 3505, + "time_per_iteration": 2.7509703636169434 + }, + { + "auxiliary_loss_clip": 0.01120291, + "auxiliary_loss_mlp": 0.01052026, + "balance_loss_clip": 1.04882348, + "balance_loss_mlp": 1.03383446, + "epoch": 0.2107921238539005, + "flos": 28549342402560.0, + "grad_norm": 1.9938386598136906, + "language_loss": 0.63933277, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.66105598, + "num_input_tokens_seen": 75767690, + "step": 3506, + "time_per_iteration": 2.773611545562744 + }, + { + "auxiliary_loss_clip": 0.01138444, + "auxiliary_loss_mlp": 0.01050655, + "balance_loss_clip": 1.05078697, + "balance_loss_mlp": 1.03257108, + "epoch": 0.21085224710656847, + "flos": 25885848078720.0, + "grad_norm": 2.170999698474249, + "language_loss": 0.82010436, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.84199536, + "num_input_tokens_seen": 75787255, + "step": 3507, + "time_per_iteration": 2.6604206562042236 + }, + { + "auxiliary_loss_clip": 0.01136754, + "auxiliary_loss_mlp": 0.01043314, + "balance_loss_clip": 1.04972744, + "balance_loss_mlp": 1.02472949, + "epoch": 0.21091237035923643, + "flos": 14976294537600.0, + "grad_norm": 2.0519706535557414, + "language_loss": 0.75213134, + "learning_rate": 3.666379660223824e-06, + "loss": 0.77393204, + "num_input_tokens_seen": 75805890, + "step": 3508, + "time_per_iteration": 2.7164604663848877 + }, + { + "auxiliary_loss_clip": 0.01154655, + "auxiliary_loss_mlp": 0.01036811, + "balance_loss_clip": 1.05263913, + "balance_loss_mlp": 1.01894128, + "epoch": 0.2109724936119044, + "flos": 16362518163840.0, + "grad_norm": 3.4182125548434112, + "language_loss": 0.84984946, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.87176406, + "num_input_tokens_seen": 75821620, + "step": 3509, + "time_per_iteration": 2.661743402481079 + }, + { + "auxiliary_loss_clip": 0.01120944, + "auxiliary_loss_mlp": 0.01044014, + "balance_loss_clip": 1.05299115, + "balance_loss_mlp": 1.02443957, + "epoch": 0.21103261686457236, + "flos": 31502492000640.0, + "grad_norm": 2.210880078691599, + "language_loss": 0.68125075, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.70290035, + "num_input_tokens_seen": 75842490, + "step": 3510, + "time_per_iteration": 2.7881460189819336 + }, + { + "auxiliary_loss_clip": 0.01152569, + "auxiliary_loss_mlp": 0.01046993, + "balance_loss_clip": 1.05026078, + "balance_loss_mlp": 1.02892137, + "epoch": 0.21109274011724033, + "flos": 27344072517120.0, + "grad_norm": 1.958863999940011, + "language_loss": 0.72639364, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.74838924, + "num_input_tokens_seen": 75865985, + "step": 3511, + "time_per_iteration": 2.6942689418792725 + }, + { + "auxiliary_loss_clip": 0.01066393, + "auxiliary_loss_mlp": 0.01041278, + "balance_loss_clip": 1.04279399, + "balance_loss_mlp": 1.0208931, + "epoch": 0.21115286336990832, + "flos": 17820383466240.0, + "grad_norm": 3.2801391377369686, + "language_loss": 0.69354337, + "learning_rate": 3.665517685689794e-06, + "loss": 0.71462011, + "num_input_tokens_seen": 75882745, + "step": 3512, + "time_per_iteration": 2.8260998725891113 + }, + { + "auxiliary_loss_clip": 0.01140043, + "auxiliary_loss_mlp": 0.01050555, + "balance_loss_clip": 1.04943943, + "balance_loss_mlp": 1.03082585, + "epoch": 0.2112129866225763, + "flos": 27197987904000.0, + "grad_norm": 2.072678482519775, + "language_loss": 0.73145646, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.75336242, + "num_input_tokens_seen": 75904305, + "step": 3513, + "time_per_iteration": 2.9639391899108887 + }, + { + "auxiliary_loss_clip": 0.01121964, + "auxiliary_loss_mlp": 0.01038325, + "balance_loss_clip": 1.04785061, + "balance_loss_mlp": 1.02089679, + "epoch": 0.21127310987524425, + "flos": 23731279603200.0, + "grad_norm": 2.0322171916220086, + "language_loss": 0.74422491, + "learning_rate": 3.665086319450502e-06, + "loss": 0.76582778, + "num_input_tokens_seen": 75923710, + "step": 3514, + "time_per_iteration": 2.7379143238067627 + }, + { + "auxiliary_loss_clip": 0.01136944, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.05334568, + "balance_loss_mlp": 1.01941383, + "epoch": 0.21133323312791222, + "flos": 18332505624960.0, + "grad_norm": 2.431934297389972, + "language_loss": 0.76738697, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.78913867, + "num_input_tokens_seen": 75942625, + "step": 3515, + "time_per_iteration": 2.6339287757873535 + }, + { + "auxiliary_loss_clip": 0.011289, + "auxiliary_loss_mlp": 0.01047482, + "balance_loss_clip": 1.05247736, + "balance_loss_mlp": 1.0288614, + "epoch": 0.21139335638058018, + "flos": 17931203902080.0, + "grad_norm": 2.7460645413082756, + "language_loss": 0.68756706, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.70933092, + "num_input_tokens_seen": 75959930, + "step": 3516, + "time_per_iteration": 2.6489672660827637 + }, + { + "auxiliary_loss_clip": 0.01118182, + "auxiliary_loss_mlp": 0.01049447, + "balance_loss_clip": 1.05634522, + "balance_loss_mlp": 1.03045666, + "epoch": 0.21145347963324815, + "flos": 24572092141440.0, + "grad_norm": 1.8368744753927078, + "language_loss": 0.85010064, + "learning_rate": 3.664438796560225e-06, + "loss": 0.87177694, + "num_input_tokens_seen": 75980335, + "step": 3517, + "time_per_iteration": 2.745887279510498 + }, + { + "auxiliary_loss_clip": 0.01125904, + "auxiliary_loss_mlp": 0.01042813, + "balance_loss_clip": 1.04719234, + "balance_loss_mlp": 1.02506244, + "epoch": 0.21151360288591614, + "flos": 35845959375360.0, + "grad_norm": 2.246330970109572, + "language_loss": 0.63672101, + "learning_rate": 3.664222829354512e-06, + "loss": 0.65840822, + "num_input_tokens_seen": 76002095, + "step": 3518, + "time_per_iteration": 2.7990219593048096 + }, + { + "auxiliary_loss_clip": 0.01089367, + "auxiliary_loss_mlp": 0.01057733, + "balance_loss_clip": 1.05040181, + "balance_loss_mlp": 1.04001832, + "epoch": 0.2115737261385841, + "flos": 24641579001600.0, + "grad_norm": 2.1349107177710875, + "language_loss": 0.89256221, + "learning_rate": 3.664006799041303e-06, + "loss": 0.91403317, + "num_input_tokens_seen": 76020425, + "step": 3519, + "time_per_iteration": 2.8022944927215576 + }, + { + "auxiliary_loss_clip": 0.01135146, + "auxiliary_loss_mlp": 0.01049587, + "balance_loss_clip": 1.05320001, + "balance_loss_mlp": 1.03140712, + "epoch": 0.21163384939125207, + "flos": 25226887121280.0, + "grad_norm": 1.8050755180524396, + "language_loss": 0.81235015, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.8341974, + "num_input_tokens_seen": 76041210, + "step": 3520, + "time_per_iteration": 2.750988245010376 + }, + { + "auxiliary_loss_clip": 0.01124406, + "auxiliary_loss_mlp": 0.01048631, + "balance_loss_clip": 1.05111551, + "balance_loss_mlp": 1.03095269, + "epoch": 0.21169397264392004, + "flos": 26067520091520.0, + "grad_norm": 1.92815865975435, + "language_loss": 0.76254267, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.78427303, + "num_input_tokens_seen": 76062685, + "step": 3521, + "time_per_iteration": 2.7965810298919678 + }, + { + "auxiliary_loss_clip": 0.0109789, + "auxiliary_loss_mlp": 0.01044794, + "balance_loss_clip": 1.04872918, + "balance_loss_mlp": 1.02841413, + "epoch": 0.211754095896588, + "flos": 23108265181440.0, + "grad_norm": 2.0270933567011302, + "language_loss": 0.75752926, + "learning_rate": 3.663358329538626e-06, + "loss": 0.77895606, + "num_input_tokens_seen": 76082300, + "step": 3522, + "time_per_iteration": 2.8280131816864014 + }, + { + "auxiliary_loss_clip": 0.01153324, + "auxiliary_loss_mlp": 0.01053431, + "balance_loss_clip": 1.05353725, + "balance_loss_mlp": 1.03541851, + "epoch": 0.21181421914925597, + "flos": 27922341571200.0, + "grad_norm": 1.8399634756194385, + "language_loss": 0.70481133, + "learning_rate": 3.663142046877374e-06, + "loss": 0.72687888, + "num_input_tokens_seen": 76101135, + "step": 3523, + "time_per_iteration": 2.6909022331237793 + }, + { + "auxiliary_loss_clip": 0.01139749, + "auxiliary_loss_mlp": 0.01054127, + "balance_loss_clip": 1.05166054, + "balance_loss_mlp": 1.03619766, + "epoch": 0.21187434240192393, + "flos": 17128636369920.0, + "grad_norm": 2.455264594190525, + "language_loss": 0.77290082, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.7948395, + "num_input_tokens_seen": 76119320, + "step": 3524, + "time_per_iteration": 2.6844334602355957 + }, + { + "auxiliary_loss_clip": 0.01132697, + "auxiliary_loss_mlp": 0.0104457, + "balance_loss_clip": 1.05066419, + "balance_loss_mlp": 1.02621162, + "epoch": 0.21193446565459192, + "flos": 22347318533760.0, + "grad_norm": 1.841652047976503, + "language_loss": 0.81680572, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.83857846, + "num_input_tokens_seen": 76137445, + "step": 3525, + "time_per_iteration": 2.71073842048645 + }, + { + "auxiliary_loss_clip": 0.01088536, + "auxiliary_loss_mlp": 0.01041509, + "balance_loss_clip": 1.04158318, + "balance_loss_mlp": 1.02353263, + "epoch": 0.2119945889072599, + "flos": 27199316707200.0, + "grad_norm": 1.867957043941215, + "language_loss": 0.75627208, + "learning_rate": 3.662492820527356e-06, + "loss": 0.77757257, + "num_input_tokens_seen": 76159500, + "step": 3526, + "time_per_iteration": 2.973966598510742 + }, + { + "auxiliary_loss_clip": 0.0115455, + "auxiliary_loss_mlp": 0.01041027, + "balance_loss_clip": 1.05324817, + "balance_loss_mlp": 1.023229, + "epoch": 0.21205471215992786, + "flos": 20991869884800.0, + "grad_norm": 1.8230643924086412, + "language_loss": 0.77070421, + "learning_rate": 3.662276285649284e-06, + "loss": 0.79265994, + "num_input_tokens_seen": 76177990, + "step": 3527, + "time_per_iteration": 2.648961067199707 + }, + { + "auxiliary_loss_clip": 0.01151081, + "auxiliary_loss_mlp": 0.0104874, + "balance_loss_clip": 1.05143785, + "balance_loss_mlp": 1.02977419, + "epoch": 0.21211483541259582, + "flos": 20777663128320.0, + "grad_norm": 2.807984733302778, + "language_loss": 0.7815178, + "learning_rate": 3.662059687737528e-06, + "loss": 0.80351603, + "num_input_tokens_seen": 76197125, + "step": 3528, + "time_per_iteration": 4.401185989379883 + }, + { + "auxiliary_loss_clip": 0.01135768, + "auxiliary_loss_mlp": 0.01045736, + "balance_loss_clip": 1.04889631, + "balance_loss_mlp": 1.02817655, + "epoch": 0.21217495866526379, + "flos": 18989994124800.0, + "grad_norm": 2.1271435469609257, + "language_loss": 0.8128866, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.8347016, + "num_input_tokens_seen": 76216215, + "step": 3529, + "time_per_iteration": 4.309772968292236 + }, + { + "auxiliary_loss_clip": 0.0113319, + "auxiliary_loss_mlp": 0.00777373, + "balance_loss_clip": 1.04967499, + "balance_loss_mlp": 1.00112891, + "epoch": 0.21223508191793175, + "flos": 20667309569280.0, + "grad_norm": 1.9704727824538568, + "language_loss": 0.76427567, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.78338128, + "num_input_tokens_seen": 76237010, + "step": 3530, + "time_per_iteration": 2.7592365741729736 + }, + { + "auxiliary_loss_clip": 0.0115078, + "auxiliary_loss_mlp": 0.01047067, + "balance_loss_clip": 1.0522244, + "balance_loss_mlp": 1.02990103, + "epoch": 0.21229520517059972, + "flos": 21616464504960.0, + "grad_norm": 2.1154933827202274, + "language_loss": 0.82973897, + "learning_rate": 3.661409515882308e-06, + "loss": 0.85171747, + "num_input_tokens_seen": 76255965, + "step": 3531, + "time_per_iteration": 4.168981313705444 + }, + { + "auxiliary_loss_clip": 0.01120152, + "auxiliary_loss_mlp": 0.01042697, + "balance_loss_clip": 1.04767489, + "balance_loss_mlp": 1.02313459, + "epoch": 0.2123553284232677, + "flos": 13991049411840.0, + "grad_norm": 2.335526210972433, + "language_loss": 0.73087364, + "learning_rate": 3.661192665917977e-06, + "loss": 0.75250214, + "num_input_tokens_seen": 76272150, + "step": 3532, + "time_per_iteration": 2.6797189712524414 + }, + { + "auxiliary_loss_clip": 0.01126693, + "auxiliary_loss_mlp": 0.01041409, + "balance_loss_clip": 1.0539782, + "balance_loss_mlp": 1.02269292, + "epoch": 0.21241545167593567, + "flos": 18296774570880.0, + "grad_norm": 6.22254473074881, + "language_loss": 0.74268675, + "learning_rate": 3.660975752961054e-06, + "loss": 0.76436776, + "num_input_tokens_seen": 76291425, + "step": 3533, + "time_per_iteration": 2.741152048110962 + }, + { + "auxiliary_loss_clip": 0.01146682, + "auxiliary_loss_mlp": 0.0104438, + "balance_loss_clip": 1.05342829, + "balance_loss_mlp": 1.0265224, + "epoch": 0.21247557492860364, + "flos": 34713121265280.0, + "grad_norm": 2.0406923816018714, + "language_loss": 0.70889592, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.73080653, + "num_input_tokens_seen": 76313975, + "step": 3534, + "time_per_iteration": 2.8210513591766357 + }, + { + "auxiliary_loss_clip": 0.01133157, + "auxiliary_loss_mlp": 0.01043651, + "balance_loss_clip": 1.05234385, + "balance_loss_mlp": 1.02463722, + "epoch": 0.2125356981812716, + "flos": 22053820504320.0, + "grad_norm": 2.102271516852891, + "language_loss": 0.71675557, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.73852366, + "num_input_tokens_seen": 76330955, + "step": 3535, + "time_per_iteration": 2.804506540298462 + }, + { + "auxiliary_loss_clip": 0.01137461, + "auxiliary_loss_mlp": 0.01053804, + "balance_loss_clip": 1.05108476, + "balance_loss_mlp": 1.03607774, + "epoch": 0.21259582143393957, + "flos": 28548336821760.0, + "grad_norm": 24.01704513629389, + "language_loss": 0.70639503, + "learning_rate": 3.660324636216996e-06, + "loss": 0.72830772, + "num_input_tokens_seen": 76352680, + "step": 3536, + "time_per_iteration": 4.442729473114014 + }, + { + "auxiliary_loss_clip": 0.011554, + "auxiliary_loss_mlp": 0.01049939, + "balance_loss_clip": 1.05231214, + "balance_loss_mlp": 1.03082991, + "epoch": 0.21265594468660753, + "flos": 20120892900480.0, + "grad_norm": 2.2527167001205806, + "language_loss": 0.8784188, + "learning_rate": 3.660107471371981e-06, + "loss": 0.90047216, + "num_input_tokens_seen": 76370750, + "step": 3537, + "time_per_iteration": 2.6365723609924316 + }, + { + "auxiliary_loss_clip": 0.01137536, + "auxiliary_loss_mlp": 0.00776226, + "balance_loss_clip": 1.04911351, + "balance_loss_mlp": 1.00101614, + "epoch": 0.21271606793927553, + "flos": 23076161400960.0, + "grad_norm": 1.8080285651248438, + "language_loss": 0.80480909, + "learning_rate": 3.659890243575524e-06, + "loss": 0.82394671, + "num_input_tokens_seen": 76390610, + "step": 3538, + "time_per_iteration": 2.7403554916381836 + }, + { + "auxiliary_loss_clip": 0.01080631, + "auxiliary_loss_mlp": 0.0105169, + "balance_loss_clip": 1.04171312, + "balance_loss_mlp": 1.03219926, + "epoch": 0.2127761911919435, + "flos": 26388201738240.0, + "grad_norm": 2.705287390300715, + "language_loss": 0.86691839, + "learning_rate": 3.659672952835863e-06, + "loss": 0.88824159, + "num_input_tokens_seen": 76408860, + "step": 3539, + "time_per_iteration": 2.8177876472473145 + }, + { + "auxiliary_loss_clip": 0.01120184, + "auxiliary_loss_mlp": 0.01047424, + "balance_loss_clip": 1.04577422, + "balance_loss_mlp": 1.0295074, + "epoch": 0.21283631444461146, + "flos": 20228265630720.0, + "grad_norm": 5.212413836862573, + "language_loss": 0.57756186, + "learning_rate": 3.659455599161237e-06, + "loss": 0.59923792, + "num_input_tokens_seen": 76424980, + "step": 3540, + "time_per_iteration": 2.786552667617798 + }, + { + "auxiliary_loss_clip": 0.01154193, + "auxiliary_loss_mlp": 0.010403, + "balance_loss_clip": 1.05276537, + "balance_loss_mlp": 1.02131045, + "epoch": 0.21289643769727942, + "flos": 13516992691200.0, + "grad_norm": 2.318388810062464, + "language_loss": 0.76114893, + "learning_rate": 3.659238182559888e-06, + "loss": 0.78309381, + "num_input_tokens_seen": 76443135, + "step": 3541, + "time_per_iteration": 2.646207332611084 + }, + { + "auxiliary_loss_clip": 0.01108241, + "auxiliary_loss_mlp": 0.01044876, + "balance_loss_clip": 1.0464325, + "balance_loss_mlp": 1.02676797, + "epoch": 0.2129565609499474, + "flos": 24827021942400.0, + "grad_norm": 3.508596736579257, + "language_loss": 0.69749588, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.71902704, + "num_input_tokens_seen": 76462470, + "step": 3542, + "time_per_iteration": 2.746612787246704 + }, + { + "auxiliary_loss_clip": 0.01149445, + "auxiliary_loss_mlp": 0.01038617, + "balance_loss_clip": 1.05146265, + "balance_loss_mlp": 1.02160525, + "epoch": 0.21301668420261535, + "flos": 23659242877440.0, + "grad_norm": 2.3488794859192397, + "language_loss": 0.75651306, + "learning_rate": 3.658803160610004e-06, + "loss": 0.77839369, + "num_input_tokens_seen": 76481995, + "step": 3543, + "time_per_iteration": 2.665900230407715 + }, + { + "auxiliary_loss_clip": 0.0112855, + "auxiliary_loss_mlp": 0.01042048, + "balance_loss_clip": 1.05257249, + "balance_loss_mlp": 1.02409506, + "epoch": 0.21307680745528332, + "flos": 16362805472640.0, + "grad_norm": 1.8076409354305347, + "language_loss": 0.66981912, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.6915251, + "num_input_tokens_seen": 76500245, + "step": 3544, + "time_per_iteration": 2.6692638397216797 + }, + { + "auxiliary_loss_clip": 0.01121216, + "auxiliary_loss_mlp": 0.01046395, + "balance_loss_clip": 1.0480237, + "balance_loss_mlp": 1.02897835, + "epoch": 0.2131369307079513, + "flos": 19099054794240.0, + "grad_norm": 1.8644107460894377, + "language_loss": 0.70977402, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.73145014, + "num_input_tokens_seen": 76519535, + "step": 3545, + "time_per_iteration": 2.686939001083374 + }, + { + "auxiliary_loss_clip": 0.01128605, + "auxiliary_loss_mlp": 0.01048325, + "balance_loss_clip": 1.05368018, + "balance_loss_mlp": 1.0300498, + "epoch": 0.21319705396061928, + "flos": 30372275583360.0, + "grad_norm": 1.8809403827144264, + "language_loss": 0.72329843, + "learning_rate": 3.658150155940946e-06, + "loss": 0.74506772, + "num_input_tokens_seen": 76542065, + "step": 3546, + "time_per_iteration": 2.8044040203094482 + }, + { + "auxiliary_loss_clip": 0.01115103, + "auxiliary_loss_mlp": 0.01050245, + "balance_loss_clip": 1.0539, + "balance_loss_mlp": 1.03250647, + "epoch": 0.21325717721328724, + "flos": 21756192410880.0, + "grad_norm": 3.48585993087404, + "language_loss": 0.80431038, + "learning_rate": 3.657932361952479e-06, + "loss": 0.82596385, + "num_input_tokens_seen": 76560540, + "step": 3547, + "time_per_iteration": 2.7981739044189453 + }, + { + "auxiliary_loss_clip": 0.01154388, + "auxiliary_loss_mlp": 0.01045355, + "balance_loss_clip": 1.05115056, + "balance_loss_mlp": 1.02685428, + "epoch": 0.2133173004659552, + "flos": 28730870760960.0, + "grad_norm": 2.460294966859189, + "language_loss": 0.7449761, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.7669735, + "num_input_tokens_seen": 76581760, + "step": 3548, + "time_per_iteration": 2.709476947784424 + }, + { + "auxiliary_loss_clip": 0.01117193, + "auxiliary_loss_mlp": 0.01059153, + "balance_loss_clip": 1.05099797, + "balance_loss_mlp": 1.03938842, + "epoch": 0.21337742371862317, + "flos": 16837077674880.0, + "grad_norm": 2.783715227630402, + "language_loss": 0.74218595, + "learning_rate": 3.657496585376922e-06, + "loss": 0.76394939, + "num_input_tokens_seen": 76599940, + "step": 3549, + "time_per_iteration": 2.751401662826538 + }, + { + "auxiliary_loss_clip": 0.01121431, + "auxiliary_loss_mlp": 0.01050546, + "balance_loss_clip": 1.05331278, + "balance_loss_mlp": 1.03283179, + "epoch": 0.21343754697129114, + "flos": 24424930120320.0, + "grad_norm": 1.8583266555890872, + "language_loss": 0.80719978, + "learning_rate": 3.657278602806357e-06, + "loss": 0.82891953, + "num_input_tokens_seen": 76619580, + "step": 3550, + "time_per_iteration": 2.74678373336792 + }, + { + "auxiliary_loss_clip": 0.01151996, + "auxiliary_loss_mlp": 0.01048347, + "balance_loss_clip": 1.05428052, + "balance_loss_mlp": 1.03147876, + "epoch": 0.21349767022395913, + "flos": 19277817805440.0, + "grad_norm": 1.7548210279469212, + "language_loss": 0.88234103, + "learning_rate": 3.657060557391621e-06, + "loss": 0.90434444, + "num_input_tokens_seen": 76638195, + "step": 3551, + "time_per_iteration": 2.746938705444336 + }, + { + "auxiliary_loss_clip": 0.01151269, + "auxiliary_loss_mlp": 0.01048306, + "balance_loss_clip": 1.05139017, + "balance_loss_mlp": 1.03111625, + "epoch": 0.2135577934766271, + "flos": 17347547808000.0, + "grad_norm": 1.8976063035050816, + "language_loss": 0.83877259, + "learning_rate": 3.656842449140983e-06, + "loss": 0.86076838, + "num_input_tokens_seen": 76656695, + "step": 3552, + "time_per_iteration": 2.616567373275757 + }, + { + "auxiliary_loss_clip": 0.0113626, + "auxiliary_loss_mlp": 0.01050705, + "balance_loss_clip": 1.04937124, + "balance_loss_mlp": 1.0325495, + "epoch": 0.21361791672929506, + "flos": 24057204635520.0, + "grad_norm": 2.604872460919843, + "language_loss": 0.76370007, + "learning_rate": 3.656624278062713e-06, + "loss": 0.78556973, + "num_input_tokens_seen": 76677430, + "step": 3553, + "time_per_iteration": 2.730829954147339 + }, + { + "auxiliary_loss_clip": 0.01142267, + "auxiliary_loss_mlp": 0.01046102, + "balance_loss_clip": 1.05434144, + "balance_loss_mlp": 1.02915072, + "epoch": 0.21367803998196302, + "flos": 22162306556160.0, + "grad_norm": 1.5008078028945642, + "language_loss": 0.72580731, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.74769098, + "num_input_tokens_seen": 76697615, + "step": 3554, + "time_per_iteration": 2.701207399368286 + }, + { + "auxiliary_loss_clip": 0.01097601, + "auxiliary_loss_mlp": 0.00776401, + "balance_loss_clip": 1.04785013, + "balance_loss_mlp": 1.00128174, + "epoch": 0.213738163234631, + "flos": 20886867452160.0, + "grad_norm": 2.0681583889949957, + "language_loss": 0.67728174, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.69602168, + "num_input_tokens_seen": 76715685, + "step": 3555, + "time_per_iteration": 2.76454758644104 + }, + { + "auxiliary_loss_clip": 0.01124456, + "auxiliary_loss_mlp": 0.01045031, + "balance_loss_clip": 1.06086278, + "balance_loss_mlp": 1.02689981, + "epoch": 0.21379828648729896, + "flos": 28403114135040.0, + "grad_norm": 2.155752981705525, + "language_loss": 0.64553648, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.66723132, + "num_input_tokens_seen": 76735405, + "step": 3556, + "time_per_iteration": 2.839993953704834 + }, + { + "auxiliary_loss_clip": 0.01139371, + "auxiliary_loss_mlp": 0.01051642, + "balance_loss_clip": 1.05236566, + "balance_loss_mlp": 1.0331769, + "epoch": 0.21385840973996692, + "flos": 25479662106240.0, + "grad_norm": 1.7378281716746964, + "language_loss": 0.72588408, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.74779421, + "num_input_tokens_seen": 76754395, + "step": 3557, + "time_per_iteration": 2.7678587436676025 + }, + { + "auxiliary_loss_clip": 0.01151319, + "auxiliary_loss_mlp": 0.00776703, + "balance_loss_clip": 1.0647192, + "balance_loss_mlp": 1.00117195, + "epoch": 0.2139185329926349, + "flos": 28074280101120.0, + "grad_norm": 1.8333462571334693, + "language_loss": 0.6714859, + "learning_rate": 3.655532480546528e-06, + "loss": 0.6907661, + "num_input_tokens_seen": 76777210, + "step": 3558, + "time_per_iteration": 2.7584826946258545 + }, + { + "auxiliary_loss_clip": 0.01159331, + "auxiliary_loss_mlp": 0.01041115, + "balance_loss_clip": 1.0541842, + "balance_loss_mlp": 1.02297139, + "epoch": 0.21397865624530288, + "flos": 19608698914560.0, + "grad_norm": 1.8974456617751176, + "language_loss": 0.79882181, + "learning_rate": 3.655313932676286e-06, + "loss": 0.82082617, + "num_input_tokens_seen": 76795830, + "step": 3559, + "time_per_iteration": 2.6918041706085205 + }, + { + "auxiliary_loss_clip": 0.01155068, + "auxiliary_loss_mlp": 0.01046018, + "balance_loss_clip": 1.05566323, + "balance_loss_mlp": 1.0295198, + "epoch": 0.21403877949797084, + "flos": 24681476033280.0, + "grad_norm": 1.8730564704536732, + "language_loss": 0.68085694, + "learning_rate": 3.655095322036373e-06, + "loss": 0.70286781, + "num_input_tokens_seen": 76814700, + "step": 3560, + "time_per_iteration": 2.6445770263671875 + }, + { + "auxiliary_loss_clip": 0.01145074, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_clip": 1.0535686, + "balance_loss_mlp": 1.02537155, + "epoch": 0.2140989027506388, + "flos": 19861150677120.0, + "grad_norm": 1.8952415763477797, + "language_loss": 0.73272544, + "learning_rate": 3.65487664863508e-06, + "loss": 0.75461322, + "num_input_tokens_seen": 76833400, + "step": 3561, + "time_per_iteration": 2.6568899154663086 + }, + { + "auxiliary_loss_clip": 0.01133795, + "auxiliary_loss_mlp": 0.01044555, + "balance_loss_clip": 1.05333674, + "balance_loss_mlp": 1.02700794, + "epoch": 0.21415902600330677, + "flos": 19135324552320.0, + "grad_norm": 2.1953085541278203, + "language_loss": 0.78028738, + "learning_rate": 3.654657912480698e-06, + "loss": 0.80207092, + "num_input_tokens_seen": 76850645, + "step": 3562, + "time_per_iteration": 2.73655104637146 + }, + { + "auxiliary_loss_clip": 0.01155634, + "auxiliary_loss_mlp": 0.01042255, + "balance_loss_clip": 1.05661631, + "balance_loss_mlp": 1.02457595, + "epoch": 0.21421914925597474, + "flos": 22272624201600.0, + "grad_norm": 3.5245068195694937, + "language_loss": 0.84338713, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.86536604, + "num_input_tokens_seen": 76870135, + "step": 3563, + "time_per_iteration": 2.676630973815918 + }, + { + "auxiliary_loss_clip": 0.01157426, + "auxiliary_loss_mlp": 0.01036109, + "balance_loss_clip": 1.05830729, + "balance_loss_mlp": 1.01957488, + "epoch": 0.2142792725086427, + "flos": 33875109987840.0, + "grad_norm": 1.5172669047015535, + "language_loss": 0.76581991, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.78775525, + "num_input_tokens_seen": 76893905, + "step": 3564, + "time_per_iteration": 2.7504193782806396 + }, + { + "auxiliary_loss_clip": 0.01134427, + "auxiliary_loss_mlp": 0.01044002, + "balance_loss_clip": 1.06131172, + "balance_loss_mlp": 1.02674031, + "epoch": 0.2143393957613107, + "flos": 19860216923520.0, + "grad_norm": 1.7115347614953564, + "language_loss": 0.88466394, + "learning_rate": 3.654001327581981e-06, + "loss": 0.90644825, + "num_input_tokens_seen": 76914205, + "step": 3565, + "time_per_iteration": 2.7911624908447266 + }, + { + "auxiliary_loss_clip": 0.01071735, + "auxiliary_loss_mlp": 0.01008336, + "balance_loss_clip": 1.05462575, + "balance_loss_mlp": 1.0057019, + "epoch": 0.21439951901397866, + "flos": 68530093090560.0, + "grad_norm": 0.8339683756542131, + "language_loss": 0.52192736, + "learning_rate": 3.653782340498215e-06, + "loss": 0.54272807, + "num_input_tokens_seen": 76975650, + "step": 3566, + "time_per_iteration": 3.1801936626434326 + }, + { + "auxiliary_loss_clip": 0.01141614, + "auxiliary_loss_mlp": 0.01041326, + "balance_loss_clip": 1.05527854, + "balance_loss_mlp": 1.02505386, + "epoch": 0.21445964226664663, + "flos": 19682998197120.0, + "grad_norm": 1.8485820369681922, + "language_loss": 0.67324477, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.6950742, + "num_input_tokens_seen": 76992615, + "step": 3567, + "time_per_iteration": 2.6948626041412354 + }, + { + "auxiliary_loss_clip": 0.01123629, + "auxiliary_loss_mlp": 0.01045447, + "balance_loss_clip": 1.05142832, + "balance_loss_mlp": 1.02749455, + "epoch": 0.2145197655193146, + "flos": 31107259676160.0, + "grad_norm": 3.2542445550844317, + "language_loss": 0.74213678, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.76382756, + "num_input_tokens_seen": 77017005, + "step": 3568, + "time_per_iteration": 4.396210670471191 + }, + { + "auxiliary_loss_clip": 0.01140095, + "auxiliary_loss_mlp": 0.01050029, + "balance_loss_clip": 1.05480075, + "balance_loss_mlp": 1.03333998, + "epoch": 0.21457988877198256, + "flos": 20120785159680.0, + "grad_norm": 1.7132363384404574, + "language_loss": 0.77343202, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.79533333, + "num_input_tokens_seen": 77034990, + "step": 3569, + "time_per_iteration": 4.224002122879028 + }, + { + "auxiliary_loss_clip": 0.011511, + "auxiliary_loss_mlp": 0.0104435, + "balance_loss_clip": 1.05651093, + "balance_loss_mlp": 1.02521753, + "epoch": 0.21464001202465052, + "flos": 18588045957120.0, + "grad_norm": 2.6050136504577583, + "language_loss": 0.70278227, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.72473681, + "num_input_tokens_seen": 77052610, + "step": 3570, + "time_per_iteration": 2.668304681777954 + }, + { + "auxiliary_loss_clip": 0.01158856, + "auxiliary_loss_mlp": 0.01046783, + "balance_loss_clip": 1.05765057, + "balance_loss_mlp": 1.02955759, + "epoch": 0.21470013527731852, + "flos": 21835160461440.0, + "grad_norm": 2.5503136440013647, + "language_loss": 0.79031628, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.81237268, + "num_input_tokens_seen": 77072475, + "step": 3571, + "time_per_iteration": 4.066440105438232 + }, + { + "auxiliary_loss_clip": 0.0113831, + "auxiliary_loss_mlp": 0.01047146, + "balance_loss_clip": 1.05283594, + "balance_loss_mlp": 1.02703547, + "epoch": 0.21476025852998648, + "flos": 17603195880960.0, + "grad_norm": 1.9606975528380188, + "language_loss": 0.82601345, + "learning_rate": 3.652467101342991e-06, + "loss": 0.84786803, + "num_input_tokens_seen": 77089930, + "step": 3572, + "time_per_iteration": 2.6096267700195312 + }, + { + "auxiliary_loss_clip": 0.01134964, + "auxiliary_loss_mlp": 0.01041355, + "balance_loss_clip": 1.05588293, + "balance_loss_mlp": 1.02358127, + "epoch": 0.21482038178265445, + "flos": 24828135264000.0, + "grad_norm": 4.1014522432452285, + "language_loss": 0.65240026, + "learning_rate": 3.652247675452598e-06, + "loss": 0.67416352, + "num_input_tokens_seen": 77108970, + "step": 3573, + "time_per_iteration": 2.690986394882202 + }, + { + "auxiliary_loss_clip": 0.01147698, + "auxiliary_loss_mlp": 0.01048414, + "balance_loss_clip": 1.05253768, + "balance_loss_mlp": 1.03140295, + "epoch": 0.2148805050353224, + "flos": 23258228463360.0, + "grad_norm": 2.3397683674355565, + "language_loss": 0.75229824, + "learning_rate": 3.652028186908807e-06, + "loss": 0.77425939, + "num_input_tokens_seen": 77126045, + "step": 3574, + "time_per_iteration": 2.621736526489258 + }, + { + "auxiliary_loss_clip": 0.01138272, + "auxiliary_loss_mlp": 0.01041549, + "balance_loss_clip": 1.0526228, + "balance_loss_mlp": 1.02414417, + "epoch": 0.21494062828799038, + "flos": 21321098968320.0, + "grad_norm": 1.8157113535402463, + "language_loss": 0.72179317, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.74359143, + "num_input_tokens_seen": 77144600, + "step": 3575, + "time_per_iteration": 4.362869501113892 + }, + { + "auxiliary_loss_clip": 0.01126687, + "auxiliary_loss_mlp": 0.01041186, + "balance_loss_clip": 1.05261374, + "balance_loss_mlp": 1.02422237, + "epoch": 0.21500075154065834, + "flos": 18843334894080.0, + "grad_norm": 3.8402092268612216, + "language_loss": 0.68255925, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.70423794, + "num_input_tokens_seen": 77162965, + "step": 3576, + "time_per_iteration": 2.665370225906372 + }, + { + "auxiliary_loss_clip": 0.01138295, + "auxiliary_loss_mlp": 0.01049053, + "balance_loss_clip": 1.05064976, + "balance_loss_mlp": 1.02859676, + "epoch": 0.2150608747933263, + "flos": 18441997257600.0, + "grad_norm": 2.2101409055401566, + "language_loss": 0.88707685, + "learning_rate": 3.651369345440292e-06, + "loss": 0.90895033, + "num_input_tokens_seen": 77179960, + "step": 3577, + "time_per_iteration": 2.655118465423584 + }, + { + "auxiliary_loss_clip": 0.01070337, + "auxiliary_loss_mlp": 0.01022454, + "balance_loss_clip": 1.0487709, + "balance_loss_mlp": 1.01998615, + "epoch": 0.2151209980459943, + "flos": 66598242894720.0, + "grad_norm": 0.8146982557647512, + "language_loss": 0.56184745, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.58277535, + "num_input_tokens_seen": 77239500, + "step": 3578, + "time_per_iteration": 3.2133536338806152 + }, + { + "auxiliary_loss_clip": 0.01144391, + "auxiliary_loss_mlp": 0.00775114, + "balance_loss_clip": 1.05492067, + "balance_loss_mlp": 1.00130272, + "epoch": 0.21518112129866226, + "flos": 21575885114880.0, + "grad_norm": 2.988933296047806, + "language_loss": 0.88686001, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.90605509, + "num_input_tokens_seen": 77254680, + "step": 3579, + "time_per_iteration": 2.6801605224609375 + }, + { + "auxiliary_loss_clip": 0.01143273, + "auxiliary_loss_mlp": 0.0104707, + "balance_loss_clip": 1.05253708, + "balance_loss_mlp": 1.02945101, + "epoch": 0.21524124455133023, + "flos": 20047635112320.0, + "grad_norm": 1.8556029181899094, + "language_loss": 0.77953792, + "learning_rate": 3.650709940390972e-06, + "loss": 0.80144137, + "num_input_tokens_seen": 77274060, + "step": 3580, + "time_per_iteration": 2.6932644844055176 + }, + { + "auxiliary_loss_clip": 0.01145284, + "auxiliary_loss_mlp": 0.01043211, + "balance_loss_clip": 1.05702484, + "balance_loss_mlp": 1.02543712, + "epoch": 0.2153013678039982, + "flos": 23951807153280.0, + "grad_norm": 1.9843281400180077, + "language_loss": 0.72948015, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.75136507, + "num_input_tokens_seen": 77293255, + "step": 3581, + "time_per_iteration": 2.712376117706299 + }, + { + "auxiliary_loss_clip": 0.01138503, + "auxiliary_loss_mlp": 0.0104555, + "balance_loss_clip": 1.05348194, + "balance_loss_mlp": 1.0269891, + "epoch": 0.21536149105666616, + "flos": 20594841880320.0, + "grad_norm": 2.4257233983700113, + "language_loss": 0.70726413, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.72910464, + "num_input_tokens_seen": 77312390, + "step": 3582, + "time_per_iteration": 2.67122220993042 + }, + { + "auxiliary_loss_clip": 0.01154755, + "auxiliary_loss_mlp": 0.01040327, + "balance_loss_clip": 1.05591798, + "balance_loss_mlp": 1.0227195, + "epoch": 0.21542161430933413, + "flos": 12860042895360.0, + "grad_norm": 2.4025311229753363, + "language_loss": 0.84906816, + "learning_rate": 3.650049971985889e-06, + "loss": 0.87101901, + "num_input_tokens_seen": 77330985, + "step": 3583, + "time_per_iteration": 2.6395328044891357 + }, + { + "auxiliary_loss_clip": 0.01133287, + "auxiliary_loss_mlp": 0.01047024, + "balance_loss_clip": 1.05368245, + "balance_loss_mlp": 1.02971518, + "epoch": 0.21548173756200212, + "flos": 26103933504000.0, + "grad_norm": 2.7569743809923533, + "language_loss": 0.83223897, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.85404205, + "num_input_tokens_seen": 77350770, + "step": 3584, + "time_per_iteration": 2.730823040008545 + }, + { + "auxiliary_loss_clip": 0.01118851, + "auxiliary_loss_mlp": 0.00774813, + "balance_loss_clip": 1.0520674, + "balance_loss_mlp": 1.00120699, + "epoch": 0.21554186081467008, + "flos": 22163779013760.0, + "grad_norm": 1.9634031706782962, + "language_loss": 0.90054697, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.9194836, + "num_input_tokens_seen": 77370510, + "step": 3585, + "time_per_iteration": 2.722216844558716 + }, + { + "auxiliary_loss_clip": 0.01145179, + "auxiliary_loss_mlp": 0.01045359, + "balance_loss_clip": 1.05783939, + "balance_loss_mlp": 1.02793026, + "epoch": 0.21560198406733805, + "flos": 22966741595520.0, + "grad_norm": 1.9859337557251673, + "language_loss": 0.74663597, + "learning_rate": 3.649389440450277e-06, + "loss": 0.76854134, + "num_input_tokens_seen": 77390645, + "step": 3586, + "time_per_iteration": 2.7681503295898438 + }, + { + "auxiliary_loss_clip": 0.01120328, + "auxiliary_loss_mlp": 0.01046334, + "balance_loss_clip": 1.05628061, + "balance_loss_mlp": 1.03011, + "epoch": 0.215662107320006, + "flos": 22784064001920.0, + "grad_norm": 2.903090853788092, + "language_loss": 0.83029532, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.85196197, + "num_input_tokens_seen": 77409655, + "step": 3587, + "time_per_iteration": 2.788416624069214 + }, + { + "auxiliary_loss_clip": 0.01109364, + "auxiliary_loss_mlp": 0.00776304, + "balance_loss_clip": 1.05255485, + "balance_loss_mlp": 1.00129569, + "epoch": 0.21572223057267398, + "flos": 30883859038080.0, + "grad_norm": 1.7067147212291012, + "language_loss": 0.75593436, + "learning_rate": 3.648948773354224e-06, + "loss": 0.774791, + "num_input_tokens_seen": 77430560, + "step": 3588, + "time_per_iteration": 2.866584062576294 + }, + { + "auxiliary_loss_clip": 0.01136336, + "auxiliary_loss_mlp": 0.01039583, + "balance_loss_clip": 1.04921389, + "balance_loss_mlp": 1.0224762, + "epoch": 0.21578235382534194, + "flos": 26910487445760.0, + "grad_norm": 1.721393113594195, + "language_loss": 0.80745661, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.82921582, + "num_input_tokens_seen": 77455000, + "step": 3589, + "time_per_iteration": 2.8839404582977295 + }, + { + "auxiliary_loss_clip": 0.01157121, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.05677748, + "balance_loss_mlp": 1.01992083, + "epoch": 0.2158424770780099, + "flos": 24425720219520.0, + "grad_norm": 2.201221744880259, + "language_loss": 0.72849286, + "learning_rate": 3.648507856144961e-06, + "loss": 0.75042707, + "num_input_tokens_seen": 77475075, + "step": 3590, + "time_per_iteration": 2.6692256927490234 + }, + { + "auxiliary_loss_clip": 0.01134591, + "auxiliary_loss_mlp": 0.01044904, + "balance_loss_clip": 1.05195427, + "balance_loss_mlp": 1.02623618, + "epoch": 0.2159026003306779, + "flos": 23949975559680.0, + "grad_norm": 2.25677544320114, + "language_loss": 0.8402462, + "learning_rate": 3.648287303768775e-06, + "loss": 0.86204112, + "num_input_tokens_seen": 77495945, + "step": 3591, + "time_per_iteration": 2.7531416416168213 + }, + { + "auxiliary_loss_clip": 0.01123784, + "auxiliary_loss_mlp": 0.01049552, + "balance_loss_clip": 1.05391979, + "balance_loss_mlp": 1.02972734, + "epoch": 0.21596272358334587, + "flos": 30040963511040.0, + "grad_norm": 2.2410681113576585, + "language_loss": 0.69175243, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.71348578, + "num_input_tokens_seen": 77517140, + "step": 3592, + "time_per_iteration": 2.8716177940368652 + }, + { + "auxiliary_loss_clip": 0.01117322, + "auxiliary_loss_mlp": 0.01050667, + "balance_loss_clip": 1.04998767, + "balance_loss_mlp": 1.03179634, + "epoch": 0.21602284683601383, + "flos": 20376217751040.0, + "grad_norm": 2.3652325886308123, + "language_loss": 0.84022737, + "learning_rate": 3.647846011515108e-06, + "loss": 0.86190724, + "num_input_tokens_seen": 77536085, + "step": 3593, + "time_per_iteration": 2.7185158729553223 + }, + { + "auxiliary_loss_clip": 0.01123006, + "auxiliary_loss_mlp": 0.01048394, + "balance_loss_clip": 1.05243289, + "balance_loss_mlp": 1.029809, + "epoch": 0.2160829700886818, + "flos": 20777339905920.0, + "grad_norm": 4.017970268493579, + "language_loss": 0.75192308, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.77363706, + "num_input_tokens_seen": 77553675, + "step": 3594, + "time_per_iteration": 2.726027011871338 + }, + { + "auxiliary_loss_clip": 0.01140408, + "auxiliary_loss_mlp": 0.01044406, + "balance_loss_clip": 1.05318236, + "balance_loss_mlp": 1.02650058, + "epoch": 0.21614309334134976, + "flos": 22309755886080.0, + "grad_norm": 1.541030891618627, + "language_loss": 0.80459857, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.82644665, + "num_input_tokens_seen": 77573360, + "step": 3595, + "time_per_iteration": 2.66504168510437 + }, + { + "auxiliary_loss_clip": 0.01119754, + "auxiliary_loss_mlp": 0.01039521, + "balance_loss_clip": 1.05060601, + "balance_loss_mlp": 1.02125788, + "epoch": 0.21620321659401773, + "flos": 19609524927360.0, + "grad_norm": 2.1030283577585007, + "language_loss": 0.78930759, + "learning_rate": 3.647183604506897e-06, + "loss": 0.81090033, + "num_input_tokens_seen": 77591865, + "step": 3596, + "time_per_iteration": 2.7159698009490967 + }, + { + "auxiliary_loss_clip": 0.01080261, + "auxiliary_loss_mlp": 0.01047978, + "balance_loss_clip": 1.04591155, + "balance_loss_mlp": 1.03106225, + "epoch": 0.2162633398466857, + "flos": 18844555956480.0, + "grad_norm": 1.6709210997095376, + "language_loss": 0.83061242, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.85189474, + "num_input_tokens_seen": 77611600, + "step": 3597, + "time_per_iteration": 2.79276704788208 + }, + { + "auxiliary_loss_clip": 0.01133147, + "auxiliary_loss_mlp": 0.00775626, + "balance_loss_clip": 1.05385637, + "balance_loss_mlp": 1.00146937, + "epoch": 0.21632346309935369, + "flos": 18768820129920.0, + "grad_norm": 1.6388312470031852, + "language_loss": 0.80549502, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.8245827, + "num_input_tokens_seen": 77630665, + "step": 3598, + "time_per_iteration": 2.6823580265045166 + }, + { + "auxiliary_loss_clip": 0.01123845, + "auxiliary_loss_mlp": 0.01051638, + "balance_loss_clip": 1.05069876, + "balance_loss_mlp": 1.03218365, + "epoch": 0.21638358635202165, + "flos": 26324173745280.0, + "grad_norm": 1.9066675721358164, + "language_loss": 0.82023275, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.84198749, + "num_input_tokens_seen": 77650835, + "step": 3599, + "time_per_iteration": 2.73583722114563 + }, + { + "auxiliary_loss_clip": 0.0110774, + "auxiliary_loss_mlp": 0.00775854, + "balance_loss_clip": 1.04651821, + "balance_loss_mlp": 1.00131536, + "epoch": 0.21644370960468962, + "flos": 20740854666240.0, + "grad_norm": 2.996184273033617, + "language_loss": 0.76724887, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.78608489, + "num_input_tokens_seen": 77669000, + "step": 3600, + "time_per_iteration": 2.695081949234009 + }, + { + "auxiliary_loss_clip": 0.01112458, + "auxiliary_loss_mlp": 0.01044855, + "balance_loss_clip": 1.04869664, + "balance_loss_mlp": 1.02886891, + "epoch": 0.21650383285735758, + "flos": 23952238116480.0, + "grad_norm": 2.259096111885494, + "language_loss": 0.80784452, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.82941765, + "num_input_tokens_seen": 77688745, + "step": 3601, + "time_per_iteration": 2.8094849586486816 + }, + { + "auxiliary_loss_clip": 0.01155408, + "auxiliary_loss_mlp": 0.01046912, + "balance_loss_clip": 1.0550983, + "balance_loss_mlp": 1.02973413, + "epoch": 0.21656395611002555, + "flos": 23696087253120.0, + "grad_norm": 2.558776342313561, + "language_loss": 0.83192647, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.85394967, + "num_input_tokens_seen": 77708445, + "step": 3602, + "time_per_iteration": 2.652876377105713 + }, + { + "auxiliary_loss_clip": 0.0115161, + "auxiliary_loss_mlp": 0.0105032, + "balance_loss_clip": 1.0525223, + "balance_loss_mlp": 1.03286743, + "epoch": 0.2166240793626935, + "flos": 20666052593280.0, + "grad_norm": 1.768938326380195, + "language_loss": 0.7449019, + "learning_rate": 3.645635802397693e-06, + "loss": 0.76692116, + "num_input_tokens_seen": 77728465, + "step": 3603, + "time_per_iteration": 2.619614601135254 + }, + { + "auxiliary_loss_clip": 0.01116481, + "auxiliary_loss_mlp": 0.01047384, + "balance_loss_clip": 1.04873598, + "balance_loss_mlp": 1.02883554, + "epoch": 0.2166842026153615, + "flos": 21580410228480.0, + "grad_norm": 1.6710689829239502, + "language_loss": 0.74178421, + "learning_rate": 3.645414438132855e-06, + "loss": 0.76342291, + "num_input_tokens_seen": 77746735, + "step": 3604, + "time_per_iteration": 2.730182647705078 + }, + { + "auxiliary_loss_clip": 0.01138214, + "auxiliary_loss_mlp": 0.01038079, + "balance_loss_clip": 1.05246544, + "balance_loss_mlp": 1.02124691, + "epoch": 0.21674432586802947, + "flos": 25629948610560.0, + "grad_norm": 1.7167946204354523, + "language_loss": 0.7990489, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.82081187, + "num_input_tokens_seen": 77768105, + "step": 3605, + "time_per_iteration": 2.67668080329895 + }, + { + "auxiliary_loss_clip": 0.01079717, + "auxiliary_loss_mlp": 0.01002026, + "balance_loss_clip": 1.0400598, + "balance_loss_mlp": 0.99942732, + "epoch": 0.21680444912069743, + "flos": 56417783616000.0, + "grad_norm": 0.7112415560884942, + "language_loss": 0.5834192, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.6042366, + "num_input_tokens_seen": 77833750, + "step": 3606, + "time_per_iteration": 3.2736570835113525 + }, + { + "auxiliary_loss_clip": 0.01155294, + "auxiliary_loss_mlp": 0.01043491, + "balance_loss_clip": 1.05404341, + "balance_loss_mlp": 1.02498984, + "epoch": 0.2168645723733654, + "flos": 23878944414720.0, + "grad_norm": 2.2731951350022275, + "language_loss": 0.73142302, + "learning_rate": 3.644749971006248e-06, + "loss": 0.75341088, + "num_input_tokens_seen": 77853780, + "step": 3607, + "time_per_iteration": 4.267899990081787 + }, + { + "auxiliary_loss_clip": 0.01133762, + "auxiliary_loss_mlp": 0.01046639, + "balance_loss_clip": 1.05282903, + "balance_loss_mlp": 1.02789962, + "epoch": 0.21692469562603336, + "flos": 16946174257920.0, + "grad_norm": 2.181379073292718, + "language_loss": 0.76540339, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.78720737, + "num_input_tokens_seen": 77872575, + "step": 3608, + "time_per_iteration": 4.285630464553833 + }, + { + "auxiliary_loss_clip": 0.01080204, + "auxiliary_loss_mlp": 0.01047623, + "balance_loss_clip": 1.04536235, + "balance_loss_mlp": 1.0309217, + "epoch": 0.21698481887870133, + "flos": 25119047514240.0, + "grad_norm": 2.042587105390135, + "language_loss": 0.74584132, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.76711953, + "num_input_tokens_seen": 77892700, + "step": 3609, + "time_per_iteration": 2.802569627761841 + }, + { + "auxiliary_loss_clip": 0.01131798, + "auxiliary_loss_mlp": 0.01049353, + "balance_loss_clip": 1.05227149, + "balance_loss_mlp": 1.03159094, + "epoch": 0.2170449421313693, + "flos": 17894682748800.0, + "grad_norm": 1.9074832440543417, + "language_loss": 0.89132321, + "learning_rate": 3.6440849425579e-06, + "loss": 0.91313475, + "num_input_tokens_seen": 77911060, + "step": 3610, + "time_per_iteration": 4.189727306365967 + }, + { + "auxiliary_loss_clip": 0.01155294, + "auxiliary_loss_mlp": 0.01044238, + "balance_loss_clip": 1.05534768, + "balance_loss_mlp": 1.02649963, + "epoch": 0.2171050653840373, + "flos": 22638446265600.0, + "grad_norm": 2.058717355808165, + "language_loss": 0.77779067, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.79978603, + "num_input_tokens_seen": 77929930, + "step": 3611, + "time_per_iteration": 2.6317896842956543 + }, + { + "auxiliary_loss_clip": 0.01088447, + "auxiliary_loss_mlp": 0.01047447, + "balance_loss_clip": 1.04764366, + "balance_loss_mlp": 1.03026867, + "epoch": 0.21716518863670525, + "flos": 19499997381120.0, + "grad_norm": 2.3883055198257184, + "language_loss": 0.63578451, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.65714347, + "num_input_tokens_seen": 77949060, + "step": 3612, + "time_per_iteration": 2.8771228790283203 + }, + { + "auxiliary_loss_clip": 0.01091118, + "auxiliary_loss_mlp": 0.01053996, + "balance_loss_clip": 1.04585218, + "balance_loss_mlp": 1.03454065, + "epoch": 0.21722531188937322, + "flos": 19792022952960.0, + "grad_norm": 1.801964584441428, + "language_loss": 0.75912857, + "learning_rate": 3.643419353014776e-06, + "loss": 0.78057969, + "num_input_tokens_seen": 77967920, + "step": 3613, + "time_per_iteration": 2.710601568222046 + }, + { + "auxiliary_loss_clip": 0.0110572, + "auxiliary_loss_mlp": 0.01051253, + "balance_loss_clip": 1.05008733, + "balance_loss_mlp": 1.03121352, + "epoch": 0.21728543514204118, + "flos": 13334386924800.0, + "grad_norm": 1.9293696862218277, + "language_loss": 0.71047795, + "learning_rate": 3.643197365185261e-06, + "loss": 0.73204768, + "num_input_tokens_seen": 77985330, + "step": 3614, + "time_per_iteration": 4.407632112503052 + }, + { + "auxiliary_loss_clip": 0.0114355, + "auxiliary_loss_mlp": 0.01048776, + "balance_loss_clip": 1.05521107, + "balance_loss_mlp": 1.0306083, + "epoch": 0.21734555839470915, + "flos": 15231870783360.0, + "grad_norm": 1.7289280951335333, + "language_loss": 0.73030001, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.75222325, + "num_input_tokens_seen": 78003105, + "step": 3615, + "time_per_iteration": 2.6358401775360107 + }, + { + "auxiliary_loss_clip": 0.01145731, + "auxiliary_loss_mlp": 0.01046632, + "balance_loss_clip": 1.05206716, + "balance_loss_mlp": 1.02703404, + "epoch": 0.2174056816473771, + "flos": 19973982274560.0, + "grad_norm": 2.3648922858816976, + "language_loss": 0.90127194, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.92319548, + "num_input_tokens_seen": 78019655, + "step": 3616, + "time_per_iteration": 2.659787178039551 + }, + { + "auxiliary_loss_clip": 0.01103597, + "auxiliary_loss_mlp": 0.01040899, + "balance_loss_clip": 1.048136, + "balance_loss_mlp": 1.02244496, + "epoch": 0.21746580490004508, + "flos": 16687293960960.0, + "grad_norm": 2.928463545610362, + "language_loss": 0.81107831, + "learning_rate": 3.642531027869148e-06, + "loss": 0.83252329, + "num_input_tokens_seen": 78036025, + "step": 3617, + "time_per_iteration": 2.7723491191864014 + }, + { + "auxiliary_loss_clip": 0.01132531, + "auxiliary_loss_mlp": 0.01041286, + "balance_loss_clip": 1.05330408, + "balance_loss_mlp": 1.02382231, + "epoch": 0.21752592815271307, + "flos": 25772298209280.0, + "grad_norm": 1.9251992817215786, + "language_loss": 0.75688154, + "learning_rate": 3.642308790849329e-06, + "loss": 0.77861977, + "num_input_tokens_seen": 78055645, + "step": 3618, + "time_per_iteration": 2.7608227729797363 + }, + { + "auxiliary_loss_clip": 0.01147874, + "auxiliary_loss_mlp": 0.01048647, + "balance_loss_clip": 1.05600834, + "balance_loss_mlp": 1.03045571, + "epoch": 0.21758605140538104, + "flos": 11254692349440.0, + "grad_norm": 2.18435089101569, + "language_loss": 0.69099152, + "learning_rate": 3.642086491552996e-06, + "loss": 0.71295673, + "num_input_tokens_seen": 78071660, + "step": 3619, + "time_per_iteration": 2.671637773513794 + }, + { + "auxiliary_loss_clip": 0.01144421, + "auxiliary_loss_mlp": 0.01042659, + "balance_loss_clip": 1.05394137, + "balance_loss_mlp": 1.02482569, + "epoch": 0.217646174658049, + "flos": 19242625455360.0, + "grad_norm": 4.829425462001391, + "language_loss": 0.78716505, + "learning_rate": 3.641864129988579e-06, + "loss": 0.8090359, + "num_input_tokens_seen": 78091265, + "step": 3620, + "time_per_iteration": 2.7232043743133545 + }, + { + "auxiliary_loss_clip": 0.01148457, + "auxiliary_loss_mlp": 0.01042109, + "balance_loss_clip": 1.05161178, + "balance_loss_mlp": 1.02507412, + "epoch": 0.21770629791071697, + "flos": 21945083057280.0, + "grad_norm": 1.4663479636678602, + "language_loss": 0.79966211, + "learning_rate": 3.641641706164509e-06, + "loss": 0.82156777, + "num_input_tokens_seen": 78110095, + "step": 3621, + "time_per_iteration": 2.6326823234558105 + }, + { + "auxiliary_loss_clip": 0.01143183, + "auxiliary_loss_mlp": 0.01035793, + "balance_loss_clip": 1.05334592, + "balance_loss_mlp": 1.01955688, + "epoch": 0.21776642116338493, + "flos": 24936764970240.0, + "grad_norm": 1.609721344037994, + "language_loss": 0.87796915, + "learning_rate": 3.641419220089221e-06, + "loss": 0.89975888, + "num_input_tokens_seen": 78129475, + "step": 3622, + "time_per_iteration": 2.6864428520202637 + }, + { + "auxiliary_loss_clip": 0.01146899, + "auxiliary_loss_mlp": 0.01037591, + "balance_loss_clip": 1.05495822, + "balance_loss_mlp": 1.01801729, + "epoch": 0.2178265444160529, + "flos": 17821317219840.0, + "grad_norm": 1.856609178217172, + "language_loss": 0.77077621, + "learning_rate": 3.641196671771152e-06, + "loss": 0.79262108, + "num_input_tokens_seen": 78146880, + "step": 3623, + "time_per_iteration": 2.743601083755493 + }, + { + "auxiliary_loss_clip": 0.01121788, + "auxiliary_loss_mlp": 0.01052122, + "balance_loss_clip": 1.05279899, + "balance_loss_mlp": 1.03226197, + "epoch": 0.2178866676687209, + "flos": 17712902995200.0, + "grad_norm": 2.4362835431673036, + "language_loss": 0.84600008, + "learning_rate": 3.640974061218741e-06, + "loss": 0.86773914, + "num_input_tokens_seen": 78165065, + "step": 3624, + "time_per_iteration": 2.7499353885650635 + }, + { + "auxiliary_loss_clip": 0.01139543, + "auxiliary_loss_mlp": 0.01057514, + "balance_loss_clip": 1.05353129, + "balance_loss_mlp": 1.03804684, + "epoch": 0.21794679092138886, + "flos": 16945851035520.0, + "grad_norm": 2.4333310175924905, + "language_loss": 0.78037703, + "learning_rate": 3.640751388440429e-06, + "loss": 0.80234766, + "num_input_tokens_seen": 78180005, + "step": 3625, + "time_per_iteration": 2.6314821243286133 + }, + { + "auxiliary_loss_clip": 0.01061536, + "auxiliary_loss_mlp": 0.01003869, + "balance_loss_clip": 1.03318405, + "balance_loss_mlp": 1.00130582, + "epoch": 0.21800691417405682, + "flos": 63718566566400.0, + "grad_norm": 0.8242097668179436, + "language_loss": 0.60701489, + "learning_rate": 3.64052865344466e-06, + "loss": 0.62766898, + "num_input_tokens_seen": 78245350, + "step": 3626, + "time_per_iteration": 3.257289409637451 + }, + { + "auxiliary_loss_clip": 0.0112643, + "auxiliary_loss_mlp": 0.00776719, + "balance_loss_clip": 1.05120194, + "balance_loss_mlp": 1.00134754, + "epoch": 0.21806703742672479, + "flos": 21616392677760.0, + "grad_norm": 2.2464694521793094, + "language_loss": 0.9077245, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.92675602, + "num_input_tokens_seen": 78264165, + "step": 3627, + "time_per_iteration": 2.6639885902404785 + }, + { + "auxiliary_loss_clip": 0.0109778, + "auxiliary_loss_mlp": 0.01043665, + "balance_loss_clip": 1.04912198, + "balance_loss_mlp": 1.02471113, + "epoch": 0.21812716067939275, + "flos": 19354882435200.0, + "grad_norm": 1.8437472480823303, + "language_loss": 0.73480809, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.75622261, + "num_input_tokens_seen": 78283745, + "step": 3628, + "time_per_iteration": 2.7430238723754883 + }, + { + "auxiliary_loss_clip": 0.01151444, + "auxiliary_loss_mlp": 0.01042108, + "balance_loss_clip": 1.05143893, + "balance_loss_mlp": 1.02391696, + "epoch": 0.21818728393206072, + "flos": 23548063305600.0, + "grad_norm": 2.8127332529660296, + "language_loss": 0.77337319, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.79530871, + "num_input_tokens_seen": 78302900, + "step": 3629, + "time_per_iteration": 2.6468687057495117 + }, + { + "auxiliary_loss_clip": 0.01142447, + "auxiliary_loss_mlp": 0.01044137, + "balance_loss_clip": 1.0532223, + "balance_loss_mlp": 1.02709055, + "epoch": 0.21824740718472868, + "flos": 30225652266240.0, + "grad_norm": 1.7154004506833416, + "language_loss": 0.71373391, + "learning_rate": 3.63963709145597e-06, + "loss": 0.73559982, + "num_input_tokens_seen": 78326470, + "step": 3630, + "time_per_iteration": 2.7334208488464355 + }, + { + "auxiliary_loss_clip": 0.01089422, + "auxiliary_loss_mlp": 0.01040838, + "balance_loss_clip": 1.04771948, + "balance_loss_mlp": 1.02488792, + "epoch": 0.21830753043739667, + "flos": 26134672567680.0, + "grad_norm": 2.4394061962398625, + "language_loss": 0.76502508, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.78632766, + "num_input_tokens_seen": 78345810, + "step": 3631, + "time_per_iteration": 2.9277098178863525 + }, + { + "auxiliary_loss_clip": 0.01153805, + "auxiliary_loss_mlp": 0.01036973, + "balance_loss_clip": 1.05322635, + "balance_loss_mlp": 1.01950908, + "epoch": 0.21836765369006464, + "flos": 21720712752000.0, + "grad_norm": 3.3333075141454556, + "language_loss": 0.75291955, + "learning_rate": 3.639190937376594e-06, + "loss": 0.77482736, + "num_input_tokens_seen": 78364085, + "step": 3632, + "time_per_iteration": 2.666961908340454 + }, + { + "auxiliary_loss_clip": 0.01149425, + "auxiliary_loss_mlp": 0.01038996, + "balance_loss_clip": 1.05168736, + "balance_loss_mlp": 1.02262831, + "epoch": 0.2184277769427326, + "flos": 19937604775680.0, + "grad_norm": 2.135610011090477, + "language_loss": 0.83723396, + "learning_rate": 3.638967767095249e-06, + "loss": 0.85911822, + "num_input_tokens_seen": 78381385, + "step": 3633, + "time_per_iteration": 2.6193437576293945 + }, + { + "auxiliary_loss_clip": 0.0112373, + "auxiliary_loss_mlp": 0.01049933, + "balance_loss_clip": 1.05514872, + "balance_loss_mlp": 1.03280258, + "epoch": 0.21848790019540057, + "flos": 20340235301760.0, + "grad_norm": 1.713148643324746, + "language_loss": 0.81381643, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.83555305, + "num_input_tokens_seen": 78400500, + "step": 3634, + "time_per_iteration": 2.7383267879486084 + }, + { + "auxiliary_loss_clip": 0.01144832, + "auxiliary_loss_mlp": 0.01040423, + "balance_loss_clip": 1.0548327, + "balance_loss_mlp": 1.02263677, + "epoch": 0.21854802344806853, + "flos": 15450818135040.0, + "grad_norm": 1.8988648345390304, + "language_loss": 0.74810624, + "learning_rate": 3.638521240091558e-06, + "loss": 0.76995879, + "num_input_tokens_seen": 78418340, + "step": 3635, + "time_per_iteration": 2.7461390495300293 + }, + { + "auxiliary_loss_clip": 0.01124703, + "auxiliary_loss_mlp": 0.01052922, + "balance_loss_clip": 1.05011106, + "balance_loss_mlp": 1.03524303, + "epoch": 0.2186081467007365, + "flos": 16320717711360.0, + "grad_norm": 2.2147010555825295, + "language_loss": 0.88340998, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.90518618, + "num_input_tokens_seen": 78434375, + "step": 3636, + "time_per_iteration": 2.631352186203003 + }, + { + "auxiliary_loss_clip": 0.01121776, + "auxiliary_loss_mlp": 0.00776363, + "balance_loss_clip": 1.05596519, + "balance_loss_mlp": 1.00133038, + "epoch": 0.2186682699534045, + "flos": 21689255416320.0, + "grad_norm": 2.464516707854487, + "language_loss": 0.76037598, + "learning_rate": 3.638074464556311e-06, + "loss": 0.77935731, + "num_input_tokens_seen": 78451735, + "step": 3637, + "time_per_iteration": 2.823063373565674 + }, + { + "auxiliary_loss_clip": 0.01137371, + "auxiliary_loss_mlp": 0.0104323, + "balance_loss_clip": 1.05512452, + "balance_loss_mlp": 1.02393031, + "epoch": 0.21872839320607246, + "flos": 17739260599680.0, + "grad_norm": 2.6753688852020328, + "language_loss": 0.89996254, + "learning_rate": 3.63785098361053e-06, + "loss": 0.92176855, + "num_input_tokens_seen": 78462730, + "step": 3638, + "time_per_iteration": 2.6404030323028564 + }, + { + "auxiliary_loss_clip": 0.01142035, + "auxiliary_loss_mlp": 0.01051888, + "balance_loss_clip": 1.0538702, + "balance_loss_mlp": 1.03351748, + "epoch": 0.21878851645874042, + "flos": 18652289431680.0, + "grad_norm": 2.4375531856602692, + "language_loss": 0.89243078, + "learning_rate": 3.637627440557275e-06, + "loss": 0.91436994, + "num_input_tokens_seen": 78476300, + "step": 3639, + "time_per_iteration": 2.6214118003845215 + }, + { + "auxiliary_loss_clip": 0.01134092, + "auxiliary_loss_mlp": 0.00776277, + "balance_loss_clip": 1.05406988, + "balance_loss_mlp": 1.00129211, + "epoch": 0.2188486397114084, + "flos": 25557301353600.0, + "grad_norm": 1.9800691484462982, + "language_loss": 0.79167712, + "learning_rate": 3.637403835405024e-06, + "loss": 0.81078082, + "num_input_tokens_seen": 78496135, + "step": 3640, + "time_per_iteration": 2.7559502124786377 + }, + { + "auxiliary_loss_clip": 0.01149345, + "auxiliary_loss_mlp": 0.01055855, + "balance_loss_clip": 1.05816483, + "balance_loss_mlp": 1.03617346, + "epoch": 0.21890876296407635, + "flos": 17892061056000.0, + "grad_norm": 2.2045237000129942, + "language_loss": 0.71708757, + "learning_rate": 3.637180168162255e-06, + "loss": 0.73913956, + "num_input_tokens_seen": 78513855, + "step": 3641, + "time_per_iteration": 2.6673953533172607 + }, + { + "auxiliary_loss_clip": 0.01130115, + "auxiliary_loss_mlp": 0.0104373, + "balance_loss_clip": 1.05217481, + "balance_loss_mlp": 1.02593243, + "epoch": 0.21896888621674432, + "flos": 17749100926080.0, + "grad_norm": 1.9358190088314053, + "language_loss": 0.81427026, + "learning_rate": 3.63695643883745e-06, + "loss": 0.83600873, + "num_input_tokens_seen": 78531740, + "step": 3642, + "time_per_iteration": 2.6722965240478516 + }, + { + "auxiliary_loss_clip": 0.01150265, + "auxiliary_loss_mlp": 0.01044184, + "balance_loss_clip": 1.05707705, + "balance_loss_mlp": 1.02520561, + "epoch": 0.21902900946941228, + "flos": 23076161400960.0, + "grad_norm": 2.2890480980316865, + "language_loss": 0.7124145, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.73435903, + "num_input_tokens_seen": 78549600, + "step": 3643, + "time_per_iteration": 2.6586625576019287 + }, + { + "auxiliary_loss_clip": 0.01156283, + "auxiliary_loss_mlp": 0.01046488, + "balance_loss_clip": 1.05430686, + "balance_loss_mlp": 1.02728367, + "epoch": 0.21908913272208028, + "flos": 48178545004800.0, + "grad_norm": 2.705040309825256, + "language_loss": 0.68497038, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.70699811, + "num_input_tokens_seen": 78573350, + "step": 3644, + "time_per_iteration": 2.835944414138794 + }, + { + "auxiliary_loss_clip": 0.01157461, + "auxiliary_loss_mlp": 0.01049851, + "balance_loss_clip": 1.05381823, + "balance_loss_mlp": 1.03175521, + "epoch": 0.21914925597474824, + "flos": 22236749493120.0, + "grad_norm": 2.498314523319793, + "language_loss": 0.77761143, + "learning_rate": 3.636284878455669e-06, + "loss": 0.79968452, + "num_input_tokens_seen": 78591005, + "step": 3645, + "time_per_iteration": 2.6053528785705566 + }, + { + "auxiliary_loss_clip": 0.01142456, + "auxiliary_loss_mlp": 0.01054431, + "balance_loss_clip": 1.05606842, + "balance_loss_mlp": 1.03732491, + "epoch": 0.2192093792274162, + "flos": 22125605834880.0, + "grad_norm": 3.1951942186566766, + "language_loss": 0.82604313, + "learning_rate": 3.636060900887582e-06, + "loss": 0.84801197, + "num_input_tokens_seen": 78610645, + "step": 3646, + "time_per_iteration": 4.198619842529297 + }, + { + "auxiliary_loss_clip": 0.01141068, + "auxiliary_loss_mlp": 0.01040772, + "balance_loss_clip": 1.05287766, + "balance_loss_mlp": 1.02365351, + "epoch": 0.21926950248008417, + "flos": 15669442264320.0, + "grad_norm": 1.720246481727725, + "language_loss": 0.82877636, + "learning_rate": 3.635836861279901e-06, + "loss": 0.85059476, + "num_input_tokens_seen": 78628340, + "step": 3647, + "time_per_iteration": 4.229920387268066 + }, + { + "auxiliary_loss_clip": 0.0115057, + "auxiliary_loss_mlp": 0.01054202, + "balance_loss_clip": 1.05145597, + "balance_loss_mlp": 1.03685677, + "epoch": 0.21932962573275214, + "flos": 30262496641920.0, + "grad_norm": 1.6932394069108108, + "language_loss": 0.72652817, + "learning_rate": 3.635612759641123e-06, + "loss": 0.74857587, + "num_input_tokens_seen": 78649355, + "step": 3648, + "time_per_iteration": 2.7226104736328125 + }, + { + "auxiliary_loss_clip": 0.01110484, + "auxiliary_loss_mlp": 0.01057841, + "balance_loss_clip": 1.04757857, + "balance_loss_mlp": 1.03643107, + "epoch": 0.2193897489854201, + "flos": 10780132838400.0, + "grad_norm": 3.9115777702699175, + "language_loss": 0.74917972, + "learning_rate": 3.635388595979745e-06, + "loss": 0.77086294, + "num_input_tokens_seen": 78664915, + "step": 3649, + "time_per_iteration": 4.201031446456909 + }, + { + "auxiliary_loss_clip": 0.01138726, + "auxiliary_loss_mlp": 0.0105421, + "balance_loss_clip": 1.0536499, + "balance_loss_mlp": 1.03718746, + "epoch": 0.21944987223808807, + "flos": 19133313390720.0, + "grad_norm": 1.8914434058388716, + "language_loss": 0.86353791, + "learning_rate": 3.635164370304267e-06, + "loss": 0.88546729, + "num_input_tokens_seen": 78681475, + "step": 3650, + "time_per_iteration": 2.6061322689056396 + }, + { + "auxiliary_loss_clip": 0.01130852, + "auxiliary_loss_mlp": 0.01052398, + "balance_loss_clip": 1.04992914, + "balance_loss_mlp": 1.03439701, + "epoch": 0.21950999549075606, + "flos": 22711093522560.0, + "grad_norm": 2.798139483493165, + "language_loss": 0.83541161, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.85724407, + "num_input_tokens_seen": 78702300, + "step": 3651, + "time_per_iteration": 2.7605133056640625 + }, + { + "auxiliary_loss_clip": 0.01143643, + "auxiliary_loss_mlp": 0.0105251, + "balance_loss_clip": 1.05282581, + "balance_loss_mlp": 1.03511763, + "epoch": 0.21957011874342403, + "flos": 10561329141120.0, + "grad_norm": 1.9065881796375543, + "language_loss": 0.74475014, + "learning_rate": 3.634715732945027e-06, + "loss": 0.76671165, + "num_input_tokens_seen": 78720230, + "step": 3652, + "time_per_iteration": 2.597443103790283 + }, + { + "auxiliary_loss_clip": 0.01038431, + "auxiliary_loss_mlp": 0.01009267, + "balance_loss_clip": 1.0361495, + "balance_loss_mlp": 1.0068711, + "epoch": 0.219630241996092, + "flos": 65747913252480.0, + "grad_norm": 0.7482502800744824, + "language_loss": 0.51550615, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.5359832, + "num_input_tokens_seen": 78780200, + "step": 3653, + "time_per_iteration": 3.324497699737549 + }, + { + "auxiliary_loss_clip": 0.01125533, + "auxiliary_loss_mlp": 0.01062527, + "balance_loss_clip": 1.05436754, + "balance_loss_mlp": 1.04470527, + "epoch": 0.21969036524875996, + "flos": 23696518216320.0, + "grad_norm": 1.9578946934595152, + "language_loss": 0.75356162, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.77544224, + "num_input_tokens_seen": 78800575, + "step": 3654, + "time_per_iteration": 4.296064615249634 + }, + { + "auxiliary_loss_clip": 0.01152337, + "auxiliary_loss_mlp": 0.01051249, + "balance_loss_clip": 1.05944824, + "balance_loss_mlp": 1.03376114, + "epoch": 0.21975048850142792, + "flos": 19640910435840.0, + "grad_norm": 1.8387519277823352, + "language_loss": 0.72646022, + "learning_rate": 3.634042312013064e-06, + "loss": 0.74849606, + "num_input_tokens_seen": 78819585, + "step": 3655, + "time_per_iteration": 2.6634860038757324 + }, + { + "auxiliary_loss_clip": 0.01130021, + "auxiliary_loss_mlp": 0.01048784, + "balance_loss_clip": 1.05423379, + "balance_loss_mlp": 1.03071189, + "epoch": 0.21981061175409589, + "flos": 22448550038400.0, + "grad_norm": 1.722985511504472, + "language_loss": 0.80795759, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.82974565, + "num_input_tokens_seen": 78837330, + "step": 3656, + "time_per_iteration": 2.730391502380371 + }, + { + "auxiliary_loss_clip": 0.01124773, + "auxiliary_loss_mlp": 0.00776202, + "balance_loss_clip": 1.06113994, + "balance_loss_mlp": 1.00139225, + "epoch": 0.21987073500676388, + "flos": 18151049093760.0, + "grad_norm": 2.646453773467974, + "language_loss": 0.84885842, + "learning_rate": 3.63359305489566e-06, + "loss": 0.86786819, + "num_input_tokens_seen": 78854955, + "step": 3657, + "time_per_iteration": 2.657607078552246 + }, + { + "auxiliary_loss_clip": 0.01142645, + "auxiliary_loss_mlp": 0.01040533, + "balance_loss_clip": 1.05631852, + "balance_loss_mlp": 1.02260423, + "epoch": 0.21993085825943184, + "flos": 25626177682560.0, + "grad_norm": 2.6990832263195585, + "language_loss": 0.80355585, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.82538766, + "num_input_tokens_seen": 78874965, + "step": 3658, + "time_per_iteration": 2.6584107875823975 + }, + { + "auxiliary_loss_clip": 0.01048937, + "auxiliary_loss_mlp": 0.0100499, + "balance_loss_clip": 1.03857517, + "balance_loss_mlp": 1.00202215, + "epoch": 0.2199909815120998, + "flos": 70923217743360.0, + "grad_norm": 0.7788612160796681, + "language_loss": 0.58191586, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.60245514, + "num_input_tokens_seen": 78937740, + "step": 3659, + "time_per_iteration": 3.3395371437072754 + }, + { + "auxiliary_loss_clip": 0.01111007, + "auxiliary_loss_mlp": 0.0105329, + "balance_loss_clip": 1.05029392, + "balance_loss_mlp": 1.03471744, + "epoch": 0.22005110476476777, + "flos": 21543529939200.0, + "grad_norm": 4.382741616753977, + "language_loss": 0.7477597, + "learning_rate": 3.632918704645772e-06, + "loss": 0.76940262, + "num_input_tokens_seen": 78955055, + "step": 3660, + "time_per_iteration": 2.782975435256958 + }, + { + "auxiliary_loss_clip": 0.01147277, + "auxiliary_loss_mlp": 0.01044652, + "balance_loss_clip": 1.05691171, + "balance_loss_mlp": 1.02653265, + "epoch": 0.22011122801743574, + "flos": 22054502862720.0, + "grad_norm": 1.8856077512582532, + "language_loss": 0.81484449, + "learning_rate": 3.632693797376893e-06, + "loss": 0.83676374, + "num_input_tokens_seen": 78974895, + "step": 3661, + "time_per_iteration": 2.7780110836029053 + }, + { + "auxiliary_loss_clip": 0.01126694, + "auxiliary_loss_mlp": 0.01056397, + "balance_loss_clip": 1.05167532, + "balance_loss_mlp": 1.03800273, + "epoch": 0.2201713512701037, + "flos": 26687589598080.0, + "grad_norm": 1.9746283079458686, + "language_loss": 0.73154199, + "learning_rate": 3.632468828196102e-06, + "loss": 0.75337297, + "num_input_tokens_seen": 78994990, + "step": 3662, + "time_per_iteration": 2.7189040184020996 + }, + { + "auxiliary_loss_clip": 0.0113519, + "auxiliary_loss_mlp": 0.01051686, + "balance_loss_clip": 1.05718994, + "balance_loss_mlp": 1.03555691, + "epoch": 0.22023147452277167, + "flos": 22162198815360.0, + "grad_norm": 2.0576168655035714, + "language_loss": 0.78066969, + "learning_rate": 3.632243797111929e-06, + "loss": 0.80253839, + "num_input_tokens_seen": 79014405, + "step": 3663, + "time_per_iteration": 2.731412410736084 + }, + { + "auxiliary_loss_clip": 0.01142837, + "auxiliary_loss_mlp": 0.01063521, + "balance_loss_clip": 1.05659413, + "balance_loss_mlp": 1.04352939, + "epoch": 0.22029159777543966, + "flos": 22523280284160.0, + "grad_norm": 1.752119258875799, + "language_loss": 0.80294079, + "learning_rate": 3.632018704132908e-06, + "loss": 0.82500434, + "num_input_tokens_seen": 79032375, + "step": 3664, + "time_per_iteration": 2.7043297290802 + }, + { + "auxiliary_loss_clip": 0.01134207, + "auxiliary_loss_mlp": 0.01044352, + "balance_loss_clip": 1.05424213, + "balance_loss_mlp": 1.02474177, + "epoch": 0.22035172102810763, + "flos": 13042469093760.0, + "grad_norm": 3.138103913885462, + "language_loss": 0.76388288, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.78566849, + "num_input_tokens_seen": 79049635, + "step": 3665, + "time_per_iteration": 2.68300199508667 + }, + { + "auxiliary_loss_clip": 0.01128405, + "auxiliary_loss_mlp": 0.01053304, + "balance_loss_clip": 1.05599689, + "balance_loss_mlp": 1.03589976, + "epoch": 0.2204118442807756, + "flos": 12165817760640.0, + "grad_norm": 2.9738702224471583, + "language_loss": 0.9800086, + "learning_rate": 3.631568332524466e-06, + "loss": 1.00182581, + "num_input_tokens_seen": 79062890, + "step": 3666, + "time_per_iteration": 2.702584981918335 + }, + { + "auxiliary_loss_clip": 0.01141573, + "auxiliary_loss_mlp": 0.00776689, + "balance_loss_clip": 1.05254698, + "balance_loss_mlp": 1.00133562, + "epoch": 0.22047196753344356, + "flos": 40108806673920.0, + "grad_norm": 1.894759892223008, + "language_loss": 0.80946934, + "learning_rate": 3.631343053912122e-06, + "loss": 0.82865196, + "num_input_tokens_seen": 79085495, + "step": 3667, + "time_per_iteration": 2.8920814990997314 + }, + { + "auxiliary_loss_clip": 0.01149896, + "auxiliary_loss_mlp": 0.01051036, + "balance_loss_clip": 1.06145239, + "balance_loss_mlp": 1.03161693, + "epoch": 0.22053209078611152, + "flos": 20701137202560.0, + "grad_norm": 1.8771463594277091, + "language_loss": 0.7736783, + "learning_rate": 3.631117713439087e-06, + "loss": 0.79568756, + "num_input_tokens_seen": 79101820, + "step": 3668, + "time_per_iteration": 2.6733500957489014 + }, + { + "auxiliary_loss_clip": 0.01143618, + "auxiliary_loss_mlp": 0.01047462, + "balance_loss_clip": 1.05955744, + "balance_loss_mlp": 1.02972412, + "epoch": 0.2205922140387795, + "flos": 24716309247360.0, + "grad_norm": 1.7809066581326154, + "language_loss": 0.71624571, + "learning_rate": 3.630892311113904e-06, + "loss": 0.7381565, + "num_input_tokens_seen": 79123320, + "step": 3669, + "time_per_iteration": 2.7298974990844727 + }, + { + "auxiliary_loss_clip": 0.01155448, + "auxiliary_loss_mlp": 0.01039044, + "balance_loss_clip": 1.0544126, + "balance_loss_mlp": 1.0217346, + "epoch": 0.22065233729144745, + "flos": 23477247642240.0, + "grad_norm": 2.1257290130035082, + "language_loss": 0.85160267, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.87354761, + "num_input_tokens_seen": 79141615, + "step": 3670, + "time_per_iteration": 2.6624948978424072 + }, + { + "auxiliary_loss_clip": 0.01137906, + "auxiliary_loss_mlp": 0.01042298, + "balance_loss_clip": 1.05475712, + "balance_loss_mlp": 1.02376091, + "epoch": 0.22071246054411545, + "flos": 35225566646400.0, + "grad_norm": 1.8008957470192373, + "language_loss": 0.76928926, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.79109132, + "num_input_tokens_seen": 79164910, + "step": 3671, + "time_per_iteration": 2.7914648056030273 + }, + { + "auxiliary_loss_clip": 0.01126159, + "auxiliary_loss_mlp": 0.01040764, + "balance_loss_clip": 1.05423856, + "balance_loss_mlp": 1.02281129, + "epoch": 0.2207725837967834, + "flos": 18150294908160.0, + "grad_norm": 2.015071454696955, + "language_loss": 0.80643147, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.82810068, + "num_input_tokens_seen": 79179685, + "step": 3672, + "time_per_iteration": 2.674381732940674 + }, + { + "auxiliary_loss_clip": 0.01149005, + "auxiliary_loss_mlp": 0.01047239, + "balance_loss_clip": 1.05706501, + "balance_loss_mlp": 1.02992952, + "epoch": 0.22083270704945138, + "flos": 20479675898880.0, + "grad_norm": 2.222038104071356, + "language_loss": 0.73278964, + "learning_rate": 3.629990083462682e-06, + "loss": 0.75475204, + "num_input_tokens_seen": 79196285, + "step": 3673, + "time_per_iteration": 2.6856846809387207 + }, + { + "auxiliary_loss_clip": 0.01121745, + "auxiliary_loss_mlp": 0.01044908, + "balance_loss_clip": 1.05473876, + "balance_loss_mlp": 1.02608538, + "epoch": 0.22089283030211934, + "flos": 34125801984000.0, + "grad_norm": 1.9530426336903413, + "language_loss": 0.76384282, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.78550935, + "num_input_tokens_seen": 79216060, + "step": 3674, + "time_per_iteration": 2.816190242767334 + }, + { + "auxiliary_loss_clip": 0.01156134, + "auxiliary_loss_mlp": 0.01047969, + "balance_loss_clip": 1.05650616, + "balance_loss_mlp": 1.02850175, + "epoch": 0.2209529535547873, + "flos": 18077216688000.0, + "grad_norm": 2.045565300481816, + "language_loss": 0.74367136, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.76571238, + "num_input_tokens_seen": 79235145, + "step": 3675, + "time_per_iteration": 2.69748592376709 + }, + { + "auxiliary_loss_clip": 0.01155113, + "auxiliary_loss_mlp": 0.01045626, + "balance_loss_clip": 1.05442023, + "balance_loss_mlp": 1.02800727, + "epoch": 0.22101307680745527, + "flos": 27235335070080.0, + "grad_norm": 1.898816078558846, + "language_loss": 0.79801333, + "learning_rate": 3.629312763695772e-06, + "loss": 0.82002068, + "num_input_tokens_seen": 79256960, + "step": 3676, + "time_per_iteration": 2.6792948246002197 + }, + { + "auxiliary_loss_clip": 0.01133095, + "auxiliary_loss_mlp": 0.01049823, + "balance_loss_clip": 1.05366707, + "balance_loss_mlp": 1.03257358, + "epoch": 0.22107320006012326, + "flos": 16543256423040.0, + "grad_norm": 2.1537198076644954, + "language_loss": 0.75327688, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.77510607, + "num_input_tokens_seen": 79274860, + "step": 3677, + "time_per_iteration": 2.781393527984619 + }, + { + "auxiliary_loss_clip": 0.0111612, + "auxiliary_loss_mlp": 0.01050059, + "balance_loss_clip": 1.04986429, + "balance_loss_mlp": 1.03212988, + "epoch": 0.22113332331279123, + "flos": 22054466949120.0, + "grad_norm": 1.7875463894855461, + "language_loss": 0.83287871, + "learning_rate": 3.628860908251712e-06, + "loss": 0.85454059, + "num_input_tokens_seen": 79294005, + "step": 3678, + "time_per_iteration": 2.752838611602783 + }, + { + "auxiliary_loss_clip": 0.01094052, + "auxiliary_loss_mlp": 0.01058605, + "balance_loss_clip": 1.04951406, + "balance_loss_mlp": 1.03992522, + "epoch": 0.2211934465654592, + "flos": 26612787525120.0, + "grad_norm": 1.6742153249136704, + "language_loss": 0.89135075, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.91287732, + "num_input_tokens_seen": 79314005, + "step": 3679, + "time_per_iteration": 2.8282527923583984 + }, + { + "auxiliary_loss_clip": 0.01147641, + "auxiliary_loss_mlp": 0.01054276, + "balance_loss_clip": 1.05507338, + "balance_loss_mlp": 1.03615618, + "epoch": 0.22125356981812716, + "flos": 16360363347840.0, + "grad_norm": 3.092644946410345, + "language_loss": 0.8649044, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.88692355, + "num_input_tokens_seen": 79331030, + "step": 3680, + "time_per_iteration": 2.630829095840454 + }, + { + "auxiliary_loss_clip": 0.0111249, + "auxiliary_loss_mlp": 0.01052062, + "balance_loss_clip": 1.05374503, + "balance_loss_mlp": 1.03395414, + "epoch": 0.22131369307079513, + "flos": 21651118151040.0, + "grad_norm": 1.9427224492838853, + "language_loss": 0.81773758, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.83938313, + "num_input_tokens_seen": 79348560, + "step": 3681, + "time_per_iteration": 2.805880069732666 + }, + { + "auxiliary_loss_clip": 0.01148508, + "auxiliary_loss_mlp": 0.00775652, + "balance_loss_clip": 1.0530386, + "balance_loss_mlp": 1.00146043, + "epoch": 0.2213738163234631, + "flos": 19609524927360.0, + "grad_norm": 2.296553230959153, + "language_loss": 0.80099678, + "learning_rate": 3.62795645623335e-06, + "loss": 0.82023835, + "num_input_tokens_seen": 79367175, + "step": 3682, + "time_per_iteration": 2.624234199523926 + }, + { + "auxiliary_loss_clip": 0.0112405, + "auxiliary_loss_mlp": 0.0105126, + "balance_loss_clip": 1.0500052, + "balance_loss_mlp": 1.03198409, + "epoch": 0.22143393957613106, + "flos": 23623404082560.0, + "grad_norm": 1.6781760642146926, + "language_loss": 0.77394038, + "learning_rate": 3.627730188876638e-06, + "loss": 0.7956934, + "num_input_tokens_seen": 79388435, + "step": 3683, + "time_per_iteration": 2.6746323108673096 + }, + { + "auxiliary_loss_clip": 0.01129753, + "auxiliary_loss_mlp": 0.01051291, + "balance_loss_clip": 1.05048668, + "balance_loss_mlp": 1.03411245, + "epoch": 0.22149406282879905, + "flos": 26177801823360.0, + "grad_norm": 2.1201256685163323, + "language_loss": 0.72406399, + "learning_rate": 3.627503859796234e-06, + "loss": 0.7458744, + "num_input_tokens_seen": 79407910, + "step": 3684, + "time_per_iteration": 2.695958375930786 + }, + { + "auxiliary_loss_clip": 0.01084051, + "auxiliary_loss_mlp": 0.01045612, + "balance_loss_clip": 1.04670835, + "balance_loss_mlp": 1.02571654, + "epoch": 0.221554186081467, + "flos": 14538758970240.0, + "grad_norm": 2.1308896442870893, + "language_loss": 0.79817796, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.81947458, + "num_input_tokens_seen": 79424020, + "step": 3685, + "time_per_iteration": 2.7443795204162598 + }, + { + "auxiliary_loss_clip": 0.01147394, + "auxiliary_loss_mlp": 0.01045457, + "balance_loss_clip": 1.05201805, + "balance_loss_mlp": 1.02867222, + "epoch": 0.22161430933413498, + "flos": 22238257864320.0, + "grad_norm": 1.6870532517893482, + "language_loss": 0.87305272, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.89498115, + "num_input_tokens_seen": 79445605, + "step": 3686, + "time_per_iteration": 4.388494968414307 + }, + { + "auxiliary_loss_clip": 0.01137917, + "auxiliary_loss_mlp": 0.0104367, + "balance_loss_clip": 1.052562, + "balance_loss_mlp": 1.02620554, + "epoch": 0.22167443258680294, + "flos": 23476529370240.0, + "grad_norm": 1.8821221420403713, + "language_loss": 0.78069639, + "learning_rate": 3.626824502298707e-06, + "loss": 0.80251229, + "num_input_tokens_seen": 79463850, + "step": 3687, + "time_per_iteration": 4.123531103134155 + }, + { + "auxiliary_loss_clip": 0.0112545, + "auxiliary_loss_mlp": 0.01052599, + "balance_loss_clip": 1.0494144, + "balance_loss_mlp": 1.0331558, + "epoch": 0.2217345558394709, + "flos": 23221132692480.0, + "grad_norm": 1.8251811803295879, + "language_loss": 0.84860861, + "learning_rate": 3.626597926409383e-06, + "loss": 0.8703891, + "num_input_tokens_seen": 79482845, + "step": 3688, + "time_per_iteration": 4.287938594818115 + }, + { + "auxiliary_loss_clip": 0.01110764, + "auxiliary_loss_mlp": 0.01051634, + "balance_loss_clip": 1.04967332, + "balance_loss_mlp": 1.03254843, + "epoch": 0.22179467909213887, + "flos": 20011078045440.0, + "grad_norm": 1.7785994747216247, + "language_loss": 0.81150943, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.83313334, + "num_input_tokens_seen": 79501550, + "step": 3689, + "time_per_iteration": 2.7521302700042725 + }, + { + "auxiliary_loss_clip": 0.01124628, + "auxiliary_loss_mlp": 0.01048971, + "balance_loss_clip": 1.05078936, + "balance_loss_mlp": 1.03131568, + "epoch": 0.22185480234480687, + "flos": 19683034110720.0, + "grad_norm": 1.7481542974535997, + "language_loss": 0.70018351, + "learning_rate": 3.626144589597061e-06, + "loss": 0.72191954, + "num_input_tokens_seen": 79519680, + "step": 3690, + "time_per_iteration": 2.6664223670959473 + }, + { + "auxiliary_loss_clip": 0.01147193, + "auxiliary_loss_mlp": 0.00777365, + "balance_loss_clip": 1.0537169, + "balance_loss_mlp": 1.00153625, + "epoch": 0.22191492559747483, + "flos": 21981316901760.0, + "grad_norm": 1.8112729447523994, + "language_loss": 0.72609359, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.74533916, + "num_input_tokens_seen": 79539000, + "step": 3691, + "time_per_iteration": 2.6724495887756348 + }, + { + "auxiliary_loss_clip": 0.01144688, + "auxiliary_loss_mlp": 0.01046427, + "balance_loss_clip": 1.05663919, + "balance_loss_mlp": 1.0275923, + "epoch": 0.2219750488501428, + "flos": 23222066446080.0, + "grad_norm": 1.8134603978799304, + "language_loss": 0.71503472, + "learning_rate": 3.625691006130477e-06, + "loss": 0.73694593, + "num_input_tokens_seen": 79559695, + "step": 3692, + "time_per_iteration": 2.6743686199188232 + }, + { + "auxiliary_loss_clip": 0.01147828, + "auxiliary_loss_mlp": 0.01048973, + "balance_loss_clip": 1.05410266, + "balance_loss_mlp": 1.03098464, + "epoch": 0.22203517210281076, + "flos": 22453685683200.0, + "grad_norm": 2.1147705582229577, + "language_loss": 0.87551594, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.89748394, + "num_input_tokens_seen": 79579095, + "step": 3693, + "time_per_iteration": 4.2962939739227295 + }, + { + "auxiliary_loss_clip": 0.01141134, + "auxiliary_loss_mlp": 0.01041066, + "balance_loss_clip": 1.0537045, + "balance_loss_mlp": 1.02479386, + "epoch": 0.22209529535547873, + "flos": 17564555825280.0, + "grad_norm": 1.9865017520636683, + "language_loss": 0.85553116, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.87735319, + "num_input_tokens_seen": 79596430, + "step": 3694, + "time_per_iteration": 2.585657835006714 + }, + { + "auxiliary_loss_clip": 0.01107468, + "auxiliary_loss_mlp": 0.01045482, + "balance_loss_clip": 1.04370403, + "balance_loss_mlp": 1.02640843, + "epoch": 0.2221554186081467, + "flos": 21469015175040.0, + "grad_norm": 2.1752375595399136, + "language_loss": 0.68740189, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.70893133, + "num_input_tokens_seen": 79615825, + "step": 3695, + "time_per_iteration": 2.744264841079712 + }, + { + "auxiliary_loss_clip": 0.01118075, + "auxiliary_loss_mlp": 0.01047291, + "balance_loss_clip": 1.051736, + "balance_loss_mlp": 1.0310905, + "epoch": 0.22221554186081466, + "flos": 27673445255040.0, + "grad_norm": 1.6851408018575031, + "language_loss": 0.71540272, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.73705637, + "num_input_tokens_seen": 79637875, + "step": 3696, + "time_per_iteration": 2.7320780754089355 + }, + { + "auxiliary_loss_clip": 0.01140935, + "auxiliary_loss_mlp": 0.0104304, + "balance_loss_clip": 1.05123305, + "balance_loss_mlp": 1.02455115, + "epoch": 0.22227566511348265, + "flos": 25958926298880.0, + "grad_norm": 1.7186386141421306, + "language_loss": 0.87905443, + "learning_rate": 3.624555968803217e-06, + "loss": 0.90089417, + "num_input_tokens_seen": 79656970, + "step": 3697, + "time_per_iteration": 2.65919828414917 + }, + { + "auxiliary_loss_clip": 0.01118987, + "auxiliary_loss_mlp": 0.0104214, + "balance_loss_clip": 1.04718316, + "balance_loss_mlp": 1.0255338, + "epoch": 0.22233578836615062, + "flos": 39203678833920.0, + "grad_norm": 1.6515031384229777, + "language_loss": 0.65900242, + "learning_rate": 3.624328776493346e-06, + "loss": 0.6806137, + "num_input_tokens_seen": 79680275, + "step": 3698, + "time_per_iteration": 2.7708024978637695 + }, + { + "auxiliary_loss_clip": 0.01142696, + "auxiliary_loss_mlp": 0.01049333, + "balance_loss_clip": 1.05630088, + "balance_loss_mlp": 1.03102303, + "epoch": 0.22239591161881858, + "flos": 36283782251520.0, + "grad_norm": 1.9634592665257078, + "language_loss": 0.82520199, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.84712231, + "num_input_tokens_seen": 79701255, + "step": 3699, + "time_per_iteration": 2.7743008136749268 + }, + { + "auxiliary_loss_clip": 0.01129692, + "auxiliary_loss_mlp": 0.01047594, + "balance_loss_clip": 1.05154991, + "balance_loss_mlp": 1.02939105, + "epoch": 0.22245603487148655, + "flos": 19719591177600.0, + "grad_norm": 1.6711069078421557, + "language_loss": 0.79384553, + "learning_rate": 3.62387420709809e-06, + "loss": 0.8156184, + "num_input_tokens_seen": 79721315, + "step": 3700, + "time_per_iteration": 2.652172327041626 + }, + { + "auxiliary_loss_clip": 0.01111144, + "auxiliary_loss_mlp": 0.01045464, + "balance_loss_clip": 1.04893112, + "balance_loss_mlp": 1.02608061, + "epoch": 0.2225161581241545, + "flos": 46280450615040.0, + "grad_norm": 2.123831341506728, + "language_loss": 0.72503817, + "learning_rate": 3.623646830029943e-06, + "loss": 0.74660432, + "num_input_tokens_seen": 79742705, + "step": 3701, + "time_per_iteration": 2.943124294281006 + }, + { + "auxiliary_loss_clip": 0.01139412, + "auxiliary_loss_mlp": 0.0104206, + "balance_loss_clip": 1.05053067, + "balance_loss_mlp": 1.0246197, + "epoch": 0.22257628137682248, + "flos": 23696194993920.0, + "grad_norm": 1.9127522113256972, + "language_loss": 0.79901838, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.82083315, + "num_input_tokens_seen": 79763000, + "step": 3702, + "time_per_iteration": 2.6978282928466797 + }, + { + "auxiliary_loss_clip": 0.01129024, + "auxiliary_loss_mlp": 0.01044082, + "balance_loss_clip": 1.04707038, + "balance_loss_mlp": 1.02655816, + "epoch": 0.22263640462949044, + "flos": 19353984595200.0, + "grad_norm": 1.8258996761992496, + "language_loss": 0.78237271, + "learning_rate": 3.623191891195723e-06, + "loss": 0.80410373, + "num_input_tokens_seen": 79781335, + "step": 3703, + "time_per_iteration": 2.6528990268707275 + }, + { + "auxiliary_loss_clip": 0.01140219, + "auxiliary_loss_mlp": 0.01036919, + "balance_loss_clip": 1.0503273, + "balance_loss_mlp": 1.0171181, + "epoch": 0.22269652788215843, + "flos": 20776047016320.0, + "grad_norm": 2.1693263198920563, + "language_loss": 0.74490714, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.76667851, + "num_input_tokens_seen": 79800150, + "step": 3704, + "time_per_iteration": 2.679184913635254 + }, + { + "auxiliary_loss_clip": 0.0110341, + "auxiliary_loss_mlp": 0.01043861, + "balance_loss_clip": 1.046996, + "balance_loss_mlp": 1.02684951, + "epoch": 0.2227566511348264, + "flos": 47958843467520.0, + "grad_norm": 1.8279463297536431, + "language_loss": 0.644319, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.66579175, + "num_input_tokens_seen": 79822390, + "step": 3705, + "time_per_iteration": 2.972221612930298 + }, + { + "auxiliary_loss_clip": 0.01037239, + "auxiliary_loss_mlp": 0.01023153, + "balance_loss_clip": 1.03748369, + "balance_loss_mlp": 1.02111423, + "epoch": 0.22281677438749437, + "flos": 66218953230720.0, + "grad_norm": 1.2472387125776994, + "language_loss": 0.65169704, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.67230093, + "num_input_tokens_seen": 79873350, + "step": 3706, + "time_per_iteration": 3.118619203567505 + }, + { + "auxiliary_loss_clip": 0.01116185, + "auxiliary_loss_mlp": 0.01040401, + "balance_loss_clip": 1.04938805, + "balance_loss_mlp": 1.02290082, + "epoch": 0.22287689764016233, + "flos": 21871609787520.0, + "grad_norm": 1.912279921070755, + "language_loss": 0.80597419, + "learning_rate": 3.622281274977141e-06, + "loss": 0.8275401, + "num_input_tokens_seen": 79891715, + "step": 3707, + "time_per_iteration": 2.6555368900299072 + }, + { + "auxiliary_loss_clip": 0.01149897, + "auxiliary_loss_mlp": 0.01039316, + "balance_loss_clip": 1.05199265, + "balance_loss_mlp": 1.02203059, + "epoch": 0.2229370208928303, + "flos": 27672475587840.0, + "grad_norm": 1.9339558574691282, + "language_loss": 0.78542316, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.80731529, + "num_input_tokens_seen": 79911175, + "step": 3708, + "time_per_iteration": 2.7179131507873535 + }, + { + "auxiliary_loss_clip": 0.01128276, + "auxiliary_loss_mlp": 0.01042525, + "balance_loss_clip": 1.05055118, + "balance_loss_mlp": 1.02363038, + "epoch": 0.22299714414549826, + "flos": 30154657034880.0, + "grad_norm": 1.8085596793383067, + "language_loss": 0.80606776, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.82777578, + "num_input_tokens_seen": 79931875, + "step": 3709, + "time_per_iteration": 2.810605764389038 + }, + { + "auxiliary_loss_clip": 0.01135044, + "auxiliary_loss_mlp": 0.00777248, + "balance_loss_clip": 1.0480969, + "balance_loss_mlp": 1.0014261, + "epoch": 0.22305726739816625, + "flos": 23143134309120.0, + "grad_norm": 2.100780376064183, + "language_loss": 0.69068789, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.70981085, + "num_input_tokens_seen": 79952445, + "step": 3710, + "time_per_iteration": 2.7197980880737305 + }, + { + "auxiliary_loss_clip": 0.01111671, + "auxiliary_loss_mlp": 0.01050475, + "balance_loss_clip": 1.04630041, + "balance_loss_mlp": 1.03220057, + "epoch": 0.22311739065083422, + "flos": 19172061187200.0, + "grad_norm": 2.1025491711486763, + "language_loss": 0.90782154, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.92944294, + "num_input_tokens_seen": 79971030, + "step": 3711, + "time_per_iteration": 2.808014154434204 + }, + { + "auxiliary_loss_clip": 0.01117969, + "auxiliary_loss_mlp": 0.01059175, + "balance_loss_clip": 1.04696095, + "balance_loss_mlp": 1.03921938, + "epoch": 0.22317751390350218, + "flos": 13617757319040.0, + "grad_norm": 6.2447945102939615, + "language_loss": 0.89070308, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.91247451, + "num_input_tokens_seen": 79982085, + "step": 3712, + "time_per_iteration": 2.6701955795288086 + }, + { + "auxiliary_loss_clip": 0.01150852, + "auxiliary_loss_mlp": 0.01044271, + "balance_loss_clip": 1.05445373, + "balance_loss_mlp": 1.02627039, + "epoch": 0.22323763715617015, + "flos": 11029065068160.0, + "grad_norm": 5.249819485386642, + "language_loss": 0.75858659, + "learning_rate": 3.620913505310117e-06, + "loss": 0.78053784, + "num_input_tokens_seen": 79997460, + "step": 3713, + "time_per_iteration": 2.5961148738861084 + }, + { + "auxiliary_loss_clip": 0.01106588, + "auxiliary_loss_mlp": 0.01043158, + "balance_loss_clip": 1.05345535, + "balance_loss_mlp": 1.0252645, + "epoch": 0.22329776040883811, + "flos": 41351531466240.0, + "grad_norm": 1.7774284049242903, + "language_loss": 0.62422931, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.6457268, + "num_input_tokens_seen": 80022450, + "step": 3714, + "time_per_iteration": 2.9655838012695312 + }, + { + "auxiliary_loss_clip": 0.0112071, + "auxiliary_loss_mlp": 0.01033065, + "balance_loss_clip": 1.05258489, + "balance_loss_mlp": 1.0163759, + "epoch": 0.22335788366150608, + "flos": 25119478477440.0, + "grad_norm": 5.465931600334143, + "language_loss": 0.79076529, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.81230301, + "num_input_tokens_seen": 80042100, + "step": 3715, + "time_per_iteration": 2.8040106296539307 + }, + { + "auxiliary_loss_clip": 0.01113318, + "auxiliary_loss_mlp": 0.01049585, + "balance_loss_clip": 1.05601192, + "balance_loss_mlp": 1.03176367, + "epoch": 0.22341800691417404, + "flos": 16983377769600.0, + "grad_norm": 2.696607190089822, + "language_loss": 0.77416688, + "learning_rate": 3.620228790579645e-06, + "loss": 0.79579592, + "num_input_tokens_seen": 80059690, + "step": 3716, + "time_per_iteration": 2.721008777618408 + }, + { + "auxiliary_loss_clip": 0.01123787, + "auxiliary_loss_mlp": 0.01043954, + "balance_loss_clip": 1.04860306, + "balance_loss_mlp": 1.02644157, + "epoch": 0.22347813016684204, + "flos": 14136738975360.0, + "grad_norm": 3.4762745813408884, + "language_loss": 0.79258984, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.81426722, + "num_input_tokens_seen": 80076060, + "step": 3717, + "time_per_iteration": 2.724637746810913 + }, + { + "auxiliary_loss_clip": 0.0107853, + "auxiliary_loss_mlp": 0.01042478, + "balance_loss_clip": 1.04485083, + "balance_loss_mlp": 1.02390504, + "epoch": 0.22353825341951, + "flos": 23583147914880.0, + "grad_norm": 1.9798483733973138, + "language_loss": 0.67890245, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.70011252, + "num_input_tokens_seen": 80094760, + "step": 3718, + "time_per_iteration": 2.8178799152374268 + }, + { + "auxiliary_loss_clip": 0.01128946, + "auxiliary_loss_mlp": 0.01043035, + "balance_loss_clip": 1.04887676, + "balance_loss_mlp": 1.02374721, + "epoch": 0.22359837667217797, + "flos": 29824206888960.0, + "grad_norm": 1.6261924310986715, + "language_loss": 0.81046188, + "learning_rate": 3.619543522896045e-06, + "loss": 0.83218175, + "num_input_tokens_seen": 80114475, + "step": 3719, + "time_per_iteration": 2.8068079948425293 + }, + { + "auxiliary_loss_clip": 0.0112823, + "auxiliary_loss_mlp": 0.0105526, + "balance_loss_clip": 1.05054009, + "balance_loss_mlp": 1.03555441, + "epoch": 0.22365849992484593, + "flos": 17603088140160.0, + "grad_norm": 2.128611791985372, + "language_loss": 0.86535168, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.88718653, + "num_input_tokens_seen": 80132920, + "step": 3720, + "time_per_iteration": 2.726252794265747 + }, + { + "auxiliary_loss_clip": 0.01123833, + "auxiliary_loss_mlp": 0.01039252, + "balance_loss_clip": 1.05347347, + "balance_loss_mlp": 1.0207628, + "epoch": 0.2237186231775139, + "flos": 22710949868160.0, + "grad_norm": 1.725668609175168, + "language_loss": 0.7471531, + "learning_rate": 3.619086370692945e-06, + "loss": 0.76878393, + "num_input_tokens_seen": 80152845, + "step": 3721, + "time_per_iteration": 2.77329158782959 + }, + { + "auxiliary_loss_clip": 0.01158005, + "auxiliary_loss_mlp": 0.01043442, + "balance_loss_clip": 1.05607998, + "balance_loss_mlp": 1.02497673, + "epoch": 0.22377874643018186, + "flos": 13371518609280.0, + "grad_norm": 3.166607303525693, + "language_loss": 0.7957024, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.8177169, + "num_input_tokens_seen": 80170680, + "step": 3722, + "time_per_iteration": 2.7204909324645996 + }, + { + "auxiliary_loss_clip": 0.01113056, + "auxiliary_loss_mlp": 0.01041868, + "balance_loss_clip": 1.0520618, + "balance_loss_mlp": 1.02571511, + "epoch": 0.22383886968284986, + "flos": 17894970057600.0, + "grad_norm": 2.0043774256219997, + "language_loss": 0.82129884, + "learning_rate": 3.618628972906178e-06, + "loss": 0.84284806, + "num_input_tokens_seen": 80189030, + "step": 3723, + "time_per_iteration": 2.7908549308776855 + }, + { + "auxiliary_loss_clip": 0.01155309, + "auxiliary_loss_mlp": 0.01046826, + "balance_loss_clip": 1.05468059, + "balance_loss_mlp": 1.02857494, + "epoch": 0.22389899293551782, + "flos": 23879123982720.0, + "grad_norm": 2.0838579777085022, + "language_loss": 0.84742224, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.86944354, + "num_input_tokens_seen": 80208365, + "step": 3724, + "time_per_iteration": 2.691678047180176 + }, + { + "auxiliary_loss_clip": 0.01123425, + "auxiliary_loss_mlp": 0.01042537, + "balance_loss_clip": 1.0494504, + "balance_loss_mlp": 1.02516866, + "epoch": 0.2239591161881858, + "flos": 27272430840960.0, + "grad_norm": 1.76453761267329, + "language_loss": 0.79456621, + "learning_rate": 3.618171329605121e-06, + "loss": 0.81622583, + "num_input_tokens_seen": 80228685, + "step": 3725, + "time_per_iteration": 4.339299917221069 + }, + { + "auxiliary_loss_clip": 0.01091555, + "auxiliary_loss_mlp": 0.01043361, + "balance_loss_clip": 1.05116296, + "balance_loss_mlp": 1.02538443, + "epoch": 0.22401923944085375, + "flos": 22236857233920.0, + "grad_norm": 1.776149940187026, + "language_loss": 0.77333415, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.79468334, + "num_input_tokens_seen": 80247635, + "step": 3726, + "time_per_iteration": 4.320322275161743 + }, + { + "auxiliary_loss_clip": 0.0115151, + "auxiliary_loss_mlp": 0.01047267, + "balance_loss_clip": 1.05424356, + "balance_loss_mlp": 1.02664328, + "epoch": 0.22407936269352172, + "flos": 12053668521600.0, + "grad_norm": 2.83844669603944, + "language_loss": 0.72643399, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.74842173, + "num_input_tokens_seen": 80260045, + "step": 3727, + "time_per_iteration": 4.218656539916992 + }, + { + "auxiliary_loss_clip": 0.01157504, + "auxiliary_loss_mlp": 0.01043436, + "balance_loss_clip": 1.0541296, + "balance_loss_mlp": 1.02321815, + "epoch": 0.22413948594618968, + "flos": 19353553632000.0, + "grad_norm": 2.250671737688348, + "language_loss": 0.86600292, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.88801229, + "num_input_tokens_seen": 80277680, + "step": 3728, + "time_per_iteration": 2.650423765182495 + }, + { + "auxiliary_loss_clip": 0.01122602, + "auxiliary_loss_mlp": 0.01053562, + "balance_loss_clip": 1.050982, + "balance_loss_mlp": 1.03134131, + "epoch": 0.22419960919885765, + "flos": 24170000319360.0, + "grad_norm": 2.1953419048873877, + "language_loss": 0.80038953, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.82215106, + "num_input_tokens_seen": 80294795, + "step": 3729, + "time_per_iteration": 2.7553794384002686 + }, + { + "auxiliary_loss_clip": 0.01126228, + "auxiliary_loss_mlp": 0.01046911, + "balance_loss_clip": 1.05183935, + "balance_loss_mlp": 1.02992368, + "epoch": 0.22425973245152564, + "flos": 27378977558400.0, + "grad_norm": 1.8211738544282683, + "language_loss": 0.86968076, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.89141214, + "num_input_tokens_seen": 80315425, + "step": 3730, + "time_per_iteration": 2.8044395446777344 + }, + { + "auxiliary_loss_clip": 0.01121982, + "auxiliary_loss_mlp": 0.00775761, + "balance_loss_clip": 1.04924226, + "balance_loss_mlp": 1.00148201, + "epoch": 0.2243198557041936, + "flos": 13735652734080.0, + "grad_norm": 2.1817469574553017, + "language_loss": 0.73091185, + "learning_rate": 3.616796927310559e-06, + "loss": 0.74988931, + "num_input_tokens_seen": 80333905, + "step": 3731, + "time_per_iteration": 2.764198064804077 + }, + { + "auxiliary_loss_clip": 0.01127044, + "auxiliary_loss_mlp": 0.0104235, + "balance_loss_clip": 1.05654919, + "balance_loss_mlp": 1.02467108, + "epoch": 0.22437997895686157, + "flos": 19530700531200.0, + "grad_norm": 2.1924274894904787, + "language_loss": 0.75427651, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.77597046, + "num_input_tokens_seen": 80352165, + "step": 3732, + "time_per_iteration": 4.544835090637207 + }, + { + "auxiliary_loss_clip": 0.01155285, + "auxiliary_loss_mlp": 0.01053522, + "balance_loss_clip": 1.05655456, + "balance_loss_mlp": 1.03560436, + "epoch": 0.22444010220952954, + "flos": 23696230907520.0, + "grad_norm": 1.745203479087184, + "language_loss": 0.88139856, + "learning_rate": 3.616338302646873e-06, + "loss": 0.90348667, + "num_input_tokens_seen": 80371305, + "step": 3733, + "time_per_iteration": 2.7097933292388916 + }, + { + "auxiliary_loss_clip": 0.0110922, + "auxiliary_loss_mlp": 0.01040674, + "balance_loss_clip": 1.05094051, + "balance_loss_mlp": 1.02264953, + "epoch": 0.2245002254621975, + "flos": 22382905933440.0, + "grad_norm": 1.6873732683679492, + "language_loss": 0.84643197, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.86793089, + "num_input_tokens_seen": 80391020, + "step": 3734, + "time_per_iteration": 2.7647547721862793 + }, + { + "auxiliary_loss_clip": 0.0113181, + "auxiliary_loss_mlp": 0.01049327, + "balance_loss_clip": 1.05362856, + "balance_loss_mlp": 1.03149319, + "epoch": 0.22456034871486547, + "flos": 26942303917440.0, + "grad_norm": 1.774553175519815, + "language_loss": 0.7679311, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.78974247, + "num_input_tokens_seen": 80411365, + "step": 3735, + "time_per_iteration": 2.7682430744171143 + }, + { + "auxiliary_loss_clip": 0.01138858, + "auxiliary_loss_mlp": 0.01045746, + "balance_loss_clip": 1.06029248, + "balance_loss_mlp": 1.02927136, + "epoch": 0.22462047196753343, + "flos": 28983538005120.0, + "grad_norm": 1.671324371931155, + "language_loss": 0.842767, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.86461306, + "num_input_tokens_seen": 80431075, + "step": 3736, + "time_per_iteration": 3.009368419647217 + }, + { + "auxiliary_loss_clip": 0.0111279, + "auxiliary_loss_mlp": 0.01044111, + "balance_loss_clip": 1.05240226, + "balance_loss_mlp": 1.02677774, + "epoch": 0.22468059522020142, + "flos": 20011329440640.0, + "grad_norm": 1.8971112354532307, + "language_loss": 0.86643183, + "learning_rate": 3.615420317888586e-06, + "loss": 0.88800085, + "num_input_tokens_seen": 80449240, + "step": 3737, + "time_per_iteration": 2.792965888977051 + }, + { + "auxiliary_loss_clip": 0.0115891, + "auxiliary_loss_mlp": 0.0104972, + "balance_loss_clip": 1.05792093, + "balance_loss_mlp": 1.03051496, + "epoch": 0.2247407184728694, + "flos": 29314239546240.0, + "grad_norm": 6.664079021041442, + "language_loss": 0.79027152, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.81235784, + "num_input_tokens_seen": 80467900, + "step": 3738, + "time_per_iteration": 2.716878652572632 + }, + { + "auxiliary_loss_clip": 0.01122737, + "auxiliary_loss_mlp": 0.01047993, + "balance_loss_clip": 1.0520165, + "balance_loss_mlp": 1.0315063, + "epoch": 0.22480084172553735, + "flos": 22310366417280.0, + "grad_norm": 1.837059456311059, + "language_loss": 0.76693523, + "learning_rate": 3.614960957933224e-06, + "loss": 0.78864253, + "num_input_tokens_seen": 80487100, + "step": 3739, + "time_per_iteration": 2.743222713470459 + }, + { + "auxiliary_loss_clip": 0.01116493, + "auxiliary_loss_mlp": 0.01049772, + "balance_loss_clip": 1.05008686, + "balance_loss_mlp": 1.03011417, + "epoch": 0.22486096497820532, + "flos": 25591272641280.0, + "grad_norm": 2.2924613412630133, + "language_loss": 0.74577379, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.7674365, + "num_input_tokens_seen": 80508625, + "step": 3740, + "time_per_iteration": 2.7339253425598145 + }, + { + "auxiliary_loss_clip": 0.01152276, + "auxiliary_loss_mlp": 0.01045147, + "balance_loss_clip": 1.05556941, + "balance_loss_mlp": 1.02728927, + "epoch": 0.22492108823087328, + "flos": 17639824775040.0, + "grad_norm": 1.9086069443180373, + "language_loss": 0.75610423, + "learning_rate": 3.614501353019939e-06, + "loss": 0.77807844, + "num_input_tokens_seen": 80527345, + "step": 3741, + "time_per_iteration": 2.7347571849823 + }, + { + "auxiliary_loss_clip": 0.01133279, + "auxiliary_loss_mlp": 0.01039745, + "balance_loss_clip": 1.05599904, + "balance_loss_mlp": 1.02316284, + "epoch": 0.22498121148354125, + "flos": 16034653797120.0, + "grad_norm": 1.7754272123040742, + "language_loss": 0.87332213, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.89505225, + "num_input_tokens_seen": 80545545, + "step": 3742, + "time_per_iteration": 2.702103614807129 + }, + { + "auxiliary_loss_clip": 0.01095068, + "auxiliary_loss_mlp": 0.01053093, + "balance_loss_clip": 1.04728913, + "balance_loss_mlp": 1.03398395, + "epoch": 0.22504133473620924, + "flos": 24023772051840.0, + "grad_norm": 2.1035678371185256, + "language_loss": 0.812823, + "learning_rate": 3.614041503218444e-06, + "loss": 0.83430457, + "num_input_tokens_seen": 80565040, + "step": 3743, + "time_per_iteration": 2.777566909790039 + }, + { + "auxiliary_loss_clip": 0.01142483, + "auxiliary_loss_mlp": 0.01040692, + "balance_loss_clip": 1.05282855, + "balance_loss_mlp": 1.02319252, + "epoch": 0.2251014579888772, + "flos": 16763963541120.0, + "grad_norm": 2.836562973763206, + "language_loss": 0.63821399, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.66004574, + "num_input_tokens_seen": 80582815, + "step": 3744, + "time_per_iteration": 2.6738698482513428 + }, + { + "auxiliary_loss_clip": 0.01139201, + "auxiliary_loss_mlp": 0.01043137, + "balance_loss_clip": 1.05523586, + "balance_loss_mlp": 1.0255779, + "epoch": 0.22516158124154517, + "flos": 13991013498240.0, + "grad_norm": 4.405698565190268, + "language_loss": 0.76340199, + "learning_rate": 3.613581408598489e-06, + "loss": 0.78522527, + "num_input_tokens_seen": 80600865, + "step": 3745, + "time_per_iteration": 2.8423044681549072 + }, + { + "auxiliary_loss_clip": 0.01116037, + "auxiliary_loss_mlp": 0.0104407, + "balance_loss_clip": 1.04906797, + "balance_loss_mlp": 1.0267489, + "epoch": 0.22522170449421314, + "flos": 14390016750720.0, + "grad_norm": 7.51155110796741, + "language_loss": 0.8056733, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.82727438, + "num_input_tokens_seen": 80617455, + "step": 3746, + "time_per_iteration": 2.743417739868164 + }, + { + "auxiliary_loss_clip": 0.01142091, + "auxiliary_loss_mlp": 0.01050597, + "balance_loss_clip": 1.05323768, + "balance_loss_mlp": 1.0328114, + "epoch": 0.2252818277468811, + "flos": 23805542972160.0, + "grad_norm": 2.6189948571262116, + "language_loss": 0.86153656, + "learning_rate": 3.613121069229862e-06, + "loss": 0.88346344, + "num_input_tokens_seen": 80635125, + "step": 3747, + "time_per_iteration": 2.7622148990631104 + }, + { + "auxiliary_loss_clip": 0.01138021, + "auxiliary_loss_mlp": 0.0077598, + "balance_loss_clip": 1.05126321, + "balance_loss_mlp": 1.00154519, + "epoch": 0.22534195099954907, + "flos": 24718033100160.0, + "grad_norm": 2.3477587169419483, + "language_loss": 0.76400602, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.78314602, + "num_input_tokens_seen": 80656370, + "step": 3748, + "time_per_iteration": 2.7347261905670166 + }, + { + "auxiliary_loss_clip": 0.01156837, + "auxiliary_loss_mlp": 0.01043045, + "balance_loss_clip": 1.05704546, + "balance_loss_mlp": 1.02525926, + "epoch": 0.22540207425221703, + "flos": 21032341534080.0, + "grad_norm": 1.5503962030073002, + "language_loss": 0.7984724, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.82047117, + "num_input_tokens_seen": 80676495, + "step": 3749, + "time_per_iteration": 2.6900558471679688 + }, + { + "auxiliary_loss_clip": 0.01123701, + "auxiliary_loss_mlp": 0.01041028, + "balance_loss_clip": 1.05050755, + "balance_loss_mlp": 1.02436304, + "epoch": 0.22546219750488503, + "flos": 19390362094080.0, + "grad_norm": 3.015206251853355, + "language_loss": 0.79585081, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.81749809, + "num_input_tokens_seen": 80694755, + "step": 3750, + "time_per_iteration": 2.727651596069336 + }, + { + "auxiliary_loss_clip": 0.01097337, + "auxiliary_loss_mlp": 0.01055462, + "balance_loss_clip": 1.05065274, + "balance_loss_mlp": 1.03756917, + "epoch": 0.225522320757553, + "flos": 25192628524800.0, + "grad_norm": 2.662961533862713, + "language_loss": 0.82433236, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.84586036, + "num_input_tokens_seen": 80713670, + "step": 3751, + "time_per_iteration": 2.827995538711548 + }, + { + "auxiliary_loss_clip": 0.01121046, + "auxiliary_loss_mlp": 0.01046103, + "balance_loss_clip": 1.05429292, + "balance_loss_mlp": 1.02828133, + "epoch": 0.22558244401022096, + "flos": 17163110448000.0, + "grad_norm": 2.0142745824369315, + "language_loss": 0.83813727, + "learning_rate": 3.611969150491165e-06, + "loss": 0.8598088, + "num_input_tokens_seen": 80731450, + "step": 3752, + "time_per_iteration": 2.78725266456604 + }, + { + "auxiliary_loss_clip": 0.01152116, + "auxiliary_loss_mlp": 0.01037502, + "balance_loss_clip": 1.05584741, + "balance_loss_mlp": 1.02123034, + "epoch": 0.22564256726288892, + "flos": 15231008856960.0, + "grad_norm": 1.9292267305553392, + "language_loss": 0.78254855, + "learning_rate": 3.611738583330375e-06, + "loss": 0.80444479, + "num_input_tokens_seen": 80748415, + "step": 3753, + "time_per_iteration": 2.7116169929504395 + }, + { + "auxiliary_loss_clip": 0.01126321, + "auxiliary_loss_mlp": 0.0104341, + "balance_loss_clip": 1.05120027, + "balance_loss_mlp": 1.02546871, + "epoch": 0.2257026905155569, + "flos": 34568652764160.0, + "grad_norm": 1.8777790089425805, + "language_loss": 0.78391469, + "learning_rate": 3.611507955052295e-06, + "loss": 0.80561191, + "num_input_tokens_seen": 80770835, + "step": 3754, + "time_per_iteration": 2.91738224029541 + }, + { + "auxiliary_loss_clip": 0.01128102, + "auxiliary_loss_mlp": 0.01048192, + "balance_loss_clip": 1.05648673, + "balance_loss_mlp": 1.03040624, + "epoch": 0.22576281376822485, + "flos": 19938430788480.0, + "grad_norm": 1.9337610105869587, + "language_loss": 0.70648986, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.72825277, + "num_input_tokens_seen": 80787840, + "step": 3755, + "time_per_iteration": 2.7427992820739746 + }, + { + "auxiliary_loss_clip": 0.01126515, + "auxiliary_loss_mlp": 0.01053366, + "balance_loss_clip": 1.05531752, + "balance_loss_mlp": 1.03559232, + "epoch": 0.22582293702089282, + "flos": 24602005192320.0, + "grad_norm": 3.9817469401483216, + "language_loss": 0.77865845, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.80045724, + "num_input_tokens_seen": 80806335, + "step": 3756, + "time_per_iteration": 2.7879996299743652 + }, + { + "auxiliary_loss_clip": 0.01132066, + "auxiliary_loss_mlp": 0.01044227, + "balance_loss_clip": 1.0559032, + "balance_loss_mlp": 1.0261426, + "epoch": 0.2258830602735608, + "flos": 23035438356480.0, + "grad_norm": 1.801741818571408, + "language_loss": 0.82615864, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.84792161, + "num_input_tokens_seen": 80825355, + "step": 3757, + "time_per_iteration": 2.685218095779419 + }, + { + "auxiliary_loss_clip": 0.01140048, + "auxiliary_loss_mlp": 0.01047555, + "balance_loss_clip": 1.05321026, + "balance_loss_mlp": 1.02917302, + "epoch": 0.22594318352622877, + "flos": 22158427887360.0, + "grad_norm": 2.3786564016745495, + "language_loss": 0.73007452, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.7519505, + "num_input_tokens_seen": 80842570, + "step": 3758, + "time_per_iteration": 2.6716878414154053 + }, + { + "auxiliary_loss_clip": 0.01137739, + "auxiliary_loss_mlp": 0.01048984, + "balance_loss_clip": 1.0577718, + "balance_loss_mlp": 1.03019619, + "epoch": 0.22600330677889674, + "flos": 20594303176320.0, + "grad_norm": 2.226232476294752, + "language_loss": 0.77150333, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.79337054, + "num_input_tokens_seen": 80858745, + "step": 3759, + "time_per_iteration": 2.787487030029297 + }, + { + "auxiliary_loss_clip": 0.01104852, + "auxiliary_loss_mlp": 0.01043473, + "balance_loss_clip": 1.04747176, + "balance_loss_mlp": 1.02507949, + "epoch": 0.2260634300315647, + "flos": 35659798162560.0, + "grad_norm": 1.6253921855068183, + "language_loss": 0.78189945, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.80338269, + "num_input_tokens_seen": 80880085, + "step": 3760, + "time_per_iteration": 2.8760766983032227 + }, + { + "auxiliary_loss_clip": 0.01042849, + "auxiliary_loss_mlp": 0.01009599, + "balance_loss_clip": 1.03235281, + "balance_loss_mlp": 1.00633264, + "epoch": 0.22612355328423267, + "flos": 72090455126400.0, + "grad_norm": 0.9481639821873915, + "language_loss": 0.60083473, + "learning_rate": 3.609891846556569e-06, + "loss": 0.62135923, + "num_input_tokens_seen": 80937660, + "step": 3761, + "time_per_iteration": 3.2168753147125244 + }, + { + "auxiliary_loss_clip": 0.01114836, + "auxiliary_loss_mlp": 0.01051216, + "balance_loss_clip": 1.0493567, + "balance_loss_mlp": 1.03295338, + "epoch": 0.22618367653690064, + "flos": 22783776693120.0, + "grad_norm": 2.3328987294287047, + "language_loss": 0.76767397, + "learning_rate": 3.609660729655211e-06, + "loss": 0.78933448, + "num_input_tokens_seen": 80956265, + "step": 3762, + "time_per_iteration": 2.8012428283691406 + }, + { + "auxiliary_loss_clip": 0.01128732, + "auxiliary_loss_mlp": 0.01042327, + "balance_loss_clip": 1.05266595, + "balance_loss_mlp": 1.02190685, + "epoch": 0.22624379978956863, + "flos": 20448254476800.0, + "grad_norm": 2.7297545785195907, + "language_loss": 0.79000401, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.81171465, + "num_input_tokens_seen": 80975185, + "step": 3763, + "time_per_iteration": 2.7217857837677 + }, + { + "auxiliary_loss_clip": 0.01142679, + "auxiliary_loss_mlp": 0.01057425, + "balance_loss_clip": 1.0557214, + "balance_loss_mlp": 1.03835177, + "epoch": 0.2263039230422366, + "flos": 17494314779520.0, + "grad_norm": 31.68022075556768, + "language_loss": 0.91241246, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.93441343, + "num_input_tokens_seen": 80992830, + "step": 3764, + "time_per_iteration": 4.232046842575073 + }, + { + "auxiliary_loss_clip": 0.01131876, + "auxiliary_loss_mlp": 0.01055516, + "balance_loss_clip": 1.05196834, + "balance_loss_mlp": 1.0367409, + "epoch": 0.22636404629490456, + "flos": 28329748606080.0, + "grad_norm": 1.9816130101247444, + "language_loss": 0.75202596, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.77389991, + "num_input_tokens_seen": 81013675, + "step": 3765, + "time_per_iteration": 4.291628122329712 + }, + { + "auxiliary_loss_clip": 0.01140284, + "auxiliary_loss_mlp": 0.01047009, + "balance_loss_clip": 1.05632913, + "balance_loss_mlp": 1.02942574, + "epoch": 0.22642416954757252, + "flos": 17489143221120.0, + "grad_norm": 2.1881182413466176, + "language_loss": 0.8966549, + "learning_rate": 3.608735651752494e-06, + "loss": 0.91852784, + "num_input_tokens_seen": 81030345, + "step": 3766, + "time_per_iteration": 2.6462960243225098 + }, + { + "auxiliary_loss_clip": 0.01126107, + "auxiliary_loss_mlp": 0.01047462, + "balance_loss_clip": 1.05579042, + "balance_loss_mlp": 1.02950931, + "epoch": 0.2264842928002405, + "flos": 24384530298240.0, + "grad_norm": 1.6297384952566736, + "language_loss": 0.74816859, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.76990426, + "num_input_tokens_seen": 81051000, + "step": 3767, + "time_per_iteration": 4.181917667388916 + }, + { + "auxiliary_loss_clip": 0.01139766, + "auxiliary_loss_mlp": 0.01048037, + "balance_loss_clip": 1.05206823, + "balance_loss_mlp": 1.02981031, + "epoch": 0.22654441605290845, + "flos": 19830519354240.0, + "grad_norm": 1.6389844555489992, + "language_loss": 0.71764815, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.73952615, + "num_input_tokens_seen": 81071205, + "step": 3768, + "time_per_iteration": 2.6622893810272217 + }, + { + "auxiliary_loss_clip": 0.01143239, + "auxiliary_loss_mlp": 0.01057198, + "balance_loss_clip": 1.05766034, + "balance_loss_mlp": 1.03895879, + "epoch": 0.22660453930557642, + "flos": 27454569730560.0, + "grad_norm": 1.5883345705718652, + "language_loss": 0.78320074, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.80520505, + "num_input_tokens_seen": 81091880, + "step": 3769, + "time_per_iteration": 2.692366123199463 + }, + { + "auxiliary_loss_clip": 0.01121985, + "auxiliary_loss_mlp": 0.01045951, + "balance_loss_clip": 1.0452522, + "balance_loss_mlp": 1.02712774, + "epoch": 0.2266646625582444, + "flos": 23988148738560.0, + "grad_norm": 1.8427419299971495, + "language_loss": 0.6877771, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.70945644, + "num_input_tokens_seen": 81113290, + "step": 3770, + "time_per_iteration": 2.7605137825012207 + }, + { + "auxiliary_loss_clip": 0.01155061, + "auxiliary_loss_mlp": 0.01053072, + "balance_loss_clip": 1.0551908, + "balance_loss_mlp": 1.03454649, + "epoch": 0.22672478581091238, + "flos": 26028054023040.0, + "grad_norm": 1.6594447480271795, + "language_loss": 0.80540276, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.82748413, + "num_input_tokens_seen": 81133535, + "step": 3771, + "time_per_iteration": 4.265140771865845 + }, + { + "auxiliary_loss_clip": 0.0110854, + "auxiliary_loss_mlp": 0.01058177, + "balance_loss_clip": 1.04661536, + "balance_loss_mlp": 1.04091501, + "epoch": 0.22678490906358034, + "flos": 23841812730240.0, + "grad_norm": 1.6696234119475444, + "language_loss": 0.78947794, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.81114507, + "num_input_tokens_seen": 81154650, + "step": 3772, + "time_per_iteration": 2.7325806617736816 + }, + { + "auxiliary_loss_clip": 0.01036659, + "auxiliary_loss_mlp": 0.0100656, + "balance_loss_clip": 1.0461247, + "balance_loss_mlp": 1.00336492, + "epoch": 0.2268450323162483, + "flos": 65048088574080.0, + "grad_norm": 0.653194629863103, + "language_loss": 0.54380804, + "learning_rate": 3.607114417129261e-06, + "loss": 0.56424022, + "num_input_tokens_seen": 81221240, + "step": 3773, + "time_per_iteration": 3.3729567527770996 + }, + { + "auxiliary_loss_clip": 0.0111914, + "auxiliary_loss_mlp": 0.01046238, + "balance_loss_clip": 1.05257821, + "balance_loss_mlp": 1.02851129, + "epoch": 0.22690515556891627, + "flos": 22526081544960.0, + "grad_norm": 1.81548541557593, + "language_loss": 0.70406783, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.7257216, + "num_input_tokens_seen": 81241520, + "step": 3774, + "time_per_iteration": 2.7159364223480225 + }, + { + "auxiliary_loss_clip": 0.01125586, + "auxiliary_loss_mlp": 0.01046805, + "balance_loss_clip": 1.05404115, + "balance_loss_mlp": 1.02929282, + "epoch": 0.22696527882158424, + "flos": 18223444955520.0, + "grad_norm": 2.2603412716687523, + "language_loss": 0.74377871, + "learning_rate": 3.606650658627658e-06, + "loss": 0.76550257, + "num_input_tokens_seen": 81256825, + "step": 3775, + "time_per_iteration": 2.7857720851898193 + }, + { + "auxiliary_loss_clip": 0.01152024, + "auxiliary_loss_mlp": 0.01045868, + "balance_loss_clip": 1.05331159, + "balance_loss_mlp": 1.02915478, + "epoch": 0.22702540207425223, + "flos": 17019252478080.0, + "grad_norm": 1.8428958927362264, + "language_loss": 0.81582248, + "learning_rate": 3.606418687985928e-06, + "loss": 0.83780146, + "num_input_tokens_seen": 81275695, + "step": 3776, + "time_per_iteration": 2.6054935455322266 + }, + { + "auxiliary_loss_clip": 0.01135081, + "auxiliary_loss_mlp": 0.01043769, + "balance_loss_clip": 1.05466735, + "balance_loss_mlp": 1.02654314, + "epoch": 0.2270855253269202, + "flos": 21325731822720.0, + "grad_norm": 1.7711090356153572, + "language_loss": 0.82893199, + "learning_rate": 3.606186656428641e-06, + "loss": 0.85072052, + "num_input_tokens_seen": 81294920, + "step": 3777, + "time_per_iteration": 2.722621202468872 + }, + { + "auxiliary_loss_clip": 0.01127657, + "auxiliary_loss_mlp": 0.01042436, + "balance_loss_clip": 1.05438471, + "balance_loss_mlp": 1.02435195, + "epoch": 0.22714564857958816, + "flos": 23550469516800.0, + "grad_norm": 2.3905711679994295, + "language_loss": 0.72538829, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.74708927, + "num_input_tokens_seen": 81314275, + "step": 3778, + "time_per_iteration": 2.730919599533081 + }, + { + "auxiliary_loss_clip": 0.01112853, + "auxiliary_loss_mlp": 0.01040216, + "balance_loss_clip": 1.05304575, + "balance_loss_mlp": 1.02241838, + "epoch": 0.22720577183225613, + "flos": 25989880844160.0, + "grad_norm": 2.4150679449588535, + "language_loss": 0.64176035, + "learning_rate": 3.605722410602591e-06, + "loss": 0.66329098, + "num_input_tokens_seen": 81333890, + "step": 3779, + "time_per_iteration": 2.7663822174072266 + }, + { + "auxiliary_loss_clip": 0.01132359, + "auxiliary_loss_mlp": 0.01047274, + "balance_loss_clip": 1.05292106, + "balance_loss_mlp": 1.02928495, + "epoch": 0.2272658950849241, + "flos": 20814076540800.0, + "grad_norm": 1.6627524387617407, + "language_loss": 0.70659381, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.72839016, + "num_input_tokens_seen": 81353640, + "step": 3780, + "time_per_iteration": 2.666081666946411 + }, + { + "auxiliary_loss_clip": 0.0114157, + "auxiliary_loss_mlp": 0.01046965, + "balance_loss_clip": 1.05450416, + "balance_loss_mlp": 1.02880907, + "epoch": 0.22732601833759206, + "flos": 23909324342400.0, + "grad_norm": 1.783300050979337, + "language_loss": 0.89418924, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.91607457, + "num_input_tokens_seen": 81371595, + "step": 3781, + "time_per_iteration": 2.686478614807129 + }, + { + "auxiliary_loss_clip": 0.01152428, + "auxiliary_loss_mlp": 0.01041162, + "balance_loss_clip": 1.05349672, + "balance_loss_mlp": 1.02354264, + "epoch": 0.22738614159026002, + "flos": 15924407978880.0, + "grad_norm": 19.977426185094338, + "language_loss": 0.74404943, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.76598531, + "num_input_tokens_seen": 81388435, + "step": 3782, + "time_per_iteration": 2.5633177757263184 + }, + { + "auxiliary_loss_clip": 0.01129007, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_clip": 1.05195391, + "balance_loss_mlp": 1.02926588, + "epoch": 0.22744626484292801, + "flos": 24205515891840.0, + "grad_norm": 2.051662638457334, + "language_loss": 0.82665169, + "learning_rate": 3.604793188351095e-06, + "loss": 0.84839618, + "num_input_tokens_seen": 81410195, + "step": 3783, + "time_per_iteration": 2.742572069168091 + }, + { + "auxiliary_loss_clip": 0.01129724, + "auxiliary_loss_mlp": 0.01043254, + "balance_loss_clip": 1.055516, + "balance_loss_mlp": 1.02495527, + "epoch": 0.22750638809559598, + "flos": 24791614110720.0, + "grad_norm": 2.0126417567412256, + "language_loss": 0.75996566, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.78169543, + "num_input_tokens_seen": 81430060, + "step": 3784, + "time_per_iteration": 2.7283668518066406 + }, + { + "auxiliary_loss_clip": 0.01148666, + "auxiliary_loss_mlp": 0.01041397, + "balance_loss_clip": 1.05224681, + "balance_loss_mlp": 1.02382576, + "epoch": 0.22756651134826394, + "flos": 22236498097920.0, + "grad_norm": 1.784429661746796, + "language_loss": 0.7105484, + "learning_rate": 3.604328212066594e-06, + "loss": 0.73244894, + "num_input_tokens_seen": 81447375, + "step": 3785, + "time_per_iteration": 2.627401351928711 + }, + { + "auxiliary_loss_clip": 0.01042691, + "auxiliary_loss_mlp": 0.0101642, + "balance_loss_clip": 1.03303862, + "balance_loss_mlp": 1.01427412, + "epoch": 0.2276266346009319, + "flos": 62707466626560.0, + "grad_norm": 0.8323137639565091, + "language_loss": 0.6189881, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.63957924, + "num_input_tokens_seen": 81505235, + "step": 3786, + "time_per_iteration": 3.321380376815796 + }, + { + "auxiliary_loss_clip": 0.01135149, + "auxiliary_loss_mlp": 0.01044526, + "balance_loss_clip": 1.0540669, + "balance_loss_mlp": 1.02645397, + "epoch": 0.22768675785359987, + "flos": 18613936684800.0, + "grad_norm": 2.677223616893363, + "language_loss": 0.86047274, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.8822695, + "num_input_tokens_seen": 81518685, + "step": 3787, + "time_per_iteration": 2.72554349899292 + }, + { + "auxiliary_loss_clip": 0.01129718, + "auxiliary_loss_mlp": 0.01039908, + "balance_loss_clip": 1.05296564, + "balance_loss_mlp": 1.02323031, + "epoch": 0.22774688110626784, + "flos": 26870195364480.0, + "grad_norm": 1.361320938410825, + "language_loss": 0.72755021, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.74924648, + "num_input_tokens_seen": 81538940, + "step": 3788, + "time_per_iteration": 2.7717456817626953 + }, + { + "auxiliary_loss_clip": 0.01125411, + "auxiliary_loss_mlp": 0.01035437, + "balance_loss_clip": 1.05099773, + "balance_loss_mlp": 1.01800895, + "epoch": 0.2278070043589358, + "flos": 15553593924480.0, + "grad_norm": 2.510042380876752, + "language_loss": 0.67785919, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.69946766, + "num_input_tokens_seen": 81555525, + "step": 3789, + "time_per_iteration": 2.6492021083831787 + }, + { + "auxiliary_loss_clip": 0.01114067, + "auxiliary_loss_mlp": 0.01042939, + "balance_loss_clip": 1.04577208, + "balance_loss_mlp": 1.0244137, + "epoch": 0.2278671276116038, + "flos": 22416805393920.0, + "grad_norm": 2.807016388048184, + "language_loss": 0.76026487, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.7818349, + "num_input_tokens_seen": 81576305, + "step": 3790, + "time_per_iteration": 2.789419412612915 + }, + { + "auxiliary_loss_clip": 0.01094774, + "auxiliary_loss_mlp": 0.01043575, + "balance_loss_clip": 1.04942632, + "balance_loss_mlp": 1.02388144, + "epoch": 0.22792725086427176, + "flos": 20631363033600.0, + "grad_norm": 2.1998519418279843, + "language_loss": 0.9070015, + "learning_rate": 3.602931823424522e-06, + "loss": 0.92838502, + "num_input_tokens_seen": 81594115, + "step": 3791, + "time_per_iteration": 2.74957275390625 + }, + { + "auxiliary_loss_clip": 0.01143903, + "auxiliary_loss_mlp": 0.01039768, + "balance_loss_clip": 1.05332911, + "balance_loss_mlp": 1.02229166, + "epoch": 0.22798737411693973, + "flos": 31428946903680.0, + "grad_norm": 1.6288404079645773, + "language_loss": 0.82029706, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.84213376, + "num_input_tokens_seen": 81615355, + "step": 3792, + "time_per_iteration": 2.7578563690185547 + }, + { + "auxiliary_loss_clip": 0.01074793, + "auxiliary_loss_mlp": 0.01002047, + "balance_loss_clip": 1.03528738, + "balance_loss_mlp": 0.99944824, + "epoch": 0.2280474973696077, + "flos": 52396685827200.0, + "grad_norm": 1.1490057531785423, + "language_loss": 0.65688264, + "learning_rate": 3.602465874182981e-06, + "loss": 0.67765105, + "num_input_tokens_seen": 81662075, + "step": 3793, + "time_per_iteration": 2.892385959625244 + }, + { + "auxiliary_loss_clip": 0.01156846, + "auxiliary_loss_mlp": 0.01048751, + "balance_loss_clip": 1.05509233, + "balance_loss_mlp": 1.03063166, + "epoch": 0.22810762062227566, + "flos": 26396066816640.0, + "grad_norm": 2.315054268007893, + "language_loss": 0.77095032, + "learning_rate": 3.602232808409293e-06, + "loss": 0.79300624, + "num_input_tokens_seen": 81681625, + "step": 3794, + "time_per_iteration": 2.6432933807373047 + }, + { + "auxiliary_loss_clip": 0.01106797, + "auxiliary_loss_mlp": 0.0104554, + "balance_loss_clip": 1.04641223, + "balance_loss_mlp": 1.02560771, + "epoch": 0.22816774387494362, + "flos": 25630271832960.0, + "grad_norm": 2.8263872836139194, + "language_loss": 0.80649161, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.82801497, + "num_input_tokens_seen": 81701170, + "step": 3795, + "time_per_iteration": 2.748461961746216 + }, + { + "auxiliary_loss_clip": 0.01136851, + "auxiliary_loss_mlp": 0.01049098, + "balance_loss_clip": 1.0527277, + "balance_loss_mlp": 1.03194404, + "epoch": 0.22822786712761162, + "flos": 22451602694400.0, + "grad_norm": 1.970346796529307, + "language_loss": 0.77348727, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.79534674, + "num_input_tokens_seen": 81721265, + "step": 3796, + "time_per_iteration": 2.6720409393310547 + }, + { + "auxiliary_loss_clip": 0.01111647, + "auxiliary_loss_mlp": 0.0077572, + "balance_loss_clip": 1.04920197, + "balance_loss_mlp": 1.00161827, + "epoch": 0.22828799038027958, + "flos": 12202554395520.0, + "grad_norm": 3.9384070064251793, + "language_loss": 0.95837742, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.97725105, + "num_input_tokens_seen": 81736565, + "step": 3797, + "time_per_iteration": 2.730684995651245 + }, + { + "auxiliary_loss_clip": 0.01140956, + "auxiliary_loss_mlp": 0.00774906, + "balance_loss_clip": 1.05310869, + "balance_loss_mlp": 1.00178146, + "epoch": 0.22834811363294755, + "flos": 22085708803200.0, + "grad_norm": 2.215225796779507, + "language_loss": 0.81875294, + "learning_rate": 3.601299937834666e-06, + "loss": 0.83791155, + "num_input_tokens_seen": 81756240, + "step": 3798, + "time_per_iteration": 2.7082717418670654 + }, + { + "auxiliary_loss_clip": 0.01113838, + "auxiliary_loss_mlp": 0.01041342, + "balance_loss_clip": 1.04808974, + "balance_loss_mlp": 1.02263761, + "epoch": 0.2284082368856155, + "flos": 24860634094080.0, + "grad_norm": 2.1089113145856344, + "language_loss": 0.78796971, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.8095215, + "num_input_tokens_seen": 81775720, + "step": 3799, + "time_per_iteration": 2.7810587882995605 + }, + { + "auxiliary_loss_clip": 0.01121546, + "auxiliary_loss_mlp": 0.01055329, + "balance_loss_clip": 1.04926765, + "balance_loss_mlp": 1.03627968, + "epoch": 0.22846836013828348, + "flos": 23292882109440.0, + "grad_norm": 1.7973625036918341, + "language_loss": 0.75191152, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.77368033, + "num_input_tokens_seen": 81795830, + "step": 3800, + "time_per_iteration": 2.7185163497924805 + }, + { + "auxiliary_loss_clip": 0.01121477, + "auxiliary_loss_mlp": 0.01037963, + "balance_loss_clip": 1.04833913, + "balance_loss_mlp": 1.02235246, + "epoch": 0.22852848339095144, + "flos": 27416288810880.0, + "grad_norm": 1.7410667809724167, + "language_loss": 0.64073247, + "learning_rate": 3.600599647297484e-06, + "loss": 0.66232693, + "num_input_tokens_seen": 81815745, + "step": 3801, + "time_per_iteration": 2.7509078979492188 + }, + { + "auxiliary_loss_clip": 0.01129432, + "auxiliary_loss_mlp": 0.01038736, + "balance_loss_clip": 1.05498147, + "balance_loss_mlp": 1.02301216, + "epoch": 0.2285886066436194, + "flos": 26321157002880.0, + "grad_norm": 1.6732672610702524, + "language_loss": 0.81560862, + "learning_rate": 3.60036609571682e-06, + "loss": 0.83729029, + "num_input_tokens_seen": 81835155, + "step": 3802, + "time_per_iteration": 2.7188339233398438 + }, + { + "auxiliary_loss_clip": 0.01126952, + "auxiliary_loss_mlp": 0.0105215, + "balance_loss_clip": 1.05203629, + "balance_loss_mlp": 1.0342809, + "epoch": 0.2286487298962874, + "flos": 29716475022720.0, + "grad_norm": 2.0652844737971625, + "language_loss": 0.78909743, + "learning_rate": 3.600132483450114e-06, + "loss": 0.81088841, + "num_input_tokens_seen": 81855655, + "step": 3803, + "time_per_iteration": 2.7760777473449707 + }, + { + "auxiliary_loss_clip": 0.01109356, + "auxiliary_loss_mlp": 0.01043096, + "balance_loss_clip": 1.04399478, + "balance_loss_mlp": 1.02511966, + "epoch": 0.22870885314895537, + "flos": 21287199507840.0, + "grad_norm": 1.7519930287683254, + "language_loss": 0.84902716, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.87055165, + "num_input_tokens_seen": 81876385, + "step": 3804, + "time_per_iteration": 5.891911745071411 + }, + { + "auxiliary_loss_clip": 0.01141965, + "auxiliary_loss_mlp": 0.01040951, + "balance_loss_clip": 1.05229163, + "balance_loss_mlp": 1.02440476, + "epoch": 0.22876897640162333, + "flos": 14939450161920.0, + "grad_norm": 2.045415026345325, + "language_loss": 0.76673448, + "learning_rate": 3.59966507689401e-06, + "loss": 0.78856367, + "num_input_tokens_seen": 81893225, + "step": 3805, + "time_per_iteration": 2.643104076385498 + }, + { + "auxiliary_loss_clip": 0.0112853, + "auxiliary_loss_mlp": 0.00775286, + "balance_loss_clip": 1.05192351, + "balance_loss_mlp": 1.00156116, + "epoch": 0.2288290996542913, + "flos": 18113917409280.0, + "grad_norm": 2.368547935700865, + "language_loss": 0.78250653, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.80154467, + "num_input_tokens_seen": 81911350, + "step": 3806, + "time_per_iteration": 4.312817335128784 + }, + { + "auxiliary_loss_clip": 0.01123441, + "auxiliary_loss_mlp": 0.01052484, + "balance_loss_clip": 1.05244482, + "balance_loss_mlp": 1.03282619, + "epoch": 0.22888922290695926, + "flos": 39855457071360.0, + "grad_norm": 2.0706298183861, + "language_loss": 0.700813, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.72257227, + "num_input_tokens_seen": 81935420, + "step": 3807, + "time_per_iteration": 2.8060836791992188 + }, + { + "auxiliary_loss_clip": 0.01143724, + "auxiliary_loss_mlp": 0.01057417, + "balance_loss_clip": 1.0545013, + "balance_loss_mlp": 1.03891551, + "epoch": 0.22894934615962723, + "flos": 23403774372480.0, + "grad_norm": 4.007429648995762, + "language_loss": 0.6543591, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.6763705, + "num_input_tokens_seen": 81953845, + "step": 3808, + "time_per_iteration": 2.703885078430176 + }, + { + "auxiliary_loss_clip": 0.0109921, + "auxiliary_loss_mlp": 0.01061828, + "balance_loss_clip": 1.04773676, + "balance_loss_mlp": 1.04295671, + "epoch": 0.22900946941229522, + "flos": 18843011671680.0, + "grad_norm": 2.028069656557901, + "language_loss": 0.74749511, + "learning_rate": 3.598729535939222e-06, + "loss": 0.76910543, + "num_input_tokens_seen": 81972100, + "step": 3809, + "time_per_iteration": 2.726862907409668 + }, + { + "auxiliary_loss_clip": 0.01128097, + "auxiliary_loss_mlp": 0.01053112, + "balance_loss_clip": 1.0527637, + "balance_loss_mlp": 1.03666139, + "epoch": 0.22906959266496318, + "flos": 22929394429440.0, + "grad_norm": 1.6287389468918274, + "language_loss": 0.81654954, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.83836162, + "num_input_tokens_seen": 81992760, + "step": 3810, + "time_per_iteration": 2.6750009059906006 + }, + { + "auxiliary_loss_clip": 0.01132496, + "auxiliary_loss_mlp": 0.01040979, + "balance_loss_clip": 1.05216146, + "balance_loss_mlp": 1.0247184, + "epoch": 0.22912971591763115, + "flos": 19354523299200.0, + "grad_norm": 2.375204791625097, + "language_loss": 0.78126299, + "learning_rate": 3.598261401682441e-06, + "loss": 0.80299771, + "num_input_tokens_seen": 82009080, + "step": 3811, + "time_per_iteration": 4.302153587341309 + }, + { + "auxiliary_loss_clip": 0.01130856, + "auxiliary_loss_mlp": 0.00775213, + "balance_loss_clip": 1.05357778, + "balance_loss_mlp": 1.00159776, + "epoch": 0.22918983917029911, + "flos": 19933546538880.0, + "grad_norm": 1.797699433224321, + "language_loss": 0.82817954, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.84724021, + "num_input_tokens_seen": 82026705, + "step": 3812, + "time_per_iteration": 2.635796308517456 + }, + { + "auxiliary_loss_clip": 0.01089198, + "auxiliary_loss_mlp": 0.01067747, + "balance_loss_clip": 1.04705882, + "balance_loss_mlp": 1.0480535, + "epoch": 0.22924996242296708, + "flos": 16690885320960.0, + "grad_norm": 3.3357789636694952, + "language_loss": 0.82689399, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.84846342, + "num_input_tokens_seen": 82043245, + "step": 3813, + "time_per_iteration": 2.7896463871002197 + }, + { + "auxiliary_loss_clip": 0.01135441, + "auxiliary_loss_mlp": 0.01044219, + "balance_loss_clip": 1.05230987, + "balance_loss_mlp": 1.02743411, + "epoch": 0.22931008567563504, + "flos": 33036164956800.0, + "grad_norm": 1.5779710642832598, + "language_loss": 0.70018709, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.72198373, + "num_input_tokens_seen": 82066870, + "step": 3814, + "time_per_iteration": 2.759460687637329 + }, + { + "auxiliary_loss_clip": 0.01141204, + "auxiliary_loss_mlp": 0.01046745, + "balance_loss_clip": 1.05307984, + "balance_loss_mlp": 1.02947164, + "epoch": 0.229370208928303, + "flos": 23330696152320.0, + "grad_norm": 2.3195881009003174, + "language_loss": 0.66811371, + "learning_rate": 3.597324405965139e-06, + "loss": 0.6899932, + "num_input_tokens_seen": 82083180, + "step": 3815, + "time_per_iteration": 2.6878743171691895 + }, + { + "auxiliary_loss_clip": 0.01142177, + "auxiliary_loss_mlp": 0.01045942, + "balance_loss_clip": 1.05412412, + "balance_loss_mlp": 1.02921689, + "epoch": 0.229430332180971, + "flos": 28617213150720.0, + "grad_norm": 2.436037188170917, + "language_loss": 0.83555114, + "learning_rate": 3.597090005586848e-06, + "loss": 0.85743231, + "num_input_tokens_seen": 82102950, + "step": 3816, + "time_per_iteration": 2.702638626098633 + }, + { + "auxiliary_loss_clip": 0.01142001, + "auxiliary_loss_mlp": 0.01037145, + "balance_loss_clip": 1.05649173, + "balance_loss_mlp": 1.01952624, + "epoch": 0.22949045543363897, + "flos": 17238199829760.0, + "grad_norm": 2.261586370580253, + "language_loss": 0.8657164, + "learning_rate": 3.596855544646742e-06, + "loss": 0.88750786, + "num_input_tokens_seen": 82119510, + "step": 3817, + "time_per_iteration": 2.6439061164855957 + }, + { + "auxiliary_loss_clip": 0.01125222, + "auxiliary_loss_mlp": 0.01048919, + "balance_loss_clip": 1.0493896, + "balance_loss_mlp": 1.03166902, + "epoch": 0.22955057868630693, + "flos": 27489438858240.0, + "grad_norm": 3.8274774650765706, + "language_loss": 0.74976468, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.77150607, + "num_input_tokens_seen": 82140095, + "step": 3818, + "time_per_iteration": 2.7610766887664795 + }, + { + "auxiliary_loss_clip": 0.01146421, + "auxiliary_loss_mlp": 0.01043004, + "balance_loss_clip": 1.05866313, + "balance_loss_mlp": 1.02550387, + "epoch": 0.2296107019389749, + "flos": 23476421629440.0, + "grad_norm": 1.7490504114150227, + "language_loss": 0.74682397, + "learning_rate": 3.596386441116659e-06, + "loss": 0.76871818, + "num_input_tokens_seen": 82159510, + "step": 3819, + "time_per_iteration": 2.7125203609466553 + }, + { + "auxiliary_loss_clip": 0.0114108, + "auxiliary_loss_mlp": 0.0104377, + "balance_loss_clip": 1.05479693, + "balance_loss_mlp": 1.02630615, + "epoch": 0.22967082519164286, + "flos": 31285160760960.0, + "grad_norm": 2.0230347194773732, + "language_loss": 0.81103987, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.83288836, + "num_input_tokens_seen": 82179580, + "step": 3820, + "time_per_iteration": 2.7268714904785156 + }, + { + "auxiliary_loss_clip": 0.01129285, + "auxiliary_loss_mlp": 0.01044606, + "balance_loss_clip": 1.05326903, + "balance_loss_mlp": 1.02627158, + "epoch": 0.22973094844431083, + "flos": 14642935390080.0, + "grad_norm": 2.2801321869619153, + "language_loss": 0.69099033, + "learning_rate": 3.595917095446042e-06, + "loss": 0.71272922, + "num_input_tokens_seen": 82195585, + "step": 3821, + "time_per_iteration": 2.659498691558838 + }, + { + "auxiliary_loss_clip": 0.01098739, + "auxiliary_loss_mlp": 0.01036962, + "balance_loss_clip": 1.05118072, + "balance_loss_mlp": 1.01888967, + "epoch": 0.2297910716969788, + "flos": 22823853292800.0, + "grad_norm": 1.473505926288008, + "language_loss": 0.82876307, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.85012007, + "num_input_tokens_seen": 82217530, + "step": 3822, + "time_per_iteration": 2.898287057876587 + }, + { + "auxiliary_loss_clip": 0.01149833, + "auxiliary_loss_mlp": 0.01044764, + "balance_loss_clip": 1.05239797, + "balance_loss_mlp": 1.02617884, + "epoch": 0.2298511949496468, + "flos": 23039029716480.0, + "grad_norm": 2.077495396622281, + "language_loss": 0.66552204, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.68746805, + "num_input_tokens_seen": 82237980, + "step": 3823, + "time_per_iteration": 2.6397016048431396 + }, + { + "auxiliary_loss_clip": 0.01064018, + "auxiliary_loss_mlp": 0.01005373, + "balance_loss_clip": 1.04052305, + "balance_loss_mlp": 1.00196409, + "epoch": 0.22991131820231475, + "flos": 66890914911360.0, + "grad_norm": 0.8015900374762405, + "language_loss": 0.56731141, + "learning_rate": 3.595212623082357e-06, + "loss": 0.5880053, + "num_input_tokens_seen": 82301785, + "step": 3824, + "time_per_iteration": 3.2301526069641113 + }, + { + "auxiliary_loss_clip": 0.01123513, + "auxiliary_loss_mlp": 0.01037782, + "balance_loss_clip": 1.0506382, + "balance_loss_mlp": 1.02098525, + "epoch": 0.22997144145498272, + "flos": 17887248633600.0, + "grad_norm": 2.0770938093466995, + "language_loss": 0.7301755, + "learning_rate": 3.594977677968009e-06, + "loss": 0.7517885, + "num_input_tokens_seen": 82317355, + "step": 3825, + "time_per_iteration": 2.6161818504333496 + }, + { + "auxiliary_loss_clip": 0.01147516, + "auxiliary_loss_mlp": 0.01049665, + "balance_loss_clip": 1.05828226, + "balance_loss_mlp": 1.03119957, + "epoch": 0.23003156470765068, + "flos": 24676843178880.0, + "grad_norm": 1.8689845885894332, + "language_loss": 0.87652314, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.89849496, + "num_input_tokens_seen": 82336645, + "step": 3826, + "time_per_iteration": 2.668858766555786 + }, + { + "auxiliary_loss_clip": 0.01134406, + "auxiliary_loss_mlp": 0.01045536, + "balance_loss_clip": 1.05722022, + "balance_loss_mlp": 1.02697468, + "epoch": 0.23009168796031865, + "flos": 15814126247040.0, + "grad_norm": 2.4660324215504312, + "language_loss": 0.81861693, + "learning_rate": 3.594507606303083e-06, + "loss": 0.84041631, + "num_input_tokens_seen": 82354225, + "step": 3827, + "time_per_iteration": 2.67173171043396 + }, + { + "auxiliary_loss_clip": 0.01083629, + "auxiliary_loss_mlp": 0.01046658, + "balance_loss_clip": 1.04976189, + "balance_loss_mlp": 1.02728689, + "epoch": 0.2301518112129866, + "flos": 16212842190720.0, + "grad_norm": 1.9417227311694012, + "language_loss": 0.86676306, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.88806593, + "num_input_tokens_seen": 82370240, + "step": 3828, + "time_per_iteration": 2.7641990184783936 + }, + { + "auxiliary_loss_clip": 0.01126786, + "auxiliary_loss_mlp": 0.01048261, + "balance_loss_clip": 1.05381465, + "balance_loss_mlp": 1.02981901, + "epoch": 0.2302119344656546, + "flos": 20595452411520.0, + "grad_norm": 2.6386744924703223, + "language_loss": 0.7044189, + "learning_rate": 3.594037292782607e-06, + "loss": 0.72616941, + "num_input_tokens_seen": 82389145, + "step": 3829, + "time_per_iteration": 2.6674952507019043 + }, + { + "auxiliary_loss_clip": 0.01085573, + "auxiliary_loss_mlp": 0.01045126, + "balance_loss_clip": 1.04650855, + "balance_loss_mlp": 1.02835345, + "epoch": 0.23027205771832257, + "flos": 26796901662720.0, + "grad_norm": 1.6431866637768902, + "language_loss": 0.84075069, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.86205769, + "num_input_tokens_seen": 82409185, + "step": 3830, + "time_per_iteration": 2.8631880283355713 + }, + { + "auxiliary_loss_clip": 0.01132962, + "auxiliary_loss_mlp": 0.01052116, + "balance_loss_clip": 1.0506047, + "balance_loss_mlp": 1.03415167, + "epoch": 0.23033218097099054, + "flos": 43873143068160.0, + "grad_norm": 2.3429509345019213, + "language_loss": 0.67036134, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.6922121, + "num_input_tokens_seen": 82432070, + "step": 3831, + "time_per_iteration": 2.91282320022583 + }, + { + "auxiliary_loss_clip": 0.0111204, + "auxiliary_loss_mlp": 0.01053367, + "balance_loss_clip": 1.05277622, + "balance_loss_mlp": 1.03496158, + "epoch": 0.2303923042236585, + "flos": 26067663745920.0, + "grad_norm": 2.3469890931023194, + "language_loss": 0.75711727, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.7787714, + "num_input_tokens_seen": 82450625, + "step": 3832, + "time_per_iteration": 2.759467601776123 + }, + { + "auxiliary_loss_clip": 0.0110298, + "auxiliary_loss_mlp": 0.01044565, + "balance_loss_clip": 1.05044174, + "balance_loss_mlp": 1.02596867, + "epoch": 0.23045242747632647, + "flos": 18296379521280.0, + "grad_norm": 1.7769817461106177, + "language_loss": 0.87558299, + "learning_rate": 3.593095940460389e-06, + "loss": 0.89705843, + "num_input_tokens_seen": 82468575, + "step": 3833, + "time_per_iteration": 2.8548035621643066 + }, + { + "auxiliary_loss_clip": 0.01116173, + "auxiliary_loss_mlp": 0.01046082, + "balance_loss_clip": 1.05032015, + "balance_loss_mlp": 1.02814126, + "epoch": 0.23051255072899443, + "flos": 25520528805120.0, + "grad_norm": 2.030934473686878, + "language_loss": 0.74736786, + "learning_rate": 3.592860451331624e-06, + "loss": 0.7689904, + "num_input_tokens_seen": 82488655, + "step": 3834, + "time_per_iteration": 2.719237804412842 + }, + { + "auxiliary_loss_clip": 0.01104525, + "auxiliary_loss_mlp": 0.01064338, + "balance_loss_clip": 1.04610491, + "balance_loss_mlp": 1.043679, + "epoch": 0.2305726739816624, + "flos": 21215198695680.0, + "grad_norm": 1.9050082770497696, + "language_loss": 0.86071098, + "learning_rate": 3.592624901801432e-06, + "loss": 0.88239956, + "num_input_tokens_seen": 82507220, + "step": 3835, + "time_per_iteration": 2.627782106399536 + }, + { + "auxiliary_loss_clip": 0.01115977, + "auxiliary_loss_mlp": 0.01060727, + "balance_loss_clip": 1.04934275, + "balance_loss_mlp": 1.03979373, + "epoch": 0.2306327972343304, + "flos": 23331127115520.0, + "grad_norm": 2.798777841757382, + "language_loss": 0.82434011, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.84610713, + "num_input_tokens_seen": 82527920, + "step": 3836, + "time_per_iteration": 2.6091606616973877 + }, + { + "auxiliary_loss_clip": 0.01144536, + "auxiliary_loss_mlp": 0.0105466, + "balance_loss_clip": 1.06090033, + "balance_loss_mlp": 1.03683817, + "epoch": 0.23069292048699835, + "flos": 20666734951680.0, + "grad_norm": 1.7189193248017045, + "language_loss": 0.79633009, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.81832206, + "num_input_tokens_seen": 82549040, + "step": 3837, + "time_per_iteration": 2.535435914993286 + }, + { + "auxiliary_loss_clip": 0.01057695, + "auxiliary_loss_mlp": 0.01033541, + "balance_loss_clip": 1.04840386, + "balance_loss_mlp": 1.03003633, + "epoch": 0.23075304373966632, + "flos": 70454832393600.0, + "grad_norm": 0.9031703200773207, + "language_loss": 0.65381849, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67473078, + "num_input_tokens_seen": 82604070, + "step": 3838, + "time_per_iteration": 3.0868518352508545 + }, + { + "auxiliary_loss_clip": 0.01138177, + "auxiliary_loss_mlp": 0.01056497, + "balance_loss_clip": 1.05361629, + "balance_loss_mlp": 1.0395453, + "epoch": 0.23081316699233428, + "flos": 16617986668800.0, + "grad_norm": 2.5143705705619097, + "language_loss": 0.75403488, + "learning_rate": 3.591682099845058e-06, + "loss": 0.77598161, + "num_input_tokens_seen": 82619665, + "step": 3839, + "time_per_iteration": 2.6391067504882812 + }, + { + "auxiliary_loss_clip": 0.01125705, + "auxiliary_loss_mlp": 0.01046933, + "balance_loss_clip": 1.05447173, + "balance_loss_mlp": 1.02882481, + "epoch": 0.23087329024500225, + "flos": 13298081253120.0, + "grad_norm": 1.8684605740856612, + "language_loss": 0.68962026, + "learning_rate": 3.591446248441752e-06, + "loss": 0.71134663, + "num_input_tokens_seen": 82637530, + "step": 3840, + "time_per_iteration": 2.6295006275177 + }, + { + "auxiliary_loss_clip": 0.01158019, + "auxiliary_loss_mlp": 0.01046048, + "balance_loss_clip": 1.05840647, + "balance_loss_mlp": 1.026057, + "epoch": 0.23093341349767021, + "flos": 17785729820160.0, + "grad_norm": 2.5615469809997697, + "language_loss": 0.80033958, + "learning_rate": 3.591210336690645e-06, + "loss": 0.8223803, + "num_input_tokens_seen": 82656130, + "step": 3841, + "time_per_iteration": 2.6512410640716553 + }, + { + "auxiliary_loss_clip": 0.01145317, + "auxiliary_loss_mlp": 0.01047066, + "balance_loss_clip": 1.05756617, + "balance_loss_mlp": 1.0301621, + "epoch": 0.23099353675033818, + "flos": 23988076911360.0, + "grad_norm": 1.7953422744525294, + "language_loss": 0.83389241, + "learning_rate": 3.590974364600683e-06, + "loss": 0.85581625, + "num_input_tokens_seen": 82675295, + "step": 3842, + "time_per_iteration": 2.7676117420196533 + }, + { + "auxiliary_loss_clip": 0.01144752, + "auxiliary_loss_mlp": 0.01044783, + "balance_loss_clip": 1.05491304, + "balance_loss_mlp": 1.02650845, + "epoch": 0.23105366000300617, + "flos": 35995168471680.0, + "grad_norm": 1.8421697704365976, + "language_loss": 0.66661239, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.68850774, + "num_input_tokens_seen": 82703260, + "step": 3843, + "time_per_iteration": 5.82958722114563 + }, + { + "auxiliary_loss_clip": 0.01142299, + "auxiliary_loss_mlp": 0.01047166, + "balance_loss_clip": 1.05609, + "balance_loss_mlp": 1.02914143, + "epoch": 0.23111378325567414, + "flos": 31245335556480.0, + "grad_norm": 1.8996188882256444, + "language_loss": 0.77221334, + "learning_rate": 3.590502239439987e-06, + "loss": 0.79410803, + "num_input_tokens_seen": 82725060, + "step": 3844, + "time_per_iteration": 2.771226406097412 + }, + { + "auxiliary_loss_clip": 0.01141796, + "auxiliary_loss_mlp": 0.01045598, + "balance_loss_clip": 1.05503309, + "balance_loss_mlp": 1.02607179, + "epoch": 0.2311739065083421, + "flos": 19208223204480.0, + "grad_norm": 1.9651801579729304, + "language_loss": 0.78155982, + "learning_rate": 3.590266086387156e-06, + "loss": 0.80343372, + "num_input_tokens_seen": 82742960, + "step": 3845, + "time_per_iteration": 4.247429370880127 + }, + { + "auxiliary_loss_clip": 0.01117167, + "auxiliary_loss_mlp": 0.01039426, + "balance_loss_clip": 1.05274439, + "balance_loss_mlp": 1.02292788, + "epoch": 0.23123402976101007, + "flos": 23360178240000.0, + "grad_norm": 2.083958857623256, + "language_loss": 0.76397669, + "learning_rate": 3.590029873031276e-06, + "loss": 0.78554261, + "num_input_tokens_seen": 82760205, + "step": 3846, + "time_per_iteration": 2.7805917263031006 + }, + { + "auxiliary_loss_clip": 0.01131462, + "auxiliary_loss_mlp": 0.01049247, + "balance_loss_clip": 1.05376291, + "balance_loss_mlp": 1.03193808, + "epoch": 0.23129415301367803, + "flos": 13735365425280.0, + "grad_norm": 1.8827740097117207, + "language_loss": 0.70281041, + "learning_rate": 3.589793599381304e-06, + "loss": 0.72461748, + "num_input_tokens_seen": 82778590, + "step": 3847, + "time_per_iteration": 2.6848642826080322 + }, + { + "auxiliary_loss_clip": 0.01065475, + "auxiliary_loss_mlp": 0.01006045, + "balance_loss_clip": 1.04309821, + "balance_loss_mlp": 1.00356507, + "epoch": 0.231354276266346, + "flos": 69737015001600.0, + "grad_norm": 0.7955227467680892, + "language_loss": 0.61006129, + "learning_rate": 3.589557265446198e-06, + "loss": 0.63077646, + "num_input_tokens_seen": 82833925, + "step": 3848, + "time_per_iteration": 3.08832049369812 + }, + { + "auxiliary_loss_clip": 0.01142916, + "auxiliary_loss_mlp": 0.01044943, + "balance_loss_clip": 1.05631924, + "balance_loss_mlp": 1.02640557, + "epoch": 0.231414399519014, + "flos": 18835900778880.0, + "grad_norm": 1.9602331138800266, + "language_loss": 0.78082883, + "learning_rate": 3.589320871234923e-06, + "loss": 0.80270743, + "num_input_tokens_seen": 82850625, + "step": 3849, + "time_per_iteration": 2.6830787658691406 + }, + { + "auxiliary_loss_clip": 0.01137959, + "auxiliary_loss_mlp": 0.01044864, + "balance_loss_clip": 1.05184579, + "balance_loss_mlp": 1.02630353, + "epoch": 0.23147452277168196, + "flos": 36135470995200.0, + "grad_norm": 2.354271482082729, + "language_loss": 0.71243513, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.7342633, + "num_input_tokens_seen": 82872105, + "step": 3850, + "time_per_iteration": 4.467762231826782 + }, + { + "auxiliary_loss_clip": 0.01121609, + "auxiliary_loss_mlp": 0.00776401, + "balance_loss_clip": 1.05099773, + "balance_loss_mlp": 1.00153255, + "epoch": 0.23153464602434992, + "flos": 20812927305600.0, + "grad_norm": 4.184777043510671, + "language_loss": 0.76577097, + "learning_rate": 3.588847902019718e-06, + "loss": 0.78475106, + "num_input_tokens_seen": 82890595, + "step": 3851, + "time_per_iteration": 2.7452898025512695 + }, + { + "auxiliary_loss_clip": 0.01152703, + "auxiliary_loss_mlp": 0.01038649, + "balance_loss_clip": 1.05650854, + "balance_loss_mlp": 1.0206244, + "epoch": 0.2315947692770179, + "flos": 19939256801280.0, + "grad_norm": 2.0528428588063914, + "language_loss": 0.69642782, + "learning_rate": 3.588611327033723e-06, + "loss": 0.71834141, + "num_input_tokens_seen": 82908910, + "step": 3852, + "time_per_iteration": 2.613687038421631 + }, + { + "auxiliary_loss_clip": 0.0110964, + "auxiliary_loss_mlp": 0.01050002, + "balance_loss_clip": 1.05097961, + "balance_loss_mlp": 1.0303328, + "epoch": 0.23165489252968585, + "flos": 12855553695360.0, + "grad_norm": 2.8596642791724993, + "language_loss": 0.67063856, + "learning_rate": 3.588374691807428e-06, + "loss": 0.69223493, + "num_input_tokens_seen": 82925405, + "step": 3853, + "time_per_iteration": 2.6974282264709473 + }, + { + "auxiliary_loss_clip": 0.01146149, + "auxiliary_loss_mlp": 0.01041525, + "balance_loss_clip": 1.05749798, + "balance_loss_mlp": 1.02340484, + "epoch": 0.23171501578235382, + "flos": 30628282792320.0, + "grad_norm": 1.7603397459637538, + "language_loss": 0.80139267, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.82326943, + "num_input_tokens_seen": 82945615, + "step": 3854, + "time_per_iteration": 2.712125062942505 + }, + { + "auxiliary_loss_clip": 0.01115767, + "auxiliary_loss_mlp": 0.01052387, + "balance_loss_clip": 1.04737794, + "balance_loss_mlp": 1.03070331, + "epoch": 0.23177513903502178, + "flos": 23842782397440.0, + "grad_norm": 1.9709775740629982, + "language_loss": 0.65103847, + "learning_rate": 3.587901240669831e-06, + "loss": 0.67272007, + "num_input_tokens_seen": 82967570, + "step": 3855, + "time_per_iteration": 2.718756675720215 + }, + { + "auxiliary_loss_clip": 0.01153506, + "auxiliary_loss_mlp": 0.01048508, + "balance_loss_clip": 1.05417824, + "balance_loss_mlp": 1.03050709, + "epoch": 0.23183526228768978, + "flos": 29570282668800.0, + "grad_norm": 1.7803112411977504, + "language_loss": 0.70386064, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.7258808, + "num_input_tokens_seen": 82987435, + "step": 3856, + "time_per_iteration": 2.798675060272217 + }, + { + "auxiliary_loss_clip": 0.01103018, + "auxiliary_loss_mlp": 0.01035927, + "balance_loss_clip": 1.05080032, + "balance_loss_mlp": 1.0200007, + "epoch": 0.23189538554035774, + "flos": 34458694254720.0, + "grad_norm": 1.7837780829213195, + "language_loss": 0.77101243, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.79240191, + "num_input_tokens_seen": 83010505, + "step": 3857, + "time_per_iteration": 2.8545501232147217 + }, + { + "auxiliary_loss_clip": 0.01136868, + "auxiliary_loss_mlp": 0.00777317, + "balance_loss_clip": 1.0528996, + "balance_loss_mlp": 1.00133562, + "epoch": 0.2319555087930257, + "flos": 18003815245440.0, + "grad_norm": 2.445609387195472, + "language_loss": 0.91629225, + "learning_rate": 3.587190612385584e-06, + "loss": 0.9354341, + "num_input_tokens_seen": 83026705, + "step": 3858, + "time_per_iteration": 2.7018845081329346 + }, + { + "auxiliary_loss_clip": 0.01095626, + "auxiliary_loss_mlp": 0.01043975, + "balance_loss_clip": 1.04882586, + "balance_loss_mlp": 1.0263319, + "epoch": 0.23201563204569367, + "flos": 23143852581120.0, + "grad_norm": 1.987074492721614, + "language_loss": 0.76833785, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.78973383, + "num_input_tokens_seen": 83046500, + "step": 3859, + "time_per_iteration": 2.7465155124664307 + }, + { + "auxiliary_loss_clip": 0.01136816, + "auxiliary_loss_mlp": 0.01041128, + "balance_loss_clip": 1.05060959, + "balance_loss_mlp": 1.02316284, + "epoch": 0.23207575529836164, + "flos": 20667991927680.0, + "grad_norm": 1.7166447387893018, + "language_loss": 0.84341264, + "learning_rate": 3.58671655924898e-06, + "loss": 0.86519206, + "num_input_tokens_seen": 83065280, + "step": 3860, + "time_per_iteration": 2.6602063179016113 + }, + { + "auxiliary_loss_clip": 0.01091436, + "auxiliary_loss_mlp": 0.01044571, + "balance_loss_clip": 1.04641938, + "balance_loss_mlp": 1.02640343, + "epoch": 0.2321358785510296, + "flos": 16472189364480.0, + "grad_norm": 2.014536853896284, + "language_loss": 0.83431923, + "learning_rate": 3.586479442423508e-06, + "loss": 0.85567933, + "num_input_tokens_seen": 83082310, + "step": 3861, + "time_per_iteration": 2.728750228881836 + }, + { + "auxiliary_loss_clip": 0.01130655, + "auxiliary_loss_mlp": 0.00776368, + "balance_loss_clip": 1.05122983, + "balance_loss_mlp": 1.00149858, + "epoch": 0.2321960018036976, + "flos": 21616320850560.0, + "grad_norm": 1.8874922149770945, + "language_loss": 0.85921204, + "learning_rate": 3.586242265438576e-06, + "loss": 0.87828225, + "num_input_tokens_seen": 83102065, + "step": 3862, + "time_per_iteration": 2.7289161682128906 + }, + { + "auxiliary_loss_clip": 0.01112788, + "auxiliary_loss_mlp": 0.0104236, + "balance_loss_clip": 1.04956031, + "balance_loss_mlp": 1.02645802, + "epoch": 0.23225612505636556, + "flos": 22271474966400.0, + "grad_norm": 1.4078274786009342, + "language_loss": 0.75131166, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.77286315, + "num_input_tokens_seen": 83121445, + "step": 3863, + "time_per_iteration": 2.7308037281036377 + }, + { + "auxiliary_loss_clip": 0.01109911, + "auxiliary_loss_mlp": 0.0104503, + "balance_loss_clip": 1.05320251, + "balance_loss_mlp": 1.02840066, + "epoch": 0.23231624830903352, + "flos": 17052325925760.0, + "grad_norm": 1.8195520841096788, + "language_loss": 0.74952984, + "learning_rate": 3.58576773102631e-06, + "loss": 0.77107918, + "num_input_tokens_seen": 83138175, + "step": 3864, + "time_per_iteration": 2.669403314590454 + }, + { + "auxiliary_loss_clip": 0.01148697, + "auxiliary_loss_mlp": 0.01038596, + "balance_loss_clip": 1.05258274, + "balance_loss_mlp": 1.02182317, + "epoch": 0.2323763715617015, + "flos": 34640043045120.0, + "grad_norm": 1.757817857347048, + "language_loss": 0.70438093, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.72625393, + "num_input_tokens_seen": 83161975, + "step": 3865, + "time_per_iteration": 2.766399621963501 + }, + { + "auxiliary_loss_clip": 0.01156124, + "auxiliary_loss_mlp": 0.01048904, + "balance_loss_clip": 1.05352104, + "balance_loss_mlp": 1.02978325, + "epoch": 0.23243649481436945, + "flos": 25551698832000.0, + "grad_norm": 1.8965816841290546, + "language_loss": 0.94702542, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.96907574, + "num_input_tokens_seen": 83180905, + "step": 3866, + "time_per_iteration": 2.659867525100708 + }, + { + "auxiliary_loss_clip": 0.01131283, + "auxiliary_loss_mlp": 0.01044032, + "balance_loss_clip": 1.04904807, + "balance_loss_mlp": 1.02683008, + "epoch": 0.23249661806703742, + "flos": 20483482740480.0, + "grad_norm": 4.181849364953483, + "language_loss": 0.73026884, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.75202191, + "num_input_tokens_seen": 83196390, + "step": 3867, + "time_per_iteration": 2.645481586456299 + }, + { + "auxiliary_loss_clip": 0.0112954, + "auxiliary_loss_mlp": 0.01046355, + "balance_loss_clip": 1.05079126, + "balance_loss_mlp": 1.02855754, + "epoch": 0.23255674131970538, + "flos": 20376612800640.0, + "grad_norm": 1.9671041323983256, + "language_loss": 0.82770872, + "learning_rate": 3.584817940684145e-06, + "loss": 0.84946775, + "num_input_tokens_seen": 83216165, + "step": 3868, + "time_per_iteration": 2.7670326232910156 + }, + { + "auxiliary_loss_clip": 0.01125563, + "auxiliary_loss_mlp": 0.01043558, + "balance_loss_clip": 1.04875207, + "balance_loss_mlp": 1.02648687, + "epoch": 0.23261686457237338, + "flos": 17056096853760.0, + "grad_norm": 2.1100994183362967, + "language_loss": 0.72952414, + "learning_rate": 3.58458034283495e-06, + "loss": 0.75121534, + "num_input_tokens_seen": 83233845, + "step": 3869, + "time_per_iteration": 2.6661763191223145 + }, + { + "auxiliary_loss_clip": 0.01132223, + "auxiliary_loss_mlp": 0.0105087, + "balance_loss_clip": 1.05129242, + "balance_loss_mlp": 1.03382349, + "epoch": 0.23267698782504134, + "flos": 29169878785920.0, + "grad_norm": 2.500604422715561, + "language_loss": 0.79142725, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.81325811, + "num_input_tokens_seen": 83254930, + "step": 3870, + "time_per_iteration": 2.707321882247925 + }, + { + "auxiliary_loss_clip": 0.01152434, + "auxiliary_loss_mlp": 0.01046711, + "balance_loss_clip": 1.05334866, + "balance_loss_mlp": 1.02924728, + "epoch": 0.2327371110777093, + "flos": 21174655219200.0, + "grad_norm": 2.176894576680098, + "language_loss": 0.70915782, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.73114932, + "num_input_tokens_seen": 83272095, + "step": 3871, + "time_per_iteration": 2.6389646530151367 + }, + { + "auxiliary_loss_clip": 0.01139847, + "auxiliary_loss_mlp": 0.01051541, + "balance_loss_clip": 1.05543458, + "balance_loss_mlp": 1.03244328, + "epoch": 0.23279723433037727, + "flos": 24863112132480.0, + "grad_norm": 1.8306984701748774, + "language_loss": 0.68877381, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.71068764, + "num_input_tokens_seen": 83290980, + "step": 3872, + "time_per_iteration": 2.662309408187866 + }, + { + "auxiliary_loss_clip": 0.0114472, + "auxiliary_loss_mlp": 0.01042459, + "balance_loss_clip": 1.05313611, + "balance_loss_mlp": 1.02388597, + "epoch": 0.23285735758304524, + "flos": 38800617344640.0, + "grad_norm": 1.5710106481349988, + "language_loss": 0.779724, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.80159569, + "num_input_tokens_seen": 83315175, + "step": 3873, + "time_per_iteration": 2.884542942047119 + }, + { + "auxiliary_loss_clip": 0.01053683, + "auxiliary_loss_mlp": 0.01022765, + "balance_loss_clip": 1.03691578, + "balance_loss_mlp": 1.02038097, + "epoch": 0.2329174808357132, + "flos": 53944113692160.0, + "grad_norm": 0.8561383552409444, + "language_loss": 0.6051712, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.62593567, + "num_input_tokens_seen": 83372060, + "step": 3874, + "time_per_iteration": 3.165809392929077 + }, + { + "auxiliary_loss_clip": 0.0112779, + "auxiliary_loss_mlp": 0.01040869, + "balance_loss_clip": 1.05157447, + "balance_loss_mlp": 1.02328515, + "epoch": 0.23297760408838117, + "flos": 21216024708480.0, + "grad_norm": 2.5039775977564522, + "language_loss": 0.80842507, + "learning_rate": 3.583153494218927e-06, + "loss": 0.83011162, + "num_input_tokens_seen": 83389795, + "step": 3875, + "time_per_iteration": 2.673657178878784 + }, + { + "auxiliary_loss_clip": 0.01147803, + "auxiliary_loss_mlp": 0.00774568, + "balance_loss_clip": 1.05367982, + "balance_loss_mlp": 1.00145388, + "epoch": 0.23303772734104916, + "flos": 28403006394240.0, + "grad_norm": 4.3174446976030465, + "language_loss": 0.6123395, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.63156319, + "num_input_tokens_seen": 83410005, + "step": 3876, + "time_per_iteration": 2.6973021030426025 + }, + { + "auxiliary_loss_clip": 0.01116571, + "auxiliary_loss_mlp": 0.01051971, + "balance_loss_clip": 1.05002618, + "balance_loss_mlp": 1.03345811, + "epoch": 0.23309785059371713, + "flos": 24314720215680.0, + "grad_norm": 2.4263361529850447, + "language_loss": 0.70649457, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.72817999, + "num_input_tokens_seen": 83430250, + "step": 3877, + "time_per_iteration": 2.7506351470947266 + }, + { + "auxiliary_loss_clip": 0.01143537, + "auxiliary_loss_mlp": 0.01051311, + "balance_loss_clip": 1.05495286, + "balance_loss_mlp": 1.03245187, + "epoch": 0.2331579738463851, + "flos": 15992925171840.0, + "grad_norm": 2.202899784913125, + "language_loss": 0.80724835, + "learning_rate": 3.582439259339073e-06, + "loss": 0.82919687, + "num_input_tokens_seen": 83447950, + "step": 3878, + "time_per_iteration": 2.6945395469665527 + }, + { + "auxiliary_loss_clip": 0.0109123, + "auxiliary_loss_mlp": 0.01049547, + "balance_loss_clip": 1.04632592, + "balance_loss_mlp": 1.0298301, + "epoch": 0.23321809709905306, + "flos": 36426957863040.0, + "grad_norm": 1.857420507716431, + "language_loss": 0.7521472, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.77355498, + "num_input_tokens_seen": 83467785, + "step": 3879, + "time_per_iteration": 2.8909342288970947 + }, + { + "auxiliary_loss_clip": 0.01095967, + "auxiliary_loss_mlp": 0.01051433, + "balance_loss_clip": 1.04621899, + "balance_loss_mlp": 1.03238297, + "epoch": 0.23327822035172102, + "flos": 21324762155520.0, + "grad_norm": 2.179587653719585, + "language_loss": 0.89532614, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.91680014, + "num_input_tokens_seen": 83485390, + "step": 3880, + "time_per_iteration": 2.7358896732330322 + }, + { + "auxiliary_loss_clip": 0.01127816, + "auxiliary_loss_mlp": 0.01049697, + "balance_loss_clip": 1.05119944, + "balance_loss_mlp": 1.0319109, + "epoch": 0.233338343604389, + "flos": 19171881619200.0, + "grad_norm": 1.6825190155617658, + "language_loss": 0.71915156, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.74092674, + "num_input_tokens_seen": 83504890, + "step": 3881, + "time_per_iteration": 2.702533721923828 + }, + { + "auxiliary_loss_clip": 0.01148084, + "auxiliary_loss_mlp": 0.0104282, + "balance_loss_clip": 1.05186546, + "balance_loss_mlp": 1.02497458, + "epoch": 0.23339846685705698, + "flos": 26908368543360.0, + "grad_norm": 1.5464986217430505, + "language_loss": 0.68210357, + "learning_rate": 3.581486106120537e-06, + "loss": 0.70401263, + "num_input_tokens_seen": 83526475, + "step": 3882, + "time_per_iteration": 2.6449384689331055 + }, + { + "auxiliary_loss_clip": 0.01106984, + "auxiliary_loss_mlp": 0.01053219, + "balance_loss_clip": 1.04567862, + "balance_loss_mlp": 1.03457499, + "epoch": 0.23345859010972494, + "flos": 32343160884480.0, + "grad_norm": 2.180831821464153, + "language_loss": 0.77379489, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.79539698, + "num_input_tokens_seen": 83546620, + "step": 3883, + "time_per_iteration": 5.806958913803101 + }, + { + "auxiliary_loss_clip": 0.01053192, + "auxiliary_loss_mlp": 0.01007679, + "balance_loss_clip": 1.03368068, + "balance_loss_mlp": 1.0053544, + "epoch": 0.2335187133623929, + "flos": 58484229050880.0, + "grad_norm": 0.7945750769740417, + "language_loss": 0.59117424, + "learning_rate": 3.58100916965445e-06, + "loss": 0.61178291, + "num_input_tokens_seen": 83616160, + "step": 3884, + "time_per_iteration": 3.3524324893951416 + }, + { + "auxiliary_loss_clip": 0.01117007, + "auxiliary_loss_mlp": 0.01034005, + "balance_loss_clip": 1.04925692, + "balance_loss_mlp": 1.01704168, + "epoch": 0.23357883661506088, + "flos": 24502317972480.0, + "grad_norm": 1.6775563031527567, + "language_loss": 0.80286831, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.82437843, + "num_input_tokens_seen": 83636795, + "step": 3885, + "time_per_iteration": 4.295818328857422 + }, + { + "auxiliary_loss_clip": 0.01136024, + "auxiliary_loss_mlp": 0.01040639, + "balance_loss_clip": 1.05494285, + "balance_loss_mlp": 1.02274597, + "epoch": 0.23363895986772884, + "flos": 18948516894720.0, + "grad_norm": 2.2066793657203116, + "language_loss": 0.88230193, + "learning_rate": 3.580531993380261e-06, + "loss": 0.90406859, + "num_input_tokens_seen": 83654050, + "step": 3886, + "time_per_iteration": 2.6672091484069824 + }, + { + "auxiliary_loss_clip": 0.01150675, + "auxiliary_loss_mlp": 0.01042457, + "balance_loss_clip": 1.05293703, + "balance_loss_mlp": 1.02512443, + "epoch": 0.2336990831203968, + "flos": 31686821619840.0, + "grad_norm": 4.0082984179074055, + "language_loss": 0.73170543, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.75363672, + "num_input_tokens_seen": 83673720, + "step": 3887, + "time_per_iteration": 2.7338294982910156 + }, + { + "auxiliary_loss_clip": 0.01140271, + "auxiliary_loss_mlp": 0.0104923, + "balance_loss_clip": 1.05201173, + "balance_loss_mlp": 1.03183722, + "epoch": 0.23375920637306477, + "flos": 27709750926720.0, + "grad_norm": 2.677865426107907, + "language_loss": 0.84125429, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.86314929, + "num_input_tokens_seen": 83693470, + "step": 3888, + "time_per_iteration": 2.7020208835601807 + }, + { + "auxiliary_loss_clip": 0.01121847, + "auxiliary_loss_mlp": 0.010605, + "balance_loss_clip": 1.04974008, + "balance_loss_mlp": 1.04121208, + "epoch": 0.23381932962573276, + "flos": 17675627656320.0, + "grad_norm": 3.2074942430893976, + "language_loss": 0.87298381, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.89480728, + "num_input_tokens_seen": 83711620, + "step": 3889, + "time_per_iteration": 4.319674491882324 + }, + { + "auxiliary_loss_clip": 0.01141703, + "auxiliary_loss_mlp": 0.01046248, + "balance_loss_clip": 1.05330396, + "balance_loss_mlp": 1.02877164, + "epoch": 0.23387945287840073, + "flos": 14390842763520.0, + "grad_norm": 3.8719217250511164, + "language_loss": 0.76830876, + "learning_rate": 3.579576921697125e-06, + "loss": 0.79018819, + "num_input_tokens_seen": 83727890, + "step": 3890, + "time_per_iteration": 2.6133198738098145 + }, + { + "auxiliary_loss_clip": 0.01107139, + "auxiliary_loss_mlp": 0.00775386, + "balance_loss_clip": 1.04837406, + "balance_loss_mlp": 1.00124502, + "epoch": 0.2339395761310687, + "flos": 46097988503040.0, + "grad_norm": 1.8304579433009527, + "language_loss": 0.73385048, + "learning_rate": 3.579338004009412e-06, + "loss": 0.75267571, + "num_input_tokens_seen": 83749370, + "step": 3891, + "time_per_iteration": 3.008927583694458 + }, + { + "auxiliary_loss_clip": 0.01145053, + "auxiliary_loss_mlp": 0.01047702, + "balance_loss_clip": 1.05121398, + "balance_loss_mlp": 1.03035665, + "epoch": 0.23399969938373666, + "flos": 22382044007040.0, + "grad_norm": 1.8316289897122906, + "language_loss": 0.82725632, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.84918392, + "num_input_tokens_seen": 83769560, + "step": 3892, + "time_per_iteration": 2.6455893516540527 + }, + { + "auxiliary_loss_clip": 0.01100914, + "auxiliary_loss_mlp": 0.01055558, + "balance_loss_clip": 1.04450488, + "balance_loss_mlp": 1.03491104, + "epoch": 0.23405982263640462, + "flos": 43508542066560.0, + "grad_norm": 2.707564715226966, + "language_loss": 0.64982933, + "learning_rate": 3.578859988977082e-06, + "loss": 0.67139405, + "num_input_tokens_seen": 83795635, + "step": 3893, + "time_per_iteration": 2.9392964839935303 + }, + { + "auxiliary_loss_clip": 0.01106007, + "auxiliary_loss_mlp": 0.01045218, + "balance_loss_clip": 1.04782617, + "balance_loss_mlp": 1.02701449, + "epoch": 0.2341199458890726, + "flos": 22564685687040.0, + "grad_norm": 2.5782091790717105, + "language_loss": 0.79415286, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.81566513, + "num_input_tokens_seen": 83814090, + "step": 3894, + "time_per_iteration": 2.839935541152954 + }, + { + "auxiliary_loss_clip": 0.01134295, + "auxiliary_loss_mlp": 0.01049748, + "balance_loss_clip": 1.04747164, + "balance_loss_mlp": 1.03253388, + "epoch": 0.23418006914174055, + "flos": 25633970933760.0, + "grad_norm": 1.551347830991082, + "language_loss": 0.81978422, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.84162462, + "num_input_tokens_seen": 83836870, + "step": 3895, + "time_per_iteration": 2.739955425262451 + }, + { + "auxiliary_loss_clip": 0.01134592, + "auxiliary_loss_mlp": 0.01052429, + "balance_loss_clip": 1.04999852, + "balance_loss_mlp": 1.03514385, + "epoch": 0.23424019239440855, + "flos": 13545936074880.0, + "grad_norm": 1.8690411936118732, + "language_loss": 0.80239451, + "learning_rate": 3.578142517422292e-06, + "loss": 0.82426476, + "num_input_tokens_seen": 83853275, + "step": 3896, + "time_per_iteration": 2.681114435195923 + }, + { + "auxiliary_loss_clip": 0.01125586, + "auxiliary_loss_mlp": 0.01045792, + "balance_loss_clip": 1.04685259, + "balance_loss_mlp": 1.02779162, + "epoch": 0.2343003156470765, + "flos": 22419498913920.0, + "grad_norm": 2.2492510100498087, + "language_loss": 0.83249009, + "learning_rate": 3.577903240538623e-06, + "loss": 0.85420382, + "num_input_tokens_seen": 83872340, + "step": 3897, + "time_per_iteration": 2.728916645050049 + }, + { + "auxiliary_loss_clip": 0.01134669, + "auxiliary_loss_mlp": 0.01058403, + "balance_loss_clip": 1.04949594, + "balance_loss_mlp": 1.04016376, + "epoch": 0.23436043889974448, + "flos": 14790815683200.0, + "grad_norm": 1.5875861860902294, + "language_loss": 0.78903484, + "learning_rate": 3.577663903820705e-06, + "loss": 0.81096554, + "num_input_tokens_seen": 83888795, + "step": 3898, + "time_per_iteration": 2.6597952842712402 + }, + { + "auxiliary_loss_clip": 0.01109182, + "auxiliary_loss_mlp": 0.01055226, + "balance_loss_clip": 1.04657888, + "balance_loss_mlp": 1.03785777, + "epoch": 0.23442056215241244, + "flos": 22965700101120.0, + "grad_norm": 1.9975380770167093, + "language_loss": 0.73769581, + "learning_rate": 3.577424507277614e-06, + "loss": 0.75933987, + "num_input_tokens_seen": 83906820, + "step": 3899, + "time_per_iteration": 2.7511518001556396 + }, + { + "auxiliary_loss_clip": 0.01110646, + "auxiliary_loss_mlp": 0.01053556, + "balance_loss_clip": 1.04662895, + "balance_loss_mlp": 1.03530502, + "epoch": 0.2344806854050804, + "flos": 23071887682560.0, + "grad_norm": 2.822835219305806, + "language_loss": 0.75323856, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.77488053, + "num_input_tokens_seen": 83926370, + "step": 3900, + "time_per_iteration": 2.7366316318511963 + }, + { + "auxiliary_loss_clip": 0.01097598, + "auxiliary_loss_mlp": 0.01047935, + "balance_loss_clip": 1.04771769, + "balance_loss_mlp": 1.03019702, + "epoch": 0.23454080865774837, + "flos": 16327074418560.0, + "grad_norm": 1.7042292639984586, + "language_loss": 0.67123592, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.69269133, + "num_input_tokens_seen": 83944600, + "step": 3901, + "time_per_iteration": 2.857386589050293 + }, + { + "auxiliary_loss_clip": 0.01029196, + "auxiliary_loss_mlp": 0.01060621, + "balance_loss_clip": 1.02959871, + "balance_loss_mlp": 1.0584631, + "epoch": 0.23460093191041637, + "flos": 67760958142080.0, + "grad_norm": 0.7708596717968548, + "language_loss": 0.58189189, + "learning_rate": 3.576705958788091e-06, + "loss": 0.60279006, + "num_input_tokens_seen": 84005100, + "step": 3902, + "time_per_iteration": 3.2769579887390137 + }, + { + "auxiliary_loss_clip": 0.01126982, + "auxiliary_loss_mlp": 0.01045748, + "balance_loss_clip": 1.05044544, + "balance_loss_mlp": 1.02691305, + "epoch": 0.23466105516308433, + "flos": 20077619990400.0, + "grad_norm": 2.0309755154884708, + "language_loss": 0.80396789, + "learning_rate": 3.576466323035108e-06, + "loss": 0.82569516, + "num_input_tokens_seen": 84023775, + "step": 3903, + "time_per_iteration": 2.683908462524414 + }, + { + "auxiliary_loss_clip": 0.01092072, + "auxiliary_loss_mlp": 0.01044121, + "balance_loss_clip": 1.04248238, + "balance_loss_mlp": 1.02614391, + "epoch": 0.2347211784157523, + "flos": 24535714642560.0, + "grad_norm": 1.970422818337997, + "language_loss": 0.82400727, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.84536922, + "num_input_tokens_seen": 84042605, + "step": 3904, + "time_per_iteration": 2.8023037910461426 + }, + { + "auxiliary_loss_clip": 0.01147463, + "auxiliary_loss_mlp": 0.01043559, + "balance_loss_clip": 1.05247784, + "balance_loss_mlp": 1.02620173, + "epoch": 0.23478130166842026, + "flos": 23805040181760.0, + "grad_norm": 1.9105311329606578, + "language_loss": 0.71330345, + "learning_rate": 3.57598687219895e-06, + "loss": 0.73521364, + "num_input_tokens_seen": 84061520, + "step": 3905, + "time_per_iteration": 2.650956869125366 + }, + { + "auxiliary_loss_clip": 0.01143661, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.05086017, + "balance_loss_mlp": 1.01877677, + "epoch": 0.23484142492108823, + "flos": 24093618048000.0, + "grad_norm": 2.334164983860831, + "language_loss": 0.71415532, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.73594707, + "num_input_tokens_seen": 84081800, + "step": 3906, + "time_per_iteration": 2.6635055541992188 + }, + { + "auxiliary_loss_clip": 0.01138147, + "auxiliary_loss_mlp": 0.01042098, + "balance_loss_clip": 1.04703832, + "balance_loss_mlp": 1.02246392, + "epoch": 0.2349015481737562, + "flos": 29095830898560.0, + "grad_norm": 2.5527171953873693, + "language_loss": 0.74024308, + "learning_rate": 3.575507182316473e-06, + "loss": 0.7620455, + "num_input_tokens_seen": 84102340, + "step": 3907, + "time_per_iteration": 2.751154661178589 + }, + { + "auxiliary_loss_clip": 0.01135101, + "auxiliary_loss_mlp": 0.01047433, + "balance_loss_clip": 1.04911268, + "balance_loss_mlp": 1.02950394, + "epoch": 0.23496167142642416, + "flos": 18916305373440.0, + "grad_norm": 1.9847054585906883, + "language_loss": 0.72428519, + "learning_rate": 3.575267247755601e-06, + "loss": 0.74611056, + "num_input_tokens_seen": 84120370, + "step": 3908, + "time_per_iteration": 2.631162166595459 + }, + { + "auxiliary_loss_clip": 0.01053013, + "auxiliary_loss_mlp": 0.01020478, + "balance_loss_clip": 1.03362584, + "balance_loss_mlp": 1.01765239, + "epoch": 0.23502179467909215, + "flos": 55868062896000.0, + "grad_norm": 1.0307072678924762, + "language_loss": 0.73359185, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.75432676, + "num_input_tokens_seen": 84165515, + "step": 3909, + "time_per_iteration": 2.974531650543213 + }, + { + "auxiliary_loss_clip": 0.01136436, + "auxiliary_loss_mlp": 0.01046445, + "balance_loss_clip": 1.05006361, + "balance_loss_mlp": 1.02797985, + "epoch": 0.23508191793176011, + "flos": 23401763210880.0, + "grad_norm": 1.6771333047394956, + "language_loss": 0.88288009, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.90470886, + "num_input_tokens_seen": 84184540, + "step": 3910, + "time_per_iteration": 2.6615123748779297 + }, + { + "auxiliary_loss_clip": 0.01134757, + "auxiliary_loss_mlp": 0.01038734, + "balance_loss_clip": 1.04980493, + "balance_loss_mlp": 1.02188933, + "epoch": 0.23514204118442808, + "flos": 20047671025920.0, + "grad_norm": 1.9388895528834493, + "language_loss": 0.76067305, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.78240794, + "num_input_tokens_seen": 84202025, + "step": 3911, + "time_per_iteration": 2.6846752166748047 + }, + { + "auxiliary_loss_clip": 0.01130294, + "auxiliary_loss_mlp": 0.01041364, + "balance_loss_clip": 1.04968345, + "balance_loss_mlp": 1.02546179, + "epoch": 0.23520216443709605, + "flos": 21580589796480.0, + "grad_norm": 1.5851255377793763, + "language_loss": 0.81651384, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.83823043, + "num_input_tokens_seen": 84221895, + "step": 3912, + "time_per_iteration": 2.6340627670288086 + }, + { + "auxiliary_loss_clip": 0.01123815, + "auxiliary_loss_mlp": 0.01046223, + "balance_loss_clip": 1.05082059, + "balance_loss_mlp": 1.02836537, + "epoch": 0.235262287689764, + "flos": 23185796688000.0, + "grad_norm": 3.1390338867327165, + "language_loss": 0.71748006, + "learning_rate": 3.574066679118909e-06, + "loss": 0.73918045, + "num_input_tokens_seen": 84240455, + "step": 3913, + "time_per_iteration": 2.6716067790985107 + }, + { + "auxiliary_loss_clip": 0.01141007, + "auxiliary_loss_mlp": 0.00776535, + "balance_loss_clip": 1.05018401, + "balance_loss_mlp": 1.00136077, + "epoch": 0.23532241094243198, + "flos": 23185222070400.0, + "grad_norm": 1.7080087282408476, + "language_loss": 0.76152158, + "learning_rate": 3.57382638628884e-06, + "loss": 0.78069693, + "num_input_tokens_seen": 84261605, + "step": 3914, + "time_per_iteration": 2.706982135772705 + }, + { + "auxiliary_loss_clip": 0.01088532, + "auxiliary_loss_mlp": 0.01039819, + "balance_loss_clip": 1.0485754, + "balance_loss_mlp": 1.02153206, + "epoch": 0.23538253419509997, + "flos": 17019324305280.0, + "grad_norm": 2.2148128973951877, + "language_loss": 0.89692557, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.91820902, + "num_input_tokens_seen": 84278675, + "step": 3915, + "time_per_iteration": 2.8005998134613037 + }, + { + "auxiliary_loss_clip": 0.01045613, + "auxiliary_loss_mlp": 0.0100868, + "balance_loss_clip": 1.02860212, + "balance_loss_mlp": 1.00596201, + "epoch": 0.23544265744776793, + "flos": 63448588967040.0, + "grad_norm": 0.8066012642326402, + "language_loss": 0.59382623, + "learning_rate": 3.573345621598854e-06, + "loss": 0.61436915, + "num_input_tokens_seen": 84329765, + "step": 3916, + "time_per_iteration": 3.168708086013794 + }, + { + "auxiliary_loss_clip": 0.01027738, + "auxiliary_loss_mlp": 0.01005192, + "balance_loss_clip": 1.03619492, + "balance_loss_mlp": 1.00231957, + "epoch": 0.2355027807004359, + "flos": 70515343831680.0, + "grad_norm": 0.7680467252570666, + "language_loss": 0.49518228, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.51551157, + "num_input_tokens_seen": 84393680, + "step": 3917, + "time_per_iteration": 3.3240060806274414 + }, + { + "auxiliary_loss_clip": 0.01112941, + "auxiliary_loss_mlp": 0.01048231, + "balance_loss_clip": 1.04929173, + "balance_loss_mlp": 1.03133857, + "epoch": 0.23556290395310386, + "flos": 21434289701760.0, + "grad_norm": 1.9721662885337694, + "language_loss": 0.76349282, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.78510457, + "num_input_tokens_seen": 84412640, + "step": 3918, + "time_per_iteration": 2.739431619644165 + }, + { + "auxiliary_loss_clip": 0.0109904, + "auxiliary_loss_mlp": 0.01052049, + "balance_loss_clip": 1.04440236, + "balance_loss_mlp": 1.03514528, + "epoch": 0.23562302720577183, + "flos": 18186421011840.0, + "grad_norm": 2.001330675769641, + "language_loss": 0.69002521, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.71153617, + "num_input_tokens_seen": 84431605, + "step": 3919, + "time_per_iteration": 2.8809926509857178 + }, + { + "auxiliary_loss_clip": 0.01106851, + "auxiliary_loss_mlp": 0.01039357, + "balance_loss_clip": 1.04772878, + "balance_loss_mlp": 1.02221501, + "epoch": 0.2356831504584398, + "flos": 33730497832320.0, + "grad_norm": 1.6908780146896767, + "language_loss": 0.70500779, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.72646987, + "num_input_tokens_seen": 84454210, + "step": 3920, + "time_per_iteration": 2.7984554767608643 + }, + { + "auxiliary_loss_clip": 0.01124832, + "auxiliary_loss_mlp": 0.01054073, + "balance_loss_clip": 1.05141807, + "balance_loss_mlp": 1.03756285, + "epoch": 0.23574327371110776, + "flos": 24932778560640.0, + "grad_norm": 1.7460619151295316, + "language_loss": 0.77363533, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.7954244, + "num_input_tokens_seen": 84475540, + "step": 3921, + "time_per_iteration": 2.8038690090179443 + }, + { + "auxiliary_loss_clip": 0.01113499, + "auxiliary_loss_mlp": 0.01043793, + "balance_loss_clip": 1.05042887, + "balance_loss_mlp": 1.02692485, + "epoch": 0.23580339696377575, + "flos": 17822107319040.0, + "grad_norm": 2.2761735813493775, + "language_loss": 0.74768102, + "learning_rate": 3.571901895946612e-06, + "loss": 0.76925397, + "num_input_tokens_seen": 84494580, + "step": 3922, + "time_per_iteration": 5.741380929946899 + }, + { + "auxiliary_loss_clip": 0.01116057, + "auxiliary_loss_mlp": 0.01041318, + "balance_loss_clip": 1.04831624, + "balance_loss_mlp": 1.02577269, + "epoch": 0.23586352021644372, + "flos": 26286611097600.0, + "grad_norm": 3.3386441952016868, + "language_loss": 0.79846609, + "learning_rate": 3.571661066327956e-06, + "loss": 0.82003981, + "num_input_tokens_seen": 84513850, + "step": 3923, + "time_per_iteration": 2.7889180183410645 + }, + { + "auxiliary_loss_clip": 0.01089456, + "auxiliary_loss_mlp": 0.0105728, + "balance_loss_clip": 1.04471469, + "balance_loss_mlp": 1.03935063, + "epoch": 0.23592364346911168, + "flos": 14246697484800.0, + "grad_norm": 4.698975622885271, + "language_loss": 0.74874711, + "learning_rate": 3.571420177111754e-06, + "loss": 0.77021456, + "num_input_tokens_seen": 84532315, + "step": 3924, + "time_per_iteration": 4.272740125656128 + }, + { + "auxiliary_loss_clip": 0.01145554, + "auxiliary_loss_mlp": 0.01046876, + "balance_loss_clip": 1.05115998, + "balance_loss_mlp": 1.030568, + "epoch": 0.23598376672177965, + "flos": 18587938216320.0, + "grad_norm": 2.8676741031402977, + "language_loss": 0.82357788, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.8455022, + "num_input_tokens_seen": 84550970, + "step": 3925, + "time_per_iteration": 2.6825013160705566 + }, + { + "auxiliary_loss_clip": 0.0112035, + "auxiliary_loss_mlp": 0.01048071, + "balance_loss_clip": 1.04567564, + "balance_loss_mlp": 1.0315721, + "epoch": 0.2360438899744476, + "flos": 22675542036480.0, + "grad_norm": 1.5755651433289561, + "language_loss": 0.59533024, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.61701441, + "num_input_tokens_seen": 84571655, + "step": 3926, + "time_per_iteration": 2.6960842609405518 + }, + { + "auxiliary_loss_clip": 0.01125496, + "auxiliary_loss_mlp": 0.01046163, + "balance_loss_clip": 1.04914129, + "balance_loss_mlp": 1.0302484, + "epoch": 0.23610401322711558, + "flos": 29570139014400.0, + "grad_norm": 2.4179456581838212, + "language_loss": 0.7155292, + "learning_rate": 3.570697151969235e-06, + "loss": 0.7372458, + "num_input_tokens_seen": 84593130, + "step": 3927, + "time_per_iteration": 2.786576986312866 + }, + { + "auxiliary_loss_clip": 0.01120941, + "auxiliary_loss_mlp": 0.01047009, + "balance_loss_clip": 1.04764938, + "balance_loss_mlp": 1.03125572, + "epoch": 0.23616413647978354, + "flos": 17858520731520.0, + "grad_norm": 1.9380358164668718, + "language_loss": 0.74792278, + "learning_rate": 3.570456024454221e-06, + "loss": 0.76960224, + "num_input_tokens_seen": 84612410, + "step": 3928, + "time_per_iteration": 4.450765609741211 + }, + { + "auxiliary_loss_clip": 0.01118656, + "auxiliary_loss_mlp": 0.01047112, + "balance_loss_clip": 1.04935324, + "balance_loss_mlp": 1.02949333, + "epoch": 0.23622425973245154, + "flos": 11034847157760.0, + "grad_norm": 4.3448767989564745, + "language_loss": 0.81905198, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.84070963, + "num_input_tokens_seen": 84627610, + "step": 3929, + "time_per_iteration": 2.654085874557495 + }, + { + "auxiliary_loss_clip": 0.01151721, + "auxiliary_loss_mlp": 0.0105167, + "balance_loss_clip": 1.05143714, + "balance_loss_mlp": 1.03314447, + "epoch": 0.2362843829851195, + "flos": 23404061681280.0, + "grad_norm": 3.048788180104446, + "language_loss": 0.72323942, + "learning_rate": 3.569973590777789e-06, + "loss": 0.74527335, + "num_input_tokens_seen": 84648415, + "step": 3930, + "time_per_iteration": 2.67429780960083 + }, + { + "auxiliary_loss_clip": 0.01143652, + "auxiliary_loss_mlp": 0.01036151, + "balance_loss_clip": 1.04880345, + "balance_loss_mlp": 1.01985574, + "epoch": 0.23634450623778747, + "flos": 39529855261440.0, + "grad_norm": 2.7450987997323333, + "language_loss": 0.74105632, + "learning_rate": 3.569732284634665e-06, + "loss": 0.76285434, + "num_input_tokens_seen": 84670080, + "step": 3931, + "time_per_iteration": 2.8017847537994385 + }, + { + "auxiliary_loss_clip": 0.01137617, + "auxiliary_loss_mlp": 0.01046002, + "balance_loss_clip": 1.05250037, + "balance_loss_mlp": 1.02853799, + "epoch": 0.23640462949045543, + "flos": 24207167917440.0, + "grad_norm": 2.2419024865888852, + "language_loss": 0.8018778, + "learning_rate": 3.569490918967136e-06, + "loss": 0.82371396, + "num_input_tokens_seen": 84686465, + "step": 3932, + "time_per_iteration": 2.6295793056488037 + }, + { + "auxiliary_loss_clip": 0.01108498, + "auxiliary_loss_mlp": 0.0104053, + "balance_loss_clip": 1.04981244, + "balance_loss_mlp": 1.02614117, + "epoch": 0.2364647527431234, + "flos": 26177622255360.0, + "grad_norm": 2.247824561482015, + "language_loss": 0.85683465, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.87832487, + "num_input_tokens_seen": 84708825, + "step": 3933, + "time_per_iteration": 2.7401201725006104 + }, + { + "auxiliary_loss_clip": 0.01101933, + "auxiliary_loss_mlp": 0.010512, + "balance_loss_clip": 1.04680276, + "balance_loss_mlp": 1.03112483, + "epoch": 0.23652487599579136, + "flos": 22637009721600.0, + "grad_norm": 2.0287283132247547, + "language_loss": 0.83179402, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.85332537, + "num_input_tokens_seen": 84726165, + "step": 3934, + "time_per_iteration": 2.8152921199798584 + }, + { + "auxiliary_loss_clip": 0.01148508, + "auxiliary_loss_mlp": 0.01042164, + "balance_loss_clip": 1.05208373, + "balance_loss_mlp": 1.02556968, + "epoch": 0.23658499924845935, + "flos": 21762261809280.0, + "grad_norm": 1.8368151879100059, + "language_loss": 0.78513408, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.80704081, + "num_input_tokens_seen": 84745815, + "step": 3935, + "time_per_iteration": 2.6769750118255615 + }, + { + "auxiliary_loss_clip": 0.01134595, + "auxiliary_loss_mlp": 0.01034926, + "balance_loss_clip": 1.05270088, + "balance_loss_mlp": 1.01891589, + "epoch": 0.23664512250112732, + "flos": 21798998444160.0, + "grad_norm": 1.5615220666884744, + "language_loss": 0.79614085, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.81783605, + "num_input_tokens_seen": 84765415, + "step": 3936, + "time_per_iteration": 2.7037193775177 + }, + { + "auxiliary_loss_clip": 0.01126163, + "auxiliary_loss_mlp": 0.01034739, + "balance_loss_clip": 1.04967618, + "balance_loss_mlp": 1.01779902, + "epoch": 0.23670524575379528, + "flos": 22637871648000.0, + "grad_norm": 1.671201383656535, + "language_loss": 0.7915628, + "learning_rate": 3.568283198083826e-06, + "loss": 0.81317174, + "num_input_tokens_seen": 84787080, + "step": 3937, + "time_per_iteration": 2.7639834880828857 + }, + { + "auxiliary_loss_clip": 0.01134519, + "auxiliary_loss_mlp": 0.01038533, + "balance_loss_clip": 1.05320358, + "balance_loss_mlp": 1.02313685, + "epoch": 0.23676536900646325, + "flos": 16725000263040.0, + "grad_norm": 1.8758026172480324, + "language_loss": 0.85389286, + "learning_rate": 3.568041475462147e-06, + "loss": 0.8756234, + "num_input_tokens_seen": 84805395, + "step": 3938, + "time_per_iteration": 2.6919057369232178 + }, + { + "auxiliary_loss_clip": 0.01145522, + "auxiliary_loss_mlp": 0.01047488, + "balance_loss_clip": 1.05159402, + "balance_loss_mlp": 1.03076303, + "epoch": 0.23682549225913122, + "flos": 11135611785600.0, + "grad_norm": 4.660879571039018, + "language_loss": 0.9365679, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.958498, + "num_input_tokens_seen": 84818090, + "step": 3939, + "time_per_iteration": 2.7249948978424072 + }, + { + "auxiliary_loss_clip": 0.01149288, + "auxiliary_loss_mlp": 0.01041833, + "balance_loss_clip": 1.0512023, + "balance_loss_mlp": 1.02463138, + "epoch": 0.23688561551179918, + "flos": 22559226819840.0, + "grad_norm": 1.884439522765895, + "language_loss": 0.82347792, + "learning_rate": 3.567557851847088e-06, + "loss": 0.84538913, + "num_input_tokens_seen": 84837695, + "step": 3940, + "time_per_iteration": 2.666647434234619 + }, + { + "auxiliary_loss_clip": 0.01128412, + "auxiliary_loss_mlp": 0.00775407, + "balance_loss_clip": 1.05063081, + "balance_loss_mlp": 1.00109661, + "epoch": 0.23694573876446715, + "flos": 18514895909760.0, + "grad_norm": 2.7155330970608214, + "language_loss": 0.88959104, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.90862918, + "num_input_tokens_seen": 84854630, + "step": 3941, + "time_per_iteration": 2.6898627281188965 + }, + { + "auxiliary_loss_clip": 0.01147095, + "auxiliary_loss_mlp": 0.01040548, + "balance_loss_clip": 1.04976177, + "balance_loss_mlp": 1.0227741, + "epoch": 0.23700586201713514, + "flos": 15335723980800.0, + "grad_norm": 2.436898535695529, + "language_loss": 0.8484506, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.870327, + "num_input_tokens_seen": 84871805, + "step": 3942, + "time_per_iteration": 2.560166835784912 + }, + { + "auxiliary_loss_clip": 0.01109105, + "auxiliary_loss_mlp": 0.01042997, + "balance_loss_clip": 1.04736543, + "balance_loss_mlp": 1.02447248, + "epoch": 0.2370659852698031, + "flos": 23947605262080.0, + "grad_norm": 1.9848651824816348, + "language_loss": 0.81126499, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.83278596, + "num_input_tokens_seen": 84889815, + "step": 3943, + "time_per_iteration": 2.7389075756073 + }, + { + "auxiliary_loss_clip": 0.01114013, + "auxiliary_loss_mlp": 0.01044642, + "balance_loss_clip": 1.0464983, + "balance_loss_mlp": 1.02618814, + "epoch": 0.23712610852247107, + "flos": 15332527670400.0, + "grad_norm": 2.1611381488400143, + "language_loss": 0.67060351, + "learning_rate": 3.566589891386959e-06, + "loss": 0.69219005, + "num_input_tokens_seen": 84904380, + "step": 3944, + "time_per_iteration": 2.6382999420166016 + }, + { + "auxiliary_loss_clip": 0.01117531, + "auxiliary_loss_mlp": 0.01038157, + "balance_loss_clip": 1.04629564, + "balance_loss_mlp": 1.02003753, + "epoch": 0.23718623177513903, + "flos": 19682567233920.0, + "grad_norm": 1.9578725621632602, + "language_loss": 0.75573617, + "learning_rate": 3.566347752735866e-06, + "loss": 0.77729309, + "num_input_tokens_seen": 84922935, + "step": 3945, + "time_per_iteration": 2.678377628326416 + }, + { + "auxiliary_loss_clip": 0.01128604, + "auxiliary_loss_mlp": 0.01039043, + "balance_loss_clip": 1.0493716, + "balance_loss_mlp": 1.02255654, + "epoch": 0.237246355027807, + "flos": 24973322037120.0, + "grad_norm": 1.4378865328543082, + "language_loss": 0.63750178, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.65917826, + "num_input_tokens_seen": 84943685, + "step": 3946, + "time_per_iteration": 2.77178955078125 + }, + { + "auxiliary_loss_clip": 0.01130702, + "auxiliary_loss_mlp": 0.01036796, + "balance_loss_clip": 1.0460459, + "balance_loss_mlp": 1.0186162, + "epoch": 0.23730647828047496, + "flos": 15377416692480.0, + "grad_norm": 2.53957699605931, + "language_loss": 0.77666485, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.79833984, + "num_input_tokens_seen": 84959505, + "step": 3947, + "time_per_iteration": 2.65461802482605 + }, + { + "auxiliary_loss_clip": 0.01145835, + "auxiliary_loss_mlp": 0.01040502, + "balance_loss_clip": 1.0566994, + "balance_loss_mlp": 1.02299047, + "epoch": 0.23736660153314296, + "flos": 28150662372480.0, + "grad_norm": 2.0053805098120123, + "language_loss": 0.80706096, + "learning_rate": 3.565620980442944e-06, + "loss": 0.82892442, + "num_input_tokens_seen": 84982130, + "step": 3948, + "time_per_iteration": 2.756716012954712 + }, + { + "auxiliary_loss_clip": 0.01129664, + "auxiliary_loss_mlp": 0.01044051, + "balance_loss_clip": 1.05104828, + "balance_loss_mlp": 1.02643192, + "epoch": 0.23742672478581092, + "flos": 22086570729600.0, + "grad_norm": 2.5980612684471374, + "language_loss": 0.80257607, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.82431316, + "num_input_tokens_seen": 85000640, + "step": 3949, + "time_per_iteration": 2.74457049369812 + }, + { + "auxiliary_loss_clip": 0.0112363, + "auxiliary_loss_mlp": 0.01038665, + "balance_loss_clip": 1.04977036, + "balance_loss_mlp": 1.02109337, + "epoch": 0.2374868480384789, + "flos": 19537093152000.0, + "grad_norm": 2.0592081961125093, + "language_loss": 0.73239946, + "learning_rate": 3.565136168723163e-06, + "loss": 0.75402236, + "num_input_tokens_seen": 85018970, + "step": 3950, + "time_per_iteration": 2.650508165359497 + }, + { + "auxiliary_loss_clip": 0.01145426, + "auxiliary_loss_mlp": 0.01037947, + "balance_loss_clip": 1.05055118, + "balance_loss_mlp": 1.02204442, + "epoch": 0.23754697129114685, + "flos": 19422501788160.0, + "grad_norm": 1.9969465766046124, + "language_loss": 0.72794384, + "learning_rate": 3.564893673833495e-06, + "loss": 0.74977756, + "num_input_tokens_seen": 85035905, + "step": 3951, + "time_per_iteration": 2.652399778366089 + }, + { + "auxiliary_loss_clip": 0.01122477, + "auxiliary_loss_mlp": 0.01039445, + "balance_loss_clip": 1.05080223, + "balance_loss_mlp": 1.0216229, + "epoch": 0.23760709454381482, + "flos": 19501002961920.0, + "grad_norm": 3.398248459712791, + "language_loss": 0.73703241, + "learning_rate": 3.564651119602903e-06, + "loss": 0.75865161, + "num_input_tokens_seen": 85054560, + "step": 3952, + "time_per_iteration": 2.7522144317626953 + }, + { + "auxiliary_loss_clip": 0.01100804, + "auxiliary_loss_mlp": 0.01042567, + "balance_loss_clip": 1.04366636, + "balance_loss_mlp": 1.02566266, + "epoch": 0.23766721779648278, + "flos": 27636600879360.0, + "grad_norm": 1.7524267936836437, + "language_loss": 0.71314329, + "learning_rate": 3.564408506040583e-06, + "loss": 0.73457694, + "num_input_tokens_seen": 85074425, + "step": 3953, + "time_per_iteration": 2.7846672534942627 + }, + { + "auxiliary_loss_clip": 0.01151909, + "auxiliary_loss_mlp": 0.01047443, + "balance_loss_clip": 1.05282676, + "balance_loss_mlp": 1.02854872, + "epoch": 0.23772734104915075, + "flos": 23404348990080.0, + "grad_norm": 1.9722222736847754, + "language_loss": 0.81792426, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.83991784, + "num_input_tokens_seen": 85092865, + "step": 3954, + "time_per_iteration": 2.6262643337249756 + }, + { + "auxiliary_loss_clip": 0.01127802, + "auxiliary_loss_mlp": 0.01044439, + "balance_loss_clip": 1.05239391, + "balance_loss_mlp": 1.02616453, + "epoch": 0.23778746430181874, + "flos": 15705496540800.0, + "grad_norm": 2.2607510345904824, + "language_loss": 0.66270143, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.68442386, + "num_input_tokens_seen": 85110175, + "step": 3955, + "time_per_iteration": 2.672151803970337 + }, + { + "auxiliary_loss_clip": 0.01149182, + "auxiliary_loss_mlp": 0.0104812, + "balance_loss_clip": 1.05219805, + "balance_loss_mlp": 1.03104961, + "epoch": 0.2378475875544867, + "flos": 19426452284160.0, + "grad_norm": 1.4117933502593074, + "language_loss": 0.83963013, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.86160314, + "num_input_tokens_seen": 85129925, + "step": 3956, + "time_per_iteration": 2.6483681201934814 + }, + { + "auxiliary_loss_clip": 0.01103304, + "auxiliary_loss_mlp": 0.01042938, + "balance_loss_clip": 1.04726648, + "balance_loss_mlp": 1.02556944, + "epoch": 0.23790771080715467, + "flos": 22268565964800.0, + "grad_norm": 2.308539718278817, + "language_loss": 0.8482393, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.86970174, + "num_input_tokens_seen": 85147755, + "step": 3957, + "time_per_iteration": 2.718961715698242 + }, + { + "auxiliary_loss_clip": 0.01087747, + "auxiliary_loss_mlp": 0.01039974, + "balance_loss_clip": 1.04701853, + "balance_loss_mlp": 1.02428651, + "epoch": 0.23796783405982264, + "flos": 20047311889920.0, + "grad_norm": 2.068360920278316, + "language_loss": 0.70373344, + "learning_rate": 3.563194548575151e-06, + "loss": 0.72501063, + "num_input_tokens_seen": 85165270, + "step": 3958, + "time_per_iteration": 2.818115472793579 + }, + { + "auxiliary_loss_clip": 0.01102632, + "auxiliary_loss_mlp": 0.01042002, + "balance_loss_clip": 1.04540312, + "balance_loss_mlp": 1.02276158, + "epoch": 0.2380279573124906, + "flos": 14245943299200.0, + "grad_norm": 2.474231994209954, + "language_loss": 0.66273189, + "learning_rate": 3.562951579215745e-06, + "loss": 0.68417823, + "num_input_tokens_seen": 85181555, + "step": 3959, + "time_per_iteration": 2.71085786819458 + }, + { + "auxiliary_loss_clip": 0.01103257, + "auxiliary_loss_mlp": 0.01044748, + "balance_loss_clip": 1.04910731, + "balance_loss_mlp": 1.02760553, + "epoch": 0.23808808056515857, + "flos": 21179180332800.0, + "grad_norm": 1.922923950627842, + "language_loss": 0.72140026, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.74288028, + "num_input_tokens_seen": 85199455, + "step": 3960, + "time_per_iteration": 2.724398612976074 + }, + { + "auxiliary_loss_clip": 0.01065725, + "auxiliary_loss_mlp": 0.01041352, + "balance_loss_clip": 1.04778433, + "balance_loss_mlp": 1.02385175, + "epoch": 0.23814820381782653, + "flos": 22528308188160.0, + "grad_norm": 1.836282299199184, + "language_loss": 0.74303818, + "learning_rate": 3.562465462704307e-06, + "loss": 0.76410902, + "num_input_tokens_seen": 85219170, + "step": 3961, + "time_per_iteration": 4.592544794082642 + }, + { + "auxiliary_loss_clip": 0.01149701, + "auxiliary_loss_mlp": 0.010511, + "balance_loss_clip": 1.05083704, + "balance_loss_mlp": 1.0321815, + "epoch": 0.23820832707049452, + "flos": 22304332932480.0, + "grad_norm": 1.6798300631958207, + "language_loss": 0.6562922, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.67830026, + "num_input_tokens_seen": 85238480, + "step": 3962, + "time_per_iteration": 4.40812087059021 + }, + { + "auxiliary_loss_clip": 0.01121684, + "auxiliary_loss_mlp": 0.01042601, + "balance_loss_clip": 1.04743505, + "balance_loss_mlp": 1.02511263, + "epoch": 0.2382684503231625, + "flos": 24864225454080.0, + "grad_norm": 1.838705722688445, + "language_loss": 0.74284148, + "learning_rate": 3.561979109197483e-06, + "loss": 0.76448429, + "num_input_tokens_seen": 85259180, + "step": 3963, + "time_per_iteration": 2.7173969745635986 + }, + { + "auxiliary_loss_clip": 0.01120014, + "auxiliary_loss_mlp": 0.01045721, + "balance_loss_clip": 1.0530858, + "balance_loss_mlp": 1.02756512, + "epoch": 0.23832857357583045, + "flos": 21871609787520.0, + "grad_norm": 2.045875790034744, + "language_loss": 0.77264321, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.79430056, + "num_input_tokens_seen": 85278550, + "step": 3964, + "time_per_iteration": 4.25124716758728 + }, + { + "auxiliary_loss_clip": 0.01108604, + "auxiliary_loss_mlp": 0.01048343, + "balance_loss_clip": 1.04783297, + "balance_loss_mlp": 1.03124809, + "epoch": 0.23838869682849842, + "flos": 21288061434240.0, + "grad_norm": 2.3097885565999894, + "language_loss": 0.71521109, + "learning_rate": 3.561492518769045e-06, + "loss": 0.73678052, + "num_input_tokens_seen": 85297345, + "step": 3965, + "time_per_iteration": 2.757647752761841 + }, + { + "auxiliary_loss_clip": 0.01115176, + "auxiliary_loss_mlp": 0.01043319, + "balance_loss_clip": 1.04632521, + "balance_loss_mlp": 1.02647483, + "epoch": 0.23844882008116638, + "flos": 16180594755840.0, + "grad_norm": 2.673966650516871, + "language_loss": 0.78003007, + "learning_rate": 3.561249134732282e-06, + "loss": 0.801615, + "num_input_tokens_seen": 85315105, + "step": 3966, + "time_per_iteration": 2.71159291267395 + }, + { + "auxiliary_loss_clip": 0.01124693, + "auxiliary_loss_mlp": 0.01045448, + "balance_loss_clip": 1.05071902, + "balance_loss_mlp": 1.02899134, + "epoch": 0.23850894333383435, + "flos": 21069724613760.0, + "grad_norm": 2.116401462724705, + "language_loss": 0.68767631, + "learning_rate": 3.561005691492797e-06, + "loss": 0.70937771, + "num_input_tokens_seen": 85334735, + "step": 3967, + "time_per_iteration": 2.7072744369506836 + }, + { + "auxiliary_loss_clip": 0.01116174, + "auxiliary_loss_mlp": 0.01055757, + "balance_loss_clip": 1.04883289, + "balance_loss_mlp": 1.03803015, + "epoch": 0.23856906658650234, + "flos": 17201606849280.0, + "grad_norm": 3.581336577718575, + "language_loss": 0.68005061, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.70176995, + "num_input_tokens_seen": 85352875, + "step": 3968, + "time_per_iteration": 4.378219842910767 + }, + { + "auxiliary_loss_clip": 0.01097883, + "auxiliary_loss_mlp": 0.01044394, + "balance_loss_clip": 1.05052614, + "balance_loss_mlp": 1.0274837, + "epoch": 0.2386291898391703, + "flos": 29494223619840.0, + "grad_norm": 2.210255088762028, + "language_loss": 0.77106255, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.79248536, + "num_input_tokens_seen": 85372205, + "step": 3969, + "time_per_iteration": 2.847663164138794 + }, + { + "auxiliary_loss_clip": 0.01121681, + "auxiliary_loss_mlp": 0.01039809, + "balance_loss_clip": 1.0498476, + "balance_loss_mlp": 1.02334595, + "epoch": 0.23868931309183827, + "flos": 21142443697920.0, + "grad_norm": 2.1326335149840583, + "language_loss": 0.7617563, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.78337121, + "num_input_tokens_seen": 85389705, + "step": 3970, + "time_per_iteration": 2.766862392425537 + }, + { + "auxiliary_loss_clip": 0.01106309, + "auxiliary_loss_mlp": 0.01049131, + "balance_loss_clip": 1.04287159, + "balance_loss_mlp": 1.03111875, + "epoch": 0.23874943634450624, + "flos": 25659394784640.0, + "grad_norm": 2.3319107764636415, + "language_loss": 0.85474384, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.87629819, + "num_input_tokens_seen": 85407855, + "step": 3971, + "time_per_iteration": 2.7597670555114746 + }, + { + "auxiliary_loss_clip": 0.01062507, + "auxiliary_loss_mlp": 0.01039144, + "balance_loss_clip": 1.03465796, + "balance_loss_mlp": 1.03661716, + "epoch": 0.2388095595971742, + "flos": 58986618624000.0, + "grad_norm": 0.7451796217314707, + "language_loss": 0.62797832, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.6489948, + "num_input_tokens_seen": 85470885, + "step": 3972, + "time_per_iteration": 3.2572779655456543 + }, + { + "auxiliary_loss_clip": 0.0112174, + "auxiliary_loss_mlp": 0.01037931, + "balance_loss_clip": 1.0492239, + "balance_loss_mlp": 1.02109838, + "epoch": 0.23886968284984217, + "flos": 16800341040000.0, + "grad_norm": 1.9449657433446057, + "language_loss": 0.82093811, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.84253484, + "num_input_tokens_seen": 85488460, + "step": 3973, + "time_per_iteration": 2.6394145488739014 + }, + { + "auxiliary_loss_clip": 0.01115852, + "auxiliary_loss_mlp": 0.01050239, + "balance_loss_clip": 1.04884124, + "balance_loss_mlp": 1.03272736, + "epoch": 0.23892980610251013, + "flos": 22382654538240.0, + "grad_norm": 1.5639820592628684, + "language_loss": 0.79418832, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.81584924, + "num_input_tokens_seen": 85508590, + "step": 3974, + "time_per_iteration": 2.6926944255828857 + }, + { + "auxiliary_loss_clip": 0.01134012, + "auxiliary_loss_mlp": 0.01042703, + "balance_loss_clip": 1.05169725, + "balance_loss_mlp": 1.02475047, + "epoch": 0.23898992935517813, + "flos": 12823198519680.0, + "grad_norm": 1.8382350241534648, + "language_loss": 0.8420803, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.86384743, + "num_input_tokens_seen": 85525970, + "step": 3975, + "time_per_iteration": 2.6402463912963867 + }, + { + "auxiliary_loss_clip": 0.01126962, + "auxiliary_loss_mlp": 0.01042445, + "balance_loss_clip": 1.04938245, + "balance_loss_mlp": 1.02545786, + "epoch": 0.2390500526078461, + "flos": 22345666508160.0, + "grad_norm": 2.129124681208868, + "language_loss": 0.84249294, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.864187, + "num_input_tokens_seen": 85543700, + "step": 3976, + "time_per_iteration": 2.624758720397949 + }, + { + "auxiliary_loss_clip": 0.01075224, + "auxiliary_loss_mlp": 0.01036827, + "balance_loss_clip": 1.0434798, + "balance_loss_mlp": 1.02103186, + "epoch": 0.23911017586051406, + "flos": 22635142214400.0, + "grad_norm": 1.8888081312271703, + "language_loss": 0.74451673, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.76563722, + "num_input_tokens_seen": 85562765, + "step": 3977, + "time_per_iteration": 2.848529815673828 + }, + { + "auxiliary_loss_clip": 0.01151335, + "auxiliary_loss_mlp": 0.01045957, + "balance_loss_clip": 1.05476987, + "balance_loss_mlp": 1.02829063, + "epoch": 0.23917029911318202, + "flos": 23653281219840.0, + "grad_norm": 1.6816446874821869, + "language_loss": 0.72515011, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.74712306, + "num_input_tokens_seen": 85581755, + "step": 3978, + "time_per_iteration": 2.6967527866363525 + }, + { + "auxiliary_loss_clip": 0.01123321, + "auxiliary_loss_mlp": 0.01045192, + "balance_loss_clip": 1.04713726, + "balance_loss_mlp": 1.02802634, + "epoch": 0.23923042236585, + "flos": 22783597125120.0, + "grad_norm": 2.5130493367739413, + "language_loss": 0.78474021, + "learning_rate": 3.558079758168997e-06, + "loss": 0.80642533, + "num_input_tokens_seen": 85599455, + "step": 3979, + "time_per_iteration": 2.6679623126983643 + }, + { + "auxiliary_loss_clip": 0.01123187, + "auxiliary_loss_mlp": 0.01052255, + "balance_loss_clip": 1.04774463, + "balance_loss_mlp": 1.03390861, + "epoch": 0.23929054561851795, + "flos": 28147717457280.0, + "grad_norm": 1.8353092232149775, + "language_loss": 0.81943917, + "learning_rate": 3.557835546134977e-06, + "loss": 0.84119362, + "num_input_tokens_seen": 85619970, + "step": 3980, + "time_per_iteration": 2.7941136360168457 + }, + { + "auxiliary_loss_clip": 0.01094849, + "auxiliary_loss_mlp": 0.01037854, + "balance_loss_clip": 1.04719615, + "balance_loss_mlp": 1.02036595, + "epoch": 0.23935066887118592, + "flos": 21686525982720.0, + "grad_norm": 1.7388406045293963, + "language_loss": 0.83562148, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.85694849, + "num_input_tokens_seen": 85638850, + "step": 3981, + "time_per_iteration": 2.773372173309326 + }, + { + "auxiliary_loss_clip": 0.01126579, + "auxiliary_loss_mlp": 0.01045152, + "balance_loss_clip": 1.05084574, + "balance_loss_mlp": 1.0267818, + "epoch": 0.2394107921238539, + "flos": 32122274198400.0, + "grad_norm": 2.0270942419393676, + "language_loss": 0.76690662, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.78862393, + "num_input_tokens_seen": 85656285, + "step": 3982, + "time_per_iteration": 2.770089864730835 + }, + { + "auxiliary_loss_clip": 0.01107786, + "auxiliary_loss_mlp": 0.01043737, + "balance_loss_clip": 1.04928303, + "balance_loss_mlp": 1.02757215, + "epoch": 0.23947091537652188, + "flos": 17019180650880.0, + "grad_norm": 2.333665248317953, + "language_loss": 0.78243405, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.80394924, + "num_input_tokens_seen": 85673020, + "step": 3983, + "time_per_iteration": 2.8361902236938477 + }, + { + "auxiliary_loss_clip": 0.01136012, + "auxiliary_loss_mlp": 0.00775416, + "balance_loss_clip": 1.0530262, + "balance_loss_mlp": 1.00106907, + "epoch": 0.23953103862918984, + "flos": 20593584904320.0, + "grad_norm": 1.8468424363822287, + "language_loss": 0.73274761, + "learning_rate": 3.556858107358737e-06, + "loss": 0.75186193, + "num_input_tokens_seen": 85692565, + "step": 3984, + "time_per_iteration": 2.720289468765259 + }, + { + "auxiliary_loss_clip": 0.01102619, + "auxiliary_loss_mlp": 0.01051209, + "balance_loss_clip": 1.04748976, + "balance_loss_mlp": 1.0330658, + "epoch": 0.2395911618818578, + "flos": 20704405340160.0, + "grad_norm": 1.906378165207968, + "language_loss": 0.79090226, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.81244051, + "num_input_tokens_seen": 85709730, + "step": 3985, + "time_per_iteration": 2.8464138507843018 + }, + { + "auxiliary_loss_clip": 0.01102898, + "auxiliary_loss_mlp": 0.01047238, + "balance_loss_clip": 1.04676175, + "balance_loss_mlp": 1.02930927, + "epoch": 0.23965128513452577, + "flos": 27053519402880.0, + "grad_norm": 1.780185130038595, + "language_loss": 0.73194253, + "learning_rate": 3.556369033716254e-06, + "loss": 0.7534439, + "num_input_tokens_seen": 85730045, + "step": 3986, + "time_per_iteration": 2.873837471008301 + }, + { + "auxiliary_loss_clip": 0.01143561, + "auxiliary_loss_mlp": 0.01052533, + "balance_loss_clip": 1.05392861, + "balance_loss_mlp": 1.03523529, + "epoch": 0.23971140838719374, + "flos": 23144319457920.0, + "grad_norm": 1.9275946084378768, + "language_loss": 0.88014174, + "learning_rate": 3.556124408363871e-06, + "loss": 0.90210271, + "num_input_tokens_seen": 85747590, + "step": 3987, + "time_per_iteration": 2.778970718383789 + }, + { + "auxiliary_loss_clip": 0.01131181, + "auxiliary_loss_mlp": 0.01037226, + "balance_loss_clip": 1.05180991, + "balance_loss_mlp": 1.02253985, + "epoch": 0.23977153163986173, + "flos": 18034554309120.0, + "grad_norm": 8.94948058332038, + "language_loss": 0.82985806, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.85154212, + "num_input_tokens_seen": 85763460, + "step": 3988, + "time_per_iteration": 2.6707162857055664 + }, + { + "auxiliary_loss_clip": 0.01132219, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.04952908, + "balance_loss_mlp": 1.02213907, + "epoch": 0.2398316548925297, + "flos": 18113378705280.0, + "grad_norm": 1.6085860818119202, + "language_loss": 0.85336304, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.87507904, + "num_input_tokens_seen": 85782050, + "step": 3989, + "time_per_iteration": 2.644075632095337 + }, + { + "auxiliary_loss_clip": 0.01144734, + "auxiliary_loss_mlp": 0.01039049, + "balance_loss_clip": 1.05094743, + "balance_loss_mlp": 1.02263403, + "epoch": 0.23989177814519766, + "flos": 12567730014720.0, + "grad_norm": 1.981474679784042, + "language_loss": 0.84109843, + "learning_rate": 3.555390178293477e-06, + "loss": 0.86293626, + "num_input_tokens_seen": 85797400, + "step": 3990, + "time_per_iteration": 2.5778160095214844 + }, + { + "auxiliary_loss_clip": 0.01131361, + "auxiliary_loss_mlp": 0.01042102, + "balance_loss_clip": 1.04863191, + "balance_loss_mlp": 1.02565074, + "epoch": 0.23995190139786562, + "flos": 25264593423360.0, + "grad_norm": 1.5352138463261382, + "language_loss": 0.75853264, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.78026724, + "num_input_tokens_seen": 85818995, + "step": 3991, + "time_per_iteration": 2.7569639682769775 + }, + { + "auxiliary_loss_clip": 0.01040828, + "auxiliary_loss_mlp": 0.0100398, + "balance_loss_clip": 1.02825403, + "balance_loss_mlp": 1.00114298, + "epoch": 0.2400120246505336, + "flos": 61960379650560.0, + "grad_norm": 0.8795356934357302, + "language_loss": 0.63683558, + "learning_rate": 3.554900396661656e-06, + "loss": 0.65728366, + "num_input_tokens_seen": 85876695, + "step": 3992, + "time_per_iteration": 3.2559213638305664 + }, + { + "auxiliary_loss_clip": 0.01055123, + "auxiliary_loss_mlp": 0.01005737, + "balance_loss_clip": 1.02834392, + "balance_loss_mlp": 1.00292385, + "epoch": 0.24007214790320155, + "flos": 66708560540160.0, + "grad_norm": 0.7639831296699208, + "language_loss": 0.6297875, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.65039611, + "num_input_tokens_seen": 85940990, + "step": 3993, + "time_per_iteration": 3.2946221828460693 + }, + { + "auxiliary_loss_clip": 0.0110983, + "auxiliary_loss_mlp": 0.01048609, + "balance_loss_clip": 1.05077267, + "balance_loss_mlp": 1.03078759, + "epoch": 0.24013227115586952, + "flos": 25809070757760.0, + "grad_norm": 1.7227387633537015, + "language_loss": 0.7656548, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.78723919, + "num_input_tokens_seen": 85961165, + "step": 3994, + "time_per_iteration": 2.7735466957092285 + }, + { + "auxiliary_loss_clip": 0.01120115, + "auxiliary_loss_mlp": 0.01051235, + "balance_loss_clip": 1.04648936, + "balance_loss_mlp": 1.0323168, + "epoch": 0.2401923944085375, + "flos": 25557480921600.0, + "grad_norm": 1.7819538389347498, + "language_loss": 0.78550023, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.80721372, + "num_input_tokens_seen": 85982710, + "step": 3995, + "time_per_iteration": 2.8184118270874023 + }, + { + "auxiliary_loss_clip": 0.01034, + "auxiliary_loss_mlp": 0.01026353, + "balance_loss_clip": 1.02876425, + "balance_loss_mlp": 1.0237658, + "epoch": 0.24025251766120548, + "flos": 54941138478720.0, + "grad_norm": 0.9088717203971356, + "language_loss": 0.6345036, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.65510708, + "num_input_tokens_seen": 86046935, + "step": 3996, + "time_per_iteration": 3.304704189300537 + }, + { + "auxiliary_loss_clip": 0.01122635, + "auxiliary_loss_mlp": 0.01046678, + "balance_loss_clip": 1.04812241, + "balance_loss_mlp": 1.02960706, + "epoch": 0.24031264091387344, + "flos": 20631075724800.0, + "grad_norm": 2.5673853359086403, + "language_loss": 0.69455099, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.7162441, + "num_input_tokens_seen": 86064355, + "step": 3997, + "time_per_iteration": 2.6638269424438477 + }, + { + "auxiliary_loss_clip": 0.01136246, + "auxiliary_loss_mlp": 0.01041204, + "balance_loss_clip": 1.0500989, + "balance_loss_mlp": 1.02390659, + "epoch": 0.2403727641665414, + "flos": 20886256920960.0, + "grad_norm": 1.9944619018673675, + "language_loss": 0.87352818, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.89530265, + "num_input_tokens_seen": 86081340, + "step": 3998, + "time_per_iteration": 2.6837756633758545 + }, + { + "auxiliary_loss_clip": 0.01126262, + "auxiliary_loss_mlp": 0.01038814, + "balance_loss_clip": 1.04337883, + "balance_loss_mlp": 1.02173114, + "epoch": 0.24043288741920937, + "flos": 22820046451200.0, + "grad_norm": 1.5798261831400109, + "language_loss": 0.75723118, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.77888191, + "num_input_tokens_seen": 86102260, + "step": 3999, + "time_per_iteration": 2.659717321395874 + }, + { + "auxiliary_loss_clip": 0.01116532, + "auxiliary_loss_mlp": 0.01049627, + "balance_loss_clip": 1.04679537, + "balance_loss_mlp": 1.03259242, + "epoch": 0.24049301067187734, + "flos": 27959652823680.0, + "grad_norm": 2.380373207595884, + "language_loss": 0.72602308, + "learning_rate": 3.552938912398679e-06, + "loss": 0.74768472, + "num_input_tokens_seen": 86123400, + "step": 4000, + "time_per_iteration": 4.285717487335205 + }, + { + "auxiliary_loss_clip": 0.01138397, + "auxiliary_loss_mlp": 0.01040819, + "balance_loss_clip": 1.05207551, + "balance_loss_mlp": 1.02389169, + "epoch": 0.24055313392454533, + "flos": 27451409333760.0, + "grad_norm": 2.3105318706157862, + "language_loss": 0.67128104, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.69307321, + "num_input_tokens_seen": 86144060, + "step": 4001, + "time_per_iteration": 4.2180609703063965 + }, + { + "auxiliary_loss_clip": 0.01144863, + "auxiliary_loss_mlp": 0.01043304, + "balance_loss_clip": 1.04859209, + "balance_loss_mlp": 1.02525568, + "epoch": 0.2406132571772133, + "flos": 25556618995200.0, + "grad_norm": 2.360624564793828, + "language_loss": 0.82895994, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.85084158, + "num_input_tokens_seen": 86163005, + "step": 4002, + "time_per_iteration": 2.6369640827178955 + }, + { + "auxiliary_loss_clip": 0.01106477, + "auxiliary_loss_mlp": 0.01045072, + "balance_loss_clip": 1.0493201, + "balance_loss_mlp": 1.0283823, + "epoch": 0.24067338042988126, + "flos": 24791398629120.0, + "grad_norm": 2.016027139567785, + "language_loss": 0.83058953, + "learning_rate": 3.552202383898897e-06, + "loss": 0.85210502, + "num_input_tokens_seen": 86182580, + "step": 4003, + "time_per_iteration": 4.312098979949951 + }, + { + "auxiliary_loss_clip": 0.01114745, + "auxiliary_loss_mlp": 0.01042117, + "balance_loss_clip": 1.0474503, + "balance_loss_mlp": 1.02458131, + "epoch": 0.24073350368254923, + "flos": 21177923356800.0, + "grad_norm": 1.971328156333658, + "language_loss": 0.8672772, + "learning_rate": 3.551956756667215e-06, + "loss": 0.8888458, + "num_input_tokens_seen": 86200665, + "step": 4004, + "time_per_iteration": 2.646578311920166 + }, + { + "auxiliary_loss_clip": 0.01115631, + "auxiliary_loss_mlp": 0.01054344, + "balance_loss_clip": 1.04529011, + "balance_loss_mlp": 1.03736866, + "epoch": 0.2407936269352172, + "flos": 22494300986880.0, + "grad_norm": 1.9965130860947515, + "language_loss": 0.78239757, + "learning_rate": 3.551711070585177e-06, + "loss": 0.80409735, + "num_input_tokens_seen": 86221640, + "step": 4005, + "time_per_iteration": 2.7220566272735596 + }, + { + "auxiliary_loss_clip": 0.01090518, + "auxiliary_loss_mlp": 0.01039515, + "balance_loss_clip": 1.04414058, + "balance_loss_mlp": 1.02164578, + "epoch": 0.24085375018788516, + "flos": 18551129754240.0, + "grad_norm": 1.6390993289809686, + "language_loss": 0.79391652, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.8152169, + "num_input_tokens_seen": 86240795, + "step": 4006, + "time_per_iteration": 2.7188642024993896 + }, + { + "auxiliary_loss_clip": 0.01130191, + "auxiliary_loss_mlp": 0.00777161, + "balance_loss_clip": 1.0482645, + "balance_loss_mlp": 1.00115335, + "epoch": 0.24091387344055312, + "flos": 24170539023360.0, + "grad_norm": 1.6765272633695874, + "language_loss": 0.71939242, + "learning_rate": 3.551219521907302e-06, + "loss": 0.73846585, + "num_input_tokens_seen": 86262000, + "step": 4007, + "time_per_iteration": 4.3504638671875 + }, + { + "auxiliary_loss_clip": 0.01101925, + "auxiliary_loss_mlp": 0.01047677, + "balance_loss_clip": 1.04589975, + "balance_loss_mlp": 1.03132153, + "epoch": 0.24097399669322112, + "flos": 11036319615360.0, + "grad_norm": 1.6891966370612705, + "language_loss": 0.76460171, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.78609765, + "num_input_tokens_seen": 86279680, + "step": 4008, + "time_per_iteration": 2.700744152069092 + }, + { + "auxiliary_loss_clip": 0.01136495, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.05069256, + "balance_loss_mlp": 1.02192402, + "epoch": 0.24103411994588908, + "flos": 17165085696000.0, + "grad_norm": 2.427830882471808, + "language_loss": 0.74601823, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.76777172, + "num_input_tokens_seen": 86297180, + "step": 4009, + "time_per_iteration": 2.6175808906555176 + }, + { + "auxiliary_loss_clip": 0.01134079, + "auxiliary_loss_mlp": 0.01041957, + "balance_loss_clip": 1.05032861, + "balance_loss_mlp": 1.02532756, + "epoch": 0.24109424319855705, + "flos": 20667956014080.0, + "grad_norm": 1.6643292794637636, + "language_loss": 0.80064976, + "learning_rate": 3.550481757745804e-06, + "loss": 0.82241005, + "num_input_tokens_seen": 86317660, + "step": 4010, + "time_per_iteration": 2.680511236190796 + }, + { + "auxiliary_loss_clip": 0.01118599, + "auxiliary_loss_mlp": 0.01047241, + "balance_loss_clip": 1.04658401, + "balance_loss_mlp": 1.02779818, + "epoch": 0.241154366451225, + "flos": 28181796485760.0, + "grad_norm": 3.8737422865874245, + "language_loss": 0.70889425, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.73055267, + "num_input_tokens_seen": 86338325, + "step": 4011, + "time_per_iteration": 2.716404676437378 + }, + { + "auxiliary_loss_clip": 0.01065208, + "auxiliary_loss_mlp": 0.01047099, + "balance_loss_clip": 1.0414176, + "balance_loss_mlp": 1.02802527, + "epoch": 0.24121448970389298, + "flos": 21689722293120.0, + "grad_norm": 1.675052333388822, + "language_loss": 0.69279736, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.71392041, + "num_input_tokens_seen": 86357615, + "step": 4012, + "time_per_iteration": 2.804694890975952 + }, + { + "auxiliary_loss_clip": 0.01138123, + "auxiliary_loss_mlp": 0.0104149, + "balance_loss_clip": 1.05126536, + "balance_loss_mlp": 1.02213097, + "epoch": 0.24127461295656094, + "flos": 39676191269760.0, + "grad_norm": 1.5084253296098848, + "language_loss": 0.732813, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.75460911, + "num_input_tokens_seen": 86380355, + "step": 4013, + "time_per_iteration": 2.8192849159240723 + }, + { + "auxiliary_loss_clip": 0.01148497, + "auxiliary_loss_mlp": 0.01037798, + "balance_loss_clip": 1.05201018, + "balance_loss_mlp": 1.02044141, + "epoch": 0.2413347362092289, + "flos": 19135863256320.0, + "grad_norm": 1.8372553923739565, + "language_loss": 0.88272971, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.90459263, + "num_input_tokens_seen": 86399125, + "step": 4014, + "time_per_iteration": 2.6029160022735596 + }, + { + "auxiliary_loss_clip": 0.0111397, + "auxiliary_loss_mlp": 0.01046282, + "balance_loss_clip": 1.04315281, + "balance_loss_mlp": 1.0278163, + "epoch": 0.2413948594618969, + "flos": 26939430829440.0, + "grad_norm": 1.9589493379590102, + "language_loss": 0.94862974, + "learning_rate": 3.549250975045952e-06, + "loss": 0.97023225, + "num_input_tokens_seen": 86418625, + "step": 4015, + "time_per_iteration": 2.6958773136138916 + }, + { + "auxiliary_loss_clip": 0.01120117, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.04570341, + "balance_loss_mlp": 1.02331638, + "epoch": 0.24145498271456486, + "flos": 25228108183680.0, + "grad_norm": 1.5486712647521637, + "language_loss": 0.8271699, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.84878188, + "num_input_tokens_seen": 86438375, + "step": 4016, + "time_per_iteration": 2.7045071125030518 + }, + { + "auxiliary_loss_clip": 0.01098573, + "auxiliary_loss_mlp": 0.01045564, + "balance_loss_clip": 1.04334974, + "balance_loss_mlp": 1.02838039, + "epoch": 0.24151510596723283, + "flos": 40661759617920.0, + "grad_norm": 1.8022012115417119, + "language_loss": 0.69207114, + "learning_rate": 3.54875825066639e-06, + "loss": 0.71351254, + "num_input_tokens_seen": 86463230, + "step": 4017, + "time_per_iteration": 2.8596649169921875 + }, + { + "auxiliary_loss_clip": 0.01141299, + "auxiliary_loss_mlp": 0.01051243, + "balance_loss_clip": 1.05106175, + "balance_loss_mlp": 1.03278995, + "epoch": 0.2415752292199008, + "flos": 18146667634560.0, + "grad_norm": 1.6419835865444041, + "language_loss": 0.84953403, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.87145936, + "num_input_tokens_seen": 86481230, + "step": 4018, + "time_per_iteration": 2.627629518508911 + }, + { + "auxiliary_loss_clip": 0.01046489, + "auxiliary_loss_mlp": 0.01014362, + "balance_loss_clip": 1.02139664, + "balance_loss_mlp": 1.01140559, + "epoch": 0.24163535247256876, + "flos": 67288409792640.0, + "grad_norm": 0.8221446343976555, + "language_loss": 0.60642469, + "learning_rate": 3.548265291370558e-06, + "loss": 0.62703323, + "num_input_tokens_seen": 86541260, + "step": 4019, + "time_per_iteration": 3.269498586654663 + }, + { + "auxiliary_loss_clip": 0.01114983, + "auxiliary_loss_mlp": 0.01049089, + "balance_loss_clip": 1.04582107, + "balance_loss_mlp": 1.0312674, + "epoch": 0.24169547572523672, + "flos": 24929941386240.0, + "grad_norm": 1.8826005215725077, + "language_loss": 0.73324752, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.75488818, + "num_input_tokens_seen": 86559580, + "step": 4020, + "time_per_iteration": 2.7341055870056152 + }, + { + "auxiliary_loss_clip": 0.01111064, + "auxiliary_loss_mlp": 0.01040515, + "balance_loss_clip": 1.04833841, + "balance_loss_mlp": 1.02315772, + "epoch": 0.24175559897790472, + "flos": 18728312567040.0, + "grad_norm": 1.7964731743776612, + "language_loss": 0.81617332, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.83768916, + "num_input_tokens_seen": 86577560, + "step": 4021, + "time_per_iteration": 2.7154345512390137 + }, + { + "auxiliary_loss_clip": 0.01149117, + "auxiliary_loss_mlp": 0.01050015, + "balance_loss_clip": 1.04972911, + "balance_loss_mlp": 1.03070307, + "epoch": 0.24181572223057268, + "flos": 23039281111680.0, + "grad_norm": 2.078765142897874, + "language_loss": 0.76601863, + "learning_rate": 3.547525412122378e-06, + "loss": 0.78800994, + "num_input_tokens_seen": 86595350, + "step": 4022, + "time_per_iteration": 2.622262716293335 + }, + { + "auxiliary_loss_clip": 0.01102927, + "auxiliary_loss_mlp": 0.01053151, + "balance_loss_clip": 1.042714, + "balance_loss_mlp": 1.03271914, + "epoch": 0.24187584548324065, + "flos": 20376145923840.0, + "grad_norm": 1.7360501926549048, + "language_loss": 0.75283015, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.774391, + "num_input_tokens_seen": 86614805, + "step": 4023, + "time_per_iteration": 2.7339353561401367 + }, + { + "auxiliary_loss_clip": 0.01121416, + "auxiliary_loss_mlp": 0.01047921, + "balance_loss_clip": 1.04916334, + "balance_loss_mlp": 1.0309217, + "epoch": 0.2419359687359086, + "flos": 21397517153280.0, + "grad_norm": 2.4319797200103466, + "language_loss": 0.82542646, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.84711981, + "num_input_tokens_seen": 86633700, + "step": 4024, + "time_per_iteration": 2.6887242794036865 + }, + { + "auxiliary_loss_clip": 0.01133297, + "auxiliary_loss_mlp": 0.01047865, + "balance_loss_clip": 1.05029452, + "balance_loss_mlp": 1.03038907, + "epoch": 0.24199609198857658, + "flos": 18369385914240.0, + "grad_norm": 1.7776330743080708, + "language_loss": 0.85974258, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.88155425, + "num_input_tokens_seen": 86650905, + "step": 4025, + "time_per_iteration": 2.7049782276153564 + }, + { + "auxiliary_loss_clip": 0.01092706, + "auxiliary_loss_mlp": 0.01064486, + "balance_loss_clip": 1.04161918, + "balance_loss_mlp": 1.04456651, + "epoch": 0.24205621524124454, + "flos": 19463871277440.0, + "grad_norm": 1.8800874250001207, + "language_loss": 0.71681315, + "learning_rate": 3.546538084949365e-06, + "loss": 0.73838508, + "num_input_tokens_seen": 86669185, + "step": 4026, + "time_per_iteration": 2.7773284912109375 + }, + { + "auxiliary_loss_clip": 0.01135992, + "auxiliary_loss_mlp": 0.01046992, + "balance_loss_clip": 1.05109096, + "balance_loss_mlp": 1.03088713, + "epoch": 0.2421163384939125, + "flos": 14976330451200.0, + "grad_norm": 1.967847260356932, + "language_loss": 0.64436764, + "learning_rate": 3.546291106520509e-06, + "loss": 0.66619748, + "num_input_tokens_seen": 86686805, + "step": 4027, + "time_per_iteration": 2.6143524646759033 + }, + { + "auxiliary_loss_clip": 0.01136637, + "auxiliary_loss_mlp": 0.00775283, + "balance_loss_clip": 1.05106425, + "balance_loss_mlp": 1.00103092, + "epoch": 0.2421764617465805, + "flos": 18662057930880.0, + "grad_norm": 3.6118562291520813, + "language_loss": 0.70909715, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.72821641, + "num_input_tokens_seen": 86705520, + "step": 4028, + "time_per_iteration": 2.656334400177002 + }, + { + "auxiliary_loss_clip": 0.01053475, + "auxiliary_loss_mlp": 0.01050053, + "balance_loss_clip": 1.02715707, + "balance_loss_mlp": 1.04756165, + "epoch": 0.24223658499924847, + "flos": 64347327164160.0, + "grad_norm": 0.865443083354021, + "language_loss": 0.55302447, + "learning_rate": 3.545796973765623e-06, + "loss": 0.57405978, + "num_input_tokens_seen": 86767320, + "step": 4029, + "time_per_iteration": 3.1736607551574707 + }, + { + "auxiliary_loss_clip": 0.0113268, + "auxiliary_loss_mlp": 0.01051074, + "balance_loss_clip": 1.04679179, + "balance_loss_mlp": 1.03252554, + "epoch": 0.24229670825191643, + "flos": 25775243124480.0, + "grad_norm": 1.6290009052774777, + "language_loss": 0.74065894, + "learning_rate": 3.54554981945833e-06, + "loss": 0.76249647, + "num_input_tokens_seen": 86788110, + "step": 4030, + "time_per_iteration": 2.644153118133545 + }, + { + "auxiliary_loss_clip": 0.01146282, + "auxiliary_loss_mlp": 0.01053008, + "balance_loss_clip": 1.04945433, + "balance_loss_mlp": 1.03495932, + "epoch": 0.2423568315045844, + "flos": 20667094087680.0, + "grad_norm": 2.044571760348203, + "language_loss": 0.76492965, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.78692257, + "num_input_tokens_seen": 86807640, + "step": 4031, + "time_per_iteration": 2.608718156814575 + }, + { + "auxiliary_loss_clip": 0.01130345, + "auxiliary_loss_mlp": 0.00776083, + "balance_loss_clip": 1.04857934, + "balance_loss_mlp": 1.00130129, + "epoch": 0.24241695475725236, + "flos": 22416805393920.0, + "grad_norm": 2.367928778009572, + "language_loss": 0.65578043, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.67484468, + "num_input_tokens_seen": 86826795, + "step": 4032, + "time_per_iteration": 2.713796377182007 + }, + { + "auxiliary_loss_clip": 0.01128183, + "auxiliary_loss_mlp": 0.0104339, + "balance_loss_clip": 1.04551542, + "balance_loss_mlp": 1.02591443, + "epoch": 0.24247707800992033, + "flos": 17128995505920.0, + "grad_norm": 2.055558599382263, + "language_loss": 0.81589901, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.83761466, + "num_input_tokens_seen": 86843175, + "step": 4033, + "time_per_iteration": 2.6381332874298096 + }, + { + "auxiliary_loss_clip": 0.01101134, + "auxiliary_loss_mlp": 0.01042507, + "balance_loss_clip": 1.04264998, + "balance_loss_mlp": 1.02450657, + "epoch": 0.2425372012625883, + "flos": 31613743399680.0, + "grad_norm": 2.655330103252085, + "language_loss": 0.68830204, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.70973849, + "num_input_tokens_seen": 86863185, + "step": 4034, + "time_per_iteration": 2.8269567489624023 + }, + { + "auxiliary_loss_clip": 0.01129717, + "auxiliary_loss_mlp": 0.01036472, + "balance_loss_clip": 1.05142426, + "balance_loss_mlp": 1.01839972, + "epoch": 0.24259732451525629, + "flos": 16326032924160.0, + "grad_norm": 2.305872962411053, + "language_loss": 0.96432853, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.98599035, + "num_input_tokens_seen": 86880040, + "step": 4035, + "time_per_iteration": 2.687131643295288 + }, + { + "auxiliary_loss_clip": 0.01116249, + "auxiliary_loss_mlp": 0.01051012, + "balance_loss_clip": 1.0467937, + "balance_loss_mlp": 1.03419125, + "epoch": 0.24265744776792425, + "flos": 22856639431680.0, + "grad_norm": 1.5931877581057647, + "language_loss": 0.7820307, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.80370331, + "num_input_tokens_seen": 86900610, + "step": 4036, + "time_per_iteration": 2.7576112747192383 + }, + { + "auxiliary_loss_clip": 0.01137826, + "auxiliary_loss_mlp": 0.01049747, + "balance_loss_clip": 1.05010104, + "balance_loss_mlp": 1.03141224, + "epoch": 0.24271757102059222, + "flos": 21871573873920.0, + "grad_norm": 1.6332934168141529, + "language_loss": 0.74266672, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.76454246, + "num_input_tokens_seen": 86919385, + "step": 4037, + "time_per_iteration": 2.7860629558563232 + }, + { + "auxiliary_loss_clip": 0.01100993, + "auxiliary_loss_mlp": 0.01042879, + "balance_loss_clip": 1.04173183, + "balance_loss_mlp": 1.02453303, + "epoch": 0.24277769427326018, + "flos": 19208582340480.0, + "grad_norm": 8.14050816007968, + "language_loss": 0.76632005, + "learning_rate": 3.543570475921171e-06, + "loss": 0.78775871, + "num_input_tokens_seen": 86938885, + "step": 4038, + "time_per_iteration": 2.691695213317871 + }, + { + "auxiliary_loss_clip": 0.01129874, + "auxiliary_loss_mlp": 0.01043604, + "balance_loss_clip": 1.04768467, + "balance_loss_mlp": 1.0249598, + "epoch": 0.24283781752592815, + "flos": 19499889640320.0, + "grad_norm": 3.2334161052349817, + "language_loss": 0.71992457, + "learning_rate": 3.543322794484905e-06, + "loss": 0.7416594, + "num_input_tokens_seen": 86957705, + "step": 4039, + "time_per_iteration": 4.128135442733765 + }, + { + "auxiliary_loss_clip": 0.0112766, + "auxiliary_loss_mlp": 0.01048109, + "balance_loss_clip": 1.04597354, + "balance_loss_mlp": 1.02921474, + "epoch": 0.2428979407785961, + "flos": 19902196944000.0, + "grad_norm": 1.6158763194283545, + "language_loss": 0.78655136, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.80830908, + "num_input_tokens_seen": 86975845, + "step": 4040, + "time_per_iteration": 4.174723863601685 + }, + { + "auxiliary_loss_clip": 0.01090567, + "auxiliary_loss_mlp": 0.01038965, + "balance_loss_clip": 1.04526615, + "balance_loss_mlp": 1.02268124, + "epoch": 0.2429580640312641, + "flos": 24715878284160.0, + "grad_norm": 2.432557236688664, + "language_loss": 0.80599713, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.8272925, + "num_input_tokens_seen": 86994800, + "step": 4041, + "time_per_iteration": 2.7933273315429688 + }, + { + "auxiliary_loss_clip": 0.01108653, + "auxiliary_loss_mlp": 0.01044101, + "balance_loss_clip": 1.04587245, + "balance_loss_mlp": 1.02733982, + "epoch": 0.24301818728393207, + "flos": 25630343660160.0, + "grad_norm": 1.9967913274059828, + "language_loss": 0.76708287, + "learning_rate": 3.542579399075957e-06, + "loss": 0.78861034, + "num_input_tokens_seen": 87016845, + "step": 4042, + "time_per_iteration": 4.336673021316528 + }, + { + "auxiliary_loss_clip": 0.01056541, + "auxiliary_loss_mlp": 0.01035377, + "balance_loss_clip": 1.04354727, + "balance_loss_mlp": 1.01928389, + "epoch": 0.24307831053660003, + "flos": 26141388410880.0, + "grad_norm": 1.8431659047813937, + "language_loss": 0.81232125, + "learning_rate": 3.542331483604246e-06, + "loss": 0.83324039, + "num_input_tokens_seen": 87036270, + "step": 4043, + "time_per_iteration": 2.9156856536865234 + }, + { + "auxiliary_loss_clip": 0.01126576, + "auxiliary_loss_mlp": 0.01038857, + "balance_loss_clip": 1.04610896, + "balance_loss_mlp": 1.02012897, + "epoch": 0.243138433789268, + "flos": 14972415868800.0, + "grad_norm": 2.052349433785912, + "language_loss": 0.73095596, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.75261033, + "num_input_tokens_seen": 87049920, + "step": 4044, + "time_per_iteration": 2.6324286460876465 + }, + { + "auxiliary_loss_clip": 0.0113453, + "auxiliary_loss_mlp": 0.01042417, + "balance_loss_clip": 1.04967666, + "balance_loss_mlp": 1.02445269, + "epoch": 0.24319855704193596, + "flos": 25191694771200.0, + "grad_norm": 1.8848950918191658, + "language_loss": 0.83676481, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.85853434, + "num_input_tokens_seen": 87068230, + "step": 4045, + "time_per_iteration": 2.68994402885437 + }, + { + "auxiliary_loss_clip": 0.010753, + "auxiliary_loss_mlp": 0.01047988, + "balance_loss_clip": 1.04608011, + "balance_loss_mlp": 1.03117943, + "epoch": 0.24325868029460393, + "flos": 22127221946880.0, + "grad_norm": 1.9701839557075844, + "language_loss": 0.86895847, + "learning_rate": 3.541587386314541e-06, + "loss": 0.89019132, + "num_input_tokens_seen": 87086435, + "step": 4046, + "time_per_iteration": 2.908737897872925 + }, + { + "auxiliary_loss_clip": 0.01120714, + "auxiliary_loss_mlp": 0.01038682, + "balance_loss_clip": 1.04705977, + "balance_loss_mlp": 1.02070522, + "epoch": 0.2433188035472719, + "flos": 23582106420480.0, + "grad_norm": 1.8855160425980928, + "language_loss": 0.72759771, + "learning_rate": 3.5413392369578e-06, + "loss": 0.74919164, + "num_input_tokens_seen": 87105340, + "step": 4047, + "time_per_iteration": 4.310218095779419 + }, + { + "auxiliary_loss_clip": 0.01124014, + "auxiliary_loss_mlp": 0.01045256, + "balance_loss_clip": 1.04447186, + "balance_loss_mlp": 1.02637279, + "epoch": 0.2433789267999399, + "flos": 24462815990400.0, + "grad_norm": 2.592486480291502, + "language_loss": 0.73029542, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.75198811, + "num_input_tokens_seen": 87125780, + "step": 4048, + "time_per_iteration": 2.699544668197632 + }, + { + "auxiliary_loss_clip": 0.01112707, + "auxiliary_loss_mlp": 0.01045312, + "balance_loss_clip": 1.04923105, + "balance_loss_mlp": 1.02869391, + "epoch": 0.24343905005260785, + "flos": 16727909264640.0, + "grad_norm": 1.921127999919884, + "language_loss": 0.73616529, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.7577455, + "num_input_tokens_seen": 87144470, + "step": 4049, + "time_per_iteration": 2.6988370418548584 + }, + { + "auxiliary_loss_clip": 0.01093349, + "auxiliary_loss_mlp": 0.01041657, + "balance_loss_clip": 1.04289758, + "balance_loss_mlp": 1.02583802, + "epoch": 0.24349917330527582, + "flos": 20043756443520.0, + "grad_norm": 2.073976648883723, + "language_loss": 0.7377705, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.75912058, + "num_input_tokens_seen": 87162830, + "step": 4050, + "time_per_iteration": 2.718212604522705 + }, + { + "auxiliary_loss_clip": 0.01116995, + "auxiliary_loss_mlp": 0.01043968, + "balance_loss_clip": 1.04518783, + "balance_loss_mlp": 1.02800608, + "epoch": 0.24355929655794378, + "flos": 17420554200960.0, + "grad_norm": 2.361179977901575, + "language_loss": 0.75518602, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.77679563, + "num_input_tokens_seen": 87180905, + "step": 4051, + "time_per_iteration": 2.6522655487060547 + }, + { + "auxiliary_loss_clip": 0.01092567, + "auxiliary_loss_mlp": 0.01042511, + "balance_loss_clip": 1.04197812, + "balance_loss_mlp": 1.02507067, + "epoch": 0.24361941981061175, + "flos": 25410929431680.0, + "grad_norm": 2.2644912923037985, + "language_loss": 0.70717591, + "learning_rate": 3.540097613646296e-06, + "loss": 0.72852671, + "num_input_tokens_seen": 87202290, + "step": 4052, + "time_per_iteration": 2.794059991836548 + }, + { + "auxiliary_loss_clip": 0.0111622, + "auxiliary_loss_mlp": 0.01045494, + "balance_loss_clip": 1.04823005, + "balance_loss_mlp": 1.02833986, + "epoch": 0.2436795430632797, + "flos": 22820800636800.0, + "grad_norm": 1.7022998331113812, + "language_loss": 0.80989587, + "learning_rate": 3.539849113744351e-06, + "loss": 0.83151299, + "num_input_tokens_seen": 87221650, + "step": 4053, + "time_per_iteration": 2.682805299758911 + }, + { + "auxiliary_loss_clip": 0.01148244, + "auxiliary_loss_mlp": 0.01038109, + "balance_loss_clip": 1.05124915, + "balance_loss_mlp": 1.0210743, + "epoch": 0.2437396663159477, + "flos": 15157786982400.0, + "grad_norm": 1.5338885161808513, + "language_loss": 0.77628779, + "learning_rate": 3.539600555451172e-06, + "loss": 0.79815125, + "num_input_tokens_seen": 87238515, + "step": 4054, + "time_per_iteration": 2.635181427001953 + }, + { + "auxiliary_loss_clip": 0.01095192, + "auxiliary_loss_mlp": 0.01055244, + "balance_loss_clip": 1.04067969, + "balance_loss_mlp": 1.03783989, + "epoch": 0.24379978956861567, + "flos": 22091131756800.0, + "grad_norm": 1.8808929031646056, + "language_loss": 0.84398115, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.86548549, + "num_input_tokens_seen": 87256290, + "step": 4055, + "time_per_iteration": 2.757601261138916 + }, + { + "auxiliary_loss_clip": 0.01110063, + "auxiliary_loss_mlp": 0.01045315, + "balance_loss_clip": 1.04298997, + "balance_loss_mlp": 1.02767169, + "epoch": 0.24385991282128364, + "flos": 31467766527360.0, + "grad_norm": 2.5636936013515776, + "language_loss": 0.55038011, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.57193393, + "num_input_tokens_seen": 87277085, + "step": 4056, + "time_per_iteration": 2.7788894176483154 + }, + { + "auxiliary_loss_clip": 0.0113756, + "auxiliary_loss_mlp": 0.01046233, + "balance_loss_clip": 1.04897046, + "balance_loss_mlp": 1.02876842, + "epoch": 0.2439200360739516, + "flos": 23838795987840.0, + "grad_norm": 2.64902132986976, + "language_loss": 0.80583262, + "learning_rate": 3.538854530318506e-06, + "loss": 0.82767057, + "num_input_tokens_seen": 87293020, + "step": 4057, + "time_per_iteration": 2.78110671043396 + }, + { + "auxiliary_loss_clip": 0.01132987, + "auxiliary_loss_mlp": 0.01048497, + "balance_loss_clip": 1.04877245, + "balance_loss_mlp": 1.03145027, + "epoch": 0.24398015932661957, + "flos": 19169978198400.0, + "grad_norm": 1.8133503864036424, + "language_loss": 0.79202968, + "learning_rate": 3.538605738554673e-06, + "loss": 0.81384456, + "num_input_tokens_seen": 87311445, + "step": 4058, + "time_per_iteration": 2.6609115600585938 + }, + { + "auxiliary_loss_clip": 0.01147749, + "auxiliary_loss_mlp": 0.01045059, + "balance_loss_clip": 1.04827118, + "balance_loss_mlp": 1.02920449, + "epoch": 0.24404028257928753, + "flos": 25262474520960.0, + "grad_norm": 3.3482411666646086, + "language_loss": 0.85503888, + "learning_rate": 3.538356888446756e-06, + "loss": 0.87696695, + "num_input_tokens_seen": 87332055, + "step": 4059, + "time_per_iteration": 2.724241256713867 + }, + { + "auxiliary_loss_clip": 0.01126127, + "auxiliary_loss_mlp": 0.01038967, + "balance_loss_clip": 1.04837418, + "balance_loss_mlp": 1.02296889, + "epoch": 0.2441004058319555, + "flos": 26467600752000.0, + "grad_norm": 2.2060888459440617, + "language_loss": 0.7483452, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.76999605, + "num_input_tokens_seen": 87351295, + "step": 4060, + "time_per_iteration": 2.6769304275512695 + }, + { + "auxiliary_loss_clip": 0.01111679, + "auxiliary_loss_mlp": 0.01051445, + "balance_loss_clip": 1.04629493, + "balance_loss_mlp": 1.03247917, + "epoch": 0.2441605290846235, + "flos": 26760524163840.0, + "grad_norm": 2.624850134940939, + "language_loss": 0.73482168, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.75645292, + "num_input_tokens_seen": 87370650, + "step": 4061, + "time_per_iteration": 2.7570559978485107 + }, + { + "auxiliary_loss_clip": 0.01144554, + "auxiliary_loss_mlp": 0.01039707, + "balance_loss_clip": 1.05180097, + "balance_loss_mlp": 1.02394772, + "epoch": 0.24422065233729146, + "flos": 21105850717440.0, + "grad_norm": 4.11905418985837, + "language_loss": 0.76135921, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.78320187, + "num_input_tokens_seen": 87389020, + "step": 4062, + "time_per_iteration": 2.6387689113616943 + }, + { + "auxiliary_loss_clip": 0.01104974, + "auxiliary_loss_mlp": 0.01041222, + "balance_loss_clip": 1.04618907, + "balance_loss_mlp": 1.02458024, + "epoch": 0.24428077558995942, + "flos": 25263156879360.0, + "grad_norm": 2.5628995075758954, + "language_loss": 0.85376853, + "learning_rate": 3.537360904763011e-06, + "loss": 0.87523055, + "num_input_tokens_seen": 87409695, + "step": 4063, + "time_per_iteration": 2.7785301208496094 + }, + { + "auxiliary_loss_clip": 0.01119987, + "auxiliary_loss_mlp": 0.01047158, + "balance_loss_clip": 1.04776239, + "balance_loss_mlp": 1.02789354, + "epoch": 0.24434089884262739, + "flos": 20485278420480.0, + "grad_norm": 2.760332484942286, + "language_loss": 0.6845879, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.70625937, + "num_input_tokens_seen": 87428250, + "step": 4064, + "time_per_iteration": 2.6691763401031494 + }, + { + "auxiliary_loss_clip": 0.01138225, + "auxiliary_loss_mlp": 0.01046639, + "balance_loss_clip": 1.04773867, + "balance_loss_mlp": 1.02892423, + "epoch": 0.24440102209529535, + "flos": 23621895711360.0, + "grad_norm": 1.603702751214229, + "language_loss": 0.70247531, + "learning_rate": 3.536862563102088e-06, + "loss": 0.72432399, + "num_input_tokens_seen": 87449380, + "step": 4065, + "time_per_iteration": 2.6677680015563965 + }, + { + "auxiliary_loss_clip": 0.01150465, + "auxiliary_loss_mlp": 0.0104697, + "balance_loss_clip": 1.05127215, + "balance_loss_mlp": 1.02803993, + "epoch": 0.24446114534796332, + "flos": 20554729367040.0, + "grad_norm": 1.788543447431289, + "language_loss": 0.84282506, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.86479944, + "num_input_tokens_seen": 87465365, + "step": 4066, + "time_per_iteration": 2.5993456840515137 + }, + { + "auxiliary_loss_clip": 0.01067736, + "auxiliary_loss_mlp": 0.01002523, + "balance_loss_clip": 1.03198457, + "balance_loss_mlp": 1.00028193, + "epoch": 0.24452126860063128, + "flos": 60389575009920.0, + "grad_norm": 0.7359455307187547, + "language_loss": 0.52283657, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.54353911, + "num_input_tokens_seen": 87522525, + "step": 4067, + "time_per_iteration": 3.056666374206543 + }, + { + "auxiliary_loss_clip": 0.01123042, + "auxiliary_loss_mlp": 0.01045731, + "balance_loss_clip": 1.04955244, + "balance_loss_mlp": 1.0279212, + "epoch": 0.24458139185329927, + "flos": 15121660878720.0, + "grad_norm": 2.6392300526537493, + "language_loss": 0.7185899, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.74027765, + "num_input_tokens_seen": 87539170, + "step": 4068, + "time_per_iteration": 2.700847864151001 + }, + { + "auxiliary_loss_clip": 0.01086004, + "auxiliary_loss_mlp": 0.01047493, + "balance_loss_clip": 1.04378593, + "balance_loss_mlp": 1.02920675, + "epoch": 0.24464151510596724, + "flos": 27998723842560.0, + "grad_norm": 2.4202919064349744, + "language_loss": 0.78083313, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.80216813, + "num_input_tokens_seen": 87558875, + "step": 4069, + "time_per_iteration": 2.9363162517547607 + }, + { + "auxiliary_loss_clip": 0.01119666, + "auxiliary_loss_mlp": 0.0105204, + "balance_loss_clip": 1.05164659, + "balance_loss_mlp": 1.03445613, + "epoch": 0.2447016383586352, + "flos": 19792884879360.0, + "grad_norm": 4.167143793475273, + "language_loss": 0.80607939, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.82779646, + "num_input_tokens_seen": 87576485, + "step": 4070, + "time_per_iteration": 2.658191204071045 + }, + { + "auxiliary_loss_clip": 0.01127014, + "auxiliary_loss_mlp": 0.01049283, + "balance_loss_clip": 1.04832387, + "balance_loss_mlp": 1.03218853, + "epoch": 0.24476176161130317, + "flos": 26067340523520.0, + "grad_norm": 1.5316441932107319, + "language_loss": 0.84351504, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.86527801, + "num_input_tokens_seen": 87598620, + "step": 4071, + "time_per_iteration": 2.7118849754333496 + }, + { + "auxiliary_loss_clip": 0.01120333, + "auxiliary_loss_mlp": 0.01057334, + "balance_loss_clip": 1.04778695, + "balance_loss_mlp": 1.03601933, + "epoch": 0.24482188486397113, + "flos": 18843550375680.0, + "grad_norm": 1.8860726044388547, + "language_loss": 0.80115497, + "learning_rate": 3.535116532028798e-06, + "loss": 0.82293165, + "num_input_tokens_seen": 87616595, + "step": 4072, + "time_per_iteration": 2.6662774085998535 + }, + { + "auxiliary_loss_clip": 0.01134806, + "auxiliary_loss_mlp": 0.0104215, + "balance_loss_clip": 1.05156791, + "balance_loss_mlp": 1.02614021, + "epoch": 0.2448820081166391, + "flos": 21251791676160.0, + "grad_norm": 3.990887653020168, + "language_loss": 0.70466423, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.72643375, + "num_input_tokens_seen": 87635755, + "step": 4073, + "time_per_iteration": 2.7366209030151367 + }, + { + "auxiliary_loss_clip": 0.01110472, + "auxiliary_loss_mlp": 0.01047265, + "balance_loss_clip": 1.04666865, + "balance_loss_mlp": 1.03090906, + "epoch": 0.2449421313693071, + "flos": 23950586090880.0, + "grad_norm": 2.943884117668681, + "language_loss": 0.67292917, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.69450659, + "num_input_tokens_seen": 87652885, + "step": 4074, + "time_per_iteration": 2.7158730030059814 + }, + { + "auxiliary_loss_clip": 0.01062567, + "auxiliary_loss_mlp": 0.01002121, + "balance_loss_clip": 1.02741885, + "balance_loss_mlp": 0.99986744, + "epoch": 0.24500225462197506, + "flos": 60687669980160.0, + "grad_norm": 0.8927046346070237, + "language_loss": 0.68608266, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.70672953, + "num_input_tokens_seen": 87713220, + "step": 4075, + "time_per_iteration": 3.2283740043640137 + }, + { + "auxiliary_loss_clip": 0.01146172, + "auxiliary_loss_mlp": 0.01042507, + "balance_loss_clip": 1.05287361, + "balance_loss_mlp": 1.02612722, + "epoch": 0.24506237787464302, + "flos": 26284204886400.0, + "grad_norm": 2.3370219869490563, + "language_loss": 0.79263043, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.81451714, + "num_input_tokens_seen": 87732680, + "step": 4076, + "time_per_iteration": 2.6744346618652344 + }, + { + "auxiliary_loss_clip": 0.01128421, + "auxiliary_loss_mlp": 0.00775989, + "balance_loss_clip": 1.04903293, + "balance_loss_mlp": 1.001266, + "epoch": 0.245122501127311, + "flos": 20552287242240.0, + "grad_norm": 1.816414447330212, + "language_loss": 0.81986046, + "learning_rate": 3.533867620434151e-06, + "loss": 0.83890456, + "num_input_tokens_seen": 87751880, + "step": 4077, + "time_per_iteration": 2.729391098022461 + }, + { + "auxiliary_loss_clip": 0.01148302, + "auxiliary_loss_mlp": 0.01047154, + "balance_loss_clip": 1.05185413, + "balance_loss_mlp": 1.0288794, + "epoch": 0.24518262437997895, + "flos": 29132603447040.0, + "grad_norm": 2.0328430965985045, + "language_loss": 0.62790757, + "learning_rate": 3.533617663584082e-06, + "loss": 0.64986217, + "num_input_tokens_seen": 87771795, + "step": 4078, + "time_per_iteration": 2.694767713546753 + }, + { + "auxiliary_loss_clip": 0.01114498, + "auxiliary_loss_mlp": 0.01039203, + "balance_loss_clip": 1.04953861, + "balance_loss_mlp": 1.02270436, + "epoch": 0.24524274763264692, + "flos": 23476924419840.0, + "grad_norm": 1.5687748074794818, + "language_loss": 0.75811553, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.7796526, + "num_input_tokens_seen": 87793640, + "step": 4079, + "time_per_iteration": 4.288895130157471 + }, + { + "auxiliary_loss_clip": 0.01142871, + "auxiliary_loss_mlp": 0.01047138, + "balance_loss_clip": 1.04899406, + "balance_loss_mlp": 1.02955461, + "epoch": 0.24530287088531488, + "flos": 17201175886080.0, + "grad_norm": 1.8811380892336844, + "language_loss": 0.74537313, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.76727325, + "num_input_tokens_seen": 87812390, + "step": 4080, + "time_per_iteration": 2.683969736099243 + }, + { + "auxiliary_loss_clip": 0.01115604, + "auxiliary_loss_mlp": 0.01041593, + "balance_loss_clip": 1.04717278, + "balance_loss_mlp": 1.02558291, + "epoch": 0.24536299413798288, + "flos": 14867449349760.0, + "grad_norm": 2.2859558621761997, + "language_loss": 0.83389306, + "learning_rate": 3.532867444142186e-06, + "loss": 0.85546505, + "num_input_tokens_seen": 87830640, + "step": 4081, + "time_per_iteration": 2.772573947906494 + }, + { + "auxiliary_loss_clip": 0.01114607, + "auxiliary_loss_mlp": 0.01040674, + "balance_loss_clip": 1.04734826, + "balance_loss_mlp": 1.02473605, + "epoch": 0.24542311739065084, + "flos": 35262051886080.0, + "grad_norm": 1.8658741711896472, + "language_loss": 0.73223484, + "learning_rate": 3.532617254729267e-06, + "loss": 0.7537877, + "num_input_tokens_seen": 87850450, + "step": 4082, + "time_per_iteration": 4.3304970264434814 + }, + { + "auxiliary_loss_clip": 0.01104397, + "auxiliary_loss_mlp": 0.01047151, + "balance_loss_clip": 1.04542649, + "balance_loss_mlp": 1.03163004, + "epoch": 0.2454832406433188, + "flos": 21503130117120.0, + "grad_norm": 1.7143564189307843, + "language_loss": 0.72032338, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.74183893, + "num_input_tokens_seen": 87868810, + "step": 4083, + "time_per_iteration": 2.7463390827178955 + }, + { + "auxiliary_loss_clip": 0.01115479, + "auxiliary_loss_mlp": 0.01048832, + "balance_loss_clip": 1.04441845, + "balance_loss_mlp": 1.02979386, + "epoch": 0.24554336389598677, + "flos": 14756664827520.0, + "grad_norm": 2.556114612666859, + "language_loss": 0.74363655, + "learning_rate": 3.532116701561919e-06, + "loss": 0.76527965, + "num_input_tokens_seen": 87885685, + "step": 4084, + "time_per_iteration": 2.6828086376190186 + }, + { + "auxiliary_loss_clip": 0.01126215, + "auxiliary_loss_mlp": 0.01040078, + "balance_loss_clip": 1.04541206, + "balance_loss_mlp": 1.02269721, + "epoch": 0.24560348714865474, + "flos": 14976402278400.0, + "grad_norm": 2.030442784512354, + "language_loss": 0.85540497, + "learning_rate": 3.531866337826471e-06, + "loss": 0.87706792, + "num_input_tokens_seen": 87903715, + "step": 4085, + "time_per_iteration": 4.236302852630615 + }, + { + "auxiliary_loss_clip": 0.01110493, + "auxiliary_loss_mlp": 0.01046501, + "balance_loss_clip": 1.04634261, + "balance_loss_mlp": 1.02932286, + "epoch": 0.2456636104013227, + "flos": 22675326554880.0, + "grad_norm": 2.028282258660301, + "language_loss": 0.78985649, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.8114264, + "num_input_tokens_seen": 87923375, + "step": 4086, + "time_per_iteration": 2.6638717651367188 + }, + { + "auxiliary_loss_clip": 0.01087456, + "auxiliary_loss_mlp": 0.0104508, + "balance_loss_clip": 1.04792905, + "balance_loss_mlp": 1.02847362, + "epoch": 0.2457237336539907, + "flos": 27417869009280.0, + "grad_norm": 5.7080500305845865, + "language_loss": 0.75053227, + "learning_rate": 3.531365436099496e-06, + "loss": 0.77185762, + "num_input_tokens_seen": 87943115, + "step": 4087, + "time_per_iteration": 2.8027901649475098 + }, + { + "auxiliary_loss_clip": 0.01090549, + "auxiliary_loss_mlp": 0.01045493, + "balance_loss_clip": 1.04807436, + "balance_loss_mlp": 1.02680135, + "epoch": 0.24578385690665866, + "flos": 20412379768320.0, + "grad_norm": 2.066557704160291, + "language_loss": 0.79291761, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.81427807, + "num_input_tokens_seen": 87959505, + "step": 4088, + "time_per_iteration": 2.78812575340271 + }, + { + "auxiliary_loss_clip": 0.0110062, + "auxiliary_loss_mlp": 0.01035541, + "balance_loss_clip": 1.04435658, + "balance_loss_mlp": 1.01949525, + "epoch": 0.24584398015932662, + "flos": 23915393740800.0, + "grad_norm": 1.4918864539426413, + "language_loss": 0.77053773, + "learning_rate": 3.5308643020944e-06, + "loss": 0.79189926, + "num_input_tokens_seen": 87979725, + "step": 4089, + "time_per_iteration": 2.75034761428833 + }, + { + "auxiliary_loss_clip": 0.01125156, + "auxiliary_loss_mlp": 0.0104201, + "balance_loss_clip": 1.04609382, + "balance_loss_mlp": 1.02470064, + "epoch": 0.2459041034119946, + "flos": 41496359103360.0, + "grad_norm": 2.3383647352821737, + "language_loss": 0.81814516, + "learning_rate": 3.530613648011309e-06, + "loss": 0.83981681, + "num_input_tokens_seen": 87998270, + "step": 4090, + "time_per_iteration": 2.891878604888916 + }, + { + "auxiliary_loss_clip": 0.01121872, + "auxiliary_loss_mlp": 0.01050145, + "balance_loss_clip": 1.04687834, + "balance_loss_mlp": 1.03163147, + "epoch": 0.24596422666466256, + "flos": 19936814676480.0, + "grad_norm": 1.8221600402702927, + "language_loss": 0.73833978, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.76005995, + "num_input_tokens_seen": 88016760, + "step": 4091, + "time_per_iteration": 2.6410961151123047 + }, + { + "auxiliary_loss_clip": 0.01114038, + "auxiliary_loss_mlp": 0.01045509, + "balance_loss_clip": 1.05517268, + "balance_loss_mlp": 1.0279969, + "epoch": 0.24602434991733052, + "flos": 21544391865600.0, + "grad_norm": 1.8983812190731213, + "language_loss": 0.7706998, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.79229522, + "num_input_tokens_seen": 88036465, + "step": 4092, + "time_per_iteration": 2.7038323879241943 + }, + { + "auxiliary_loss_clip": 0.01115501, + "auxiliary_loss_mlp": 0.01040797, + "balance_loss_clip": 1.04371238, + "balance_loss_mlp": 1.02255797, + "epoch": 0.24608447316999849, + "flos": 23185078416000.0, + "grad_norm": 3.1365051823944627, + "language_loss": 0.81200075, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.83356375, + "num_input_tokens_seen": 88053270, + "step": 4093, + "time_per_iteration": 2.680634021759033 + }, + { + "auxiliary_loss_clip": 0.01135527, + "auxiliary_loss_mlp": 0.01043826, + "balance_loss_clip": 1.04879606, + "balance_loss_mlp": 1.02613521, + "epoch": 0.24614459642266648, + "flos": 19641951930240.0, + "grad_norm": 1.9167765067224862, + "language_loss": 0.86932534, + "learning_rate": 3.529610451363797e-06, + "loss": 0.89111882, + "num_input_tokens_seen": 88072305, + "step": 4094, + "time_per_iteration": 2.6558003425598145 + }, + { + "auxiliary_loss_clip": 0.01007267, + "auxiliary_loss_mlp": 0.01019789, + "balance_loss_clip": 1.03124738, + "balance_loss_mlp": 1.01697576, + "epoch": 0.24620471967533444, + "flos": 61739816186880.0, + "grad_norm": 0.7554163750993251, + "language_loss": 0.57503664, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.59530711, + "num_input_tokens_seen": 88137995, + "step": 4095, + "time_per_iteration": 3.3576478958129883 + }, + { + "auxiliary_loss_clip": 0.01051219, + "auxiliary_loss_mlp": 0.0102022, + "balance_loss_clip": 1.03409493, + "balance_loss_mlp": 1.01790738, + "epoch": 0.2462648429280024, + "flos": 69154436315520.0, + "grad_norm": 0.655284075812517, + "language_loss": 0.56260574, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58332014, + "num_input_tokens_seen": 88208490, + "step": 4096, + "time_per_iteration": 3.376516580581665 + }, + { + "auxiliary_loss_clip": 0.0112712, + "auxiliary_loss_mlp": 0.01040362, + "balance_loss_clip": 1.05330801, + "balance_loss_mlp": 1.0236733, + "epoch": 0.24632496618067037, + "flos": 29459605887360.0, + "grad_norm": 1.7306008966026363, + "language_loss": 0.77629399, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.79796875, + "num_input_tokens_seen": 88228050, + "step": 4097, + "time_per_iteration": 2.6973912715911865 + }, + { + "auxiliary_loss_clip": 0.01114293, + "auxiliary_loss_mlp": 0.01047339, + "balance_loss_clip": 1.04898906, + "balance_loss_mlp": 1.02842093, + "epoch": 0.24638508943333834, + "flos": 24316444068480.0, + "grad_norm": 2.4079595240953613, + "language_loss": 0.75890571, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.78052205, + "num_input_tokens_seen": 88248090, + "step": 4098, + "time_per_iteration": 2.739947557449341 + }, + { + "auxiliary_loss_clip": 0.0112794, + "auxiliary_loss_mlp": 0.01046194, + "balance_loss_clip": 1.05179596, + "balance_loss_mlp": 1.03002954, + "epoch": 0.2464452126860063, + "flos": 26613254401920.0, + "grad_norm": 2.5671853201902737, + "language_loss": 0.68179071, + "learning_rate": 3.528355150558764e-06, + "loss": 0.7035321, + "num_input_tokens_seen": 88267545, + "step": 4099, + "time_per_iteration": 2.7144618034362793 + }, + { + "auxiliary_loss_clip": 0.01133513, + "auxiliary_loss_mlp": 0.01045673, + "balance_loss_clip": 1.05187321, + "balance_loss_mlp": 1.02897191, + "epoch": 0.24650533593867427, + "flos": 31212405763200.0, + "grad_norm": 2.0343787496625656, + "language_loss": 0.65915, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.68094188, + "num_input_tokens_seen": 88289785, + "step": 4100, + "time_per_iteration": 2.724008560180664 + }, + { + "auxiliary_loss_clip": 0.01054067, + "auxiliary_loss_mlp": 0.01041004, + "balance_loss_clip": 1.03763318, + "balance_loss_mlp": 1.03830957, + "epoch": 0.24656545919134226, + "flos": 68494002900480.0, + "grad_norm": 0.7229502883874133, + "language_loss": 0.61514676, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.63609749, + "num_input_tokens_seen": 88357320, + "step": 4101, + "time_per_iteration": 3.3748011589050293 + }, + { + "auxiliary_loss_clip": 0.01144305, + "auxiliary_loss_mlp": 0.01041937, + "balance_loss_clip": 1.05133915, + "balance_loss_mlp": 1.02455676, + "epoch": 0.24662558244401023, + "flos": 20084192179200.0, + "grad_norm": 2.2333045722985028, + "language_loss": 0.73272061, + "learning_rate": 3.527601274535012e-06, + "loss": 0.754583, + "num_input_tokens_seen": 88377040, + "step": 4102, + "time_per_iteration": 2.7457518577575684 + }, + { + "auxiliary_loss_clip": 0.01124231, + "auxiliary_loss_mlp": 0.01043636, + "balance_loss_clip": 1.04909408, + "balance_loss_mlp": 1.02699423, + "epoch": 0.2466857056966782, + "flos": 30701361012480.0, + "grad_norm": 2.9311552217427774, + "language_loss": 0.76528364, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.78696227, + "num_input_tokens_seen": 88395085, + "step": 4103, + "time_per_iteration": 2.732285499572754 + }, + { + "auxiliary_loss_clip": 0.01128751, + "auxiliary_loss_mlp": 0.01051695, + "balance_loss_clip": 1.04730439, + "balance_loss_mlp": 1.03313375, + "epoch": 0.24674582894934616, + "flos": 22528523669760.0, + "grad_norm": 2.3173933836652902, + "language_loss": 0.78658336, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.80838788, + "num_input_tokens_seen": 88413205, + "step": 4104, + "time_per_iteration": 2.7234179973602295 + }, + { + "auxiliary_loss_clip": 0.01134641, + "auxiliary_loss_mlp": 0.01045411, + "balance_loss_clip": 1.05110276, + "balance_loss_mlp": 1.02601588, + "epoch": 0.24680595220201412, + "flos": 20704297599360.0, + "grad_norm": 1.883953093480743, + "language_loss": 0.8375451, + "learning_rate": 3.526846877170133e-06, + "loss": 0.85934561, + "num_input_tokens_seen": 88431525, + "step": 4105, + "time_per_iteration": 2.7051403522491455 + }, + { + "auxiliary_loss_clip": 0.01149885, + "auxiliary_loss_mlp": 0.01051204, + "balance_loss_clip": 1.05490828, + "balance_loss_mlp": 1.03340602, + "epoch": 0.2468660754546821, + "flos": 21831174051840.0, + "grad_norm": 1.9903096770852142, + "language_loss": 0.76503521, + "learning_rate": 3.52659529557275e-06, + "loss": 0.78704607, + "num_input_tokens_seen": 88451210, + "step": 4106, + "time_per_iteration": 2.6324243545532227 + }, + { + "auxiliary_loss_clip": 0.01107346, + "auxiliary_loss_mlp": 0.01058334, + "balance_loss_clip": 1.0438261, + "balance_loss_mlp": 1.03743649, + "epoch": 0.24692619870735008, + "flos": 15267709578240.0, + "grad_norm": 2.3469304270549487, + "language_loss": 0.72399199, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.74564874, + "num_input_tokens_seen": 88467790, + "step": 4107, + "time_per_iteration": 2.6767516136169434 + }, + { + "auxiliary_loss_clip": 0.01149014, + "auxiliary_loss_mlp": 0.01055902, + "balance_loss_clip": 1.05365527, + "balance_loss_mlp": 1.03840184, + "epoch": 0.24698632196001805, + "flos": 29680097523840.0, + "grad_norm": 2.655550859638868, + "language_loss": 0.65495557, + "learning_rate": 3.526091958721587e-06, + "loss": 0.67700469, + "num_input_tokens_seen": 88490330, + "step": 4108, + "time_per_iteration": 2.666501760482788 + }, + { + "auxiliary_loss_clip": 0.01095567, + "auxiliary_loss_mlp": 0.01053352, + "balance_loss_clip": 1.04577923, + "balance_loss_mlp": 1.0351851, + "epoch": 0.247046445212686, + "flos": 39165469741440.0, + "grad_norm": 1.631565192024798, + "language_loss": 0.72685403, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.74834323, + "num_input_tokens_seen": 88512435, + "step": 4109, + "time_per_iteration": 2.8588712215423584 + }, + { + "auxiliary_loss_clip": 0.01110552, + "auxiliary_loss_mlp": 0.01048877, + "balance_loss_clip": 1.04754984, + "balance_loss_mlp": 1.03132939, + "epoch": 0.24710656846535398, + "flos": 22998845376000.0, + "grad_norm": 1.9000447272053396, + "language_loss": 0.79328829, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.81488264, + "num_input_tokens_seen": 88529780, + "step": 4110, + "time_per_iteration": 2.7403078079223633 + }, + { + "auxiliary_loss_clip": 0.01114435, + "auxiliary_loss_mlp": 0.0104359, + "balance_loss_clip": 1.04750848, + "balance_loss_mlp": 1.02536333, + "epoch": 0.24716669171802194, + "flos": 26432803451520.0, + "grad_norm": 1.9757162932013852, + "language_loss": 0.80630267, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.82788301, + "num_input_tokens_seen": 88547200, + "step": 4111, + "time_per_iteration": 2.6893255710601807 + }, + { + "auxiliary_loss_clip": 0.01143907, + "auxiliary_loss_mlp": 0.0104799, + "balance_loss_clip": 1.0493356, + "balance_loss_mlp": 1.03203976, + "epoch": 0.2472268149706899, + "flos": 23329870139520.0, + "grad_norm": 1.928179444788623, + "language_loss": 0.75401616, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.77593511, + "num_input_tokens_seen": 88566415, + "step": 4112, + "time_per_iteration": 2.641103506088257 + }, + { + "auxiliary_loss_clip": 0.01112249, + "auxiliary_loss_mlp": 0.00775958, + "balance_loss_clip": 1.04847336, + "balance_loss_mlp": 1.00114262, + "epoch": 0.24728693822335787, + "flos": 23768734510080.0, + "grad_norm": 2.1227710866712908, + "language_loss": 0.8244158, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.84329784, + "num_input_tokens_seen": 88585225, + "step": 4113, + "time_per_iteration": 2.831209182739258 + }, + { + "auxiliary_loss_clip": 0.01143893, + "auxiliary_loss_mlp": 0.01043423, + "balance_loss_clip": 1.04927897, + "balance_loss_mlp": 1.02574396, + "epoch": 0.24734706147602586, + "flos": 19317499355520.0, + "grad_norm": 2.5263325514304813, + "language_loss": 0.8704375, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.89231074, + "num_input_tokens_seen": 88603280, + "step": 4114, + "time_per_iteration": 2.7264626026153564 + }, + { + "auxiliary_loss_clip": 0.01096969, + "auxiliary_loss_mlp": 0.01047533, + "balance_loss_clip": 1.04748011, + "balance_loss_mlp": 1.03005731, + "epoch": 0.24740718472869383, + "flos": 28036932935040.0, + "grad_norm": 1.6498261942323098, + "language_loss": 0.75283766, + "learning_rate": 3.524328457352734e-06, + "loss": 0.77428269, + "num_input_tokens_seen": 88624925, + "step": 4115, + "time_per_iteration": 2.755342483520508 + }, + { + "auxiliary_loss_clip": 0.01018711, + "auxiliary_loss_mlp": 0.01070163, + "balance_loss_clip": 1.03186083, + "balance_loss_mlp": 1.06756425, + "epoch": 0.2474673079813618, + "flos": 68107569408000.0, + "grad_norm": 0.6879904854197085, + "language_loss": 0.58123159, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.60212028, + "num_input_tokens_seen": 88691475, + "step": 4116, + "time_per_iteration": 3.4015462398529053 + }, + { + "auxiliary_loss_clip": 0.01122111, + "auxiliary_loss_mlp": 0.01038886, + "balance_loss_clip": 1.04813063, + "balance_loss_mlp": 1.02213693, + "epoch": 0.24752743123402976, + "flos": 29462119839360.0, + "grad_norm": 19.427234883564427, + "language_loss": 0.83599627, + "learning_rate": 3.523824079451235e-06, + "loss": 0.85760617, + "num_input_tokens_seen": 88713425, + "step": 4117, + "time_per_iteration": 2.7881336212158203 + }, + { + "auxiliary_loss_clip": 0.01041379, + "auxiliary_loss_mlp": 0.00755386, + "balance_loss_clip": 1.02616835, + "balance_loss_mlp": 1.0023396, + "epoch": 0.24758755448669773, + "flos": 58350459824640.0, + "grad_norm": 0.909523411860611, + "language_loss": 0.63518536, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.65315294, + "num_input_tokens_seen": 88769995, + "step": 4118, + "time_per_iteration": 3.1125216484069824 + }, + { + "auxiliary_loss_clip": 0.01126335, + "auxiliary_loss_mlp": 0.01048787, + "balance_loss_clip": 1.04487431, + "balance_loss_mlp": 1.03127515, + "epoch": 0.2476476777393657, + "flos": 20484416494080.0, + "grad_norm": 2.1708029437062546, + "language_loss": 0.79272264, + "learning_rate": 3.523319470415491e-06, + "loss": 0.81447387, + "num_input_tokens_seen": 88789970, + "step": 4119, + "time_per_iteration": 6.294121503829956 + }, + { + "auxiliary_loss_clip": 0.01133521, + "auxiliary_loss_mlp": 0.01044138, + "balance_loss_clip": 1.05223441, + "balance_loss_mlp": 1.02707899, + "epoch": 0.24770780099203366, + "flos": 20485853038080.0, + "grad_norm": 1.7395275513138477, + "language_loss": 0.74590164, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.76767826, + "num_input_tokens_seen": 88810000, + "step": 4120, + "time_per_iteration": 2.6947290897369385 + }, + { + "auxiliary_loss_clip": 0.01135162, + "auxiliary_loss_mlp": 0.01051636, + "balance_loss_clip": 1.04963648, + "balance_loss_mlp": 1.03435111, + "epoch": 0.24776792424470165, + "flos": 15153405523200.0, + "grad_norm": 3.32651820696464, + "language_loss": 0.88006538, + "learning_rate": 3.522814630322041e-06, + "loss": 0.90193337, + "num_input_tokens_seen": 88827515, + "step": 4121, + "time_per_iteration": 4.181556224822998 + }, + { + "auxiliary_loss_clip": 0.01147178, + "auxiliary_loss_mlp": 0.01042601, + "balance_loss_clip": 1.05039763, + "balance_loss_mlp": 1.02431381, + "epoch": 0.2478280474973696, + "flos": 21725453347200.0, + "grad_norm": 2.0457274986343204, + "language_loss": 0.69676709, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.71866482, + "num_input_tokens_seen": 88845025, + "step": 4122, + "time_per_iteration": 2.7041239738464355 + }, + { + "auxiliary_loss_clip": 0.01147132, + "auxiliary_loss_mlp": 0.01045532, + "balance_loss_clip": 1.05045271, + "balance_loss_mlp": 1.02655339, + "epoch": 0.24788817075003758, + "flos": 20412200200320.0, + "grad_norm": 2.4058135017179976, + "language_loss": 0.8026911, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.82461774, + "num_input_tokens_seen": 88861740, + "step": 4123, + "time_per_iteration": 2.6154532432556152 + }, + { + "auxiliary_loss_clip": 0.01085408, + "auxiliary_loss_mlp": 0.0105298, + "balance_loss_clip": 1.04720712, + "balance_loss_mlp": 1.0354923, + "epoch": 0.24794829400270554, + "flos": 22594455083520.0, + "grad_norm": 2.2195758993023578, + "language_loss": 0.74967635, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.77106017, + "num_input_tokens_seen": 88879740, + "step": 4124, + "time_per_iteration": 2.787986993789673 + }, + { + "auxiliary_loss_clip": 0.01131947, + "auxiliary_loss_mlp": 0.01044392, + "balance_loss_clip": 1.04892588, + "balance_loss_mlp": 1.02809608, + "epoch": 0.2480084172553735, + "flos": 39676047615360.0, + "grad_norm": 1.4128536066198873, + "language_loss": 0.73432529, + "learning_rate": 3.521804257268357e-06, + "loss": 0.75608873, + "num_input_tokens_seen": 88904095, + "step": 4125, + "time_per_iteration": 4.472416162490845 + }, + { + "auxiliary_loss_clip": 0.01109646, + "auxiliary_loss_mlp": 0.00776697, + "balance_loss_clip": 1.04420686, + "balance_loss_mlp": 1.00122678, + "epoch": 0.24806854050804147, + "flos": 22053712763520.0, + "grad_norm": 1.9607758383710057, + "language_loss": 0.69630861, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.71517205, + "num_input_tokens_seen": 88920740, + "step": 4126, + "time_per_iteration": 2.7412056922912598 + }, + { + "auxiliary_loss_clip": 0.01133758, + "auxiliary_loss_mlp": 0.01051914, + "balance_loss_clip": 1.047984, + "balance_loss_mlp": 1.03331721, + "epoch": 0.24812866376070947, + "flos": 15486764670720.0, + "grad_norm": 2.275786464609162, + "language_loss": 0.81219494, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.83405173, + "num_input_tokens_seen": 88938510, + "step": 4127, + "time_per_iteration": 2.620143413543701 + }, + { + "auxiliary_loss_clip": 0.01136685, + "auxiliary_loss_mlp": 0.00775421, + "balance_loss_clip": 1.04974318, + "balance_loss_mlp": 1.00120401, + "epoch": 0.24818878701337743, + "flos": 14757419013120.0, + "grad_norm": 6.503475382998669, + "language_loss": 0.8435086, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.86262965, + "num_input_tokens_seen": 88955235, + "step": 4128, + "time_per_iteration": 2.6764745712280273 + }, + { + "auxiliary_loss_clip": 0.01117625, + "auxiliary_loss_mlp": 0.01057179, + "balance_loss_clip": 1.04831362, + "balance_loss_mlp": 1.03814149, + "epoch": 0.2482489102660454, + "flos": 27089501852160.0, + "grad_norm": 7.318299516736359, + "language_loss": 0.6572547, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.67900276, + "num_input_tokens_seen": 88975210, + "step": 4129, + "time_per_iteration": 2.7178256511688232 + }, + { + "auxiliary_loss_clip": 0.01098796, + "auxiliary_loss_mlp": 0.01044421, + "balance_loss_clip": 1.04595077, + "balance_loss_mlp": 1.02570498, + "epoch": 0.24830903351871336, + "flos": 26467528924800.0, + "grad_norm": 1.8507928533331595, + "language_loss": 0.7496134, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.77104557, + "num_input_tokens_seen": 88996120, + "step": 4130, + "time_per_iteration": 2.82098126411438 + }, + { + "auxiliary_loss_clip": 0.01078173, + "auxiliary_loss_mlp": 0.01050295, + "balance_loss_clip": 1.04238284, + "balance_loss_mlp": 1.03163934, + "epoch": 0.24836915677138133, + "flos": 10228436870400.0, + "grad_norm": 2.098795320061471, + "language_loss": 0.7680133, + "learning_rate": 3.520286966670535e-06, + "loss": 0.78929794, + "num_input_tokens_seen": 89008685, + "step": 4131, + "time_per_iteration": 2.7543740272521973 + }, + { + "auxiliary_loss_clip": 0.0113176, + "auxiliary_loss_mlp": 0.0104424, + "balance_loss_clip": 1.04992545, + "balance_loss_mlp": 1.02781272, + "epoch": 0.2484292800240493, + "flos": 30080429579520.0, + "grad_norm": 2.181098565661814, + "language_loss": 0.83579504, + "learning_rate": 3.520033883075255e-06, + "loss": 0.85755503, + "num_input_tokens_seen": 89031160, + "step": 4132, + "time_per_iteration": 2.681339979171753 + }, + { + "auxiliary_loss_clip": 0.01120332, + "auxiliary_loss_mlp": 0.01043901, + "balance_loss_clip": 1.04574823, + "balance_loss_mlp": 1.02506626, + "epoch": 0.24848940327671726, + "flos": 13442944803840.0, + "grad_norm": 1.8557605687682572, + "language_loss": 0.71320271, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.73484504, + "num_input_tokens_seen": 89047235, + "step": 4133, + "time_per_iteration": 2.6573541164398193 + }, + { + "auxiliary_loss_clip": 0.01150987, + "auxiliary_loss_mlp": 0.0104789, + "balance_loss_clip": 1.05105197, + "balance_loss_mlp": 1.02624202, + "epoch": 0.24854952652938525, + "flos": 19970247260160.0, + "grad_norm": 3.222598228665933, + "language_loss": 0.61894202, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.64093071, + "num_input_tokens_seen": 89064790, + "step": 4134, + "time_per_iteration": 2.5639493465423584 + }, + { + "auxiliary_loss_clip": 0.01135356, + "auxiliary_loss_mlp": 0.01045434, + "balance_loss_clip": 1.04877877, + "balance_loss_mlp": 1.02764797, + "epoch": 0.24860964978205322, + "flos": 18150187167360.0, + "grad_norm": 1.882175713893398, + "language_loss": 0.78382719, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.80563509, + "num_input_tokens_seen": 89083250, + "step": 4135, + "time_per_iteration": 2.6075639724731445 + }, + { + "auxiliary_loss_clip": 0.01123928, + "auxiliary_loss_mlp": 0.01035702, + "balance_loss_clip": 1.05297661, + "balance_loss_mlp": 1.01917946, + "epoch": 0.24866977303472118, + "flos": 11728641329280.0, + "grad_norm": 2.4269193192884186, + "language_loss": 0.83582413, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.85742044, + "num_input_tokens_seen": 89100905, + "step": 4136, + "time_per_iteration": 2.623377799987793 + }, + { + "auxiliary_loss_clip": 0.01119838, + "auxiliary_loss_mlp": 0.01045223, + "balance_loss_clip": 1.05071807, + "balance_loss_mlp": 1.02713883, + "epoch": 0.24872989628738915, + "flos": 34823582565120.0, + "grad_norm": 2.1322549527950665, + "language_loss": 0.7057327, + "learning_rate": 3.518767600693314e-06, + "loss": 0.72738326, + "num_input_tokens_seen": 89122630, + "step": 4137, + "time_per_iteration": 2.814115524291992 + }, + { + "auxiliary_loss_clip": 0.01133507, + "auxiliary_loss_mlp": 0.00775347, + "balance_loss_clip": 1.0449059, + "balance_loss_mlp": 1.00107706, + "epoch": 0.2487900195400571, + "flos": 13699347062400.0, + "grad_norm": 2.085766315480858, + "language_loss": 0.66914427, + "learning_rate": 3.518514171403042e-06, + "loss": 0.68823284, + "num_input_tokens_seen": 89141050, + "step": 4138, + "time_per_iteration": 2.646043539047241 + }, + { + "auxiliary_loss_clip": 0.01103579, + "auxiliary_loss_mlp": 0.01036477, + "balance_loss_clip": 1.04612446, + "balance_loss_mlp": 1.02000237, + "epoch": 0.24885014279272508, + "flos": 25337815297920.0, + "grad_norm": 1.983116672544965, + "language_loss": 0.83913636, + "learning_rate": 3.51826068453056e-06, + "loss": 0.86053687, + "num_input_tokens_seen": 89160810, + "step": 4139, + "time_per_iteration": 2.741090774536133 + }, + { + "auxiliary_loss_clip": 0.01111549, + "auxiliary_loss_mlp": 0.01040434, + "balance_loss_clip": 1.04586422, + "balance_loss_mlp": 1.02192068, + "epoch": 0.24891026604539307, + "flos": 20631434860800.0, + "grad_norm": 1.4951428686450043, + "language_loss": 0.78923917, + "learning_rate": 3.518007140085481e-06, + "loss": 0.81075907, + "num_input_tokens_seen": 89180610, + "step": 4140, + "time_per_iteration": 2.712780237197876 + }, + { + "auxiliary_loss_clip": 0.01048621, + "auxiliary_loss_mlp": 0.01096526, + "balance_loss_clip": 1.02931261, + "balance_loss_mlp": 1.09464228, + "epoch": 0.24897038929806103, + "flos": 66960294030720.0, + "grad_norm": 0.8293539951671052, + "language_loss": 0.61007011, + "learning_rate": 3.51775353807742e-06, + "loss": 0.63152146, + "num_input_tokens_seen": 89241880, + "step": 4141, + "time_per_iteration": 3.240020513534546 + }, + { + "auxiliary_loss_clip": 0.01147379, + "auxiliary_loss_mlp": 0.01049841, + "balance_loss_clip": 1.05116534, + "balance_loss_mlp": 1.03240097, + "epoch": 0.249030512550729, + "flos": 36392555612160.0, + "grad_norm": 2.1246942361961025, + "language_loss": 0.72794569, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.74991786, + "num_input_tokens_seen": 89263340, + "step": 4142, + "time_per_iteration": 2.7316160202026367 + }, + { + "auxiliary_loss_clip": 0.01133287, + "auxiliary_loss_mlp": 0.01044374, + "balance_loss_clip": 1.04780602, + "balance_loss_mlp": 1.02705276, + "epoch": 0.24909063580339696, + "flos": 20154576879360.0, + "grad_norm": 1.7635050074541005, + "language_loss": 0.80630821, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.82808483, + "num_input_tokens_seen": 89282870, + "step": 4143, + "time_per_iteration": 2.6763389110565186 + }, + { + "auxiliary_loss_clip": 0.01117552, + "auxiliary_loss_mlp": 0.01036613, + "balance_loss_clip": 1.04615402, + "balance_loss_mlp": 1.02026916, + "epoch": 0.24915075905606493, + "flos": 26396569607040.0, + "grad_norm": 2.7235452599944145, + "language_loss": 0.59766376, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.61920542, + "num_input_tokens_seen": 89303830, + "step": 4144, + "time_per_iteration": 2.789417266845703 + }, + { + "auxiliary_loss_clip": 0.01128344, + "auxiliary_loss_mlp": 0.01045393, + "balance_loss_clip": 1.04464769, + "balance_loss_mlp": 1.02850127, + "epoch": 0.2492108823087329, + "flos": 27527216987520.0, + "grad_norm": 2.1754585056135047, + "language_loss": 0.78476733, + "learning_rate": 3.516738554607708e-06, + "loss": 0.80650467, + "num_input_tokens_seen": 89324350, + "step": 4145, + "time_per_iteration": 2.8416056632995605 + }, + { + "auxiliary_loss_clip": 0.01140077, + "auxiliary_loss_mlp": 0.00778414, + "balance_loss_clip": 1.04980016, + "balance_loss_mlp": 1.00122261, + "epoch": 0.24927100556140086, + "flos": 16691388111360.0, + "grad_norm": 2.035933799021365, + "language_loss": 0.64925039, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.66843534, + "num_input_tokens_seen": 89342875, + "step": 4146, + "time_per_iteration": 2.818240165710449 + }, + { + "auxiliary_loss_clip": 0.01036642, + "auxiliary_loss_mlp": 0.0100618, + "balance_loss_clip": 1.02582741, + "balance_loss_mlp": 1.00403452, + "epoch": 0.24933112881406885, + "flos": 62772464286720.0, + "grad_norm": 0.9560925601012792, + "language_loss": 0.67304933, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.69347757, + "num_input_tokens_seen": 89404925, + "step": 4147, + "time_per_iteration": 3.339989185333252 + }, + { + "auxiliary_loss_clip": 0.01123141, + "auxiliary_loss_mlp": 0.0104863, + "balance_loss_clip": 1.04991198, + "balance_loss_mlp": 1.03078485, + "epoch": 0.24939125206673682, + "flos": 26651894457600.0, + "grad_norm": 2.4221411280533554, + "language_loss": 0.89285177, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.9145695, + "num_input_tokens_seen": 89425090, + "step": 4148, + "time_per_iteration": 2.7497105598449707 + }, + { + "auxiliary_loss_clip": 0.01098234, + "auxiliary_loss_mlp": 0.01049718, + "balance_loss_clip": 1.04725289, + "balance_loss_mlp": 1.02874899, + "epoch": 0.24945137531940478, + "flos": 20704333512960.0, + "grad_norm": 1.90098046882646, + "language_loss": 0.68272161, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.70420116, + "num_input_tokens_seen": 89442615, + "step": 4149, + "time_per_iteration": 2.7739884853363037 + }, + { + "auxiliary_loss_clip": 0.01134907, + "auxiliary_loss_mlp": 0.01044357, + "balance_loss_clip": 1.0508213, + "balance_loss_mlp": 1.02747655, + "epoch": 0.24951149857207275, + "flos": 23768662682880.0, + "grad_norm": 1.67166255010053, + "language_loss": 0.71424097, + "learning_rate": 3.515468531258095e-06, + "loss": 0.73603356, + "num_input_tokens_seen": 89463025, + "step": 4150, + "time_per_iteration": 2.6801233291625977 + }, + { + "auxiliary_loss_clip": 0.01098898, + "auxiliary_loss_mlp": 0.0104939, + "balance_loss_clip": 1.04628861, + "balance_loss_mlp": 1.03149676, + "epoch": 0.2495716218247407, + "flos": 15664881237120.0, + "grad_norm": 4.371450104119659, + "language_loss": 0.72732216, + "learning_rate": 3.515214354149478e-06, + "loss": 0.74880505, + "num_input_tokens_seen": 89480225, + "step": 4151, + "time_per_iteration": 2.7118351459503174 + }, + { + "auxiliary_loss_clip": 0.01142805, + "auxiliary_loss_mlp": 0.01054095, + "balance_loss_clip": 1.05117846, + "balance_loss_mlp": 1.0357486, + "epoch": 0.24963174507740868, + "flos": 24052499953920.0, + "grad_norm": 3.4200711789217397, + "language_loss": 0.63707078, + "learning_rate": 3.514960119583781e-06, + "loss": 0.65903974, + "num_input_tokens_seen": 89496985, + "step": 4152, + "time_per_iteration": 2.6352219581604004 + }, + { + "auxiliary_loss_clip": 0.01128057, + "auxiliary_loss_mlp": 0.01043812, + "balance_loss_clip": 1.05110407, + "balance_loss_mlp": 1.02628791, + "epoch": 0.24969186833007664, + "flos": 21799501234560.0, + "grad_norm": 3.664579624689737, + "language_loss": 0.77259195, + "learning_rate": 3.514705827570645e-06, + "loss": 0.79431069, + "num_input_tokens_seen": 89514420, + "step": 4153, + "time_per_iteration": 2.6120872497558594 + }, + { + "auxiliary_loss_clip": 0.01135035, + "auxiliary_loss_mlp": 0.01042909, + "balance_loss_clip": 1.05221617, + "balance_loss_mlp": 1.02620757, + "epoch": 0.24975199158274464, + "flos": 19938143479680.0, + "grad_norm": 2.5781435797973833, + "language_loss": 0.7677725, + "learning_rate": 3.514451478119711e-06, + "loss": 0.78955191, + "num_input_tokens_seen": 89532925, + "step": 4154, + "time_per_iteration": 2.7488853931427 + }, + { + "auxiliary_loss_clip": 0.0113655, + "auxiliary_loss_mlp": 0.01051969, + "balance_loss_clip": 1.05146766, + "balance_loss_mlp": 1.03251421, + "epoch": 0.2498121148354126, + "flos": 25338389915520.0, + "grad_norm": 1.9052782276095375, + "language_loss": 0.70335877, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.72524405, + "num_input_tokens_seen": 89552855, + "step": 4155, + "time_per_iteration": 2.6622395515441895 + }, + { + "auxiliary_loss_clip": 0.01127695, + "auxiliary_loss_mlp": 0.01047805, + "balance_loss_clip": 1.05243564, + "balance_loss_mlp": 1.03074658, + "epoch": 0.24987223808808057, + "flos": 20558787603840.0, + "grad_norm": 1.6974192026095432, + "language_loss": 0.74953228, + "learning_rate": 3.513942606943036e-06, + "loss": 0.77128726, + "num_input_tokens_seen": 89572830, + "step": 4156, + "time_per_iteration": 2.7599329948425293 + }, + { + "auxiliary_loss_clip": 0.01127061, + "auxiliary_loss_mlp": 0.01040498, + "balance_loss_clip": 1.04922485, + "balance_loss_mlp": 1.02404737, + "epoch": 0.24993236134074853, + "flos": 19749037351680.0, + "grad_norm": 2.6541448192787858, + "language_loss": 0.76703429, + "learning_rate": 3.513688085236591e-06, + "loss": 0.78870988, + "num_input_tokens_seen": 89590345, + "step": 4157, + "time_per_iteration": 4.172720432281494 + }, + { + "auxiliary_loss_clip": 0.01087279, + "auxiliary_loss_mlp": 0.01050682, + "balance_loss_clip": 1.04686046, + "balance_loss_mlp": 1.03302717, + "epoch": 0.2499924845934165, + "flos": 18770292587520.0, + "grad_norm": 6.508490360255271, + "language_loss": 0.81656492, + "learning_rate": 3.513433506130942e-06, + "loss": 0.83794451, + "num_input_tokens_seen": 89610295, + "step": 4158, + "time_per_iteration": 4.373260736465454 + }, + { + "auxiliary_loss_clip": 0.01115824, + "auxiliary_loss_mlp": 0.01039502, + "balance_loss_clip": 1.04740119, + "balance_loss_mlp": 1.02166879, + "epoch": 0.25005260784608446, + "flos": 16872198197760.0, + "grad_norm": 2.799032697181286, + "language_loss": 0.76568067, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.78723395, + "num_input_tokens_seen": 89627795, + "step": 4159, + "time_per_iteration": 2.6529338359832764 + }, + { + "auxiliary_loss_clip": 0.01139337, + "auxiliary_loss_mlp": 0.01038581, + "balance_loss_clip": 1.05149508, + "balance_loss_mlp": 1.02013946, + "epoch": 0.2501127310987524, + "flos": 22124923476480.0, + "grad_norm": 2.4918403268433122, + "language_loss": 0.71557873, + "learning_rate": 3.512924175760649e-06, + "loss": 0.73735791, + "num_input_tokens_seen": 89648090, + "step": 4160, + "time_per_iteration": 4.178418874740601 + }, + { + "auxiliary_loss_clip": 0.01062459, + "auxiliary_loss_mlp": 0.01001923, + "balance_loss_clip": 1.02823949, + "balance_loss_mlp": 0.99992067, + "epoch": 0.2501728543514204, + "flos": 69458061980160.0, + "grad_norm": 0.7611682305123987, + "language_loss": 0.56783372, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.58847755, + "num_input_tokens_seen": 89710345, + "step": 4161, + "time_per_iteration": 3.1690969467163086 + }, + { + "auxiliary_loss_clip": 0.0114076, + "auxiliary_loss_mlp": 0.01048659, + "balance_loss_clip": 1.05206347, + "balance_loss_mlp": 1.0308131, + "epoch": 0.25023297760408836, + "flos": 16289978647680.0, + "grad_norm": 4.523737291621751, + "language_loss": 0.80654883, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.82844305, + "num_input_tokens_seen": 89729390, + "step": 4162, + "time_per_iteration": 2.630491018295288 + }, + { + "auxiliary_loss_clip": 0.01127145, + "auxiliary_loss_mlp": 0.00776859, + "balance_loss_clip": 1.04807281, + "balance_loss_mlp": 1.00124371, + "epoch": 0.2502931008567563, + "flos": 12237998140800.0, + "grad_norm": 3.0029202967601107, + "language_loss": 0.87312925, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.89216936, + "num_input_tokens_seen": 89742805, + "step": 4163, + "time_per_iteration": 2.660985231399536 + }, + { + "auxiliary_loss_clip": 0.01133331, + "auxiliary_loss_mlp": 0.01039671, + "balance_loss_clip": 1.0538981, + "balance_loss_mlp": 1.02234972, + "epoch": 0.25035322410942434, + "flos": 23181882105600.0, + "grad_norm": 1.700690076898522, + "language_loss": 0.83170879, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.85343885, + "num_input_tokens_seen": 89761145, + "step": 4164, + "time_per_iteration": 4.217406988143921 + }, + { + "auxiliary_loss_clip": 0.01131608, + "auxiliary_loss_mlp": 0.01047174, + "balance_loss_clip": 1.05681539, + "balance_loss_mlp": 1.0309732, + "epoch": 0.2504133473620923, + "flos": 20917534688640.0, + "grad_norm": 1.61687510361108, + "language_loss": 0.73889691, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.76068473, + "num_input_tokens_seen": 89780905, + "step": 4165, + "time_per_iteration": 2.7395150661468506 + }, + { + "auxiliary_loss_clip": 0.01112927, + "auxiliary_loss_mlp": 0.01043589, + "balance_loss_clip": 1.04912043, + "balance_loss_mlp": 1.02611279, + "epoch": 0.2504734706147603, + "flos": 20776549806720.0, + "grad_norm": 1.856982928728685, + "language_loss": 0.74739552, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.7689606, + "num_input_tokens_seen": 89799230, + "step": 4166, + "time_per_iteration": 2.7567081451416016 + }, + { + "auxiliary_loss_clip": 0.01110594, + "auxiliary_loss_mlp": 0.01042647, + "balance_loss_clip": 1.04968488, + "balance_loss_mlp": 1.02651834, + "epoch": 0.25053359386742824, + "flos": 24349373861760.0, + "grad_norm": 2.0013578528528724, + "language_loss": 0.82254446, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.84407687, + "num_input_tokens_seen": 89818240, + "step": 4167, + "time_per_iteration": 2.692664384841919 + }, + { + "auxiliary_loss_clip": 0.01130059, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_clip": 1.05185139, + "balance_loss_mlp": 1.02695727, + "epoch": 0.2505937171200962, + "flos": 21214336769280.0, + "grad_norm": 2.4392619558537407, + "language_loss": 0.79381847, + "learning_rate": 3.51088456024312e-06, + "loss": 0.81555605, + "num_input_tokens_seen": 89834485, + "step": 4168, + "time_per_iteration": 2.6286962032318115 + }, + { + "auxiliary_loss_clip": 0.01138966, + "auxiliary_loss_mlp": 0.01046302, + "balance_loss_clip": 1.05118442, + "balance_loss_mlp": 1.02704966, + "epoch": 0.25065384037276417, + "flos": 41427231379200.0, + "grad_norm": 2.2753262043393243, + "language_loss": 0.69603884, + "learning_rate": 3.510629350383849e-06, + "loss": 0.71789157, + "num_input_tokens_seen": 89855645, + "step": 4169, + "time_per_iteration": 2.7935590744018555 + }, + { + "auxiliary_loss_clip": 0.01110761, + "auxiliary_loss_mlp": 0.01049625, + "balance_loss_clip": 1.04870963, + "balance_loss_mlp": 1.03274524, + "epoch": 0.25071396362543213, + "flos": 26102389219200.0, + "grad_norm": 1.8250030020409629, + "language_loss": 0.78045398, + "learning_rate": 3.510374083241361e-06, + "loss": 0.80205786, + "num_input_tokens_seen": 89874895, + "step": 4170, + "time_per_iteration": 2.7728679180145264 + }, + { + "auxiliary_loss_clip": 0.01128286, + "auxiliary_loss_mlp": 0.01043437, + "balance_loss_clip": 1.05320668, + "balance_loss_mlp": 1.02662849, + "epoch": 0.2507740868781001, + "flos": 19098982967040.0, + "grad_norm": 2.5073993684848004, + "language_loss": 0.76440209, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.78611928, + "num_input_tokens_seen": 89891700, + "step": 4171, + "time_per_iteration": 2.7825160026550293 + }, + { + "auxiliary_loss_clip": 0.01061117, + "auxiliary_loss_mlp": 0.01002396, + "balance_loss_clip": 1.027282, + "balance_loss_mlp": 1.00034571, + "epoch": 0.25083421013076806, + "flos": 64341868296960.0, + "grad_norm": 0.8424544393272001, + "language_loss": 0.6006161, + "learning_rate": 3.509863377145458e-06, + "loss": 0.62125123, + "num_input_tokens_seen": 89955775, + "step": 4172, + "time_per_iteration": 3.1981940269470215 + }, + { + "auxiliary_loss_clip": 0.01125517, + "auxiliary_loss_mlp": 0.01046213, + "balance_loss_clip": 1.05005789, + "balance_loss_mlp": 1.02821243, + "epoch": 0.25089433338343603, + "flos": 24279599692800.0, + "grad_norm": 1.4368714421460043, + "language_loss": 0.79106563, + "learning_rate": 3.509607938211409e-06, + "loss": 0.81278288, + "num_input_tokens_seen": 89977150, + "step": 4173, + "time_per_iteration": 2.8311028480529785 + }, + { + "auxiliary_loss_clip": 0.01152553, + "auxiliary_loss_mlp": 0.0104675, + "balance_loss_clip": 1.05725241, + "balance_loss_mlp": 1.02986968, + "epoch": 0.250954456636104, + "flos": 14721472477440.0, + "grad_norm": 2.103663042812158, + "language_loss": 0.83371937, + "learning_rate": 3.509352442032875e-06, + "loss": 0.85571229, + "num_input_tokens_seen": 89994925, + "step": 4174, + "time_per_iteration": 2.696199893951416 + }, + { + "auxiliary_loss_clip": 0.01095749, + "auxiliary_loss_mlp": 0.01049206, + "balance_loss_clip": 1.04728913, + "balance_loss_mlp": 1.03095484, + "epoch": 0.25101457988877196, + "flos": 22273593868800.0, + "grad_norm": 43.022796554959484, + "language_loss": 0.71023381, + "learning_rate": 3.509096888619545e-06, + "loss": 0.73168337, + "num_input_tokens_seen": 90013235, + "step": 4175, + "time_per_iteration": 2.8337926864624023 + }, + { + "auxiliary_loss_clip": 0.01119154, + "auxiliary_loss_mlp": 0.01038924, + "balance_loss_clip": 1.05135846, + "balance_loss_mlp": 1.02145982, + "epoch": 0.2510747031414399, + "flos": 25188929424000.0, + "grad_norm": 2.017414900854033, + "language_loss": 0.80957019, + "learning_rate": 3.50884127798111e-06, + "loss": 0.83115101, + "num_input_tokens_seen": 90032150, + "step": 4176, + "time_per_iteration": 2.936908483505249 + }, + { + "auxiliary_loss_clip": 0.01127542, + "auxiliary_loss_mlp": 0.0104611, + "balance_loss_clip": 1.0535233, + "balance_loss_mlp": 1.02753711, + "epoch": 0.25113482639410795, + "flos": 20704189858560.0, + "grad_norm": 2.475574978330162, + "language_loss": 0.82294285, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.84467936, + "num_input_tokens_seen": 90049085, + "step": 4177, + "time_per_iteration": 2.7630460262298584 + }, + { + "auxiliary_loss_clip": 0.01110202, + "auxiliary_loss_mlp": 0.01051495, + "balance_loss_clip": 1.05168724, + "balance_loss_mlp": 1.03386414, + "epoch": 0.2511949496467759, + "flos": 21506936958720.0, + "grad_norm": 2.1761277698635593, + "language_loss": 0.82517993, + "learning_rate": 3.508329885067698e-06, + "loss": 0.84679693, + "num_input_tokens_seen": 90067695, + "step": 4178, + "time_per_iteration": 2.7356274127960205 + }, + { + "auxiliary_loss_clip": 0.01145101, + "auxiliary_loss_mlp": 0.00775573, + "balance_loss_clip": 1.05324888, + "balance_loss_mlp": 1.00148535, + "epoch": 0.2512550728994439, + "flos": 20701999128960.0, + "grad_norm": 2.1475299559000947, + "language_loss": 0.75229692, + "learning_rate": 3.508074102812112e-06, + "loss": 0.77150369, + "num_input_tokens_seen": 90083890, + "step": 4179, + "time_per_iteration": 2.631096363067627 + }, + { + "auxiliary_loss_clip": 0.01109293, + "auxiliary_loss_mlp": 0.01056583, + "balance_loss_clip": 1.04920673, + "balance_loss_mlp": 1.03833175, + "epoch": 0.25131519615211184, + "flos": 18478626151680.0, + "grad_norm": 1.9599833122138943, + "language_loss": 0.69976825, + "learning_rate": 3.507818263370206e-06, + "loss": 0.72142696, + "num_input_tokens_seen": 90100995, + "step": 4180, + "time_per_iteration": 2.708122730255127 + }, + { + "auxiliary_loss_clip": 0.01147992, + "auxiliary_loss_mlp": 0.01045783, + "balance_loss_clip": 1.05343485, + "balance_loss_mlp": 1.02909422, + "epoch": 0.2513753194047798, + "flos": 20484955198080.0, + "grad_norm": 1.8622914556591927, + "language_loss": 0.85940182, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.88133955, + "num_input_tokens_seen": 90120365, + "step": 4181, + "time_per_iteration": 2.633091449737549 + }, + { + "auxiliary_loss_clip": 0.01148017, + "auxiliary_loss_mlp": 0.01049707, + "balance_loss_clip": 1.05351245, + "balance_loss_mlp": 1.03270781, + "epoch": 0.25143544265744777, + "flos": 37670077704960.0, + "grad_norm": 2.0695978407502467, + "language_loss": 0.6856631, + "learning_rate": 3.507306412966238e-06, + "loss": 0.70764029, + "num_input_tokens_seen": 90142610, + "step": 4182, + "time_per_iteration": 2.8169894218444824 + }, + { + "auxiliary_loss_clip": 0.01041202, + "auxiliary_loss_mlp": 0.010083, + "balance_loss_clip": 1.02456141, + "balance_loss_mlp": 1.00577307, + "epoch": 0.25149556591011574, + "flos": 69367457923200.0, + "grad_norm": 0.8403189096432666, + "language_loss": 0.70032597, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72082102, + "num_input_tokens_seen": 90200555, + "step": 4183, + "time_per_iteration": 3.2070610523223877 + }, + { + "auxiliary_loss_clip": 0.01130203, + "auxiliary_loss_mlp": 0.01042834, + "balance_loss_clip": 1.05145216, + "balance_loss_mlp": 1.02441609, + "epoch": 0.2515556891627837, + "flos": 13990402967040.0, + "grad_norm": 1.8802113118438855, + "language_loss": 0.73834902, + "learning_rate": 3.506794333933431e-06, + "loss": 0.76007938, + "num_input_tokens_seen": 90218120, + "step": 4184, + "time_per_iteration": 2.691950559616089 + }, + { + "auxiliary_loss_clip": 0.01136971, + "auxiliary_loss_mlp": 0.01047362, + "balance_loss_clip": 1.05233765, + "balance_loss_mlp": 1.0297792, + "epoch": 0.25161581241545167, + "flos": 22163527618560.0, + "grad_norm": 1.8676646084141537, + "language_loss": 0.8334859, + "learning_rate": 3.506538208705484e-06, + "loss": 0.85532916, + "num_input_tokens_seen": 90236790, + "step": 4185, + "time_per_iteration": 2.6931228637695312 + }, + { + "auxiliary_loss_clip": 0.01022217, + "auxiliary_loss_mlp": 0.01010846, + "balance_loss_clip": 1.03471541, + "balance_loss_mlp": 1.00902176, + "epoch": 0.25167593566811963, + "flos": 69358407696000.0, + "grad_norm": 0.7883550117667959, + "language_loss": 0.61448294, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.63481361, + "num_input_tokens_seen": 90297070, + "step": 4186, + "time_per_iteration": 3.175295829772949 + }, + { + "auxiliary_loss_clip": 0.01107804, + "auxiliary_loss_mlp": 0.01041844, + "balance_loss_clip": 1.04873872, + "balance_loss_mlp": 1.02405787, + "epoch": 0.2517360589207876, + "flos": 13261452359040.0, + "grad_norm": 1.8553357788385085, + "language_loss": 0.79070914, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.81220555, + "num_input_tokens_seen": 90315255, + "step": 4187, + "time_per_iteration": 2.887378215789795 + }, + { + "auxiliary_loss_clip": 0.01091434, + "auxiliary_loss_mlp": 0.01049489, + "balance_loss_clip": 1.0482558, + "balance_loss_mlp": 1.03138089, + "epoch": 0.25179618217345556, + "flos": 20376828282240.0, + "grad_norm": 3.7749228259968586, + "language_loss": 0.79629189, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.8177011, + "num_input_tokens_seen": 90334990, + "step": 4188, + "time_per_iteration": 2.8985629081726074 + }, + { + "auxiliary_loss_clip": 0.01133381, + "auxiliary_loss_mlp": 0.01046993, + "balance_loss_clip": 1.05168021, + "balance_loss_mlp": 1.03012538, + "epoch": 0.25185630542612353, + "flos": 27664718250240.0, + "grad_norm": 1.7363151422402578, + "language_loss": 0.74419165, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.76599538, + "num_input_tokens_seen": 90351825, + "step": 4189, + "time_per_iteration": 2.697097063064575 + }, + { + "auxiliary_loss_clip": 0.01118534, + "auxiliary_loss_mlp": 0.01044827, + "balance_loss_clip": 1.04871011, + "balance_loss_mlp": 1.02862656, + "epoch": 0.25191642867879155, + "flos": 20996430912000.0, + "grad_norm": 2.0536634388060078, + "language_loss": 0.84721291, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.86884648, + "num_input_tokens_seen": 90369860, + "step": 4190, + "time_per_iteration": 2.731227397918701 + }, + { + "auxiliary_loss_clip": 0.01118209, + "auxiliary_loss_mlp": 0.01044895, + "balance_loss_clip": 1.04597688, + "balance_loss_mlp": 1.02633369, + "epoch": 0.2519765519314595, + "flos": 21105671149440.0, + "grad_norm": 2.0130913170662783, + "language_loss": 0.75695485, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.77858591, + "num_input_tokens_seen": 90389245, + "step": 4191, + "time_per_iteration": 2.7403173446655273 + }, + { + "auxiliary_loss_clip": 0.01048031, + "auxiliary_loss_mlp": 0.01014765, + "balance_loss_clip": 1.02375531, + "balance_loss_mlp": 1.0122261, + "epoch": 0.2520366751841275, + "flos": 62744993360640.0, + "grad_norm": 0.7280864395517058, + "language_loss": 0.57129633, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.59192419, + "num_input_tokens_seen": 90456735, + "step": 4192, + "time_per_iteration": 3.237478017807007 + }, + { + "auxiliary_loss_clip": 0.01121978, + "auxiliary_loss_mlp": 0.01041578, + "balance_loss_clip": 1.05535698, + "balance_loss_mlp": 1.02374434, + "epoch": 0.25209679843679544, + "flos": 22230716008320.0, + "grad_norm": 1.8423117439969312, + "language_loss": 0.76066267, + "learning_rate": 3.504487151087323e-06, + "loss": 0.78229821, + "num_input_tokens_seen": 90474165, + "step": 4193, + "time_per_iteration": 2.699486255645752 + }, + { + "auxiliary_loss_clip": 0.01137884, + "auxiliary_loss_mlp": 0.01046125, + "balance_loss_clip": 1.05232048, + "balance_loss_mlp": 1.02869618, + "epoch": 0.2521569216894634, + "flos": 12166643773440.0, + "grad_norm": 3.5003037089711437, + "language_loss": 0.84335077, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.86519086, + "num_input_tokens_seen": 90491660, + "step": 4194, + "time_per_iteration": 2.6561896800994873 + }, + { + "auxiliary_loss_clip": 0.01149932, + "auxiliary_loss_mlp": 0.01050793, + "balance_loss_clip": 1.05253458, + "balance_loss_mlp": 1.03461599, + "epoch": 0.2522170449421314, + "flos": 23699786353920.0, + "grad_norm": 1.3753304678825264, + "language_loss": 0.88249695, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.90450418, + "num_input_tokens_seen": 90514025, + "step": 4195, + "time_per_iteration": 2.67887806892395 + }, + { + "auxiliary_loss_clip": 0.01150202, + "auxiliary_loss_mlp": 0.01041959, + "balance_loss_clip": 1.05412734, + "balance_loss_mlp": 1.02199149, + "epoch": 0.25227716819479934, + "flos": 20955456472320.0, + "grad_norm": 2.4146072325129087, + "language_loss": 0.85488242, + "learning_rate": 3.503717062883053e-06, + "loss": 0.87680399, + "num_input_tokens_seen": 90533530, + "step": 4196, + "time_per_iteration": 2.6358916759490967 + }, + { + "auxiliary_loss_clip": 0.01137804, + "auxiliary_loss_mlp": 0.01049246, + "balance_loss_clip": 1.05213511, + "balance_loss_mlp": 1.03193665, + "epoch": 0.2523372914474673, + "flos": 23331342597120.0, + "grad_norm": 1.9329643035636839, + "language_loss": 0.8319478, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.8538183, + "num_input_tokens_seen": 90554025, + "step": 4197, + "time_per_iteration": 5.738839387893677 + }, + { + "auxiliary_loss_clip": 0.01140063, + "auxiliary_loss_mlp": 0.01051416, + "balance_loss_clip": 1.05392218, + "balance_loss_mlp": 1.03224671, + "epoch": 0.25239741470013527, + "flos": 36970321875840.0, + "grad_norm": 2.1358917159416104, + "language_loss": 0.72820318, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.75011802, + "num_input_tokens_seen": 90576930, + "step": 4198, + "time_per_iteration": 2.8819963932037354 + }, + { + "auxiliary_loss_clip": 0.01152924, + "auxiliary_loss_mlp": 0.01048555, + "balance_loss_clip": 1.05455935, + "balance_loss_mlp": 1.03045893, + "epoch": 0.25245753795280323, + "flos": 18515757836160.0, + "grad_norm": 6.722547943004915, + "language_loss": 0.76560014, + "learning_rate": 3.50294646148888e-06, + "loss": 0.78761488, + "num_input_tokens_seen": 90595710, + "step": 4199, + "time_per_iteration": 2.636993169784546 + }, + { + "auxiliary_loss_clip": 0.01125413, + "auxiliary_loss_mlp": 0.00776026, + "balance_loss_clip": 1.05274642, + "balance_loss_mlp": 1.00117147, + "epoch": 0.2525176612054712, + "flos": 32344884737280.0, + "grad_norm": 1.814097809936595, + "language_loss": 0.73571241, + "learning_rate": 3.502689480360739e-06, + "loss": 0.75472683, + "num_input_tokens_seen": 90617945, + "step": 4200, + "time_per_iteration": 4.297755002975464 + }, + { + "auxiliary_loss_clip": 0.01137136, + "auxiliary_loss_mlp": 0.01047957, + "balance_loss_clip": 1.05050063, + "balance_loss_mlp": 1.03187585, + "epoch": 0.25257778445813917, + "flos": 45258217459200.0, + "grad_norm": 1.6490086858694837, + "language_loss": 0.8223114, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.84416234, + "num_input_tokens_seen": 90640855, + "step": 4201, + "time_per_iteration": 2.859703302383423 + }, + { + "auxiliary_loss_clip": 0.01098423, + "auxiliary_loss_mlp": 0.01048, + "balance_loss_clip": 1.05422068, + "balance_loss_mlp": 1.03126347, + "epoch": 0.25263790771080713, + "flos": 23367791923200.0, + "grad_norm": 1.9307441853812024, + "language_loss": 0.74854887, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.77001321, + "num_input_tokens_seen": 90661350, + "step": 4202, + "time_per_iteration": 2.7475366592407227 + }, + { + "auxiliary_loss_clip": 0.01134371, + "auxiliary_loss_mlp": 0.01040637, + "balance_loss_clip": 1.05362439, + "balance_loss_mlp": 1.02392364, + "epoch": 0.25269803096347515, + "flos": 18515039564160.0, + "grad_norm": 1.882597455778369, + "language_loss": 0.7323755, + "learning_rate": 3.501918195122491e-06, + "loss": 0.75412554, + "num_input_tokens_seen": 90680540, + "step": 4203, + "time_per_iteration": 2.6547653675079346 + }, + { + "auxiliary_loss_clip": 0.01128208, + "auxiliary_loss_mlp": 0.01039636, + "balance_loss_clip": 1.05176711, + "balance_loss_mlp": 1.02239835, + "epoch": 0.2527581542161431, + "flos": 24610552629120.0, + "grad_norm": 1.4386036639708744, + "language_loss": 0.77731073, + "learning_rate": 3.501660986124297e-06, + "loss": 0.79898918, + "num_input_tokens_seen": 90703460, + "step": 4204, + "time_per_iteration": 4.4116432666778564 + }, + { + "auxiliary_loss_clip": 0.01115267, + "auxiliary_loss_mlp": 0.01052396, + "balance_loss_clip": 1.05262613, + "balance_loss_mlp": 1.03453815, + "epoch": 0.2528182774688111, + "flos": 12641275111680.0, + "grad_norm": 1.9357035590368088, + "language_loss": 0.72175288, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.74342954, + "num_input_tokens_seen": 90718815, + "step": 4205, + "time_per_iteration": 2.6750712394714355 + }, + { + "auxiliary_loss_clip": 0.01124756, + "auxiliary_loss_mlp": 0.01044172, + "balance_loss_clip": 1.05032194, + "balance_loss_mlp": 1.02801895, + "epoch": 0.25287840072147905, + "flos": 46936789879680.0, + "grad_norm": 1.4680577763339375, + "language_loss": 0.75594378, + "learning_rate": 3.50114639730826e-06, + "loss": 0.77763301, + "num_input_tokens_seen": 90742125, + "step": 4206, + "time_per_iteration": 2.876408815383911 + }, + { + "auxiliary_loss_clip": 0.01107683, + "auxiliary_loss_mlp": 0.01044618, + "balance_loss_clip": 1.04771221, + "balance_loss_mlp": 1.02780974, + "epoch": 0.252938523974147, + "flos": 18879712392960.0, + "grad_norm": 1.5378963492414741, + "language_loss": 0.78807724, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.80960023, + "num_input_tokens_seen": 90760785, + "step": 4207, + "time_per_iteration": 2.7176475524902344 + }, + { + "auxiliary_loss_clip": 0.01133715, + "auxiliary_loss_mlp": 0.01055631, + "balance_loss_clip": 1.0547328, + "balance_loss_mlp": 1.03984797, + "epoch": 0.252998647226815, + "flos": 21434720664960.0, + "grad_norm": 1.5723877129370716, + "language_loss": 0.76399815, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.78589159, + "num_input_tokens_seen": 90780045, + "step": 4208, + "time_per_iteration": 2.797658920288086 + }, + { + "auxiliary_loss_clip": 0.01131059, + "auxiliary_loss_mlp": 0.01040866, + "balance_loss_clip": 1.05162513, + "balance_loss_mlp": 1.02465391, + "epoch": 0.25305877047948294, + "flos": 25442171285760.0, + "grad_norm": 3.9595354320915166, + "language_loss": 0.69848049, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.72019976, + "num_input_tokens_seen": 90797980, + "step": 4209, + "time_per_iteration": 2.738159418106079 + }, + { + "auxiliary_loss_clip": 0.01046521, + "auxiliary_loss_mlp": 0.0100386, + "balance_loss_clip": 1.02250004, + "balance_loss_mlp": 1.0015471, + "epoch": 0.2531188937321509, + "flos": 60185603629440.0, + "grad_norm": 0.7787603502724176, + "language_loss": 0.55091059, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.57141441, + "num_input_tokens_seen": 90864865, + "step": 4210, + "time_per_iteration": 3.196953535079956 + }, + { + "auxiliary_loss_clip": 0.01113643, + "auxiliary_loss_mlp": 0.01038759, + "balance_loss_clip": 1.05103207, + "balance_loss_mlp": 1.02215338, + "epoch": 0.25317901698481887, + "flos": 19682387665920.0, + "grad_norm": 1.8504444580052586, + "language_loss": 0.8006835, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.82220757, + "num_input_tokens_seen": 90882885, + "step": 4211, + "time_per_iteration": 2.7241647243499756 + }, + { + "auxiliary_loss_clip": 0.01095085, + "auxiliary_loss_mlp": 0.01044368, + "balance_loss_clip": 1.04594803, + "balance_loss_mlp": 1.02844775, + "epoch": 0.25323914023748684, + "flos": 24424355502720.0, + "grad_norm": 1.531596575729193, + "language_loss": 0.78362429, + "learning_rate": 3.499601265005622e-06, + "loss": 0.80501878, + "num_input_tokens_seen": 90902985, + "step": 4212, + "time_per_iteration": 2.788607358932495 + }, + { + "auxiliary_loss_clip": 0.01133893, + "auxiliary_loss_mlp": 0.01041038, + "balance_loss_clip": 1.04857254, + "balance_loss_mlp": 1.02401471, + "epoch": 0.2532992634901548, + "flos": 25447450584960.0, + "grad_norm": 2.123277134845907, + "language_loss": 0.53516036, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.55690968, + "num_input_tokens_seen": 90923550, + "step": 4213, + "time_per_iteration": 2.6675784587860107 + }, + { + "auxiliary_loss_clip": 0.01120924, + "auxiliary_loss_mlp": 0.01044765, + "balance_loss_clip": 1.05005503, + "balance_loss_mlp": 1.0273726, + "epoch": 0.25335938674282277, + "flos": 18880538405760.0, + "grad_norm": 2.4965805840577002, + "language_loss": 0.65416414, + "learning_rate": 3.499085765880308e-06, + "loss": 0.67582107, + "num_input_tokens_seen": 90943260, + "step": 4214, + "time_per_iteration": 2.691359281539917 + }, + { + "auxiliary_loss_clip": 0.01046401, + "auxiliary_loss_mlp": 0.01002761, + "balance_loss_clip": 1.02238619, + "balance_loss_mlp": 1.00056791, + "epoch": 0.25341950999549073, + "flos": 53062649936640.0, + "grad_norm": 0.8515065776804692, + "language_loss": 0.58004916, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60054076, + "num_input_tokens_seen": 90996295, + "step": 4215, + "time_per_iteration": 2.981840133666992 + }, + { + "auxiliary_loss_clip": 0.01124794, + "auxiliary_loss_mlp": 0.01043531, + "balance_loss_clip": 1.05316496, + "balance_loss_mlp": 1.02655554, + "epoch": 0.2534796332481587, + "flos": 39020247054720.0, + "grad_norm": 1.7497766885830588, + "language_loss": 0.83251095, + "learning_rate": 3.498570039373066e-06, + "loss": 0.85419416, + "num_input_tokens_seen": 91017545, + "step": 4216, + "time_per_iteration": 2.912137508392334 + }, + { + "auxiliary_loss_clip": 0.0112972, + "auxiliary_loss_mlp": 0.01040052, + "balance_loss_clip": 1.05088937, + "balance_loss_mlp": 1.02338624, + "epoch": 0.2535397565008267, + "flos": 23586990670080.0, + "grad_norm": 3.3733415491927996, + "language_loss": 0.80008072, + "learning_rate": 3.498312090875666e-06, + "loss": 0.82177842, + "num_input_tokens_seen": 91037715, + "step": 4217, + "time_per_iteration": 2.6532363891601562 + }, + { + "auxiliary_loss_clip": 0.01116019, + "auxiliary_loss_mlp": 0.01038346, + "balance_loss_clip": 1.04436612, + "balance_loss_mlp": 1.02234793, + "epoch": 0.2535998797534947, + "flos": 19281373251840.0, + "grad_norm": 2.333881972650505, + "language_loss": 0.75585902, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.77740264, + "num_input_tokens_seen": 91055295, + "step": 4218, + "time_per_iteration": 2.650867223739624 + }, + { + "auxiliary_loss_clip": 0.0113544, + "auxiliary_loss_mlp": 0.0104021, + "balance_loss_clip": 1.04940748, + "balance_loss_mlp": 1.0229727, + "epoch": 0.25366000300616265, + "flos": 24024382583040.0, + "grad_norm": 2.040148074486094, + "language_loss": 0.74188256, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.76363909, + "num_input_tokens_seen": 91075485, + "step": 4219, + "time_per_iteration": 2.727161169052124 + }, + { + "auxiliary_loss_clip": 0.01138406, + "auxiliary_loss_mlp": 0.01048455, + "balance_loss_clip": 1.05222011, + "balance_loss_mlp": 1.03138447, + "epoch": 0.2537201262588306, + "flos": 16289368116480.0, + "grad_norm": 4.990704095966988, + "language_loss": 0.81355274, + "learning_rate": 3.497537904525736e-06, + "loss": 0.83542132, + "num_input_tokens_seen": 91093620, + "step": 4220, + "time_per_iteration": 2.6146652698516846 + }, + { + "auxiliary_loss_clip": 0.01100698, + "auxiliary_loss_mlp": 0.01049127, + "balance_loss_clip": 1.04988587, + "balance_loss_mlp": 1.03041148, + "epoch": 0.2537802495114986, + "flos": 23294677789440.0, + "grad_norm": 2.3092995740689197, + "language_loss": 0.70819569, + "learning_rate": 3.497279728822468e-06, + "loss": 0.72969389, + "num_input_tokens_seen": 91114110, + "step": 4221, + "time_per_iteration": 2.851747751235962 + }, + { + "auxiliary_loss_clip": 0.0114682, + "auxiliary_loss_mlp": 0.01039444, + "balance_loss_clip": 1.05224657, + "balance_loss_mlp": 1.02257586, + "epoch": 0.25384037276416654, + "flos": 17639142416640.0, + "grad_norm": 2.4229893622177188, + "language_loss": 0.61689377, + "learning_rate": 3.497021496342202e-06, + "loss": 0.63875645, + "num_input_tokens_seen": 91133135, + "step": 4222, + "time_per_iteration": 2.6394412517547607 + }, + { + "auxiliary_loss_clip": 0.01138378, + "auxiliary_loss_mlp": 0.01051871, + "balance_loss_clip": 1.05371165, + "balance_loss_mlp": 1.03528929, + "epoch": 0.2539004960168345, + "flos": 21507044699520.0, + "grad_norm": 1.6839261376783914, + "language_loss": 0.74744058, + "learning_rate": 3.496763207094731e-06, + "loss": 0.76934308, + "num_input_tokens_seen": 91151805, + "step": 4223, + "time_per_iteration": 2.648322105407715 + }, + { + "auxiliary_loss_clip": 0.01092255, + "auxiliary_loss_mlp": 0.01039082, + "balance_loss_clip": 1.04767203, + "balance_loss_mlp": 1.02325082, + "epoch": 0.2539606192695025, + "flos": 23950909313280.0, + "grad_norm": 1.7092524284111348, + "language_loss": 0.80226004, + "learning_rate": 3.49650486108985e-06, + "loss": 0.82357341, + "num_input_tokens_seen": 91172270, + "step": 4224, + "time_per_iteration": 2.7572662830352783 + }, + { + "auxiliary_loss_clip": 0.01130506, + "auxiliary_loss_mlp": 0.00774076, + "balance_loss_clip": 1.05102324, + "balance_loss_mlp": 1.00112057, + "epoch": 0.25402074252217044, + "flos": 24169784837760.0, + "grad_norm": 1.4497407173280796, + "language_loss": 0.77330017, + "learning_rate": 3.496246458337354e-06, + "loss": 0.792346, + "num_input_tokens_seen": 91192080, + "step": 4225, + "time_per_iteration": 2.7661190032958984 + }, + { + "auxiliary_loss_clip": 0.01130647, + "auxiliary_loss_mlp": 0.01049954, + "balance_loss_clip": 1.04919255, + "balance_loss_mlp": 1.03271639, + "epoch": 0.2540808657748384, + "flos": 22303758314880.0, + "grad_norm": 2.0615353379683055, + "language_loss": 0.84638137, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.86818743, + "num_input_tokens_seen": 91211450, + "step": 4226, + "time_per_iteration": 2.690683126449585 + }, + { + "auxiliary_loss_clip": 0.01143268, + "auxiliary_loss_mlp": 0.01043336, + "balance_loss_clip": 1.05067408, + "balance_loss_mlp": 1.02613425, + "epoch": 0.25414098902750637, + "flos": 27599541022080.0, + "grad_norm": 1.5600656222031943, + "language_loss": 0.70886129, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.73072731, + "num_input_tokens_seen": 91231835, + "step": 4227, + "time_per_iteration": 2.6647307872772217 + }, + { + "auxiliary_loss_clip": 0.01055229, + "auxiliary_loss_mlp": 0.01001956, + "balance_loss_clip": 1.02168798, + "balance_loss_mlp": 0.9995476, + "epoch": 0.25420111228017434, + "flos": 58170834887040.0, + "grad_norm": 0.9869295588353136, + "language_loss": 0.61927998, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.63985181, + "num_input_tokens_seen": 91288755, + "step": 4228, + "time_per_iteration": 2.986067533493042 + }, + { + "auxiliary_loss_clip": 0.01124878, + "auxiliary_loss_mlp": 0.01040149, + "balance_loss_clip": 1.0464859, + "balance_loss_mlp": 1.02212501, + "epoch": 0.2542612355328423, + "flos": 11464409905920.0, + "grad_norm": 2.314170874410929, + "language_loss": 0.86946094, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.8911112, + "num_input_tokens_seen": 91302485, + "step": 4229, + "time_per_iteration": 2.629518985748291 + }, + { + "auxiliary_loss_clip": 0.01102882, + "auxiliary_loss_mlp": 0.01042519, + "balance_loss_clip": 1.04811144, + "balance_loss_mlp": 1.0241369, + "epoch": 0.2543213587855103, + "flos": 22965879669120.0, + "grad_norm": 1.7811216632522446, + "language_loss": 0.77265114, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.79410517, + "num_input_tokens_seen": 91321120, + "step": 4230, + "time_per_iteration": 2.715655565261841 + }, + { + "auxiliary_loss_clip": 0.01133364, + "auxiliary_loss_mlp": 0.01047482, + "balance_loss_clip": 1.0504818, + "balance_loss_mlp": 1.03074503, + "epoch": 0.2543814820381783, + "flos": 18253178438400.0, + "grad_norm": 1.8956341732473607, + "language_loss": 0.7550717, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.77688015, + "num_input_tokens_seen": 91338575, + "step": 4231, + "time_per_iteration": 2.6945316791534424 + }, + { + "auxiliary_loss_clip": 0.0113214, + "auxiliary_loss_mlp": 0.01038979, + "balance_loss_clip": 1.04939127, + "balance_loss_mlp": 1.0230999, + "epoch": 0.25444160529084625, + "flos": 15632705629440.0, + "grad_norm": 1.6179274617095247, + "language_loss": 0.73618764, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.75789881, + "num_input_tokens_seen": 91357355, + "step": 4232, + "time_per_iteration": 2.6219112873077393 + }, + { + "auxiliary_loss_clip": 0.01149145, + "auxiliary_loss_mlp": 0.01043104, + "balance_loss_clip": 1.05579972, + "balance_loss_mlp": 1.02589035, + "epoch": 0.2545017285435142, + "flos": 24601610142720.0, + "grad_norm": 2.2856831174377388, + "language_loss": 0.86333203, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.88525456, + "num_input_tokens_seen": 91376515, + "step": 4233, + "time_per_iteration": 2.675877809524536 + }, + { + "auxiliary_loss_clip": 0.01080108, + "auxiliary_loss_mlp": 0.01040124, + "balance_loss_clip": 1.04641938, + "balance_loss_mlp": 1.02457917, + "epoch": 0.2545618517961822, + "flos": 24679069822080.0, + "grad_norm": 1.5382450997432586, + "language_loss": 0.75319451, + "learning_rate": 3.493918281539737e-06, + "loss": 0.77439684, + "num_input_tokens_seen": 91397595, + "step": 4234, + "time_per_iteration": 2.9050087928771973 + }, + { + "auxiliary_loss_clip": 0.01117427, + "auxiliary_loss_mlp": 0.01044439, + "balance_loss_clip": 1.05171227, + "balance_loss_mlp": 1.02897787, + "epoch": 0.25462197504885015, + "flos": 23915106432000.0, + "grad_norm": 2.6382014960101765, + "language_loss": 0.74923635, + "learning_rate": 3.493659311850379e-06, + "loss": 0.77085495, + "num_input_tokens_seen": 91417775, + "step": 4235, + "time_per_iteration": 2.788041353225708 + }, + { + "auxiliary_loss_clip": 0.01124445, + "auxiliary_loss_mlp": 0.00776537, + "balance_loss_clip": 1.05315781, + "balance_loss_mlp": 1.00115323, + "epoch": 0.2546820983015181, + "flos": 24789387467520.0, + "grad_norm": 1.9882672691222136, + "language_loss": 0.64451182, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.66352159, + "num_input_tokens_seen": 91437665, + "step": 4236, + "time_per_iteration": 2.8649141788482666 + }, + { + "auxiliary_loss_clip": 0.01144465, + "auxiliary_loss_mlp": 0.01036249, + "balance_loss_clip": 1.05185175, + "balance_loss_mlp": 1.02122915, + "epoch": 0.2547422215541861, + "flos": 18734130570240.0, + "grad_norm": 1.6410229940010734, + "language_loss": 0.6714325, + "learning_rate": 3.493141202562354e-06, + "loss": 0.69323969, + "num_input_tokens_seen": 91456705, + "step": 4237, + "time_per_iteration": 4.262012958526611 + }, + { + "auxiliary_loss_clip": 0.01147064, + "auxiliary_loss_mlp": 0.01049012, + "balance_loss_clip": 1.05240059, + "balance_loss_mlp": 1.03203678, + "epoch": 0.25480234480685404, + "flos": 21032449274880.0, + "grad_norm": 2.0013967295828237, + "language_loss": 0.75415373, + "learning_rate": 3.492882062983333e-06, + "loss": 0.77611452, + "num_input_tokens_seen": 91475535, + "step": 4238, + "time_per_iteration": 2.6378636360168457 + }, + { + "auxiliary_loss_clip": 0.01137265, + "auxiliary_loss_mlp": 0.01046047, + "balance_loss_clip": 1.05366278, + "balance_loss_mlp": 1.02843964, + "epoch": 0.254862468059522, + "flos": 25082167224960.0, + "grad_norm": 3.4417299363308613, + "language_loss": 0.80712521, + "learning_rate": 3.492622866794074e-06, + "loss": 0.82895833, + "num_input_tokens_seen": 91499140, + "step": 4239, + "time_per_iteration": 4.348390579223633 + }, + { + "auxiliary_loss_clip": 0.01128023, + "auxiliary_loss_mlp": 0.01045872, + "balance_loss_clip": 1.0522213, + "balance_loss_mlp": 1.02870631, + "epoch": 0.25492259131219, + "flos": 20558392554240.0, + "grad_norm": 1.7312526359597522, + "language_loss": 0.77521586, + "learning_rate": 3.492363614004407e-06, + "loss": 0.79695487, + "num_input_tokens_seen": 91518335, + "step": 4240, + "time_per_iteration": 2.7501273155212402 + }, + { + "auxiliary_loss_clip": 0.01151347, + "auxiliary_loss_mlp": 0.01040734, + "balance_loss_clip": 1.05296493, + "balance_loss_mlp": 1.0226146, + "epoch": 0.25498271456485794, + "flos": 25042485674880.0, + "grad_norm": 3.3593092651087595, + "language_loss": 0.83430749, + "learning_rate": 3.492104304624162e-06, + "loss": 0.85622829, + "num_input_tokens_seen": 91537655, + "step": 4241, + "time_per_iteration": 2.7480928897857666 + }, + { + "auxiliary_loss_clip": 0.01137407, + "auxiliary_loss_mlp": 0.01045384, + "balance_loss_clip": 1.05306387, + "balance_loss_mlp": 1.02887392, + "epoch": 0.2550428378175259, + "flos": 26178412354560.0, + "grad_norm": 1.6379574895871623, + "language_loss": 0.73322648, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.75505441, + "num_input_tokens_seen": 91557545, + "step": 4242, + "time_per_iteration": 2.713635206222534 + }, + { + "auxiliary_loss_clip": 0.0114709, + "auxiliary_loss_mlp": 0.00774169, + "balance_loss_clip": 1.05182981, + "balance_loss_mlp": 1.00115824, + "epoch": 0.2551029610701939, + "flos": 15267170874240.0, + "grad_norm": 3.2486673035230993, + "language_loss": 0.72336024, + "learning_rate": 3.491585516131273e-06, + "loss": 0.7425729, + "num_input_tokens_seen": 91574405, + "step": 4243, + "time_per_iteration": 4.298815727233887 + }, + { + "auxiliary_loss_clip": 0.0113532, + "auxiliary_loss_mlp": 0.01045095, + "balance_loss_clip": 1.05183125, + "balance_loss_mlp": 1.02797616, + "epoch": 0.2551630843228619, + "flos": 18112193556480.0, + "grad_norm": 1.8323151946393021, + "language_loss": 0.82076979, + "learning_rate": 3.491326037038301e-06, + "loss": 0.842574, + "num_input_tokens_seen": 91593755, + "step": 4244, + "time_per_iteration": 2.6497015953063965 + }, + { + "auxiliary_loss_clip": 0.01054616, + "auxiliary_loss_mlp": 0.01017916, + "balance_loss_clip": 1.03294289, + "balance_loss_mlp": 1.01572227, + "epoch": 0.25522320757552985, + "flos": 70520192167680.0, + "grad_norm": 0.6914168393706984, + "language_loss": 0.57701397, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.59773928, + "num_input_tokens_seen": 91660335, + "step": 4245, + "time_per_iteration": 3.2938833236694336 + }, + { + "auxiliary_loss_clip": 0.01146552, + "auxiliary_loss_mlp": 0.01052395, + "balance_loss_clip": 1.0508852, + "balance_loss_mlp": 1.03577745, + "epoch": 0.2552833308281978, + "flos": 22893088757760.0, + "grad_norm": 2.1326330958670567, + "language_loss": 0.65120399, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.6731934, + "num_input_tokens_seen": 91678500, + "step": 4246, + "time_per_iteration": 2.5949065685272217 + }, + { + "auxiliary_loss_clip": 0.01127579, + "auxiliary_loss_mlp": 0.01044633, + "balance_loss_clip": 1.04806828, + "balance_loss_mlp": 1.02944601, + "epoch": 0.2553434540808658, + "flos": 22053605022720.0, + "grad_norm": 1.7151532201527704, + "language_loss": 0.81580049, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.83752257, + "num_input_tokens_seen": 91696430, + "step": 4247, + "time_per_iteration": 2.673624277114868 + }, + { + "auxiliary_loss_clip": 0.01140059, + "auxiliary_loss_mlp": 0.01044068, + "balance_loss_clip": 1.05152941, + "balance_loss_mlp": 1.02543616, + "epoch": 0.25540357733353375, + "flos": 16544190176640.0, + "grad_norm": 2.241724474505105, + "language_loss": 0.83335149, + "learning_rate": 3.490287555252514e-06, + "loss": 0.85519278, + "num_input_tokens_seen": 91713270, + "step": 4248, + "time_per_iteration": 2.617570400238037 + }, + { + "auxiliary_loss_clip": 0.01112618, + "auxiliary_loss_mlp": 0.01042154, + "balance_loss_clip": 1.04433584, + "balance_loss_mlp": 1.02458215, + "epoch": 0.2554637005862017, + "flos": 17565022702080.0, + "grad_norm": 2.084670538042193, + "language_loss": 0.84011936, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.8616671, + "num_input_tokens_seen": 91728865, + "step": 4249, + "time_per_iteration": 2.6617467403411865 + }, + { + "auxiliary_loss_clip": 0.01001275, + "auxiliary_loss_mlp": 0.01002657, + "balance_loss_clip": 1.0228157, + "balance_loss_mlp": 0.9996174, + "epoch": 0.2555238238388697, + "flos": 72244763953920.0, + "grad_norm": 0.765792812565725, + "language_loss": 0.56274796, + "learning_rate": 3.489767975249115e-06, + "loss": 0.58278728, + "num_input_tokens_seen": 91787470, + "step": 4250, + "time_per_iteration": 3.24300479888916 + }, + { + "auxiliary_loss_clip": 0.01117816, + "auxiliary_loss_mlp": 0.01036136, + "balance_loss_clip": 1.04929769, + "balance_loss_mlp": 1.01839769, + "epoch": 0.25558394709153764, + "flos": 24389414547840.0, + "grad_norm": 2.294460262471245, + "language_loss": 0.80566651, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.82720602, + "num_input_tokens_seen": 91805640, + "step": 4251, + "time_per_iteration": 2.732752561569214 + }, + { + "auxiliary_loss_clip": 0.01030367, + "auxiliary_loss_mlp": 0.01001193, + "balance_loss_clip": 1.02468216, + "balance_loss_mlp": 0.99888068, + "epoch": 0.2556440703442056, + "flos": 69231213636480.0, + "grad_norm": 0.7932625116211053, + "language_loss": 0.6608988, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.68121445, + "num_input_tokens_seen": 91869695, + "step": 4252, + "time_per_iteration": 3.304985523223877 + }, + { + "auxiliary_loss_clip": 0.01130428, + "auxiliary_loss_mlp": 0.01036056, + "balance_loss_clip": 1.0499115, + "balance_loss_mlp": 1.02097619, + "epoch": 0.2557041935968736, + "flos": 24863902231680.0, + "grad_norm": 2.60951363435401, + "language_loss": 0.73882902, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.76049387, + "num_input_tokens_seen": 91889920, + "step": 4253, + "time_per_iteration": 2.706052303314209 + }, + { + "auxiliary_loss_clip": 0.01097964, + "auxiliary_loss_mlp": 0.01044298, + "balance_loss_clip": 1.04340124, + "balance_loss_mlp": 1.02782309, + "epoch": 0.25576431684954154, + "flos": 22492110257280.0, + "grad_norm": 2.978807414856607, + "language_loss": 0.72565317, + "learning_rate": 3.488728137415357e-06, + "loss": 0.7470758, + "num_input_tokens_seen": 91908665, + "step": 4254, + "time_per_iteration": 2.7579715251922607 + }, + { + "auxiliary_loss_clip": 0.01098791, + "auxiliary_loss_mlp": 0.00774228, + "balance_loss_clip": 1.04665136, + "balance_loss_mlp": 1.001104, + "epoch": 0.2558244401022095, + "flos": 19826748426240.0, + "grad_norm": 1.7240740787107458, + "language_loss": 0.80729312, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.82602334, + "num_input_tokens_seen": 91927855, + "step": 4255, + "time_per_iteration": 2.788978099822998 + }, + { + "auxiliary_loss_clip": 0.01124525, + "auxiliary_loss_mlp": 0.01040748, + "balance_loss_clip": 1.05111384, + "balance_loss_mlp": 1.02414227, + "epoch": 0.2558845633548775, + "flos": 23220486247680.0, + "grad_norm": 1.5275751549355678, + "language_loss": 0.85734111, + "learning_rate": 3.488207879742721e-06, + "loss": 0.87899381, + "num_input_tokens_seen": 91948500, + "step": 4256, + "time_per_iteration": 2.7916831970214844 + }, + { + "auxiliary_loss_clip": 0.01102599, + "auxiliary_loss_mlp": 0.01049743, + "balance_loss_clip": 1.04525566, + "balance_loss_mlp": 1.03164732, + "epoch": 0.2559446866075455, + "flos": 16837867774080.0, + "grad_norm": 1.8301502951270987, + "language_loss": 0.74872649, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.77024996, + "num_input_tokens_seen": 91968375, + "step": 4257, + "time_per_iteration": 2.7754952907562256 + }, + { + "auxiliary_loss_clip": 0.0102418, + "auxiliary_loss_mlp": 0.01011535, + "balance_loss_clip": 1.03534186, + "balance_loss_mlp": 1.00959146, + "epoch": 0.25600480986021346, + "flos": 57593786895360.0, + "grad_norm": 0.8003890262370261, + "language_loss": 0.65255105, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.67290819, + "num_input_tokens_seen": 92028490, + "step": 4258, + "time_per_iteration": 3.269063949584961 + }, + { + "auxiliary_loss_clip": 0.01091736, + "auxiliary_loss_mlp": 0.00773347, + "balance_loss_clip": 1.04549718, + "balance_loss_mlp": 1.00111449, + "epoch": 0.2560649331128814, + "flos": 27819529868160.0, + "grad_norm": 1.5266978755669562, + "language_loss": 0.76443565, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.78308654, + "num_input_tokens_seen": 92048060, + "step": 4259, + "time_per_iteration": 2.805574893951416 + }, + { + "auxiliary_loss_clip": 0.01026212, + "auxiliary_loss_mlp": 0.01016368, + "balance_loss_clip": 1.02208054, + "balance_loss_mlp": 1.01372147, + "epoch": 0.2561250563655494, + "flos": 70950509101440.0, + "grad_norm": 0.7927643603688844, + "language_loss": 0.58455491, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.60498071, + "num_input_tokens_seen": 92118180, + "step": 4260, + "time_per_iteration": 3.3904550075531006 + }, + { + "auxiliary_loss_clip": 0.01133193, + "auxiliary_loss_mlp": 0.01048996, + "balance_loss_clip": 1.04874313, + "balance_loss_mlp": 1.03204465, + "epoch": 0.25618517961821735, + "flos": 27012329481600.0, + "grad_norm": 3.3188145253338543, + "language_loss": 0.77064955, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.79247141, + "num_input_tokens_seen": 92137570, + "step": 4261, + "time_per_iteration": 2.769864082336426 + }, + { + "auxiliary_loss_clip": 0.01144035, + "auxiliary_loss_mlp": 0.01040091, + "balance_loss_clip": 1.05178332, + "balance_loss_mlp": 1.02465355, + "epoch": 0.2562453028708853, + "flos": 23068296322560.0, + "grad_norm": 1.5699122250769224, + "language_loss": 0.83367205, + "learning_rate": 3.486645752648842e-06, + "loss": 0.85551333, + "num_input_tokens_seen": 92157625, + "step": 4262, + "time_per_iteration": 2.682828426361084 + }, + { + "auxiliary_loss_clip": 0.01134556, + "auxiliary_loss_mlp": 0.01041299, + "balance_loss_clip": 1.05219626, + "balance_loss_mlp": 1.02344143, + "epoch": 0.2563054261235533, + "flos": 15120942606720.0, + "grad_norm": 2.340862226505914, + "language_loss": 0.73892939, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.76068795, + "num_input_tokens_seen": 92175350, + "step": 4263, + "time_per_iteration": 2.70947003364563 + }, + { + "auxiliary_loss_clip": 0.0111297, + "auxiliary_loss_mlp": 0.00773371, + "balance_loss_clip": 1.05221081, + "balance_loss_mlp": 1.00093555, + "epoch": 0.25636554937622125, + "flos": 27854865872640.0, + "grad_norm": 1.8143922917988324, + "language_loss": 0.82766259, + "learning_rate": 3.486124592522163e-06, + "loss": 0.84652603, + "num_input_tokens_seen": 92196070, + "step": 4264, + "time_per_iteration": 2.7249553203582764 + }, + { + "auxiliary_loss_clip": 0.01133012, + "auxiliary_loss_mlp": 0.01041877, + "balance_loss_clip": 1.05265546, + "balance_loss_mlp": 1.02468669, + "epoch": 0.2564256726288892, + "flos": 28906509288960.0, + "grad_norm": 2.8986425954305206, + "language_loss": 0.74346334, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.76521224, + "num_input_tokens_seen": 92216310, + "step": 4265, + "time_per_iteration": 2.7149150371551514 + }, + { + "auxiliary_loss_clip": 0.01110152, + "auxiliary_loss_mlp": 0.01036531, + "balance_loss_clip": 1.04754925, + "balance_loss_mlp": 1.02034247, + "epoch": 0.2564857958815572, + "flos": 18514931823360.0, + "grad_norm": 15.50909821859273, + "language_loss": 0.81623137, + "learning_rate": 3.485603206979513e-06, + "loss": 0.83769822, + "num_input_tokens_seen": 92234510, + "step": 4266, + "time_per_iteration": 2.6890153884887695 + }, + { + "auxiliary_loss_clip": 0.01083702, + "auxiliary_loss_mlp": 0.01050109, + "balance_loss_clip": 1.0468955, + "balance_loss_mlp": 1.0318346, + "epoch": 0.25654591913422514, + "flos": 25808280658560.0, + "grad_norm": 2.4522850064786037, + "language_loss": 0.79120672, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.81254482, + "num_input_tokens_seen": 92254070, + "step": 4267, + "time_per_iteration": 2.8390700817108154 + }, + { + "auxiliary_loss_clip": 0.01094597, + "auxiliary_loss_mlp": 0.01044608, + "balance_loss_clip": 1.04643822, + "balance_loss_mlp": 1.0276804, + "epoch": 0.2566060423868931, + "flos": 19099665325440.0, + "grad_norm": 1.6765306902124857, + "language_loss": 0.79241312, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.81380516, + "num_input_tokens_seen": 92275060, + "step": 4268, + "time_per_iteration": 2.7324178218841553 + }, + { + "auxiliary_loss_clip": 0.01106667, + "auxiliary_loss_mlp": 0.00778662, + "balance_loss_clip": 1.04940808, + "balance_loss_mlp": 1.00098729, + "epoch": 0.25666616563956113, + "flos": 23842674656640.0, + "grad_norm": 1.8248642507450341, + "language_loss": 0.67737979, + "learning_rate": 3.484820706183595e-06, + "loss": 0.69623303, + "num_input_tokens_seen": 92293610, + "step": 4269, + "time_per_iteration": 2.7897677421569824 + }, + { + "auxiliary_loss_clip": 0.01123993, + "auxiliary_loss_mlp": 0.01043408, + "balance_loss_clip": 1.05155373, + "balance_loss_mlp": 1.02596736, + "epoch": 0.2567262888922291, + "flos": 14604259420800.0, + "grad_norm": 3.069203267679029, + "language_loss": 0.79117787, + "learning_rate": 3.484559759962666e-06, + "loss": 0.81285185, + "num_input_tokens_seen": 92308305, + "step": 4270, + "time_per_iteration": 2.8076114654541016 + }, + { + "auxiliary_loss_clip": 0.01094814, + "auxiliary_loss_mlp": 0.010436, + "balance_loss_clip": 1.04357839, + "balance_loss_mlp": 1.02393079, + "epoch": 0.25678641214489706, + "flos": 32923117877760.0, + "grad_norm": 2.413207422396751, + "language_loss": 0.68088073, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.7022649, + "num_input_tokens_seen": 92329875, + "step": 4271, + "time_per_iteration": 2.8195667266845703 + }, + { + "auxiliary_loss_clip": 0.01136281, + "auxiliary_loss_mlp": 0.00774788, + "balance_loss_clip": 1.05146289, + "balance_loss_mlp": 1.00110972, + "epoch": 0.256846535397565, + "flos": 24098933260800.0, + "grad_norm": 3.3671515903121216, + "language_loss": 0.87362605, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.89273679, + "num_input_tokens_seen": 92348780, + "step": 4272, + "time_per_iteration": 2.6910364627838135 + }, + { + "auxiliary_loss_clip": 0.01122968, + "auxiliary_loss_mlp": 0.01046328, + "balance_loss_clip": 1.05348301, + "balance_loss_mlp": 1.02854192, + "epoch": 0.256906658650233, + "flos": 19718441942400.0, + "grad_norm": 1.6813472119330561, + "language_loss": 0.81420678, + "learning_rate": 3.483776583571541e-06, + "loss": 0.83589977, + "num_input_tokens_seen": 92368175, + "step": 4273, + "time_per_iteration": 2.6883673667907715 + }, + { + "auxiliary_loss_clip": 0.01097944, + "auxiliary_loss_mlp": 0.01041741, + "balance_loss_clip": 1.043715, + "balance_loss_mlp": 1.02459884, + "epoch": 0.25696678190290095, + "flos": 22926018551040.0, + "grad_norm": 3.3251008044769947, + "language_loss": 0.76944637, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.79084325, + "num_input_tokens_seen": 92387755, + "step": 4274, + "time_per_iteration": 2.7613401412963867 + }, + { + "auxiliary_loss_clip": 0.01112797, + "auxiliary_loss_mlp": 0.01039272, + "balance_loss_clip": 1.04380774, + "balance_loss_mlp": 1.02220166, + "epoch": 0.2570269051555689, + "flos": 27307838672640.0, + "grad_norm": 2.1172072427968933, + "language_loss": 0.83780324, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.85932392, + "num_input_tokens_seen": 92409850, + "step": 4275, + "time_per_iteration": 2.7835779190063477 + }, + { + "auxiliary_loss_clip": 0.01120289, + "auxiliary_loss_mlp": 0.01039714, + "balance_loss_clip": 1.05141211, + "balance_loss_mlp": 1.02223814, + "epoch": 0.2570870284082369, + "flos": 27563414918400.0, + "grad_norm": 2.725989678545036, + "language_loss": 0.7874397, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.80903983, + "num_input_tokens_seen": 92431250, + "step": 4276, + "time_per_iteration": 5.679298400878906 + }, + { + "auxiliary_loss_clip": 0.01136261, + "auxiliary_loss_mlp": 0.01046327, + "balance_loss_clip": 1.05269814, + "balance_loss_mlp": 1.02982879, + "epoch": 0.25714715166090485, + "flos": 28730834847360.0, + "grad_norm": 4.7083902318823885, + "language_loss": 0.79273927, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.81456512, + "num_input_tokens_seen": 92452065, + "step": 4277, + "time_per_iteration": 2.691035270690918 + }, + { + "auxiliary_loss_clip": 0.01146238, + "auxiliary_loss_mlp": 0.01040113, + "balance_loss_clip": 1.05214763, + "balance_loss_mlp": 1.02367437, + "epoch": 0.2572072749135728, + "flos": 20116152305280.0, + "grad_norm": 2.017980063834791, + "language_loss": 0.78986102, + "learning_rate": 3.482470164419295e-06, + "loss": 0.81172454, + "num_input_tokens_seen": 92470025, + "step": 4278, + "time_per_iteration": 4.2404680252075195 + }, + { + "auxiliary_loss_clip": 0.01126121, + "auxiliary_loss_mlp": 0.01037901, + "balance_loss_clip": 1.05176449, + "balance_loss_mlp": 1.02102113, + "epoch": 0.2572673981662408, + "flos": 26030855283840.0, + "grad_norm": 2.8070462448385904, + "language_loss": 0.74898899, + "learning_rate": 3.482208711902952e-06, + "loss": 0.77062923, + "num_input_tokens_seen": 92489825, + "step": 4279, + "time_per_iteration": 2.65977144241333 + }, + { + "auxiliary_loss_clip": 0.01134686, + "auxiliary_loss_mlp": 0.01051687, + "balance_loss_clip": 1.04973292, + "balance_loss_mlp": 1.03423464, + "epoch": 0.25732752141890874, + "flos": 16106618695680.0, + "grad_norm": 2.4256697448035687, + "language_loss": 0.85603923, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.87790298, + "num_input_tokens_seen": 92507270, + "step": 4280, + "time_per_iteration": 2.6072864532470703 + }, + { + "auxiliary_loss_clip": 0.01136623, + "auxiliary_loss_mlp": 0.01039056, + "balance_loss_clip": 1.05183434, + "balance_loss_mlp": 1.02147269, + "epoch": 0.2573876446715767, + "flos": 22524429519360.0, + "grad_norm": 3.9579835716917695, + "language_loss": 0.79381943, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.8155762, + "num_input_tokens_seen": 92526300, + "step": 4281, + "time_per_iteration": 2.613163471221924 + }, + { + "auxiliary_loss_clip": 0.01110196, + "auxiliary_loss_mlp": 0.01038018, + "balance_loss_clip": 1.04847932, + "balance_loss_mlp": 1.02099478, + "epoch": 0.2574477679242447, + "flos": 23950837486080.0, + "grad_norm": 2.240063499401578, + "language_loss": 0.87314785, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.89462996, + "num_input_tokens_seen": 92546465, + "step": 4282, + "time_per_iteration": 4.489396333694458 + }, + { + "auxiliary_loss_clip": 0.01148783, + "auxiliary_loss_mlp": 0.01046594, + "balance_loss_clip": 1.0526619, + "balance_loss_mlp": 1.02959502, + "epoch": 0.2575078911769127, + "flos": 21981711951360.0, + "grad_norm": 1.5167715532309152, + "language_loss": 0.70110047, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.72305429, + "num_input_tokens_seen": 92567260, + "step": 4283, + "time_per_iteration": 2.619131565093994 + }, + { + "auxiliary_loss_clip": 0.01144466, + "auxiliary_loss_mlp": 0.00774605, + "balance_loss_clip": 1.05443883, + "balance_loss_mlp": 1.0010494, + "epoch": 0.25756801442958066, + "flos": 21945406279680.0, + "grad_norm": 1.950947388276708, + "language_loss": 0.80411774, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.82330847, + "num_input_tokens_seen": 92585425, + "step": 4284, + "time_per_iteration": 2.656998634338379 + }, + { + "auxiliary_loss_clip": 0.01105473, + "auxiliary_loss_mlp": 0.01039993, + "balance_loss_clip": 1.05797076, + "balance_loss_mlp": 1.02488899, + "epoch": 0.2576281376822486, + "flos": 35261980058880.0, + "grad_norm": 2.2559612506718434, + "language_loss": 0.70473522, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.72618985, + "num_input_tokens_seen": 92604770, + "step": 4285, + "time_per_iteration": 2.880835771560669 + }, + { + "auxiliary_loss_clip": 0.01127807, + "auxiliary_loss_mlp": 0.0104515, + "balance_loss_clip": 1.05229783, + "balance_loss_mlp": 1.02971268, + "epoch": 0.2576882609349166, + "flos": 14132285688960.0, + "grad_norm": 1.8739093647405893, + "language_loss": 0.58494061, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.6066702, + "num_input_tokens_seen": 92622635, + "step": 4286, + "time_per_iteration": 2.63923978805542 + }, + { + "auxiliary_loss_clip": 0.01138174, + "auxiliary_loss_mlp": 0.01046794, + "balance_loss_clip": 1.05271184, + "balance_loss_mlp": 1.03020048, + "epoch": 0.25774838418758456, + "flos": 23258336204160.0, + "grad_norm": 1.4732857929087761, + "language_loss": 0.63687879, + "learning_rate": 3.480115069207354e-06, + "loss": 0.65872842, + "num_input_tokens_seen": 92642960, + "step": 4287, + "time_per_iteration": 2.67764949798584 + }, + { + "auxiliary_loss_clip": 0.01127889, + "auxiliary_loss_mlp": 0.01045385, + "balance_loss_clip": 1.05252934, + "balance_loss_mlp": 1.02769411, + "epoch": 0.2578085074402525, + "flos": 22601745544320.0, + "grad_norm": 2.134546441867425, + "language_loss": 0.71780413, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.73953688, + "num_input_tokens_seen": 92662455, + "step": 4288, + "time_per_iteration": 2.7174036502838135 + }, + { + "auxiliary_loss_clip": 0.0110996, + "auxiliary_loss_mlp": 0.01042748, + "balance_loss_clip": 1.04934072, + "balance_loss_mlp": 1.02691674, + "epoch": 0.2578686306929205, + "flos": 24571840746240.0, + "grad_norm": 1.4449800602700236, + "language_loss": 0.77059102, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.79211813, + "num_input_tokens_seen": 92683520, + "step": 4289, + "time_per_iteration": 2.748249053955078 + }, + { + "auxiliary_loss_clip": 0.01146276, + "auxiliary_loss_mlp": 0.00775089, + "balance_loss_clip": 1.05252326, + "balance_loss_mlp": 1.001122, + "epoch": 0.25792875394558845, + "flos": 18113953322880.0, + "grad_norm": 2.0235699584636295, + "language_loss": 0.85416883, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.87338245, + "num_input_tokens_seen": 92701450, + "step": 4290, + "time_per_iteration": 2.593461751937866 + }, + { + "auxiliary_loss_clip": 0.01114221, + "auxiliary_loss_mlp": 0.01056837, + "balance_loss_clip": 1.05081999, + "balance_loss_mlp": 1.03660691, + "epoch": 0.2579888771982564, + "flos": 17712902995200.0, + "grad_norm": 2.4272093439618847, + "language_loss": 0.72360331, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.74531388, + "num_input_tokens_seen": 92720355, + "step": 4291, + "time_per_iteration": 2.6838138103485107 + }, + { + "auxiliary_loss_clip": 0.01150945, + "auxiliary_loss_mlp": 0.0104494, + "balance_loss_clip": 1.05378067, + "balance_loss_mlp": 1.02758288, + "epoch": 0.2580490004509244, + "flos": 16434878112000.0, + "grad_norm": 2.78045823134535, + "language_loss": 0.80846477, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.83042365, + "num_input_tokens_seen": 92736755, + "step": 4292, + "time_per_iteration": 2.595710277557373 + }, + { + "auxiliary_loss_clip": 0.0115367, + "auxiliary_loss_mlp": 0.01044878, + "balance_loss_clip": 1.05773902, + "balance_loss_mlp": 1.02702022, + "epoch": 0.25810912370359235, + "flos": 33835141128960.0, + "grad_norm": 2.057533015633898, + "language_loss": 0.67592025, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.69790578, + "num_input_tokens_seen": 92757655, + "step": 4293, + "time_per_iteration": 2.699570894241333 + }, + { + "auxiliary_loss_clip": 0.0110485, + "auxiliary_loss_mlp": 0.01048043, + "balance_loss_clip": 1.04971898, + "balance_loss_mlp": 1.03190207, + "epoch": 0.2581692469562603, + "flos": 25192197561600.0, + "grad_norm": 2.0097854631835217, + "language_loss": 0.75671911, + "learning_rate": 3.478280185054542e-06, + "loss": 0.77824801, + "num_input_tokens_seen": 92776100, + "step": 4294, + "time_per_iteration": 2.7217960357666016 + }, + { + "auxiliary_loss_clip": 0.01098332, + "auxiliary_loss_mlp": 0.01053556, + "balance_loss_clip": 1.0444684, + "balance_loss_mlp": 1.03404188, + "epoch": 0.2582293702089283, + "flos": 34932212271360.0, + "grad_norm": 1.7798433628760433, + "language_loss": 0.8047998, + "learning_rate": 3.478017834441318e-06, + "loss": 0.82631868, + "num_input_tokens_seen": 92798880, + "step": 4295, + "time_per_iteration": 2.871460437774658 + }, + { + "auxiliary_loss_clip": 0.01055358, + "auxiliary_loss_mlp": 0.01044188, + "balance_loss_clip": 1.04843688, + "balance_loss_mlp": 1.0256989, + "epoch": 0.2582894934615963, + "flos": 26833746038400.0, + "grad_norm": 2.1012913939780753, + "language_loss": 0.72843397, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.74942946, + "num_input_tokens_seen": 92817750, + "step": 4296, + "time_per_iteration": 3.173367738723755 + }, + { + "auxiliary_loss_clip": 0.01091622, + "auxiliary_loss_mlp": 0.01038465, + "balance_loss_clip": 1.05392241, + "balance_loss_mlp": 1.02106011, + "epoch": 0.25834961671426426, + "flos": 23515241253120.0, + "grad_norm": 1.5772062283828172, + "language_loss": 0.86928564, + "learning_rate": 3.477492965085067e-06, + "loss": 0.8905865, + "num_input_tokens_seen": 92837995, + "step": 4297, + "time_per_iteration": 3.1598868370056152 + }, + { + "auxiliary_loss_clip": 0.01149748, + "auxiliary_loss_mlp": 0.01047412, + "balance_loss_clip": 1.05517435, + "balance_loss_mlp": 1.03090191, + "epoch": 0.25840973996693223, + "flos": 22451028076800.0, + "grad_norm": 1.8030727150796175, + "language_loss": 0.84720427, + "learning_rate": 3.477230446361943e-06, + "loss": 0.86917591, + "num_input_tokens_seen": 92857245, + "step": 4298, + "time_per_iteration": 2.632448196411133 + }, + { + "auxiliary_loss_clip": 0.01135108, + "auxiliary_loss_mlp": 0.00775458, + "balance_loss_clip": 1.05262494, + "balance_loss_mlp": 1.00111055, + "epoch": 0.2584698632196002, + "flos": 11290854366720.0, + "grad_norm": 2.0124667048247686, + "language_loss": 0.83514953, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.8542552, + "num_input_tokens_seen": 92873265, + "step": 4299, + "time_per_iteration": 2.631248950958252 + }, + { + "auxiliary_loss_clip": 0.01117485, + "auxiliary_loss_mlp": 0.0103505, + "balance_loss_clip": 1.05216849, + "balance_loss_mlp": 1.01868308, + "epoch": 0.25852998647226816, + "flos": 17929982839680.0, + "grad_norm": 2.419754138344463, + "language_loss": 0.82422709, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.84575242, + "num_input_tokens_seen": 92890880, + "step": 4300, + "time_per_iteration": 2.650834083557129 + }, + { + "auxiliary_loss_clip": 0.0113846, + "auxiliary_loss_mlp": 0.01041208, + "balance_loss_clip": 1.0535903, + "balance_loss_mlp": 1.02343392, + "epoch": 0.2585901097249361, + "flos": 33256117889280.0, + "grad_norm": 2.971673559214411, + "language_loss": 0.66949177, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.69128841, + "num_input_tokens_seen": 92910770, + "step": 4301, + "time_per_iteration": 2.729519844055176 + }, + { + "auxiliary_loss_clip": 0.01139778, + "auxiliary_loss_mlp": 0.01040158, + "balance_loss_clip": 1.05335701, + "balance_loss_mlp": 1.02245533, + "epoch": 0.2586502329776041, + "flos": 18441278985600.0, + "grad_norm": 2.29820997177689, + "language_loss": 0.81177735, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.83357668, + "num_input_tokens_seen": 92929520, + "step": 4302, + "time_per_iteration": 2.5496692657470703 + }, + { + "auxiliary_loss_clip": 0.01105433, + "auxiliary_loss_mlp": 0.01042423, + "balance_loss_clip": 1.05242491, + "balance_loss_mlp": 1.02542388, + "epoch": 0.25871035623027205, + "flos": 17968120104960.0, + "grad_norm": 1.8036447001063776, + "language_loss": 0.92147923, + "learning_rate": 3.475917012694595e-06, + "loss": 0.94295776, + "num_input_tokens_seen": 92947890, + "step": 4303, + "time_per_iteration": 2.686222791671753 + }, + { + "auxiliary_loss_clip": 0.01141887, + "auxiliary_loss_mlp": 0.01040139, + "balance_loss_clip": 1.05643094, + "balance_loss_mlp": 1.02322304, + "epoch": 0.25877047948294, + "flos": 27777729415680.0, + "grad_norm": 2.7085759571044368, + "language_loss": 0.67138135, + "learning_rate": 3.475654158020507e-06, + "loss": 0.69320166, + "num_input_tokens_seen": 92967690, + "step": 4304, + "time_per_iteration": 2.665797472000122 + }, + { + "auxiliary_loss_clip": 0.01113882, + "auxiliary_loss_mlp": 0.01041979, + "balance_loss_clip": 1.0509342, + "balance_loss_mlp": 1.02498007, + "epoch": 0.258830602735608, + "flos": 27125843437440.0, + "grad_norm": 2.126938769919949, + "language_loss": 0.72085559, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.74241412, + "num_input_tokens_seen": 92986830, + "step": 4305, + "time_per_iteration": 2.7514076232910156 + }, + { + "auxiliary_loss_clip": 0.01103045, + "auxiliary_loss_mlp": 0.00775987, + "balance_loss_clip": 1.04804707, + "balance_loss_mlp": 1.00122118, + "epoch": 0.25889072598827595, + "flos": 17891486438400.0, + "grad_norm": 6.414506312387852, + "language_loss": 0.76175749, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.78054774, + "num_input_tokens_seen": 93002740, + "step": 4306, + "time_per_iteration": 2.7326161861419678 + }, + { + "auxiliary_loss_clip": 0.01049461, + "auxiliary_loss_mlp": 0.0102188, + "balance_loss_clip": 1.03476799, + "balance_loss_mlp": 1.01943636, + "epoch": 0.2589508492409439, + "flos": 53934955724160.0, + "grad_norm": 0.8427062291747792, + "language_loss": 0.57128024, + "learning_rate": 3.474865258296403e-06, + "loss": 0.59199357, + "num_input_tokens_seen": 93058645, + "step": 4307, + "time_per_iteration": 3.1499595642089844 + }, + { + "auxiliary_loss_clip": 0.01123356, + "auxiliary_loss_mlp": 0.01045032, + "balance_loss_clip": 1.0514828, + "balance_loss_mlp": 1.02858078, + "epoch": 0.2590109724936119, + "flos": 22125785402880.0, + "grad_norm": 1.5299746109283647, + "language_loss": 0.71727359, + "learning_rate": 3.474602179854327e-06, + "loss": 0.73895752, + "num_input_tokens_seen": 93077140, + "step": 4308, + "time_per_iteration": 2.6824283599853516 + }, + { + "auxiliary_loss_clip": 0.01152705, + "auxiliary_loss_mlp": 0.01046843, + "balance_loss_clip": 1.05659723, + "balance_loss_mlp": 1.02976048, + "epoch": 0.2590710957462799, + "flos": 13474294398720.0, + "grad_norm": 1.8339599204524273, + "language_loss": 0.83940542, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.86140084, + "num_input_tokens_seen": 93093580, + "step": 4309, + "time_per_iteration": 2.560194253921509 + }, + { + "auxiliary_loss_clip": 0.01137306, + "auxiliary_loss_mlp": 0.01044025, + "balance_loss_clip": 1.05587196, + "balance_loss_mlp": 1.02815771, + "epoch": 0.25913121899894787, + "flos": 22307098279680.0, + "grad_norm": 1.5397823214091813, + "language_loss": 0.84657532, + "learning_rate": 3.474075855228966e-06, + "loss": 0.86838865, + "num_input_tokens_seen": 93112345, + "step": 4310, + "time_per_iteration": 2.627716064453125 + }, + { + "auxiliary_loss_clip": 0.01143598, + "auxiliary_loss_mlp": 0.01047667, + "balance_loss_clip": 1.05802059, + "balance_loss_mlp": 1.03141904, + "epoch": 0.25919134225161583, + "flos": 25811728364160.0, + "grad_norm": 2.0190220849922094, + "language_loss": 0.77145267, + "learning_rate": 3.473812609065639e-06, + "loss": 0.79336536, + "num_input_tokens_seen": 93131545, + "step": 4311, + "time_per_iteration": 2.694856643676758 + }, + { + "auxiliary_loss_clip": 0.01110239, + "auxiliary_loss_mlp": 0.01052381, + "balance_loss_clip": 1.04629123, + "balance_loss_mlp": 1.03498793, + "epoch": 0.2592514655042838, + "flos": 31212262108800.0, + "grad_norm": 1.9233367952735905, + "language_loss": 0.72848439, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.75011057, + "num_input_tokens_seen": 93150730, + "step": 4312, + "time_per_iteration": 2.7577714920043945 + }, + { + "auxiliary_loss_clip": 0.01150768, + "auxiliary_loss_mlp": 0.01044439, + "balance_loss_clip": 1.05618715, + "balance_loss_mlp": 1.02845287, + "epoch": 0.25931158875695176, + "flos": 18474998878080.0, + "grad_norm": 1.8485738044524733, + "language_loss": 0.70193493, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.72388697, + "num_input_tokens_seen": 93167895, + "step": 4313, + "time_per_iteration": 2.6447813510894775 + }, + { + "auxiliary_loss_clip": 0.01150117, + "auxiliary_loss_mlp": 0.01054192, + "balance_loss_clip": 1.05624926, + "balance_loss_mlp": 1.03845656, + "epoch": 0.2593717120096197, + "flos": 19207935895680.0, + "grad_norm": 1.8538125013537565, + "language_loss": 0.80462205, + "learning_rate": 3.473022535292867e-06, + "loss": 0.82666522, + "num_input_tokens_seen": 93187650, + "step": 4314, + "time_per_iteration": 2.6073296070098877 + }, + { + "auxiliary_loss_clip": 0.01110006, + "auxiliary_loss_mlp": 0.01049511, + "balance_loss_clip": 1.04867387, + "balance_loss_mlp": 1.03253555, + "epoch": 0.2594318352622877, + "flos": 31248100903680.0, + "grad_norm": 2.061113629574459, + "language_loss": 0.670748, + "learning_rate": 3.472759065640968e-06, + "loss": 0.69234318, + "num_input_tokens_seen": 93207370, + "step": 4315, + "time_per_iteration": 6.427948236465454 + }, + { + "auxiliary_loss_clip": 0.01096074, + "auxiliary_loss_mlp": 0.01056601, + "balance_loss_clip": 1.04853845, + "balance_loss_mlp": 1.0407939, + "epoch": 0.25949195851495566, + "flos": 22237144542720.0, + "grad_norm": 2.0096953575355125, + "language_loss": 0.79649067, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.81801736, + "num_input_tokens_seen": 93227925, + "step": 4316, + "time_per_iteration": 2.7463796138763428 + }, + { + "auxiliary_loss_clip": 0.01096584, + "auxiliary_loss_mlp": 0.01048328, + "balance_loss_clip": 1.0487628, + "balance_loss_mlp": 1.03112638, + "epoch": 0.2595520817676236, + "flos": 28075716645120.0, + "grad_norm": 3.2727308584132584, + "language_loss": 0.77498394, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.79643309, + "num_input_tokens_seen": 93250020, + "step": 4317, + "time_per_iteration": 4.658867359161377 + }, + { + "auxiliary_loss_clip": 0.01155612, + "auxiliary_loss_mlp": 0.01054128, + "balance_loss_clip": 1.05959845, + "balance_loss_mlp": 1.03734958, + "epoch": 0.2596122050202916, + "flos": 20190954378240.0, + "grad_norm": 2.117435309152476, + "language_loss": 0.77656054, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.79865795, + "num_input_tokens_seen": 93269070, + "step": 4318, + "time_per_iteration": 2.5934906005859375 + }, + { + "auxiliary_loss_clip": 0.01146449, + "auxiliary_loss_mlp": 0.01045441, + "balance_loss_clip": 1.0530901, + "balance_loss_mlp": 1.02733302, + "epoch": 0.25967232827295955, + "flos": 22527949052160.0, + "grad_norm": 1.6144223240331488, + "language_loss": 0.76362926, + "learning_rate": 3.471704628661598e-06, + "loss": 0.78554815, + "num_input_tokens_seen": 93290250, + "step": 4319, + "time_per_iteration": 2.607649564743042 + }, + { + "auxiliary_loss_clip": 0.01125042, + "auxiliary_loss_mlp": 0.01041624, + "balance_loss_clip": 1.05419481, + "balance_loss_mlp": 1.02587628, + "epoch": 0.2597324515256275, + "flos": 21068252156160.0, + "grad_norm": 1.6090277746740278, + "language_loss": 0.76549125, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.78715789, + "num_input_tokens_seen": 93310090, + "step": 4320, + "time_per_iteration": 2.722574472427368 + }, + { + "auxiliary_loss_clip": 0.01116281, + "auxiliary_loss_mlp": 0.01042709, + "balance_loss_clip": 1.05157554, + "balance_loss_mlp": 1.02546, + "epoch": 0.2597925747782955, + "flos": 22050013662720.0, + "grad_norm": 1.6564648175426406, + "language_loss": 0.71067965, + "learning_rate": 3.471177075288801e-06, + "loss": 0.73226953, + "num_input_tokens_seen": 93329570, + "step": 4321, + "time_per_iteration": 4.276093244552612 + }, + { + "auxiliary_loss_clip": 0.01125031, + "auxiliary_loss_mlp": 0.01055033, + "balance_loss_clip": 1.05191207, + "balance_loss_mlp": 1.03549457, + "epoch": 0.2598526980309635, + "flos": 19536949497600.0, + "grad_norm": 1.9031382952841078, + "language_loss": 0.74805915, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.76985979, + "num_input_tokens_seen": 93347920, + "step": 4322, + "time_per_iteration": 2.6573097705841064 + }, + { + "auxiliary_loss_clip": 0.0111558, + "auxiliary_loss_mlp": 0.0104757, + "balance_loss_clip": 1.05213332, + "balance_loss_mlp": 1.03004622, + "epoch": 0.25991282128363147, + "flos": 24495207079680.0, + "grad_norm": 1.8978708709823064, + "language_loss": 0.73837054, + "learning_rate": 3.470649298767278e-06, + "loss": 0.76000202, + "num_input_tokens_seen": 93367145, + "step": 4323, + "time_per_iteration": 2.75765061378479 + }, + { + "auxiliary_loss_clip": 0.01139686, + "auxiliary_loss_mlp": 0.00775622, + "balance_loss_clip": 1.0509938, + "balance_loss_mlp": 1.00099182, + "epoch": 0.25997294453629943, + "flos": 24201457655040.0, + "grad_norm": 2.107506603705316, + "language_loss": 0.67186093, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.69101399, + "num_input_tokens_seen": 93386555, + "step": 4324, + "time_per_iteration": 2.752307891845703 + }, + { + "auxiliary_loss_clip": 0.0109649, + "auxiliary_loss_mlp": 0.01045367, + "balance_loss_clip": 1.05030632, + "balance_loss_mlp": 1.03026319, + "epoch": 0.2600330677889674, + "flos": 31431460855680.0, + "grad_norm": 2.121769328280442, + "language_loss": 0.71064055, + "learning_rate": 3.470121299177082e-06, + "loss": 0.732059, + "num_input_tokens_seen": 93405590, + "step": 4325, + "time_per_iteration": 2.824281692504883 + }, + { + "auxiliary_loss_clip": 0.01134613, + "auxiliary_loss_mlp": 0.01035571, + "balance_loss_clip": 1.04941416, + "balance_loss_mlp": 1.01839304, + "epoch": 0.26009319104163536, + "flos": 32266527217920.0, + "grad_norm": 1.8496839878379767, + "language_loss": 0.73106551, + "learning_rate": 3.469857215756257e-06, + "loss": 0.75276732, + "num_input_tokens_seen": 93424750, + "step": 4326, + "time_per_iteration": 2.7235658168792725 + }, + { + "auxiliary_loss_clip": 0.01118123, + "auxiliary_loss_mlp": 0.00776184, + "balance_loss_clip": 1.05001175, + "balance_loss_mlp": 1.00100303, + "epoch": 0.26015331429430333, + "flos": 26286754752000.0, + "grad_norm": 1.7229255626307804, + "language_loss": 0.86908734, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.88803041, + "num_input_tokens_seen": 93443465, + "step": 4327, + "time_per_iteration": 2.7072155475616455 + }, + { + "auxiliary_loss_clip": 0.01153995, + "auxiliary_loss_mlp": 0.00775932, + "balance_loss_clip": 1.05640841, + "balance_loss_mlp": 1.0008533, + "epoch": 0.2602134375469713, + "flos": 21142335957120.0, + "grad_norm": 1.4664721830580452, + "language_loss": 0.80265766, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.82195687, + "num_input_tokens_seen": 93462580, + "step": 4328, + "time_per_iteration": 2.6463024616241455 + }, + { + "auxiliary_loss_clip": 0.0111992, + "auxiliary_loss_mlp": 0.00774533, + "balance_loss_clip": 1.04837036, + "balance_loss_mlp": 1.00092077, + "epoch": 0.26027356079963926, + "flos": 25921327737600.0, + "grad_norm": 1.6317826670237516, + "language_loss": 0.88094193, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.89988649, + "num_input_tokens_seen": 93482790, + "step": 4329, + "time_per_iteration": 2.7130861282348633 + }, + { + "auxiliary_loss_clip": 0.011478, + "auxiliary_loss_mlp": 0.01040633, + "balance_loss_clip": 1.05545115, + "balance_loss_mlp": 1.02431321, + "epoch": 0.2603336840523072, + "flos": 26359222440960.0, + "grad_norm": 1.8335620949826397, + "language_loss": 0.77834195, + "learning_rate": 3.468800324801802e-06, + "loss": 0.80022621, + "num_input_tokens_seen": 93498795, + "step": 4330, + "time_per_iteration": 2.6223180294036865 + }, + { + "auxiliary_loss_clip": 0.01148961, + "auxiliary_loss_mlp": 0.01047898, + "balance_loss_clip": 1.0536809, + "balance_loss_mlp": 1.03081572, + "epoch": 0.2603938073049752, + "flos": 23513661054720.0, + "grad_norm": 1.5875829464999673, + "language_loss": 0.75683081, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.77879941, + "num_input_tokens_seen": 93518335, + "step": 4331, + "time_per_iteration": 2.6383559703826904 + }, + { + "auxiliary_loss_clip": 0.01130325, + "auxiliary_loss_mlp": 0.01042577, + "balance_loss_clip": 1.05964541, + "balance_loss_mlp": 1.0261023, + "epoch": 0.26045393055764315, + "flos": 25374300537600.0, + "grad_norm": 1.3798785286413686, + "language_loss": 0.69174874, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.71347773, + "num_input_tokens_seen": 93539170, + "step": 4332, + "time_per_iteration": 2.675203800201416 + }, + { + "auxiliary_loss_clip": 0.01117119, + "auxiliary_loss_mlp": 0.01048864, + "balance_loss_clip": 1.04849494, + "balance_loss_mlp": 1.03203201, + "epoch": 0.2605140538103111, + "flos": 27635272076160.0, + "grad_norm": 6.1371153370044915, + "language_loss": 0.79897749, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.82063735, + "num_input_tokens_seen": 93558480, + "step": 4333, + "time_per_iteration": 2.7595479488372803 + }, + { + "auxiliary_loss_clip": 0.01144159, + "auxiliary_loss_mlp": 0.01039411, + "balance_loss_clip": 1.05260658, + "balance_loss_mlp": 1.02317452, + "epoch": 0.2605741770629791, + "flos": 13769839503360.0, + "grad_norm": 1.9478362516602954, + "language_loss": 0.80919975, + "learning_rate": 3.467742542694501e-06, + "loss": 0.83103544, + "num_input_tokens_seen": 93575220, + "step": 4334, + "time_per_iteration": 2.585676670074463 + }, + { + "auxiliary_loss_clip": 0.01121127, + "auxiliary_loss_mlp": 0.0103772, + "balance_loss_clip": 1.04868293, + "balance_loss_mlp": 1.02051783, + "epoch": 0.26063430031564705, + "flos": 26031681296640.0, + "grad_norm": 1.8490049893982383, + "language_loss": 0.8027274, + "learning_rate": 3.46747795800024e-06, + "loss": 0.82431591, + "num_input_tokens_seen": 93597015, + "step": 4335, + "time_per_iteration": 2.730853796005249 + }, + { + "auxiliary_loss_clip": 0.01060862, + "auxiliary_loss_mlp": 0.01054521, + "balance_loss_clip": 1.03598261, + "balance_loss_mlp": 1.05267298, + "epoch": 0.26069442356831507, + "flos": 62443809820800.0, + "grad_norm": 1.1166557113782816, + "language_loss": 0.60850358, + "learning_rate": 3.467213317659068e-06, + "loss": 0.62965739, + "num_input_tokens_seen": 93657775, + "step": 4336, + "time_per_iteration": 3.1322128772735596 + }, + { + "auxiliary_loss_clip": 0.01111016, + "auxiliary_loss_mlp": 0.01046835, + "balance_loss_clip": 1.05039525, + "balance_loss_mlp": 1.02976441, + "epoch": 0.26075454682098304, + "flos": 13626376583040.0, + "grad_norm": 2.784557437613843, + "language_loss": 0.7679469, + "learning_rate": 3.46694862168102e-06, + "loss": 0.78952539, + "num_input_tokens_seen": 93676145, + "step": 4337, + "time_per_iteration": 2.704305410385132 + }, + { + "auxiliary_loss_clip": 0.0112146, + "auxiliary_loss_mlp": 0.01045064, + "balance_loss_clip": 1.04997659, + "balance_loss_mlp": 1.02728987, + "epoch": 0.260814670073651, + "flos": 12126531260160.0, + "grad_norm": 2.7677016823816976, + "language_loss": 0.74653983, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.76820505, + "num_input_tokens_seen": 93692480, + "step": 4338, + "time_per_iteration": 2.652679204940796 + }, + { + "auxiliary_loss_clip": 0.01140171, + "auxiliary_loss_mlp": 0.01040507, + "balance_loss_clip": 1.05246329, + "balance_loss_mlp": 1.02314997, + "epoch": 0.26087479332631897, + "flos": 15122522805120.0, + "grad_norm": 2.378816803290104, + "language_loss": 0.81061137, + "learning_rate": 3.466419062854447e-06, + "loss": 0.8324182, + "num_input_tokens_seen": 93710165, + "step": 4339, + "time_per_iteration": 2.7237682342529297 + }, + { + "auxiliary_loss_clip": 0.01090328, + "auxiliary_loss_mlp": 0.01040213, + "balance_loss_clip": 1.04649866, + "balance_loss_mlp": 1.02436984, + "epoch": 0.26093491657898693, + "flos": 24680937329280.0, + "grad_norm": 1.6860698424881835, + "language_loss": 0.76643449, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.78773987, + "num_input_tokens_seen": 93730185, + "step": 4340, + "time_per_iteration": 2.817647695541382 + }, + { + "auxiliary_loss_clip": 0.01082903, + "auxiliary_loss_mlp": 0.01040837, + "balance_loss_clip": 1.04781985, + "balance_loss_mlp": 1.02381396, + "epoch": 0.2609950398316549, + "flos": 25116138512640.0, + "grad_norm": 1.954971477972507, + "language_loss": 0.82689369, + "learning_rate": 3.465889281600845e-06, + "loss": 0.84813106, + "num_input_tokens_seen": 93747690, + "step": 4341, + "time_per_iteration": 2.822387218475342 + }, + { + "auxiliary_loss_clip": 0.01148407, + "auxiliary_loss_mlp": 0.0104134, + "balance_loss_clip": 1.0550344, + "balance_loss_mlp": 1.02387536, + "epoch": 0.26105516308432286, + "flos": 28548588216960.0, + "grad_norm": 2.3225619433460083, + "language_loss": 0.76828772, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.79018521, + "num_input_tokens_seen": 93767405, + "step": 4342, + "time_per_iteration": 2.7091987133026123 + }, + { + "auxiliary_loss_clip": 0.01137117, + "auxiliary_loss_mlp": 0.01036127, + "balance_loss_clip": 1.05262113, + "balance_loss_mlp": 1.01837635, + "epoch": 0.2611152863369908, + "flos": 39530609447040.0, + "grad_norm": 1.8380809165191976, + "language_loss": 0.66072762, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.68246007, + "num_input_tokens_seen": 93789950, + "step": 4343, + "time_per_iteration": 2.7885191440582275 + }, + { + "auxiliary_loss_clip": 0.01076135, + "auxiliary_loss_mlp": 0.01045298, + "balance_loss_clip": 1.04419374, + "balance_loss_mlp": 1.02715397, + "epoch": 0.2611754095896588, + "flos": 13735329511680.0, + "grad_norm": 1.9033089414913282, + "language_loss": 0.73626471, + "learning_rate": 3.465094192845553e-06, + "loss": 0.75747907, + "num_input_tokens_seen": 93807835, + "step": 4344, + "time_per_iteration": 2.7622575759887695 + }, + { + "auxiliary_loss_clip": 0.01150726, + "auxiliary_loss_mlp": 0.01042349, + "balance_loss_clip": 1.05625904, + "balance_loss_mlp": 1.02560019, + "epoch": 0.26123553284232676, + "flos": 21506649649920.0, + "grad_norm": 2.7815673216786045, + "language_loss": 0.86820161, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.89013231, + "num_input_tokens_seen": 93825670, + "step": 4345, + "time_per_iteration": 2.615021228790283 + }, + { + "auxiliary_loss_clip": 0.01121997, + "auxiliary_loss_mlp": 0.01036853, + "balance_loss_clip": 1.05178094, + "balance_loss_mlp": 1.02056956, + "epoch": 0.2612956560949947, + "flos": 21139786091520.0, + "grad_norm": 1.9109970692142244, + "language_loss": 0.76235008, + "learning_rate": 3.464563855876015e-06, + "loss": 0.78393853, + "num_input_tokens_seen": 93844045, + "step": 4346, + "time_per_iteration": 2.660766363143921 + }, + { + "auxiliary_loss_clip": 0.01140284, + "auxiliary_loss_mlp": 0.01045855, + "balance_loss_clip": 1.05571795, + "balance_loss_mlp": 1.02870095, + "epoch": 0.2613557793476627, + "flos": 25119011600640.0, + "grad_norm": 1.6628741865434964, + "language_loss": 0.75995654, + "learning_rate": 3.464298604081606e-06, + "loss": 0.78181791, + "num_input_tokens_seen": 93864380, + "step": 4347, + "time_per_iteration": 2.6985979080200195 + }, + { + "auxiliary_loss_clip": 0.0110699, + "auxiliary_loss_mlp": 0.01041742, + "balance_loss_clip": 1.05063343, + "balance_loss_mlp": 1.02501726, + "epoch": 0.26141590260033065, + "flos": 26067699659520.0, + "grad_norm": 1.7474860409603998, + "language_loss": 0.73196864, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.75345594, + "num_input_tokens_seen": 93885475, + "step": 4348, + "time_per_iteration": 2.7511887550354004 + }, + { + "auxiliary_loss_clip": 0.01110529, + "auxiliary_loss_mlp": 0.01045849, + "balance_loss_clip": 1.05199265, + "balance_loss_mlp": 1.0290519, + "epoch": 0.2614760258529987, + "flos": 25701518459520.0, + "grad_norm": 2.6377025292028944, + "language_loss": 0.91262084, + "learning_rate": 3.463767933923799e-06, + "loss": 0.93418467, + "num_input_tokens_seen": 93905545, + "step": 4349, + "time_per_iteration": 2.720240354537964 + }, + { + "auxiliary_loss_clip": 0.0113714, + "auxiliary_loss_mlp": 0.01048228, + "balance_loss_clip": 1.05569661, + "balance_loss_mlp": 1.03184831, + "epoch": 0.26153614910566664, + "flos": 17457147181440.0, + "grad_norm": 1.7232851278977876, + "language_loss": 0.80046499, + "learning_rate": 3.463502515580524e-06, + "loss": 0.82231867, + "num_input_tokens_seen": 93924185, + "step": 4350, + "time_per_iteration": 2.652054786682129 + }, + { + "auxiliary_loss_clip": 0.0113538, + "auxiliary_loss_mlp": 0.01049567, + "balance_loss_clip": 1.05652642, + "balance_loss_mlp": 1.03299654, + "epoch": 0.2615962723583346, + "flos": 17712831168000.0, + "grad_norm": 10.816271600027287, + "language_loss": 0.62736505, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.64921451, + "num_input_tokens_seen": 93942825, + "step": 4351, + "time_per_iteration": 2.6674954891204834 + }, + { + "auxiliary_loss_clip": 0.01138265, + "auxiliary_loss_mlp": 0.01048518, + "balance_loss_clip": 1.05201697, + "balance_loss_mlp": 1.03168559, + "epoch": 0.26165639561100257, + "flos": 23257725672960.0, + "grad_norm": 1.9014393183165526, + "language_loss": 0.84131002, + "learning_rate": 3.462971512415555e-06, + "loss": 0.86317784, + "num_input_tokens_seen": 93962045, + "step": 4352, + "time_per_iteration": 2.8033063411712646 + }, + { + "auxiliary_loss_clip": 0.01065372, + "auxiliary_loss_mlp": 0.0102292, + "balance_loss_clip": 1.04145527, + "balance_loss_mlp": 1.02078664, + "epoch": 0.26171651886367053, + "flos": 66737970800640.0, + "grad_norm": 0.8050815788583346, + "language_loss": 0.70591724, + "learning_rate": 3.462705927613996e-06, + "loss": 0.7268002, + "num_input_tokens_seen": 94021175, + "step": 4353, + "time_per_iteration": 3.101954936981201 + }, + { + "auxiliary_loss_clip": 0.01115948, + "auxiliary_loss_mlp": 0.01069336, + "balance_loss_clip": 1.04858005, + "balance_loss_mlp": 1.05013168, + "epoch": 0.2617766421163385, + "flos": 22349581090560.0, + "grad_norm": 1.6494861832481549, + "language_loss": 0.77562749, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.79748034, + "num_input_tokens_seen": 94043370, + "step": 4354, + "time_per_iteration": 2.772723436355591 + }, + { + "auxiliary_loss_clip": 0.01089887, + "auxiliary_loss_mlp": 0.01058882, + "balance_loss_clip": 1.04805279, + "balance_loss_mlp": 1.04082203, + "epoch": 0.26183676536900646, + "flos": 26067125041920.0, + "grad_norm": 1.8339738923409379, + "language_loss": 0.68351537, + "learning_rate": 3.462174591623085e-06, + "loss": 0.70500308, + "num_input_tokens_seen": 94063510, + "step": 4355, + "time_per_iteration": 5.908639430999756 + }, + { + "auxiliary_loss_clip": 0.01094509, + "auxiliary_loss_mlp": 0.01039879, + "balance_loss_clip": 1.0486095, + "balance_loss_mlp": 1.02164054, + "epoch": 0.26189688862167443, + "flos": 20996466825600.0, + "grad_norm": 1.9440617828376934, + "language_loss": 0.67573452, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.69707847, + "num_input_tokens_seen": 94083865, + "step": 4356, + "time_per_iteration": 4.351539611816406 + }, + { + "auxiliary_loss_clip": 0.01057297, + "auxiliary_loss_mlp": 0.0100707, + "balance_loss_clip": 1.03335488, + "balance_loss_mlp": 1.00484037, + "epoch": 0.2619570118743424, + "flos": 65798261141760.0, + "grad_norm": 0.6809064288126679, + "language_loss": 0.53124392, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.55188763, + "num_input_tokens_seen": 94144095, + "step": 4357, + "time_per_iteration": 3.0896964073181152 + }, + { + "auxiliary_loss_clip": 0.01139918, + "auxiliary_loss_mlp": 0.0104768, + "balance_loss_clip": 1.05365348, + "balance_loss_mlp": 1.03106248, + "epoch": 0.26201713512701036, + "flos": 28766817296640.0, + "grad_norm": 1.8814759411194193, + "language_loss": 0.84233022, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.86420614, + "num_input_tokens_seen": 94163035, + "step": 4358, + "time_per_iteration": 2.723057746887207 + }, + { + "auxiliary_loss_clip": 0.01127273, + "auxiliary_loss_mlp": 0.01043309, + "balance_loss_clip": 1.04886353, + "balance_loss_mlp": 1.02411628, + "epoch": 0.2620772583796783, + "flos": 26432516142720.0, + "grad_norm": 2.354545555797757, + "language_loss": 0.67324048, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.69494629, + "num_input_tokens_seen": 94182520, + "step": 4359, + "time_per_iteration": 2.7128403186798096 + }, + { + "auxiliary_loss_clip": 0.01118602, + "auxiliary_loss_mlp": 0.01045018, + "balance_loss_clip": 1.04637527, + "balance_loss_mlp": 1.02880526, + "epoch": 0.2621373816323463, + "flos": 20156552127360.0, + "grad_norm": 1.8862311303010293, + "language_loss": 0.78726596, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.80890214, + "num_input_tokens_seen": 94201795, + "step": 4360, + "time_per_iteration": 4.41027569770813 + }, + { + "auxiliary_loss_clip": 0.01119481, + "auxiliary_loss_mlp": 0.01042435, + "balance_loss_clip": 1.04831719, + "balance_loss_mlp": 1.02640164, + "epoch": 0.26219750488501425, + "flos": 28621235473920.0, + "grad_norm": 1.8399079957082187, + "language_loss": 0.67980468, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.70142382, + "num_input_tokens_seen": 94222390, + "step": 4361, + "time_per_iteration": 2.7642054557800293 + }, + { + "auxiliary_loss_clip": 0.01139509, + "auxiliary_loss_mlp": 0.01055985, + "balance_loss_clip": 1.05313993, + "balance_loss_mlp": 1.03842545, + "epoch": 0.2622576281376823, + "flos": 15042549173760.0, + "grad_norm": 2.1489496912575166, + "language_loss": 0.84068632, + "learning_rate": 3.46031316964119e-06, + "loss": 0.86264122, + "num_input_tokens_seen": 94239980, + "step": 4362, + "time_per_iteration": 2.6152050495147705 + }, + { + "auxiliary_loss_clip": 0.01105407, + "auxiliary_loss_mlp": 0.01046107, + "balance_loss_clip": 1.04752779, + "balance_loss_mlp": 1.02867842, + "epoch": 0.26231775139035024, + "flos": 26396174557440.0, + "grad_norm": 2.0545933935481835, + "language_loss": 0.65068752, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.67220271, + "num_input_tokens_seen": 94260715, + "step": 4363, + "time_per_iteration": 2.7297046184539795 + }, + { + "auxiliary_loss_clip": 0.01040739, + "auxiliary_loss_mlp": 0.01017272, + "balance_loss_clip": 1.02776587, + "balance_loss_mlp": 1.01506662, + "epoch": 0.2623778746430182, + "flos": 65408918647680.0, + "grad_norm": 0.9195643121956573, + "language_loss": 0.61104208, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.6316222, + "num_input_tokens_seen": 94321285, + "step": 4364, + "time_per_iteration": 3.3122286796569824 + }, + { + "auxiliary_loss_clip": 0.01151556, + "auxiliary_loss_mlp": 0.01050336, + "balance_loss_clip": 1.0550462, + "balance_loss_mlp": 1.03201365, + "epoch": 0.26243799789568617, + "flos": 12604215254400.0, + "grad_norm": 2.6922753747731387, + "language_loss": 0.7223357, + "learning_rate": 3.459514586533184e-06, + "loss": 0.74435461, + "num_input_tokens_seen": 94335420, + "step": 4365, + "time_per_iteration": 2.588611364364624 + }, + { + "auxiliary_loss_clip": 0.01123747, + "auxiliary_loss_mlp": 0.00776591, + "balance_loss_clip": 1.05296087, + "balance_loss_mlp": 1.00093484, + "epoch": 0.26249812114835414, + "flos": 28623821253120.0, + "grad_norm": 1.9684942716361389, + "language_loss": 0.77178609, + "learning_rate": 3.459248281460509e-06, + "loss": 0.79078948, + "num_input_tokens_seen": 94357440, + "step": 4366, + "time_per_iteration": 2.7489407062530518 + }, + { + "auxiliary_loss_clip": 0.01149499, + "auxiliary_loss_mlp": 0.0104305, + "balance_loss_clip": 1.05433846, + "balance_loss_mlp": 1.02652764, + "epoch": 0.2625582444010221, + "flos": 14465393441280.0, + "grad_norm": 1.9587652436204308, + "language_loss": 0.76205176, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.78397727, + "num_input_tokens_seen": 94375690, + "step": 4367, + "time_per_iteration": 2.63778018951416 + }, + { + "auxiliary_loss_clip": 0.01136158, + "auxiliary_loss_mlp": 0.01045138, + "balance_loss_clip": 1.0523572, + "balance_loss_mlp": 1.02903318, + "epoch": 0.26261836765369007, + "flos": 16613174246400.0, + "grad_norm": 2.055472748506688, + "language_loss": 0.69400585, + "learning_rate": 3.458715505320736e-06, + "loss": 0.71581888, + "num_input_tokens_seen": 94393190, + "step": 4368, + "time_per_iteration": 2.6515018939971924 + }, + { + "auxiliary_loss_clip": 0.01123905, + "auxiliary_loss_mlp": 0.01045619, + "balance_loss_clip": 1.05272579, + "balance_loss_mlp": 1.02791643, + "epoch": 0.26267849090635803, + "flos": 20519932066560.0, + "grad_norm": 1.8794244148025279, + "language_loss": 0.79255176, + "learning_rate": 3.458449034273841e-06, + "loss": 0.81424701, + "num_input_tokens_seen": 94410975, + "step": 4369, + "time_per_iteration": 2.717142343521118 + }, + { + "auxiliary_loss_clip": 0.01119662, + "auxiliary_loss_mlp": 0.01040752, + "balance_loss_clip": 1.05190969, + "balance_loss_mlp": 1.02344334, + "epoch": 0.262738614159026, + "flos": 21323936142720.0, + "grad_norm": 4.796099217910503, + "language_loss": 0.83591807, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.85752219, + "num_input_tokens_seen": 94429985, + "step": 4370, + "time_per_iteration": 2.742966890335083 + }, + { + "auxiliary_loss_clip": 0.01137822, + "auxiliary_loss_mlp": 0.01053822, + "balance_loss_clip": 1.05178714, + "balance_loss_mlp": 1.0345341, + "epoch": 0.26279873741169396, + "flos": 17603590930560.0, + "grad_norm": 1.7275848609842401, + "language_loss": 0.71854705, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.7404635, + "num_input_tokens_seen": 94448660, + "step": 4371, + "time_per_iteration": 2.691899538040161 + }, + { + "auxiliary_loss_clip": 0.01062293, + "auxiliary_loss_mlp": 0.01003561, + "balance_loss_clip": 1.02797341, + "balance_loss_mlp": 1.00147498, + "epoch": 0.2628588606643619, + "flos": 60949746587520.0, + "grad_norm": 0.6802377941963699, + "language_loss": 0.56387627, + "learning_rate": 3.457649289346384e-06, + "loss": 0.58453482, + "num_input_tokens_seen": 94515630, + "step": 4372, + "time_per_iteration": 3.279158115386963 + }, + { + "auxiliary_loss_clip": 0.01124406, + "auxiliary_loss_mlp": 0.01038838, + "balance_loss_clip": 1.05295706, + "balance_loss_mlp": 1.02169585, + "epoch": 0.2629189839170299, + "flos": 27016315891200.0, + "grad_norm": 1.9842369613103452, + "language_loss": 0.77777553, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.79940796, + "num_input_tokens_seen": 94535385, + "step": 4373, + "time_per_iteration": 2.8367159366607666 + }, + { + "auxiliary_loss_clip": 0.01104424, + "auxiliary_loss_mlp": 0.01039426, + "balance_loss_clip": 1.05070519, + "balance_loss_mlp": 1.02314186, + "epoch": 0.26297910716969786, + "flos": 17019863009280.0, + "grad_norm": 7.588420148526772, + "language_loss": 0.71397603, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.73541456, + "num_input_tokens_seen": 94552650, + "step": 4374, + "time_per_iteration": 2.722332239151001 + }, + { + "auxiliary_loss_clip": 0.0112606, + "auxiliary_loss_mlp": 0.01045748, + "balance_loss_clip": 1.05836225, + "balance_loss_mlp": 1.02748489, + "epoch": 0.2630392304223659, + "flos": 24897370728960.0, + "grad_norm": 1.8414201938467747, + "language_loss": 0.81212163, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.83383965, + "num_input_tokens_seen": 94574075, + "step": 4375, + "time_per_iteration": 2.7654781341552734 + }, + { + "auxiliary_loss_clip": 0.01118996, + "auxiliary_loss_mlp": 0.01045139, + "balance_loss_clip": 1.04959798, + "balance_loss_mlp": 1.02901626, + "epoch": 0.26309935367503384, + "flos": 32854026067200.0, + "grad_norm": 1.6461571134793078, + "language_loss": 0.6613251, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.68296647, + "num_input_tokens_seen": 94594255, + "step": 4376, + "time_per_iteration": 2.778731107711792 + }, + { + "auxiliary_loss_clip": 0.01096695, + "auxiliary_loss_mlp": 0.0106417, + "balance_loss_clip": 1.04752398, + "balance_loss_mlp": 1.04587138, + "epoch": 0.2631594769277018, + "flos": 15887958652800.0, + "grad_norm": 1.7628322447974545, + "language_loss": 0.69351411, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.71512282, + "num_input_tokens_seen": 94611410, + "step": 4377, + "time_per_iteration": 2.7606706619262695 + }, + { + "auxiliary_loss_clip": 0.01141095, + "auxiliary_loss_mlp": 0.01043033, + "balance_loss_clip": 1.0561285, + "balance_loss_mlp": 1.02606952, + "epoch": 0.2632196001803698, + "flos": 50804943557760.0, + "grad_norm": 2.1982489321824352, + "language_loss": 0.79961169, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.82145292, + "num_input_tokens_seen": 94636575, + "step": 4378, + "time_per_iteration": 2.9000468254089355 + }, + { + "auxiliary_loss_clip": 0.01127331, + "auxiliary_loss_mlp": 0.01045659, + "balance_loss_clip": 1.05713558, + "balance_loss_mlp": 1.03063893, + "epoch": 0.26327972343303774, + "flos": 13733031041280.0, + "grad_norm": 1.912468890890116, + "language_loss": 0.76285684, + "learning_rate": 3.455781283723846e-06, + "loss": 0.78458679, + "num_input_tokens_seen": 94654345, + "step": 4379, + "time_per_iteration": 2.6757192611694336 + }, + { + "auxiliary_loss_clip": 0.01114814, + "auxiliary_loss_mlp": 0.01043, + "balance_loss_clip": 1.05360019, + "balance_loss_mlp": 1.02465415, + "epoch": 0.2633398466857057, + "flos": 23769057732480.0, + "grad_norm": 1.982346793660648, + "language_loss": 0.77895945, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.80053759, + "num_input_tokens_seen": 94673985, + "step": 4380, + "time_per_iteration": 2.745392084121704 + }, + { + "auxiliary_loss_clip": 0.01125918, + "auxiliary_loss_mlp": 0.01040915, + "balance_loss_clip": 1.04945278, + "balance_loss_mlp": 1.02351093, + "epoch": 0.26339996993837367, + "flos": 27600223380480.0, + "grad_norm": 2.2040025999375215, + "language_loss": 0.64148676, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.66315508, + "num_input_tokens_seen": 94693145, + "step": 4381, + "time_per_iteration": 2.8020689487457275 + }, + { + "auxiliary_loss_clip": 0.01136752, + "auxiliary_loss_mlp": 0.01038794, + "balance_loss_clip": 1.05113709, + "balance_loss_mlp": 1.02225995, + "epoch": 0.26346009319104163, + "flos": 16946317912320.0, + "grad_norm": 1.9675616702193486, + "language_loss": 0.82470775, + "learning_rate": 3.454979881632595e-06, + "loss": 0.8464632, + "num_input_tokens_seen": 94710185, + "step": 4382, + "time_per_iteration": 2.66001558303833 + }, + { + "auxiliary_loss_clip": 0.01106019, + "auxiliary_loss_mlp": 0.01045742, + "balance_loss_clip": 1.04899645, + "balance_loss_mlp": 1.02726483, + "epoch": 0.2635202164437096, + "flos": 37232218915200.0, + "grad_norm": 4.511875880791621, + "language_loss": 0.70333207, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.7248497, + "num_input_tokens_seen": 94730280, + "step": 4383, + "time_per_iteration": 2.851227045059204 + }, + { + "auxiliary_loss_clip": 0.01136676, + "auxiliary_loss_mlp": 0.01039697, + "balance_loss_clip": 1.05237031, + "balance_loss_mlp": 1.0239253, + "epoch": 0.26358033969637756, + "flos": 20996359084800.0, + "grad_norm": 3.019496854013466, + "language_loss": 0.69455528, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.71631902, + "num_input_tokens_seen": 94748560, + "step": 4384, + "time_per_iteration": 2.670023202896118 + }, + { + "auxiliary_loss_clip": 0.01135763, + "auxiliary_loss_mlp": 0.01039573, + "balance_loss_clip": 1.05114567, + "balance_loss_mlp": 1.02275276, + "epoch": 0.26364046294904553, + "flos": 27746092512000.0, + "grad_norm": 2.2712502599605036, + "language_loss": 0.70067525, + "learning_rate": 3.45417798298451e-06, + "loss": 0.72242868, + "num_input_tokens_seen": 94767570, + "step": 4385, + "time_per_iteration": 2.7232449054718018 + }, + { + "auxiliary_loss_clip": 0.01112529, + "auxiliary_loss_mlp": 0.0104946, + "balance_loss_clip": 1.04893148, + "balance_loss_mlp": 1.03190076, + "epoch": 0.2637005862017135, + "flos": 22893088757760.0, + "grad_norm": 1.8128608655109948, + "language_loss": 0.85684925, + "learning_rate": 3.453910573136482e-06, + "loss": 0.87846911, + "num_input_tokens_seen": 94784985, + "step": 4386, + "time_per_iteration": 2.727924108505249 + }, + { + "auxiliary_loss_clip": 0.01126521, + "auxiliary_loss_mlp": 0.01046433, + "balance_loss_clip": 1.0510478, + "balance_loss_mlp": 1.02955282, + "epoch": 0.26376070945438146, + "flos": 15048834053760.0, + "grad_norm": 2.174412940978395, + "language_loss": 0.7796396, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.80136907, + "num_input_tokens_seen": 94802545, + "step": 4387, + "time_per_iteration": 2.666287660598755 + }, + { + "auxiliary_loss_clip": 0.01134058, + "auxiliary_loss_mlp": 0.01041407, + "balance_loss_clip": 1.05609179, + "balance_loss_mlp": 1.02537298, + "epoch": 0.2638208327070494, + "flos": 21141833166720.0, + "grad_norm": 2.003302761742054, + "language_loss": 0.76126039, + "learning_rate": 3.453375588053264e-06, + "loss": 0.78301507, + "num_input_tokens_seen": 94820730, + "step": 4388, + "time_per_iteration": 2.6321358680725098 + }, + { + "auxiliary_loss_clip": 0.01148944, + "auxiliary_loss_mlp": 0.01036978, + "balance_loss_clip": 1.05455542, + "balance_loss_mlp": 1.02002645, + "epoch": 0.26388095595971744, + "flos": 21725597001600.0, + "grad_norm": 2.534815675842734, + "language_loss": 0.86675179, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.88861108, + "num_input_tokens_seen": 94839175, + "step": 4389, + "time_per_iteration": 2.6122422218322754 + }, + { + "auxiliary_loss_clip": 0.01048602, + "auxiliary_loss_mlp": 0.01002085, + "balance_loss_clip": 1.03000987, + "balance_loss_mlp": 0.99961758, + "epoch": 0.2639410792123854, + "flos": 65515537192320.0, + "grad_norm": 0.8388510572165676, + "language_loss": 0.60285747, + "learning_rate": 3.452840382521457e-06, + "loss": 0.62336433, + "num_input_tokens_seen": 94898865, + "step": 4390, + "time_per_iteration": 3.1867401599884033 + }, + { + "auxiliary_loss_clip": 0.01128567, + "auxiliary_loss_mlp": 0.01040305, + "balance_loss_clip": 1.05022383, + "balance_loss_mlp": 1.02319825, + "epoch": 0.2640012024650534, + "flos": 23948574929280.0, + "grad_norm": 1.6144448841655068, + "language_loss": 0.77730125, + "learning_rate": 3.4525726971127e-06, + "loss": 0.79899001, + "num_input_tokens_seen": 94917490, + "step": 4391, + "time_per_iteration": 2.707310676574707 + }, + { + "auxiliary_loss_clip": 0.01031384, + "auxiliary_loss_mlp": 0.00755302, + "balance_loss_clip": 1.02553821, + "balance_loss_mlp": 1.00244236, + "epoch": 0.26406132571772134, + "flos": 56441163369600.0, + "grad_norm": 0.8840896383522404, + "language_loss": 0.58758044, + "learning_rate": 3.45230495662224e-06, + "loss": 0.60544735, + "num_input_tokens_seen": 94969065, + "step": 4392, + "time_per_iteration": 3.211859941482544 + }, + { + "auxiliary_loss_clip": 0.01136937, + "auxiliary_loss_mlp": 0.0105019, + "balance_loss_clip": 1.05295539, + "balance_loss_mlp": 1.03322649, + "epoch": 0.2641214489703893, + "flos": 22090557139200.0, + "grad_norm": 1.9286153229889427, + "language_loss": 0.68954027, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.71141154, + "num_input_tokens_seen": 94988540, + "step": 4393, + "time_per_iteration": 2.6483278274536133 + }, + { + "auxiliary_loss_clip": 0.01140079, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.05395103, + "balance_loss_mlp": 1.02398562, + "epoch": 0.26418157222305727, + "flos": 16544764794240.0, + "grad_norm": 2.0454829511435193, + "language_loss": 0.84071863, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.86253464, + "num_input_tokens_seen": 95004810, + "step": 4394, + "time_per_iteration": 4.3396079540252686 + }, + { + "auxiliary_loss_clip": 0.01124083, + "auxiliary_loss_mlp": 0.01045374, + "balance_loss_clip": 1.04999089, + "balance_loss_mlp": 1.02661061, + "epoch": 0.26424169547572524, + "flos": 18002486442240.0, + "grad_norm": 2.096391063208514, + "language_loss": 0.70044839, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.72214299, + "num_input_tokens_seen": 95024085, + "step": 4395, + "time_per_iteration": 2.8730056285858154 + }, + { + "auxiliary_loss_clip": 0.01110387, + "auxiliary_loss_mlp": 0.01037389, + "balance_loss_clip": 1.04736662, + "balance_loss_mlp": 1.02071214, + "epoch": 0.2643018187283932, + "flos": 16983162288000.0, + "grad_norm": 2.1761517020490606, + "language_loss": 0.86876452, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.89024228, + "num_input_tokens_seen": 95042515, + "step": 4396, + "time_per_iteration": 4.384250640869141 + }, + { + "auxiliary_loss_clip": 0.01010716, + "auxiliary_loss_mlp": 0.01021406, + "balance_loss_clip": 1.02197146, + "balance_loss_mlp": 1.01856887, + "epoch": 0.26436194198106117, + "flos": 59664359416320.0, + "grad_norm": 0.7957760850485174, + "language_loss": 0.55022657, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57054776, + "num_input_tokens_seen": 95094835, + "step": 4397, + "time_per_iteration": 3.0656893253326416 + }, + { + "auxiliary_loss_clip": 0.01132938, + "auxiliary_loss_mlp": 0.01050463, + "balance_loss_clip": 1.0485754, + "balance_loss_mlp": 1.03357744, + "epoch": 0.26442206523372913, + "flos": 32921322197760.0, + "grad_norm": 1.9110208887501443, + "language_loss": 0.77881467, + "learning_rate": 3.450697357532435e-06, + "loss": 0.80064869, + "num_input_tokens_seen": 95113480, + "step": 4398, + "time_per_iteration": 2.740917444229126 + }, + { + "auxiliary_loss_clip": 0.01139914, + "auxiliary_loss_mlp": 0.01040709, + "balance_loss_clip": 1.05469537, + "balance_loss_mlp": 1.02347112, + "epoch": 0.2644821884863971, + "flos": 21031300039680.0, + "grad_norm": 1.7657486248278176, + "language_loss": 0.67534482, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.69715106, + "num_input_tokens_seen": 95132580, + "step": 4399, + "time_per_iteration": 4.305487871170044 + }, + { + "auxiliary_loss_clip": 0.01097219, + "auxiliary_loss_mlp": 0.01042048, + "balance_loss_clip": 1.04840231, + "balance_loss_mlp": 1.02503681, + "epoch": 0.26454231173906506, + "flos": 20776801201920.0, + "grad_norm": 1.6309197312133479, + "language_loss": 0.86614597, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.88753855, + "num_input_tokens_seen": 95152375, + "step": 4400, + "time_per_iteration": 2.695883274078369 + }, + { + "auxiliary_loss_clip": 0.01119339, + "auxiliary_loss_mlp": 0.01039987, + "balance_loss_clip": 1.0483284, + "balance_loss_mlp": 1.0226419, + "epoch": 0.264602434991733, + "flos": 16618669027200.0, + "grad_norm": 3.1942141071602546, + "language_loss": 0.76518428, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.78677756, + "num_input_tokens_seen": 95170265, + "step": 4401, + "time_per_iteration": 2.69415545463562 + }, + { + "auxiliary_loss_clip": 0.01100665, + "auxiliary_loss_mlp": 0.01046326, + "balance_loss_clip": 1.04473615, + "balance_loss_mlp": 1.02758598, + "epoch": 0.26466255824440105, + "flos": 19062677295360.0, + "grad_norm": 2.336049134907364, + "language_loss": 0.88363832, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.90510821, + "num_input_tokens_seen": 95188655, + "step": 4402, + "time_per_iteration": 2.7073450088500977 + }, + { + "auxiliary_loss_clip": 0.01105803, + "auxiliary_loss_mlp": 0.01040704, + "balance_loss_clip": 1.04894042, + "balance_loss_mlp": 1.02327585, + "epoch": 0.264722681497069, + "flos": 22638554006400.0, + "grad_norm": 1.7301089969072252, + "language_loss": 0.7811445, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.80260956, + "num_input_tokens_seen": 95209615, + "step": 4403, + "time_per_iteration": 2.7213027477264404 + }, + { + "auxiliary_loss_clip": 0.01128649, + "auxiliary_loss_mlp": 0.01038032, + "balance_loss_clip": 1.04674816, + "balance_loss_mlp": 1.02050877, + "epoch": 0.264782804749737, + "flos": 22492253911680.0, + "grad_norm": 2.1369132533571604, + "language_loss": 0.88594282, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.90760964, + "num_input_tokens_seen": 95227810, + "step": 4404, + "time_per_iteration": 2.6888909339904785 + }, + { + "auxiliary_loss_clip": 0.01123789, + "auxiliary_loss_mlp": 0.01040593, + "balance_loss_clip": 1.04607344, + "balance_loss_mlp": 1.02416539, + "epoch": 0.26484292800240494, + "flos": 16800269212800.0, + "grad_norm": 1.7519644069859235, + "language_loss": 0.76134694, + "learning_rate": 3.448819322433709e-06, + "loss": 0.78299075, + "num_input_tokens_seen": 95245890, + "step": 4405, + "time_per_iteration": 2.7172482013702393 + }, + { + "auxiliary_loss_clip": 0.01148976, + "auxiliary_loss_mlp": 0.01040198, + "balance_loss_clip": 1.05348206, + "balance_loss_mlp": 1.02266204, + "epoch": 0.2649030512550729, + "flos": 20449583280000.0, + "grad_norm": 1.711457274305917, + "language_loss": 0.69873697, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.72062874, + "num_input_tokens_seen": 95264955, + "step": 4406, + "time_per_iteration": 2.584300994873047 + }, + { + "auxiliary_loss_clip": 0.01121151, + "auxiliary_loss_mlp": 0.01050453, + "balance_loss_clip": 1.05182838, + "balance_loss_mlp": 1.03432453, + "epoch": 0.2649631745077409, + "flos": 22416123035520.0, + "grad_norm": 1.7200250795424956, + "language_loss": 0.83956587, + "learning_rate": 3.448282246369912e-06, + "loss": 0.86128193, + "num_input_tokens_seen": 95284245, + "step": 4407, + "time_per_iteration": 2.731316328048706 + }, + { + "auxiliary_loss_clip": 0.01108599, + "auxiliary_loss_mlp": 0.01031757, + "balance_loss_clip": 1.04695201, + "balance_loss_mlp": 1.01501989, + "epoch": 0.26502329776040884, + "flos": 35116110927360.0, + "grad_norm": 1.8896460113896294, + "language_loss": 0.7597363, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.78113985, + "num_input_tokens_seen": 95307125, + "step": 4408, + "time_per_iteration": 2.8600730895996094 + }, + { + "auxiliary_loss_clip": 0.01091919, + "auxiliary_loss_mlp": 0.01044721, + "balance_loss_clip": 1.04267502, + "balance_loss_mlp": 1.02679181, + "epoch": 0.2650834210130768, + "flos": 38687498438400.0, + "grad_norm": 1.7769050714437231, + "language_loss": 0.70612216, + "learning_rate": 3.447744950630084e-06, + "loss": 0.72748852, + "num_input_tokens_seen": 95329150, + "step": 4409, + "time_per_iteration": 2.936380386352539 + }, + { + "auxiliary_loss_clip": 0.01131548, + "auxiliary_loss_mlp": 0.01040186, + "balance_loss_clip": 1.04774857, + "balance_loss_mlp": 1.02218497, + "epoch": 0.26514354426574477, + "flos": 24716847951360.0, + "grad_norm": 1.7357795205395667, + "language_loss": 0.7337513, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.75546867, + "num_input_tokens_seen": 95349880, + "step": 4410, + "time_per_iteration": 2.7315077781677246 + }, + { + "auxiliary_loss_clip": 0.01141374, + "auxiliary_loss_mlp": 0.0104966, + "balance_loss_clip": 1.05183268, + "balance_loss_mlp": 1.03216028, + "epoch": 0.26520366751841273, + "flos": 20340055733760.0, + "grad_norm": 1.8886288474708937, + "language_loss": 0.73828322, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.76019359, + "num_input_tokens_seen": 95368570, + "step": 4411, + "time_per_iteration": 2.641920566558838 + }, + { + "auxiliary_loss_clip": 0.01099594, + "auxiliary_loss_mlp": 0.01041576, + "balance_loss_clip": 1.04986739, + "balance_loss_mlp": 1.02431464, + "epoch": 0.2652637907710807, + "flos": 22343870828160.0, + "grad_norm": 1.9943391034693418, + "language_loss": 0.82447588, + "learning_rate": 3.446938595306071e-06, + "loss": 0.84588754, + "num_input_tokens_seen": 95387065, + "step": 4412, + "time_per_iteration": 2.8344247341156006 + }, + { + "auxiliary_loss_clip": 0.01135402, + "auxiliary_loss_mlp": 0.01052016, + "balance_loss_clip": 1.05143464, + "balance_loss_mlp": 1.03544593, + "epoch": 0.26532391402374866, + "flos": 19354235990400.0, + "grad_norm": 1.775443234311944, + "language_loss": 0.7446382, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.76651239, + "num_input_tokens_seen": 95406345, + "step": 4413, + "time_per_iteration": 2.657975196838379 + }, + { + "auxiliary_loss_clip": 0.01056582, + "auxiliary_loss_mlp": 0.01008584, + "balance_loss_clip": 1.03258443, + "balance_loss_mlp": 1.00659275, + "epoch": 0.26538403727641663, + "flos": 44787611422080.0, + "grad_norm": 0.873557285042922, + "language_loss": 0.56965125, + "learning_rate": 3.446400750732793e-06, + "loss": 0.59030288, + "num_input_tokens_seen": 95463595, + "step": 4414, + "time_per_iteration": 3.1158244609832764 + }, + { + "auxiliary_loss_clip": 0.01107803, + "auxiliary_loss_mlp": 0.01046612, + "balance_loss_clip": 1.04481411, + "balance_loss_mlp": 1.03048313, + "epoch": 0.26544416052908465, + "flos": 28182119708160.0, + "grad_norm": 1.5786807831647507, + "language_loss": 0.74238014, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.76392424, + "num_input_tokens_seen": 95484115, + "step": 4415, + "time_per_iteration": 2.7223031520843506 + }, + { + "auxiliary_loss_clip": 0.01095743, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_clip": 1.04215193, + "balance_loss_mlp": 1.02402353, + "epoch": 0.2655042837817526, + "flos": 17565274097280.0, + "grad_norm": 2.5102345694159016, + "language_loss": 0.86855936, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.88996005, + "num_input_tokens_seen": 95501435, + "step": 4416, + "time_per_iteration": 2.7001683712005615 + }, + { + "auxiliary_loss_clip": 0.01141467, + "auxiliary_loss_mlp": 0.01046153, + "balance_loss_clip": 1.05359149, + "balance_loss_mlp": 1.02761602, + "epoch": 0.2655644070344206, + "flos": 23404636298880.0, + "grad_norm": 1.6343137061510633, + "language_loss": 0.76870787, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.79058409, + "num_input_tokens_seen": 95520135, + "step": 4417, + "time_per_iteration": 2.662196397781372 + }, + { + "auxiliary_loss_clip": 0.01119441, + "auxiliary_loss_mlp": 0.01041503, + "balance_loss_clip": 1.04989183, + "balance_loss_mlp": 1.02242982, + "epoch": 0.26562453028708854, + "flos": 26468462678400.0, + "grad_norm": 1.6334113226277946, + "language_loss": 0.80320108, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.82481045, + "num_input_tokens_seen": 95541705, + "step": 4418, + "time_per_iteration": 2.7742624282836914 + }, + { + "auxiliary_loss_clip": 0.0113892, + "auxiliary_loss_mlp": 0.01045476, + "balance_loss_clip": 1.05182683, + "balance_loss_mlp": 1.02721274, + "epoch": 0.2656846535397565, + "flos": 19207576759680.0, + "grad_norm": 2.164903581235647, + "language_loss": 0.67788607, + "learning_rate": 3.445055179644071e-06, + "loss": 0.69972998, + "num_input_tokens_seen": 95560300, + "step": 4419, + "time_per_iteration": 2.6437718868255615 + }, + { + "auxiliary_loss_clip": 0.01149692, + "auxiliary_loss_mlp": 0.01046258, + "balance_loss_clip": 1.05360699, + "balance_loss_mlp": 1.02711296, + "epoch": 0.2657447767924245, + "flos": 30551325903360.0, + "grad_norm": 1.9366129468869788, + "language_loss": 0.79625547, + "learning_rate": 3.444785900995585e-06, + "loss": 0.81821501, + "num_input_tokens_seen": 95580150, + "step": 4420, + "time_per_iteration": 2.6594905853271484 + }, + { + "auxiliary_loss_clip": 0.01126984, + "auxiliary_loss_mlp": 0.01053725, + "balance_loss_clip": 1.05294895, + "balance_loss_mlp": 1.03368592, + "epoch": 0.26580490004509244, + "flos": 20922742160640.0, + "grad_norm": 1.9122536358412747, + "language_loss": 0.81690109, + "learning_rate": 3.444516567560673e-06, + "loss": 0.83870822, + "num_input_tokens_seen": 95597570, + "step": 4421, + "time_per_iteration": 2.681410551071167 + }, + { + "auxiliary_loss_clip": 0.0113176, + "auxiliary_loss_mlp": 0.01046737, + "balance_loss_clip": 1.05015123, + "balance_loss_mlp": 1.02904677, + "epoch": 0.2658650232977604, + "flos": 43945682584320.0, + "grad_norm": 1.6112293393448585, + "language_loss": 0.65704989, + "learning_rate": 3.444247179349548e-06, + "loss": 0.6788348, + "num_input_tokens_seen": 95619415, + "step": 4422, + "time_per_iteration": 2.8766117095947266 + }, + { + "auxiliary_loss_clip": 0.01130944, + "auxiliary_loss_mlp": 0.01047224, + "balance_loss_clip": 1.04903376, + "balance_loss_mlp": 1.03039181, + "epoch": 0.26592514655042837, + "flos": 29716439109120.0, + "grad_norm": 2.1017056533749896, + "language_loss": 0.74229872, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.76408041, + "num_input_tokens_seen": 95639155, + "step": 4423, + "time_per_iteration": 2.6983659267425537 + }, + { + "auxiliary_loss_clip": 0.01130559, + "auxiliary_loss_mlp": 0.01057709, + "balance_loss_clip": 1.04790974, + "balance_loss_mlp": 1.03822982, + "epoch": 0.26598526980309634, + "flos": 46677730014720.0, + "grad_norm": 1.6865310965149165, + "language_loss": 0.77855694, + "learning_rate": 3.443708238639522e-06, + "loss": 0.80043966, + "num_input_tokens_seen": 95663320, + "step": 4424, + "time_per_iteration": 2.900214433670044 + }, + { + "auxiliary_loss_clip": 0.01132339, + "auxiliary_loss_mlp": 0.01049395, + "balance_loss_clip": 1.04963291, + "balance_loss_mlp": 1.03181148, + "epoch": 0.2660453930557643, + "flos": 11509442582400.0, + "grad_norm": 2.0755220631041684, + "language_loss": 0.78940654, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.81122386, + "num_input_tokens_seen": 95680260, + "step": 4425, + "time_per_iteration": 2.6266820430755615 + }, + { + "auxiliary_loss_clip": 0.01123867, + "auxiliary_loss_mlp": 0.01043959, + "balance_loss_clip": 1.05143404, + "balance_loss_mlp": 1.02767467, + "epoch": 0.26610551630843227, + "flos": 24791578197120.0, + "grad_norm": 1.5673316066045293, + "language_loss": 0.80135047, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.82302874, + "num_input_tokens_seen": 95701140, + "step": 4426, + "time_per_iteration": 2.7015280723571777 + }, + { + "auxiliary_loss_clip": 0.01150747, + "auxiliary_loss_mlp": 0.0104448, + "balance_loss_clip": 1.0554285, + "balance_loss_mlp": 1.02678883, + "epoch": 0.26616563956110023, + "flos": 27636385397760.0, + "grad_norm": 1.617839398314704, + "language_loss": 0.77174348, + "learning_rate": 3.442899417008333e-06, + "loss": 0.79369569, + "num_input_tokens_seen": 95722060, + "step": 4427, + "time_per_iteration": 2.6438984870910645 + }, + { + "auxiliary_loss_clip": 0.01112968, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.05125654, + "balance_loss_mlp": 1.02069747, + "epoch": 0.26622576281376825, + "flos": 28362893880960.0, + "grad_norm": 1.5634759975385293, + "language_loss": 0.76754683, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.78905165, + "num_input_tokens_seen": 95742495, + "step": 4428, + "time_per_iteration": 2.7695741653442383 + }, + { + "auxiliary_loss_clip": 0.01114899, + "auxiliary_loss_mlp": 0.00775922, + "balance_loss_clip": 1.04922283, + "balance_loss_mlp": 1.0008111, + "epoch": 0.2662858860664362, + "flos": 18041341979520.0, + "grad_norm": 1.815928660217762, + "language_loss": 0.82900071, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.84790885, + "num_input_tokens_seen": 95761510, + "step": 4429, + "time_per_iteration": 2.764183282852173 + }, + { + "auxiliary_loss_clip": 0.01106492, + "auxiliary_loss_mlp": 0.01039033, + "balance_loss_clip": 1.05041027, + "balance_loss_mlp": 1.02201009, + "epoch": 0.2663460093191042, + "flos": 22745818995840.0, + "grad_norm": 1.6463341595476202, + "language_loss": 0.71996218, + "learning_rate": 3.442090102943143e-06, + "loss": 0.74141741, + "num_input_tokens_seen": 95782385, + "step": 4430, + "time_per_iteration": 2.7244491577148438 + }, + { + "auxiliary_loss_clip": 0.01148257, + "auxiliary_loss_mlp": 0.01049268, + "balance_loss_clip": 1.05231071, + "balance_loss_mlp": 1.03068352, + "epoch": 0.26640613257177215, + "flos": 16508782344960.0, + "grad_norm": 1.9574919733512919, + "language_loss": 0.82021642, + "learning_rate": 3.441820222206035e-06, + "loss": 0.84219164, + "num_input_tokens_seen": 95800595, + "step": 4431, + "time_per_iteration": 2.5910067558288574 + }, + { + "auxiliary_loss_clip": 0.01143334, + "auxiliary_loss_mlp": 0.01050031, + "balance_loss_clip": 1.0540812, + "balance_loss_mlp": 1.03141046, + "epoch": 0.2664662558244401, + "flos": 23075945919360.0, + "grad_norm": 2.074794485495937, + "language_loss": 0.76745522, + "learning_rate": 3.44155028679496e-06, + "loss": 0.7893889, + "num_input_tokens_seen": 95818480, + "step": 4432, + "time_per_iteration": 2.6548166275024414 + }, + { + "auxiliary_loss_clip": 0.01089372, + "auxiliary_loss_mlp": 0.01052807, + "balance_loss_clip": 1.04526138, + "balance_loss_mlp": 1.03232694, + "epoch": 0.2665263790771081, + "flos": 23769273214080.0, + "grad_norm": 1.872584196626497, + "language_loss": 0.82903433, + "learning_rate": 3.441280296720154e-06, + "loss": 0.85045612, + "num_input_tokens_seen": 95837205, + "step": 4433, + "time_per_iteration": 4.2740867137908936 + }, + { + "auxiliary_loss_clip": 0.01142798, + "auxiliary_loss_mlp": 0.01045231, + "balance_loss_clip": 1.05565643, + "balance_loss_mlp": 1.02671802, + "epoch": 0.26658650232977604, + "flos": 28001273708160.0, + "grad_norm": 2.548777168378285, + "language_loss": 0.76308644, + "learning_rate": 3.441010251991854e-06, + "loss": 0.78496677, + "num_input_tokens_seen": 95858395, + "step": 4434, + "time_per_iteration": 4.203384160995483 + }, + { + "auxiliary_loss_clip": 0.0114611, + "auxiliary_loss_mlp": 0.01044925, + "balance_loss_clip": 1.05197668, + "balance_loss_mlp": 1.02772319, + "epoch": 0.266646625582444, + "flos": 22163635359360.0, + "grad_norm": 2.3452347637055393, + "language_loss": 0.82496321, + "learning_rate": 3.440740152620301e-06, + "loss": 0.84687358, + "num_input_tokens_seen": 95877875, + "step": 4435, + "time_per_iteration": 4.102782964706421 + }, + { + "auxiliary_loss_clip": 0.01104916, + "auxiliary_loss_mlp": 0.01062101, + "balance_loss_clip": 1.04567468, + "balance_loss_mlp": 1.04245555, + "epoch": 0.266706748835112, + "flos": 27853537069440.0, + "grad_norm": 1.994258420562806, + "language_loss": 0.87634504, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.89801526, + "num_input_tokens_seen": 95895820, + "step": 4436, + "time_per_iteration": 2.8048155307769775 + }, + { + "auxiliary_loss_clip": 0.01121439, + "auxiliary_loss_mlp": 0.01047617, + "balance_loss_clip": 1.04637265, + "balance_loss_mlp": 1.03054643, + "epoch": 0.26676687208777994, + "flos": 25812123413760.0, + "grad_norm": 1.4763923958478316, + "language_loss": 0.787242, + "learning_rate": 3.440199789988407e-06, + "loss": 0.80893254, + "num_input_tokens_seen": 95918025, + "step": 4437, + "time_per_iteration": 2.7382607460021973 + }, + { + "auxiliary_loss_clip": 0.01093686, + "auxiliary_loss_mlp": 0.01048829, + "balance_loss_clip": 1.05000877, + "balance_loss_mlp": 1.03117394, + "epoch": 0.2668269953404479, + "flos": 36064583504640.0, + "grad_norm": 4.5178491997969115, + "language_loss": 0.63910848, + "learning_rate": 3.439929526748556e-06, + "loss": 0.66053367, + "num_input_tokens_seen": 95937725, + "step": 4438, + "time_per_iteration": 2.956014633178711 + }, + { + "auxiliary_loss_clip": 0.01080658, + "auxiliary_loss_mlp": 0.01047394, + "balance_loss_clip": 1.0432179, + "balance_loss_mlp": 1.02994168, + "epoch": 0.26688711859311587, + "flos": 26570987072640.0, + "grad_norm": 1.84569516037299, + "language_loss": 0.75897747, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.78025794, + "num_input_tokens_seen": 95956335, + "step": 4439, + "time_per_iteration": 4.428173065185547 + }, + { + "auxiliary_loss_clip": 0.01089075, + "auxiliary_loss_mlp": 0.01041089, + "balance_loss_clip": 1.04845262, + "balance_loss_mlp": 1.02181315, + "epoch": 0.26694724184578383, + "flos": 26761565658240.0, + "grad_norm": 2.10654378697334, + "language_loss": 0.7172367, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.73853838, + "num_input_tokens_seen": 95977135, + "step": 4440, + "time_per_iteration": 2.9196605682373047 + }, + { + "auxiliary_loss_clip": 0.01124038, + "auxiliary_loss_mlp": 0.01049644, + "balance_loss_clip": 1.04784775, + "balance_loss_mlp": 1.02931881, + "epoch": 0.2670073650984518, + "flos": 20959586536320.0, + "grad_norm": 1.869180757677473, + "language_loss": 0.66229129, + "learning_rate": 3.439118409456376e-06, + "loss": 0.68402815, + "num_input_tokens_seen": 95995435, + "step": 4441, + "time_per_iteration": 2.666428804397583 + }, + { + "auxiliary_loss_clip": 0.01137041, + "auxiliary_loss_mlp": 0.01049045, + "balance_loss_clip": 1.04973912, + "balance_loss_mlp": 1.02953053, + "epoch": 0.2670674883511198, + "flos": 28366054277760.0, + "grad_norm": 3.888081439634283, + "language_loss": 0.76102316, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.78288412, + "num_input_tokens_seen": 96016340, + "step": 4442, + "time_per_iteration": 2.6413686275482178 + }, + { + "auxiliary_loss_clip": 0.0100646, + "auxiliary_loss_mlp": 0.0105848, + "balance_loss_clip": 1.02694619, + "balance_loss_mlp": 1.05538034, + "epoch": 0.2671276116037878, + "flos": 58971319430400.0, + "grad_norm": 0.9410220376713593, + "language_loss": 0.61210632, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.63275576, + "num_input_tokens_seen": 96071205, + "step": 4443, + "time_per_iteration": 3.2342116832733154 + }, + { + "auxiliary_loss_clip": 0.01123665, + "auxiliary_loss_mlp": 0.01039982, + "balance_loss_clip": 1.05413401, + "balance_loss_mlp": 1.02239847, + "epoch": 0.26718773485645575, + "flos": 43945072053120.0, + "grad_norm": 1.5620381861600383, + "language_loss": 0.76195556, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.78359205, + "num_input_tokens_seen": 96094240, + "step": 4444, + "time_per_iteration": 3.136178731918335 + }, + { + "auxiliary_loss_clip": 0.01142711, + "auxiliary_loss_mlp": 0.01040756, + "balance_loss_clip": 1.05331576, + "balance_loss_mlp": 1.0213964, + "epoch": 0.2672478581091237, + "flos": 25228323665280.0, + "grad_norm": 1.6750833182703528, + "language_loss": 0.80892444, + "learning_rate": 3.438036155780158e-06, + "loss": 0.83075905, + "num_input_tokens_seen": 96114105, + "step": 4445, + "time_per_iteration": 2.660952091217041 + }, + { + "auxiliary_loss_clip": 0.01124381, + "auxiliary_loss_mlp": 0.01048514, + "balance_loss_clip": 1.05190587, + "balance_loss_mlp": 1.02901077, + "epoch": 0.2673079813617917, + "flos": 15268176455040.0, + "grad_norm": 2.1125172985353533, + "language_loss": 0.89060926, + "learning_rate": 3.43776545600926e-06, + "loss": 0.9123382, + "num_input_tokens_seen": 96132140, + "step": 4446, + "time_per_iteration": 2.6609115600585938 + }, + { + "auxiliary_loss_clip": 0.011447, + "auxiliary_loss_mlp": 0.01053132, + "balance_loss_clip": 1.05528426, + "balance_loss_mlp": 1.03541803, + "epoch": 0.26736810461445965, + "flos": 25812733944960.0, + "grad_norm": 2.4310086382368783, + "language_loss": 0.67756736, + "learning_rate": 3.437494701718153e-06, + "loss": 0.69954574, + "num_input_tokens_seen": 96152090, + "step": 4447, + "time_per_iteration": 2.6696949005126953 + }, + { + "auxiliary_loss_clip": 0.01144309, + "auxiliary_loss_mlp": 0.0104489, + "balance_loss_clip": 1.05496442, + "balance_loss_mlp": 1.02572155, + "epoch": 0.2674282278671276, + "flos": 24312709054080.0, + "grad_norm": 1.9687667134305082, + "language_loss": 0.830899, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.85279107, + "num_input_tokens_seen": 96170015, + "step": 4448, + "time_per_iteration": 2.639463424682617 + }, + { + "auxiliary_loss_clip": 0.0111564, + "auxiliary_loss_mlp": 0.01054364, + "balance_loss_clip": 1.05101895, + "balance_loss_mlp": 1.03557646, + "epoch": 0.2674883511197956, + "flos": 22815521337600.0, + "grad_norm": 1.479052407292424, + "language_loss": 0.84231561, + "learning_rate": 3.436953029616378e-06, + "loss": 0.8640157, + "num_input_tokens_seen": 96188065, + "step": 4449, + "time_per_iteration": 2.812290906906128 + }, + { + "auxiliary_loss_clip": 0.0113237, + "auxiliary_loss_mlp": 0.01055905, + "balance_loss_clip": 1.05103493, + "balance_loss_mlp": 1.03552055, + "epoch": 0.26754847437246354, + "flos": 25370170473600.0, + "grad_norm": 1.7379167843341312, + "language_loss": 0.84231997, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.86420268, + "num_input_tokens_seen": 96205780, + "step": 4450, + "time_per_iteration": 2.7598626613616943 + }, + { + "auxiliary_loss_clip": 0.01109743, + "auxiliary_loss_mlp": 0.01057779, + "balance_loss_clip": 1.04833305, + "balance_loss_mlp": 1.04044628, + "epoch": 0.2676085976251315, + "flos": 20230420446720.0, + "grad_norm": 8.035146429526597, + "language_loss": 0.80842566, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.83010095, + "num_input_tokens_seen": 96224990, + "step": 4451, + "time_per_iteration": 2.7467129230499268 + }, + { + "auxiliary_loss_clip": 0.01141732, + "auxiliary_loss_mlp": 0.01055516, + "balance_loss_clip": 1.0553689, + "balance_loss_mlp": 1.0379324, + "epoch": 0.26766872087779947, + "flos": 28038225824640.0, + "grad_norm": 1.6378235408468254, + "language_loss": 0.86285019, + "learning_rate": 3.436140112818882e-06, + "loss": 0.88482267, + "num_input_tokens_seen": 96245345, + "step": 4452, + "time_per_iteration": 2.7442660331726074 + }, + { + "auxiliary_loss_clip": 0.01134475, + "auxiliary_loss_mlp": 0.01047993, + "balance_loss_clip": 1.05496478, + "balance_loss_mlp": 1.02926481, + "epoch": 0.26772884413046744, + "flos": 18325179250560.0, + "grad_norm": 2.119384740597093, + "language_loss": 0.83521158, + "learning_rate": 3.435869031622194e-06, + "loss": 0.85703623, + "num_input_tokens_seen": 96259000, + "step": 4453, + "time_per_iteration": 2.659623146057129 + }, + { + "auxiliary_loss_clip": 0.01141347, + "auxiliary_loss_mlp": 0.01063496, + "balance_loss_clip": 1.05624223, + "balance_loss_mlp": 1.04485118, + "epoch": 0.2677889673831354, + "flos": 22127509255680.0, + "grad_norm": 1.8460317519144305, + "language_loss": 0.79565918, + "learning_rate": 3.435597895977208e-06, + "loss": 0.8177076, + "num_input_tokens_seen": 96277000, + "step": 4454, + "time_per_iteration": 2.6458942890167236 + }, + { + "auxiliary_loss_clip": 0.01130641, + "auxiliary_loss_mlp": 0.01056871, + "balance_loss_clip": 1.05338597, + "balance_loss_mlp": 1.03869116, + "epoch": 0.2678490906358034, + "flos": 23729699404800.0, + "grad_norm": 1.5255880946203295, + "language_loss": 0.7241919, + "learning_rate": 3.435326705894206e-06, + "loss": 0.74606699, + "num_input_tokens_seen": 96297010, + "step": 4455, + "time_per_iteration": 2.7328429222106934 + }, + { + "auxiliary_loss_clip": 0.01112613, + "auxiliary_loss_mlp": 0.01052208, + "balance_loss_clip": 1.04858243, + "balance_loss_mlp": 1.03508949, + "epoch": 0.2679092138884714, + "flos": 21762872340480.0, + "grad_norm": 1.5657028408886426, + "language_loss": 0.74017322, + "learning_rate": 3.435055461383471e-06, + "loss": 0.76182139, + "num_input_tokens_seen": 96315780, + "step": 4456, + "time_per_iteration": 2.700190544128418 + }, + { + "auxiliary_loss_clip": 0.0114232, + "auxiliary_loss_mlp": 0.01048809, + "balance_loss_clip": 1.05394006, + "balance_loss_mlp": 1.03033149, + "epoch": 0.26796933714113935, + "flos": 19861186590720.0, + "grad_norm": 2.4373070589767774, + "language_loss": 0.70647967, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.72839093, + "num_input_tokens_seen": 96333465, + "step": 4457, + "time_per_iteration": 2.6334941387176514 + }, + { + "auxiliary_loss_clip": 0.01112923, + "auxiliary_loss_mlp": 0.01063608, + "balance_loss_clip": 1.05205595, + "balance_loss_mlp": 1.04513049, + "epoch": 0.2680294603938073, + "flos": 20047886507520.0, + "grad_norm": 1.8228045543818674, + "language_loss": 0.7903617, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.81212699, + "num_input_tokens_seen": 96352005, + "step": 4458, + "time_per_iteration": 2.7377572059631348 + }, + { + "auxiliary_loss_clip": 0.01030327, + "auxiliary_loss_mlp": 0.01043883, + "balance_loss_clip": 1.0366354, + "balance_loss_mlp": 1.0414269, + "epoch": 0.2680895836464753, + "flos": 72113763052800.0, + "grad_norm": 0.9600198584891941, + "language_loss": 0.58691025, + "learning_rate": 3.434241401387739e-06, + "loss": 0.60765231, + "num_input_tokens_seen": 96406265, + "step": 4459, + "time_per_iteration": 3.2385354042053223 + }, + { + "auxiliary_loss_clip": 0.0108842, + "auxiliary_loss_mlp": 0.01056025, + "balance_loss_clip": 1.04306948, + "balance_loss_mlp": 1.0379889, + "epoch": 0.26814970689914325, + "flos": 20449044576000.0, + "grad_norm": 2.1196386888642382, + "language_loss": 0.84988648, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.87133086, + "num_input_tokens_seen": 96425225, + "step": 4460, + "time_per_iteration": 2.767054319381714 + }, + { + "auxiliary_loss_clip": 0.01134128, + "auxiliary_loss_mlp": 0.01059054, + "balance_loss_clip": 1.0525527, + "balance_loss_mlp": 1.03916979, + "epoch": 0.2682098301518112, + "flos": 17566674727680.0, + "grad_norm": 1.6839260392555548, + "language_loss": 0.68334675, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.70527858, + "num_input_tokens_seen": 96443780, + "step": 4461, + "time_per_iteration": 2.7217342853546143 + }, + { + "auxiliary_loss_clip": 0.0111525, + "auxiliary_loss_mlp": 0.01054739, + "balance_loss_clip": 1.05045152, + "balance_loss_mlp": 1.03649962, + "epoch": 0.2682699534044792, + "flos": 18333259810560.0, + "grad_norm": 1.7146103847032579, + "language_loss": 0.67240328, + "learning_rate": 3.43342685191282e-06, + "loss": 0.69410318, + "num_input_tokens_seen": 96464530, + "step": 4462, + "time_per_iteration": 2.730682134628296 + }, + { + "auxiliary_loss_clip": 0.01116667, + "auxiliary_loss_mlp": 0.01046675, + "balance_loss_clip": 1.05230319, + "balance_loss_mlp": 1.02710128, + "epoch": 0.26833007665714714, + "flos": 25301294144640.0, + "grad_norm": 1.7796857642272712, + "language_loss": 0.69503593, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.71666932, + "num_input_tokens_seen": 96483345, + "step": 4463, + "time_per_iteration": 2.738046407699585 + }, + { + "auxiliary_loss_clip": 0.01118676, + "auxiliary_loss_mlp": 0.01049589, + "balance_loss_clip": 1.0492326, + "balance_loss_mlp": 1.02862048, + "epoch": 0.2683901999098151, + "flos": 16099759198080.0, + "grad_norm": 2.5866232358274277, + "language_loss": 0.77943784, + "learning_rate": 3.432883547133931e-06, + "loss": 0.80112046, + "num_input_tokens_seen": 96498305, + "step": 4464, + "time_per_iteration": 2.6794681549072266 + }, + { + "auxiliary_loss_clip": 0.01133564, + "auxiliary_loss_mlp": 0.01042879, + "balance_loss_clip": 1.05244994, + "balance_loss_mlp": 1.02410388, + "epoch": 0.2684503231624831, + "flos": 27308054154240.0, + "grad_norm": 2.2986867036088285, + "language_loss": 0.71375966, + "learning_rate": 3.432611813236704e-06, + "loss": 0.73552406, + "num_input_tokens_seen": 96519740, + "step": 4465, + "time_per_iteration": 2.699575662612915 + }, + { + "auxiliary_loss_clip": 0.01042347, + "auxiliary_loss_mlp": 0.01001834, + "balance_loss_clip": 1.02813911, + "balance_loss_mlp": 0.9993788, + "epoch": 0.26851044641515104, + "flos": 71858007239040.0, + "grad_norm": 0.7242654721351415, + "language_loss": 0.53150702, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.5519489, + "num_input_tokens_seen": 96588870, + "step": 4466, + "time_per_iteration": 3.3984062671661377 + }, + { + "auxiliary_loss_clip": 0.01118674, + "auxiliary_loss_mlp": 0.0105552, + "balance_loss_clip": 1.04732478, + "balance_loss_mlp": 1.03381157, + "epoch": 0.268570569667819, + "flos": 18733771434240.0, + "grad_norm": 2.1738333593055796, + "language_loss": 0.74038142, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.76212335, + "num_input_tokens_seen": 96605100, + "step": 4467, + "time_per_iteration": 2.6631343364715576 + }, + { + "auxiliary_loss_clip": 0.01126618, + "auxiliary_loss_mlp": 0.00777879, + "balance_loss_clip": 1.05088973, + "balance_loss_mlp": 1.00093102, + "epoch": 0.268630692920487, + "flos": 18178376365440.0, + "grad_norm": 3.586661477808892, + "language_loss": 0.80481976, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.82386476, + "num_input_tokens_seen": 96621410, + "step": 4468, + "time_per_iteration": 2.64806866645813 + }, + { + "auxiliary_loss_clip": 0.01059326, + "auxiliary_loss_mlp": 0.01006331, + "balance_loss_clip": 1.02527809, + "balance_loss_mlp": 1.0036248, + "epoch": 0.268690816173155, + "flos": 68731768978560.0, + "grad_norm": 0.8399316740346766, + "language_loss": 0.59498715, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.61564374, + "num_input_tokens_seen": 96684810, + "step": 4469, + "time_per_iteration": 3.1989517211914062 + }, + { + "auxiliary_loss_clip": 0.01156531, + "auxiliary_loss_mlp": 0.01048741, + "balance_loss_clip": 1.05689096, + "balance_loss_mlp": 1.02854705, + "epoch": 0.26875093942582295, + "flos": 23293636295040.0, + "grad_norm": 2.165956170420043, + "language_loss": 0.82055074, + "learning_rate": 3.431252329084972e-06, + "loss": 0.84260345, + "num_input_tokens_seen": 96701920, + "step": 4470, + "time_per_iteration": 2.6167352199554443 + }, + { + "auxiliary_loss_clip": 0.01117064, + "auxiliary_loss_mlp": 0.01054605, + "balance_loss_clip": 1.04794455, + "balance_loss_mlp": 1.03563929, + "epoch": 0.2688110626784909, + "flos": 21543458112000.0, + "grad_norm": 1.6543166375172473, + "language_loss": 0.82841349, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.8501302, + "num_input_tokens_seen": 96721260, + "step": 4471, + "time_per_iteration": 4.177881956100464 + }, + { + "auxiliary_loss_clip": 0.01133274, + "auxiliary_loss_mlp": 0.01045934, + "balance_loss_clip": 1.05339766, + "balance_loss_mlp": 1.02762365, + "epoch": 0.2688711859311589, + "flos": 28400600183040.0, + "grad_norm": 2.017001756898941, + "language_loss": 0.69309431, + "learning_rate": 3.43070815543947e-06, + "loss": 0.71488637, + "num_input_tokens_seen": 96740385, + "step": 4472, + "time_per_iteration": 2.6611149311065674 + }, + { + "auxiliary_loss_clip": 0.01150636, + "auxiliary_loss_mlp": 0.01046679, + "balance_loss_clip": 1.05448234, + "balance_loss_mlp": 1.02882099, + "epoch": 0.26893130918382685, + "flos": 25994944661760.0, + "grad_norm": 1.889152474147147, + "language_loss": 0.67809618, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.70006931, + "num_input_tokens_seen": 96761860, + "step": 4473, + "time_per_iteration": 2.6570448875427246 + }, + { + "auxiliary_loss_clip": 0.01123821, + "auxiliary_loss_mlp": 0.01056077, + "balance_loss_clip": 1.05778623, + "balance_loss_mlp": 1.03800452, + "epoch": 0.2689914324364948, + "flos": 20339624770560.0, + "grad_norm": 2.20378943201051, + "language_loss": 0.82835853, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.8501575, + "num_input_tokens_seen": 96781890, + "step": 4474, + "time_per_iteration": 5.79376220703125 + }, + { + "auxiliary_loss_clip": 0.01138349, + "auxiliary_loss_mlp": 0.01055982, + "balance_loss_clip": 1.05353034, + "balance_loss_mlp": 1.03841054, + "epoch": 0.2690515556891628, + "flos": 19464553635840.0, + "grad_norm": 2.404484364093812, + "language_loss": 0.71004206, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.73198539, + "num_input_tokens_seen": 96800390, + "step": 4475, + "time_per_iteration": 2.5969674587249756 + }, + { + "auxiliary_loss_clip": 0.01112288, + "auxiliary_loss_mlp": 0.00776382, + "balance_loss_clip": 1.05001771, + "balance_loss_mlp": 1.00081563, + "epoch": 0.26911167894183075, + "flos": 18146631720960.0, + "grad_norm": 1.8574153972172647, + "language_loss": 0.73638999, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.75527668, + "num_input_tokens_seen": 96816685, + "step": 4476, + "time_per_iteration": 2.70358943939209 + }, + { + "auxiliary_loss_clip": 0.01119256, + "auxiliary_loss_mlp": 0.01043783, + "balance_loss_clip": 1.05050373, + "balance_loss_mlp": 1.02605665, + "epoch": 0.2691718021944987, + "flos": 19975131509760.0, + "grad_norm": 1.5040704863343832, + "language_loss": 0.80439913, + "learning_rate": 3.429346772085922e-06, + "loss": 0.82602954, + "num_input_tokens_seen": 96836285, + "step": 4477, + "time_per_iteration": 4.313180208206177 + }, + { + "auxiliary_loss_clip": 0.01097359, + "auxiliary_loss_mlp": 0.0104976, + "balance_loss_clip": 1.04965031, + "balance_loss_mlp": 1.0309844, + "epoch": 0.2692319254471667, + "flos": 37447215770880.0, + "grad_norm": 1.7971929656919947, + "language_loss": 0.65181434, + "learning_rate": 3.429074332770984e-06, + "loss": 0.67328548, + "num_input_tokens_seen": 96857745, + "step": 4478, + "time_per_iteration": 2.8882603645324707 + }, + { + "auxiliary_loss_clip": 0.01130488, + "auxiliary_loss_mlp": 0.01050401, + "balance_loss_clip": 1.04841042, + "balance_loss_mlp": 1.03163743, + "epoch": 0.26929204869983464, + "flos": 22127796564480.0, + "grad_norm": 1.933707281531851, + "language_loss": 0.80987537, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.83168429, + "num_input_tokens_seen": 96877295, + "step": 4479, + "time_per_iteration": 2.670370578765869 + }, + { + "auxiliary_loss_clip": 0.01127626, + "auxiliary_loss_mlp": 0.00776143, + "balance_loss_clip": 1.05010593, + "balance_loss_mlp": 1.0010041, + "epoch": 0.2693521719525026, + "flos": 19792813052160.0, + "grad_norm": 16.364114673072947, + "language_loss": 0.81205857, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.83109629, + "num_input_tokens_seen": 96896160, + "step": 4480, + "time_per_iteration": 2.687922954559326 + }, + { + "auxiliary_loss_clip": 0.01098242, + "auxiliary_loss_mlp": 0.01051142, + "balance_loss_clip": 1.04720628, + "balance_loss_mlp": 1.03243792, + "epoch": 0.2694122952051706, + "flos": 20994383836800.0, + "grad_norm": 1.5167677573266813, + "language_loss": 0.77982032, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.80131412, + "num_input_tokens_seen": 96915410, + "step": 4481, + "time_per_iteration": 2.783400058746338 + }, + { + "auxiliary_loss_clip": 0.01138325, + "auxiliary_loss_mlp": 0.01055373, + "balance_loss_clip": 1.05098486, + "balance_loss_mlp": 1.03693104, + "epoch": 0.2694724184578386, + "flos": 25849291011840.0, + "grad_norm": 1.817845708033507, + "language_loss": 0.74072635, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.76266336, + "num_input_tokens_seen": 96937865, + "step": 4482, + "time_per_iteration": 2.704923629760742 + }, + { + "auxiliary_loss_clip": 0.01124372, + "auxiliary_loss_mlp": 0.01046467, + "balance_loss_clip": 1.05258846, + "balance_loss_mlp": 1.02826333, + "epoch": 0.26953254171050656, + "flos": 21726961718400.0, + "grad_norm": 2.016330221700464, + "language_loss": 0.72562164, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.74733007, + "num_input_tokens_seen": 96957710, + "step": 4483, + "time_per_iteration": 2.697889804840088 + }, + { + "auxiliary_loss_clip": 0.0113896, + "auxiliary_loss_mlp": 0.01056121, + "balance_loss_clip": 1.04867983, + "balance_loss_mlp": 1.03658295, + "epoch": 0.2695926649631745, + "flos": 19682926369920.0, + "grad_norm": 2.3663265895203356, + "language_loss": 0.86904967, + "learning_rate": 3.427438559239605e-06, + "loss": 0.89100051, + "num_input_tokens_seen": 96975890, + "step": 4484, + "time_per_iteration": 2.6893441677093506 + }, + { + "auxiliary_loss_clip": 0.01139698, + "auxiliary_loss_mlp": 0.01049025, + "balance_loss_clip": 1.05224931, + "balance_loss_mlp": 1.03148949, + "epoch": 0.2696527882158425, + "flos": 32886596724480.0, + "grad_norm": 1.783447205979712, + "language_loss": 0.6663093, + "learning_rate": 3.427165740807239e-06, + "loss": 0.68819648, + "num_input_tokens_seen": 96998595, + "step": 4485, + "time_per_iteration": 2.795172929763794 + }, + { + "auxiliary_loss_clip": 0.01112833, + "auxiliary_loss_mlp": 0.01053324, + "balance_loss_clip": 1.04507363, + "balance_loss_mlp": 1.03475094, + "epoch": 0.26971291146851045, + "flos": 12124843320960.0, + "grad_norm": 2.5437851063433743, + "language_loss": 0.73155308, + "learning_rate": 3.426892868256604e-06, + "loss": 0.75321472, + "num_input_tokens_seen": 97013715, + "step": 4486, + "time_per_iteration": 2.6854116916656494 + }, + { + "auxiliary_loss_clip": 0.01156209, + "auxiliary_loss_mlp": 0.01047906, + "balance_loss_clip": 1.05688012, + "balance_loss_mlp": 1.03062034, + "epoch": 0.2697730347211784, + "flos": 22634459856000.0, + "grad_norm": 2.2389379935408456, + "language_loss": 0.84326887, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.86531007, + "num_input_tokens_seen": 97031570, + "step": 4487, + "time_per_iteration": 2.6117801666259766 + }, + { + "auxiliary_loss_clip": 0.01127332, + "auxiliary_loss_mlp": 0.0105083, + "balance_loss_clip": 1.05733204, + "balance_loss_mlp": 1.03228104, + "epoch": 0.2698331579738464, + "flos": 23513050523520.0, + "grad_norm": 2.345170862120161, + "language_loss": 0.7189706, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.74075222, + "num_input_tokens_seen": 97049815, + "step": 4488, + "time_per_iteration": 2.7384660243988037 + }, + { + "auxiliary_loss_clip": 0.01074601, + "auxiliary_loss_mlp": 0.01061378, + "balance_loss_clip": 1.0494225, + "balance_loss_mlp": 1.04040885, + "epoch": 0.26989328122651435, + "flos": 24641040297600.0, + "grad_norm": 1.6359957516545125, + "language_loss": 0.83725536, + "learning_rate": 3.426073925998578e-06, + "loss": 0.85861516, + "num_input_tokens_seen": 97067570, + "step": 4489, + "time_per_iteration": 2.9274613857269287 + }, + { + "auxiliary_loss_clip": 0.01129648, + "auxiliary_loss_mlp": 0.01061235, + "balance_loss_clip": 1.05630314, + "balance_loss_mlp": 1.04203057, + "epoch": 0.2699534044791823, + "flos": 10772555068800.0, + "grad_norm": 2.6678463269995785, + "language_loss": 0.90056908, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.9224779, + "num_input_tokens_seen": 97082180, + "step": 4490, + "time_per_iteration": 2.9096486568450928 + }, + { + "auxiliary_loss_clip": 0.01075397, + "auxiliary_loss_mlp": 0.01052666, + "balance_loss_clip": 1.04493999, + "balance_loss_mlp": 1.03319883, + "epoch": 0.2700135277318503, + "flos": 36171597098880.0, + "grad_norm": 2.0876908666200573, + "language_loss": 0.73380542, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.75508606, + "num_input_tokens_seen": 97103470, + "step": 4491, + "time_per_iteration": 2.9016802310943604 + }, + { + "auxiliary_loss_clip": 0.01156852, + "auxiliary_loss_mlp": 0.01052294, + "balance_loss_clip": 1.05944943, + "balance_loss_mlp": 1.03453195, + "epoch": 0.27007365098451824, + "flos": 17418614866560.0, + "grad_norm": 2.7575700534068783, + "language_loss": 0.74795783, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.77004933, + "num_input_tokens_seen": 97118100, + "step": 4492, + "time_per_iteration": 2.6685187816619873 + }, + { + "auxiliary_loss_clip": 0.01130467, + "auxiliary_loss_mlp": 0.01050253, + "balance_loss_clip": 1.05300546, + "balance_loss_mlp": 1.03205013, + "epoch": 0.2701337742371862, + "flos": 23185688947200.0, + "grad_norm": 3.551039047250381, + "language_loss": 0.89015245, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.91195965, + "num_input_tokens_seen": 97136765, + "step": 4493, + "time_per_iteration": 2.7044742107391357 + }, + { + "auxiliary_loss_clip": 0.01142037, + "auxiliary_loss_mlp": 0.0104825, + "balance_loss_clip": 1.05408192, + "balance_loss_mlp": 1.03079772, + "epoch": 0.2701938974898542, + "flos": 24389450461440.0, + "grad_norm": 1.665337194117132, + "language_loss": 0.71139705, + "learning_rate": 3.424707940835998e-06, + "loss": 0.73329991, + "num_input_tokens_seen": 97157470, + "step": 4494, + "time_per_iteration": 2.6299519538879395 + }, + { + "auxiliary_loss_clip": 0.01120214, + "auxiliary_loss_mlp": 0.01045805, + "balance_loss_clip": 1.05193532, + "balance_loss_mlp": 1.02893662, + "epoch": 0.2702540207425222, + "flos": 26214322976640.0, + "grad_norm": 2.4718809008283045, + "language_loss": 0.8642354, + "learning_rate": 3.42443458168683e-06, + "loss": 0.88589561, + "num_input_tokens_seen": 97176905, + "step": 4495, + "time_per_iteration": 2.627389907836914 + }, + { + "auxiliary_loss_clip": 0.01151814, + "auxiliary_loss_mlp": 0.0105053, + "balance_loss_clip": 1.05591631, + "balance_loss_mlp": 1.03308964, + "epoch": 0.27031414399519016, + "flos": 22926377687040.0, + "grad_norm": 2.1521214825296844, + "language_loss": 0.76781964, + "learning_rate": 3.424161168522959e-06, + "loss": 0.78984308, + "num_input_tokens_seen": 97196380, + "step": 4496, + "time_per_iteration": 2.5360703468322754 + }, + { + "auxiliary_loss_clip": 0.01064272, + "auxiliary_loss_mlp": 0.01049575, + "balance_loss_clip": 1.03151321, + "balance_loss_mlp": 1.04716671, + "epoch": 0.2703742672478581, + "flos": 63019780404480.0, + "grad_norm": 0.7153442156657138, + "language_loss": 0.50134224, + "learning_rate": 3.423887701354754e-06, + "loss": 0.52248067, + "num_input_tokens_seen": 97260100, + "step": 4497, + "time_per_iteration": 3.1133949756622314 + }, + { + "auxiliary_loss_clip": 0.01106563, + "auxiliary_loss_mlp": 0.01051954, + "balance_loss_clip": 1.05492568, + "balance_loss_mlp": 1.03482318, + "epoch": 0.2704343905005261, + "flos": 18840820942080.0, + "grad_norm": 2.421164292554959, + "language_loss": 0.72386497, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.74545014, + "num_input_tokens_seen": 97277935, + "step": 4498, + "time_per_iteration": 2.7409775257110596 + }, + { + "auxiliary_loss_clip": 0.01038432, + "auxiliary_loss_mlp": 0.01028244, + "balance_loss_clip": 1.0322926, + "balance_loss_mlp": 1.02582395, + "epoch": 0.27049451375319405, + "flos": 71233412618880.0, + "grad_norm": 0.7537228186848703, + "language_loss": 0.5917033, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.61237001, + "num_input_tokens_seen": 97338845, + "step": 4499, + "time_per_iteration": 3.2331602573394775 + }, + { + "auxiliary_loss_clip": 0.01124574, + "auxiliary_loss_mlp": 0.01044613, + "balance_loss_clip": 1.05154204, + "balance_loss_mlp": 1.02593243, + "epoch": 0.270554637005862, + "flos": 24278594112000.0, + "grad_norm": 2.1159538878254756, + "language_loss": 0.73629957, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.75799143, + "num_input_tokens_seen": 97356640, + "step": 4500, + "time_per_iteration": 2.7513487339019775 + }, + { + "auxiliary_loss_clip": 0.01116688, + "auxiliary_loss_mlp": 0.01047016, + "balance_loss_clip": 1.04657793, + "balance_loss_mlp": 1.02878881, + "epoch": 0.27061476025853, + "flos": 17632318832640.0, + "grad_norm": 2.8997006330289925, + "language_loss": 0.81041664, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.83205366, + "num_input_tokens_seen": 97372585, + "step": 4501, + "time_per_iteration": 2.703014850616455 + }, + { + "auxiliary_loss_clip": 0.01104056, + "auxiliary_loss_mlp": 0.01053779, + "balance_loss_clip": 1.04828477, + "balance_loss_mlp": 1.03331053, + "epoch": 0.27067488351119795, + "flos": 22710123855360.0, + "grad_norm": 4.2139696132912565, + "language_loss": 0.7261312, + "learning_rate": 3.422519555811735e-06, + "loss": 0.74770957, + "num_input_tokens_seen": 97393315, + "step": 4502, + "time_per_iteration": 2.732167959213257 + }, + { + "auxiliary_loss_clip": 0.01129704, + "auxiliary_loss_mlp": 0.01047167, + "balance_loss_clip": 1.04821455, + "balance_loss_mlp": 1.0268774, + "epoch": 0.2707350067638659, + "flos": 41719616087040.0, + "grad_norm": 1.748421457410976, + "language_loss": 0.67973912, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.70150787, + "num_input_tokens_seen": 97417860, + "step": 4503, + "time_per_iteration": 2.7950186729431152 + }, + { + "auxiliary_loss_clip": 0.01100008, + "auxiliary_loss_mlp": 0.01051668, + "balance_loss_clip": 1.04750037, + "balance_loss_mlp": 1.03180754, + "epoch": 0.2707951300165339, + "flos": 20193037367040.0, + "grad_norm": 1.847411158173202, + "language_loss": 0.67971921, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.70123595, + "num_input_tokens_seen": 97436780, + "step": 4504, + "time_per_iteration": 2.7830374240875244 + }, + { + "auxiliary_loss_clip": 0.01142201, + "auxiliary_loss_mlp": 0.01052204, + "balance_loss_clip": 1.05604792, + "balance_loss_mlp": 1.03451371, + "epoch": 0.27085525326920185, + "flos": 21433966479360.0, + "grad_norm": 1.4870002594081857, + "language_loss": 0.75395846, + "learning_rate": 3.421698021097902e-06, + "loss": 0.77590245, + "num_input_tokens_seen": 97456190, + "step": 4505, + "time_per_iteration": 2.6758666038513184 + }, + { + "auxiliary_loss_clip": 0.01155407, + "auxiliary_loss_mlp": 0.01064618, + "balance_loss_clip": 1.05439496, + "balance_loss_mlp": 1.04436409, + "epoch": 0.2709153765218698, + "flos": 17675232606720.0, + "grad_norm": 2.0635482699578254, + "language_loss": 0.73474276, + "learning_rate": 3.42142406835758e-06, + "loss": 0.75694299, + "num_input_tokens_seen": 97474545, + "step": 4506, + "time_per_iteration": 2.652395009994507 + }, + { + "auxiliary_loss_clip": 0.01130629, + "auxiliary_loss_mlp": 0.01053462, + "balance_loss_clip": 1.05147469, + "balance_loss_mlp": 1.0338285, + "epoch": 0.2709754997745378, + "flos": 24456243801600.0, + "grad_norm": 2.6352592870517144, + "language_loss": 0.80730569, + "learning_rate": 3.421150061716715e-06, + "loss": 0.82914662, + "num_input_tokens_seen": 97494520, + "step": 4507, + "time_per_iteration": 2.7858307361602783 + }, + { + "auxiliary_loss_clip": 0.01041671, + "auxiliary_loss_mlp": 0.010698, + "balance_loss_clip": 1.0261147, + "balance_loss_mlp": 1.0667243, + "epoch": 0.2710356230272058, + "flos": 65210798206080.0, + "grad_norm": 0.7655673562950965, + "language_loss": 0.5085085, + "learning_rate": 3.420876001185698e-06, + "loss": 0.52962321, + "num_input_tokens_seen": 97552455, + "step": 4508, + "time_per_iteration": 3.144418716430664 + }, + { + "auxiliary_loss_clip": 0.01072779, + "auxiliary_loss_mlp": 0.01046589, + "balance_loss_clip": 1.04359698, + "balance_loss_mlp": 1.02843356, + "epoch": 0.27109574627987376, + "flos": 25484438615040.0, + "grad_norm": 1.9710162430227722, + "language_loss": 0.74710357, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.76829731, + "num_input_tokens_seen": 97572650, + "step": 4509, + "time_per_iteration": 2.8052053451538086 + }, + { + "auxiliary_loss_clip": 0.01130819, + "auxiliary_loss_mlp": 0.01042284, + "balance_loss_clip": 1.05107474, + "balance_loss_mlp": 1.0254159, + "epoch": 0.2711558695325417, + "flos": 19682782715520.0, + "grad_norm": 2.0468089657674353, + "language_loss": 0.70937192, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.73110294, + "num_input_tokens_seen": 97591150, + "step": 4510, + "time_per_iteration": 2.6244139671325684 + }, + { + "auxiliary_loss_clip": 0.01135912, + "auxiliary_loss_mlp": 0.0103914, + "balance_loss_clip": 1.05330467, + "balance_loss_mlp": 1.02156901, + "epoch": 0.2712159927852097, + "flos": 18587758648320.0, + "grad_norm": 2.4701723872261256, + "language_loss": 0.70409644, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.72584701, + "num_input_tokens_seen": 97607410, + "step": 4511, + "time_per_iteration": 4.112820863723755 + }, + { + "auxiliary_loss_clip": 0.0112023, + "auxiliary_loss_mlp": 0.01049105, + "balance_loss_clip": 1.048491, + "balance_loss_mlp": 1.03115225, + "epoch": 0.27127611603787766, + "flos": 25630235919360.0, + "grad_norm": 6.028868725677894, + "language_loss": 0.81324005, + "learning_rate": 3.419779220367979e-06, + "loss": 0.83493352, + "num_input_tokens_seen": 97626870, + "step": 4512, + "time_per_iteration": 4.285844087600708 + }, + { + "auxiliary_loss_clip": 0.01147816, + "auxiliary_loss_mlp": 0.01038614, + "balance_loss_clip": 1.05365086, + "balance_loss_mlp": 1.02323616, + "epoch": 0.2713362392905456, + "flos": 23148952312320.0, + "grad_norm": 2.7707983308205053, + "language_loss": 0.80467856, + "learning_rate": 3.419504890542124e-06, + "loss": 0.82654285, + "num_input_tokens_seen": 97646595, + "step": 4513, + "time_per_iteration": 4.415290117263794 + }, + { + "auxiliary_loss_clip": 0.01119685, + "auxiliary_loss_mlp": 0.01044412, + "balance_loss_clip": 1.04594898, + "balance_loss_mlp": 1.02709103, + "epoch": 0.2713963625432136, + "flos": 18366045949440.0, + "grad_norm": 1.8005970142501413, + "language_loss": 0.88150048, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.90314144, + "num_input_tokens_seen": 97665485, + "step": 4514, + "time_per_iteration": 2.691697835922241 + }, + { + "auxiliary_loss_clip": 0.01129072, + "auxiliary_loss_mlp": 0.01051817, + "balance_loss_clip": 1.05358005, + "balance_loss_mlp": 1.03337574, + "epoch": 0.27145648579588155, + "flos": 22491751121280.0, + "grad_norm": 1.6419144417830658, + "language_loss": 0.91461927, + "learning_rate": 3.418956069417517e-06, + "loss": 0.93642819, + "num_input_tokens_seen": 97683800, + "step": 4515, + "time_per_iteration": 2.6709890365600586 + }, + { + "auxiliary_loss_clip": 0.01100451, + "auxiliary_loss_mlp": 0.01057835, + "balance_loss_clip": 1.04920852, + "balance_loss_mlp": 1.03761721, + "epoch": 0.2715166090485495, + "flos": 19239177749760.0, + "grad_norm": 2.0250040358395944, + "language_loss": 0.74093282, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.76251566, + "num_input_tokens_seen": 97700505, + "step": 4516, + "time_per_iteration": 2.7001607418060303 + }, + { + "auxiliary_loss_clip": 0.01136738, + "auxiliary_loss_mlp": 0.01052795, + "balance_loss_clip": 1.05046439, + "balance_loss_mlp": 1.03483033, + "epoch": 0.2715767323012175, + "flos": 17709598944000.0, + "grad_norm": 2.811509606055916, + "language_loss": 0.75989574, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.78179109, + "num_input_tokens_seen": 97717410, + "step": 4517, + "time_per_iteration": 4.207966089248657 + }, + { + "auxiliary_loss_clip": 0.01097642, + "auxiliary_loss_mlp": 0.01058771, + "balance_loss_clip": 1.04378986, + "balance_loss_mlp": 1.03962636, + "epoch": 0.27163685555388545, + "flos": 22382834106240.0, + "grad_norm": 2.3161178488466097, + "language_loss": 0.77046895, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.79203308, + "num_input_tokens_seen": 97734545, + "step": 4518, + "time_per_iteration": 2.754009246826172 + }, + { + "auxiliary_loss_clip": 0.01118909, + "auxiliary_loss_mlp": 0.01047823, + "balance_loss_clip": 1.05136919, + "balance_loss_mlp": 1.03077579, + "epoch": 0.2716969788065534, + "flos": 22346708002560.0, + "grad_norm": 2.717268994046331, + "language_loss": 0.68388188, + "learning_rate": 3.41785778156811e-06, + "loss": 0.70554924, + "num_input_tokens_seen": 97754000, + "step": 4519, + "time_per_iteration": 2.7800872325897217 + }, + { + "auxiliary_loss_clip": 0.01134075, + "auxiliary_loss_mlp": 0.01053278, + "balance_loss_clip": 1.05009973, + "balance_loss_mlp": 1.03611171, + "epoch": 0.2717571020592214, + "flos": 25228467319680.0, + "grad_norm": 2.367483937305651, + "language_loss": 0.75572526, + "learning_rate": 3.417583075166451e-06, + "loss": 0.7775988, + "num_input_tokens_seen": 97772080, + "step": 4520, + "time_per_iteration": 2.694591760635376 + }, + { + "auxiliary_loss_clip": 0.01138275, + "auxiliary_loss_mlp": 0.0106095, + "balance_loss_clip": 1.05209494, + "balance_loss_mlp": 1.04226971, + "epoch": 0.2718172253118894, + "flos": 20189769229440.0, + "grad_norm": 3.3698654303080935, + "language_loss": 0.76434267, + "learning_rate": 3.4173083150099e-06, + "loss": 0.78633487, + "num_input_tokens_seen": 97789370, + "step": 4521, + "time_per_iteration": 2.675443649291992 + }, + { + "auxiliary_loss_clip": 0.01117262, + "auxiliary_loss_mlp": 0.0106414, + "balance_loss_clip": 1.04636955, + "balance_loss_mlp": 1.04578209, + "epoch": 0.27187734856455736, + "flos": 14319129260160.0, + "grad_norm": 2.1933848209734936, + "language_loss": 0.75041616, + "learning_rate": 3.417033501108875e-06, + "loss": 0.77223015, + "num_input_tokens_seen": 97807385, + "step": 4522, + "time_per_iteration": 2.769519329071045 + }, + { + "auxiliary_loss_clip": 0.01151707, + "auxiliary_loss_mlp": 0.01045506, + "balance_loss_clip": 1.05433989, + "balance_loss_mlp": 1.02813768, + "epoch": 0.27193747181722533, + "flos": 21107682311040.0, + "grad_norm": 1.9328965147806931, + "language_loss": 0.73074079, + "learning_rate": 3.416758633473798e-06, + "loss": 0.75271285, + "num_input_tokens_seen": 97827930, + "step": 4523, + "time_per_iteration": 2.6642134189605713 + }, + { + "auxiliary_loss_clip": 0.01120278, + "auxiliary_loss_mlp": 0.01048373, + "balance_loss_clip": 1.05034256, + "balance_loss_mlp": 1.03014588, + "epoch": 0.2719975950698933, + "flos": 19682782715520.0, + "grad_norm": 1.3899676528871532, + "language_loss": 0.74113363, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.76282012, + "num_input_tokens_seen": 97847440, + "step": 4524, + "time_per_iteration": 2.6365647315979004 + }, + { + "auxiliary_loss_clip": 0.0115251, + "auxiliary_loss_mlp": 0.01059779, + "balance_loss_clip": 1.05642283, + "balance_loss_mlp": 1.04233861, + "epoch": 0.27205771832256126, + "flos": 24754482426240.0, + "grad_norm": 1.6567279945506783, + "language_loss": 0.7639389, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.78606176, + "num_input_tokens_seen": 97867620, + "step": 4525, + "time_per_iteration": 2.7116904258728027 + }, + { + "auxiliary_loss_clip": 0.01133976, + "auxiliary_loss_mlp": 0.01063183, + "balance_loss_clip": 1.05110538, + "balance_loss_mlp": 1.0458858, + "epoch": 0.2721178415752292, + "flos": 21755581879680.0, + "grad_norm": 1.8049087044415455, + "language_loss": 0.81449121, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.8364628, + "num_input_tokens_seen": 97884345, + "step": 4526, + "time_per_iteration": 2.583151340484619 + }, + { + "auxiliary_loss_clip": 0.01150721, + "auxiliary_loss_mlp": 0.01050593, + "balance_loss_clip": 1.05157495, + "balance_loss_mlp": 1.03235435, + "epoch": 0.2721779648278972, + "flos": 12676826597760.0, + "grad_norm": 2.689071598576449, + "language_loss": 0.77230763, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.79432082, + "num_input_tokens_seen": 97901500, + "step": 4527, + "time_per_iteration": 2.6060924530029297 + }, + { + "auxiliary_loss_clip": 0.01109469, + "auxiliary_loss_mlp": 0.00777538, + "balance_loss_clip": 1.04898691, + "balance_loss_mlp": 1.00073338, + "epoch": 0.27223808808056515, + "flos": 16253206099200.0, + "grad_norm": 2.5564103940467313, + "language_loss": 0.8187297, + "learning_rate": 3.415383489652503e-06, + "loss": 0.83759975, + "num_input_tokens_seen": 97917800, + "step": 4528, + "time_per_iteration": 2.697845458984375 + }, + { + "auxiliary_loss_clip": 0.01116518, + "auxiliary_loss_mlp": 0.01058829, + "balance_loss_clip": 1.05005443, + "balance_loss_mlp": 1.04094744, + "epoch": 0.2722982113332331, + "flos": 27745805203200.0, + "grad_norm": 1.774189879269534, + "language_loss": 0.77156031, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.7933138, + "num_input_tokens_seen": 97937225, + "step": 4529, + "time_per_iteration": 2.75425124168396 + }, + { + "auxiliary_loss_clip": 0.01123493, + "auxiliary_loss_mlp": 0.01053103, + "balance_loss_clip": 1.0518961, + "balance_loss_mlp": 1.03634179, + "epoch": 0.2723583345859011, + "flos": 21726243446400.0, + "grad_norm": 2.104422440945624, + "language_loss": 0.82359695, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.84536296, + "num_input_tokens_seen": 97956845, + "step": 4530, + "time_per_iteration": 2.6822023391723633 + }, + { + "auxiliary_loss_clip": 0.01136812, + "auxiliary_loss_mlp": 0.01047087, + "balance_loss_clip": 1.05334496, + "balance_loss_mlp": 1.02971828, + "epoch": 0.27241845783856905, + "flos": 17347260499200.0, + "grad_norm": 2.321764638586046, + "language_loss": 0.91554427, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.93738323, + "num_input_tokens_seen": 97972465, + "step": 4531, + "time_per_iteration": 2.6979331970214844 + }, + { + "auxiliary_loss_clip": 0.01138188, + "auxiliary_loss_mlp": 0.01046663, + "balance_loss_clip": 1.05187678, + "balance_loss_mlp": 1.02856672, + "epoch": 0.272478581091237, + "flos": 24754302858240.0, + "grad_norm": 1.9110068503115385, + "language_loss": 0.76398945, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.78583801, + "num_input_tokens_seen": 97990770, + "step": 4532, + "time_per_iteration": 2.6663877964019775 + }, + { + "auxiliary_loss_clip": 0.01113354, + "auxiliary_loss_mlp": 0.01040904, + "balance_loss_clip": 1.05224109, + "balance_loss_mlp": 1.02386856, + "epoch": 0.272538704343905, + "flos": 17890624512000.0, + "grad_norm": 2.311201731752709, + "language_loss": 0.88514459, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.90668714, + "num_input_tokens_seen": 98005775, + "step": 4533, + "time_per_iteration": 2.693161725997925 + }, + { + "auxiliary_loss_clip": 0.01122748, + "auxiliary_loss_mlp": 0.01040937, + "balance_loss_clip": 1.05127299, + "balance_loss_mlp": 1.02398562, + "epoch": 0.272598827596573, + "flos": 22932016122240.0, + "grad_norm": 2.2174577403643245, + "language_loss": 0.71288157, + "learning_rate": 3.413731546022929e-06, + "loss": 0.73451841, + "num_input_tokens_seen": 98025750, + "step": 4534, + "time_per_iteration": 2.7371840476989746 + }, + { + "auxiliary_loss_clip": 0.01121649, + "auxiliary_loss_mlp": 0.01040323, + "balance_loss_clip": 1.05089378, + "balance_loss_mlp": 1.02177453, + "epoch": 0.27265895084924097, + "flos": 24238409771520.0, + "grad_norm": 1.6997646677502514, + "language_loss": 0.91605014, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.93766987, + "num_input_tokens_seen": 98044955, + "step": 4535, + "time_per_iteration": 2.72127103805542 + }, + { + "auxiliary_loss_clip": 0.0113065, + "auxiliary_loss_mlp": 0.01045251, + "balance_loss_clip": 1.05495596, + "balance_loss_mlp": 1.02739298, + "epoch": 0.27271907410190893, + "flos": 27013155494400.0, + "grad_norm": 1.6448383128638457, + "language_loss": 0.72919363, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.7509526, + "num_input_tokens_seen": 98065860, + "step": 4536, + "time_per_iteration": 2.778991460800171 + }, + { + "auxiliary_loss_clip": 0.01137601, + "auxiliary_loss_mlp": 0.01044231, + "balance_loss_clip": 1.05134857, + "balance_loss_mlp": 1.02601612, + "epoch": 0.2727791973545769, + "flos": 34452588942720.0, + "grad_norm": 1.7760428855271044, + "language_loss": 0.71682841, + "learning_rate": 3.41290485034781e-06, + "loss": 0.73864675, + "num_input_tokens_seen": 98085450, + "step": 4537, + "time_per_iteration": 2.7746009826660156 + }, + { + "auxiliary_loss_clip": 0.01119602, + "auxiliary_loss_mlp": 0.01042982, + "balance_loss_clip": 1.04899096, + "balance_loss_mlp": 1.02455187, + "epoch": 0.27283932060724486, + "flos": 15041723160960.0, + "grad_norm": 2.103574663853892, + "language_loss": 0.77419543, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.79582125, + "num_input_tokens_seen": 98099115, + "step": 4538, + "time_per_iteration": 2.6432113647460938 + }, + { + "auxiliary_loss_clip": 0.011333, + "auxiliary_loss_mlp": 0.01044735, + "balance_loss_clip": 1.05075216, + "balance_loss_mlp": 1.02784324, + "epoch": 0.2728994438599128, + "flos": 21652411040640.0, + "grad_norm": 1.824827492408775, + "language_loss": 0.90160263, + "learning_rate": 3.412353451992847e-06, + "loss": 0.923383, + "num_input_tokens_seen": 98118415, + "step": 4539, + "time_per_iteration": 2.620088815689087 + }, + { + "auxiliary_loss_clip": 0.0112346, + "auxiliary_loss_mlp": 0.01044264, + "balance_loss_clip": 1.04970992, + "balance_loss_mlp": 1.0250001, + "epoch": 0.2729595671125808, + "flos": 17488424949120.0, + "grad_norm": 1.7778813807473632, + "language_loss": 0.88033229, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.90200949, + "num_input_tokens_seen": 98136300, + "step": 4540, + "time_per_iteration": 2.7115092277526855 + }, + { + "auxiliary_loss_clip": 0.01139055, + "auxiliary_loss_mlp": 0.00775653, + "balance_loss_clip": 1.0515871, + "balance_loss_mlp": 1.00068974, + "epoch": 0.27301969036524876, + "flos": 19318145800320.0, + "grad_norm": 3.2240434674097758, + "language_loss": 0.82471287, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.84385997, + "num_input_tokens_seen": 98154580, + "step": 4541, + "time_per_iteration": 2.6112682819366455 + }, + { + "auxiliary_loss_clip": 0.01123955, + "auxiliary_loss_mlp": 0.01045117, + "balance_loss_clip": 1.05166435, + "balance_loss_mlp": 1.02798617, + "epoch": 0.2730798136179167, + "flos": 21065666376960.0, + "grad_norm": 2.102491799578544, + "language_loss": 0.79535306, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.81704378, + "num_input_tokens_seen": 98173115, + "step": 4542, + "time_per_iteration": 2.7202932834625244 + }, + { + "auxiliary_loss_clip": 0.01130053, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.05406725, + "balance_loss_mlp": 1.02263296, + "epoch": 0.2731399368705847, + "flos": 19171737964800.0, + "grad_norm": 1.955696716620197, + "language_loss": 0.89326978, + "learning_rate": 3.411250012687582e-06, + "loss": 0.91496956, + "num_input_tokens_seen": 98190260, + "step": 4543, + "time_per_iteration": 2.6846654415130615 + }, + { + "auxiliary_loss_clip": 0.01118776, + "auxiliary_loss_mlp": 0.00776653, + "balance_loss_clip": 1.04913735, + "balance_loss_mlp": 1.00080073, + "epoch": 0.27320006012325265, + "flos": 18290130554880.0, + "grad_norm": 2.4410785724718997, + "language_loss": 0.64012986, + "learning_rate": 3.410974019048255e-06, + "loss": 0.65908414, + "num_input_tokens_seen": 98207115, + "step": 4544, + "time_per_iteration": 2.6373775005340576 + }, + { + "auxiliary_loss_clip": 0.01123945, + "auxiliary_loss_mlp": 0.01044578, + "balance_loss_clip": 1.05455351, + "balance_loss_mlp": 1.02582633, + "epoch": 0.2732601833759206, + "flos": 34860929731200.0, + "grad_norm": 3.5876362405970643, + "language_loss": 0.69788039, + "learning_rate": 3.410697971904651e-06, + "loss": 0.71956557, + "num_input_tokens_seen": 98230610, + "step": 4545, + "time_per_iteration": 2.7943291664123535 + }, + { + "auxiliary_loss_clip": 0.0103839, + "auxiliary_loss_mlp": 0.01023664, + "balance_loss_clip": 1.02576709, + "balance_loss_mlp": 1.02123213, + "epoch": 0.2733203066285886, + "flos": 53910824762880.0, + "grad_norm": 0.7314456658795918, + "language_loss": 0.61636353, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.63698411, + "num_input_tokens_seen": 98293585, + "step": 4546, + "time_per_iteration": 3.2244455814361572 + }, + { + "auxiliary_loss_clip": 0.0105925, + "auxiliary_loss_mlp": 0.01053726, + "balance_loss_clip": 1.04915786, + "balance_loss_mlp": 1.03472424, + "epoch": 0.2733804298812566, + "flos": 20660378244480.0, + "grad_norm": 1.905103737754333, + "language_loss": 0.6467241, + "learning_rate": 3.410145717146488e-06, + "loss": 0.66785389, + "num_input_tokens_seen": 98311680, + "step": 4547, + "time_per_iteration": 2.7815287113189697 + }, + { + "auxiliary_loss_clip": 0.01123347, + "auxiliary_loss_mlp": 0.00774125, + "balance_loss_clip": 1.05267262, + "balance_loss_mlp": 1.00081313, + "epoch": 0.27344055313392457, + "flos": 25884339707520.0, + "grad_norm": 1.90846373489731, + "language_loss": 0.77248073, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.79145551, + "num_input_tokens_seen": 98330770, + "step": 4548, + "time_per_iteration": 2.8113017082214355 + }, + { + "auxiliary_loss_clip": 0.01122557, + "auxiliary_loss_mlp": 0.01050902, + "balance_loss_clip": 1.05430245, + "balance_loss_mlp": 1.03526139, + "epoch": 0.27350067638659253, + "flos": 22929753565440.0, + "grad_norm": 1.9713428286290122, + "language_loss": 0.82792878, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.84966338, + "num_input_tokens_seen": 98349860, + "step": 4549, + "time_per_iteration": 2.6938650608062744 + }, + { + "auxiliary_loss_clip": 0.01135405, + "auxiliary_loss_mlp": 0.01048728, + "balance_loss_clip": 1.04898036, + "balance_loss_mlp": 1.02902281, + "epoch": 0.2735607996392605, + "flos": 16574821499520.0, + "grad_norm": 3.4543610040263655, + "language_loss": 0.71193838, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.73377967, + "num_input_tokens_seen": 98367040, + "step": 4550, + "time_per_iteration": 2.638643503189087 + }, + { + "auxiliary_loss_clip": 0.01107347, + "auxiliary_loss_mlp": 0.01042242, + "balance_loss_clip": 1.05066109, + "balance_loss_mlp": 1.02569556, + "epoch": 0.27362092289192846, + "flos": 19645291895040.0, + "grad_norm": 3.3050607953849576, + "language_loss": 0.78899491, + "learning_rate": 3.409040566039563e-06, + "loss": 0.81049079, + "num_input_tokens_seen": 98384010, + "step": 4551, + "time_per_iteration": 4.352613210678101 + }, + { + "auxiliary_loss_clip": 0.01107945, + "auxiliary_loss_mlp": 0.01052105, + "balance_loss_clip": 1.04898548, + "balance_loss_mlp": 1.03342533, + "epoch": 0.27368104614459643, + "flos": 17639142416640.0, + "grad_norm": 2.480443972085862, + "language_loss": 0.71220398, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.73380452, + "num_input_tokens_seen": 98399625, + "step": 4552, + "time_per_iteration": 4.194540739059448 + }, + { + "auxiliary_loss_clip": 0.01123037, + "auxiliary_loss_mlp": 0.01045225, + "balance_loss_clip": 1.05144608, + "balance_loss_mlp": 1.0275104, + "epoch": 0.2737411693972644, + "flos": 21580015178880.0, + "grad_norm": 2.1026303213651967, + "language_loss": 0.71636003, + "learning_rate": 3.408487669858431e-06, + "loss": 0.73804259, + "num_input_tokens_seen": 98417310, + "step": 4553, + "time_per_iteration": 2.7323882579803467 + }, + { + "auxiliary_loss_clip": 0.01134032, + "auxiliary_loss_mlp": 0.01045217, + "balance_loss_clip": 1.05039358, + "balance_loss_mlp": 1.02658415, + "epoch": 0.27380129264993236, + "flos": 25484043565440.0, + "grad_norm": 1.7325126580228065, + "language_loss": 0.58917797, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.6109705, + "num_input_tokens_seen": 98438670, + "step": 4554, + "time_per_iteration": 2.7384533882141113 + }, + { + "auxiliary_loss_clip": 0.01129927, + "auxiliary_loss_mlp": 0.01042216, + "balance_loss_clip": 1.05440903, + "balance_loss_mlp": 1.02400088, + "epoch": 0.2738614159026003, + "flos": 18661196004480.0, + "grad_norm": 1.7915916386168997, + "language_loss": 0.73645991, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.75818133, + "num_input_tokens_seen": 98456060, + "step": 4555, + "time_per_iteration": 2.742417335510254 + }, + { + "auxiliary_loss_clip": 0.01141373, + "auxiliary_loss_mlp": 0.01039158, + "balance_loss_clip": 1.0561738, + "balance_loss_mlp": 1.02152658, + "epoch": 0.2739215391552683, + "flos": 23477139901440.0, + "grad_norm": 2.8904145278515303, + "language_loss": 0.77755523, + "learning_rate": 3.407657925038002e-06, + "loss": 0.79936051, + "num_input_tokens_seen": 98473765, + "step": 4556, + "time_per_iteration": 4.419378280639648 + }, + { + "auxiliary_loss_clip": 0.01150896, + "auxiliary_loss_mlp": 0.01049261, + "balance_loss_clip": 1.05645621, + "balance_loss_mlp": 1.02959132, + "epoch": 0.27398166240793626, + "flos": 17128636369920.0, + "grad_norm": 7.460972643049535, + "language_loss": 0.82236463, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.84436619, + "num_input_tokens_seen": 98490590, + "step": 4557, + "time_per_iteration": 2.6087756156921387 + }, + { + "auxiliary_loss_clip": 0.01089746, + "auxiliary_loss_mlp": 0.01046447, + "balance_loss_clip": 1.04229808, + "balance_loss_mlp": 1.02811229, + "epoch": 0.2740417856606042, + "flos": 23404744039680.0, + "grad_norm": 2.034332886344347, + "language_loss": 0.7293033, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.75066525, + "num_input_tokens_seen": 98510590, + "step": 4558, + "time_per_iteration": 2.7908921241760254 + }, + { + "auxiliary_loss_clip": 0.0112554, + "auxiliary_loss_mlp": 0.01051481, + "balance_loss_clip": 1.05215442, + "balance_loss_mlp": 1.03334939, + "epoch": 0.2741019089132722, + "flos": 12780428400000.0, + "grad_norm": 2.134307291688894, + "language_loss": 0.67842996, + "learning_rate": 3.406827699810819e-06, + "loss": 0.70020014, + "num_input_tokens_seen": 98527875, + "step": 4559, + "time_per_iteration": 2.7246246337890625 + }, + { + "auxiliary_loss_clip": 0.01121642, + "auxiliary_loss_mlp": 0.01055203, + "balance_loss_clip": 1.04958165, + "balance_loss_mlp": 1.03646374, + "epoch": 0.27416203216594015, + "flos": 20631542601600.0, + "grad_norm": 2.095192605103166, + "language_loss": 0.7249226, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.74669105, + "num_input_tokens_seen": 98547575, + "step": 4560, + "time_per_iteration": 2.634526252746582 + }, + { + "auxiliary_loss_clip": 0.01131443, + "auxiliary_loss_mlp": 0.01049928, + "balance_loss_clip": 1.05592251, + "balance_loss_mlp": 1.03115225, + "epoch": 0.27422215541860817, + "flos": 26541576812160.0, + "grad_norm": 2.095026193088577, + "language_loss": 0.81413525, + "learning_rate": 3.406273949573303e-06, + "loss": 0.83594894, + "num_input_tokens_seen": 98566290, + "step": 4561, + "time_per_iteration": 2.711106538772583 + }, + { + "auxiliary_loss_clip": 0.01156737, + "auxiliary_loss_mlp": 0.01043903, + "balance_loss_clip": 1.05919766, + "balance_loss_mlp": 1.02688003, + "epoch": 0.27428227867127614, + "flos": 23331163029120.0, + "grad_norm": 1.7066421621801435, + "language_loss": 0.75436246, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.77636886, + "num_input_tokens_seen": 98586255, + "step": 4562, + "time_per_iteration": 2.699544668197632 + }, + { + "auxiliary_loss_clip": 0.01155238, + "auxiliary_loss_mlp": 0.01038722, + "balance_loss_clip": 1.06035113, + "balance_loss_mlp": 1.02138865, + "epoch": 0.2743424019239441, + "flos": 23035115134080.0, + "grad_norm": 1.784616644228294, + "language_loss": 0.74751598, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.76945561, + "num_input_tokens_seen": 98606030, + "step": 4563, + "time_per_iteration": 2.788313627243042 + }, + { + "auxiliary_loss_clip": 0.01119321, + "auxiliary_loss_mlp": 0.01048987, + "balance_loss_clip": 1.04918432, + "balance_loss_mlp": 1.02912664, + "epoch": 0.27440252517661207, + "flos": 21981101420160.0, + "grad_norm": 1.7657560231579414, + "language_loss": 0.63026172, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.65194476, + "num_input_tokens_seen": 98625225, + "step": 4564, + "time_per_iteration": 2.810922145843506 + }, + { + "auxiliary_loss_clip": 0.01128901, + "auxiliary_loss_mlp": 0.01046032, + "balance_loss_clip": 1.05438292, + "balance_loss_mlp": 1.02732766, + "epoch": 0.27446264842928003, + "flos": 40187451502080.0, + "grad_norm": 1.9571814389681148, + "language_loss": 0.78683448, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.8085838, + "num_input_tokens_seen": 98649470, + "step": 4565, + "time_per_iteration": 2.846803665161133 + }, + { + "auxiliary_loss_clip": 0.01095875, + "auxiliary_loss_mlp": 0.01050978, + "balance_loss_clip": 1.04981828, + "balance_loss_mlp": 1.03370428, + "epoch": 0.274522771681948, + "flos": 13479681438720.0, + "grad_norm": 2.4708024317398003, + "language_loss": 0.68715227, + "learning_rate": 3.404888640957477e-06, + "loss": 0.70862079, + "num_input_tokens_seen": 98666915, + "step": 4566, + "time_per_iteration": 2.714352607727051 + }, + { + "auxiliary_loss_clip": 0.01142259, + "auxiliary_loss_mlp": 0.01049797, + "balance_loss_clip": 1.05835438, + "balance_loss_mlp": 1.03326273, + "epoch": 0.27458289493461596, + "flos": 28622133313920.0, + "grad_norm": 2.1203833431876435, + "language_loss": 0.60966527, + "learning_rate": 3.404611419371723e-06, + "loss": 0.63158584, + "num_input_tokens_seen": 98688240, + "step": 4567, + "time_per_iteration": 2.71791934967041 + }, + { + "auxiliary_loss_clip": 0.01135855, + "auxiliary_loss_mlp": 0.01047435, + "balance_loss_clip": 1.05527198, + "balance_loss_mlp": 1.02756321, + "epoch": 0.2746430181872839, + "flos": 20119815492480.0, + "grad_norm": 4.134990661591929, + "language_loss": 0.82529241, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.84712529, + "num_input_tokens_seen": 98708245, + "step": 4568, + "time_per_iteration": 2.6779236793518066 + }, + { + "auxiliary_loss_clip": 0.01141648, + "auxiliary_loss_mlp": 0.01037451, + "balance_loss_clip": 1.06012177, + "balance_loss_mlp": 1.01916456, + "epoch": 0.2747031414399519, + "flos": 20193468330240.0, + "grad_norm": 2.0524329167860254, + "language_loss": 0.68425417, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.70604521, + "num_input_tokens_seen": 98724575, + "step": 4569, + "time_per_iteration": 2.6595280170440674 + }, + { + "auxiliary_loss_clip": 0.0111585, + "auxiliary_loss_mlp": 0.01047943, + "balance_loss_clip": 1.04627442, + "balance_loss_mlp": 1.02938235, + "epoch": 0.27476326469261986, + "flos": 13516346246400.0, + "grad_norm": 2.9457223850766283, + "language_loss": 0.70966327, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.73130119, + "num_input_tokens_seen": 98740700, + "step": 4570, + "time_per_iteration": 2.7404215335845947 + }, + { + "auxiliary_loss_clip": 0.01035018, + "auxiliary_loss_mlp": 0.01027544, + "balance_loss_clip": 1.03062916, + "balance_loss_mlp": 1.02521896, + "epoch": 0.2748233879452878, + "flos": 65937127121280.0, + "grad_norm": 0.7294499123437721, + "language_loss": 0.55835986, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.57898545, + "num_input_tokens_seen": 98803030, + "step": 4571, + "time_per_iteration": 3.369403123855591 + }, + { + "auxiliary_loss_clip": 0.01096573, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_clip": 1.0493505, + "balance_loss_mlp": 1.03134847, + "epoch": 0.2748835111979558, + "flos": 17384212615680.0, + "grad_norm": 2.8212366896407772, + "language_loss": 0.78388298, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.80534041, + "num_input_tokens_seen": 98820505, + "step": 4572, + "time_per_iteration": 2.835817813873291 + }, + { + "auxiliary_loss_clip": 0.01145371, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.05474758, + "balance_loss_mlp": 1.02365255, + "epoch": 0.27494363445062375, + "flos": 23587565287680.0, + "grad_norm": 3.882915196153325, + "language_loss": 0.8126958, + "learning_rate": 3.402946971702147e-06, + "loss": 0.83453798, + "num_input_tokens_seen": 98842150, + "step": 4573, + "time_per_iteration": 2.709415912628174 + }, + { + "auxiliary_loss_clip": 0.01135124, + "auxiliary_loss_mlp": 0.01042886, + "balance_loss_clip": 1.0529685, + "balance_loss_mlp": 1.0252434, + "epoch": 0.2750037577032918, + "flos": 17164582905600.0, + "grad_norm": 1.740498780022663, + "language_loss": 0.79043669, + "learning_rate": 3.402669377496223e-06, + "loss": 0.81221676, + "num_input_tokens_seen": 98861050, + "step": 4574, + "time_per_iteration": 2.651921272277832 + }, + { + "auxiliary_loss_clip": 0.01104251, + "auxiliary_loss_mlp": 0.01052183, + "balance_loss_clip": 1.05164313, + "balance_loss_mlp": 1.03518367, + "epoch": 0.27506388095595974, + "flos": 24491903028480.0, + "grad_norm": 2.03666793953709, + "language_loss": 0.74517256, + "learning_rate": 3.402391730100936e-06, + "loss": 0.76673687, + "num_input_tokens_seen": 98879695, + "step": 4575, + "time_per_iteration": 2.7622992992401123 + }, + { + "auxiliary_loss_clip": 0.01126178, + "auxiliary_loss_mlp": 0.01042992, + "balance_loss_clip": 1.05188203, + "balance_loss_mlp": 1.02700627, + "epoch": 0.2751240042086277, + "flos": 38764706722560.0, + "grad_norm": 2.5671977719319745, + "language_loss": 0.71951419, + "learning_rate": 3.402114029526814e-06, + "loss": 0.74120593, + "num_input_tokens_seen": 98902035, + "step": 4576, + "time_per_iteration": 2.85740065574646 + }, + { + "auxiliary_loss_clip": 0.01102681, + "auxiliary_loss_mlp": 0.00778132, + "balance_loss_clip": 1.0506314, + "balance_loss_mlp": 1.00075579, + "epoch": 0.27518412746129567, + "flos": 26907039740160.0, + "grad_norm": 1.8050360629969575, + "language_loss": 0.73217857, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.7509867, + "num_input_tokens_seen": 98921835, + "step": 4577, + "time_per_iteration": 2.9024770259857178 + }, + { + "auxiliary_loss_clip": 0.01130618, + "auxiliary_loss_mlp": 0.01043838, + "balance_loss_clip": 1.05657601, + "balance_loss_mlp": 1.02571797, + "epoch": 0.27524425071396363, + "flos": 24900531125760.0, + "grad_norm": 1.7818656930434014, + "language_loss": 0.76073247, + "learning_rate": 3.401558468884188e-06, + "loss": 0.78247702, + "num_input_tokens_seen": 98939610, + "step": 4578, + "time_per_iteration": 2.7173874378204346 + }, + { + "auxiliary_loss_clip": 0.01120877, + "auxiliary_loss_mlp": 0.01047646, + "balance_loss_clip": 1.05252147, + "balance_loss_mlp": 1.02741659, + "epoch": 0.2753043739666316, + "flos": 26288047641600.0, + "grad_norm": 2.6134371594901773, + "language_loss": 0.66563278, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.68731803, + "num_input_tokens_seen": 98962250, + "step": 4579, + "time_per_iteration": 2.730104446411133 + }, + { + "auxiliary_loss_clip": 0.01113502, + "auxiliary_loss_mlp": 0.01058443, + "balance_loss_clip": 1.04683816, + "balance_loss_mlp": 1.03911948, + "epoch": 0.27536449721929956, + "flos": 24206772867840.0, + "grad_norm": 1.8779975195253575, + "language_loss": 0.80174518, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.82346463, + "num_input_tokens_seen": 98981845, + "step": 4580, + "time_per_iteration": 2.8395349979400635 + }, + { + "auxiliary_loss_clip": 0.01141995, + "auxiliary_loss_mlp": 0.01050029, + "balance_loss_clip": 1.05684924, + "balance_loss_mlp": 1.02942991, + "epoch": 0.27542462047196753, + "flos": 19537272720000.0, + "grad_norm": 1.5301552660019138, + "language_loss": 0.67242241, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.69434267, + "num_input_tokens_seen": 99001855, + "step": 4581, + "time_per_iteration": 2.788644552230835 + }, + { + "auxiliary_loss_clip": 0.01132258, + "auxiliary_loss_mlp": 0.0104746, + "balance_loss_clip": 1.0560689, + "balance_loss_mlp": 1.03050864, + "epoch": 0.2754847437246355, + "flos": 14319165173760.0, + "grad_norm": 1.785645052077455, + "language_loss": 0.77915615, + "learning_rate": 3.400446709916392e-06, + "loss": 0.80095327, + "num_input_tokens_seen": 99019880, + "step": 4582, + "time_per_iteration": 2.730393409729004 + }, + { + "auxiliary_loss_clip": 0.0110084, + "auxiliary_loss_mlp": 0.01042256, + "balance_loss_clip": 1.05119133, + "balance_loss_mlp": 1.02575767, + "epoch": 0.27554486697730346, + "flos": 18838773866880.0, + "grad_norm": 1.737971373642785, + "language_loss": 0.84479475, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.86622572, + "num_input_tokens_seen": 99037570, + "step": 4583, + "time_per_iteration": 2.7274270057678223 + }, + { + "auxiliary_loss_clip": 0.01139632, + "auxiliary_loss_mlp": 0.01044098, + "balance_loss_clip": 1.05364764, + "balance_loss_mlp": 1.02693176, + "epoch": 0.2756049902299714, + "flos": 22382295402240.0, + "grad_norm": 1.6883560409679848, + "language_loss": 0.67007428, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.69191158, + "num_input_tokens_seen": 99056875, + "step": 4584, + "time_per_iteration": 2.643176794052124 + }, + { + "auxiliary_loss_clip": 0.01080495, + "auxiliary_loss_mlp": 0.01054092, + "balance_loss_clip": 1.04106402, + "balance_loss_mlp": 1.03475666, + "epoch": 0.2756651134826394, + "flos": 19573901614080.0, + "grad_norm": 1.8352571769398758, + "language_loss": 0.77349764, + "learning_rate": 3.399612333050327e-06, + "loss": 0.79484355, + "num_input_tokens_seen": 99074685, + "step": 4585, + "time_per_iteration": 2.6824886798858643 + }, + { + "auxiliary_loss_clip": 0.01142822, + "auxiliary_loss_mlp": 0.00775816, + "balance_loss_clip": 1.05703616, + "balance_loss_mlp": 1.00084651, + "epoch": 0.27572523673530736, + "flos": 23586559706880.0, + "grad_norm": 1.697985370469672, + "language_loss": 0.7201665, + "learning_rate": 3.399334101267362e-06, + "loss": 0.73935288, + "num_input_tokens_seen": 99095300, + "step": 4586, + "time_per_iteration": 2.672872304916382 + }, + { + "auxiliary_loss_clip": 0.01125604, + "auxiliary_loss_mlp": 0.01038583, + "balance_loss_clip": 1.05329537, + "balance_loss_mlp": 1.02184618, + "epoch": 0.2757853599879754, + "flos": 22820118278400.0, + "grad_norm": 2.166019285475688, + "language_loss": 0.80385983, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.82550168, + "num_input_tokens_seen": 99115965, + "step": 4587, + "time_per_iteration": 2.716212272644043 + }, + { + "auxiliary_loss_clip": 0.01139286, + "auxiliary_loss_mlp": 0.0104661, + "balance_loss_clip": 1.05435753, + "balance_loss_mlp": 1.02916992, + "epoch": 0.27584548324064334, + "flos": 18551704371840.0, + "grad_norm": 3.416975868515595, + "language_loss": 0.83000016, + "learning_rate": 3.398777478523316e-06, + "loss": 0.85185915, + "num_input_tokens_seen": 99134265, + "step": 4588, + "time_per_iteration": 2.6104485988616943 + }, + { + "auxiliary_loss_clip": 0.01109827, + "auxiliary_loss_mlp": 0.01042868, + "balance_loss_clip": 1.04756808, + "balance_loss_mlp": 1.02567828, + "epoch": 0.2759056064933113, + "flos": 23769883745280.0, + "grad_norm": 1.3306263403060763, + "language_loss": 0.75309169, + "learning_rate": 3.398499087583342e-06, + "loss": 0.77461863, + "num_input_tokens_seen": 99156185, + "step": 4589, + "time_per_iteration": 4.333514928817749 + }, + { + "auxiliary_loss_clip": 0.01138237, + "auxiliary_loss_mlp": 0.01046648, + "balance_loss_clip": 1.0555464, + "balance_loss_mlp": 1.02944636, + "epoch": 0.27596572974597927, + "flos": 24281898163200.0, + "grad_norm": 1.9812216556422375, + "language_loss": 0.8860873, + "learning_rate": 3.398220643612143e-06, + "loss": 0.90793616, + "num_input_tokens_seen": 99176735, + "step": 4590, + "time_per_iteration": 4.256460428237915 + }, + { + "auxiliary_loss_clip": 0.01132985, + "auxiliary_loss_mlp": 0.01048634, + "balance_loss_clip": 1.05280411, + "balance_loss_mlp": 1.03025222, + "epoch": 0.27602585299864724, + "flos": 35040985632000.0, + "grad_norm": 1.594737426944321, + "language_loss": 0.71265185, + "learning_rate": 3.397942146620277e-06, + "loss": 0.7344681, + "num_input_tokens_seen": 99199765, + "step": 4591, + "time_per_iteration": 2.8263018131256104 + }, + { + "auxiliary_loss_clip": 0.01114882, + "auxiliary_loss_mlp": 0.01048296, + "balance_loss_clip": 1.05395412, + "balance_loss_mlp": 1.0301044, + "epoch": 0.2760859762513152, + "flos": 24309405002880.0, + "grad_norm": 3.793452037579163, + "language_loss": 0.80017495, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.82180673, + "num_input_tokens_seen": 99218435, + "step": 4592, + "time_per_iteration": 4.289790153503418 + }, + { + "auxiliary_loss_clip": 0.01051224, + "auxiliary_loss_mlp": 0.00755885, + "balance_loss_clip": 1.02655387, + "balance_loss_mlp": 1.00253439, + "epoch": 0.27614609950398317, + "flos": 71260739890560.0, + "grad_norm": 0.710408868807485, + "language_loss": 0.61613023, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.63420129, + "num_input_tokens_seen": 99276200, + "step": 4593, + "time_per_iteration": 3.201831817626953 + }, + { + "auxiliary_loss_clip": 0.01130969, + "auxiliary_loss_mlp": 0.01042983, + "balance_loss_clip": 1.05307889, + "balance_loss_mlp": 1.02640104, + "epoch": 0.27620622275665113, + "flos": 29674854138240.0, + "grad_norm": 1.9659750468178385, + "language_loss": 0.778301, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.80004054, + "num_input_tokens_seen": 99297625, + "step": 4594, + "time_per_iteration": 2.7222111225128174 + }, + { + "auxiliary_loss_clip": 0.0113791, + "auxiliary_loss_mlp": 0.01038649, + "balance_loss_clip": 1.05557215, + "balance_loss_mlp": 1.02168524, + "epoch": 0.2762663460093191, + "flos": 15378063137280.0, + "grad_norm": 1.5118783378909677, + "language_loss": 0.91944981, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.9412154, + "num_input_tokens_seen": 99315790, + "step": 4595, + "time_per_iteration": 4.290736198425293 + }, + { + "auxiliary_loss_clip": 0.01134891, + "auxiliary_loss_mlp": 0.01052323, + "balance_loss_clip": 1.05374146, + "balance_loss_mlp": 1.03413117, + "epoch": 0.27632646926198706, + "flos": 20704082117760.0, + "grad_norm": 1.7744098894398055, + "language_loss": 0.69208467, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.71395689, + "num_input_tokens_seen": 99334615, + "step": 4596, + "time_per_iteration": 2.7178540229797363 + }, + { + "auxiliary_loss_clip": 0.01125254, + "auxiliary_loss_mlp": 0.01048102, + "balance_loss_clip": 1.05075955, + "balance_loss_mlp": 1.02977943, + "epoch": 0.276386592514655, + "flos": 32813374849920.0, + "grad_norm": 1.7305541104386353, + "language_loss": 0.63536781, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.65710139, + "num_input_tokens_seen": 99356685, + "step": 4597, + "time_per_iteration": 2.7713348865509033 + }, + { + "auxiliary_loss_clip": 0.01150233, + "auxiliary_loss_mlp": 0.01046127, + "balance_loss_clip": 1.05762243, + "balance_loss_mlp": 1.02949786, + "epoch": 0.276446715767323, + "flos": 18551704371840.0, + "grad_norm": 2.077440653118394, + "language_loss": 0.86298984, + "learning_rate": 3.395991183985887e-06, + "loss": 0.8849535, + "num_input_tokens_seen": 99374810, + "step": 4598, + "time_per_iteration": 2.6077804565429688 + }, + { + "auxiliary_loss_clip": 0.01151532, + "auxiliary_loss_mlp": 0.01046218, + "balance_loss_clip": 1.0559516, + "balance_loss_mlp": 1.02790797, + "epoch": 0.27650683901999096, + "flos": 22819615488000.0, + "grad_norm": 2.6195813063936493, + "language_loss": 0.79957914, + "learning_rate": 3.395712263209037e-06, + "loss": 0.82155669, + "num_input_tokens_seen": 99391290, + "step": 4599, + "time_per_iteration": 2.67372989654541 + }, + { + "auxiliary_loss_clip": 0.01127397, + "auxiliary_loss_mlp": 0.01049332, + "balance_loss_clip": 1.04922533, + "balance_loss_mlp": 1.03152239, + "epoch": 0.276566962272659, + "flos": 21361534704000.0, + "grad_norm": 1.7492576371751551, + "language_loss": 0.78788924, + "learning_rate": 3.395433289506639e-06, + "loss": 0.80965656, + "num_input_tokens_seen": 99409120, + "step": 4600, + "time_per_iteration": 2.7197396755218506 + }, + { + "auxiliary_loss_clip": 0.01119636, + "auxiliary_loss_mlp": 0.01049981, + "balance_loss_clip": 1.05458808, + "balance_loss_mlp": 1.03226674, + "epoch": 0.27662708552532694, + "flos": 17710604524800.0, + "grad_norm": 2.9827767838021906, + "language_loss": 0.7372371, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.75893331, + "num_input_tokens_seen": 99426180, + "step": 4601, + "time_per_iteration": 2.7212698459625244 + }, + { + "auxiliary_loss_clip": 0.01137986, + "auxiliary_loss_mlp": 0.01053484, + "balance_loss_clip": 1.05503917, + "balance_loss_mlp": 1.03514934, + "epoch": 0.2766872087779949, + "flos": 21252725429760.0, + "grad_norm": 1.7018676665174548, + "language_loss": 0.80055201, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.82246667, + "num_input_tokens_seen": 99447720, + "step": 4602, + "time_per_iteration": 2.6929776668548584 + }, + { + "auxiliary_loss_clip": 0.01131471, + "auxiliary_loss_mlp": 0.01060998, + "balance_loss_clip": 1.05209374, + "balance_loss_mlp": 1.04194784, + "epoch": 0.2767473320306629, + "flos": 12931900053120.0, + "grad_norm": 2.3561631161543986, + "language_loss": 0.77018148, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.79210615, + "num_input_tokens_seen": 99464720, + "step": 4603, + "time_per_iteration": 2.7761597633361816 + }, + { + "auxiliary_loss_clip": 0.01118804, + "auxiliary_loss_mlp": 0.01044782, + "balance_loss_clip": 1.05331254, + "balance_loss_mlp": 1.02858686, + "epoch": 0.27680745528333084, + "flos": 15012851604480.0, + "grad_norm": 1.686999686787164, + "language_loss": 0.81469357, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.83632934, + "num_input_tokens_seen": 99482310, + "step": 4604, + "time_per_iteration": 2.6715614795684814 + }, + { + "auxiliary_loss_clip": 0.01096642, + "auxiliary_loss_mlp": 0.0104217, + "balance_loss_clip": 1.04733086, + "balance_loss_mlp": 1.02428889, + "epoch": 0.2768675785359988, + "flos": 22637835734400.0, + "grad_norm": 1.8500484413544072, + "language_loss": 0.7021662, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.72355425, + "num_input_tokens_seen": 99501255, + "step": 4605, + "time_per_iteration": 2.824810266494751 + }, + { + "auxiliary_loss_clip": 0.01051326, + "auxiliary_loss_mlp": 0.01005015, + "balance_loss_clip": 1.02826095, + "balance_loss_mlp": 1.00244009, + "epoch": 0.27692770178866677, + "flos": 66130542881280.0, + "grad_norm": 0.7013581781305706, + "language_loss": 0.57222801, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.59279138, + "num_input_tokens_seen": 99568925, + "step": 4606, + "time_per_iteration": 3.288269519805908 + }, + { + "auxiliary_loss_clip": 0.01125032, + "auxiliary_loss_mlp": 0.01050719, + "balance_loss_clip": 1.05177283, + "balance_loss_mlp": 1.03280139, + "epoch": 0.27698782504133473, + "flos": 26464979059200.0, + "grad_norm": 1.9503980757161308, + "language_loss": 0.69579148, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.71754897, + "num_input_tokens_seen": 99588455, + "step": 4607, + "time_per_iteration": 2.7865042686462402 + }, + { + "auxiliary_loss_clip": 0.0113039, + "auxiliary_loss_mlp": 0.01040949, + "balance_loss_clip": 1.05402029, + "balance_loss_mlp": 1.0242002, + "epoch": 0.2770479482940027, + "flos": 25884806584320.0, + "grad_norm": 1.5552750364168406, + "language_loss": 0.69727945, + "learning_rate": 3.393199595837555e-06, + "loss": 0.71899283, + "num_input_tokens_seen": 99609355, + "step": 4608, + "time_per_iteration": 2.7139909267425537 + }, + { + "auxiliary_loss_clip": 0.0109619, + "auxiliary_loss_mlp": 0.01041619, + "balance_loss_clip": 1.04789758, + "balance_loss_mlp": 1.024894, + "epoch": 0.27710807154667066, + "flos": 22857249962880.0, + "grad_norm": 1.922338327624115, + "language_loss": 0.73170602, + "learning_rate": 3.392920146281499e-06, + "loss": 0.75308412, + "num_input_tokens_seen": 99628780, + "step": 4609, + "time_per_iteration": 2.8674490451812744 + }, + { + "auxiliary_loss_clip": 0.01105896, + "auxiliary_loss_mlp": 0.01054215, + "balance_loss_clip": 1.04444993, + "balance_loss_mlp": 1.03615475, + "epoch": 0.27716819479933863, + "flos": 17711071401600.0, + "grad_norm": 2.284482242639661, + "language_loss": 0.84028268, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.86188376, + "num_input_tokens_seen": 99644545, + "step": 4610, + "time_per_iteration": 2.6861605644226074 + }, + { + "auxiliary_loss_clip": 0.01074905, + "auxiliary_loss_mlp": 0.00781444, + "balance_loss_clip": 1.04093325, + "balance_loss_mlp": 1.00102568, + "epoch": 0.2772283180520066, + "flos": 19646046080640.0, + "grad_norm": 2.0943450829127044, + "language_loss": 0.68915951, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.70772296, + "num_input_tokens_seen": 99663125, + "step": 4611, + "time_per_iteration": 2.799345016479492 + }, + { + "auxiliary_loss_clip": 0.01144902, + "auxiliary_loss_mlp": 0.01042567, + "balance_loss_clip": 1.05466819, + "balance_loss_mlp": 1.02591395, + "epoch": 0.27728844130467456, + "flos": 21032628842880.0, + "grad_norm": 2.6988182686748785, + "language_loss": 0.73646772, + "learning_rate": 3.392081480737698e-06, + "loss": 0.75834239, + "num_input_tokens_seen": 99682645, + "step": 4612, + "time_per_iteration": 2.643157720565796 + }, + { + "auxiliary_loss_clip": 0.01139286, + "auxiliary_loss_mlp": 0.00775997, + "balance_loss_clip": 1.05283117, + "balance_loss_mlp": 1.00099993, + "epoch": 0.2773485645573425, + "flos": 18989204025600.0, + "grad_norm": 2.0654093622255436, + "language_loss": 0.66356897, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.68272179, + "num_input_tokens_seen": 99700520, + "step": 4613, + "time_per_iteration": 2.6685144901275635 + }, + { + "auxiliary_loss_clip": 0.01096758, + "auxiliary_loss_mlp": 0.01051618, + "balance_loss_clip": 1.04526055, + "balance_loss_mlp": 1.03354573, + "epoch": 0.27740868781001055, + "flos": 21468440557440.0, + "grad_norm": 1.5160858700983233, + "language_loss": 0.79385912, + "learning_rate": 3.39152210641815e-06, + "loss": 0.8153429, + "num_input_tokens_seen": 99720355, + "step": 4614, + "time_per_iteration": 2.82061505317688 + }, + { + "auxiliary_loss_clip": 0.01129896, + "auxiliary_loss_mlp": 0.01047714, + "balance_loss_clip": 1.04873419, + "balance_loss_mlp": 1.02978539, + "epoch": 0.2774688110626785, + "flos": 19827825834240.0, + "grad_norm": 2.763943164845673, + "language_loss": 0.80632633, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.82810241, + "num_input_tokens_seen": 99736090, + "step": 4615, + "time_per_iteration": 2.607448101043701 + }, + { + "auxiliary_loss_clip": 0.01114657, + "auxiliary_loss_mlp": 0.01051705, + "balance_loss_clip": 1.04532576, + "balance_loss_mlp": 1.03447962, + "epoch": 0.2775289343153465, + "flos": 18216226321920.0, + "grad_norm": 2.3373471978129543, + "language_loss": 0.646945, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.66860855, + "num_input_tokens_seen": 99751805, + "step": 4616, + "time_per_iteration": 2.693556308746338 + }, + { + "auxiliary_loss_clip": 0.01133374, + "auxiliary_loss_mlp": 0.01047225, + "balance_loss_clip": 1.0536505, + "balance_loss_mlp": 1.03001153, + "epoch": 0.27758905756801444, + "flos": 16472476673280.0, + "grad_norm": 2.175848824107301, + "language_loss": 0.82324976, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.84505582, + "num_input_tokens_seen": 99770610, + "step": 4617, + "time_per_iteration": 2.64677357673645 + }, + { + "auxiliary_loss_clip": 0.01147475, + "auxiliary_loss_mlp": 0.01049438, + "balance_loss_clip": 1.05210304, + "balance_loss_mlp": 1.03261721, + "epoch": 0.2776491808206824, + "flos": 18728240739840.0, + "grad_norm": 2.8579401527932236, + "language_loss": 0.77031851, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.79228759, + "num_input_tokens_seen": 99787305, + "step": 4618, + "time_per_iteration": 2.555001735687256 + }, + { + "auxiliary_loss_clip": 0.01151182, + "auxiliary_loss_mlp": 0.01042958, + "balance_loss_clip": 1.05599475, + "balance_loss_mlp": 1.0268048, + "epoch": 0.27770930407335037, + "flos": 28038189911040.0, + "grad_norm": 1.6850470881083441, + "language_loss": 0.85102153, + "learning_rate": 3.390122747388459e-06, + "loss": 0.87296283, + "num_input_tokens_seen": 99808940, + "step": 4619, + "time_per_iteration": 2.753230094909668 + }, + { + "auxiliary_loss_clip": 0.01121872, + "auxiliary_loss_mlp": 0.01041506, + "balance_loss_clip": 1.05075216, + "balance_loss_mlp": 1.02592564, + "epoch": 0.27776942732601834, + "flos": 23549823072000.0, + "grad_norm": 1.6763124645732197, + "language_loss": 0.7707957, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.79242951, + "num_input_tokens_seen": 99829575, + "step": 4620, + "time_per_iteration": 2.7764816284179688 + }, + { + "auxiliary_loss_clip": 0.01091863, + "auxiliary_loss_mlp": 0.01042513, + "balance_loss_clip": 1.04290819, + "balance_loss_mlp": 1.02517962, + "epoch": 0.2778295505786863, + "flos": 23908713811200.0, + "grad_norm": 1.985202794634515, + "language_loss": 0.78144193, + "learning_rate": 3.389562634707122e-06, + "loss": 0.80278563, + "num_input_tokens_seen": 99847575, + "step": 4621, + "time_per_iteration": 2.740419387817383 + }, + { + "auxiliary_loss_clip": 0.01113871, + "auxiliary_loss_mlp": 0.01054223, + "balance_loss_clip": 1.04857588, + "balance_loss_mlp": 1.03642535, + "epoch": 0.27788967383135427, + "flos": 25554571920000.0, + "grad_norm": 2.864120631038579, + "language_loss": 0.87357259, + "learning_rate": 3.389282499322611e-06, + "loss": 0.89525354, + "num_input_tokens_seen": 99864995, + "step": 4622, + "time_per_iteration": 2.8351151943206787 + }, + { + "auxiliary_loss_clip": 0.01096216, + "auxiliary_loss_mlp": 0.01052098, + "balance_loss_clip": 1.0477345, + "balance_loss_mlp": 1.0349195, + "epoch": 0.27794979708402223, + "flos": 16252631481600.0, + "grad_norm": 1.7857472181098575, + "language_loss": 0.81315404, + "learning_rate": 3.389002311256369e-06, + "loss": 0.83463717, + "num_input_tokens_seen": 99881540, + "step": 4623, + "time_per_iteration": 2.7112133502960205 + }, + { + "auxiliary_loss_clip": 0.01119674, + "auxiliary_loss_mlp": 0.01043259, + "balance_loss_clip": 1.05434608, + "balance_loss_mlp": 1.02628374, + "epoch": 0.2780099203366902, + "flos": 20667632791680.0, + "grad_norm": 2.1551340516102897, + "language_loss": 0.80889726, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.83052659, + "num_input_tokens_seen": 99899595, + "step": 4624, + "time_per_iteration": 2.6492481231689453 + }, + { + "auxiliary_loss_clip": 0.01112812, + "auxiliary_loss_mlp": 0.0077763, + "balance_loss_clip": 1.05008531, + "balance_loss_mlp": 1.00092447, + "epoch": 0.27807004358935816, + "flos": 17739583822080.0, + "grad_norm": 2.21671742511245, + "language_loss": 0.76949263, + "learning_rate": 3.388441777121191e-06, + "loss": 0.78839707, + "num_input_tokens_seen": 99913020, + "step": 4625, + "time_per_iteration": 2.6312057971954346 + }, + { + "auxiliary_loss_clip": 0.01106879, + "auxiliary_loss_mlp": 0.01046687, + "balance_loss_clip": 1.04205859, + "balance_loss_mlp": 1.02767277, + "epoch": 0.2781301668420261, + "flos": 16727119165440.0, + "grad_norm": 1.790813282848893, + "language_loss": 0.69947815, + "learning_rate": 3.388161431073511e-06, + "loss": 0.72101378, + "num_input_tokens_seen": 99931405, + "step": 4626, + "time_per_iteration": 2.7656819820404053 + }, + { + "auxiliary_loss_clip": 0.0110548, + "auxiliary_loss_mlp": 0.01041917, + "balance_loss_clip": 1.04827905, + "balance_loss_mlp": 1.02385652, + "epoch": 0.27819029009469415, + "flos": 13844749317120.0, + "grad_norm": 2.1086116607571546, + "language_loss": 0.92367601, + "learning_rate": 3.38788103238661e-06, + "loss": 0.94515002, + "num_input_tokens_seen": 99948100, + "step": 4627, + "time_per_iteration": 2.8608667850494385 + }, + { + "auxiliary_loss_clip": 0.01149683, + "auxiliary_loss_mlp": 0.01040775, + "balance_loss_clip": 1.05388021, + "balance_loss_mlp": 1.0248611, + "epoch": 0.2782504133473621, + "flos": 27089286370560.0, + "grad_norm": 1.7290354122756755, + "language_loss": 0.85490036, + "learning_rate": 3.387600581071121e-06, + "loss": 0.87680495, + "num_input_tokens_seen": 99966470, + "step": 4628, + "time_per_iteration": 2.6468069553375244 + }, + { + "auxiliary_loss_clip": 0.01114712, + "auxiliary_loss_mlp": 0.0104202, + "balance_loss_clip": 1.0482378, + "balance_loss_mlp": 1.02509212, + "epoch": 0.2783105366000301, + "flos": 21068826773760.0, + "grad_norm": 1.5106040860694088, + "language_loss": 0.79246545, + "learning_rate": 3.387320077137679e-06, + "loss": 0.81403273, + "num_input_tokens_seen": 99985930, + "step": 4629, + "time_per_iteration": 5.656833648681641 + }, + { + "auxiliary_loss_clip": 0.01100825, + "auxiliary_loss_mlp": 0.01040328, + "balance_loss_clip": 1.04602218, + "balance_loss_mlp": 1.02339983, + "epoch": 0.27837065985269804, + "flos": 26501823434880.0, + "grad_norm": 1.5125577415085874, + "language_loss": 0.84574991, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.86716145, + "num_input_tokens_seen": 100006235, + "step": 4630, + "time_per_iteration": 2.70917010307312 + }, + { + "auxiliary_loss_clip": 0.01123828, + "auxiliary_loss_mlp": 0.01038547, + "balance_loss_clip": 1.04848623, + "balance_loss_mlp": 1.02099967, + "epoch": 0.278430783105366, + "flos": 20223201813120.0, + "grad_norm": 2.1016222667741857, + "language_loss": 0.81134796, + "learning_rate": 3.386758911459485e-06, + "loss": 0.83297169, + "num_input_tokens_seen": 100023655, + "step": 4631, + "time_per_iteration": 4.19342041015625 + }, + { + "auxiliary_loss_clip": 0.01149092, + "auxiliary_loss_mlp": 0.01049428, + "balance_loss_clip": 1.05402875, + "balance_loss_mlp": 1.03257155, + "epoch": 0.278490906358034, + "flos": 25592888753280.0, + "grad_norm": 3.9436500565538295, + "language_loss": 0.71196103, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.7339462, + "num_input_tokens_seen": 100043280, + "step": 4632, + "time_per_iteration": 2.620439291000366 + }, + { + "auxiliary_loss_clip": 0.01132813, + "auxiliary_loss_mlp": 0.01044268, + "balance_loss_clip": 1.05435467, + "balance_loss_mlp": 1.02798355, + "epoch": 0.27855102961070194, + "flos": 16171544528640.0, + "grad_norm": 1.8243983980851597, + "language_loss": 0.82563186, + "learning_rate": 3.386197535437145e-06, + "loss": 0.84740269, + "num_input_tokens_seen": 100057690, + "step": 4633, + "time_per_iteration": 2.6531693935394287 + }, + { + "auxiliary_loss_clip": 0.01122775, + "auxiliary_loss_mlp": 0.01039803, + "balance_loss_clip": 1.04714537, + "balance_loss_mlp": 1.02130151, + "epoch": 0.2786111528633699, + "flos": 22927598749440.0, + "grad_norm": 1.6667943176882647, + "language_loss": 0.87727869, + "learning_rate": 3.385916768573529e-06, + "loss": 0.89890444, + "num_input_tokens_seen": 100075875, + "step": 4634, + "time_per_iteration": 4.391691446304321 + }, + { + "auxiliary_loss_clip": 0.01118626, + "auxiliary_loss_mlp": 0.01042889, + "balance_loss_clip": 1.04900146, + "balance_loss_mlp": 1.02503181, + "epoch": 0.27867127611603787, + "flos": 23404205335680.0, + "grad_norm": 1.8664238108113964, + "language_loss": 0.7701081, + "learning_rate": 3.38563594915581e-06, + "loss": 0.79172325, + "num_input_tokens_seen": 100092930, + "step": 4635, + "time_per_iteration": 2.7107748985290527 + }, + { + "auxiliary_loss_clip": 0.01148262, + "auxiliary_loss_mlp": 0.01044984, + "balance_loss_clip": 1.05233121, + "balance_loss_mlp": 1.02705491, + "epoch": 0.27873139936870583, + "flos": 19829010983040.0, + "grad_norm": 1.6280540509164947, + "language_loss": 0.65174443, + "learning_rate": 3.385355077194637e-06, + "loss": 0.67367697, + "num_input_tokens_seen": 100110790, + "step": 4636, + "time_per_iteration": 2.660099744796753 + }, + { + "auxiliary_loss_clip": 0.01134021, + "auxiliary_loss_mlp": 0.01042528, + "balance_loss_clip": 1.048437, + "balance_loss_mlp": 1.0243845, + "epoch": 0.2787915226213738, + "flos": 17707659609600.0, + "grad_norm": 2.8501862977667667, + "language_loss": 0.83485681, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.85662234, + "num_input_tokens_seen": 100126970, + "step": 4637, + "time_per_iteration": 2.6234302520751953 + }, + { + "auxiliary_loss_clip": 0.01117465, + "auxiliary_loss_mlp": 0.01043194, + "balance_loss_clip": 1.04580319, + "balance_loss_mlp": 1.02658796, + "epoch": 0.27885164587404176, + "flos": 22090557139200.0, + "grad_norm": 1.4481958644660236, + "language_loss": 0.75996393, + "learning_rate": 3.384793175684533e-06, + "loss": 0.78157055, + "num_input_tokens_seen": 100146720, + "step": 4638, + "time_per_iteration": 2.6488263607025146 + }, + { + "auxiliary_loss_clip": 0.0113367, + "auxiliary_loss_mlp": 0.01047522, + "balance_loss_clip": 1.04905438, + "balance_loss_mlp": 1.02935445, + "epoch": 0.27891176912670973, + "flos": 19207684500480.0, + "grad_norm": 1.973043880665722, + "language_loss": 0.71658665, + "learning_rate": 3.38451214615691e-06, + "loss": 0.73839855, + "num_input_tokens_seen": 100165920, + "step": 4639, + "time_per_iteration": 2.606290817260742 + }, + { + "auxiliary_loss_clip": 0.01134631, + "auxiliary_loss_mlp": 0.01040486, + "balance_loss_clip": 1.04905224, + "balance_loss_mlp": 1.02213931, + "epoch": 0.27897189237937775, + "flos": 27600007898880.0, + "grad_norm": 1.9413688357819885, + "language_loss": 0.6546669, + "learning_rate": 3.384231064128447e-06, + "loss": 0.67641807, + "num_input_tokens_seen": 100185525, + "step": 4640, + "time_per_iteration": 2.670572280883789 + }, + { + "auxiliary_loss_clip": 0.01134835, + "auxiliary_loss_mlp": 0.01040753, + "balance_loss_clip": 1.05033112, + "balance_loss_mlp": 1.02394438, + "epoch": 0.2790320156320457, + "flos": 21178210665600.0, + "grad_norm": 2.0528630099938385, + "language_loss": 0.72150993, + "learning_rate": 3.383949929609804e-06, + "loss": 0.74326581, + "num_input_tokens_seen": 100204850, + "step": 4641, + "time_per_iteration": 2.693377733230591 + }, + { + "auxiliary_loss_clip": 0.01112862, + "auxiliary_loss_mlp": 0.01043132, + "balance_loss_clip": 1.05076349, + "balance_loss_mlp": 1.02322423, + "epoch": 0.2790921388847137, + "flos": 22783920347520.0, + "grad_norm": 1.7365449070814052, + "language_loss": 0.74695385, + "learning_rate": 3.383668742611641e-06, + "loss": 0.7685138, + "num_input_tokens_seen": 100224520, + "step": 4642, + "time_per_iteration": 2.7462241649627686 + }, + { + "auxiliary_loss_clip": 0.0111075, + "auxiliary_loss_mlp": 0.01045242, + "balance_loss_clip": 1.04543257, + "balance_loss_mlp": 1.02603781, + "epoch": 0.27915226213738165, + "flos": 23400649889280.0, + "grad_norm": 1.8272594017764643, + "language_loss": 0.85924351, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.88080341, + "num_input_tokens_seen": 100243935, + "step": 4643, + "time_per_iteration": 2.725135564804077 + }, + { + "auxiliary_loss_clip": 0.01105223, + "auxiliary_loss_mlp": 0.01045051, + "balance_loss_clip": 1.04933143, + "balance_loss_mlp": 1.02697933, + "epoch": 0.2792123853900496, + "flos": 22747794243840.0, + "grad_norm": 1.7474380366240072, + "language_loss": 0.83161986, + "learning_rate": 3.383106211219407e-06, + "loss": 0.85312265, + "num_input_tokens_seen": 100262290, + "step": 4644, + "time_per_iteration": 2.7356133460998535 + }, + { + "auxiliary_loss_clip": 0.01135825, + "auxiliary_loss_mlp": 0.01044339, + "balance_loss_clip": 1.04996896, + "balance_loss_mlp": 1.02672005, + "epoch": 0.2792725086427176, + "flos": 15049372757760.0, + "grad_norm": 1.8326156585035789, + "language_loss": 0.79077673, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.81257844, + "num_input_tokens_seen": 100280015, + "step": 4645, + "time_per_iteration": 2.6605966091156006 + }, + { + "auxiliary_loss_clip": 0.01043101, + "auxiliary_loss_mlp": 0.01005168, + "balance_loss_clip": 1.02972245, + "balance_loss_mlp": 1.00273657, + "epoch": 0.27933263189538554, + "flos": 62544861757440.0, + "grad_norm": 0.7804050577208047, + "language_loss": 0.62298429, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.64346695, + "num_input_tokens_seen": 100338935, + "step": 4646, + "time_per_iteration": 3.203944206237793 + }, + { + "auxiliary_loss_clip": 0.01116876, + "auxiliary_loss_mlp": 0.01036795, + "balance_loss_clip": 1.05170095, + "balance_loss_mlp": 1.02054703, + "epoch": 0.2793927551480535, + "flos": 25118365155840.0, + "grad_norm": 1.6679902986930268, + "language_loss": 0.89280778, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.91434449, + "num_input_tokens_seen": 100359905, + "step": 4647, + "time_per_iteration": 2.829617500305176 + }, + { + "auxiliary_loss_clip": 0.0113911, + "auxiliary_loss_mlp": 0.01047084, + "balance_loss_clip": 1.05125523, + "balance_loss_mlp": 1.02880919, + "epoch": 0.27945287840072147, + "flos": 21324582587520.0, + "grad_norm": 1.8012650128540075, + "language_loss": 0.86784112, + "learning_rate": 3.381980519149988e-06, + "loss": 0.88970304, + "num_input_tokens_seen": 100376955, + "step": 4648, + "time_per_iteration": 2.632321357727051 + }, + { + "auxiliary_loss_clip": 0.01134603, + "auxiliary_loss_mlp": 0.01044893, + "balance_loss_clip": 1.05110133, + "balance_loss_mlp": 1.02733302, + "epoch": 0.27951300165338944, + "flos": 27450547407360.0, + "grad_norm": 2.0026822782024705, + "language_loss": 0.73003638, + "learning_rate": 3.38169896509385e-06, + "loss": 0.75183129, + "num_input_tokens_seen": 100397545, + "step": 4649, + "time_per_iteration": 2.7211172580718994 + }, + { + "auxiliary_loss_clip": 0.01111127, + "auxiliary_loss_mlp": 0.01044981, + "balance_loss_clip": 1.04752195, + "balance_loss_mlp": 1.02557421, + "epoch": 0.2795731249060574, + "flos": 15159008044800.0, + "grad_norm": 2.1164331968139325, + "language_loss": 0.80629992, + "learning_rate": 3.381417358643549e-06, + "loss": 0.82786095, + "num_input_tokens_seen": 100415080, + "step": 4650, + "time_per_iteration": 2.7502310276031494 + }, + { + "auxiliary_loss_clip": 0.01039445, + "auxiliary_loss_mlp": 0.00754956, + "balance_loss_clip": 1.03124094, + "balance_loss_mlp": 1.00203133, + "epoch": 0.27963324815872537, + "flos": 60120103178880.0, + "grad_norm": 0.8151234776797575, + "language_loss": 0.58806145, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.60600549, + "num_input_tokens_seen": 100471105, + "step": 4651, + "time_per_iteration": 3.2224526405334473 + }, + { + "auxiliary_loss_clip": 0.01135312, + "auxiliary_loss_mlp": 0.01047398, + "balance_loss_clip": 1.04708123, + "balance_loss_mlp": 1.02753818, + "epoch": 0.27969337141139333, + "flos": 21765960910080.0, + "grad_norm": 1.7351399642666463, + "language_loss": 0.74332011, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.76514727, + "num_input_tokens_seen": 100492520, + "step": 4652, + "time_per_iteration": 2.685736894607544 + }, + { + "auxiliary_loss_clip": 0.01148943, + "auxiliary_loss_mlp": 0.01045678, + "balance_loss_clip": 1.05235481, + "balance_loss_mlp": 1.02742696, + "epoch": 0.27975349466406135, + "flos": 39851398834560.0, + "grad_norm": 2.2003219434248633, + "language_loss": 0.79789567, + "learning_rate": 3.380572225034461e-06, + "loss": 0.81984192, + "num_input_tokens_seen": 100512870, + "step": 4653, + "time_per_iteration": 2.7558584213256836 + }, + { + "auxiliary_loss_clip": 0.01121239, + "auxiliary_loss_mlp": 0.01050268, + "balance_loss_clip": 1.04883742, + "balance_loss_mlp": 1.03280401, + "epoch": 0.2798136179167293, + "flos": 21579799697280.0, + "grad_norm": 2.080129868341082, + "language_loss": 0.78903222, + "learning_rate": 3.380290409114312e-06, + "loss": 0.81074733, + "num_input_tokens_seen": 100531655, + "step": 4654, + "time_per_iteration": 2.6496095657348633 + }, + { + "auxiliary_loss_clip": 0.01101836, + "auxiliary_loss_mlp": 0.01052085, + "balance_loss_clip": 1.04982615, + "balance_loss_mlp": 1.03267753, + "epoch": 0.2798737411693973, + "flos": 21537676022400.0, + "grad_norm": 2.0985102630300134, + "language_loss": 0.81319463, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.83473378, + "num_input_tokens_seen": 100548005, + "step": 4655, + "time_per_iteration": 2.742586135864258 + }, + { + "auxiliary_loss_clip": 0.01112605, + "auxiliary_loss_mlp": 0.00776867, + "balance_loss_clip": 1.04759109, + "balance_loss_mlp": 1.00071263, + "epoch": 0.27993386442206525, + "flos": 26981051713920.0, + "grad_norm": 1.7515804597190672, + "language_loss": 0.81455064, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.83344543, + "num_input_tokens_seen": 100567980, + "step": 4656, + "time_per_iteration": 2.796480894088745 + }, + { + "auxiliary_loss_clip": 0.01120191, + "auxiliary_loss_mlp": 0.01050328, + "balance_loss_clip": 1.05115008, + "balance_loss_mlp": 1.03204143, + "epoch": 0.2799939876747332, + "flos": 24349876652160.0, + "grad_norm": 2.044588364139205, + "language_loss": 0.83203471, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.85373986, + "num_input_tokens_seen": 100588630, + "step": 4657, + "time_per_iteration": 2.6785871982574463 + }, + { + "auxiliary_loss_clip": 0.01111476, + "auxiliary_loss_mlp": 0.01052182, + "balance_loss_clip": 1.04937756, + "balance_loss_mlp": 1.03294206, + "epoch": 0.2800541109274012, + "flos": 33656988648960.0, + "grad_norm": 2.165484252442401, + "language_loss": 0.63694274, + "learning_rate": 3.379162622133105e-06, + "loss": 0.65857935, + "num_input_tokens_seen": 100608775, + "step": 4658, + "time_per_iteration": 2.879409074783325 + }, + { + "auxiliary_loss_clip": 0.01136248, + "auxiliary_loss_mlp": 0.010462, + "balance_loss_clip": 1.0495683, + "balance_loss_mlp": 1.02822304, + "epoch": 0.28011423418006914, + "flos": 21614417429760.0, + "grad_norm": 1.7192056687926605, + "language_loss": 0.78342974, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.80525422, + "num_input_tokens_seen": 100627975, + "step": 4659, + "time_per_iteration": 2.6989047527313232 + }, + { + "auxiliary_loss_clip": 0.0111004, + "auxiliary_loss_mlp": 0.01054733, + "balance_loss_clip": 1.04974771, + "balance_loss_mlp": 1.03588593, + "epoch": 0.2801743574327371, + "flos": 23112431159040.0, + "grad_norm": 1.755148683242289, + "language_loss": 0.79341501, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.8150627, + "num_input_tokens_seen": 100645430, + "step": 4660, + "time_per_iteration": 2.715477705001831 + }, + { + "auxiliary_loss_clip": 0.01108147, + "auxiliary_loss_mlp": 0.01046506, + "balance_loss_clip": 1.05007386, + "balance_loss_mlp": 1.02897, + "epoch": 0.2802344806854051, + "flos": 12641418766080.0, + "grad_norm": 2.2526204230687115, + "language_loss": 0.80604905, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.82759559, + "num_input_tokens_seen": 100663775, + "step": 4661, + "time_per_iteration": 2.7715258598327637 + }, + { + "auxiliary_loss_clip": 0.01125452, + "auxiliary_loss_mlp": 0.01056292, + "balance_loss_clip": 1.05232596, + "balance_loss_mlp": 1.03836262, + "epoch": 0.28029460393807304, + "flos": 37267878142080.0, + "grad_norm": 1.5529278028038542, + "language_loss": 0.79010582, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.81192333, + "num_input_tokens_seen": 100686085, + "step": 4662, + "time_per_iteration": 2.81427264213562 + }, + { + "auxiliary_loss_clip": 0.01133119, + "auxiliary_loss_mlp": 0.01052014, + "balance_loss_clip": 1.05226839, + "balance_loss_mlp": 1.03252363, + "epoch": 0.280354727190741, + "flos": 20741106061440.0, + "grad_norm": 1.6202884167711182, + "language_loss": 0.69617724, + "learning_rate": 3.377751711782227e-06, + "loss": 0.71802866, + "num_input_tokens_seen": 100705135, + "step": 4663, + "time_per_iteration": 2.697368860244751 + }, + { + "auxiliary_loss_clip": 0.01124677, + "auxiliary_loss_mlp": 0.01049339, + "balance_loss_clip": 1.05170035, + "balance_loss_mlp": 1.03104067, + "epoch": 0.28041485044340897, + "flos": 21471026336640.0, + "grad_norm": 1.9196144000248758, + "language_loss": 0.77708608, + "learning_rate": 3.377469372935791e-06, + "loss": 0.79882622, + "num_input_tokens_seen": 100724960, + "step": 4664, + "time_per_iteration": 2.7275149822235107 + }, + { + "auxiliary_loss_clip": 0.01107718, + "auxiliary_loss_mlp": 0.01048769, + "balance_loss_clip": 1.0480299, + "balance_loss_mlp": 1.03099537, + "epoch": 0.28047497369607693, + "flos": 14794263388800.0, + "grad_norm": 1.999889511399453, + "language_loss": 0.79593849, + "learning_rate": 3.377186981855578e-06, + "loss": 0.81750339, + "num_input_tokens_seen": 100741995, + "step": 4665, + "time_per_iteration": 2.710507392883301 + }, + { + "auxiliary_loss_clip": 0.01132609, + "auxiliary_loss_mlp": 0.01044622, + "balance_loss_clip": 1.04908824, + "balance_loss_mlp": 1.02724159, + "epoch": 0.2805350969487449, + "flos": 23070738447360.0, + "grad_norm": 1.8624041004678782, + "language_loss": 0.81080002, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.83257234, + "num_input_tokens_seen": 100758985, + "step": 4666, + "time_per_iteration": 2.6129403114318848 + }, + { + "auxiliary_loss_clip": 0.01108409, + "auxiliary_loss_mlp": 0.01071225, + "balance_loss_clip": 1.04823136, + "balance_loss_mlp": 1.05097127, + "epoch": 0.2805952202014129, + "flos": 20479855466880.0, + "grad_norm": 2.103406835637469, + "language_loss": 0.84507895, + "learning_rate": 3.376622043036658e-06, + "loss": 0.86687529, + "num_input_tokens_seen": 100777820, + "step": 4667, + "time_per_iteration": 2.7332448959350586 + }, + { + "auxiliary_loss_clip": 0.01123034, + "auxiliary_loss_mlp": 0.00775483, + "balance_loss_clip": 1.05581784, + "balance_loss_mlp": 1.00072694, + "epoch": 0.2806553434540809, + "flos": 27417330305280.0, + "grad_norm": 3.1307253624061486, + "language_loss": 0.79295927, + "learning_rate": 3.376339495319373e-06, + "loss": 0.81194448, + "num_input_tokens_seen": 100798205, + "step": 4668, + "time_per_iteration": 5.80406928062439 + }, + { + "auxiliary_loss_clip": 0.01086886, + "auxiliary_loss_mlp": 0.01042603, + "balance_loss_clip": 1.04659402, + "balance_loss_mlp": 1.02432859, + "epoch": 0.28071546670674885, + "flos": 26505019745280.0, + "grad_norm": 1.6340052887006857, + "language_loss": 0.76323926, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.7845341, + "num_input_tokens_seen": 100819800, + "step": 4669, + "time_per_iteration": 2.909986734390259 + }, + { + "auxiliary_loss_clip": 0.01135126, + "auxiliary_loss_mlp": 0.01048727, + "balance_loss_clip": 1.05091906, + "balance_loss_mlp": 1.03104806, + "epoch": 0.2807755899594168, + "flos": 20558679863040.0, + "grad_norm": 2.509610012971093, + "language_loss": 0.79246378, + "learning_rate": 3.375774243322725e-06, + "loss": 0.81430233, + "num_input_tokens_seen": 100837880, + "step": 4670, + "time_per_iteration": 4.177394866943359 + }, + { + "auxiliary_loss_clip": 0.01106377, + "auxiliary_loss_mlp": 0.01050214, + "balance_loss_clip": 1.04797912, + "balance_loss_mlp": 1.03053236, + "epoch": 0.2808357132120848, + "flos": 24313319585280.0, + "grad_norm": 2.7368773080153455, + "language_loss": 0.79247916, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.81404507, + "num_input_tokens_seen": 100856350, + "step": 4671, + "time_per_iteration": 2.711390256881714 + }, + { + "auxiliary_loss_clip": 0.01127751, + "auxiliary_loss_mlp": 0.01045588, + "balance_loss_clip": 1.05121446, + "balance_loss_mlp": 1.02806473, + "epoch": 0.28089583646475275, + "flos": 26432408401920.0, + "grad_norm": 1.6750085767967255, + "language_loss": 0.74537772, + "learning_rate": 3.37520878264809e-06, + "loss": 0.76711112, + "num_input_tokens_seen": 100876135, + "step": 4672, + "time_per_iteration": 2.661121129989624 + }, + { + "auxiliary_loss_clip": 0.01124033, + "auxiliary_loss_mlp": 0.01050888, + "balance_loss_clip": 1.04696918, + "balance_loss_mlp": 1.03130245, + "epoch": 0.2809559597174207, + "flos": 23111820627840.0, + "grad_norm": 2.8450273884489805, + "language_loss": 0.75648308, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.77823234, + "num_input_tokens_seen": 100894790, + "step": 4673, + "time_per_iteration": 2.672701120376587 + }, + { + "auxiliary_loss_clip": 0.0113134, + "auxiliary_loss_mlp": 0.01042591, + "balance_loss_clip": 1.04937172, + "balance_loss_mlp": 1.02492452, + "epoch": 0.2810160829700887, + "flos": 20923496346240.0, + "grad_norm": 1.8533271967959946, + "language_loss": 0.72668427, + "learning_rate": 3.374643113381237e-06, + "loss": 0.74842358, + "num_input_tokens_seen": 100915100, + "step": 4674, + "time_per_iteration": 4.2516560554504395 + }, + { + "auxiliary_loss_clip": 0.01138771, + "auxiliary_loss_mlp": 0.01046386, + "balance_loss_clip": 1.05174136, + "balance_loss_mlp": 1.02751493, + "epoch": 0.28107620622275664, + "flos": 14355901808640.0, + "grad_norm": 2.0688845921593377, + "language_loss": 0.77195638, + "learning_rate": 3.374360200552541e-06, + "loss": 0.79380798, + "num_input_tokens_seen": 100932795, + "step": 4675, + "time_per_iteration": 2.618218183517456 + }, + { + "auxiliary_loss_clip": 0.01149881, + "auxiliary_loss_mlp": 0.01047998, + "balance_loss_clip": 1.05321908, + "balance_loss_mlp": 1.02948523, + "epoch": 0.2811363294754246, + "flos": 20919078973440.0, + "grad_norm": 1.9283078401930889, + "language_loss": 0.70211101, + "learning_rate": 3.374077235607968e-06, + "loss": 0.7240898, + "num_input_tokens_seen": 100950505, + "step": 4676, + "time_per_iteration": 2.59861159324646 + }, + { + "auxiliary_loss_clip": 0.01144319, + "auxiliary_loss_mlp": 0.01042342, + "balance_loss_clip": 1.05481541, + "balance_loss_mlp": 1.02517629, + "epoch": 0.28119645272809257, + "flos": 20594841880320.0, + "grad_norm": 1.6132814643409343, + "language_loss": 0.7048012, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.72666782, + "num_input_tokens_seen": 100968790, + "step": 4677, + "time_per_iteration": 2.6064453125 + }, + { + "auxiliary_loss_clip": 0.01125461, + "auxiliary_loss_mlp": 0.01047839, + "balance_loss_clip": 1.04849231, + "balance_loss_mlp": 1.02783537, + "epoch": 0.28125657598076054, + "flos": 25337420248320.0, + "grad_norm": 1.5663130673511025, + "language_loss": 0.639018, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.66075099, + "num_input_tokens_seen": 100990205, + "step": 4678, + "time_per_iteration": 2.6609809398651123 + }, + { + "auxiliary_loss_clip": 0.01134563, + "auxiliary_loss_mlp": 0.01050264, + "balance_loss_clip": 1.05104351, + "balance_loss_mlp": 1.03315794, + "epoch": 0.2813166992334285, + "flos": 24827093769600.0, + "grad_norm": 5.827919401990006, + "language_loss": 0.70568973, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.72753799, + "num_input_tokens_seen": 101009815, + "step": 4679, + "time_per_iteration": 2.7039310932159424 + }, + { + "auxiliary_loss_clip": 0.01134537, + "auxiliary_loss_mlp": 0.01040896, + "balance_loss_clip": 1.05048108, + "balance_loss_mlp": 1.02283621, + "epoch": 0.2813768224860965, + "flos": 21760753438080.0, + "grad_norm": 2.2073803144691255, + "language_loss": 0.74848735, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.77024174, + "num_input_tokens_seen": 101026780, + "step": 4680, + "time_per_iteration": 2.6897919178009033 + }, + { + "auxiliary_loss_clip": 0.01149427, + "auxiliary_loss_mlp": 0.01039945, + "balance_loss_clip": 1.05414999, + "balance_loss_mlp": 1.02363694, + "epoch": 0.2814369457387645, + "flos": 24316803204480.0, + "grad_norm": 2.2743778704427267, + "language_loss": 0.7719292, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.793823, + "num_input_tokens_seen": 101046215, + "step": 4681, + "time_per_iteration": 2.6178102493286133 + }, + { + "auxiliary_loss_clip": 0.01138594, + "auxiliary_loss_mlp": 0.01037179, + "balance_loss_clip": 1.05333447, + "balance_loss_mlp": 1.01864183, + "epoch": 0.28149706899143245, + "flos": 18515326872960.0, + "grad_norm": 2.5230258038951723, + "language_loss": 0.74197519, + "learning_rate": 3.372378352108146e-06, + "loss": 0.76373291, + "num_input_tokens_seen": 101063365, + "step": 4682, + "time_per_iteration": 2.5892751216888428 + }, + { + "auxiliary_loss_clip": 0.01145225, + "auxiliary_loss_mlp": 0.01043744, + "balance_loss_clip": 1.05250573, + "balance_loss_mlp": 1.02619636, + "epoch": 0.2815571922441004, + "flos": 24863255786880.0, + "grad_norm": 1.5493572746384299, + "language_loss": 0.81096184, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.83285153, + "num_input_tokens_seen": 101083835, + "step": 4683, + "time_per_iteration": 2.6272947788238525 + }, + { + "auxiliary_loss_clip": 0.01089095, + "auxiliary_loss_mlp": 0.01048071, + "balance_loss_clip": 1.04691851, + "balance_loss_mlp": 1.02916479, + "epoch": 0.2816173154967684, + "flos": 19901622326400.0, + "grad_norm": 1.5570192452178944, + "language_loss": 0.76437271, + "learning_rate": 3.371811641167852e-06, + "loss": 0.78574431, + "num_input_tokens_seen": 101101740, + "step": 4684, + "time_per_iteration": 2.7542243003845215 + }, + { + "auxiliary_loss_clip": 0.01090035, + "auxiliary_loss_mlp": 0.01043858, + "balance_loss_clip": 1.04495156, + "balance_loss_mlp": 1.02659678, + "epoch": 0.28167743874943635, + "flos": 17491333950720.0, + "grad_norm": 3.250404845672824, + "language_loss": 0.76287019, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.78420913, + "num_input_tokens_seen": 101120480, + "step": 4685, + "time_per_iteration": 2.724954843521118 + }, + { + "auxiliary_loss_clip": 0.01116834, + "auxiliary_loss_mlp": 0.01045285, + "balance_loss_clip": 1.05042076, + "balance_loss_mlp": 1.02820265, + "epoch": 0.2817375620021043, + "flos": 25302120157440.0, + "grad_norm": 1.80192319881426, + "language_loss": 0.75822544, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.77984667, + "num_input_tokens_seen": 101142910, + "step": 4686, + "time_per_iteration": 2.7375218868255615 + }, + { + "auxiliary_loss_clip": 0.01113965, + "auxiliary_loss_mlp": 0.01054481, + "balance_loss_clip": 1.04542971, + "balance_loss_mlp": 1.03530002, + "epoch": 0.2817976852547723, + "flos": 18693227957760.0, + "grad_norm": 5.9534421572259095, + "language_loss": 0.62298906, + "learning_rate": 3.370961184640025e-06, + "loss": 0.64467359, + "num_input_tokens_seen": 101160030, + "step": 4687, + "time_per_iteration": 2.7273154258728027 + }, + { + "auxiliary_loss_clip": 0.01125077, + "auxiliary_loss_mlp": 0.01052662, + "balance_loss_clip": 1.05122471, + "balance_loss_mlp": 1.03501928, + "epoch": 0.28185780850744024, + "flos": 22742263549440.0, + "grad_norm": 3.512847657951686, + "language_loss": 0.76642895, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.78820634, + "num_input_tokens_seen": 101177675, + "step": 4688, + "time_per_iteration": 2.6962485313415527 + }, + { + "auxiliary_loss_clip": 0.01111064, + "auxiliary_loss_mlp": 0.01038903, + "balance_loss_clip": 1.050143, + "balance_loss_mlp": 1.0222497, + "epoch": 0.2819179317601082, + "flos": 14933919467520.0, + "grad_norm": 2.029299855452059, + "language_loss": 0.78377295, + "learning_rate": 3.37039395366863e-06, + "loss": 0.80527258, + "num_input_tokens_seen": 101192225, + "step": 4689, + "time_per_iteration": 2.7611160278320312 + }, + { + "auxiliary_loss_clip": 0.01101002, + "auxiliary_loss_mlp": 0.01042004, + "balance_loss_clip": 1.044873, + "balance_loss_mlp": 1.02469492, + "epoch": 0.2819780550127762, + "flos": 23145325038720.0, + "grad_norm": 1.6619977361488503, + "language_loss": 0.78151089, + "learning_rate": 3.37011026022934e-06, + "loss": 0.80294096, + "num_input_tokens_seen": 101210870, + "step": 4690, + "time_per_iteration": 2.8166253566741943 + }, + { + "auxiliary_loss_clip": 0.01144307, + "auxiliary_loss_mlp": 0.0077562, + "balance_loss_clip": 1.04972041, + "balance_loss_mlp": 1.00065684, + "epoch": 0.28203817826544414, + "flos": 21616356764160.0, + "grad_norm": 1.8251699545436237, + "language_loss": 0.87835205, + "learning_rate": 3.369826514835332e-06, + "loss": 0.8975513, + "num_input_tokens_seen": 101229965, + "step": 4691, + "time_per_iteration": 2.755540609359741 + }, + { + "auxiliary_loss_clip": 0.01120177, + "auxiliary_loss_mlp": 0.01057161, + "balance_loss_clip": 1.0480932, + "balance_loss_mlp": 1.03866005, + "epoch": 0.2820983015181121, + "flos": 24026788794240.0, + "grad_norm": 2.0164591316320086, + "language_loss": 0.81783265, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.83960605, + "num_input_tokens_seen": 101250980, + "step": 4692, + "time_per_iteration": 2.766826868057251 + }, + { + "auxiliary_loss_clip": 0.01108273, + "auxiliary_loss_mlp": 0.01044592, + "balance_loss_clip": 1.05000174, + "balance_loss_mlp": 1.02690101, + "epoch": 0.2821584247707801, + "flos": 30007925976960.0, + "grad_norm": 1.5153062693168577, + "language_loss": 0.74520338, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.76673198, + "num_input_tokens_seen": 101273335, + "step": 4693, + "time_per_iteration": 2.833829402923584 + }, + { + "auxiliary_loss_clip": 0.01107692, + "auxiliary_loss_mlp": 0.01038565, + "balance_loss_clip": 1.04546356, + "balance_loss_mlp": 1.02018356, + "epoch": 0.2822185480234481, + "flos": 21396762967680.0, + "grad_norm": 1.6139880108231377, + "language_loss": 0.77396065, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.79542327, + "num_input_tokens_seen": 101292110, + "step": 4694, + "time_per_iteration": 2.6783409118652344 + }, + { + "auxiliary_loss_clip": 0.01131719, + "auxiliary_loss_mlp": 0.01043428, + "balance_loss_clip": 1.05066633, + "balance_loss_mlp": 1.02610695, + "epoch": 0.28227867127611606, + "flos": 27452809964160.0, + "grad_norm": 2.1245298140537354, + "language_loss": 0.67171001, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.69346148, + "num_input_tokens_seen": 101312815, + "step": 4695, + "time_per_iteration": 2.657508373260498 + }, + { + "auxiliary_loss_clip": 0.01129418, + "auxiliary_loss_mlp": 0.01047718, + "balance_loss_clip": 1.05160189, + "balance_loss_mlp": 1.02857292, + "epoch": 0.282338794528784, + "flos": 22593736811520.0, + "grad_norm": 2.1132011275006297, + "language_loss": 0.75410438, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.77587581, + "num_input_tokens_seen": 101329045, + "step": 4696, + "time_per_iteration": 2.6419622898101807 + }, + { + "auxiliary_loss_clip": 0.01108873, + "auxiliary_loss_mlp": 0.01050131, + "balance_loss_clip": 1.04857826, + "balance_loss_mlp": 1.03241634, + "epoch": 0.282398917781452, + "flos": 42010923386880.0, + "grad_norm": 1.6547739374499746, + "language_loss": 0.62379837, + "learning_rate": 3.368122952024877e-06, + "loss": 0.64538848, + "num_input_tokens_seen": 101352715, + "step": 4697, + "time_per_iteration": 2.863271951675415 + }, + { + "auxiliary_loss_clip": 0.01098306, + "auxiliary_loss_mlp": 0.01038026, + "balance_loss_clip": 1.04702902, + "balance_loss_mlp": 1.0213964, + "epoch": 0.28245904103411995, + "flos": 23224724052480.0, + "grad_norm": 1.3648463295211168, + "language_loss": 0.73178887, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.75315219, + "num_input_tokens_seen": 101374640, + "step": 4698, + "time_per_iteration": 2.7437515258789062 + }, + { + "auxiliary_loss_clip": 0.01138661, + "auxiliary_loss_mlp": 0.01044687, + "balance_loss_clip": 1.04783368, + "balance_loss_mlp": 1.02820039, + "epoch": 0.2825191642867879, + "flos": 25374623760000.0, + "grad_norm": 1.73143255072412, + "language_loss": 0.75260699, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.77444041, + "num_input_tokens_seen": 101393595, + "step": 4699, + "time_per_iteration": 2.6352651119232178 + }, + { + "auxiliary_loss_clip": 0.01130406, + "auxiliary_loss_mlp": 0.01042781, + "balance_loss_clip": 1.04642487, + "balance_loss_mlp": 1.02379072, + "epoch": 0.2825792875394559, + "flos": 17236799199360.0, + "grad_norm": 2.939003683920128, + "language_loss": 0.80683541, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.82856727, + "num_input_tokens_seen": 101409265, + "step": 4700, + "time_per_iteration": 2.597543478012085 + }, + { + "auxiliary_loss_clip": 0.01118395, + "auxiliary_loss_mlp": 0.01052226, + "balance_loss_clip": 1.05168593, + "balance_loss_mlp": 1.03699148, + "epoch": 0.28263941079212385, + "flos": 26723967096960.0, + "grad_norm": 1.8973185440197946, + "language_loss": 0.82377315, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.84547931, + "num_input_tokens_seen": 101428365, + "step": 4701, + "time_per_iteration": 2.6613359451293945 + }, + { + "auxiliary_loss_clip": 0.01079732, + "auxiliary_loss_mlp": 0.01044955, + "balance_loss_clip": 1.04725862, + "balance_loss_mlp": 1.02782488, + "epoch": 0.2826995340447918, + "flos": 25921327737600.0, + "grad_norm": 2.6106451650427913, + "language_loss": 0.72911763, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.75036454, + "num_input_tokens_seen": 101447280, + "step": 4702, + "time_per_iteration": 2.927156448364258 + }, + { + "auxiliary_loss_clip": 0.0114189, + "auxiliary_loss_mlp": 0.01039287, + "balance_loss_clip": 1.05118549, + "balance_loss_mlp": 1.02240694, + "epoch": 0.2827596572974598, + "flos": 22379709623040.0, + "grad_norm": 2.1110096252533754, + "language_loss": 0.78497601, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.80678773, + "num_input_tokens_seen": 101465435, + "step": 4703, + "time_per_iteration": 2.603217124938965 + }, + { + "auxiliary_loss_clip": 0.01115372, + "auxiliary_loss_mlp": 0.01049407, + "balance_loss_clip": 1.04668045, + "balance_loss_mlp": 1.03100109, + "epoch": 0.28281978055012774, + "flos": 33547137880320.0, + "grad_norm": 1.6207045759516274, + "language_loss": 0.69310379, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.71475154, + "num_input_tokens_seen": 101486355, + "step": 4704, + "time_per_iteration": 2.737741708755493 + }, + { + "auxiliary_loss_clip": 0.0110991, + "auxiliary_loss_mlp": 0.0104005, + "balance_loss_clip": 1.05106401, + "balance_loss_mlp": 1.02204967, + "epoch": 0.2828799038027957, + "flos": 23440870143360.0, + "grad_norm": 2.0629797483939893, + "language_loss": 0.70487976, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.72637939, + "num_input_tokens_seen": 101505875, + "step": 4705, + "time_per_iteration": 2.7810943126678467 + }, + { + "auxiliary_loss_clip": 0.01051193, + "auxiliary_loss_mlp": 0.01011527, + "balance_loss_clip": 1.02885246, + "balance_loss_mlp": 1.00905895, + "epoch": 0.2829400270554637, + "flos": 69873690251520.0, + "grad_norm": 0.7331461257989402, + "language_loss": 0.59262896, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.6132561, + "num_input_tokens_seen": 101565045, + "step": 4706, + "time_per_iteration": 3.223500967025757 + }, + { + "auxiliary_loss_clip": 0.01117208, + "auxiliary_loss_mlp": 0.01042955, + "balance_loss_clip": 1.04750693, + "balance_loss_mlp": 1.02711248, + "epoch": 0.2830001503081317, + "flos": 24789028331520.0, + "grad_norm": 1.4542369915695899, + "language_loss": 0.82314008, + "learning_rate": 3.365279531475407e-06, + "loss": 0.84474176, + "num_input_tokens_seen": 101585825, + "step": 4707, + "time_per_iteration": 5.995711326599121 + }, + { + "auxiliary_loss_clip": 0.0112325, + "auxiliary_loss_mlp": 0.01043198, + "balance_loss_clip": 1.04714823, + "balance_loss_mlp": 1.02451742, + "epoch": 0.28306027356079966, + "flos": 27669387018240.0, + "grad_norm": 1.6937335335925583, + "language_loss": 0.80196846, + "learning_rate": 3.36499490449902e-06, + "loss": 0.82363296, + "num_input_tokens_seen": 101606105, + "step": 4708, + "time_per_iteration": 2.730365753173828 + }, + { + "auxiliary_loss_clip": 0.01036827, + "auxiliary_loss_mlp": 0.01004906, + "balance_loss_clip": 1.0241586, + "balance_loss_mlp": 1.00274837, + "epoch": 0.2831203968134676, + "flos": 60527938199040.0, + "grad_norm": 0.8797441515413378, + "language_loss": 0.62768304, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.64810038, + "num_input_tokens_seen": 101656875, + "step": 4709, + "time_per_iteration": 3.0734164714813232 + }, + { + "auxiliary_loss_clip": 0.01113275, + "auxiliary_loss_mlp": 0.01045412, + "balance_loss_clip": 1.04819441, + "balance_loss_mlp": 1.02711344, + "epoch": 0.2831805200661356, + "flos": 22054790171520.0, + "grad_norm": 1.4416556980461737, + "language_loss": 0.74092108, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.76250798, + "num_input_tokens_seen": 101676225, + "step": 4710, + "time_per_iteration": 4.214928388595581 + }, + { + "auxiliary_loss_clip": 0.01108833, + "auxiliary_loss_mlp": 0.01058426, + "balance_loss_clip": 1.04568553, + "balance_loss_mlp": 1.0393765, + "epoch": 0.28324064331880355, + "flos": 22600668136320.0, + "grad_norm": 2.192994300890924, + "language_loss": 0.7857554, + "learning_rate": 3.364140713048579e-06, + "loss": 0.80742794, + "num_input_tokens_seen": 101693710, + "step": 4711, + "time_per_iteration": 2.9334824085235596 + }, + { + "auxiliary_loss_clip": 0.01135754, + "auxiliary_loss_mlp": 0.00775746, + "balance_loss_clip": 1.05244637, + "balance_loss_mlp": 1.00072622, + "epoch": 0.2833007665714715, + "flos": 30404127968640.0, + "grad_norm": 2.328121287113732, + "language_loss": 0.70832199, + "learning_rate": 3.363855879093996e-06, + "loss": 0.72743702, + "num_input_tokens_seen": 101714010, + "step": 4712, + "time_per_iteration": 2.8570704460144043 + }, + { + "auxiliary_loss_clip": 0.0114641, + "auxiliary_loss_mlp": 0.01050688, + "balance_loss_clip": 1.05171633, + "balance_loss_mlp": 1.03284216, + "epoch": 0.2833608898241395, + "flos": 23549499849600.0, + "grad_norm": 2.3843934106626157, + "language_loss": 0.81725228, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.83922327, + "num_input_tokens_seen": 101732995, + "step": 4713, + "time_per_iteration": 4.343034029006958 + }, + { + "auxiliary_loss_clip": 0.01120505, + "auxiliary_loss_mlp": 0.01048075, + "balance_loss_clip": 1.05054498, + "balance_loss_mlp": 1.03044379, + "epoch": 0.28342101307680745, + "flos": 20266726118400.0, + "grad_norm": 1.7964609324305687, + "language_loss": 0.75316995, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.77485573, + "num_input_tokens_seen": 101751385, + "step": 4714, + "time_per_iteration": 2.656919479370117 + }, + { + "auxiliary_loss_clip": 0.01129168, + "auxiliary_loss_mlp": 0.01051102, + "balance_loss_clip": 1.050372, + "balance_loss_mlp": 1.03424633, + "epoch": 0.2834811363294754, + "flos": 30847050576000.0, + "grad_norm": 1.4082553086863412, + "language_loss": 0.78457153, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.80637431, + "num_input_tokens_seen": 101773825, + "step": 4715, + "time_per_iteration": 2.721869468688965 + }, + { + "auxiliary_loss_clip": 0.01117334, + "auxiliary_loss_mlp": 0.01046437, + "balance_loss_clip": 1.04618871, + "balance_loss_mlp": 1.0294199, + "epoch": 0.2835412595821434, + "flos": 22711021695360.0, + "grad_norm": 1.791082386208426, + "language_loss": 0.73825723, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.75989497, + "num_input_tokens_seen": 101791920, + "step": 4716, + "time_per_iteration": 2.689964532852173 + }, + { + "auxiliary_loss_clip": 0.0111778, + "auxiliary_loss_mlp": 0.01054857, + "balance_loss_clip": 1.04580188, + "balance_loss_mlp": 1.03397131, + "epoch": 0.28360138283481134, + "flos": 18077719478400.0, + "grad_norm": 2.1425450832247868, + "language_loss": 0.74293232, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.76465869, + "num_input_tokens_seen": 101809515, + "step": 4717, + "time_per_iteration": 2.653107166290283 + }, + { + "auxiliary_loss_clip": 0.01112398, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_clip": 1.04736984, + "balance_loss_mlp": 1.03526437, + "epoch": 0.2836615060874793, + "flos": 17854785717120.0, + "grad_norm": 1.96982951308544, + "language_loss": 0.67022157, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.69186902, + "num_input_tokens_seen": 101827735, + "step": 4718, + "time_per_iteration": 2.7287323474884033 + }, + { + "auxiliary_loss_clip": 0.01119996, + "auxiliary_loss_mlp": 0.01052629, + "balance_loss_clip": 1.04606366, + "balance_loss_mlp": 1.03479528, + "epoch": 0.2837216293401473, + "flos": 25740302169600.0, + "grad_norm": 1.7409435577223806, + "language_loss": 0.72453725, + "learning_rate": 3.361860593925566e-06, + "loss": 0.7462635, + "num_input_tokens_seen": 101845970, + "step": 4719, + "time_per_iteration": 2.7101874351501465 + }, + { + "auxiliary_loss_clip": 0.01129472, + "auxiliary_loss_mlp": 0.01044, + "balance_loss_clip": 1.04724336, + "balance_loss_mlp": 1.02711964, + "epoch": 0.2837817525928153, + "flos": 20923532259840.0, + "grad_norm": 1.8163652523997504, + "language_loss": 0.80517805, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.82691276, + "num_input_tokens_seen": 101865040, + "step": 4720, + "time_per_iteration": 2.630380392074585 + }, + { + "auxiliary_loss_clip": 0.01130938, + "auxiliary_loss_mlp": 0.01047274, + "balance_loss_clip": 1.04798317, + "balance_loss_mlp": 1.02935672, + "epoch": 0.28384187584548326, + "flos": 18916700423040.0, + "grad_norm": 2.340232614040239, + "language_loss": 0.79146183, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.81324387, + "num_input_tokens_seen": 101883735, + "step": 4721, + "time_per_iteration": 2.6779117584228516 + }, + { + "auxiliary_loss_clip": 0.01091324, + "auxiliary_loss_mlp": 0.00778191, + "balance_loss_clip": 1.04653215, + "balance_loss_mlp": 1.00074911, + "epoch": 0.2839019990981512, + "flos": 27343964776320.0, + "grad_norm": 1.7859505861297744, + "language_loss": 0.82514244, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.84383762, + "num_input_tokens_seen": 101903025, + "step": 4722, + "time_per_iteration": 2.8601412773132324 + }, + { + "auxiliary_loss_clip": 0.0114735, + "auxiliary_loss_mlp": 0.0104339, + "balance_loss_clip": 1.05396807, + "balance_loss_mlp": 1.02641416, + "epoch": 0.2839621223508192, + "flos": 18114312458880.0, + "grad_norm": 1.8976073667217488, + "language_loss": 0.70048773, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.72239512, + "num_input_tokens_seen": 101922255, + "step": 4723, + "time_per_iteration": 2.6259007453918457 + }, + { + "auxiliary_loss_clip": 0.0111455, + "auxiliary_loss_mlp": 0.01051142, + "balance_loss_clip": 1.04818106, + "balance_loss_mlp": 1.03247368, + "epoch": 0.28402224560348716, + "flos": 26358360514560.0, + "grad_norm": 1.540245146059843, + "language_loss": 0.78676599, + "learning_rate": 3.360433840760998e-06, + "loss": 0.80842292, + "num_input_tokens_seen": 101943100, + "step": 4724, + "time_per_iteration": 2.7364859580993652 + }, + { + "auxiliary_loss_clip": 0.01116323, + "auxiliary_loss_mlp": 0.01063488, + "balance_loss_clip": 1.04846072, + "balance_loss_mlp": 1.04442668, + "epoch": 0.2840823688561551, + "flos": 24060795995520.0, + "grad_norm": 1.6728910575536384, + "language_loss": 0.92433345, + "learning_rate": 3.36014833532143e-06, + "loss": 0.94613159, + "num_input_tokens_seen": 101963160, + "step": 4725, + "time_per_iteration": 2.653244733810425 + }, + { + "auxiliary_loss_clip": 0.01137335, + "auxiliary_loss_mlp": 0.01047317, + "balance_loss_clip": 1.05249703, + "balance_loss_mlp": 1.02951932, + "epoch": 0.2841424921088231, + "flos": 29459821368960.0, + "grad_norm": 1.5774329387244128, + "language_loss": 0.88881439, + "learning_rate": 3.3598627783049e-06, + "loss": 0.91066098, + "num_input_tokens_seen": 101984300, + "step": 4726, + "time_per_iteration": 2.6815872192382812 + }, + { + "auxiliary_loss_clip": 0.01132666, + "auxiliary_loss_mlp": 0.01049768, + "balance_loss_clip": 1.05290008, + "balance_loss_mlp": 1.03223181, + "epoch": 0.28420261536149105, + "flos": 48100367053440.0, + "grad_norm": 2.008368257744288, + "language_loss": 0.78913373, + "learning_rate": 3.359577169722238e-06, + "loss": 0.81095803, + "num_input_tokens_seen": 102005765, + "step": 4727, + "time_per_iteration": 2.8668875694274902 + }, + { + "auxiliary_loss_clip": 0.01134036, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.05225933, + "balance_loss_mlp": 1.02603006, + "epoch": 0.284262738614159, + "flos": 25666146541440.0, + "grad_norm": 2.1196929739552433, + "language_loss": 0.66590458, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.68766308, + "num_input_tokens_seen": 102022755, + "step": 4728, + "time_per_iteration": 2.6871252059936523 + }, + { + "auxiliary_loss_clip": 0.01111522, + "auxiliary_loss_mlp": 0.01054966, + "balance_loss_clip": 1.04948676, + "balance_loss_mlp": 1.03766847, + "epoch": 0.284322861866827, + "flos": 19718980646400.0, + "grad_norm": 1.7247901443745783, + "language_loss": 0.76369143, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.78535628, + "num_input_tokens_seen": 102041850, + "step": 4729, + "time_per_iteration": 2.671739339828491 + }, + { + "auxiliary_loss_clip": 0.01121198, + "auxiliary_loss_mlp": 0.01054506, + "balance_loss_clip": 1.05166233, + "balance_loss_mlp": 1.03707767, + "epoch": 0.28438298511949495, + "flos": 23915250086400.0, + "grad_norm": 1.8284571123244682, + "language_loss": 0.67062581, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.69238287, + "num_input_tokens_seen": 102059500, + "step": 4730, + "time_per_iteration": 2.6957883834838867 + }, + { + "auxiliary_loss_clip": 0.01120949, + "auxiliary_loss_mlp": 0.01040777, + "balance_loss_clip": 1.05008078, + "balance_loss_mlp": 1.02283621, + "epoch": 0.2844431083721629, + "flos": 26067340523520.0, + "grad_norm": 1.8142087038783352, + "language_loss": 0.7456513, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.76726854, + "num_input_tokens_seen": 102080460, + "step": 4731, + "time_per_iteration": 2.7621212005615234 + }, + { + "auxiliary_loss_clip": 0.01100065, + "auxiliary_loss_mlp": 0.0104061, + "balance_loss_clip": 1.04959893, + "balance_loss_mlp": 1.02338386, + "epoch": 0.2845032316248309, + "flos": 25810435474560.0, + "grad_norm": 1.4533231430590194, + "language_loss": 0.83672202, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.85812879, + "num_input_tokens_seen": 102100950, + "step": 4732, + "time_per_iteration": 2.807701587677002 + }, + { + "auxiliary_loss_clip": 0.01135958, + "auxiliary_loss_mlp": 0.01049006, + "balance_loss_clip": 1.05248308, + "balance_loss_mlp": 1.03040957, + "epoch": 0.2845633548774989, + "flos": 19823192979840.0, + "grad_norm": 2.88493918484894, + "language_loss": 0.78892827, + "learning_rate": 3.357862435944109e-06, + "loss": 0.8107779, + "num_input_tokens_seen": 102119345, + "step": 4733, + "time_per_iteration": 2.66524076461792 + }, + { + "auxiliary_loss_clip": 0.01153472, + "auxiliary_loss_mlp": 0.01047702, + "balance_loss_clip": 1.05533004, + "balance_loss_mlp": 1.02984452, + "epoch": 0.28462347813016686, + "flos": 23182815859200.0, + "grad_norm": 2.2364375024988776, + "language_loss": 0.71791029, + "learning_rate": 3.357576466701875e-06, + "loss": 0.73992205, + "num_input_tokens_seen": 102139050, + "step": 4734, + "time_per_iteration": 2.6941637992858887 + }, + { + "auxiliary_loss_clip": 0.01125779, + "auxiliary_loss_mlp": 0.01035132, + "balance_loss_clip": 1.05455363, + "balance_loss_mlp": 1.01766825, + "epoch": 0.2846836013828348, + "flos": 18660477732480.0, + "grad_norm": 1.8491255089189595, + "language_loss": 0.73942113, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.76103032, + "num_input_tokens_seen": 102157935, + "step": 4735, + "time_per_iteration": 2.736027956008911 + }, + { + "auxiliary_loss_clip": 0.01124029, + "auxiliary_loss_mlp": 0.01048016, + "balance_loss_clip": 1.05248201, + "balance_loss_mlp": 1.03177929, + "epoch": 0.2847437246355028, + "flos": 14173511523840.0, + "grad_norm": 1.7217440703764713, + "language_loss": 0.79690897, + "learning_rate": 3.357004373789946e-06, + "loss": 0.81862932, + "num_input_tokens_seen": 102175325, + "step": 4736, + "time_per_iteration": 2.7069075107574463 + }, + { + "auxiliary_loss_clip": 0.01152237, + "auxiliary_loss_mlp": 0.01048515, + "balance_loss_clip": 1.0569663, + "balance_loss_mlp": 1.03019249, + "epoch": 0.28480384788817076, + "flos": 29278364837760.0, + "grad_norm": 2.5331890881723327, + "language_loss": 0.59956342, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.62157094, + "num_input_tokens_seen": 102196625, + "step": 4737, + "time_per_iteration": 2.718904972076416 + }, + { + "auxiliary_loss_clip": 0.01131951, + "auxiliary_loss_mlp": 0.0104121, + "balance_loss_clip": 1.05099404, + "balance_loss_mlp": 1.02437758, + "epoch": 0.2848639711408387, + "flos": 22601314581120.0, + "grad_norm": 1.8696274848062555, + "language_loss": 0.86556888, + "learning_rate": 3.356432075047052e-06, + "loss": 0.88730049, + "num_input_tokens_seen": 102214975, + "step": 4738, + "time_per_iteration": 2.719223976135254 + }, + { + "auxiliary_loss_clip": 0.01127313, + "auxiliary_loss_mlp": 0.01051123, + "balance_loss_clip": 1.05986989, + "balance_loss_mlp": 1.03207278, + "epoch": 0.2849240943935067, + "flos": 17599460866560.0, + "grad_norm": 2.688438536338364, + "language_loss": 0.90028232, + "learning_rate": 3.356145848516118e-06, + "loss": 0.92206669, + "num_input_tokens_seen": 102231885, + "step": 4739, + "time_per_iteration": 2.674363851547241 + }, + { + "auxiliary_loss_clip": 0.01136036, + "auxiliary_loss_mlp": 0.01044124, + "balance_loss_clip": 1.05522013, + "balance_loss_mlp": 1.02627802, + "epoch": 0.28498421764617465, + "flos": 24862573428480.0, + "grad_norm": 1.41783833400805, + "language_loss": 0.7216897, + "learning_rate": 3.355859570559998e-06, + "loss": 0.74349129, + "num_input_tokens_seen": 102252725, + "step": 4740, + "time_per_iteration": 2.688591957092285 + }, + { + "auxiliary_loss_clip": 0.01130927, + "auxiliary_loss_mlp": 0.010392, + "balance_loss_clip": 1.05868936, + "balance_loss_mlp": 1.02229571, + "epoch": 0.2850443408988426, + "flos": 22782555630720.0, + "grad_norm": 3.325446081949271, + "language_loss": 0.77782756, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.79952878, + "num_input_tokens_seen": 102271730, + "step": 4741, + "time_per_iteration": 2.6747119426727295 + }, + { + "auxiliary_loss_clip": 0.01107503, + "auxiliary_loss_mlp": 0.01048819, + "balance_loss_clip": 1.04771924, + "balance_loss_mlp": 1.03065109, + "epoch": 0.2851044641515106, + "flos": 18844053166080.0, + "grad_norm": 1.6557809034578879, + "language_loss": 0.75952959, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.78109288, + "num_input_tokens_seen": 102291325, + "step": 4742, + "time_per_iteration": 2.7584095001220703 + }, + { + "auxiliary_loss_clip": 0.01151989, + "auxiliary_loss_mlp": 0.01057399, + "balance_loss_clip": 1.05341601, + "balance_loss_mlp": 1.03720486, + "epoch": 0.28516458740417855, + "flos": 18880502492160.0, + "grad_norm": 2.0538587827096713, + "language_loss": 0.57376975, + "learning_rate": 3.355000428249086e-06, + "loss": 0.59586358, + "num_input_tokens_seen": 102309000, + "step": 4743, + "time_per_iteration": 2.621572494506836 + }, + { + "auxiliary_loss_clip": 0.01116239, + "auxiliary_loss_mlp": 0.01056356, + "balance_loss_clip": 1.05067348, + "balance_loss_mlp": 1.03747356, + "epoch": 0.2852247106568465, + "flos": 25299821687040.0, + "grad_norm": 1.6259491452975234, + "language_loss": 0.74499846, + "learning_rate": 3.354713944700797e-06, + "loss": 0.76672441, + "num_input_tokens_seen": 102329240, + "step": 4744, + "time_per_iteration": 2.8029959201812744 + }, + { + "auxiliary_loss_clip": 0.01132324, + "auxiliary_loss_mlp": 0.01047205, + "balance_loss_clip": 1.05420351, + "balance_loss_mlp": 1.03014612, + "epoch": 0.2852848339095145, + "flos": 11655383541120.0, + "grad_norm": 2.4725597828733563, + "language_loss": 0.77258176, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.79437709, + "num_input_tokens_seen": 102344440, + "step": 4745, + "time_per_iteration": 2.5961194038391113 + }, + { + "auxiliary_loss_clip": 0.01124474, + "auxiliary_loss_mlp": 0.01040571, + "balance_loss_clip": 1.05262041, + "balance_loss_mlp": 1.02427554, + "epoch": 0.2853449571621825, + "flos": 12933228856320.0, + "grad_norm": 1.9164884333366974, + "language_loss": 0.8275286, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.84917903, + "num_input_tokens_seen": 102360985, + "step": 4746, + "time_per_iteration": 4.211855411529541 + }, + { + "auxiliary_loss_clip": 0.01101779, + "auxiliary_loss_mlp": 0.01043428, + "balance_loss_clip": 1.0488627, + "balance_loss_mlp": 1.02497482, + "epoch": 0.28540508041485046, + "flos": 20010575255040.0, + "grad_norm": 1.8281951571940926, + "language_loss": 0.79537141, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.81682348, + "num_input_tokens_seen": 102380320, + "step": 4747, + "time_per_iteration": 4.276613712310791 + }, + { + "auxiliary_loss_clip": 0.01046154, + "auxiliary_loss_mlp": 0.01017989, + "balance_loss_clip": 1.02844512, + "balance_loss_mlp": 1.01572371, + "epoch": 0.28546520366751843, + "flos": 68139349966080.0, + "grad_norm": 0.7754147669680839, + "language_loss": 0.6049211, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.62556255, + "num_input_tokens_seen": 102439140, + "step": 4748, + "time_per_iteration": 3.0963478088378906 + }, + { + "auxiliary_loss_clip": 0.01148062, + "auxiliary_loss_mlp": 0.01048043, + "balance_loss_clip": 1.05367923, + "balance_loss_mlp": 1.03001821, + "epoch": 0.2855253269201864, + "flos": 13251540205440.0, + "grad_norm": 2.39914017508816, + "language_loss": 0.8061412, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.82810223, + "num_input_tokens_seen": 102450990, + "step": 4749, + "time_per_iteration": 4.199607610702515 + }, + { + "auxiliary_loss_clip": 0.01135936, + "auxiliary_loss_mlp": 0.01045252, + "balance_loss_clip": 1.05160487, + "balance_loss_mlp": 1.02788317, + "epoch": 0.28558545017285436, + "flos": 28620876337920.0, + "grad_norm": 1.92101956988616, + "language_loss": 0.70763719, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.72944903, + "num_input_tokens_seen": 102471820, + "step": 4750, + "time_per_iteration": 2.6975722312927246 + }, + { + "auxiliary_loss_clip": 0.01132057, + "auxiliary_loss_mlp": 0.0104367, + "balance_loss_clip": 1.05308008, + "balance_loss_mlp": 1.02660573, + "epoch": 0.2856455734255223, + "flos": 34130470752000.0, + "grad_norm": 1.619747991653998, + "language_loss": 0.81983078, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.84158808, + "num_input_tokens_seen": 102492625, + "step": 4751, + "time_per_iteration": 2.685194969177246 + }, + { + "auxiliary_loss_clip": 0.01146027, + "auxiliary_loss_mlp": 0.01046872, + "balance_loss_clip": 1.0541997, + "balance_loss_mlp": 1.03009951, + "epoch": 0.2857056966781903, + "flos": 39786149779200.0, + "grad_norm": 2.1857777553010203, + "language_loss": 0.80359828, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.82552731, + "num_input_tokens_seen": 102514145, + "step": 4752, + "time_per_iteration": 4.363154649734497 + }, + { + "auxiliary_loss_clip": 0.01130862, + "auxiliary_loss_mlp": 0.010456, + "balance_loss_clip": 1.04920304, + "balance_loss_mlp": 1.02675319, + "epoch": 0.28576581993085826, + "flos": 21872292145920.0, + "grad_norm": 2.612706759191024, + "language_loss": 0.78674287, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.8085075, + "num_input_tokens_seen": 102532365, + "step": 4753, + "time_per_iteration": 2.6128499507904053 + }, + { + "auxiliary_loss_clip": 0.0114991, + "auxiliary_loss_mlp": 0.01051658, + "balance_loss_clip": 1.05356765, + "balance_loss_mlp": 1.03166628, + "epoch": 0.2858259431835262, + "flos": 19091656592640.0, + "grad_norm": 3.5161743537336596, + "language_loss": 0.8947711, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.91678679, + "num_input_tokens_seen": 102548425, + "step": 4754, + "time_per_iteration": 2.5410687923431396 + }, + { + "auxiliary_loss_clip": 0.01130155, + "auxiliary_loss_mlp": 0.010468, + "balance_loss_clip": 1.05048347, + "balance_loss_mlp": 1.03026593, + "epoch": 0.2858860664361942, + "flos": 20334309557760.0, + "grad_norm": 2.3617926288322724, + "language_loss": 0.82039523, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.84216481, + "num_input_tokens_seen": 102566370, + "step": 4755, + "time_per_iteration": 2.6514527797698975 + }, + { + "auxiliary_loss_clip": 0.01098878, + "auxiliary_loss_mlp": 0.01049575, + "balance_loss_clip": 1.04732597, + "balance_loss_mlp": 1.03233767, + "epoch": 0.28594618968886215, + "flos": 24461738582400.0, + "grad_norm": 1.6385978416895255, + "language_loss": 0.83764589, + "learning_rate": 3.351272138300922e-06, + "loss": 0.8591305, + "num_input_tokens_seen": 102588715, + "step": 4756, + "time_per_iteration": 2.7975916862487793 + }, + { + "auxiliary_loss_clip": 0.01023363, + "auxiliary_loss_mlp": 0.01007772, + "balance_loss_clip": 1.01913142, + "balance_loss_mlp": 1.00524473, + "epoch": 0.2860063129415301, + "flos": 71652850709760.0, + "grad_norm": 0.8721113874523594, + "language_loss": 0.6097033, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63001466, + "num_input_tokens_seen": 102656715, + "step": 4757, + "time_per_iteration": 3.406625986099243 + }, + { + "auxiliary_loss_clip": 0.01147819, + "auxiliary_loss_mlp": 0.01038916, + "balance_loss_clip": 1.05585599, + "balance_loss_mlp": 1.021595, + "epoch": 0.2860664361941981, + "flos": 20558679863040.0, + "grad_norm": 2.030913944398288, + "language_loss": 0.66206789, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.68393528, + "num_input_tokens_seen": 102676545, + "step": 4758, + "time_per_iteration": 2.589768648147583 + }, + { + "auxiliary_loss_clip": 0.01133475, + "auxiliary_loss_mlp": 0.01042694, + "balance_loss_clip": 1.04988813, + "balance_loss_mlp": 1.02581418, + "epoch": 0.2861265594468661, + "flos": 35996389534080.0, + "grad_norm": 2.019963236438103, + "language_loss": 0.63374877, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.65551043, + "num_input_tokens_seen": 102702875, + "step": 4759, + "time_per_iteration": 2.809325695037842 + }, + { + "auxiliary_loss_clip": 0.01129183, + "auxiliary_loss_mlp": 0.00777076, + "balance_loss_clip": 1.04924989, + "balance_loss_mlp": 1.00088644, + "epoch": 0.28618668269953407, + "flos": 20047419630720.0, + "grad_norm": 1.9693348774443893, + "language_loss": 0.74033993, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.75940251, + "num_input_tokens_seen": 102723160, + "step": 4760, + "time_per_iteration": 2.6797397136688232 + }, + { + "auxiliary_loss_clip": 0.01124387, + "auxiliary_loss_mlp": 0.01045022, + "balance_loss_clip": 1.05517232, + "balance_loss_mlp": 1.02849925, + "epoch": 0.28624680595220203, + "flos": 24971849579520.0, + "grad_norm": 2.574168946313644, + "language_loss": 0.72227889, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.74397296, + "num_input_tokens_seen": 102743855, + "step": 4761, + "time_per_iteration": 2.672394275665283 + }, + { + "auxiliary_loss_clip": 0.01079005, + "auxiliary_loss_mlp": 0.01049385, + "balance_loss_clip": 1.04688287, + "balance_loss_mlp": 1.03218305, + "epoch": 0.28630692920487, + "flos": 22492253911680.0, + "grad_norm": 2.095293128310336, + "language_loss": 0.74758703, + "learning_rate": 3.349548466945793e-06, + "loss": 0.76887095, + "num_input_tokens_seen": 102761370, + "step": 4762, + "time_per_iteration": 2.8573946952819824 + }, + { + "auxiliary_loss_clip": 0.01108257, + "auxiliary_loss_mlp": 0.01044255, + "balance_loss_clip": 1.05117726, + "balance_loss_mlp": 1.02725577, + "epoch": 0.28636705245753796, + "flos": 21249888255360.0, + "grad_norm": 1.4714690500952254, + "language_loss": 0.76185489, + "learning_rate": 3.349261009210496e-06, + "loss": 0.78338003, + "num_input_tokens_seen": 102780885, + "step": 4763, + "time_per_iteration": 2.7058494091033936 + }, + { + "auxiliary_loss_clip": 0.01103052, + "auxiliary_loss_mlp": 0.01041715, + "balance_loss_clip": 1.0442332, + "balance_loss_mlp": 1.0234046, + "epoch": 0.28642717571020593, + "flos": 24095772864000.0, + "grad_norm": 2.250941696220621, + "language_loss": 0.77264833, + "learning_rate": 3.348973500311086e-06, + "loss": 0.79409599, + "num_input_tokens_seen": 102801000, + "step": 4764, + "time_per_iteration": 2.7363107204437256 + }, + { + "auxiliary_loss_clip": 0.0111141, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_clip": 1.04883742, + "balance_loss_mlp": 1.02520347, + "epoch": 0.2864872989628739, + "flos": 22601386408320.0, + "grad_norm": 3.808468667851145, + "language_loss": 0.71222258, + "learning_rate": 3.348685940258466e-06, + "loss": 0.73377991, + "num_input_tokens_seen": 102820230, + "step": 4765, + "time_per_iteration": 2.7225682735443115 + }, + { + "auxiliary_loss_clip": 0.01127531, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.0501802, + "balance_loss_mlp": 1.02118707, + "epoch": 0.28654742221554186, + "flos": 32745073138560.0, + "grad_norm": 1.6284115173108313, + "language_loss": 0.76206756, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.78371924, + "num_input_tokens_seen": 102842670, + "step": 4766, + "time_per_iteration": 2.724776268005371 + }, + { + "auxiliary_loss_clip": 0.01130255, + "auxiliary_loss_mlp": 0.01038205, + "balance_loss_clip": 1.0502758, + "balance_loss_mlp": 1.02133691, + "epoch": 0.2866075454682098, + "flos": 26981626331520.0, + "grad_norm": 1.7313176116986193, + "language_loss": 0.77457404, + "learning_rate": 3.348110666737214e-06, + "loss": 0.79625863, + "num_input_tokens_seen": 102864480, + "step": 4767, + "time_per_iteration": 2.7313742637634277 + }, + { + "auxiliary_loss_clip": 0.0114162, + "auxiliary_loss_mlp": 0.01042697, + "balance_loss_clip": 1.05109096, + "balance_loss_mlp": 1.02519727, + "epoch": 0.2866676687208778, + "flos": 23253847004160.0, + "grad_norm": 1.7818476838857593, + "language_loss": 0.65043855, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.67228168, + "num_input_tokens_seen": 102883740, + "step": 4768, + "time_per_iteration": 2.6173784732818604 + }, + { + "auxiliary_loss_clip": 0.01123197, + "auxiliary_loss_mlp": 0.01041331, + "balance_loss_clip": 1.04803848, + "balance_loss_mlp": 1.02385533, + "epoch": 0.28672779197354575, + "flos": 21579727870080.0, + "grad_norm": 1.5842392137882455, + "language_loss": 0.70497799, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.7266233, + "num_input_tokens_seen": 102902945, + "step": 4769, + "time_per_iteration": 2.627859115600586 + }, + { + "auxiliary_loss_clip": 0.01078118, + "auxiliary_loss_mlp": 0.01033792, + "balance_loss_clip": 1.04276228, + "balance_loss_mlp": 1.01722169, + "epoch": 0.2867879152262137, + "flos": 19865568049920.0, + "grad_norm": 1.555057890983365, + "language_loss": 0.74735439, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.76847351, + "num_input_tokens_seen": 102922405, + "step": 4770, + "time_per_iteration": 2.807286262512207 + }, + { + "auxiliary_loss_clip": 0.01094623, + "auxiliary_loss_mlp": 0.01041164, + "balance_loss_clip": 1.04522562, + "balance_loss_mlp": 1.02336657, + "epoch": 0.2868480384788817, + "flos": 28213325648640.0, + "grad_norm": 2.2768786529491427, + "language_loss": 0.6760053, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.6973632, + "num_input_tokens_seen": 102938980, + "step": 4771, + "time_per_iteration": 2.7709410190582275 + }, + { + "auxiliary_loss_clip": 0.01041422, + "auxiliary_loss_mlp": 0.01015109, + "balance_loss_clip": 1.01907253, + "balance_loss_mlp": 1.01243877, + "epoch": 0.2869081617315497, + "flos": 65424286690560.0, + "grad_norm": 0.770068198596698, + "language_loss": 0.56874299, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.58930826, + "num_input_tokens_seen": 103000405, + "step": 4772, + "time_per_iteration": 3.0978245735168457 + }, + { + "auxiliary_loss_clip": 0.01067739, + "auxiliary_loss_mlp": 0.0077878, + "balance_loss_clip": 1.04115915, + "balance_loss_mlp": 1.00089169, + "epoch": 0.28696828498421767, + "flos": 18660729127680.0, + "grad_norm": 2.7874039039613345, + "language_loss": 0.82870376, + "learning_rate": 3.346383619630856e-06, + "loss": 0.84716898, + "num_input_tokens_seen": 103017970, + "step": 4773, + "time_per_iteration": 2.7716143131256104 + }, + { + "auxiliary_loss_clip": 0.0114188, + "auxiliary_loss_mlp": 0.01043405, + "balance_loss_clip": 1.04776216, + "balance_loss_mlp": 1.02553546, + "epoch": 0.28702840823688563, + "flos": 23659745667840.0, + "grad_norm": 11.069053071667042, + "language_loss": 0.77580261, + "learning_rate": 3.34609559969027e-06, + "loss": 0.79765546, + "num_input_tokens_seen": 103036385, + "step": 4774, + "time_per_iteration": 2.604790687561035 + }, + { + "auxiliary_loss_clip": 0.01119567, + "auxiliary_loss_mlp": 0.01042061, + "balance_loss_clip": 1.04915977, + "balance_loss_mlp": 1.02414346, + "epoch": 0.2870885314895536, + "flos": 13804744544640.0, + "grad_norm": 1.9103573283121942, + "language_loss": 0.73611873, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.75773501, + "num_input_tokens_seen": 103052170, + "step": 4775, + "time_per_iteration": 2.6234211921691895 + }, + { + "auxiliary_loss_clip": 0.01133151, + "auxiliary_loss_mlp": 0.01045326, + "balance_loss_clip": 1.04905081, + "balance_loss_mlp": 1.02782607, + "epoch": 0.28714865474222157, + "flos": 17786771314560.0, + "grad_norm": 1.6535491049734306, + "language_loss": 0.88343942, + "learning_rate": 3.34551940668778e-06, + "loss": 0.9052242, + "num_input_tokens_seen": 103070510, + "step": 4776, + "time_per_iteration": 2.6941640377044678 + }, + { + "auxiliary_loss_clip": 0.01132773, + "auxiliary_loss_mlp": 0.0104327, + "balance_loss_clip": 1.05156159, + "balance_loss_mlp": 1.02712941, + "epoch": 0.28720877799488953, + "flos": 15997486199040.0, + "grad_norm": 1.7321020140737395, + "language_loss": 0.74257779, + "learning_rate": 3.345231233647726e-06, + "loss": 0.76433825, + "num_input_tokens_seen": 103089590, + "step": 4777, + "time_per_iteration": 2.645650863647461 + }, + { + "auxiliary_loss_clip": 0.01126691, + "auxiliary_loss_mlp": 0.01045293, + "balance_loss_clip": 1.05245948, + "balance_loss_mlp": 1.02812648, + "epoch": 0.2872689012475575, + "flos": 20923137210240.0, + "grad_norm": 1.9446580110028222, + "language_loss": 0.80069196, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.82241178, + "num_input_tokens_seen": 103109080, + "step": 4778, + "time_per_iteration": 2.7606308460235596 + }, + { + "auxiliary_loss_clip": 0.01123482, + "auxiliary_loss_mlp": 0.01044505, + "balance_loss_clip": 1.05461526, + "balance_loss_mlp": 1.02750611, + "epoch": 0.28732902450022546, + "flos": 21325121291520.0, + "grad_norm": 1.7560492266469991, + "language_loss": 0.7396307, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.76131058, + "num_input_tokens_seen": 103127755, + "step": 4779, + "time_per_iteration": 2.831167221069336 + }, + { + "auxiliary_loss_clip": 0.01122102, + "auxiliary_loss_mlp": 0.01043876, + "balance_loss_clip": 1.04866719, + "balance_loss_mlp": 1.0262928, + "epoch": 0.2873891477528934, + "flos": 20850382212480.0, + "grad_norm": 1.5882306223862566, + "language_loss": 0.76327771, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.7849375, + "num_input_tokens_seen": 103147035, + "step": 4780, + "time_per_iteration": 2.6548538208007812 + }, + { + "auxiliary_loss_clip": 0.01102465, + "auxiliary_loss_mlp": 0.01042038, + "balance_loss_clip": 1.04413557, + "balance_loss_mlp": 1.02517641, + "epoch": 0.2874492710055614, + "flos": 17420051410560.0, + "grad_norm": 1.5896497572299877, + "language_loss": 0.81445092, + "learning_rate": 3.344078031483784e-06, + "loss": 0.83589596, + "num_input_tokens_seen": 103165410, + "step": 4781, + "time_per_iteration": 2.6422417163848877 + }, + { + "auxiliary_loss_clip": 0.01109573, + "auxiliary_loss_mlp": 0.01045358, + "balance_loss_clip": 1.05339658, + "balance_loss_mlp": 1.0277034, + "epoch": 0.28750939425822936, + "flos": 13406818700160.0, + "grad_norm": 1.8389370421072637, + "language_loss": 0.86738765, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.888937, + "num_input_tokens_seen": 103183710, + "step": 4782, + "time_per_iteration": 2.7507951259613037 + }, + { + "auxiliary_loss_clip": 0.01113582, + "auxiliary_loss_mlp": 0.01043351, + "balance_loss_clip": 1.05343366, + "balance_loss_mlp": 1.02604771, + "epoch": 0.2875695175108973, + "flos": 21870029589120.0, + "grad_norm": 1.5283433651606986, + "language_loss": 0.71153063, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.73309994, + "num_input_tokens_seen": 103203790, + "step": 4783, + "time_per_iteration": 2.7166218757629395 + }, + { + "auxiliary_loss_clip": 0.0112343, + "auxiliary_loss_mlp": 0.01047879, + "balance_loss_clip": 1.05475473, + "balance_loss_mlp": 1.030761, + "epoch": 0.2876296407635653, + "flos": 26245457089920.0, + "grad_norm": 1.6861942701171202, + "language_loss": 0.76872855, + "learning_rate": 3.343212594663047e-06, + "loss": 0.79044163, + "num_input_tokens_seen": 103223925, + "step": 4784, + "time_per_iteration": 2.693665027618408 + }, + { + "auxiliary_loss_clip": 0.01095423, + "auxiliary_loss_mlp": 0.01053931, + "balance_loss_clip": 1.04587293, + "balance_loss_mlp": 1.03514349, + "epoch": 0.28768976401623325, + "flos": 25373654092800.0, + "grad_norm": 4.596098798847224, + "language_loss": 0.75646108, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.77795458, + "num_input_tokens_seen": 103244760, + "step": 4785, + "time_per_iteration": 4.380687236785889 + }, + { + "auxiliary_loss_clip": 0.01144615, + "auxiliary_loss_mlp": 0.01048905, + "balance_loss_clip": 1.0532378, + "balance_loss_mlp": 1.03213263, + "epoch": 0.28774988726890127, + "flos": 30664372982400.0, + "grad_norm": 2.434913324661012, + "language_loss": 0.83660555, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.85854077, + "num_input_tokens_seen": 103261995, + "step": 4786, + "time_per_iteration": 4.138700723648071 + }, + { + "auxiliary_loss_clip": 0.01113505, + "auxiliary_loss_mlp": 0.0077478, + "balance_loss_clip": 1.05201936, + "balance_loss_mlp": 1.00095487, + "epoch": 0.28781001052156924, + "flos": 20595452411520.0, + "grad_norm": 1.8737605513707083, + "language_loss": 0.80388975, + "learning_rate": 3.342346699429516e-06, + "loss": 0.82277262, + "num_input_tokens_seen": 103279780, + "step": 4787, + "time_per_iteration": 2.7030651569366455 + }, + { + "auxiliary_loss_clip": 0.01120528, + "auxiliary_loss_mlp": 0.01039353, + "balance_loss_clip": 1.0489651, + "balance_loss_mlp": 1.02212751, + "epoch": 0.2878701337742372, + "flos": 26542330997760.0, + "grad_norm": 1.8370986188087255, + "language_loss": 0.83052301, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.85212183, + "num_input_tokens_seen": 103300580, + "step": 4788, + "time_per_iteration": 2.7650442123413086 + }, + { + "auxiliary_loss_clip": 0.01110861, + "auxiliary_loss_mlp": 0.01044904, + "balance_loss_clip": 1.0567044, + "balance_loss_mlp": 1.0279882, + "epoch": 0.28793025702690517, + "flos": 28146855530880.0, + "grad_norm": 7.859878454786593, + "language_loss": 0.73045379, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.75201148, + "num_input_tokens_seen": 103320430, + "step": 4789, + "time_per_iteration": 4.340694189071655 + }, + { + "auxiliary_loss_clip": 0.01123471, + "auxiliary_loss_mlp": 0.01042567, + "balance_loss_clip": 1.04852343, + "balance_loss_mlp": 1.02599669, + "epoch": 0.28799038027957313, + "flos": 23805471144960.0, + "grad_norm": 1.7615007973154742, + "language_loss": 0.84425223, + "learning_rate": 3.341480346078704e-06, + "loss": 0.86591256, + "num_input_tokens_seen": 103337695, + "step": 4790, + "time_per_iteration": 2.6953821182250977 + }, + { + "auxiliary_loss_clip": 0.01136004, + "auxiliary_loss_mlp": 0.01049022, + "balance_loss_clip": 1.05240703, + "balance_loss_mlp": 1.03145027, + "epoch": 0.2880505035322411, + "flos": 22344122223360.0, + "grad_norm": 1.743209341690147, + "language_loss": 0.78031182, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.80216199, + "num_input_tokens_seen": 103357010, + "step": 4791, + "time_per_iteration": 4.299259424209595 + }, + { + "auxiliary_loss_clip": 0.01120123, + "auxiliary_loss_mlp": 0.01036962, + "balance_loss_clip": 1.05015528, + "balance_loss_mlp": 1.01999843, + "epoch": 0.28811062678490906, + "flos": 18004246208640.0, + "grad_norm": 2.2148694233914474, + "language_loss": 0.70164073, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.72321159, + "num_input_tokens_seen": 103375600, + "step": 4792, + "time_per_iteration": 2.646732807159424 + }, + { + "auxiliary_loss_clip": 0.01107079, + "auxiliary_loss_mlp": 0.01037734, + "balance_loss_clip": 1.05645919, + "balance_loss_mlp": 1.02149773, + "epoch": 0.28817075003757703, + "flos": 22090880361600.0, + "grad_norm": 1.9192442052106609, + "language_loss": 0.79200894, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.81345713, + "num_input_tokens_seen": 103395225, + "step": 4793, + "time_per_iteration": 2.765010356903076 + }, + { + "auxiliary_loss_clip": 0.01117839, + "auxiliary_loss_mlp": 0.01038019, + "balance_loss_clip": 1.05114603, + "balance_loss_mlp": 1.02235532, + "epoch": 0.288230873290245, + "flos": 41683130847360.0, + "grad_norm": 1.7689864288971164, + "language_loss": 0.78136635, + "learning_rate": 3.340324496161797e-06, + "loss": 0.80292487, + "num_input_tokens_seen": 103417245, + "step": 4794, + "time_per_iteration": 2.868473529815674 + }, + { + "auxiliary_loss_clip": 0.01134193, + "auxiliary_loss_mlp": 0.0104583, + "balance_loss_clip": 1.05259347, + "balance_loss_mlp": 1.02856886, + "epoch": 0.28829099654291296, + "flos": 18624423456000.0, + "grad_norm": 2.1692523829597063, + "language_loss": 0.8320052, + "learning_rate": 3.340035406592074e-06, + "loss": 0.85380542, + "num_input_tokens_seen": 103435500, + "step": 4795, + "time_per_iteration": 2.6216471195220947 + }, + { + "auxiliary_loss_clip": 0.01126764, + "auxiliary_loss_mlp": 0.01043565, + "balance_loss_clip": 1.05043364, + "balance_loss_mlp": 1.0279845, + "epoch": 0.2883511197955809, + "flos": 24674832017280.0, + "grad_norm": 2.290853867887048, + "language_loss": 0.74744678, + "learning_rate": 3.339746266208074e-06, + "loss": 0.76915002, + "num_input_tokens_seen": 103451040, + "step": 4796, + "time_per_iteration": 2.6819822788238525 + }, + { + "auxiliary_loss_clip": 0.01136938, + "auxiliary_loss_mlp": 0.01040822, + "balance_loss_clip": 1.05140758, + "balance_loss_mlp": 1.02221298, + "epoch": 0.2884112430482489, + "flos": 23112143850240.0, + "grad_norm": 1.9890524806298786, + "language_loss": 0.73144913, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.7532267, + "num_input_tokens_seen": 103471330, + "step": 4797, + "time_per_iteration": 2.666097640991211 + }, + { + "auxiliary_loss_clip": 0.01104454, + "auxiliary_loss_mlp": 0.00775335, + "balance_loss_clip": 1.04594803, + "balance_loss_mlp": 1.00097072, + "epoch": 0.28847136630091685, + "flos": 16873347432960.0, + "grad_norm": 1.9324008515617646, + "language_loss": 0.74650872, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.76530659, + "num_input_tokens_seen": 103488060, + "step": 4798, + "time_per_iteration": 2.7281830310821533 + }, + { + "auxiliary_loss_clip": 0.0113412, + "auxiliary_loss_mlp": 0.01043523, + "balance_loss_clip": 1.04996431, + "balance_loss_mlp": 1.02463984, + "epoch": 0.2885314895535849, + "flos": 25657527277440.0, + "grad_norm": 3.037553219769834, + "language_loss": 0.66004431, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.68182075, + "num_input_tokens_seen": 103503600, + "step": 4799, + "time_per_iteration": 2.6416096687316895 + }, + { + "auxiliary_loss_clip": 0.01144575, + "auxiliary_loss_mlp": 0.01049843, + "balance_loss_clip": 1.05205584, + "balance_loss_mlp": 1.03268862, + "epoch": 0.28859161280625284, + "flos": 21107251347840.0, + "grad_norm": 1.7946911133370596, + "language_loss": 0.8231616, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.84510577, + "num_input_tokens_seen": 103524195, + "step": 4800, + "time_per_iteration": 2.704357624053955 + }, + { + "auxiliary_loss_clip": 0.01105166, + "auxiliary_loss_mlp": 0.01040519, + "balance_loss_clip": 1.04861474, + "balance_loss_mlp": 1.02392507, + "epoch": 0.2886517360589208, + "flos": 26469540086400.0, + "grad_norm": 1.5930665564066124, + "language_loss": 0.9080106, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.92946744, + "num_input_tokens_seen": 103545235, + "step": 4801, + "time_per_iteration": 2.8163902759552 + }, + { + "auxiliary_loss_clip": 0.01119221, + "auxiliary_loss_mlp": 0.00775037, + "balance_loss_clip": 1.05178905, + "balance_loss_mlp": 1.0008862, + "epoch": 0.28871185931158877, + "flos": 25265275781760.0, + "grad_norm": 2.098995863955026, + "language_loss": 0.74342406, + "learning_rate": 3.33801035741839e-06, + "loss": 0.76236671, + "num_input_tokens_seen": 103563305, + "step": 4802, + "time_per_iteration": 2.8244271278381348 + }, + { + "auxiliary_loss_clip": 0.01029511, + "auxiliary_loss_mlp": 0.01004263, + "balance_loss_clip": 1.02472734, + "balance_loss_mlp": 1.00193822, + "epoch": 0.28877198256425674, + "flos": 66665431284480.0, + "grad_norm": 0.7780596068321518, + "language_loss": 0.62987334, + "learning_rate": 3.337720861641558e-06, + "loss": 0.65021104, + "num_input_tokens_seen": 103625025, + "step": 4803, + "time_per_iteration": 3.299269676208496 + }, + { + "auxiliary_loss_clip": 0.01083739, + "auxiliary_loss_mlp": 0.01051002, + "balance_loss_clip": 1.03981495, + "balance_loss_mlp": 1.03369915, + "epoch": 0.2888321058169247, + "flos": 20303031790080.0, + "grad_norm": 1.8528386679599225, + "language_loss": 0.71095157, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.73229897, + "num_input_tokens_seen": 103644235, + "step": 4804, + "time_per_iteration": 2.762883424758911 + }, + { + "auxiliary_loss_clip": 0.01135071, + "auxiliary_loss_mlp": 0.01047534, + "balance_loss_clip": 1.05108273, + "balance_loss_mlp": 1.0289135, + "epoch": 0.28889222906959267, + "flos": 25516721963520.0, + "grad_norm": 1.926588918304246, + "language_loss": 0.67916834, + "learning_rate": 3.337141717919346e-06, + "loss": 0.70099443, + "num_input_tokens_seen": 103664700, + "step": 4805, + "time_per_iteration": 2.6848111152648926 + }, + { + "auxiliary_loss_clip": 0.01135111, + "auxiliary_loss_mlp": 0.01046638, + "balance_loss_clip": 1.05359602, + "balance_loss_mlp": 1.03029394, + "epoch": 0.28895235232226063, + "flos": 32671312560000.0, + "grad_norm": 1.4381182508216341, + "language_loss": 0.69720542, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.71902293, + "num_input_tokens_seen": 103686595, + "step": 4806, + "time_per_iteration": 2.762458562850952 + }, + { + "auxiliary_loss_clip": 0.01120642, + "auxiliary_loss_mlp": 0.01052311, + "balance_loss_clip": 1.05073118, + "balance_loss_mlp": 1.03559768, + "epoch": 0.2890124755749286, + "flos": 29714679342720.0, + "grad_norm": 1.4600495853323927, + "language_loss": 0.71255589, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.73428547, + "num_input_tokens_seen": 103707525, + "step": 4807, + "time_per_iteration": 2.740931987762451 + }, + { + "auxiliary_loss_clip": 0.01106054, + "auxiliary_loss_mlp": 0.01043407, + "balance_loss_clip": 1.05087459, + "balance_loss_mlp": 1.02625299, + "epoch": 0.28907259882759656, + "flos": 22674464628480.0, + "grad_norm": 1.6111027163793539, + "language_loss": 0.81489629, + "learning_rate": 3.336272622079382e-06, + "loss": 0.83639085, + "num_input_tokens_seen": 103727905, + "step": 4808, + "time_per_iteration": 2.722787380218506 + }, + { + "auxiliary_loss_clip": 0.01098162, + "auxiliary_loss_mlp": 0.01048507, + "balance_loss_clip": 1.04795146, + "balance_loss_mlp": 1.03160298, + "epoch": 0.2891327220802645, + "flos": 22566050403840.0, + "grad_norm": 1.7874609682529725, + "language_loss": 0.78304112, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.80450785, + "num_input_tokens_seen": 103748335, + "step": 4809, + "time_per_iteration": 2.742063522338867 + }, + { + "auxiliary_loss_clip": 0.01091743, + "auxiliary_loss_mlp": 0.01047553, + "balance_loss_clip": 1.04519784, + "balance_loss_mlp": 1.02924204, + "epoch": 0.2891928453329325, + "flos": 21652806090240.0, + "grad_norm": 1.7709564567634208, + "language_loss": 0.78864932, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.81004226, + "num_input_tokens_seen": 103767020, + "step": 4810, + "time_per_iteration": 2.7578415870666504 + }, + { + "auxiliary_loss_clip": 0.01090252, + "auxiliary_loss_mlp": 0.01039009, + "balance_loss_clip": 1.04552603, + "balance_loss_mlp": 1.02280235, + "epoch": 0.28925296858560046, + "flos": 23222102359680.0, + "grad_norm": 1.6298276151024105, + "language_loss": 0.76974982, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.79104245, + "num_input_tokens_seen": 103786355, + "step": 4811, + "time_per_iteration": 2.7336831092834473 + }, + { + "auxiliary_loss_clip": 0.01132677, + "auxiliary_loss_mlp": 0.01047674, + "balance_loss_clip": 1.05356216, + "balance_loss_mlp": 1.03038859, + "epoch": 0.2893130918382685, + "flos": 28621666437120.0, + "grad_norm": 1.4740946425962824, + "language_loss": 0.77044773, + "learning_rate": 3.335113118275117e-06, + "loss": 0.79225123, + "num_input_tokens_seen": 103809345, + "step": 4812, + "time_per_iteration": 2.745115280151367 + }, + { + "auxiliary_loss_clip": 0.01024348, + "auxiliary_loss_mlp": 0.01009076, + "balance_loss_clip": 1.02794337, + "balance_loss_mlp": 1.00728762, + "epoch": 0.28937321509093644, + "flos": 72301288982400.0, + "grad_norm": 0.8337141037006477, + "language_loss": 0.60292435, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.62325859, + "num_input_tokens_seen": 103871180, + "step": 4813, + "time_per_iteration": 3.3592262268066406 + }, + { + "auxiliary_loss_clip": 0.01094544, + "auxiliary_loss_mlp": 0.01044805, + "balance_loss_clip": 1.0431211, + "balance_loss_mlp": 1.02734065, + "epoch": 0.2894333383436044, + "flos": 16216397637120.0, + "grad_norm": 3.1340543474440623, + "language_loss": 0.82301223, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.84440577, + "num_input_tokens_seen": 103889040, + "step": 4814, + "time_per_iteration": 2.7069244384765625 + }, + { + "auxiliary_loss_clip": 0.01101478, + "auxiliary_loss_mlp": 0.01052591, + "balance_loss_clip": 1.05051374, + "balance_loss_mlp": 1.03556752, + "epoch": 0.2894934615962724, + "flos": 24828278918400.0, + "grad_norm": 1.6672038490985601, + "language_loss": 0.73249441, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.75403512, + "num_input_tokens_seen": 103910380, + "step": 4815, + "time_per_iteration": 2.764214515686035 + }, + { + "auxiliary_loss_clip": 0.01131126, + "auxiliary_loss_mlp": 0.01045124, + "balance_loss_clip": 1.05259883, + "balance_loss_mlp": 1.02997231, + "epoch": 0.28955358484894034, + "flos": 20449978329600.0, + "grad_norm": 1.9821106518618066, + "language_loss": 0.70783043, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.72959292, + "num_input_tokens_seen": 103929955, + "step": 4816, + "time_per_iteration": 2.7809629440307617 + }, + { + "auxiliary_loss_clip": 0.01119261, + "auxiliary_loss_mlp": 0.01048806, + "balance_loss_clip": 1.04862189, + "balance_loss_mlp": 1.03097248, + "epoch": 0.2896137081016083, + "flos": 22565188477440.0, + "grad_norm": 2.3636227133284122, + "language_loss": 0.7445122, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.76619279, + "num_input_tokens_seen": 103948020, + "step": 4817, + "time_per_iteration": 2.829183578491211 + }, + { + "auxiliary_loss_clip": 0.01108198, + "auxiliary_loss_mlp": 0.01054129, + "balance_loss_clip": 1.05107522, + "balance_loss_mlp": 1.03633142, + "epoch": 0.28967383135427627, + "flos": 26687948734080.0, + "grad_norm": 1.8479613371686012, + "language_loss": 0.76190692, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.78353024, + "num_input_tokens_seen": 103968740, + "step": 4818, + "time_per_iteration": 2.827925443649292 + }, + { + "auxiliary_loss_clip": 0.01074516, + "auxiliary_loss_mlp": 0.01041914, + "balance_loss_clip": 1.04805899, + "balance_loss_mlp": 1.02477193, + "epoch": 0.28973395460694423, + "flos": 15558262692480.0, + "grad_norm": 1.9558897556763024, + "language_loss": 0.80060315, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.82176751, + "num_input_tokens_seen": 103986005, + "step": 4819, + "time_per_iteration": 2.8941574096679688 + }, + { + "auxiliary_loss_clip": 0.01110223, + "auxiliary_loss_mlp": 0.01048219, + "balance_loss_clip": 1.0494163, + "balance_loss_mlp": 1.02931273, + "epoch": 0.2897940778596122, + "flos": 18697465762560.0, + "grad_norm": 1.8074124972104149, + "language_loss": 0.78504574, + "learning_rate": 3.332791681244776e-06, + "loss": 0.80663019, + "num_input_tokens_seen": 104005070, + "step": 4820, + "time_per_iteration": 2.7016515731811523 + }, + { + "auxiliary_loss_clip": 0.01096478, + "auxiliary_loss_mlp": 0.01037037, + "balance_loss_clip": 1.04924846, + "balance_loss_mlp": 1.02028775, + "epoch": 0.28985420111228016, + "flos": 18770292587520.0, + "grad_norm": 2.105369007151224, + "language_loss": 0.72925651, + "learning_rate": 3.332501274072231e-06, + "loss": 0.7505917, + "num_input_tokens_seen": 104022945, + "step": 4821, + "time_per_iteration": 2.743091583251953 + }, + { + "auxiliary_loss_clip": 0.01132782, + "auxiliary_loss_mlp": 0.01040556, + "balance_loss_clip": 1.05055594, + "balance_loss_mlp": 1.02290142, + "epoch": 0.28991432436494813, + "flos": 23069840607360.0, + "grad_norm": 2.331696646407205, + "language_loss": 0.71962738, + "learning_rate": 3.332210816371104e-06, + "loss": 0.74136078, + "num_input_tokens_seen": 104042080, + "step": 4822, + "time_per_iteration": 2.768996477127075 + }, + { + "auxiliary_loss_clip": 0.01128837, + "auxiliary_loss_mlp": 0.01048176, + "balance_loss_clip": 1.05237818, + "balance_loss_mlp": 1.03142738, + "epoch": 0.2899744476176161, + "flos": 17603195880960.0, + "grad_norm": 1.8111020118629353, + "language_loss": 0.662521, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.68429112, + "num_input_tokens_seen": 104060975, + "step": 4823, + "time_per_iteration": 2.733591318130493 + }, + { + "auxiliary_loss_clip": 0.01107872, + "auxiliary_loss_mlp": 0.01042255, + "balance_loss_clip": 1.04404497, + "balance_loss_mlp": 1.02588761, + "epoch": 0.29003457087028406, + "flos": 22309360836480.0, + "grad_norm": 4.579803152663717, + "language_loss": 0.81162238, + "learning_rate": 3.331629749427164e-06, + "loss": 0.83312368, + "num_input_tokens_seen": 104081395, + "step": 4824, + "time_per_iteration": 4.278540849685669 + }, + { + "auxiliary_loss_clip": 0.01143667, + "auxiliary_loss_mlp": 0.01043888, + "balance_loss_clip": 1.05104661, + "balance_loss_mlp": 1.025828, + "epoch": 0.2900946941229521, + "flos": 21944975316480.0, + "grad_norm": 2.265114761106369, + "language_loss": 0.72592747, + "learning_rate": 3.331339140206385e-06, + "loss": 0.74780297, + "num_input_tokens_seen": 104099995, + "step": 4825, + "time_per_iteration": 4.177908658981323 + }, + { + "auxiliary_loss_clip": 0.01147795, + "auxiliary_loss_mlp": 0.01036998, + "balance_loss_clip": 1.05434549, + "balance_loss_mlp": 1.01930714, + "epoch": 0.29015481737562004, + "flos": 17932173569280.0, + "grad_norm": 2.216571865047856, + "language_loss": 0.73680669, + "learning_rate": 3.331048480501092e-06, + "loss": 0.75865459, + "num_input_tokens_seen": 104118930, + "step": 4826, + "time_per_iteration": 2.6371700763702393 + }, + { + "auxiliary_loss_clip": 0.0113072, + "auxiliary_loss_mlp": 0.01040585, + "balance_loss_clip": 1.05073726, + "balance_loss_mlp": 1.02483773, + "epoch": 0.290214940628288, + "flos": 22783525297920.0, + "grad_norm": 2.324527624383577, + "language_loss": 0.68556225, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.70727527, + "num_input_tokens_seen": 104136940, + "step": 4827, + "time_per_iteration": 2.6447484493255615 + }, + { + "auxiliary_loss_clip": 0.01125924, + "auxiliary_loss_mlp": 0.0104453, + "balance_loss_clip": 1.04981911, + "balance_loss_mlp": 1.02650571, + "epoch": 0.290275063880956, + "flos": 20006481104640.0, + "grad_norm": 1.8485927197530279, + "language_loss": 0.80266023, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.82436466, + "num_input_tokens_seen": 104154280, + "step": 4828, + "time_per_iteration": 4.131803274154663 + }, + { + "auxiliary_loss_clip": 0.01144317, + "auxiliary_loss_mlp": 0.01049939, + "balance_loss_clip": 1.05393863, + "balance_loss_mlp": 1.03288054, + "epoch": 0.29033518713362394, + "flos": 22053605022720.0, + "grad_norm": 1.8003854621941846, + "language_loss": 0.80658895, + "learning_rate": 3.33017619858836e-06, + "loss": 0.8285315, + "num_input_tokens_seen": 104172605, + "step": 4829, + "time_per_iteration": 2.760899066925049 + }, + { + "auxiliary_loss_clip": 0.011197, + "auxiliary_loss_mlp": 0.01044046, + "balance_loss_clip": 1.05093288, + "balance_loss_mlp": 1.02680826, + "epoch": 0.2903953103862919, + "flos": 25630056351360.0, + "grad_norm": 1.5734536519128175, + "language_loss": 0.82911146, + "learning_rate": 3.329885337055249e-06, + "loss": 0.85074902, + "num_input_tokens_seen": 104194120, + "step": 4830, + "time_per_iteration": 4.403480529785156 + }, + { + "auxiliary_loss_clip": 0.01137563, + "auxiliary_loss_mlp": 0.01048934, + "balance_loss_clip": 1.05430257, + "balance_loss_mlp": 1.03155351, + "epoch": 0.29045543363895987, + "flos": 16945851035520.0, + "grad_norm": 2.2586543311689486, + "language_loss": 0.79236752, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.81423253, + "num_input_tokens_seen": 104210875, + "step": 4831, + "time_per_iteration": 2.6066412925720215 + }, + { + "auxiliary_loss_clip": 0.01143728, + "auxiliary_loss_mlp": 0.01045824, + "balance_loss_clip": 1.05470276, + "balance_loss_mlp": 1.03000546, + "epoch": 0.29051555689162784, + "flos": 26395492199040.0, + "grad_norm": 1.9694662738232038, + "language_loss": 0.7459774, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.76787293, + "num_input_tokens_seen": 104229875, + "step": 4832, + "time_per_iteration": 2.8411331176757812 + }, + { + "auxiliary_loss_clip": 0.01122405, + "auxiliary_loss_mlp": 0.01037758, + "balance_loss_clip": 1.05429769, + "balance_loss_mlp": 1.02335787, + "epoch": 0.2905756801442958, + "flos": 21103875469440.0, + "grad_norm": 1.979215737756815, + "language_loss": 0.76150024, + "learning_rate": 3.329012449923736e-06, + "loss": 0.78310186, + "num_input_tokens_seen": 104250405, + "step": 4833, + "time_per_iteration": 2.7510006427764893 + }, + { + "auxiliary_loss_clip": 0.01107016, + "auxiliary_loss_mlp": 0.01040024, + "balance_loss_clip": 1.04580688, + "balance_loss_mlp": 1.02383542, + "epoch": 0.29063580339696377, + "flos": 15706071158400.0, + "grad_norm": 1.7715964188803632, + "language_loss": 0.64404124, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.66551173, + "num_input_tokens_seen": 104269185, + "step": 4834, + "time_per_iteration": 2.6475064754486084 + }, + { + "auxiliary_loss_clip": 0.01117159, + "auxiliary_loss_mlp": 0.01032155, + "balance_loss_clip": 1.05111325, + "balance_loss_mlp": 1.01724815, + "epoch": 0.29069592664963173, + "flos": 24644990793600.0, + "grad_norm": 1.4640588842294755, + "language_loss": 0.71717769, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.73867083, + "num_input_tokens_seen": 104289400, + "step": 4835, + "time_per_iteration": 2.6991324424743652 + }, + { + "auxiliary_loss_clip": 0.01117393, + "auxiliary_loss_mlp": 0.01037314, + "balance_loss_clip": 1.04881835, + "balance_loss_mlp": 1.02187634, + "epoch": 0.2907560499022997, + "flos": 24973753000320.0, + "grad_norm": 1.657223137158586, + "language_loss": 0.79492378, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.81647086, + "num_input_tokens_seen": 104310485, + "step": 4836, + "time_per_iteration": 2.7060084342956543 + }, + { + "auxiliary_loss_clip": 0.01107347, + "auxiliary_loss_mlp": 0.01045193, + "balance_loss_clip": 1.05334711, + "balance_loss_mlp": 1.02744293, + "epoch": 0.29081617315496766, + "flos": 18657496903680.0, + "grad_norm": 1.9442300400082562, + "language_loss": 0.81372344, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.83524883, + "num_input_tokens_seen": 104327330, + "step": 4837, + "time_per_iteration": 2.640610933303833 + }, + { + "auxiliary_loss_clip": 0.01116355, + "auxiliary_loss_mlp": 0.01039398, + "balance_loss_clip": 1.04938102, + "balance_loss_mlp": 1.0233283, + "epoch": 0.2908762964076356, + "flos": 35331035955840.0, + "grad_norm": 6.209911556378307, + "language_loss": 0.67358792, + "learning_rate": 3.327556630259381e-06, + "loss": 0.69514549, + "num_input_tokens_seen": 104350350, + "step": 4838, + "time_per_iteration": 2.758422374725342 + }, + { + "auxiliary_loss_clip": 0.01147958, + "auxiliary_loss_mlp": 0.00775113, + "balance_loss_clip": 1.05402315, + "balance_loss_mlp": 1.00096607, + "epoch": 0.29093641966030365, + "flos": 23076305055360.0, + "grad_norm": 1.5628414298261506, + "language_loss": 0.71139944, + "learning_rate": 3.327265315259095e-06, + "loss": 0.73063016, + "num_input_tokens_seen": 104369995, + "step": 4839, + "time_per_iteration": 2.683349132537842 + }, + { + "auxiliary_loss_clip": 0.0114095, + "auxiliary_loss_mlp": 0.01036937, + "balance_loss_clip": 1.04966319, + "balance_loss_mlp": 1.02147555, + "epoch": 0.2909965429129716, + "flos": 35955415094400.0, + "grad_norm": 1.9403130873020338, + "language_loss": 0.7539593, + "learning_rate": 3.326973949928776e-06, + "loss": 0.77573812, + "num_input_tokens_seen": 104392285, + "step": 4840, + "time_per_iteration": 2.696808099746704 + }, + { + "auxiliary_loss_clip": 0.01093571, + "auxiliary_loss_mlp": 0.01045095, + "balance_loss_clip": 1.04470551, + "balance_loss_mlp": 1.02825069, + "epoch": 0.2910566661656396, + "flos": 30880231764480.0, + "grad_norm": 1.7841334294021773, + "language_loss": 0.60546595, + "learning_rate": 3.326682534279471e-06, + "loss": 0.62685257, + "num_input_tokens_seen": 104412640, + "step": 4841, + "time_per_iteration": 2.74575138092041 + }, + { + "auxiliary_loss_clip": 0.01120271, + "auxiliary_loss_mlp": 0.01039624, + "balance_loss_clip": 1.04983509, + "balance_loss_mlp": 1.02288651, + "epoch": 0.29111678941830754, + "flos": 30010188533760.0, + "grad_norm": 1.408353605568525, + "language_loss": 0.71321762, + "learning_rate": 3.326391068322232e-06, + "loss": 0.73481655, + "num_input_tokens_seen": 104435245, + "step": 4842, + "time_per_iteration": 2.7568962574005127 + }, + { + "auxiliary_loss_clip": 0.01130885, + "auxiliary_loss_mlp": 0.01037088, + "balance_loss_clip": 1.05042899, + "balance_loss_mlp": 1.02191257, + "epoch": 0.2911769126709755, + "flos": 22857393617280.0, + "grad_norm": 2.1183002067983585, + "language_loss": 0.73610562, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.75778532, + "num_input_tokens_seen": 104455395, + "step": 4843, + "time_per_iteration": 2.6703171730041504 + }, + { + "auxiliary_loss_clip": 0.0108851, + "auxiliary_loss_mlp": 0.01036244, + "balance_loss_clip": 1.04775739, + "balance_loss_mlp": 1.02058005, + "epoch": 0.2912370359236435, + "flos": 21650507619840.0, + "grad_norm": 4.868884277111801, + "language_loss": 0.58445942, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.60570699, + "num_input_tokens_seen": 104473350, + "step": 4844, + "time_per_iteration": 2.7461965084075928 + }, + { + "auxiliary_loss_clip": 0.01138917, + "auxiliary_loss_mlp": 0.01039428, + "balance_loss_clip": 1.05586743, + "balance_loss_mlp": 1.0222863, + "epoch": 0.29129715917631144, + "flos": 22893340152960.0, + "grad_norm": 1.9200815982611392, + "language_loss": 0.86459565, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.88637912, + "num_input_tokens_seen": 104492265, + "step": 4845, + "time_per_iteration": 2.711101770401001 + }, + { + "auxiliary_loss_clip": 0.01115849, + "auxiliary_loss_mlp": 0.01052584, + "balance_loss_clip": 1.05018926, + "balance_loss_mlp": 1.03505993, + "epoch": 0.2913572824289794, + "flos": 22674464628480.0, + "grad_norm": 1.7226223126663984, + "language_loss": 0.67067879, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.69236308, + "num_input_tokens_seen": 104510755, + "step": 4846, + "time_per_iteration": 2.698076009750366 + }, + { + "auxiliary_loss_clip": 0.01120746, + "auxiliary_loss_mlp": 0.01040428, + "balance_loss_clip": 1.05198884, + "balance_loss_mlp": 1.02457917, + "epoch": 0.29141740568164737, + "flos": 23107403255040.0, + "grad_norm": 1.9884880347168128, + "language_loss": 0.70629871, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.7279104, + "num_input_tokens_seen": 104530830, + "step": 4847, + "time_per_iteration": 2.6693859100341797 + }, + { + "auxiliary_loss_clip": 0.01129385, + "auxiliary_loss_mlp": 0.01036362, + "balance_loss_clip": 1.0490911, + "balance_loss_mlp": 1.02048314, + "epoch": 0.29147752893431533, + "flos": 23587026583680.0, + "grad_norm": 1.4444788582363046, + "language_loss": 0.73975939, + "learning_rate": 3.324641216731237e-06, + "loss": 0.76141691, + "num_input_tokens_seen": 104550115, + "step": 4848, + "time_per_iteration": 2.779012680053711 + }, + { + "auxiliary_loss_clip": 0.0112526, + "auxiliary_loss_mlp": 0.01051811, + "balance_loss_clip": 1.04831481, + "balance_loss_mlp": 1.03391802, + "epoch": 0.2915376521869833, + "flos": 20591968792320.0, + "grad_norm": 3.067540232947916, + "language_loss": 0.76738584, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.7891565, + "num_input_tokens_seen": 104566255, + "step": 4849, + "time_per_iteration": 2.6103999614715576 + }, + { + "auxiliary_loss_clip": 0.01124372, + "auxiliary_loss_mlp": 0.01041862, + "balance_loss_clip": 1.04718697, + "balance_loss_mlp": 1.02541125, + "epoch": 0.29159777543965126, + "flos": 20811490761600.0, + "grad_norm": 1.7266499063872853, + "language_loss": 0.78276592, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.80442822, + "num_input_tokens_seen": 104585235, + "step": 4850, + "time_per_iteration": 2.6395609378814697 + }, + { + "auxiliary_loss_clip": 0.01111964, + "auxiliary_loss_mlp": 0.01038044, + "balance_loss_clip": 1.04907775, + "balance_loss_mlp": 1.0209378, + "epoch": 0.29165789869231923, + "flos": 24244155947520.0, + "grad_norm": 1.8024770323318549, + "language_loss": 0.7657702, + "learning_rate": 3.323765612674296e-06, + "loss": 0.78727031, + "num_input_tokens_seen": 104605315, + "step": 4851, + "time_per_iteration": 2.7265985012054443 + }, + { + "auxiliary_loss_clip": 0.01132156, + "auxiliary_loss_mlp": 0.01045641, + "balance_loss_clip": 1.052459, + "balance_loss_mlp": 1.03083527, + "epoch": 0.29171802194498725, + "flos": 28949925853440.0, + "grad_norm": 1.3639310788782566, + "language_loss": 0.77680421, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.7985822, + "num_input_tokens_seen": 104626055, + "step": 4852, + "time_per_iteration": 2.7161712646484375 + }, + { + "auxiliary_loss_clip": 0.01120344, + "auxiliary_loss_mlp": 0.01051407, + "balance_loss_clip": 1.05108476, + "balance_loss_mlp": 1.03523064, + "epoch": 0.2917781451976552, + "flos": 22598226011520.0, + "grad_norm": 1.6397145219173752, + "language_loss": 0.7816534, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.80337089, + "num_input_tokens_seen": 104646005, + "step": 4853, + "time_per_iteration": 2.748053789138794 + }, + { + "auxiliary_loss_clip": 0.01108012, + "auxiliary_loss_mlp": 0.01041349, + "balance_loss_clip": 1.04923177, + "balance_loss_mlp": 1.02535105, + "epoch": 0.2918382684503232, + "flos": 21574448570880.0, + "grad_norm": 2.273586870261815, + "language_loss": 0.8791436, + "learning_rate": 3.322889556841445e-06, + "loss": 0.90063715, + "num_input_tokens_seen": 104661620, + "step": 4854, + "time_per_iteration": 2.7663791179656982 + }, + { + "auxiliary_loss_clip": 0.01128591, + "auxiliary_loss_mlp": 0.01054226, + "balance_loss_clip": 1.05255818, + "balance_loss_mlp": 1.03502131, + "epoch": 0.29189839170299114, + "flos": 24353503925760.0, + "grad_norm": 1.7143523369489482, + "language_loss": 0.86374146, + "learning_rate": 3.322597437887519e-06, + "loss": 0.88556957, + "num_input_tokens_seen": 104681445, + "step": 4855, + "time_per_iteration": 2.613903284072876 + }, + { + "auxiliary_loss_clip": 0.01039808, + "auxiliary_loss_mlp": 0.01005184, + "balance_loss_clip": 1.02170599, + "balance_loss_mlp": 1.00303864, + "epoch": 0.2919585149556591, + "flos": 71316726215040.0, + "grad_norm": 0.7954079009769616, + "language_loss": 0.60148996, + "learning_rate": 3.322305268780566e-06, + "loss": 0.6219399, + "num_input_tokens_seen": 104747945, + "step": 4856, + "time_per_iteration": 3.273501396179199 + }, + { + "auxiliary_loss_clip": 0.01115701, + "auxiliary_loss_mlp": 0.00774991, + "balance_loss_clip": 1.04708552, + "balance_loss_mlp": 1.00107539, + "epoch": 0.2920186382083271, + "flos": 15633208419840.0, + "grad_norm": 1.7540806356878256, + "language_loss": 0.6825304, + "learning_rate": 3.322013049531664e-06, + "loss": 0.70143735, + "num_input_tokens_seen": 104766225, + "step": 4857, + "time_per_iteration": 2.6799964904785156 + }, + { + "auxiliary_loss_clip": 0.01129839, + "auxiliary_loss_mlp": 0.00774071, + "balance_loss_clip": 1.05058599, + "balance_loss_mlp": 1.00106227, + "epoch": 0.29207876146099504, + "flos": 28366018364160.0, + "grad_norm": 1.9069678720023968, + "language_loss": 0.83446503, + "learning_rate": 3.321720780151895e-06, + "loss": 0.85350412, + "num_input_tokens_seen": 104785345, + "step": 4858, + "time_per_iteration": 2.7004997730255127 + }, + { + "auxiliary_loss_clip": 0.01143419, + "auxiliary_loss_mlp": 0.01047414, + "balance_loss_clip": 1.05265319, + "balance_loss_mlp": 1.03119004, + "epoch": 0.292138884713663, + "flos": 21870963342720.0, + "grad_norm": 1.7162042036272904, + "language_loss": 0.77357888, + "learning_rate": 3.321428460652342e-06, + "loss": 0.79548717, + "num_input_tokens_seen": 104804560, + "step": 4859, + "time_per_iteration": 2.5901620388031006 + }, + { + "auxiliary_loss_clip": 0.01105726, + "auxiliary_loss_mlp": 0.01044957, + "balance_loss_clip": 1.05237806, + "balance_loss_mlp": 1.02816057, + "epoch": 0.29219900796633097, + "flos": 20992552243200.0, + "grad_norm": 2.2554676354860246, + "language_loss": 0.68046212, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.70196903, + "num_input_tokens_seen": 104821105, + "step": 4860, + "time_per_iteration": 2.7831058502197266 + }, + { + "auxiliary_loss_clip": 0.01117304, + "auxiliary_loss_mlp": 0.01041096, + "balance_loss_clip": 1.05229402, + "balance_loss_mlp": 1.02662396, + "epoch": 0.29225913121899894, + "flos": 35004608133120.0, + "grad_norm": 2.539974445673703, + "language_loss": 0.75258791, + "learning_rate": 3.320843671338222e-06, + "loss": 0.77417195, + "num_input_tokens_seen": 104841440, + "step": 4861, + "time_per_iteration": 2.7506070137023926 + }, + { + "auxiliary_loss_clip": 0.01128031, + "auxiliary_loss_mlp": 0.0105121, + "balance_loss_clip": 1.04845262, + "balance_loss_mlp": 1.03620112, + "epoch": 0.2923192544716669, + "flos": 13515663888000.0, + "grad_norm": 3.0942357088370245, + "language_loss": 0.91498685, + "learning_rate": 3.320551201545832e-06, + "loss": 0.93677926, + "num_input_tokens_seen": 104858210, + "step": 4862, + "time_per_iteration": 2.589700937271118 + }, + { + "auxiliary_loss_clip": 0.01131947, + "auxiliary_loss_mlp": 0.01042917, + "balance_loss_clip": 1.05090141, + "balance_loss_mlp": 1.02786124, + "epoch": 0.29237937772433487, + "flos": 19463512141440.0, + "grad_norm": 2.2124063953391464, + "language_loss": 0.73112279, + "learning_rate": 3.320258681678008e-06, + "loss": 0.75287139, + "num_input_tokens_seen": 104875620, + "step": 4863, + "time_per_iteration": 4.142335653305054 + }, + { + "auxiliary_loss_clip": 0.01061699, + "auxiliary_loss_mlp": 0.01044676, + "balance_loss_clip": 1.04478168, + "balance_loss_mlp": 1.02934611, + "epoch": 0.29243950097700283, + "flos": 20850597694080.0, + "grad_norm": 1.893468710780351, + "language_loss": 0.77841508, + "learning_rate": 3.319966111745842e-06, + "loss": 0.79947883, + "num_input_tokens_seen": 104894600, + "step": 4864, + "time_per_iteration": 4.309613943099976 + }, + { + "auxiliary_loss_clip": 0.01102707, + "auxiliary_loss_mlp": 0.01050983, + "balance_loss_clip": 1.04593945, + "balance_loss_mlp": 1.03424644, + "epoch": 0.29249962422967085, + "flos": 23584225322880.0, + "grad_norm": 1.5703024458168264, + "language_loss": 0.81861019, + "learning_rate": 3.319673491760429e-06, + "loss": 0.84014714, + "num_input_tokens_seen": 104914530, + "step": 4865, + "time_per_iteration": 2.762397527694702 + }, + { + "auxiliary_loss_clip": 0.0109576, + "auxiliary_loss_mlp": 0.01046651, + "balance_loss_clip": 1.05265307, + "balance_loss_mlp": 1.02924657, + "epoch": 0.2925597474823388, + "flos": 22273342473600.0, + "grad_norm": 2.2072447614425554, + "language_loss": 0.85522473, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.87664878, + "num_input_tokens_seen": 104933460, + "step": 4866, + "time_per_iteration": 2.8033764362335205 + }, + { + "auxiliary_loss_clip": 0.01110933, + "auxiliary_loss_mlp": 0.01039812, + "balance_loss_clip": 1.04811919, + "balance_loss_mlp": 1.02410054, + "epoch": 0.2926198707350068, + "flos": 34456108475520.0, + "grad_norm": 1.7213351696608077, + "language_loss": 0.75498515, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.7764926, + "num_input_tokens_seen": 104954495, + "step": 4867, + "time_per_iteration": 4.2950732707977295 + }, + { + "auxiliary_loss_clip": 0.01083116, + "auxiliary_loss_mlp": 0.01052463, + "balance_loss_clip": 1.04825687, + "balance_loss_mlp": 1.03576183, + "epoch": 0.29267999398767475, + "flos": 20704153944960.0, + "grad_norm": 1.9203033465249189, + "language_loss": 0.73236179, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.75371754, + "num_input_tokens_seen": 104971915, + "step": 4868, + "time_per_iteration": 2.775538921356201 + }, + { + "auxiliary_loss_clip": 0.01091396, + "auxiliary_loss_mlp": 0.01045538, + "balance_loss_clip": 1.04888034, + "balance_loss_mlp": 1.02836001, + "epoch": 0.2927401172403427, + "flos": 18368667642240.0, + "grad_norm": 1.663889887662616, + "language_loss": 0.74540651, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.76677585, + "num_input_tokens_seen": 104991335, + "step": 4869, + "time_per_iteration": 2.734683036804199 + }, + { + "auxiliary_loss_clip": 0.01116568, + "auxiliary_loss_mlp": 0.01040323, + "balance_loss_clip": 1.050179, + "balance_loss_mlp": 1.02405143, + "epoch": 0.2928002404930107, + "flos": 26104041244800.0, + "grad_norm": 1.5721867242720646, + "language_loss": 0.76492888, + "learning_rate": 3.318209641423088e-06, + "loss": 0.78649783, + "num_input_tokens_seen": 105012015, + "step": 4870, + "time_per_iteration": 4.413575649261475 + }, + { + "auxiliary_loss_clip": 0.01133789, + "auxiliary_loss_mlp": 0.0105055, + "balance_loss_clip": 1.05237079, + "balance_loss_mlp": 1.0328114, + "epoch": 0.29286036374567864, + "flos": 21324726241920.0, + "grad_norm": 2.0174334678237655, + "language_loss": 0.6773119, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.69915527, + "num_input_tokens_seen": 105031460, + "step": 4871, + "time_per_iteration": 2.68796706199646 + }, + { + "auxiliary_loss_clip": 0.01112736, + "auxiliary_loss_mlp": 0.01051475, + "balance_loss_clip": 1.04638386, + "balance_loss_mlp": 1.03515494, + "epoch": 0.2929204869983466, + "flos": 29569492569600.0, + "grad_norm": 4.945083241782643, + "language_loss": 0.77463269, + "learning_rate": 3.317623751303933e-06, + "loss": 0.79627478, + "num_input_tokens_seen": 105052965, + "step": 4872, + "time_per_iteration": 2.7679827213287354 + }, + { + "auxiliary_loss_clip": 0.01078644, + "auxiliary_loss_mlp": 0.01045822, + "balance_loss_clip": 1.0468123, + "balance_loss_mlp": 1.0273211, + "epoch": 0.2929806102510146, + "flos": 19058259922560.0, + "grad_norm": 1.9468785945114855, + "language_loss": 0.72814691, + "learning_rate": 3.317330731292164e-06, + "loss": 0.74939156, + "num_input_tokens_seen": 105071840, + "step": 4873, + "time_per_iteration": 2.8704919815063477 + }, + { + "auxiliary_loss_clip": 0.01135073, + "auxiliary_loss_mlp": 0.01044722, + "balance_loss_clip": 1.0525651, + "balance_loss_mlp": 1.02705503, + "epoch": 0.29304073350368254, + "flos": 21944221130880.0, + "grad_norm": 1.9420707280566882, + "language_loss": 0.78093398, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.80273187, + "num_input_tokens_seen": 105089445, + "step": 4874, + "time_per_iteration": 2.6573073863983154 + }, + { + "auxiliary_loss_clip": 0.01093774, + "auxiliary_loss_mlp": 0.01045077, + "balance_loss_clip": 1.05151463, + "balance_loss_mlp": 1.02790475, + "epoch": 0.2931008567563505, + "flos": 15450818135040.0, + "grad_norm": 1.8901262824755785, + "language_loss": 0.77336359, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.794752, + "num_input_tokens_seen": 105106210, + "step": 4875, + "time_per_iteration": 2.6960959434509277 + }, + { + "auxiliary_loss_clip": 0.01141436, + "auxiliary_loss_mlp": 0.01038673, + "balance_loss_clip": 1.05718327, + "balance_loss_mlp": 1.02218604, + "epoch": 0.29316098000901847, + "flos": 16983162288000.0, + "grad_norm": 1.556341262673854, + "language_loss": 0.69037539, + "learning_rate": 3.316451371581431e-06, + "loss": 0.71217644, + "num_input_tokens_seen": 105124200, + "step": 4876, + "time_per_iteration": 2.6719844341278076 + }, + { + "auxiliary_loss_clip": 0.01121768, + "auxiliary_loss_mlp": 0.01047732, + "balance_loss_clip": 1.04729414, + "balance_loss_mlp": 1.03105509, + "epoch": 0.29322110326168643, + "flos": 16357705741440.0, + "grad_norm": 2.0371531421747466, + "language_loss": 0.82111382, + "learning_rate": 3.316158151823096e-06, + "loss": 0.84280884, + "num_input_tokens_seen": 105140400, + "step": 4877, + "time_per_iteration": 2.632293462753296 + }, + { + "auxiliary_loss_clip": 0.01139233, + "auxiliary_loss_mlp": 0.01040634, + "balance_loss_clip": 1.05428672, + "balance_loss_mlp": 1.02392054, + "epoch": 0.29328122651435445, + "flos": 13990869843840.0, + "grad_norm": 3.614839551588232, + "language_loss": 0.67366385, + "learning_rate": 3.315864882155911e-06, + "loss": 0.69546252, + "num_input_tokens_seen": 105157535, + "step": 4878, + "time_per_iteration": 2.5839362144470215 + }, + { + "auxiliary_loss_clip": 0.01100237, + "auxiliary_loss_mlp": 0.01045253, + "balance_loss_clip": 1.04628241, + "balance_loss_mlp": 1.02817595, + "epoch": 0.2933413497670224, + "flos": 25264593423360.0, + "grad_norm": 2.0985622071445063, + "language_loss": 0.73632258, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.75777751, + "num_input_tokens_seen": 105175185, + "step": 4879, + "time_per_iteration": 2.738429307937622 + }, + { + "auxiliary_loss_clip": 0.01104776, + "auxiliary_loss_mlp": 0.00776504, + "balance_loss_clip": 1.05266857, + "balance_loss_mlp": 1.00116253, + "epoch": 0.2934014730196904, + "flos": 32123746656000.0, + "grad_norm": 1.8172867500477656, + "language_loss": 0.66441375, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.68322659, + "num_input_tokens_seen": 105194540, + "step": 4880, + "time_per_iteration": 2.7889339923858643 + }, + { + "auxiliary_loss_clip": 0.01130875, + "auxiliary_loss_mlp": 0.01049004, + "balance_loss_clip": 1.05021453, + "balance_loss_mlp": 1.03249359, + "epoch": 0.29346159627235835, + "flos": 24352498344960.0, + "grad_norm": 1.9971358437235982, + "language_loss": 0.70130688, + "learning_rate": 3.314984773812481e-06, + "loss": 0.72310567, + "num_input_tokens_seen": 105213215, + "step": 4881, + "time_per_iteration": 2.705906629562378 + }, + { + "auxiliary_loss_clip": 0.01112418, + "auxiliary_loss_mlp": 0.00775734, + "balance_loss_clip": 1.04823685, + "balance_loss_mlp": 1.00119698, + "epoch": 0.2935217195250263, + "flos": 22746752749440.0, + "grad_norm": 1.8949601379230998, + "language_loss": 0.83497417, + "learning_rate": 3.314691304621127e-06, + "loss": 0.85385573, + "num_input_tokens_seen": 105231585, + "step": 4882, + "time_per_iteration": 2.715853691101074 + }, + { + "auxiliary_loss_clip": 0.01148283, + "auxiliary_loss_mlp": 0.01045596, + "balance_loss_clip": 1.05350292, + "balance_loss_mlp": 1.02825117, + "epoch": 0.2935818427776943, + "flos": 21725561088000.0, + "grad_norm": 2.6750396503443827, + "language_loss": 0.71433568, + "learning_rate": 3.314397785576548e-06, + "loss": 0.73627448, + "num_input_tokens_seen": 105250120, + "step": 4883, + "time_per_iteration": 2.629642963409424 + }, + { + "auxiliary_loss_clip": 0.01123143, + "auxiliary_loss_mlp": 0.01040743, + "balance_loss_clip": 1.05262315, + "balance_loss_mlp": 1.0230521, + "epoch": 0.29364196603036224, + "flos": 23804968354560.0, + "grad_norm": 2.1262053984109226, + "language_loss": 0.92650437, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.94814324, + "num_input_tokens_seen": 105266065, + "step": 4884, + "time_per_iteration": 2.727379322052002 + }, + { + "auxiliary_loss_clip": 0.01138638, + "auxiliary_loss_mlp": 0.01039707, + "balance_loss_clip": 1.05512667, + "balance_loss_mlp": 1.0232085, + "epoch": 0.2937020892830302, + "flos": 23470064922240.0, + "grad_norm": 2.19754538449792, + "language_loss": 0.73535883, + "learning_rate": 3.313810597972234e-06, + "loss": 0.75714231, + "num_input_tokens_seen": 105282155, + "step": 4885, + "time_per_iteration": 2.706212043762207 + }, + { + "auxiliary_loss_clip": 0.01124089, + "auxiliary_loss_mlp": 0.01045234, + "balance_loss_clip": 1.04882109, + "balance_loss_mlp": 1.02791286, + "epoch": 0.2937622125356982, + "flos": 24272740195200.0, + "grad_norm": 2.8259058407064566, + "language_loss": 0.84815478, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.86984795, + "num_input_tokens_seen": 105299225, + "step": 4886, + "time_per_iteration": 2.651383876800537 + }, + { + "auxiliary_loss_clip": 0.01112051, + "auxiliary_loss_mlp": 0.01040147, + "balance_loss_clip": 1.04674077, + "balance_loss_mlp": 1.023839, + "epoch": 0.29382233578836614, + "flos": 20662461233280.0, + "grad_norm": 2.312079302728887, + "language_loss": 0.77030611, + "learning_rate": 3.313223211088603e-06, + "loss": 0.7918281, + "num_input_tokens_seen": 105315710, + "step": 4887, + "time_per_iteration": 2.8299317359924316 + }, + { + "auxiliary_loss_clip": 0.01121167, + "auxiliary_loss_mlp": 0.01044419, + "balance_loss_clip": 1.05137563, + "balance_loss_mlp": 1.02809978, + "epoch": 0.2938824590410341, + "flos": 16545052103040.0, + "grad_norm": 4.814706857660641, + "language_loss": 0.79822707, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.81988299, + "num_input_tokens_seen": 105333505, + "step": 4888, + "time_per_iteration": 2.6942543983459473 + }, + { + "auxiliary_loss_clip": 0.01114672, + "auxiliary_loss_mlp": 0.01035208, + "balance_loss_clip": 1.05101824, + "balance_loss_mlp": 1.01886487, + "epoch": 0.29394258229370207, + "flos": 37925474382720.0, + "grad_norm": 1.8060574020422921, + "language_loss": 0.55514884, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.57664764, + "num_input_tokens_seen": 105355605, + "step": 4889, + "time_per_iteration": 2.838529586791992 + }, + { + "auxiliary_loss_clip": 0.01136079, + "auxiliary_loss_mlp": 0.01040242, + "balance_loss_clip": 1.05230045, + "balance_loss_mlp": 1.02257514, + "epoch": 0.29400270554637004, + "flos": 20044690197120.0, + "grad_norm": 1.9006309093473746, + "language_loss": 0.84414017, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.86590338, + "num_input_tokens_seen": 105374225, + "step": 4890, + "time_per_iteration": 2.653601884841919 + }, + { + "auxiliary_loss_clip": 0.01138833, + "auxiliary_loss_mlp": 0.01044226, + "balance_loss_clip": 1.05449104, + "balance_loss_mlp": 1.02767992, + "epoch": 0.294062828799038, + "flos": 15266380775040.0, + "grad_norm": 2.3284792525221625, + "language_loss": 0.72417939, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.74600995, + "num_input_tokens_seen": 105391565, + "step": 4891, + "time_per_iteration": 2.6499764919281006 + }, + { + "auxiliary_loss_clip": 0.01148906, + "auxiliary_loss_mlp": 0.01046245, + "balance_loss_clip": 1.05517375, + "balance_loss_mlp": 1.02797008, + "epoch": 0.294122952051706, + "flos": 22747147799040.0, + "grad_norm": 1.6858898954482169, + "language_loss": 0.77310836, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.7950598, + "num_input_tokens_seen": 105409840, + "step": 4892, + "time_per_iteration": 2.6123669147491455 + }, + { + "auxiliary_loss_clip": 0.01143283, + "auxiliary_loss_mlp": 0.01036481, + "balance_loss_clip": 1.05147183, + "balance_loss_mlp": 1.01932704, + "epoch": 0.294183075304374, + "flos": 24972891073920.0, + "grad_norm": 1.8056938004749827, + "language_loss": 0.77826709, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.80006474, + "num_input_tokens_seen": 105428645, + "step": 4893, + "time_per_iteration": 2.6142194271087646 + }, + { + "auxiliary_loss_clip": 0.01106286, + "auxiliary_loss_mlp": 0.01045871, + "balance_loss_clip": 1.0508399, + "balance_loss_mlp": 1.02912164, + "epoch": 0.29424319855704195, + "flos": 30952986762240.0, + "grad_norm": 3.6552959609210944, + "language_loss": 0.85032988, + "learning_rate": 3.311165788957864e-06, + "loss": 0.87185144, + "num_input_tokens_seen": 105447480, + "step": 4894, + "time_per_iteration": 2.837883234024048 + }, + { + "auxiliary_loss_clip": 0.01131513, + "auxiliary_loss_mlp": 0.01038131, + "balance_loss_clip": 1.05098557, + "balance_loss_mlp": 1.02169216, + "epoch": 0.2943033218097099, + "flos": 15231583474560.0, + "grad_norm": 3.570255241204836, + "language_loss": 0.90650308, + "learning_rate": 3.310871672543274e-06, + "loss": 0.92819947, + "num_input_tokens_seen": 105464600, + "step": 4895, + "time_per_iteration": 2.588153839111328 + }, + { + "auxiliary_loss_clip": 0.01138224, + "auxiliary_loss_mlp": 0.01045554, + "balance_loss_clip": 1.05338621, + "balance_loss_mlp": 1.02777958, + "epoch": 0.2943634450623779, + "flos": 21725884310400.0, + "grad_norm": 1.7548452829513195, + "language_loss": 0.86612183, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.88795966, + "num_input_tokens_seen": 105481510, + "step": 4896, + "time_per_iteration": 2.6405279636383057 + }, + { + "auxiliary_loss_clip": 0.01142594, + "auxiliary_loss_mlp": 0.01053714, + "balance_loss_clip": 1.05662429, + "balance_loss_mlp": 1.03620195, + "epoch": 0.29442356831504585, + "flos": 22602104680320.0, + "grad_norm": 2.0549220420715906, + "language_loss": 0.73394442, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.75590742, + "num_input_tokens_seen": 105501390, + "step": 4897, + "time_per_iteration": 2.6669554710388184 + }, + { + "auxiliary_loss_clip": 0.01128563, + "auxiliary_loss_mlp": 0.01050668, + "balance_loss_clip": 1.04556203, + "balance_loss_mlp": 1.03214252, + "epoch": 0.2944836915677138, + "flos": 20011401267840.0, + "grad_norm": 2.0814872266581426, + "language_loss": 0.74344778, + "learning_rate": 3.309989025093813e-06, + "loss": 0.76524007, + "num_input_tokens_seen": 105519600, + "step": 4898, + "time_per_iteration": 2.6286890506744385 + }, + { + "auxiliary_loss_clip": 0.01140269, + "auxiliary_loss_mlp": 0.01047883, + "balance_loss_clip": 1.05775058, + "balance_loss_mlp": 1.02880955, + "epoch": 0.2945438148203818, + "flos": 20045875345920.0, + "grad_norm": 2.610474436320842, + "language_loss": 0.70560962, + "learning_rate": 3.309694709912618e-06, + "loss": 0.72749114, + "num_input_tokens_seen": 105535970, + "step": 4899, + "time_per_iteration": 2.6050777435302734 + }, + { + "auxiliary_loss_clip": 0.01122842, + "auxiliary_loss_mlp": 0.00775757, + "balance_loss_clip": 1.05115175, + "balance_loss_mlp": 1.00110114, + "epoch": 0.29460393807304974, + "flos": 23733542160000.0, + "grad_norm": 2.6981557529788587, + "language_loss": 0.78938496, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.80837095, + "num_input_tokens_seen": 105556735, + "step": 4900, + "time_per_iteration": 2.7517058849334717 + }, + { + "auxiliary_loss_clip": 0.0110429, + "auxiliary_loss_mlp": 0.01059395, + "balance_loss_clip": 1.04257679, + "balance_loss_mlp": 1.03992808, + "epoch": 0.2946640613257177, + "flos": 14976079056000.0, + "grad_norm": 1.7286923709762618, + "language_loss": 0.80861294, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.83024979, + "num_input_tokens_seen": 105574875, + "step": 4901, + "time_per_iteration": 2.58297061920166 + }, + { + "auxiliary_loss_clip": 0.01114064, + "auxiliary_loss_mlp": 0.01035256, + "balance_loss_clip": 1.05081403, + "balance_loss_mlp": 1.01993775, + "epoch": 0.2947241845783857, + "flos": 24243904552320.0, + "grad_norm": 2.2236242529025954, + "language_loss": 0.57768303, + "learning_rate": 3.308811466431157e-06, + "loss": 0.59917623, + "num_input_tokens_seen": 105594225, + "step": 4902, + "time_per_iteration": 2.6765553951263428 + }, + { + "auxiliary_loss_clip": 0.01122886, + "auxiliary_loss_mlp": 0.01044406, + "balance_loss_clip": 1.05165744, + "balance_loss_mlp": 1.02809834, + "epoch": 0.29478430783105364, + "flos": 19938394874880.0, + "grad_norm": 1.6365628527843905, + "language_loss": 0.7553789, + "learning_rate": 3.308516952661925e-06, + "loss": 0.77705181, + "num_input_tokens_seen": 105614000, + "step": 4903, + "time_per_iteration": 5.72201132774353 + }, + { + "auxiliary_loss_clip": 0.01117125, + "auxiliary_loss_mlp": 0.01054328, + "balance_loss_clip": 1.05058551, + "balance_loss_mlp": 1.03506362, + "epoch": 0.2948444310837216, + "flos": 27381347856000.0, + "grad_norm": 1.79479894391178, + "language_loss": 0.62782186, + "learning_rate": 3.3082223892736e-06, + "loss": 0.64953631, + "num_input_tokens_seen": 105634575, + "step": 4904, + "time_per_iteration": 2.7290875911712646 + }, + { + "auxiliary_loss_clip": 0.01135143, + "auxiliary_loss_mlp": 0.01043669, + "balance_loss_clip": 1.05146813, + "balance_loss_mlp": 1.02669382, + "epoch": 0.2949045543363896, + "flos": 23405462311680.0, + "grad_norm": 1.4755442774564356, + "language_loss": 0.73145443, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.75324261, + "num_input_tokens_seen": 105654385, + "step": 4905, + "time_per_iteration": 2.6482555866241455 + }, + { + "auxiliary_loss_clip": 0.01112476, + "auxiliary_loss_mlp": 0.01046266, + "balance_loss_clip": 1.05017638, + "balance_loss_mlp": 1.028265, + "epoch": 0.2949646775890576, + "flos": 23951483930880.0, + "grad_norm": 1.7800977730713317, + "language_loss": 0.8199898, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.84157723, + "num_input_tokens_seen": 105673570, + "step": 4906, + "time_per_iteration": 2.737182378768921 + }, + { + "auxiliary_loss_clip": 0.01094663, + "auxiliary_loss_mlp": 0.01040505, + "balance_loss_clip": 1.04579425, + "balance_loss_mlp": 1.02372003, + "epoch": 0.29502480084172555, + "flos": 22784315397120.0, + "grad_norm": 2.8763815934933867, + "language_loss": 0.87373984, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.89509153, + "num_input_tokens_seen": 105691940, + "step": 4907, + "time_per_iteration": 4.367825746536255 + }, + { + "auxiliary_loss_clip": 0.01149393, + "auxiliary_loss_mlp": 0.01043671, + "balance_loss_clip": 1.05400407, + "balance_loss_mlp": 1.02501488, + "epoch": 0.2950849240943935, + "flos": 19646656611840.0, + "grad_norm": 2.047818146937445, + "language_loss": 0.81910521, + "learning_rate": 3.307043639752782e-06, + "loss": 0.84103584, + "num_input_tokens_seen": 105709825, + "step": 4908, + "time_per_iteration": 2.578582525253296 + }, + { + "auxiliary_loss_clip": 0.01055582, + "auxiliary_loss_mlp": 0.01003419, + "balance_loss_clip": 1.02453518, + "balance_loss_mlp": 1.00138056, + "epoch": 0.2951450473470615, + "flos": 71002829260800.0, + "grad_norm": 0.7982723827999523, + "language_loss": 0.57287854, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.59346855, + "num_input_tokens_seen": 105766880, + "step": 4909, + "time_per_iteration": 4.640491247177124 + }, + { + "auxiliary_loss_clip": 0.01135445, + "auxiliary_loss_mlp": 0.00774301, + "balance_loss_clip": 1.05580318, + "balance_loss_mlp": 1.00097156, + "epoch": 0.29520517059972945, + "flos": 22966310632320.0, + "grad_norm": 1.756295161453336, + "language_loss": 0.87018639, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.88928384, + "num_input_tokens_seen": 105786875, + "step": 4910, + "time_per_iteration": 2.642312526702881 + }, + { + "auxiliary_loss_clip": 0.01131096, + "auxiliary_loss_mlp": 0.0104303, + "balance_loss_clip": 1.05359542, + "balance_loss_mlp": 1.02744913, + "epoch": 0.2952652938523974, + "flos": 20485673470080.0, + "grad_norm": 1.692596753939278, + "language_loss": 0.73332304, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.75506431, + "num_input_tokens_seen": 105805315, + "step": 4911, + "time_per_iteration": 2.6130573749542236 + }, + { + "auxiliary_loss_clip": 0.01132917, + "auxiliary_loss_mlp": 0.01038473, + "balance_loss_clip": 1.05330253, + "balance_loss_mlp": 1.02193832, + "epoch": 0.2953254171050654, + "flos": 19646584784640.0, + "grad_norm": 1.8009313294920104, + "language_loss": 0.89653587, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.91824973, + "num_input_tokens_seen": 105825125, + "step": 4912, + "time_per_iteration": 2.660090684890747 + }, + { + "auxiliary_loss_clip": 0.01114053, + "auxiliary_loss_mlp": 0.010529, + "balance_loss_clip": 1.0482899, + "balance_loss_mlp": 1.03503084, + "epoch": 0.29538554035773334, + "flos": 22747973811840.0, + "grad_norm": 1.3579869674800176, + "language_loss": 0.83175462, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.85342413, + "num_input_tokens_seen": 105846085, + "step": 4913, + "time_per_iteration": 2.743364095687866 + }, + { + "auxiliary_loss_clip": 0.01142468, + "auxiliary_loss_mlp": 0.01043093, + "balance_loss_clip": 1.04977608, + "balance_loss_mlp": 1.02690446, + "epoch": 0.2954456636104013, + "flos": 21871861182720.0, + "grad_norm": 1.9704695859403116, + "language_loss": 0.76919919, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.79105484, + "num_input_tokens_seen": 105865400, + "step": 4914, + "time_per_iteration": 2.6778385639190674 + }, + { + "auxiliary_loss_clip": 0.01121315, + "auxiliary_loss_mlp": 0.01045386, + "balance_loss_clip": 1.05064511, + "balance_loss_mlp": 1.02818418, + "epoch": 0.2955057868630693, + "flos": 40442560871040.0, + "grad_norm": 1.678810736285401, + "language_loss": 0.81829619, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.8399632, + "num_input_tokens_seen": 105887920, + "step": 4915, + "time_per_iteration": 2.9347212314605713 + }, + { + "auxiliary_loss_clip": 0.01068117, + "auxiliary_loss_mlp": 0.01044435, + "balance_loss_clip": 1.04405856, + "balance_loss_mlp": 1.02722168, + "epoch": 0.29556591011573724, + "flos": 22564506119040.0, + "grad_norm": 2.129336551193515, + "language_loss": 0.84701812, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.86814368, + "num_input_tokens_seen": 105904035, + "step": 4916, + "time_per_iteration": 2.9183273315429688 + }, + { + "auxiliary_loss_clip": 0.01125851, + "auxiliary_loss_mlp": 0.01036694, + "balance_loss_clip": 1.04655123, + "balance_loss_mlp": 1.01975429, + "epoch": 0.2956260333684052, + "flos": 22089300163200.0, + "grad_norm": 2.1082729468541683, + "language_loss": 0.69490808, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.71653348, + "num_input_tokens_seen": 105922685, + "step": 4917, + "time_per_iteration": 2.7400357723236084 + }, + { + "auxiliary_loss_clip": 0.01123659, + "auxiliary_loss_mlp": 0.01038633, + "balance_loss_clip": 1.05140972, + "balance_loss_mlp": 1.02214622, + "epoch": 0.2956861566210732, + "flos": 16435488643200.0, + "grad_norm": 2.699189623646437, + "language_loss": 0.91076934, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.93239224, + "num_input_tokens_seen": 105940425, + "step": 4918, + "time_per_iteration": 2.7343270778656006 + }, + { + "auxiliary_loss_clip": 0.01147937, + "auxiliary_loss_mlp": 0.01043258, + "balance_loss_clip": 1.0551039, + "balance_loss_mlp": 1.02629495, + "epoch": 0.2957462798737412, + "flos": 25812087500160.0, + "grad_norm": 1.9388581576792214, + "language_loss": 0.72399175, + "learning_rate": 3.303797991757425e-06, + "loss": 0.74590373, + "num_input_tokens_seen": 105960550, + "step": 4919, + "time_per_iteration": 2.718583822250366 + }, + { + "auxiliary_loss_clip": 0.01119627, + "auxiliary_loss_mlp": 0.01045651, + "balance_loss_clip": 1.04843163, + "balance_loss_mlp": 1.02838945, + "epoch": 0.29580640312640916, + "flos": 16690849407360.0, + "grad_norm": 1.8826298231205452, + "language_loss": 0.75919485, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.78084767, + "num_input_tokens_seen": 105978820, + "step": 4920, + "time_per_iteration": 2.7425734996795654 + }, + { + "auxiliary_loss_clip": 0.01121739, + "auxiliary_loss_mlp": 0.01052293, + "balance_loss_clip": 1.05511427, + "balance_loss_mlp": 1.03449547, + "epoch": 0.2958665263790771, + "flos": 23945594100480.0, + "grad_norm": 5.307541834842734, + "language_loss": 0.69020098, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.71194124, + "num_input_tokens_seen": 105997545, + "step": 4921, + "time_per_iteration": 2.7755305767059326 + }, + { + "auxiliary_loss_clip": 0.01120164, + "auxiliary_loss_mlp": 0.01043, + "balance_loss_clip": 1.05075121, + "balance_loss_mlp": 1.02453458, + "epoch": 0.2959266496317451, + "flos": 18478410670080.0, + "grad_norm": 1.8488664920888758, + "language_loss": 0.7462194, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.767851, + "num_input_tokens_seen": 106015320, + "step": 4922, + "time_per_iteration": 2.740687131881714 + }, + { + "auxiliary_loss_clip": 0.01152013, + "auxiliary_loss_mlp": 0.00775382, + "balance_loss_clip": 1.05429566, + "balance_loss_mlp": 1.00129843, + "epoch": 0.29598677288441305, + "flos": 25957489754880.0, + "grad_norm": 1.7662799143188246, + "language_loss": 0.77148855, + "learning_rate": 3.302616272134737e-06, + "loss": 0.79076254, + "num_input_tokens_seen": 106034555, + "step": 4923, + "time_per_iteration": 2.664875030517578 + }, + { + "auxiliary_loss_clip": 0.01117655, + "auxiliary_loss_mlp": 0.01042537, + "balance_loss_clip": 1.05065989, + "balance_loss_mlp": 1.0247035, + "epoch": 0.296046896137081, + "flos": 25155999630720.0, + "grad_norm": 1.7775190737024398, + "language_loss": 0.86232758, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.88392955, + "num_input_tokens_seen": 106054200, + "step": 4924, + "time_per_iteration": 2.7413501739501953 + }, + { + "auxiliary_loss_clip": 0.01132544, + "auxiliary_loss_mlp": 0.01038356, + "balance_loss_clip": 1.05098939, + "balance_loss_mlp": 1.02114248, + "epoch": 0.296107019389749, + "flos": 21761148487680.0, + "grad_norm": 1.479657736715748, + "language_loss": 0.82050943, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.84221852, + "num_input_tokens_seen": 106074700, + "step": 4925, + "time_per_iteration": 2.676556348800659 + }, + { + "auxiliary_loss_clip": 0.01078547, + "auxiliary_loss_mlp": 0.01051683, + "balance_loss_clip": 1.04153097, + "balance_loss_mlp": 1.03283572, + "epoch": 0.29616714264241695, + "flos": 17960039544960.0, + "grad_norm": 2.5440905583969697, + "language_loss": 0.86138272, + "learning_rate": 3.301729463727452e-06, + "loss": 0.88268495, + "num_input_tokens_seen": 106091415, + "step": 4926, + "time_per_iteration": 2.675780773162842 + }, + { + "auxiliary_loss_clip": 0.01108502, + "auxiliary_loss_mlp": 0.01035423, + "balance_loss_clip": 1.04910469, + "balance_loss_mlp": 1.0193243, + "epoch": 0.2962272658950849, + "flos": 15012779777280.0, + "grad_norm": 2.332235960138756, + "language_loss": 0.85897464, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.88041389, + "num_input_tokens_seen": 106109135, + "step": 4927, + "time_per_iteration": 2.7407169342041016 + }, + { + "auxiliary_loss_clip": 0.01131541, + "auxiliary_loss_mlp": 0.01039363, + "balance_loss_clip": 1.05158448, + "balance_loss_mlp": 1.02312613, + "epoch": 0.2962873891477529, + "flos": 14720861946240.0, + "grad_norm": 3.581765820174834, + "language_loss": 0.80772752, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.8294366, + "num_input_tokens_seen": 106125750, + "step": 4928, + "time_per_iteration": 2.6719777584075928 + }, + { + "auxiliary_loss_clip": 0.01123889, + "auxiliary_loss_mlp": 0.01043191, + "balance_loss_clip": 1.04852009, + "balance_loss_mlp": 1.02346206, + "epoch": 0.29634751240042084, + "flos": 26723787528960.0, + "grad_norm": 2.79065826833615, + "language_loss": 0.7313869, + "learning_rate": 3.300842211064773e-06, + "loss": 0.75305772, + "num_input_tokens_seen": 106142835, + "step": 4929, + "time_per_iteration": 2.75266695022583 + }, + { + "auxiliary_loss_clip": 0.0112132, + "auxiliary_loss_mlp": 0.01054118, + "balance_loss_clip": 1.0495156, + "balance_loss_mlp": 1.03481805, + "epoch": 0.2964076356530888, + "flos": 14571293713920.0, + "grad_norm": 2.360375509218164, + "language_loss": 0.71534413, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.73709846, + "num_input_tokens_seen": 106160680, + "step": 4930, + "time_per_iteration": 2.799149990081787 + }, + { + "auxiliary_loss_clip": 0.01028509, + "auxiliary_loss_mlp": 0.01003992, + "balance_loss_clip": 1.03094876, + "balance_loss_mlp": 1.00229919, + "epoch": 0.29646775890575683, + "flos": 63104315063040.0, + "grad_norm": 0.8053244370028285, + "language_loss": 0.6061247, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.6264497, + "num_input_tokens_seen": 106224415, + "step": 4931, + "time_per_iteration": 3.218900442123413 + }, + { + "auxiliary_loss_clip": 0.01007041, + "auxiliary_loss_mlp": 0.01005936, + "balance_loss_clip": 1.02247667, + "balance_loss_mlp": 1.00395727, + "epoch": 0.2965278821584248, + "flos": 63067686168960.0, + "grad_norm": 0.7408573754586586, + "language_loss": 0.52380091, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54393071, + "num_input_tokens_seen": 106279140, + "step": 4932, + "time_per_iteration": 3.26432728767395 + }, + { + "auxiliary_loss_clip": 0.01129633, + "auxiliary_loss_mlp": 0.01042438, + "balance_loss_clip": 1.04917526, + "balance_loss_mlp": 1.02584457, + "epoch": 0.29658800541109276, + "flos": 23768734510080.0, + "grad_norm": 2.012094119717185, + "language_loss": 0.81540775, + "learning_rate": 3.299658516973972e-06, + "loss": 0.83712846, + "num_input_tokens_seen": 106298190, + "step": 4933, + "time_per_iteration": 2.804293155670166 + }, + { + "auxiliary_loss_clip": 0.01092845, + "auxiliary_loss_mlp": 0.01036901, + "balance_loss_clip": 1.04405773, + "balance_loss_mlp": 1.01966333, + "epoch": 0.2966481286637607, + "flos": 23988543788160.0, + "grad_norm": 1.916542141573101, + "language_loss": 0.75165296, + "learning_rate": 3.299362470215261e-06, + "loss": 0.77295041, + "num_input_tokens_seen": 106319065, + "step": 4934, + "time_per_iteration": 2.797697067260742 + }, + { + "auxiliary_loss_clip": 0.01126398, + "auxiliary_loss_mlp": 0.01047716, + "balance_loss_clip": 1.04985118, + "balance_loss_mlp": 1.03013301, + "epoch": 0.2967082519164287, + "flos": 17165157523200.0, + "grad_norm": 1.8491505675561635, + "language_loss": 0.62093496, + "learning_rate": 3.299066374184594e-06, + "loss": 0.64267612, + "num_input_tokens_seen": 106338040, + "step": 4935, + "time_per_iteration": 2.6466407775878906 + }, + { + "auxiliary_loss_clip": 0.01129018, + "auxiliary_loss_mlp": 0.01041652, + "balance_loss_clip": 1.05052114, + "balance_loss_mlp": 1.02452123, + "epoch": 0.29676837516909665, + "flos": 29387712816000.0, + "grad_norm": 1.4269626202910053, + "language_loss": 0.79485404, + "learning_rate": 3.2987702288932e-06, + "loss": 0.81656075, + "num_input_tokens_seen": 106358900, + "step": 4936, + "time_per_iteration": 2.7333009243011475 + }, + { + "auxiliary_loss_clip": 0.01100808, + "auxiliary_loss_mlp": 0.01048756, + "balance_loss_clip": 1.04970682, + "balance_loss_mlp": 1.03040934, + "epoch": 0.2968284984217646, + "flos": 34751222616960.0, + "grad_norm": 1.5951903019521643, + "language_loss": 0.73993498, + "learning_rate": 3.298474034352309e-06, + "loss": 0.76143062, + "num_input_tokens_seen": 106381805, + "step": 4937, + "time_per_iteration": 2.853935718536377 + }, + { + "auxiliary_loss_clip": 0.01094789, + "auxiliary_loss_mlp": 0.01038743, + "balance_loss_clip": 1.05060768, + "balance_loss_mlp": 1.0209924, + "epoch": 0.2968886216744326, + "flos": 21544104556800.0, + "grad_norm": 1.654578873057457, + "language_loss": 0.78373563, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.80507094, + "num_input_tokens_seen": 106402365, + "step": 4938, + "time_per_iteration": 2.803147077560425 + }, + { + "auxiliary_loss_clip": 0.0111878, + "auxiliary_loss_mlp": 0.01048023, + "balance_loss_clip": 1.05193913, + "balance_loss_mlp": 1.02931857, + "epoch": 0.29694874492710055, + "flos": 12787323811200.0, + "grad_norm": 2.4827377035181013, + "language_loss": 0.76842266, + "learning_rate": 3.297881497566964e-06, + "loss": 0.79009068, + "num_input_tokens_seen": 106419800, + "step": 4939, + "time_per_iteration": 2.8867270946502686 + }, + { + "auxiliary_loss_clip": 0.0111051, + "auxiliary_loss_mlp": 0.01041172, + "balance_loss_clip": 1.04666841, + "balance_loss_mlp": 1.02361226, + "epoch": 0.2970088681797685, + "flos": 24569973239040.0, + "grad_norm": 1.8055035581570296, + "language_loss": 0.78354549, + "learning_rate": 3.297585155344979e-06, + "loss": 0.80506229, + "num_input_tokens_seen": 106440300, + "step": 4940, + "time_per_iteration": 2.783046245574951 + }, + { + "auxiliary_loss_clip": 0.01117762, + "auxiliary_loss_mlp": 0.01037936, + "balance_loss_clip": 1.0486958, + "balance_loss_mlp": 1.01876736, + "epoch": 0.2970689914324365, + "flos": 23659171050240.0, + "grad_norm": 1.6305550110852276, + "language_loss": 0.75628781, + "learning_rate": 3.297288763918435e-06, + "loss": 0.77784479, + "num_input_tokens_seen": 106460035, + "step": 4941, + "time_per_iteration": 2.74379825592041 + }, + { + "auxiliary_loss_clip": 0.01138083, + "auxiliary_loss_mlp": 0.01051629, + "balance_loss_clip": 1.05272233, + "balance_loss_mlp": 1.03276968, + "epoch": 0.29712911468510445, + "flos": 39670301439360.0, + "grad_norm": 2.3053326725865313, + "language_loss": 0.74158287, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.76347995, + "num_input_tokens_seen": 106481095, + "step": 4942, + "time_per_iteration": 4.468350410461426 + }, + { + "auxiliary_loss_clip": 0.01111068, + "auxiliary_loss_mlp": 0.0104429, + "balance_loss_clip": 1.05172181, + "balance_loss_mlp": 1.02589595, + "epoch": 0.2971892379377724, + "flos": 26395312631040.0, + "grad_norm": 2.42728921351593, + "language_loss": 0.702492, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.72404563, + "num_input_tokens_seen": 106501590, + "step": 4943, + "time_per_iteration": 4.2555251121521 + }, + { + "auxiliary_loss_clip": 0.01124177, + "auxiliary_loss_mlp": 0.01041442, + "balance_loss_clip": 1.04988825, + "balance_loss_mlp": 1.02360821, + "epoch": 0.2972493611904404, + "flos": 17603195880960.0, + "grad_norm": 2.221197725988377, + "language_loss": 0.795506, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.81716216, + "num_input_tokens_seen": 106519430, + "step": 4944, + "time_per_iteration": 2.6572201251983643 + }, + { + "auxiliary_loss_clip": 0.0111705, + "auxiliary_loss_mlp": 0.01041351, + "balance_loss_clip": 1.04914248, + "balance_loss_mlp": 1.02521038, + "epoch": 0.2973094844431084, + "flos": 20412774817920.0, + "grad_norm": 2.187472317578873, + "language_loss": 0.83260202, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.85418606, + "num_input_tokens_seen": 106535870, + "step": 4945, + "time_per_iteration": 2.6700363159179688 + }, + { + "auxiliary_loss_clip": 0.01090371, + "auxiliary_loss_mlp": 0.01039575, + "balance_loss_clip": 1.04623246, + "balance_loss_mlp": 1.02256417, + "epoch": 0.29736960769577636, + "flos": 17493488766720.0, + "grad_norm": 1.8830005833778707, + "language_loss": 0.67067397, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.69197345, + "num_input_tokens_seen": 106553560, + "step": 4946, + "time_per_iteration": 4.29357385635376 + }, + { + "auxiliary_loss_clip": 0.01127819, + "auxiliary_loss_mlp": 0.00777134, + "balance_loss_clip": 1.04997563, + "balance_loss_mlp": 1.00115252, + "epoch": 0.2974297309484443, + "flos": 26103969417600.0, + "grad_norm": 1.879721590970614, + "language_loss": 0.73877805, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.75782764, + "num_input_tokens_seen": 106574115, + "step": 4947, + "time_per_iteration": 2.657038450241089 + }, + { + "auxiliary_loss_clip": 0.01109701, + "auxiliary_loss_mlp": 0.01045546, + "balance_loss_clip": 1.04896843, + "balance_loss_mlp": 1.02705622, + "epoch": 0.2974898542011123, + "flos": 25666433850240.0, + "grad_norm": 2.0989098852090633, + "language_loss": 0.73522758, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.75678003, + "num_input_tokens_seen": 106593070, + "step": 4948, + "time_per_iteration": 4.4359636306762695 + }, + { + "auxiliary_loss_clip": 0.01139863, + "auxiliary_loss_mlp": 0.01040301, + "balance_loss_clip": 1.04885721, + "balance_loss_mlp": 1.02332592, + "epoch": 0.29754997745378026, + "flos": 18661339658880.0, + "grad_norm": 2.06615582769113, + "language_loss": 0.8397494, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.86155105, + "num_input_tokens_seen": 106610695, + "step": 4949, + "time_per_iteration": 2.6052157878875732 + }, + { + "auxiliary_loss_clip": 0.01128522, + "auxiliary_loss_mlp": 0.01041578, + "balance_loss_clip": 1.04901218, + "balance_loss_mlp": 1.02459633, + "epoch": 0.2976101007064482, + "flos": 22274599449600.0, + "grad_norm": 2.2184783420455814, + "language_loss": 0.71360326, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.73530424, + "num_input_tokens_seen": 106631300, + "step": 4950, + "time_per_iteration": 2.678953170776367 + }, + { + "auxiliary_loss_clip": 0.01095366, + "auxiliary_loss_mlp": 0.01039981, + "balance_loss_clip": 1.04944646, + "balance_loss_mlp": 1.0239712, + "epoch": 0.2976702239591162, + "flos": 21945657674880.0, + "grad_norm": 3.098719098855731, + "language_loss": 0.82645297, + "learning_rate": 3.294322145875789e-06, + "loss": 0.84780639, + "num_input_tokens_seen": 106650065, + "step": 4951, + "time_per_iteration": 2.7566003799438477 + }, + { + "auxiliary_loss_clip": 0.01118264, + "auxiliary_loss_mlp": 0.01039186, + "balance_loss_clip": 1.04655933, + "balance_loss_mlp": 1.02190065, + "epoch": 0.29773034721178415, + "flos": 24637197542400.0, + "grad_norm": 15.690000260498868, + "language_loss": 0.74144769, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.76302218, + "num_input_tokens_seen": 106668230, + "step": 4952, + "time_per_iteration": 2.7019882202148438 + }, + { + "auxiliary_loss_clip": 0.01063128, + "auxiliary_loss_mlp": 0.01049349, + "balance_loss_clip": 1.0433315, + "balance_loss_mlp": 1.03133702, + "epoch": 0.2977904704644521, + "flos": 20557566541440.0, + "grad_norm": 1.6701113978494808, + "language_loss": 0.84251344, + "learning_rate": 3.293728232937228e-06, + "loss": 0.86363828, + "num_input_tokens_seen": 106687785, + "step": 4953, + "time_per_iteration": 2.9622793197631836 + }, + { + "auxiliary_loss_clip": 0.01120636, + "auxiliary_loss_mlp": 0.01040588, + "balance_loss_clip": 1.04966831, + "balance_loss_mlp": 1.02428031, + "epoch": 0.2978505937171201, + "flos": 18916449027840.0, + "grad_norm": 2.301918041259246, + "language_loss": 0.74366152, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.76527375, + "num_input_tokens_seen": 106706875, + "step": 4954, + "time_per_iteration": 2.767455577850342 + }, + { + "auxiliary_loss_clip": 0.01138563, + "auxiliary_loss_mlp": 0.01036281, + "balance_loss_clip": 1.04899216, + "balance_loss_mlp": 1.02028275, + "epoch": 0.29791071696978805, + "flos": 19317750750720.0, + "grad_norm": 2.0603039788066155, + "language_loss": 0.75687683, + "learning_rate": 3.293134123765452e-06, + "loss": 0.77862525, + "num_input_tokens_seen": 106725105, + "step": 4955, + "time_per_iteration": 2.638389825820923 + }, + { + "auxiliary_loss_clip": 0.01094257, + "auxiliary_loss_mlp": 0.01042355, + "balance_loss_clip": 1.04760742, + "balance_loss_mlp": 1.02505171, + "epoch": 0.297970840222456, + "flos": 18806813740800.0, + "grad_norm": 2.358195616275362, + "language_loss": 0.72600436, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.74737054, + "num_input_tokens_seen": 106744780, + "step": 4956, + "time_per_iteration": 2.777873992919922 + }, + { + "auxiliary_loss_clip": 0.01134603, + "auxiliary_loss_mlp": 0.0104754, + "balance_loss_clip": 1.04957581, + "balance_loss_mlp": 1.02930105, + "epoch": 0.298030963475124, + "flos": 22852760762880.0, + "grad_norm": 2.0297274127598435, + "language_loss": 0.79068756, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.81250894, + "num_input_tokens_seen": 106764670, + "step": 4957, + "time_per_iteration": 2.719581365585327 + }, + { + "auxiliary_loss_clip": 0.01134843, + "auxiliary_loss_mlp": 0.01041974, + "balance_loss_clip": 1.05054235, + "balance_loss_mlp": 1.02383018, + "epoch": 0.298091086727792, + "flos": 21868485304320.0, + "grad_norm": 1.706880580606115, + "language_loss": 0.70570725, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.7274754, + "num_input_tokens_seen": 106783695, + "step": 4958, + "time_per_iteration": 2.613697052001953 + }, + { + "auxiliary_loss_clip": 0.01108077, + "auxiliary_loss_mlp": 0.0104267, + "balance_loss_clip": 1.05166888, + "balance_loss_mlp": 1.0253129, + "epoch": 0.29815120998045996, + "flos": 21175014355200.0, + "grad_norm": 1.5383051389102413, + "language_loss": 0.78736448, + "learning_rate": 3.291945317082743e-06, + "loss": 0.80887192, + "num_input_tokens_seen": 106803150, + "step": 4959, + "time_per_iteration": 2.751455545425415 + }, + { + "auxiliary_loss_clip": 0.01129828, + "auxiliary_loss_mlp": 0.01045919, + "balance_loss_clip": 1.04906321, + "balance_loss_mlp": 1.0290029, + "epoch": 0.29821133323312793, + "flos": 19896271200000.0, + "grad_norm": 1.6624120752671379, + "language_loss": 0.79747117, + "learning_rate": 3.291647992907147e-06, + "loss": 0.81922865, + "num_input_tokens_seen": 106820705, + "step": 4960, + "time_per_iteration": 2.6345505714416504 + }, + { + "auxiliary_loss_clip": 0.01110987, + "auxiliary_loss_mlp": 0.01052912, + "balance_loss_clip": 1.04863763, + "balance_loss_mlp": 1.03449416, + "epoch": 0.2982714564857959, + "flos": 12750766744320.0, + "grad_norm": 2.376132196895137, + "language_loss": 0.73364639, + "learning_rate": 3.291350619752129e-06, + "loss": 0.75528538, + "num_input_tokens_seen": 106837335, + "step": 4961, + "time_per_iteration": 2.725008010864258 + }, + { + "auxiliary_loss_clip": 0.01130001, + "auxiliary_loss_mlp": 0.0104294, + "balance_loss_clip": 1.04824948, + "balance_loss_mlp": 1.02640533, + "epoch": 0.29833157973846386, + "flos": 22271905929600.0, + "grad_norm": 2.036560430862295, + "language_loss": 0.62106621, + "learning_rate": 3.291053197628967e-06, + "loss": 0.64279556, + "num_input_tokens_seen": 106856250, + "step": 4962, + "time_per_iteration": 2.690870523452759 + }, + { + "auxiliary_loss_clip": 0.01128362, + "auxiliary_loss_mlp": 0.01051341, + "balance_loss_clip": 1.05034256, + "balance_loss_mlp": 1.03310251, + "epoch": 0.2983917029911318, + "flos": 15372999319680.0, + "grad_norm": 2.046461333274312, + "language_loss": 0.82866591, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.85046291, + "num_input_tokens_seen": 106873370, + "step": 4963, + "time_per_iteration": 2.637723207473755 + }, + { + "auxiliary_loss_clip": 0.01112844, + "auxiliary_loss_mlp": 0.01044675, + "balance_loss_clip": 1.05338502, + "balance_loss_mlp": 1.0272826, + "epoch": 0.2984518262437998, + "flos": 15377632174080.0, + "grad_norm": 2.580714695656121, + "language_loss": 0.65933317, + "learning_rate": 3.290458206523322e-06, + "loss": 0.68090838, + "num_input_tokens_seen": 106890330, + "step": 4964, + "time_per_iteration": 2.7210114002227783 + }, + { + "auxiliary_loss_clip": 0.01128428, + "auxiliary_loss_mlp": 0.01039216, + "balance_loss_clip": 1.04990005, + "balance_loss_mlp": 1.02345669, + "epoch": 0.29851194949646775, + "flos": 18108458542080.0, + "grad_norm": 1.8191471944851214, + "language_loss": 0.71093529, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.73261172, + "num_input_tokens_seen": 106909190, + "step": 4965, + "time_per_iteration": 2.7070064544677734 + }, + { + "auxiliary_loss_clip": 0.01151396, + "auxiliary_loss_mlp": 0.01056357, + "balance_loss_clip": 1.05813003, + "balance_loss_mlp": 1.03827357, + "epoch": 0.2985720727491357, + "flos": 22018233104640.0, + "grad_norm": 2.164601494744612, + "language_loss": 0.65952027, + "learning_rate": 3.289863019680461e-06, + "loss": 0.68159783, + "num_input_tokens_seen": 106927825, + "step": 4966, + "time_per_iteration": 2.5820860862731934 + }, + { + "auxiliary_loss_clip": 0.01148496, + "auxiliary_loss_mlp": 0.01042183, + "balance_loss_clip": 1.05610132, + "balance_loss_mlp": 1.02496934, + "epoch": 0.2986321960018037, + "flos": 13041355772160.0, + "grad_norm": 5.631297794621363, + "language_loss": 0.73553479, + "learning_rate": 3.289565352885785e-06, + "loss": 0.75744158, + "num_input_tokens_seen": 106943155, + "step": 4967, + "time_per_iteration": 2.558378219604492 + }, + { + "auxiliary_loss_clip": 0.01110231, + "auxiliary_loss_mlp": 0.01041561, + "balance_loss_clip": 1.04339898, + "balance_loss_mlp": 1.02440643, + "epoch": 0.29869231925447165, + "flos": 14465034305280.0, + "grad_norm": 2.07351823246568, + "language_loss": 0.71246195, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.73397982, + "num_input_tokens_seen": 106960295, + "step": 4968, + "time_per_iteration": 2.663163900375366 + }, + { + "auxiliary_loss_clip": 0.01124763, + "auxiliary_loss_mlp": 0.01043588, + "balance_loss_clip": 1.04864979, + "balance_loss_mlp": 1.02545607, + "epoch": 0.2987524425071396, + "flos": 31650228639360.0, + "grad_norm": 2.159507035183752, + "language_loss": 0.76744419, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.78912771, + "num_input_tokens_seen": 106982870, + "step": 4969, + "time_per_iteration": 2.729922294616699 + }, + { + "auxiliary_loss_clip": 0.0114364, + "auxiliary_loss_mlp": 0.01036255, + "balance_loss_clip": 1.05239987, + "balance_loss_mlp": 1.02054322, + "epoch": 0.2988125657598076, + "flos": 21433427775360.0, + "grad_norm": 2.2724385668179936, + "language_loss": 0.69836891, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.72016788, + "num_input_tokens_seen": 107002405, + "step": 4970, + "time_per_iteration": 2.6299381256103516 + }, + { + "auxiliary_loss_clip": 0.01135061, + "auxiliary_loss_mlp": 0.01048009, + "balance_loss_clip": 1.05199289, + "balance_loss_mlp": 1.02973413, + "epoch": 0.2988726890124756, + "flos": 18076965292800.0, + "grad_norm": 2.0648779209654258, + "language_loss": 0.85228848, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.87411916, + "num_input_tokens_seen": 107017310, + "step": 4971, + "time_per_iteration": 2.6508536338806152 + }, + { + "auxiliary_loss_clip": 0.01112297, + "auxiliary_loss_mlp": 0.01054091, + "balance_loss_clip": 1.04895663, + "balance_loss_mlp": 1.03510106, + "epoch": 0.29893281226514357, + "flos": 21755653706880.0, + "grad_norm": 2.125047221260382, + "language_loss": 0.79404521, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.81570905, + "num_input_tokens_seen": 107034645, + "step": 4972, + "time_per_iteration": 2.7924270629882812 + }, + { + "auxiliary_loss_clip": 0.01145651, + "auxiliary_loss_mlp": 0.01050789, + "balance_loss_clip": 1.05367875, + "balance_loss_mlp": 1.03427887, + "epoch": 0.29899293551781153, + "flos": 16836718538880.0, + "grad_norm": 2.200462139835186, + "language_loss": 0.85242772, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.87439215, + "num_input_tokens_seen": 107051125, + "step": 4973, + "time_per_iteration": 2.5249850749969482 + }, + { + "auxiliary_loss_clip": 0.011108, + "auxiliary_loss_mlp": 0.0104405, + "balance_loss_clip": 1.04758012, + "balance_loss_mlp": 1.02664554, + "epoch": 0.2990530587704795, + "flos": 11729215946880.0, + "grad_norm": 2.0029664307268664, + "language_loss": 0.77612329, + "learning_rate": 3.287480316742863e-06, + "loss": 0.79767179, + "num_input_tokens_seen": 107068815, + "step": 4974, + "time_per_iteration": 2.6555633544921875 + }, + { + "auxiliary_loss_clip": 0.01115732, + "auxiliary_loss_mlp": 0.00779073, + "balance_loss_clip": 1.04864824, + "balance_loss_mlp": 1.00132942, + "epoch": 0.29911318202314746, + "flos": 28039877850240.0, + "grad_norm": 1.735885031779611, + "language_loss": 0.72557616, + "learning_rate": 3.287182259060815e-06, + "loss": 0.74452424, + "num_input_tokens_seen": 107090420, + "step": 4975, + "time_per_iteration": 2.826773166656494 + }, + { + "auxiliary_loss_clip": 0.01137332, + "auxiliary_loss_mlp": 0.01043625, + "balance_loss_clip": 1.05628741, + "balance_loss_mlp": 1.02561235, + "epoch": 0.2991733052758154, + "flos": 18733555952640.0, + "grad_norm": 2.282255680734404, + "language_loss": 0.76357341, + "learning_rate": 3.286884152568687e-06, + "loss": 0.78538299, + "num_input_tokens_seen": 107107255, + "step": 4976, + "time_per_iteration": 2.7506988048553467 + }, + { + "auxiliary_loss_clip": 0.01130399, + "auxiliary_loss_mlp": 0.01046525, + "balance_loss_clip": 1.0515976, + "balance_loss_mlp": 1.02988303, + "epoch": 0.2992334285284834, + "flos": 15559160532480.0, + "grad_norm": 2.005019372487673, + "language_loss": 0.86173046, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.88349968, + "num_input_tokens_seen": 107123840, + "step": 4977, + "time_per_iteration": 2.665029764175415 + }, + { + "auxiliary_loss_clip": 0.01118345, + "auxiliary_loss_mlp": 0.01041325, + "balance_loss_clip": 1.05032945, + "balance_loss_mlp": 1.02443314, + "epoch": 0.29929355178115136, + "flos": 21797561900160.0, + "grad_norm": 1.7658271873172786, + "language_loss": 0.68290305, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.70449972, + "num_input_tokens_seen": 107143475, + "step": 4978, + "time_per_iteration": 2.8401222229003906 + }, + { + "auxiliary_loss_clip": 0.011259, + "auxiliary_loss_mlp": 0.0104045, + "balance_loss_clip": 1.05556107, + "balance_loss_mlp": 1.02268767, + "epoch": 0.2993536750338193, + "flos": 21178533888000.0, + "grad_norm": 2.254262103488659, + "language_loss": 0.76281357, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.78447711, + "num_input_tokens_seen": 107161725, + "step": 4979, + "time_per_iteration": 2.7814600467681885 + }, + { + "auxiliary_loss_clip": 0.01090165, + "auxiliary_loss_mlp": 0.0104942, + "balance_loss_clip": 1.04378402, + "balance_loss_mlp": 1.03001285, + "epoch": 0.2994137982864873, + "flos": 32122130544000.0, + "grad_norm": 2.1261514095664253, + "language_loss": 0.68627954, + "learning_rate": 3.285691238725484e-06, + "loss": 0.70767546, + "num_input_tokens_seen": 107183935, + "step": 4980, + "time_per_iteration": 2.891620635986328 + }, + { + "auxiliary_loss_clip": 0.01130184, + "auxiliary_loss_mlp": 0.00774942, + "balance_loss_clip": 1.0525018, + "balance_loss_mlp": 1.00121665, + "epoch": 0.29947392153915525, + "flos": 21105419754240.0, + "grad_norm": 2.1372298066204114, + "language_loss": 0.73153281, + "learning_rate": 3.285392888352555e-06, + "loss": 0.75058407, + "num_input_tokens_seen": 107204285, + "step": 4981, + "time_per_iteration": 5.394481420516968 + }, + { + "auxiliary_loss_clip": 0.01131964, + "auxiliary_loss_mlp": 0.0103921, + "balance_loss_clip": 1.0491364, + "balance_loss_mlp": 1.02280653, + "epoch": 0.2995340447918232, + "flos": 21542632099200.0, + "grad_norm": 1.6530173596529, + "language_loss": 0.86516619, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.88687789, + "num_input_tokens_seen": 107225265, + "step": 4982, + "time_per_iteration": 4.269104480743408 + }, + { + "auxiliary_loss_clip": 0.01122605, + "auxiliary_loss_mlp": 0.01045235, + "balance_loss_clip": 1.05186415, + "balance_loss_mlp": 1.02632844, + "epoch": 0.2995941680444912, + "flos": 16725143917440.0, + "grad_norm": 2.446225936700185, + "language_loss": 0.86517423, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.88685262, + "num_input_tokens_seen": 107241335, + "step": 4983, + "time_per_iteration": 2.844748020172119 + }, + { + "auxiliary_loss_clip": 0.01127565, + "auxiliary_loss_mlp": 0.01041992, + "balance_loss_clip": 1.05255556, + "balance_loss_mlp": 1.02594662, + "epoch": 0.2996542912971592, + "flos": 20923496346240.0, + "grad_norm": 2.024163877740881, + "language_loss": 0.78712893, + "learning_rate": 3.284497544825668e-06, + "loss": 0.80882448, + "num_input_tokens_seen": 107259375, + "step": 4984, + "time_per_iteration": 2.6945550441741943 + }, + { + "auxiliary_loss_clip": 0.01110139, + "auxiliary_loss_mlp": 0.01046002, + "balance_loss_clip": 1.0492574, + "balance_loss_mlp": 1.02761972, + "epoch": 0.29971441454982717, + "flos": 25079868754560.0, + "grad_norm": 1.5529534411437271, + "language_loss": 0.78736818, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.8089295, + "num_input_tokens_seen": 107279890, + "step": 4985, + "time_per_iteration": 2.8082690238952637 + }, + { + "auxiliary_loss_clip": 0.01083189, + "auxiliary_loss_mlp": 0.01050178, + "balance_loss_clip": 1.04330277, + "balance_loss_mlp": 1.02925658, + "epoch": 0.29977453780249513, + "flos": 52555911840000.0, + "grad_norm": 2.2301347819864112, + "language_loss": 0.72089684, + "learning_rate": 3.283900405580837e-06, + "loss": 0.74223053, + "num_input_tokens_seen": 107303430, + "step": 4986, + "time_per_iteration": 4.54891562461853 + }, + { + "auxiliary_loss_clip": 0.01119419, + "auxiliary_loss_mlp": 0.01047564, + "balance_loss_clip": 1.04838538, + "balance_loss_mlp": 1.03007603, + "epoch": 0.2998346610551631, + "flos": 22237144542720.0, + "grad_norm": 2.1453051702670787, + "language_loss": 0.73143345, + "learning_rate": 3.283601762924312e-06, + "loss": 0.75310332, + "num_input_tokens_seen": 107323700, + "step": 4987, + "time_per_iteration": 4.324375152587891 + }, + { + "auxiliary_loss_clip": 0.01111213, + "auxiliary_loss_mlp": 0.01039103, + "balance_loss_clip": 1.04803324, + "balance_loss_mlp": 1.0233314, + "epoch": 0.29989478430783106, + "flos": 16873203778560.0, + "grad_norm": 2.095598578062247, + "language_loss": 0.80221194, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.82371509, + "num_input_tokens_seen": 107341965, + "step": 4988, + "time_per_iteration": 2.772221565246582 + }, + { + "auxiliary_loss_clip": 0.01114945, + "auxiliary_loss_mlp": 0.00777889, + "balance_loss_clip": 1.04905486, + "balance_loss_mlp": 1.0013597, + "epoch": 0.29995490756049903, + "flos": 23768878164480.0, + "grad_norm": 1.6966696236855432, + "language_loss": 0.70858777, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.72751617, + "num_input_tokens_seen": 107362615, + "step": 4989, + "time_per_iteration": 2.7470130920410156 + }, + { + "auxiliary_loss_clip": 0.0110827, + "auxiliary_loss_mlp": 0.01046589, + "balance_loss_clip": 1.0506041, + "balance_loss_mlp": 1.02906489, + "epoch": 0.300015030813167, + "flos": 14465321614080.0, + "grad_norm": 1.9545100728262668, + "language_loss": 0.85589516, + "learning_rate": 3.282705542954199e-06, + "loss": 0.87744367, + "num_input_tokens_seen": 107378980, + "step": 4990, + "time_per_iteration": 2.808276414871216 + }, + { + "auxiliary_loss_clip": 0.01133569, + "auxiliary_loss_mlp": 0.0103974, + "balance_loss_clip": 1.05172086, + "balance_loss_mlp": 1.02152538, + "epoch": 0.30007515406583496, + "flos": 25191982080000.0, + "grad_norm": 1.8023870470649808, + "language_loss": 0.67019355, + "learning_rate": 3.28240670566841e-06, + "loss": 0.69192666, + "num_input_tokens_seen": 107397640, + "step": 4991, + "time_per_iteration": 2.7097268104553223 + }, + { + "auxiliary_loss_clip": 0.0112021, + "auxiliary_loss_mlp": 0.01041383, + "balance_loss_clip": 1.04660511, + "balance_loss_mlp": 1.02248883, + "epoch": 0.3001352773185029, + "flos": 19391188106880.0, + "grad_norm": 1.684252307124257, + "language_loss": 0.78640115, + "learning_rate": 3.28210781975363e-06, + "loss": 0.80801708, + "num_input_tokens_seen": 107416020, + "step": 4992, + "time_per_iteration": 2.66925311088562 + }, + { + "auxiliary_loss_clip": 0.01143243, + "auxiliary_loss_mlp": 0.01041924, + "balance_loss_clip": 1.05240428, + "balance_loss_mlp": 1.02457952, + "epoch": 0.3001954005711709, + "flos": 21543853161600.0, + "grad_norm": 2.3134173579188175, + "language_loss": 0.82057947, + "learning_rate": 3.281808885221193e-06, + "loss": 0.84243113, + "num_input_tokens_seen": 107436340, + "step": 4993, + "time_per_iteration": 2.613849639892578 + }, + { + "auxiliary_loss_clip": 0.01096023, + "auxiliary_loss_mlp": 0.01048917, + "balance_loss_clip": 1.04667079, + "balance_loss_mlp": 1.02997458, + "epoch": 0.30025552382383885, + "flos": 17384320356480.0, + "grad_norm": 2.1042579138834197, + "language_loss": 0.86142659, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.88287598, + "num_input_tokens_seen": 107454585, + "step": 4994, + "time_per_iteration": 2.703126907348633 + }, + { + "auxiliary_loss_clip": 0.01118329, + "auxiliary_loss_mlp": 0.01041975, + "balance_loss_clip": 1.05592799, + "balance_loss_mlp": 1.02504694, + "epoch": 0.3003156470765068, + "flos": 29533330552320.0, + "grad_norm": 1.5905866784601752, + "language_loss": 0.80834931, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.82995236, + "num_input_tokens_seen": 107477180, + "step": 4995, + "time_per_iteration": 2.8100333213806152 + }, + { + "auxiliary_loss_clip": 0.01117939, + "auxiliary_loss_mlp": 0.01043612, + "balance_loss_clip": 1.05073023, + "balance_loss_mlp": 1.02623129, + "epoch": 0.3003757703291748, + "flos": 43646402465280.0, + "grad_norm": 1.9490007813217745, + "language_loss": 0.67086798, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.69248348, + "num_input_tokens_seen": 107500250, + "step": 4996, + "time_per_iteration": 2.989062786102295 + }, + { + "auxiliary_loss_clip": 0.01114657, + "auxiliary_loss_mlp": 0.01042055, + "balance_loss_clip": 1.04888701, + "balance_loss_mlp": 1.02449584, + "epoch": 0.30043589358184275, + "flos": 22528380015360.0, + "grad_norm": 4.4692930536610245, + "language_loss": 0.75825363, + "learning_rate": 3.280612661141615e-06, + "loss": 0.7798208, + "num_input_tokens_seen": 107520070, + "step": 4997, + "time_per_iteration": 2.733402967453003 + }, + { + "auxiliary_loss_clip": 0.01131118, + "auxiliary_loss_mlp": 0.0104737, + "balance_loss_clip": 1.05176449, + "balance_loss_mlp": 1.03149128, + "epoch": 0.30049601683451077, + "flos": 20995892208000.0, + "grad_norm": 2.0588160995259197, + "language_loss": 0.78425241, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.80603731, + "num_input_tokens_seen": 107539285, + "step": 4998, + "time_per_iteration": 2.7973837852478027 + }, + { + "auxiliary_loss_clip": 0.011392, + "auxiliary_loss_mlp": 0.01044927, + "balance_loss_clip": 1.05180395, + "balance_loss_mlp": 1.0287745, + "epoch": 0.30055614008717874, + "flos": 23916004272000.0, + "grad_norm": 18.871291300313036, + "language_loss": 0.73622382, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.7580651, + "num_input_tokens_seen": 107560260, + "step": 4999, + "time_per_iteration": 2.7197916507720947 + }, + { + "auxiliary_loss_clip": 0.01131684, + "auxiliary_loss_mlp": 0.01044515, + "balance_loss_clip": 1.05033612, + "balance_loss_mlp": 1.02750361, + "epoch": 0.3006162633398467, + "flos": 19169798630400.0, + "grad_norm": 1.6090337016392804, + "language_loss": 0.75454789, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.77630985, + "num_input_tokens_seen": 107579260, + "step": 5000, + "time_per_iteration": 2.688054323196411 + }, + { + "auxiliary_loss_clip": 0.01138443, + "auxiliary_loss_mlp": 0.01041074, + "balance_loss_clip": 1.0505259, + "balance_loss_mlp": 1.02564812, + "epoch": 0.30067638659251467, + "flos": 14679241061760.0, + "grad_norm": 1.7985326326547535, + "language_loss": 0.81841409, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.84020931, + "num_input_tokens_seen": 107595245, + "step": 5001, + "time_per_iteration": 2.6519837379455566 + }, + { + "auxiliary_loss_clip": 0.01128756, + "auxiliary_loss_mlp": 0.01048602, + "balance_loss_clip": 1.05139947, + "balance_loss_mlp": 1.03068447, + "epoch": 0.30073650984518263, + "flos": 23368007404800.0, + "grad_norm": 1.8684342377814658, + "language_loss": 0.7999261, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.82169974, + "num_input_tokens_seen": 107613985, + "step": 5002, + "time_per_iteration": 2.6749327182769775 + }, + { + "auxiliary_loss_clip": 0.01091983, + "auxiliary_loss_mlp": 0.0104282, + "balance_loss_clip": 1.04869151, + "balance_loss_mlp": 1.02431834, + "epoch": 0.3007966330978506, + "flos": 22966633854720.0, + "grad_norm": 1.9577039368374018, + "language_loss": 0.70993537, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.73128337, + "num_input_tokens_seen": 107631435, + "step": 5003, + "time_per_iteration": 2.908494472503662 + }, + { + "auxiliary_loss_clip": 0.01110546, + "auxiliary_loss_mlp": 0.01043883, + "balance_loss_clip": 1.05014396, + "balance_loss_mlp": 1.02643037, + "epoch": 0.30085675635051856, + "flos": 27818452460160.0, + "grad_norm": 1.956987555909332, + "language_loss": 0.70556092, + "learning_rate": 3.27851739984233e-06, + "loss": 0.72710526, + "num_input_tokens_seen": 107650530, + "step": 5004, + "time_per_iteration": 2.8064236640930176 + }, + { + "auxiliary_loss_clip": 0.01119172, + "auxiliary_loss_mlp": 0.01045143, + "balance_loss_clip": 1.05067444, + "balance_loss_mlp": 1.02800083, + "epoch": 0.3009168796031865, + "flos": 10882729059840.0, + "grad_norm": 2.8453259041050805, + "language_loss": 0.81459486, + "learning_rate": 3.278217882782715e-06, + "loss": 0.83623803, + "num_input_tokens_seen": 107662240, + "step": 5005, + "time_per_iteration": 2.633951425552368 + }, + { + "auxiliary_loss_clip": 0.01130639, + "auxiliary_loss_mlp": 0.01043853, + "balance_loss_clip": 1.0514015, + "balance_loss_mlp": 1.02742577, + "epoch": 0.3009770028558545, + "flos": 23805399317760.0, + "grad_norm": 3.7156546302240043, + "language_loss": 0.74672973, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.76847464, + "num_input_tokens_seen": 107680330, + "step": 5006, + "time_per_iteration": 2.7556662559509277 + }, + { + "auxiliary_loss_clip": 0.01101239, + "auxiliary_loss_mlp": 0.00775371, + "balance_loss_clip": 1.04850578, + "balance_loss_mlp": 1.00104856, + "epoch": 0.30103712610852246, + "flos": 26468211283200.0, + "grad_norm": 2.0504029481480153, + "language_loss": 0.71090448, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.72967064, + "num_input_tokens_seen": 107700020, + "step": 5007, + "time_per_iteration": 2.83591365814209 + }, + { + "auxiliary_loss_clip": 0.01129575, + "auxiliary_loss_mlp": 0.01038114, + "balance_loss_clip": 1.05173922, + "balance_loss_mlp": 1.0206027, + "epoch": 0.3010972493611904, + "flos": 22856459863680.0, + "grad_norm": 2.302333802055736, + "language_loss": 0.76504552, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.78672242, + "num_input_tokens_seen": 107718575, + "step": 5008, + "time_per_iteration": 2.7624082565307617 + }, + { + "auxiliary_loss_clip": 0.0112694, + "auxiliary_loss_mlp": 0.01039735, + "balance_loss_clip": 1.05119205, + "balance_loss_mlp": 1.02284265, + "epoch": 0.3011573726138584, + "flos": 24053685102720.0, + "grad_norm": 1.840633361886899, + "language_loss": 0.84215975, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.86382657, + "num_input_tokens_seen": 107738635, + "step": 5009, + "time_per_iteration": 2.7053475379943848 + }, + { + "auxiliary_loss_clip": 0.01135722, + "auxiliary_loss_mlp": 0.0104281, + "balance_loss_clip": 1.05079174, + "balance_loss_mlp": 1.02389145, + "epoch": 0.30121749586652635, + "flos": 20259687052800.0, + "grad_norm": 1.970244045667646, + "language_loss": 0.83804011, + "learning_rate": 3.276719570659604e-06, + "loss": 0.85982549, + "num_input_tokens_seen": 107753415, + "step": 5010, + "time_per_iteration": 2.677002429962158 + }, + { + "auxiliary_loss_clip": 0.01108582, + "auxiliary_loss_mlp": 0.01038214, + "balance_loss_clip": 1.04942024, + "balance_loss_mlp": 1.02294374, + "epoch": 0.3012776191191944, + "flos": 26943058103040.0, + "grad_norm": 2.3216326772862246, + "language_loss": 0.85401523, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.87548327, + "num_input_tokens_seen": 107773840, + "step": 5011, + "time_per_iteration": 2.807887077331543 + }, + { + "auxiliary_loss_clip": 0.01119452, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_clip": 1.04522014, + "balance_loss_mlp": 1.02680194, + "epoch": 0.30133774237186234, + "flos": 20412307941120.0, + "grad_norm": 2.58081844210284, + "language_loss": 0.72122502, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.74286604, + "num_input_tokens_seen": 107792020, + "step": 5012, + "time_per_iteration": 2.689375400543213 + }, + { + "auxiliary_loss_clip": 0.01127162, + "auxiliary_loss_mlp": 0.01042946, + "balance_loss_clip": 1.04826403, + "balance_loss_mlp": 1.02628016, + "epoch": 0.3013978656245303, + "flos": 19792453916160.0, + "grad_norm": 2.871668468467944, + "language_loss": 0.88278735, + "learning_rate": 3.275820002334819e-06, + "loss": 0.90448833, + "num_input_tokens_seen": 107809595, + "step": 5013, + "time_per_iteration": 2.6482350826263428 + }, + { + "auxiliary_loss_clip": 0.01110184, + "auxiliary_loss_mlp": 0.01050326, + "balance_loss_clip": 1.04318821, + "balance_loss_mlp": 1.0286417, + "epoch": 0.30145798887719827, + "flos": 16249650652800.0, + "grad_norm": 1.8756845710135603, + "language_loss": 0.82593644, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.84754151, + "num_input_tokens_seen": 107827230, + "step": 5014, + "time_per_iteration": 2.6681008338928223 + }, + { + "auxiliary_loss_clip": 0.01092673, + "auxiliary_loss_mlp": 0.01047692, + "balance_loss_clip": 1.04461288, + "balance_loss_mlp": 1.03045392, + "epoch": 0.30151811212986623, + "flos": 24571733005440.0, + "grad_norm": 1.7101695757694795, + "language_loss": 0.68239003, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.7037937, + "num_input_tokens_seen": 107847195, + "step": 5015, + "time_per_iteration": 2.725411891937256 + }, + { + "auxiliary_loss_clip": 0.01110447, + "auxiliary_loss_mlp": 0.01043819, + "balance_loss_clip": 1.0448432, + "balance_loss_mlp": 1.02652168, + "epoch": 0.3015782353825342, + "flos": 21872076664320.0, + "grad_norm": 2.2766913154728625, + "language_loss": 0.74497074, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.76651341, + "num_input_tokens_seen": 107866420, + "step": 5016, + "time_per_iteration": 2.710721492767334 + }, + { + "auxiliary_loss_clip": 0.01133464, + "auxiliary_loss_mlp": 0.01041604, + "balance_loss_clip": 1.05026031, + "balance_loss_mlp": 1.02444994, + "epoch": 0.30163835863520216, + "flos": 28769331248640.0, + "grad_norm": 1.7847015072033203, + "language_loss": 0.65504754, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.67679822, + "num_input_tokens_seen": 107889090, + "step": 5017, + "time_per_iteration": 2.7239317893981934 + }, + { + "auxiliary_loss_clip": 0.01091977, + "auxiliary_loss_mlp": 0.01057247, + "balance_loss_clip": 1.04233074, + "balance_loss_mlp": 1.03813791, + "epoch": 0.30169848188787013, + "flos": 22966202891520.0, + "grad_norm": 2.1696927992492783, + "language_loss": 0.68739498, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.70888722, + "num_input_tokens_seen": 107907520, + "step": 5018, + "time_per_iteration": 2.6655359268188477 + }, + { + "auxiliary_loss_clip": 0.01135218, + "auxiliary_loss_mlp": 0.01042787, + "balance_loss_clip": 1.0482893, + "balance_loss_mlp": 1.02783799, + "epoch": 0.3017586051405381, + "flos": 21835268202240.0, + "grad_norm": 1.9457029488983892, + "language_loss": 0.78853333, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.8103134, + "num_input_tokens_seen": 107925650, + "step": 5019, + "time_per_iteration": 2.669679641723633 + }, + { + "auxiliary_loss_clip": 0.01112458, + "auxiliary_loss_mlp": 0.01044161, + "balance_loss_clip": 1.04863656, + "balance_loss_mlp": 1.02766263, + "epoch": 0.30181872839320606, + "flos": 22160403135360.0, + "grad_norm": 3.674249330665847, + "language_loss": 0.70038712, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.72195333, + "num_input_tokens_seen": 107943975, + "step": 5020, + "time_per_iteration": 2.704000234603882 + }, + { + "auxiliary_loss_clip": 0.01143422, + "auxiliary_loss_mlp": 0.01049684, + "balance_loss_clip": 1.05071819, + "balance_loss_mlp": 1.03320909, + "epoch": 0.301878851645874, + "flos": 18114168804480.0, + "grad_norm": 5.641410405732297, + "language_loss": 0.78549969, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.80743068, + "num_input_tokens_seen": 107962950, + "step": 5021, + "time_per_iteration": 4.521278142929077 + }, + { + "auxiliary_loss_clip": 0.01129372, + "auxiliary_loss_mlp": 0.01031797, + "balance_loss_clip": 1.04859924, + "balance_loss_mlp": 1.01572752, + "epoch": 0.301938974898542, + "flos": 17602226213760.0, + "grad_norm": 3.308202374048827, + "language_loss": 0.75482392, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.77643561, + "num_input_tokens_seen": 107979700, + "step": 5022, + "time_per_iteration": 4.1478235721588135 + }, + { + "auxiliary_loss_clip": 0.01141828, + "auxiliary_loss_mlp": 0.01043797, + "balance_loss_clip": 1.04905522, + "balance_loss_mlp": 1.02676249, + "epoch": 0.30199909815120995, + "flos": 11181219079680.0, + "grad_norm": 1.7715139184612991, + "language_loss": 0.69534874, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.71720505, + "num_input_tokens_seen": 107996645, + "step": 5023, + "time_per_iteration": 2.582491636276245 + }, + { + "auxiliary_loss_clip": 0.01112614, + "auxiliary_loss_mlp": 0.01040881, + "balance_loss_clip": 1.04434311, + "balance_loss_mlp": 1.02471602, + "epoch": 0.302059221403878, + "flos": 21907843632000.0, + "grad_norm": 4.128865002464027, + "language_loss": 0.71400636, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.73554134, + "num_input_tokens_seen": 108015020, + "step": 5024, + "time_per_iteration": 2.6789708137512207 + }, + { + "auxiliary_loss_clip": 0.01125475, + "auxiliary_loss_mlp": 0.01051317, + "balance_loss_clip": 1.04789031, + "balance_loss_mlp": 1.03441346, + "epoch": 0.30211934465654594, + "flos": 26396390039040.0, + "grad_norm": 2.5352325664815396, + "language_loss": 0.73949707, + "learning_rate": 3.272217377978061e-06, + "loss": 0.76126498, + "num_input_tokens_seen": 108036430, + "step": 5025, + "time_per_iteration": 2.7021281719207764 + }, + { + "auxiliary_loss_clip": 0.01129438, + "auxiliary_loss_mlp": 0.01049255, + "balance_loss_clip": 1.05115628, + "balance_loss_mlp": 1.03333473, + "epoch": 0.3021794679092139, + "flos": 23400470321280.0, + "grad_norm": 1.5312912087399582, + "language_loss": 0.67339373, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.69518065, + "num_input_tokens_seen": 108054250, + "step": 5026, + "time_per_iteration": 4.172817230224609 + }, + { + "auxiliary_loss_clip": 0.01131398, + "auxiliary_loss_mlp": 0.01045765, + "balance_loss_clip": 1.05058789, + "balance_loss_mlp": 1.02871835, + "epoch": 0.30223959116188187, + "flos": 20260979942400.0, + "grad_norm": 1.8656003857402752, + "language_loss": 0.84821522, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.86998689, + "num_input_tokens_seen": 108071495, + "step": 5027, + "time_per_iteration": 2.66186785697937 + }, + { + "auxiliary_loss_clip": 0.01104085, + "auxiliary_loss_mlp": 0.01045706, + "balance_loss_clip": 1.04686451, + "balance_loss_mlp": 1.03030431, + "epoch": 0.30229971441454984, + "flos": 26687840993280.0, + "grad_norm": 1.633485895123786, + "language_loss": 0.78574622, + "learning_rate": 3.271315635661351e-06, + "loss": 0.80724418, + "num_input_tokens_seen": 108092135, + "step": 5028, + "time_per_iteration": 4.454678297042847 + }, + { + "auxiliary_loss_clip": 0.01113383, + "auxiliary_loss_mlp": 0.01048022, + "balance_loss_clip": 1.04682207, + "balance_loss_mlp": 1.03115392, + "epoch": 0.3023598376672178, + "flos": 34345323953280.0, + "grad_norm": 1.9340935936746968, + "language_loss": 0.77085543, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.79246956, + "num_input_tokens_seen": 108112945, + "step": 5029, + "time_per_iteration": 2.841707229614258 + }, + { + "auxiliary_loss_clip": 0.01111921, + "auxiliary_loss_mlp": 0.01048937, + "balance_loss_clip": 1.04846191, + "balance_loss_mlp": 1.02920818, + "epoch": 0.30241996091988577, + "flos": 23112143850240.0, + "grad_norm": 2.1432001376374257, + "language_loss": 0.8240397, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.84564829, + "num_input_tokens_seen": 108130325, + "step": 5030, + "time_per_iteration": 2.8557751178741455 + }, + { + "auxiliary_loss_clip": 0.01090897, + "auxiliary_loss_mlp": 0.00775419, + "balance_loss_clip": 1.04519463, + "balance_loss_mlp": 1.00112486, + "epoch": 0.30248008417255373, + "flos": 19390002958080.0, + "grad_norm": 2.2374457582531098, + "language_loss": 0.6987617, + "learning_rate": 3.270413459468905e-06, + "loss": 0.71742487, + "num_input_tokens_seen": 108150300, + "step": 5031, + "time_per_iteration": 2.7827746868133545 + }, + { + "auxiliary_loss_clip": 0.01121676, + "auxiliary_loss_mlp": 0.01044463, + "balance_loss_clip": 1.04549253, + "balance_loss_mlp": 1.02800059, + "epoch": 0.3025402074252217, + "flos": 23769704177280.0, + "grad_norm": 1.8685207024800563, + "language_loss": 0.82324117, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.84490258, + "num_input_tokens_seen": 108170330, + "step": 5032, + "time_per_iteration": 2.6529927253723145 + }, + { + "auxiliary_loss_clip": 0.01104945, + "auxiliary_loss_mlp": 0.01059072, + "balance_loss_clip": 1.05129266, + "balance_loss_mlp": 1.03951025, + "epoch": 0.30260033067788966, + "flos": 25994118648960.0, + "grad_norm": 2.130148669813867, + "language_loss": 0.73156881, + "learning_rate": 3.269811767783906e-06, + "loss": 0.75320899, + "num_input_tokens_seen": 108191265, + "step": 5033, + "time_per_iteration": 2.7259597778320312 + }, + { + "auxiliary_loss_clip": 0.01124221, + "auxiliary_loss_mlp": 0.01049397, + "balance_loss_clip": 1.04687023, + "balance_loss_mlp": 1.03221893, + "epoch": 0.3026604539305576, + "flos": 25374551932800.0, + "grad_norm": 1.564237149834404, + "language_loss": 0.74164939, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.76338559, + "num_input_tokens_seen": 108211615, + "step": 5034, + "time_per_iteration": 2.674745798110962 + }, + { + "auxiliary_loss_clip": 0.01140313, + "auxiliary_loss_mlp": 0.01039121, + "balance_loss_clip": 1.04939198, + "balance_loss_mlp": 1.02224064, + "epoch": 0.3027205771832256, + "flos": 25812733944960.0, + "grad_norm": 1.8295549596836873, + "language_loss": 0.72133434, + "learning_rate": 3.269209883493352e-06, + "loss": 0.74312872, + "num_input_tokens_seen": 108231080, + "step": 5035, + "time_per_iteration": 2.6429855823516846 + }, + { + "auxiliary_loss_clip": 0.01123118, + "auxiliary_loss_mlp": 0.01038432, + "balance_loss_clip": 1.04499483, + "balance_loss_mlp": 1.02267289, + "epoch": 0.30278070043589356, + "flos": 27344539393920.0, + "grad_norm": 2.468501372591198, + "language_loss": 0.86918867, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.89080417, + "num_input_tokens_seen": 108251125, + "step": 5036, + "time_per_iteration": 2.6735007762908936 + }, + { + "auxiliary_loss_clip": 0.01097642, + "auxiliary_loss_mlp": 0.01051442, + "balance_loss_clip": 1.04504728, + "balance_loss_mlp": 1.0331912, + "epoch": 0.3028408236885616, + "flos": 24786227070720.0, + "grad_norm": 2.859596651876304, + "language_loss": 0.77406383, + "learning_rate": 3.268607806688536e-06, + "loss": 0.79555464, + "num_input_tokens_seen": 108272545, + "step": 5037, + "time_per_iteration": 2.7311182022094727 + }, + { + "auxiliary_loss_clip": 0.01102304, + "auxiliary_loss_mlp": 0.01044604, + "balance_loss_clip": 1.0462358, + "balance_loss_mlp": 1.02683008, + "epoch": 0.30290094694122954, + "flos": 12932474670720.0, + "grad_norm": 2.32450780354164, + "language_loss": 0.77307165, + "learning_rate": 3.268306696121816e-06, + "loss": 0.79454064, + "num_input_tokens_seen": 108289725, + "step": 5038, + "time_per_iteration": 2.677525043487549 + }, + { + "auxiliary_loss_clip": 0.01113965, + "auxiliary_loss_mlp": 0.01037105, + "balance_loss_clip": 1.04819584, + "balance_loss_mlp": 1.02067804, + "epoch": 0.3029610701938975, + "flos": 25916443488000.0, + "grad_norm": 2.1234468188232976, + "language_loss": 0.74140579, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.76291645, + "num_input_tokens_seen": 108310690, + "step": 5039, + "time_per_iteration": 2.7086853981018066 + }, + { + "auxiliary_loss_clip": 0.01137739, + "auxiliary_loss_mlp": 0.00774651, + "balance_loss_clip": 1.05068994, + "balance_loss_mlp": 1.00113058, + "epoch": 0.3030211934465655, + "flos": 21980993679360.0, + "grad_norm": 2.3826017374700372, + "language_loss": 0.79777801, + "learning_rate": 3.267704330716847e-06, + "loss": 0.81690192, + "num_input_tokens_seen": 108328905, + "step": 5040, + "time_per_iteration": 2.665175199508667 + }, + { + "auxiliary_loss_clip": 0.01114198, + "auxiliary_loss_mlp": 0.01038229, + "balance_loss_clip": 1.04937124, + "balance_loss_mlp": 1.02279687, + "epoch": 0.30308131669923344, + "flos": 20991977625600.0, + "grad_norm": 1.7800027985776907, + "language_loss": 0.81872481, + "learning_rate": 3.267403075901438e-06, + "loss": 0.84024912, + "num_input_tokens_seen": 108346680, + "step": 5041, + "time_per_iteration": 2.6471712589263916 + }, + { + "auxiliary_loss_clip": 0.01018002, + "auxiliary_loss_mlp": 0.01004656, + "balance_loss_clip": 1.0244385, + "balance_loss_mlp": 1.00277221, + "epoch": 0.3031414399519014, + "flos": 60548875827840.0, + "grad_norm": 0.7715538683836823, + "language_loss": 0.59505904, + "learning_rate": 3.267101773025978e-06, + "loss": 0.61528552, + "num_input_tokens_seen": 108413885, + "step": 5042, + "time_per_iteration": 3.3167309761047363 + }, + { + "auxiliary_loss_clip": 0.0114486, + "auxiliary_loss_mlp": 0.01036647, + "balance_loss_clip": 1.05319929, + "balance_loss_mlp": 1.01940918, + "epoch": 0.30320156320456937, + "flos": 21907664064000.0, + "grad_norm": 1.838538817411587, + "language_loss": 0.71149278, + "learning_rate": 3.266800422101892e-06, + "loss": 0.73330784, + "num_input_tokens_seen": 108433640, + "step": 5043, + "time_per_iteration": 2.6266753673553467 + }, + { + "auxiliary_loss_clip": 0.01095086, + "auxiliary_loss_mlp": 0.01036293, + "balance_loss_clip": 1.04519725, + "balance_loss_mlp": 1.01948404, + "epoch": 0.30326168645723733, + "flos": 21652770176640.0, + "grad_norm": 3.620919115388089, + "language_loss": 0.69573802, + "learning_rate": 3.266499023140606e-06, + "loss": 0.71705186, + "num_input_tokens_seen": 108452640, + "step": 5044, + "time_per_iteration": 2.7561492919921875 + }, + { + "auxiliary_loss_clip": 0.01127659, + "auxiliary_loss_mlp": 0.01039805, + "balance_loss_clip": 1.05019724, + "balance_loss_mlp": 1.02335382, + "epoch": 0.3033218097099053, + "flos": 21871286565120.0, + "grad_norm": 1.3797061223764004, + "language_loss": 0.77188826, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.79356289, + "num_input_tokens_seen": 108472470, + "step": 5045, + "time_per_iteration": 2.6529667377471924 + }, + { + "auxiliary_loss_clip": 0.01141388, + "auxiliary_loss_mlp": 0.00775246, + "balance_loss_clip": 1.05165195, + "balance_loss_mlp": 1.00136316, + "epoch": 0.30338193296257326, + "flos": 27089717333760.0, + "grad_norm": 1.772786200303907, + "language_loss": 0.72473782, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.74390417, + "num_input_tokens_seen": 108493025, + "step": 5046, + "time_per_iteration": 2.8433380126953125 + }, + { + "auxiliary_loss_clip": 0.01131475, + "auxiliary_loss_mlp": 0.01040342, + "balance_loss_clip": 1.04979491, + "balance_loss_mlp": 1.02119732, + "epoch": 0.30344205621524123, + "flos": 19534363718400.0, + "grad_norm": 1.7729778222487513, + "language_loss": 0.81406343, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.83578163, + "num_input_tokens_seen": 108513480, + "step": 5047, + "time_per_iteration": 2.6653506755828857 + }, + { + "auxiliary_loss_clip": 0.01078955, + "auxiliary_loss_mlp": 0.01042974, + "balance_loss_clip": 1.04126537, + "balance_loss_mlp": 1.02565265, + "epoch": 0.3035021794679092, + "flos": 23910976368000.0, + "grad_norm": 2.0012909108595287, + "language_loss": 0.7191782, + "learning_rate": 3.265292947152084e-06, + "loss": 0.74039751, + "num_input_tokens_seen": 108533155, + "step": 5048, + "time_per_iteration": 2.7198410034179688 + }, + { + "auxiliary_loss_clip": 0.01117557, + "auxiliary_loss_mlp": 0.01037944, + "balance_loss_clip": 1.04860258, + "balance_loss_mlp": 1.02263796, + "epoch": 0.30356230272057716, + "flos": 16143606725760.0, + "grad_norm": 1.6260333435769418, + "language_loss": 0.75220919, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.77376425, + "num_input_tokens_seen": 108551900, + "step": 5049, + "time_per_iteration": 2.6649906635284424 + }, + { + "auxiliary_loss_clip": 0.01131404, + "auxiliary_loss_mlp": 0.01035526, + "balance_loss_clip": 1.04947305, + "balance_loss_mlp": 1.01907563, + "epoch": 0.3036224259732452, + "flos": 28914697589760.0, + "grad_norm": 1.5855456549340856, + "language_loss": 0.82088244, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.84255171, + "num_input_tokens_seen": 108574005, + "step": 5050, + "time_per_iteration": 2.657400131225586 + }, + { + "auxiliary_loss_clip": 0.01106158, + "auxiliary_loss_mlp": 0.0103828, + "balance_loss_clip": 1.05031502, + "balance_loss_mlp": 1.02079201, + "epoch": 0.30368254922591315, + "flos": 21105599322240.0, + "grad_norm": 2.7844840544166436, + "language_loss": 0.74196702, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.7634114, + "num_input_tokens_seen": 108592715, + "step": 5051, + "time_per_iteration": 2.8018569946289062 + }, + { + "auxiliary_loss_clip": 0.01079332, + "auxiliary_loss_mlp": 0.00775567, + "balance_loss_clip": 1.04338145, + "balance_loss_mlp": 1.00118661, + "epoch": 0.3037426724785811, + "flos": 23002293081600.0, + "grad_norm": 1.6849730779493737, + "language_loss": 0.76015687, + "learning_rate": 3.264086103483033e-06, + "loss": 0.77870589, + "num_input_tokens_seen": 108611770, + "step": 5052, + "time_per_iteration": 2.9220657348632812 + }, + { + "auxiliary_loss_clip": 0.01143047, + "auxiliary_loss_mlp": 0.01043624, + "balance_loss_clip": 1.0504849, + "balance_loss_mlp": 1.02656555, + "epoch": 0.3038027957312491, + "flos": 15632705629440.0, + "grad_norm": 2.421175308310746, + "language_loss": 0.82370055, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.84556723, + "num_input_tokens_seen": 108629070, + "step": 5053, + "time_per_iteration": 2.5955326557159424 + }, + { + "auxiliary_loss_clip": 0.01113702, + "auxiliary_loss_mlp": 0.01042002, + "balance_loss_clip": 1.0471338, + "balance_loss_mlp": 1.02475214, + "epoch": 0.30386291898391704, + "flos": 12713994195840.0, + "grad_norm": 1.8307418288785484, + "language_loss": 0.70979112, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.73134822, + "num_input_tokens_seen": 108646315, + "step": 5054, + "time_per_iteration": 2.7001569271087646 + }, + { + "auxiliary_loss_clip": 0.01140964, + "auxiliary_loss_mlp": 0.01039805, + "balance_loss_clip": 1.05088401, + "balance_loss_mlp": 1.0225668, + "epoch": 0.303923042236585, + "flos": 26359437922560.0, + "grad_norm": 2.314538095600907, + "language_loss": 0.69049591, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.71230358, + "num_input_tokens_seen": 108665920, + "step": 5055, + "time_per_iteration": 2.6685287952423096 + }, + { + "auxiliary_loss_clip": 0.01113325, + "auxiliary_loss_mlp": 0.01036352, + "balance_loss_clip": 1.04871488, + "balance_loss_mlp": 1.01880479, + "epoch": 0.30398316548925297, + "flos": 19719232041600.0, + "grad_norm": 1.959915959447654, + "language_loss": 0.67298615, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.69448292, + "num_input_tokens_seen": 108683485, + "step": 5056, + "time_per_iteration": 2.6933648586273193 + }, + { + "auxiliary_loss_clip": 0.01110454, + "auxiliary_loss_mlp": 0.01043223, + "balance_loss_clip": 1.04604077, + "balance_loss_mlp": 1.02673686, + "epoch": 0.30404328874192094, + "flos": 24239846315520.0, + "grad_norm": 1.7045430221851803, + "language_loss": 0.82544303, + "learning_rate": 3.262576470461507e-06, + "loss": 0.84697986, + "num_input_tokens_seen": 108702700, + "step": 5057, + "time_per_iteration": 2.740187406539917 + }, + { + "auxiliary_loss_clip": 0.01115402, + "auxiliary_loss_mlp": 0.01039139, + "balance_loss_clip": 1.04719019, + "balance_loss_mlp": 1.0222472, + "epoch": 0.3041034119945889, + "flos": 24498942094080.0, + "grad_norm": 1.8459128585017135, + "language_loss": 0.88849652, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.91004193, + "num_input_tokens_seen": 108721860, + "step": 5058, + "time_per_iteration": 2.7015340328216553 + }, + { + "auxiliary_loss_clip": 0.01102971, + "auxiliary_loss_mlp": 0.01047692, + "balance_loss_clip": 1.04598641, + "balance_loss_mlp": 1.03040063, + "epoch": 0.30416353524725687, + "flos": 28288881907200.0, + "grad_norm": 7.837576661900421, + "language_loss": 0.71809238, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.73959899, + "num_input_tokens_seen": 108743215, + "step": 5059, + "time_per_iteration": 2.7542827129364014 + }, + { + "auxiliary_loss_clip": 0.01083101, + "auxiliary_loss_mlp": 0.01042605, + "balance_loss_clip": 1.04435182, + "balance_loss_mlp": 1.02670228, + "epoch": 0.30422365849992483, + "flos": 23660392112640.0, + "grad_norm": 2.424944175434462, + "language_loss": 0.73316336, + "learning_rate": 3.26167011603268e-06, + "loss": 0.7544204, + "num_input_tokens_seen": 108765505, + "step": 5060, + "time_per_iteration": 4.655209541320801 + }, + { + "auxiliary_loss_clip": 0.01140365, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.05072367, + "balance_loss_mlp": 1.02234221, + "epoch": 0.3042837817525928, + "flos": 22998773548800.0, + "grad_norm": 2.6284704346086, + "language_loss": 0.77279079, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.79457664, + "num_input_tokens_seen": 108783370, + "step": 5061, + "time_per_iteration": 4.1857099533081055 + }, + { + "auxiliary_loss_clip": 0.01105214, + "auxiliary_loss_mlp": 0.01039505, + "balance_loss_clip": 1.05216312, + "balance_loss_mlp": 1.02225542, + "epoch": 0.30434390500526076, + "flos": 22082332924800.0, + "grad_norm": 1.9238999634605745, + "language_loss": 0.81891274, + "learning_rate": 3.261065640514415e-06, + "loss": 0.84035993, + "num_input_tokens_seen": 108797430, + "step": 5062, + "time_per_iteration": 2.7250373363494873 + }, + { + "auxiliary_loss_clip": 0.01132809, + "auxiliary_loss_mlp": 0.01036348, + "balance_loss_clip": 1.04662633, + "balance_loss_mlp": 1.02098203, + "epoch": 0.3044040282579287, + "flos": 25483504861440.0, + "grad_norm": 1.8479376829176948, + "language_loss": 0.74707627, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.76876783, + "num_input_tokens_seen": 108816945, + "step": 5063, + "time_per_iteration": 2.6387155055999756 + }, + { + "auxiliary_loss_clip": 0.01126143, + "auxiliary_loss_mlp": 0.00775405, + "balance_loss_clip": 1.04923415, + "balance_loss_mlp": 1.00135541, + "epoch": 0.30446415151059675, + "flos": 21945478106880.0, + "grad_norm": 1.691336757602503, + "language_loss": 0.84400523, + "learning_rate": 3.26046097371721e-06, + "loss": 0.86302078, + "num_input_tokens_seen": 108836615, + "step": 5064, + "time_per_iteration": 2.645256519317627 + }, + { + "auxiliary_loss_clip": 0.01125608, + "auxiliary_loss_mlp": 0.01040172, + "balance_loss_clip": 1.04725182, + "balance_loss_mlp": 1.02311337, + "epoch": 0.3045242747632647, + "flos": 16435416816000.0, + "grad_norm": 2.198572989748056, + "language_loss": 0.76257896, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.78423673, + "num_input_tokens_seen": 108855165, + "step": 5065, + "time_per_iteration": 4.119553565979004 + }, + { + "auxiliary_loss_clip": 0.01110206, + "auxiliary_loss_mlp": 0.01043438, + "balance_loss_clip": 1.04441273, + "balance_loss_mlp": 1.0260098, + "epoch": 0.3045843980159327, + "flos": 31540341957120.0, + "grad_norm": 1.985168773674731, + "language_loss": 0.62328786, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.64482433, + "num_input_tokens_seen": 108874690, + "step": 5066, + "time_per_iteration": 4.380331516265869 + }, + { + "auxiliary_loss_clip": 0.01112307, + "auxiliary_loss_mlp": 0.0104907, + "balance_loss_clip": 1.04790235, + "balance_loss_mlp": 1.03186774, + "epoch": 0.30464452126860064, + "flos": 17853636481920.0, + "grad_norm": 2.188592288059769, + "language_loss": 0.83193344, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.85354722, + "num_input_tokens_seen": 108893140, + "step": 5067, + "time_per_iteration": 2.628598213195801 + }, + { + "auxiliary_loss_clip": 0.01136833, + "auxiliary_loss_mlp": 0.01045137, + "balance_loss_clip": 1.04994464, + "balance_loss_mlp": 1.02904344, + "epoch": 0.3047046445212686, + "flos": 20631398947200.0, + "grad_norm": 4.883769852075586, + "language_loss": 0.62878895, + "learning_rate": 3.259251066652873e-06, + "loss": 0.65060866, + "num_input_tokens_seen": 108911880, + "step": 5068, + "time_per_iteration": 2.583193302154541 + }, + { + "auxiliary_loss_clip": 0.01127244, + "auxiliary_loss_mlp": 0.01039272, + "balance_loss_clip": 1.04866779, + "balance_loss_mlp": 1.02316117, + "epoch": 0.3047647677739366, + "flos": 21287594557440.0, + "grad_norm": 4.297243307498397, + "language_loss": 0.74780715, + "learning_rate": 3.258948470480793e-06, + "loss": 0.7694723, + "num_input_tokens_seen": 108930440, + "step": 5069, + "time_per_iteration": 2.643608570098877 + }, + { + "auxiliary_loss_clip": 0.01103787, + "auxiliary_loss_mlp": 0.01045252, + "balance_loss_clip": 1.04608154, + "balance_loss_mlp": 1.02922475, + "epoch": 0.30482489102660454, + "flos": 20995928121600.0, + "grad_norm": 1.9753352797934713, + "language_loss": 0.75726902, + "learning_rate": 3.258645826569261e-06, + "loss": 0.77875942, + "num_input_tokens_seen": 108949125, + "step": 5070, + "time_per_iteration": 2.715672016143799 + }, + { + "auxiliary_loss_clip": 0.01140483, + "auxiliary_loss_mlp": 0.0077507, + "balance_loss_clip": 1.04843533, + "balance_loss_mlp": 1.0012939, + "epoch": 0.3048850142792725, + "flos": 26290812988800.0, + "grad_norm": 1.7281078039111346, + "language_loss": 0.81636953, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.83552504, + "num_input_tokens_seen": 108972190, + "step": 5071, + "time_per_iteration": 2.635542869567871 + }, + { + "auxiliary_loss_clip": 0.01108476, + "auxiliary_loss_mlp": 0.01045674, + "balance_loss_clip": 1.04286063, + "balance_loss_mlp": 1.02776885, + "epoch": 0.30494513753194047, + "flos": 22346241125760.0, + "grad_norm": 2.0085610287172173, + "language_loss": 0.76208484, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.78362632, + "num_input_tokens_seen": 108990325, + "step": 5072, + "time_per_iteration": 2.6662180423736572 + }, + { + "auxiliary_loss_clip": 0.01099158, + "auxiliary_loss_mlp": 0.01044752, + "balance_loss_clip": 1.04694605, + "balance_loss_mlp": 1.02821743, + "epoch": 0.30500526078460843, + "flos": 19537667769600.0, + "grad_norm": 1.8424983506970039, + "language_loss": 0.70873296, + "learning_rate": 3.257737608512723e-06, + "loss": 0.7301721, + "num_input_tokens_seen": 109009505, + "step": 5073, + "time_per_iteration": 2.815281867980957 + }, + { + "auxiliary_loss_clip": 0.01133011, + "auxiliary_loss_mlp": 0.01055026, + "balance_loss_clip": 1.05032837, + "balance_loss_mlp": 1.03757334, + "epoch": 0.3050653840372764, + "flos": 14465321614080.0, + "grad_norm": 2.0666195830085434, + "language_loss": 0.76370406, + "learning_rate": 3.257434773758163e-06, + "loss": 0.78558439, + "num_input_tokens_seen": 109026350, + "step": 5074, + "time_per_iteration": 2.748568534851074 + }, + { + "auxiliary_loss_clip": 0.01115721, + "auxiliary_loss_mlp": 0.01037599, + "balance_loss_clip": 1.04921389, + "balance_loss_mlp": 1.02149391, + "epoch": 0.30512550728994436, + "flos": 24243796811520.0, + "grad_norm": 1.8649350467458667, + "language_loss": 0.74393201, + "learning_rate": 3.25713189132155e-06, + "loss": 0.76546526, + "num_input_tokens_seen": 109044165, + "step": 5075, + "time_per_iteration": 2.7015154361724854 + }, + { + "auxiliary_loss_clip": 0.01141745, + "auxiliary_loss_mlp": 0.01047345, + "balance_loss_clip": 1.0498178, + "balance_loss_mlp": 1.02825916, + "epoch": 0.30518563054261233, + "flos": 16360542915840.0, + "grad_norm": 2.030111139920667, + "language_loss": 0.75904357, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.78093445, + "num_input_tokens_seen": 109060665, + "step": 5076, + "time_per_iteration": 2.5811965465545654 + }, + { + "auxiliary_loss_clip": 0.01116901, + "auxiliary_loss_mlp": 0.01040641, + "balance_loss_clip": 1.04864156, + "balance_loss_mlp": 1.02466679, + "epoch": 0.30524575379528035, + "flos": 21579584215680.0, + "grad_norm": 1.6479970241835653, + "language_loss": 0.79240596, + "learning_rate": 3.25652598344811e-06, + "loss": 0.81398141, + "num_input_tokens_seen": 109080035, + "step": 5077, + "time_per_iteration": 2.680205821990967 + }, + { + "auxiliary_loss_clip": 0.01087088, + "auxiliary_loss_mlp": 0.01033699, + "balance_loss_clip": 1.04356635, + "balance_loss_mlp": 1.01881564, + "epoch": 0.3053058770479483, + "flos": 16545231671040.0, + "grad_norm": 1.6765288024346336, + "language_loss": 0.74525034, + "learning_rate": 3.256222958034259e-06, + "loss": 0.76645821, + "num_input_tokens_seen": 109097385, + "step": 5078, + "time_per_iteration": 2.7247111797332764 + }, + { + "auxiliary_loss_clip": 0.01085086, + "auxiliary_loss_mlp": 0.01054049, + "balance_loss_clip": 1.04356313, + "balance_loss_mlp": 1.03728211, + "epoch": 0.3053660003006163, + "flos": 12312907954560.0, + "grad_norm": 1.7442741256404064, + "language_loss": 0.66648543, + "learning_rate": 3.255919884984307e-06, + "loss": 0.68787676, + "num_input_tokens_seen": 109115495, + "step": 5079, + "time_per_iteration": 2.746490716934204 + }, + { + "auxiliary_loss_clip": 0.01127155, + "auxiliary_loss_mlp": 0.01040504, + "balance_loss_clip": 1.04811811, + "balance_loss_mlp": 1.0248282, + "epoch": 0.30542612355328425, + "flos": 23112287504640.0, + "grad_norm": 2.3583709354228213, + "language_loss": 0.79841697, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.82009357, + "num_input_tokens_seen": 109134235, + "step": 5080, + "time_per_iteration": 2.7156612873077393 + }, + { + "auxiliary_loss_clip": 0.01124116, + "auxiliary_loss_mlp": 0.00772863, + "balance_loss_clip": 1.04919219, + "balance_loss_mlp": 1.00125837, + "epoch": 0.3054862468059522, + "flos": 24389450461440.0, + "grad_norm": 2.2636550763480074, + "language_loss": 0.81280053, + "learning_rate": 3.255313596022074e-06, + "loss": 0.8317703, + "num_input_tokens_seen": 109152760, + "step": 5081, + "time_per_iteration": 2.6763248443603516 + }, + { + "auxiliary_loss_clip": 0.01120003, + "auxiliary_loss_mlp": 0.01044443, + "balance_loss_clip": 1.04644883, + "balance_loss_mlp": 1.02843297, + "epoch": 0.3055463700586202, + "flos": 29386096704000.0, + "grad_norm": 7.924214405919456, + "language_loss": 0.71839154, + "learning_rate": 3.255010380132783e-06, + "loss": 0.74003601, + "num_input_tokens_seen": 109173925, + "step": 5082, + "time_per_iteration": 2.7159903049468994 + }, + { + "auxiliary_loss_clip": 0.0112721, + "auxiliary_loss_mlp": 0.01043614, + "balance_loss_clip": 1.04611564, + "balance_loss_mlp": 1.02554226, + "epoch": 0.30560649331128814, + "flos": 25591775431680.0, + "grad_norm": 2.25447896755926, + "language_loss": 0.73108822, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.75279647, + "num_input_tokens_seen": 109192510, + "step": 5083, + "time_per_iteration": 2.646739959716797 + }, + { + "auxiliary_loss_clip": 0.01107487, + "auxiliary_loss_mlp": 0.00775151, + "balance_loss_clip": 1.04263341, + "balance_loss_mlp": 1.00127327, + "epoch": 0.3056666165639561, + "flos": 19128321400320.0, + "grad_norm": 1.7470718607902291, + "language_loss": 0.71378291, + "learning_rate": 3.254403805595344e-06, + "loss": 0.73260927, + "num_input_tokens_seen": 109210885, + "step": 5084, + "time_per_iteration": 2.6846230030059814 + }, + { + "auxiliary_loss_clip": 0.01099017, + "auxiliary_loss_mlp": 0.01047221, + "balance_loss_clip": 1.04366112, + "balance_loss_mlp": 1.02929187, + "epoch": 0.30572673981662407, + "flos": 15523860441600.0, + "grad_norm": 1.8852357422602322, + "language_loss": 0.78966236, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.81112474, + "num_input_tokens_seen": 109229180, + "step": 5085, + "time_per_iteration": 2.7193636894226074 + }, + { + "auxiliary_loss_clip": 0.01130512, + "auxiliary_loss_mlp": 0.01034677, + "balance_loss_clip": 1.04483652, + "balance_loss_mlp": 1.01910806, + "epoch": 0.30578686306929204, + "flos": 21506541909120.0, + "grad_norm": 1.9742516674355037, + "language_loss": 0.78476739, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.80641937, + "num_input_tokens_seen": 109249510, + "step": 5086, + "time_per_iteration": 2.5860135555267334 + }, + { + "auxiliary_loss_clip": 0.01103374, + "auxiliary_loss_mlp": 0.01052848, + "balance_loss_clip": 1.04314184, + "balance_loss_mlp": 1.03509736, + "epoch": 0.30584698632196, + "flos": 20954271323520.0, + "grad_norm": 1.8682002339545791, + "language_loss": 0.76727784, + "learning_rate": 3.253493587064563e-06, + "loss": 0.78884006, + "num_input_tokens_seen": 109268200, + "step": 5087, + "time_per_iteration": 2.732639789581299 + }, + { + "auxiliary_loss_clip": 0.01125241, + "auxiliary_loss_mlp": 0.01041401, + "balance_loss_clip": 1.04509556, + "balance_loss_mlp": 1.02450943, + "epoch": 0.30590710957462797, + "flos": 24681116897280.0, + "grad_norm": 2.048016576932303, + "language_loss": 0.72534674, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.74701315, + "num_input_tokens_seen": 109288370, + "step": 5088, + "time_per_iteration": 2.66654109954834 + }, + { + "auxiliary_loss_clip": 0.01128516, + "auxiliary_loss_mlp": 0.01043444, + "balance_loss_clip": 1.04584277, + "balance_loss_mlp": 1.02587295, + "epoch": 0.30596723282729593, + "flos": 17086907744640.0, + "grad_norm": 2.359735204382993, + "language_loss": 0.79327172, + "learning_rate": 3.252886537028521e-06, + "loss": 0.8149913, + "num_input_tokens_seen": 109306730, + "step": 5089, + "time_per_iteration": 2.613231897354126 + }, + { + "auxiliary_loss_clip": 0.01110444, + "auxiliary_loss_mlp": 0.01041514, + "balance_loss_clip": 1.04634953, + "balance_loss_mlp": 1.02470577, + "epoch": 0.30602735607996395, + "flos": 22857106308480.0, + "grad_norm": 1.8271327477144206, + "language_loss": 0.77158219, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.79310179, + "num_input_tokens_seen": 109327360, + "step": 5090, + "time_per_iteration": 2.7469358444213867 + }, + { + "auxiliary_loss_clip": 0.01116264, + "auxiliary_loss_mlp": 0.01050158, + "balance_loss_clip": 1.04506445, + "balance_loss_mlp": 1.03317034, + "epoch": 0.3060874793326319, + "flos": 29861482227840.0, + "grad_norm": 1.7853121536190235, + "language_loss": 0.76108491, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.78274912, + "num_input_tokens_seen": 109348135, + "step": 5091, + "time_per_iteration": 2.7344727516174316 + }, + { + "auxiliary_loss_clip": 0.01076722, + "auxiliary_loss_mlp": 0.01049007, + "balance_loss_clip": 1.04582906, + "balance_loss_mlp": 1.02905178, + "epoch": 0.3061476025852999, + "flos": 20448577699200.0, + "grad_norm": 1.9985396703734173, + "language_loss": 0.71938324, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.74064058, + "num_input_tokens_seen": 109366220, + "step": 5092, + "time_per_iteration": 2.767212390899658 + }, + { + "auxiliary_loss_clip": 0.01114871, + "auxiliary_loss_mlp": 0.01040516, + "balance_loss_clip": 1.04740167, + "balance_loss_mlp": 1.0246855, + "epoch": 0.30620772583796785, + "flos": 19391475415680.0, + "grad_norm": 3.231748461445431, + "language_loss": 0.82655406, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.84810787, + "num_input_tokens_seen": 109385260, + "step": 5093, + "time_per_iteration": 2.705643892288208 + }, + { + "auxiliary_loss_clip": 0.01136927, + "auxiliary_loss_mlp": 0.00773786, + "balance_loss_clip": 1.04842925, + "balance_loss_mlp": 1.00142932, + "epoch": 0.3062678490906358, + "flos": 24024562151040.0, + "grad_norm": 1.6185046249293755, + "language_loss": 0.75340986, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.77251703, + "num_input_tokens_seen": 109405025, + "step": 5094, + "time_per_iteration": 2.6171963214874268 + }, + { + "auxiliary_loss_clip": 0.01112613, + "auxiliary_loss_mlp": 0.01042135, + "balance_loss_clip": 1.04798305, + "balance_loss_mlp": 1.02639914, + "epoch": 0.3063279723433038, + "flos": 19754639873280.0, + "grad_norm": 2.1053112950674824, + "language_loss": 0.75988996, + "learning_rate": 3.251064247058868e-06, + "loss": 0.7814374, + "num_input_tokens_seen": 109422465, + "step": 5095, + "time_per_iteration": 2.7002673149108887 + }, + { + "auxiliary_loss_clip": 0.0112272, + "auxiliary_loss_mlp": 0.01043966, + "balance_loss_clip": 1.04654729, + "balance_loss_mlp": 1.0278492, + "epoch": 0.30638809559597174, + "flos": 22450022496000.0, + "grad_norm": 8.237851994820396, + "language_loss": 0.80608332, + "learning_rate": 3.250760365955042e-06, + "loss": 0.82775021, + "num_input_tokens_seen": 109440575, + "step": 5096, + "time_per_iteration": 2.675551414489746 + }, + { + "auxiliary_loss_clip": 0.01125431, + "auxiliary_loss_mlp": 0.01036388, + "balance_loss_clip": 1.04639602, + "balance_loss_mlp": 1.02030659, + "epoch": 0.3064482188486397, + "flos": 17165157523200.0, + "grad_norm": 3.1166257890970566, + "language_loss": 0.81695235, + "learning_rate": 3.250456437422258e-06, + "loss": 0.83857059, + "num_input_tokens_seen": 109459050, + "step": 5097, + "time_per_iteration": 2.6616358757019043 + }, + { + "auxiliary_loss_clip": 0.01138165, + "auxiliary_loss_mlp": 0.01042971, + "balance_loss_clip": 1.04782009, + "balance_loss_mlp": 1.02522099, + "epoch": 0.3065083421013077, + "flos": 23768483114880.0, + "grad_norm": 2.1722798378639663, + "language_loss": 0.78152639, + "learning_rate": 3.250152461472041e-06, + "loss": 0.80333775, + "num_input_tokens_seen": 109475860, + "step": 5098, + "time_per_iteration": 2.581339120864868 + }, + { + "auxiliary_loss_clip": 0.01093696, + "auxiliary_loss_mlp": 0.01039814, + "balance_loss_clip": 1.04763365, + "balance_loss_mlp": 1.02302897, + "epoch": 0.30656846535397564, + "flos": 26431833784320.0, + "grad_norm": 1.8342329708039284, + "language_loss": 0.84488571, + "learning_rate": 3.249848438115917e-06, + "loss": 0.86622083, + "num_input_tokens_seen": 109494760, + "step": 5099, + "time_per_iteration": 2.761580467224121 + }, + { + "auxiliary_loss_clip": 0.0113763, + "auxiliary_loss_mlp": 0.01044142, + "balance_loss_clip": 1.04598331, + "balance_loss_mlp": 1.02683902, + "epoch": 0.3066285886066436, + "flos": 26651786716800.0, + "grad_norm": 1.7645297710058767, + "language_loss": 0.85650218, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.87831986, + "num_input_tokens_seen": 109516480, + "step": 5100, + "time_per_iteration": 4.130753517150879 + }, + { + "auxiliary_loss_clip": 0.01099546, + "auxiliary_loss_mlp": 0.01040494, + "balance_loss_clip": 1.04097986, + "balance_loss_mlp": 1.02268374, + "epoch": 0.30668871185931157, + "flos": 15049947375360.0, + "grad_norm": 1.8121599631247622, + "language_loss": 0.78980827, + "learning_rate": 3.249240249232065e-06, + "loss": 0.81120867, + "num_input_tokens_seen": 109534615, + "step": 5101, + "time_per_iteration": 4.324965000152588 + }, + { + "auxiliary_loss_clip": 0.01102347, + "auxiliary_loss_mlp": 0.01054476, + "balance_loss_clip": 1.04654586, + "balance_loss_mlp": 1.03549778, + "epoch": 0.30674883511197953, + "flos": 20082109190400.0, + "grad_norm": 3.103169454759946, + "language_loss": 0.8002606, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.82182884, + "num_input_tokens_seen": 109554040, + "step": 5102, + "time_per_iteration": 2.6799395084381104 + }, + { + "auxiliary_loss_clip": 0.01142197, + "auxiliary_loss_mlp": 0.01041215, + "balance_loss_clip": 1.05097044, + "balance_loss_mlp": 1.02254653, + "epoch": 0.30680895836464755, + "flos": 22893807029760.0, + "grad_norm": 2.1213785434731416, + "language_loss": 0.88774347, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.90957761, + "num_input_tokens_seen": 109574345, + "step": 5103, + "time_per_iteration": 2.65173077583313 + }, + { + "auxiliary_loss_clip": 0.01117159, + "auxiliary_loss_mlp": 0.01047865, + "balance_loss_clip": 1.04379106, + "balance_loss_mlp": 1.03051972, + "epoch": 0.3068690816173155, + "flos": 23696159080320.0, + "grad_norm": 1.7904968866721789, + "language_loss": 0.73977435, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.7614246, + "num_input_tokens_seen": 109593670, + "step": 5104, + "time_per_iteration": 4.15887975692749 + }, + { + "auxiliary_loss_clip": 0.01124364, + "auxiliary_loss_mlp": 0.00776702, + "balance_loss_clip": 1.04378068, + "balance_loss_mlp": 1.00128829, + "epoch": 0.3069292048699835, + "flos": 23551044134400.0, + "grad_norm": 3.7241561762804496, + "language_loss": 0.72777617, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.74678683, + "num_input_tokens_seen": 109613385, + "step": 5105, + "time_per_iteration": 2.657212972640991 + }, + { + "auxiliary_loss_clip": 0.01112354, + "auxiliary_loss_mlp": 0.01041782, + "balance_loss_clip": 1.0451684, + "balance_loss_mlp": 1.02401972, + "epoch": 0.30698932812265145, + "flos": 24531656405760.0, + "grad_norm": 1.9297281358185925, + "language_loss": 0.87290782, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.89444917, + "num_input_tokens_seen": 109632395, + "step": 5106, + "time_per_iteration": 4.409428119659424 + }, + { + "auxiliary_loss_clip": 0.0110831, + "auxiliary_loss_mlp": 0.01052851, + "balance_loss_clip": 1.04540682, + "balance_loss_mlp": 1.03390849, + "epoch": 0.3070494513753194, + "flos": 20996430912000.0, + "grad_norm": 2.254355123120303, + "language_loss": 0.71420276, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.73581433, + "num_input_tokens_seen": 109651380, + "step": 5107, + "time_per_iteration": 2.7320871353149414 + }, + { + "auxiliary_loss_clip": 0.01101295, + "auxiliary_loss_mlp": 0.0104767, + "balance_loss_clip": 1.04618347, + "balance_loss_mlp": 1.03034878, + "epoch": 0.3071095746279874, + "flos": 19025940660480.0, + "grad_norm": 2.1230574515432705, + "language_loss": 0.72282934, + "learning_rate": 3.247110096547814e-06, + "loss": 0.74431896, + "num_input_tokens_seen": 109670240, + "step": 5108, + "time_per_iteration": 2.720196485519409 + }, + { + "auxiliary_loss_clip": 0.01112658, + "auxiliary_loss_mlp": 0.01040837, + "balance_loss_clip": 1.04619241, + "balance_loss_mlp": 1.02325416, + "epoch": 0.30716969788065535, + "flos": 21215521918080.0, + "grad_norm": 3.0053852764205695, + "language_loss": 0.8601433, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.88167822, + "num_input_tokens_seen": 109690810, + "step": 5109, + "time_per_iteration": 2.715580940246582 + }, + { + "auxiliary_loss_clip": 0.01109383, + "auxiliary_loss_mlp": 0.01036759, + "balance_loss_clip": 1.04432368, + "balance_loss_mlp": 1.02017736, + "epoch": 0.3072298211333233, + "flos": 25772765086080.0, + "grad_norm": 1.7463183423202828, + "language_loss": 0.67169911, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.69316053, + "num_input_tokens_seen": 109711145, + "step": 5110, + "time_per_iteration": 2.7133336067199707 + }, + { + "auxiliary_loss_clip": 0.01126653, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.04854119, + "balance_loss_mlp": 1.01736796, + "epoch": 0.3072899443859913, + "flos": 25848931875840.0, + "grad_norm": 1.4548971516988844, + "language_loss": 0.76673061, + "learning_rate": 3.246196464379919e-06, + "loss": 0.78833127, + "num_input_tokens_seen": 109731425, + "step": 5111, + "time_per_iteration": 2.692505121231079 + }, + { + "auxiliary_loss_clip": 0.01140411, + "auxiliary_loss_mlp": 0.0103997, + "balance_loss_clip": 1.04979658, + "balance_loss_mlp": 1.02360249, + "epoch": 0.30735006763865924, + "flos": 25922800195200.0, + "grad_norm": 3.7694679470365244, + "language_loss": 0.67143333, + "learning_rate": 3.245891825796765e-06, + "loss": 0.69323719, + "num_input_tokens_seen": 109752720, + "step": 5112, + "time_per_iteration": 2.6441125869750977 + }, + { + "auxiliary_loss_clip": 0.01133822, + "auxiliary_loss_mlp": 0.01044497, + "balance_loss_clip": 1.05147326, + "balance_loss_mlp": 1.02482784, + "epoch": 0.3074101908913272, + "flos": 30917004312960.0, + "grad_norm": 2.062737517485213, + "language_loss": 0.79524493, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.81702805, + "num_input_tokens_seen": 109772840, + "step": 5113, + "time_per_iteration": 2.7166647911071777 + }, + { + "auxiliary_loss_clip": 0.01102438, + "auxiliary_loss_mlp": 0.00774651, + "balance_loss_clip": 1.04638815, + "balance_loss_mlp": 1.00138378, + "epoch": 0.30747031414399517, + "flos": 18401058731520.0, + "grad_norm": 2.08885217843665, + "language_loss": 0.76926446, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.78803539, + "num_input_tokens_seen": 109790150, + "step": 5114, + "time_per_iteration": 2.6842217445373535 + }, + { + "auxiliary_loss_clip": 0.01100955, + "auxiliary_loss_mlp": 0.01034415, + "balance_loss_clip": 1.0446732, + "balance_loss_mlp": 1.01589036, + "epoch": 0.30753043739666314, + "flos": 22633166966400.0, + "grad_norm": 2.179333764681939, + "language_loss": 0.62607706, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.64743078, + "num_input_tokens_seen": 109807985, + "step": 5115, + "time_per_iteration": 2.7709848880767822 + }, + { + "auxiliary_loss_clip": 0.0113067, + "auxiliary_loss_mlp": 0.01041883, + "balance_loss_clip": 1.04829907, + "balance_loss_mlp": 1.02557516, + "epoch": 0.3075905606493311, + "flos": 27344072517120.0, + "grad_norm": 2.4707888757665684, + "language_loss": 0.82835108, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.85007656, + "num_input_tokens_seen": 109825920, + "step": 5116, + "time_per_iteration": 2.6891255378723145 + }, + { + "auxiliary_loss_clip": 0.01115169, + "auxiliary_loss_mlp": 0.01050095, + "balance_loss_clip": 1.04928303, + "balance_loss_mlp": 1.03291702, + "epoch": 0.3076506839019991, + "flos": 22090808534400.0, + "grad_norm": 1.792550086960714, + "language_loss": 0.75943851, + "learning_rate": 3.244367924446952e-06, + "loss": 0.78109109, + "num_input_tokens_seen": 109846220, + "step": 5117, + "time_per_iteration": 2.6685919761657715 + }, + { + "auxiliary_loss_clip": 0.01096356, + "auxiliary_loss_mlp": 0.010422, + "balance_loss_clip": 1.04583359, + "balance_loss_mlp": 1.02309084, + "epoch": 0.3077108071546671, + "flos": 21289533891840.0, + "grad_norm": 2.509228810910763, + "language_loss": 0.71450555, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.7358911, + "num_input_tokens_seen": 109863870, + "step": 5118, + "time_per_iteration": 2.7360472679138184 + }, + { + "auxiliary_loss_clip": 0.0109679, + "auxiliary_loss_mlp": 0.01040047, + "balance_loss_clip": 1.05069757, + "balance_loss_mlp": 1.02279758, + "epoch": 0.30777093040733505, + "flos": 21430985650560.0, + "grad_norm": 1.6950758291291428, + "language_loss": 0.74499059, + "learning_rate": 3.243758033520219e-06, + "loss": 0.76635897, + "num_input_tokens_seen": 109883500, + "step": 5119, + "time_per_iteration": 2.7963552474975586 + }, + { + "auxiliary_loss_clip": 0.01133391, + "auxiliary_loss_mlp": 0.01054336, + "balance_loss_clip": 1.05088997, + "balance_loss_mlp": 1.03520322, + "epoch": 0.307831053660003, + "flos": 23149275534720.0, + "grad_norm": 2.3083726349779785, + "language_loss": 0.79968077, + "learning_rate": 3.243453017305926e-06, + "loss": 0.821558, + "num_input_tokens_seen": 109904620, + "step": 5120, + "time_per_iteration": 2.7600536346435547 + }, + { + "auxiliary_loss_clip": 0.01127117, + "auxiliary_loss_mlp": 0.01045491, + "balance_loss_clip": 1.04772663, + "balance_loss_mlp": 1.02994657, + "epoch": 0.307891176912671, + "flos": 17019755268480.0, + "grad_norm": 1.7119475154385397, + "language_loss": 0.79864663, + "learning_rate": 3.24314795393977e-06, + "loss": 0.8203727, + "num_input_tokens_seen": 109922275, + "step": 5121, + "time_per_iteration": 2.6204211711883545 + }, + { + "auxiliary_loss_clip": 0.01105091, + "auxiliary_loss_mlp": 0.01039616, + "balance_loss_clip": 1.04669154, + "balance_loss_mlp": 1.02292657, + "epoch": 0.30795130016533895, + "flos": 27705046245120.0, + "grad_norm": 1.4682711249191758, + "language_loss": 0.82526803, + "learning_rate": 3.242842843433319e-06, + "loss": 0.84671509, + "num_input_tokens_seen": 109944265, + "step": 5122, + "time_per_iteration": 2.7210805416107178 + }, + { + "auxiliary_loss_clip": 0.01052784, + "auxiliary_loss_mlp": 0.01010188, + "balance_loss_clip": 1.03048515, + "balance_loss_mlp": 1.00826919, + "epoch": 0.3080114234180069, + "flos": 69058699591680.0, + "grad_norm": 0.7449761063336078, + "language_loss": 0.58609217, + "learning_rate": 3.242537685798143e-06, + "loss": 0.60672188, + "num_input_tokens_seen": 110014160, + "step": 5123, + "time_per_iteration": 3.303093433380127 + }, + { + "auxiliary_loss_clip": 0.01133855, + "auxiliary_loss_mlp": 0.00776294, + "balance_loss_clip": 1.04937184, + "balance_loss_mlp": 1.00136161, + "epoch": 0.3080715466706749, + "flos": 24060221377920.0, + "grad_norm": 1.5927838238117058, + "language_loss": 0.83550704, + "learning_rate": 3.242232481045813e-06, + "loss": 0.85460854, + "num_input_tokens_seen": 110034865, + "step": 5124, + "time_per_iteration": 2.7226438522338867 + }, + { + "auxiliary_loss_clip": 0.01143185, + "auxiliary_loss_mlp": 0.01038734, + "balance_loss_clip": 1.05123234, + "balance_loss_mlp": 1.02206898, + "epoch": 0.30813166992334284, + "flos": 25848680480640.0, + "grad_norm": 2.0767599752543657, + "language_loss": 0.79332423, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.81514347, + "num_input_tokens_seen": 110052930, + "step": 5125, + "time_per_iteration": 2.6514153480529785 + }, + { + "auxiliary_loss_clip": 0.01125892, + "auxiliary_loss_mlp": 0.01035278, + "balance_loss_clip": 1.04636812, + "balance_loss_mlp": 1.01694369, + "epoch": 0.3081917931760108, + "flos": 20449619193600.0, + "grad_norm": 1.764828299724452, + "language_loss": 0.64689863, + "learning_rate": 3.241621930235989e-06, + "loss": 0.66851032, + "num_input_tokens_seen": 110071765, + "step": 5126, + "time_per_iteration": 2.6408963203430176 + }, + { + "auxiliary_loss_clip": 0.01099238, + "auxiliary_loss_mlp": 0.01044536, + "balance_loss_clip": 1.05009556, + "balance_loss_mlp": 1.02698874, + "epoch": 0.3082519164286788, + "flos": 22166257052160.0, + "grad_norm": 1.5302214532460006, + "language_loss": 0.86800975, + "learning_rate": 3.241316584201646e-06, + "loss": 0.88944745, + "num_input_tokens_seen": 110092660, + "step": 5127, + "time_per_iteration": 2.793318748474121 + }, + { + "auxiliary_loss_clip": 0.01086461, + "auxiliary_loss_mlp": 0.01045743, + "balance_loss_clip": 1.04368591, + "balance_loss_mlp": 1.02862501, + "epoch": 0.30831203968134674, + "flos": 28913404700160.0, + "grad_norm": 1.6968110238499217, + "language_loss": 0.69155616, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.71287817, + "num_input_tokens_seen": 110114960, + "step": 5128, + "time_per_iteration": 2.777060031890869 + }, + { + "auxiliary_loss_clip": 0.01130807, + "auxiliary_loss_mlp": 0.00775186, + "balance_loss_clip": 1.05044532, + "balance_loss_mlp": 1.00153518, + "epoch": 0.3083721629340147, + "flos": 25667726739840.0, + "grad_norm": 1.7900045405252538, + "language_loss": 0.71075535, + "learning_rate": 3.240705750931993e-06, + "loss": 0.7298153, + "num_input_tokens_seen": 110135750, + "step": 5129, + "time_per_iteration": 2.7317588329315186 + }, + { + "auxiliary_loss_clip": 0.01030892, + "auxiliary_loss_mlp": 0.01007708, + "balance_loss_clip": 1.0286324, + "balance_loss_mlp": 1.00588405, + "epoch": 0.3084322861866827, + "flos": 68212679581440.0, + "grad_norm": 0.8221299931057983, + "language_loss": 0.59160221, + "learning_rate": 3.240400263719846e-06, + "loss": 0.61198819, + "num_input_tokens_seen": 110189480, + "step": 5130, + "time_per_iteration": 3.2141849994659424 + }, + { + "auxiliary_loss_clip": 0.01115906, + "auxiliary_loss_mlp": 0.01041214, + "balance_loss_clip": 1.04513061, + "balance_loss_mlp": 1.02297497, + "epoch": 0.3084924094393507, + "flos": 20296495514880.0, + "grad_norm": 2.986922621878904, + "language_loss": 0.73292506, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.75449622, + "num_input_tokens_seen": 110206445, + "step": 5131, + "time_per_iteration": 2.6520204544067383 + }, + { + "auxiliary_loss_clip": 0.01099541, + "auxiliary_loss_mlp": 0.010345, + "balance_loss_clip": 1.04438055, + "balance_loss_mlp": 1.01822817, + "epoch": 0.30855253269201866, + "flos": 23949831905280.0, + "grad_norm": 1.569237882810685, + "language_loss": 0.71420097, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.73554134, + "num_input_tokens_seen": 110226845, + "step": 5132, + "time_per_iteration": 2.8439948558807373 + }, + { + "auxiliary_loss_clip": 0.01134935, + "auxiliary_loss_mlp": 0.00774998, + "balance_loss_clip": 1.04922795, + "balance_loss_mlp": 1.00131333, + "epoch": 0.3086126559446866, + "flos": 19281876042240.0, + "grad_norm": 1.9070570981004293, + "language_loss": 0.89846021, + "learning_rate": 3.239483519913136e-06, + "loss": 0.91755956, + "num_input_tokens_seen": 110244095, + "step": 5133, + "time_per_iteration": 2.5872273445129395 + }, + { + "auxiliary_loss_clip": 0.01122429, + "auxiliary_loss_mlp": 0.01043613, + "balance_loss_clip": 1.04856205, + "balance_loss_mlp": 1.02580321, + "epoch": 0.3086727791973546, + "flos": 33760770019200.0, + "grad_norm": 1.7209646054950307, + "language_loss": 0.67267555, + "learning_rate": 3.239177844626102e-06, + "loss": 0.69433594, + "num_input_tokens_seen": 110264240, + "step": 5134, + "time_per_iteration": 2.7872183322906494 + }, + { + "auxiliary_loss_clip": 0.01124541, + "auxiliary_loss_mlp": 0.01041364, + "balance_loss_clip": 1.04777277, + "balance_loss_mlp": 1.02393556, + "epoch": 0.30873290245002255, + "flos": 16034151006720.0, + "grad_norm": 1.9145067593542924, + "language_loss": 0.82794344, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.84960246, + "num_input_tokens_seen": 110282450, + "step": 5135, + "time_per_iteration": 2.6355140209198 + }, + { + "auxiliary_loss_clip": 0.01026512, + "auxiliary_loss_mlp": 0.01003035, + "balance_loss_clip": 1.02417064, + "balance_loss_mlp": 1.00113988, + "epoch": 0.3087930257026905, + "flos": 65048304055680.0, + "grad_norm": 0.6923211570832432, + "language_loss": 0.55314827, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.57344365, + "num_input_tokens_seen": 110343715, + "step": 5136, + "time_per_iteration": 3.31300687789917 + }, + { + "auxiliary_loss_clip": 0.01118007, + "auxiliary_loss_mlp": 0.00775624, + "balance_loss_clip": 1.04826593, + "balance_loss_mlp": 1.00124264, + "epoch": 0.3088531489553585, + "flos": 74738829824640.0, + "grad_norm": 2.038560176689262, + "language_loss": 0.76524079, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.78417706, + "num_input_tokens_seen": 110368430, + "step": 5137, + "time_per_iteration": 3.1237831115722656 + }, + { + "auxiliary_loss_clip": 0.01102933, + "auxiliary_loss_mlp": 0.010362, + "balance_loss_clip": 1.04592168, + "balance_loss_mlp": 1.02058411, + "epoch": 0.30891327220802645, + "flos": 21142300043520.0, + "grad_norm": 1.655645044155811, + "language_loss": 0.80083114, + "learning_rate": 3.237954673696424e-06, + "loss": 0.82222247, + "num_input_tokens_seen": 110386735, + "step": 5138, + "time_per_iteration": 2.775902509689331 + }, + { + "auxiliary_loss_clip": 0.01078807, + "auxiliary_loss_mlp": 0.0104514, + "balance_loss_clip": 1.03953338, + "balance_loss_mlp": 1.02583957, + "epoch": 0.3089733954606944, + "flos": 25664494515840.0, + "grad_norm": 1.3823165076112356, + "language_loss": 0.81288958, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.8341291, + "num_input_tokens_seen": 110406820, + "step": 5139, + "time_per_iteration": 4.48141074180603 + }, + { + "auxiliary_loss_clip": 0.01127056, + "auxiliary_loss_mlp": 0.01044845, + "balance_loss_clip": 1.04565382, + "balance_loss_mlp": 1.02575994, + "epoch": 0.3090335187133624, + "flos": 19427350124160.0, + "grad_norm": 2.1511159973406593, + "language_loss": 0.77260494, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.79432398, + "num_input_tokens_seen": 110424225, + "step": 5140, + "time_per_iteration": 4.1141037940979 + }, + { + "auxiliary_loss_clip": 0.01099157, + "auxiliary_loss_mlp": 0.01048812, + "balance_loss_clip": 1.04282403, + "balance_loss_mlp": 1.03233695, + "epoch": 0.30909364196603034, + "flos": 20011329440640.0, + "grad_norm": 1.77105935640331, + "language_loss": 0.78806967, + "learning_rate": 3.237036802553252e-06, + "loss": 0.80954939, + "num_input_tokens_seen": 110443310, + "step": 5141, + "time_per_iteration": 2.6497676372528076 + }, + { + "auxiliary_loss_clip": 0.01119702, + "auxiliary_loss_mlp": 0.0104967, + "balance_loss_clip": 1.04679799, + "balance_loss_mlp": 1.03138292, + "epoch": 0.3091537652186983, + "flos": 19677575243520.0, + "grad_norm": 2.261971688212118, + "language_loss": 0.86853915, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.89023286, + "num_input_tokens_seen": 110460215, + "step": 5142, + "time_per_iteration": 2.635495662689209 + }, + { + "auxiliary_loss_clip": 0.01127738, + "auxiliary_loss_mlp": 0.01048033, + "balance_loss_clip": 1.04709148, + "balance_loss_mlp": 1.03136778, + "epoch": 0.3092138884713663, + "flos": 17020042577280.0, + "grad_norm": 1.7222677689082588, + "language_loss": 0.79352587, + "learning_rate": 3.23642465389567e-06, + "loss": 0.81528366, + "num_input_tokens_seen": 110479385, + "step": 5143, + "time_per_iteration": 2.672196388244629 + }, + { + "auxiliary_loss_clip": 0.01108121, + "auxiliary_loss_mlp": 0.01046466, + "balance_loss_clip": 1.04830873, + "balance_loss_mlp": 1.02858496, + "epoch": 0.3092740117240343, + "flos": 25009986844800.0, + "grad_norm": 1.849759687088619, + "language_loss": 0.72079581, + "learning_rate": 3.236118509233055e-06, + "loss": 0.7423417, + "num_input_tokens_seen": 110499885, + "step": 5144, + "time_per_iteration": 4.2138121128082275 + }, + { + "auxiliary_loss_clip": 0.01130266, + "auxiliary_loss_mlp": 0.0105055, + "balance_loss_clip": 1.04617548, + "balance_loss_mlp": 1.03297877, + "epoch": 0.30933413497670226, + "flos": 25590410714880.0, + "grad_norm": 1.9804845877808144, + "language_loss": 0.74328083, + "learning_rate": 3.235812317696702e-06, + "loss": 0.76508898, + "num_input_tokens_seen": 110519690, + "step": 5145, + "time_per_iteration": 4.315273761749268 + }, + { + "auxiliary_loss_clip": 0.01110927, + "auxiliary_loss_mlp": 0.01045527, + "balance_loss_clip": 1.04372048, + "balance_loss_mlp": 1.02788365, + "epoch": 0.3093942582293702, + "flos": 24389665943040.0, + "grad_norm": 1.6657569174801012, + "language_loss": 0.76391518, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.78547978, + "num_input_tokens_seen": 110540520, + "step": 5146, + "time_per_iteration": 2.7259135246276855 + }, + { + "auxiliary_loss_clip": 0.0111122, + "auxiliary_loss_mlp": 0.01042459, + "balance_loss_clip": 1.04380584, + "balance_loss_mlp": 1.02553141, + "epoch": 0.3094543814820382, + "flos": 19646441130240.0, + "grad_norm": 2.148705061921787, + "language_loss": 0.66899967, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.6905365, + "num_input_tokens_seen": 110557950, + "step": 5147, + "time_per_iteration": 2.6804444789886475 + }, + { + "auxiliary_loss_clip": 0.01132642, + "auxiliary_loss_mlp": 0.0104049, + "balance_loss_clip": 1.04998684, + "balance_loss_mlp": 1.0238843, + "epoch": 0.30951450473470615, + "flos": 25663812157440.0, + "grad_norm": 2.0634223914225585, + "language_loss": 0.74823105, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.76996237, + "num_input_tokens_seen": 110578215, + "step": 5148, + "time_per_iteration": 2.637509346008301 + }, + { + "auxiliary_loss_clip": 0.0113505, + "auxiliary_loss_mlp": 0.01047495, + "balance_loss_clip": 1.0492146, + "balance_loss_mlp": 1.02901721, + "epoch": 0.3095746279873741, + "flos": 12020415505920.0, + "grad_norm": 2.1367843023537287, + "language_loss": 0.73082036, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.75264585, + "num_input_tokens_seen": 110592990, + "step": 5149, + "time_per_iteration": 2.6134157180786133 + }, + { + "auxiliary_loss_clip": 0.01097892, + "auxiliary_loss_mlp": 0.0104428, + "balance_loss_clip": 1.04601955, + "balance_loss_mlp": 1.02615988, + "epoch": 0.3096347512400421, + "flos": 23623044946560.0, + "grad_norm": 2.0797901111423274, + "language_loss": 0.845025, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.86644673, + "num_input_tokens_seen": 110612130, + "step": 5150, + "time_per_iteration": 2.7804181575775146 + }, + { + "auxiliary_loss_clip": 0.01086512, + "auxiliary_loss_mlp": 0.01047133, + "balance_loss_clip": 1.04168093, + "balance_loss_mlp": 1.02820301, + "epoch": 0.30969487449271005, + "flos": 22529313768960.0, + "grad_norm": 1.8768941622145223, + "language_loss": 0.78431082, + "learning_rate": 3.233974184780424e-06, + "loss": 0.80564725, + "num_input_tokens_seen": 110632045, + "step": 5151, + "time_per_iteration": 2.7539470195770264 + }, + { + "auxiliary_loss_clip": 0.01131879, + "auxiliary_loss_mlp": 0.01041443, + "balance_loss_clip": 1.04880977, + "balance_loss_mlp": 1.02362132, + "epoch": 0.309754997745378, + "flos": 15267925059840.0, + "grad_norm": 1.9606136965084777, + "language_loss": 0.67416716, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.69590038, + "num_input_tokens_seen": 110649340, + "step": 5152, + "time_per_iteration": 2.579238176345825 + }, + { + "auxiliary_loss_clip": 0.01080518, + "auxiliary_loss_mlp": 0.01045921, + "balance_loss_clip": 1.04402971, + "balance_loss_mlp": 1.02807546, + "epoch": 0.309815120998046, + "flos": 26979291947520.0, + "grad_norm": 5.6670540450328355, + "language_loss": 0.8251189, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.84638333, + "num_input_tokens_seen": 110668450, + "step": 5153, + "time_per_iteration": 2.792285203933716 + }, + { + "auxiliary_loss_clip": 0.01113849, + "auxiliary_loss_mlp": 0.00775793, + "balance_loss_clip": 1.04663801, + "balance_loss_mlp": 1.00127769, + "epoch": 0.30987524425071394, + "flos": 21143161969920.0, + "grad_norm": 1.937189485762574, + "language_loss": 0.73793215, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.75682855, + "num_input_tokens_seen": 110689410, + "step": 5154, + "time_per_iteration": 2.678454875946045 + }, + { + "auxiliary_loss_clip": 0.01132509, + "auxiliary_loss_mlp": 0.0103738, + "balance_loss_clip": 1.0507983, + "balance_loss_mlp": 1.02009416, + "epoch": 0.3099353675033819, + "flos": 15268284195840.0, + "grad_norm": 2.1601099672999586, + "language_loss": 0.76069349, + "learning_rate": 3.232747826832858e-06, + "loss": 0.78239238, + "num_input_tokens_seen": 110707350, + "step": 5155, + "time_per_iteration": 2.577634334564209 + }, + { + "auxiliary_loss_clip": 0.01131155, + "auxiliary_loss_mlp": 0.01040429, + "balance_loss_clip": 1.05483913, + "balance_loss_mlp": 1.02283418, + "epoch": 0.30999549075604993, + "flos": 15413794191360.0, + "grad_norm": 2.044896457109867, + "language_loss": 0.79096609, + "learning_rate": 3.232441120452094e-06, + "loss": 0.81268191, + "num_input_tokens_seen": 110724910, + "step": 5156, + "time_per_iteration": 2.628363609313965 + }, + { + "auxiliary_loss_clip": 0.01127429, + "auxiliary_loss_mlp": 0.01047381, + "balance_loss_clip": 1.04775023, + "balance_loss_mlp": 1.02779543, + "epoch": 0.3100556140087179, + "flos": 23184539712000.0, + "grad_norm": 2.468311845454126, + "language_loss": 0.74950963, + "learning_rate": 3.23213436733704e-06, + "loss": 0.77125776, + "num_input_tokens_seen": 110744010, + "step": 5157, + "time_per_iteration": 2.6231181621551514 + }, + { + "auxiliary_loss_clip": 0.01108321, + "auxiliary_loss_mlp": 0.01042715, + "balance_loss_clip": 1.04868615, + "balance_loss_mlp": 1.02634752, + "epoch": 0.31011573726138586, + "flos": 25742169676800.0, + "grad_norm": 1.6453166696914168, + "language_loss": 0.69648343, + "learning_rate": 3.231827567499327e-06, + "loss": 0.71799374, + "num_input_tokens_seen": 110765835, + "step": 5158, + "time_per_iteration": 2.734889030456543 + }, + { + "auxiliary_loss_clip": 0.01095116, + "auxiliary_loss_mlp": 0.01046106, + "balance_loss_clip": 1.04443944, + "balance_loss_mlp": 1.0301435, + "epoch": 0.3101758605140538, + "flos": 20011329440640.0, + "grad_norm": 1.9329481500014836, + "language_loss": 0.84861457, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.87002677, + "num_input_tokens_seen": 110784655, + "step": 5159, + "time_per_iteration": 2.665311813354492 + }, + { + "auxiliary_loss_clip": 0.01116498, + "auxiliary_loss_mlp": 0.01046065, + "balance_loss_clip": 1.04710639, + "balance_loss_mlp": 1.02877951, + "epoch": 0.3102359837667218, + "flos": 19135683688320.0, + "grad_norm": 1.9614748869944683, + "language_loss": 0.85129201, + "learning_rate": 3.231213827702462e-06, + "loss": 0.87291765, + "num_input_tokens_seen": 110802545, + "step": 5160, + "time_per_iteration": 2.597130298614502 + }, + { + "auxiliary_loss_clip": 0.01133056, + "auxiliary_loss_mlp": 0.01042602, + "balance_loss_clip": 1.0520395, + "balance_loss_mlp": 1.02582884, + "epoch": 0.31029610701938976, + "flos": 22265405568000.0, + "grad_norm": 1.9459577302566504, + "language_loss": 0.75555152, + "learning_rate": 3.230906887766584e-06, + "loss": 0.77730811, + "num_input_tokens_seen": 110820265, + "step": 5161, + "time_per_iteration": 2.583240032196045 + }, + { + "auxiliary_loss_clip": 0.0113313, + "auxiliary_loss_mlp": 0.01045414, + "balance_loss_clip": 1.05046988, + "balance_loss_mlp": 1.02797401, + "epoch": 0.3103562302720577, + "flos": 20805349536000.0, + "grad_norm": 1.9938857241338979, + "language_loss": 0.8156144, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.83739984, + "num_input_tokens_seen": 110836195, + "step": 5162, + "time_per_iteration": 2.495689630508423 + }, + { + "auxiliary_loss_clip": 0.01128762, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.04903293, + "balance_loss_mlp": 1.02450919, + "epoch": 0.3104163535247257, + "flos": 22344158136960.0, + "grad_norm": 1.777649785974679, + "language_loss": 0.82892883, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.85061604, + "num_input_tokens_seen": 110856420, + "step": 5163, + "time_per_iteration": 2.591036081314087 + }, + { + "auxiliary_loss_clip": 0.01147486, + "auxiliary_loss_mlp": 0.01044526, + "balance_loss_clip": 1.05307984, + "balance_loss_mlp": 1.0273242, + "epoch": 0.31047647677739365, + "flos": 21689363157120.0, + "grad_norm": 1.875247009463239, + "language_loss": 0.76131678, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.78323686, + "num_input_tokens_seen": 110876650, + "step": 5164, + "time_per_iteration": 2.5745677947998047 + }, + { + "auxiliary_loss_clip": 0.01103275, + "auxiliary_loss_mlp": 0.01046349, + "balance_loss_clip": 1.04969811, + "balance_loss_mlp": 1.02880108, + "epoch": 0.3105366000300616, + "flos": 18917275040640.0, + "grad_norm": 3.462886730904856, + "language_loss": 0.74514711, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.7666434, + "num_input_tokens_seen": 110894445, + "step": 5165, + "time_per_iteration": 2.724846124649048 + }, + { + "auxiliary_loss_clip": 0.01100578, + "auxiliary_loss_mlp": 0.01057021, + "balance_loss_clip": 1.04695523, + "balance_loss_mlp": 1.03841233, + "epoch": 0.3105967232827296, + "flos": 18260397072000.0, + "grad_norm": 1.6273273492295701, + "language_loss": 0.75827682, + "learning_rate": 3.229371488178348e-06, + "loss": 0.77985275, + "num_input_tokens_seen": 110912855, + "step": 5166, + "time_per_iteration": 2.7309961318969727 + }, + { + "auxiliary_loss_clip": 0.01121318, + "auxiliary_loss_mlp": 0.01043526, + "balance_loss_clip": 1.04969096, + "balance_loss_mlp": 1.02665818, + "epoch": 0.31065684653539755, + "flos": 17672144037120.0, + "grad_norm": 2.1635307284170833, + "language_loss": 0.73621917, + "learning_rate": 3.229064268360444e-06, + "loss": 0.75786763, + "num_input_tokens_seen": 110928025, + "step": 5167, + "time_per_iteration": 2.623375654220581 + }, + { + "auxiliary_loss_clip": 0.01007539, + "auxiliary_loss_mlp": 0.01008435, + "balance_loss_clip": 1.02476823, + "balance_loss_mlp": 1.0059557, + "epoch": 0.3107169697880655, + "flos": 68531996511360.0, + "grad_norm": 0.7113763854018822, + "language_loss": 0.53030008, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.55045986, + "num_input_tokens_seen": 110992215, + "step": 5168, + "time_per_iteration": 3.3115129470825195 + }, + { + "auxiliary_loss_clip": 0.01138497, + "auxiliary_loss_mlp": 0.01050074, + "balance_loss_clip": 1.05561399, + "balance_loss_mlp": 1.03151321, + "epoch": 0.3107770930407335, + "flos": 13188733274880.0, + "grad_norm": 3.621905149464154, + "language_loss": 0.79032969, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.81221539, + "num_input_tokens_seen": 111010400, + "step": 5169, + "time_per_iteration": 2.595463514328003 + }, + { + "auxiliary_loss_clip": 0.01121822, + "auxiliary_loss_mlp": 0.01047209, + "balance_loss_clip": 1.04804373, + "balance_loss_mlp": 1.02937579, + "epoch": 0.3108372162934015, + "flos": 31580849520000.0, + "grad_norm": 1.57130024638105, + "language_loss": 0.64071, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.66240036, + "num_input_tokens_seen": 111033960, + "step": 5170, + "time_per_iteration": 2.746469497680664 + }, + { + "auxiliary_loss_clip": 0.0110491, + "auxiliary_loss_mlp": 0.00776539, + "balance_loss_clip": 1.04874384, + "balance_loss_mlp": 1.00120461, + "epoch": 0.31089733954606946, + "flos": 28729829266560.0, + "grad_norm": 2.172069963879317, + "language_loss": 0.7723515, + "learning_rate": 3.22783492314295e-06, + "loss": 0.79116607, + "num_input_tokens_seen": 111053265, + "step": 5171, + "time_per_iteration": 2.776974678039551 + }, + { + "auxiliary_loss_clip": 0.01100832, + "auxiliary_loss_mlp": 0.01048172, + "balance_loss_clip": 1.049088, + "balance_loss_mlp": 1.03055298, + "epoch": 0.3109574627987374, + "flos": 19683249592320.0, + "grad_norm": 1.830523579545495, + "language_loss": 0.84020013, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.86169016, + "num_input_tokens_seen": 111071130, + "step": 5172, + "time_per_iteration": 2.718118906021118 + }, + { + "auxiliary_loss_clip": 0.01091688, + "auxiliary_loss_mlp": 0.01045541, + "balance_loss_clip": 1.04622412, + "balance_loss_mlp": 1.02706313, + "epoch": 0.3110175860514054, + "flos": 14683981656960.0, + "grad_norm": 1.9540355263753015, + "language_loss": 0.83730888, + "learning_rate": 3.227219971129842e-06, + "loss": 0.8586812, + "num_input_tokens_seen": 111089560, + "step": 5173, + "time_per_iteration": 2.735163927078247 + }, + { + "auxiliary_loss_clip": 0.01145239, + "auxiliary_loss_mlp": 0.01042621, + "balance_loss_clip": 1.05589437, + "balance_loss_mlp": 1.02656341, + "epoch": 0.31107770930407336, + "flos": 25739655724800.0, + "grad_norm": 3.2612368513370495, + "language_loss": 0.83354348, + "learning_rate": 3.226912425313001e-06, + "loss": 0.85542202, + "num_input_tokens_seen": 111109960, + "step": 5174, + "time_per_iteration": 2.65226411819458 + }, + { + "auxiliary_loss_clip": 0.01122854, + "auxiliary_loss_mlp": 0.01046101, + "balance_loss_clip": 1.05162597, + "balance_loss_mlp": 1.02928042, + "epoch": 0.3111378325567413, + "flos": 19208259118080.0, + "grad_norm": 1.9777752297496725, + "language_loss": 0.85181922, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.87350869, + "num_input_tokens_seen": 111127960, + "step": 5175, + "time_per_iteration": 2.6930692195892334 + }, + { + "auxiliary_loss_clip": 0.01087659, + "auxiliary_loss_mlp": 0.01044685, + "balance_loss_clip": 1.04638839, + "balance_loss_mlp": 1.02623129, + "epoch": 0.3111979558094093, + "flos": 23696374561920.0, + "grad_norm": 1.845729409399547, + "language_loss": 0.82990116, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.8512246, + "num_input_tokens_seen": 111146730, + "step": 5176, + "time_per_iteration": 2.7975289821624756 + }, + { + "auxiliary_loss_clip": 0.01126555, + "auxiliary_loss_mlp": 0.01042513, + "balance_loss_clip": 1.04662132, + "balance_loss_mlp": 1.02361798, + "epoch": 0.31125807906207725, + "flos": 21033023892480.0, + "grad_norm": 1.9258407965023028, + "language_loss": 0.8096348, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.83132547, + "num_input_tokens_seen": 111166295, + "step": 5177, + "time_per_iteration": 2.6275687217712402 + }, + { + "auxiliary_loss_clip": 0.01134117, + "auxiliary_loss_mlp": 0.0077682, + "balance_loss_clip": 1.05381465, + "balance_loss_mlp": 1.00119591, + "epoch": 0.3113182023147452, + "flos": 23076628277760.0, + "grad_norm": 1.6855068015846089, + "language_loss": 0.80707169, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.82618099, + "num_input_tokens_seen": 111185665, + "step": 5178, + "time_per_iteration": 4.142611742019653 + }, + { + "auxiliary_loss_clip": 0.01119942, + "auxiliary_loss_mlp": 0.01047667, + "balance_loss_clip": 1.05289316, + "balance_loss_mlp": 1.03076327, + "epoch": 0.3113783255674132, + "flos": 11838994888320.0, + "grad_norm": 2.5880769767242633, + "language_loss": 0.80990803, + "learning_rate": 3.225373998592471e-06, + "loss": 0.83158416, + "num_input_tokens_seen": 111201615, + "step": 5179, + "time_per_iteration": 2.6429331302642822 + }, + { + "auxiliary_loss_clip": 0.01112505, + "auxiliary_loss_mlp": 0.01048581, + "balance_loss_clip": 1.05353093, + "balance_loss_mlp": 1.03139079, + "epoch": 0.31143844882008115, + "flos": 16289547684480.0, + "grad_norm": 2.4201759029551813, + "language_loss": 0.78532577, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.80693662, + "num_input_tokens_seen": 111220515, + "step": 5180, + "time_per_iteration": 4.1918723583221436 + }, + { + "auxiliary_loss_clip": 0.01107686, + "auxiliary_loss_mlp": 0.01037212, + "balance_loss_clip": 1.05114985, + "balance_loss_mlp": 1.02011788, + "epoch": 0.3114985720727491, + "flos": 23217792727680.0, + "grad_norm": 1.6775849826612523, + "language_loss": 0.83088589, + "learning_rate": 3.22475830255844e-06, + "loss": 0.85233486, + "num_input_tokens_seen": 111240395, + "step": 5181, + "time_per_iteration": 2.760340929031372 + }, + { + "auxiliary_loss_clip": 0.01110614, + "auxiliary_loss_mlp": 0.01044232, + "balance_loss_clip": 1.04879427, + "balance_loss_mlp": 1.02881861, + "epoch": 0.3115586953254171, + "flos": 30044626698240.0, + "grad_norm": 1.766790552230027, + "language_loss": 0.74396992, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.76551843, + "num_input_tokens_seen": 111261100, + "step": 5182, + "time_per_iteration": 2.7501730918884277 + }, + { + "auxiliary_loss_clip": 0.01093489, + "auxiliary_loss_mlp": 0.00776946, + "balance_loss_clip": 1.04811049, + "balance_loss_mlp": 1.00152898, + "epoch": 0.3116188185780851, + "flos": 25666326109440.0, + "grad_norm": 2.03695228940596, + "language_loss": 0.70169222, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.72039658, + "num_input_tokens_seen": 111281320, + "step": 5183, + "time_per_iteration": 4.26041579246521 + }, + { + "auxiliary_loss_clip": 0.01017812, + "auxiliary_loss_mlp": 0.01006564, + "balance_loss_clip": 1.01984847, + "balance_loss_mlp": 1.00418019, + "epoch": 0.31167894183075306, + "flos": 69510058917120.0, + "grad_norm": 0.9394459872440335, + "language_loss": 0.59573013, + "learning_rate": 3.223834410214408e-06, + "loss": 0.61597383, + "num_input_tokens_seen": 111341405, + "step": 5184, + "time_per_iteration": 4.992337226867676 + }, + { + "auxiliary_loss_clip": 0.01115495, + "auxiliary_loss_mlp": 0.01050891, + "balance_loss_clip": 1.04588842, + "balance_loss_mlp": 1.03422523, + "epoch": 0.31173906508342103, + "flos": 14939845211520.0, + "grad_norm": 2.48453112640368, + "language_loss": 0.70156622, + "learning_rate": 3.223526353268311e-06, + "loss": 0.72323, + "num_input_tokens_seen": 111358975, + "step": 5185, + "time_per_iteration": 2.6406824588775635 + }, + { + "auxiliary_loss_clip": 0.01122412, + "auxiliary_loss_mlp": 0.01051261, + "balance_loss_clip": 1.05447555, + "balance_loss_mlp": 1.03405905, + "epoch": 0.311799188336089, + "flos": 16176033728640.0, + "grad_norm": 2.8983279272522853, + "language_loss": 0.63588691, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.65762365, + "num_input_tokens_seen": 111375845, + "step": 5186, + "time_per_iteration": 2.683971881866455 + }, + { + "auxiliary_loss_clip": 0.01126858, + "auxiliary_loss_mlp": 0.01049881, + "balance_loss_clip": 1.05240881, + "balance_loss_mlp": 1.03145099, + "epoch": 0.31185931158875696, + "flos": 25009627708800.0, + "grad_norm": 2.2127415604209335, + "language_loss": 0.86427295, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.88604033, + "num_input_tokens_seen": 111394150, + "step": 5187, + "time_per_iteration": 2.6983299255371094 + }, + { + "auxiliary_loss_clip": 0.01146114, + "auxiliary_loss_mlp": 0.0077496, + "balance_loss_clip": 1.05417776, + "balance_loss_mlp": 1.00131774, + "epoch": 0.3119194348414249, + "flos": 37232901273600.0, + "grad_norm": 1.653121843679143, + "language_loss": 0.63481069, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.6540215, + "num_input_tokens_seen": 111418355, + "step": 5188, + "time_per_iteration": 2.6974728107452393 + }, + { + "auxiliary_loss_clip": 0.01106256, + "auxiliary_loss_mlp": 0.01044626, + "balance_loss_clip": 1.05064225, + "balance_loss_mlp": 1.02799582, + "epoch": 0.3119795580940929, + "flos": 15012779777280.0, + "grad_norm": 2.578497111530561, + "language_loss": 0.83241487, + "learning_rate": 3.222293661638346e-06, + "loss": 0.85392368, + "num_input_tokens_seen": 111435445, + "step": 5189, + "time_per_iteration": 2.6956889629364014 + }, + { + "auxiliary_loss_clip": 0.01031008, + "auxiliary_loss_mlp": 0.01045956, + "balance_loss_clip": 1.03804195, + "balance_loss_mlp": 1.02812243, + "epoch": 0.31203968134676086, + "flos": 15998168557440.0, + "grad_norm": 1.8156368008577992, + "language_loss": 0.79266763, + "learning_rate": 3.22198537282789e-06, + "loss": 0.81343722, + "num_input_tokens_seen": 111453430, + "step": 5190, + "time_per_iteration": 3.0180671215057373 + }, + { + "auxiliary_loss_clip": 0.01086186, + "auxiliary_loss_mlp": 0.01053443, + "balance_loss_clip": 1.04333639, + "balance_loss_mlp": 1.03413141, + "epoch": 0.3120998045994288, + "flos": 23837359443840.0, + "grad_norm": 1.571307617405072, + "language_loss": 0.75174087, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.77313721, + "num_input_tokens_seen": 111475325, + "step": 5191, + "time_per_iteration": 3.0170204639434814 + }, + { + "auxiliary_loss_clip": 0.01043661, + "auxiliary_loss_mlp": 0.00755081, + "balance_loss_clip": 1.02154636, + "balance_loss_mlp": 1.00261629, + "epoch": 0.3121599278520968, + "flos": 69184205712000.0, + "grad_norm": 0.8534965117798614, + "language_loss": 0.63942307, + "learning_rate": 3.221368656205247e-06, + "loss": 0.6574105, + "num_input_tokens_seen": 111533960, + "step": 5192, + "time_per_iteration": 3.288938045501709 + }, + { + "auxiliary_loss_clip": 0.01133662, + "auxiliary_loss_mlp": 0.01043466, + "balance_loss_clip": 1.05246997, + "balance_loss_mlp": 1.02569187, + "epoch": 0.31222005110476475, + "flos": 23806368984960.0, + "grad_norm": 1.9226654053779162, + "language_loss": 0.7976644, + "learning_rate": 3.221060228416446e-06, + "loss": 0.81943566, + "num_input_tokens_seen": 111554055, + "step": 5193, + "time_per_iteration": 2.758859157562256 + }, + { + "auxiliary_loss_clip": 0.01117628, + "auxiliary_loss_mlp": 0.01054751, + "balance_loss_clip": 1.04916263, + "balance_loss_mlp": 1.03508139, + "epoch": 0.3122801743574327, + "flos": 25226132935680.0, + "grad_norm": 2.5170295869133024, + "language_loss": 0.72488689, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.74661064, + "num_input_tokens_seen": 111574305, + "step": 5194, + "time_per_iteration": 2.69765567779541 + }, + { + "auxiliary_loss_clip": 0.01144699, + "auxiliary_loss_mlp": 0.01044476, + "balance_loss_clip": 1.05394197, + "balance_loss_mlp": 1.02819204, + "epoch": 0.3123402976101007, + "flos": 22966490200320.0, + "grad_norm": 1.775027795968239, + "language_loss": 0.76423192, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.78612363, + "num_input_tokens_seen": 111595680, + "step": 5195, + "time_per_iteration": 2.665656566619873 + }, + { + "auxiliary_loss_clip": 0.01144607, + "auxiliary_loss_mlp": 0.01042079, + "balance_loss_clip": 1.05148935, + "balance_loss_mlp": 1.02544916, + "epoch": 0.3124004208627687, + "flos": 25192089820800.0, + "grad_norm": 1.4414001308378115, + "language_loss": 0.78089559, + "learning_rate": 3.220134667280476e-06, + "loss": 0.80276251, + "num_input_tokens_seen": 111618135, + "step": 5196, + "time_per_iteration": 2.682476282119751 + }, + { + "auxiliary_loss_clip": 0.01032618, + "auxiliary_loss_mlp": 0.00755246, + "balance_loss_clip": 1.02237272, + "balance_loss_mlp": 1.00273037, + "epoch": 0.31246054411543667, + "flos": 67485165517440.0, + "grad_norm": 0.794984063014186, + "language_loss": 0.54770386, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.56558245, + "num_input_tokens_seen": 111682220, + "step": 5197, + "time_per_iteration": 3.24509334564209 + }, + { + "auxiliary_loss_clip": 0.01144094, + "auxiliary_loss_mlp": 0.01042495, + "balance_loss_clip": 1.0547365, + "balance_loss_mlp": 1.02586555, + "epoch": 0.31252066736810463, + "flos": 17858520731520.0, + "grad_norm": 1.8260094290654212, + "language_loss": 0.66137004, + "learning_rate": 3.21951739516552e-06, + "loss": 0.68323588, + "num_input_tokens_seen": 111700815, + "step": 5198, + "time_per_iteration": 2.5970942974090576 + }, + { + "auxiliary_loss_clip": 0.01102297, + "auxiliary_loss_mlp": 0.01047482, + "balance_loss_clip": 1.0459094, + "balance_loss_mlp": 1.02898037, + "epoch": 0.3125807906207726, + "flos": 18475034791680.0, + "grad_norm": 2.530729988117139, + "language_loss": 0.6949119, + "learning_rate": 3.219208689735857e-06, + "loss": 0.71640968, + "num_input_tokens_seen": 111718195, + "step": 5199, + "time_per_iteration": 2.6682288646698 + }, + { + "auxiliary_loss_clip": 0.01132634, + "auxiliary_loss_mlp": 0.01050152, + "balance_loss_clip": 1.04906189, + "balance_loss_mlp": 1.03258061, + "epoch": 0.31264091387344056, + "flos": 18946541646720.0, + "grad_norm": 1.8087592578592666, + "language_loss": 0.78480452, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.8066324, + "num_input_tokens_seen": 111734440, + "step": 5200, + "time_per_iteration": 2.6664814949035645 + }, + { + "auxiliary_loss_clip": 0.01132139, + "auxiliary_loss_mlp": 0.01037041, + "balance_loss_clip": 1.05233109, + "balance_loss_mlp": 1.02036345, + "epoch": 0.3127010371261085, + "flos": 21468512384640.0, + "grad_norm": 2.0480479984687214, + "language_loss": 0.83231741, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.85400921, + "num_input_tokens_seen": 111751960, + "step": 5201, + "time_per_iteration": 2.674558401107788 + }, + { + "auxiliary_loss_clip": 0.01144703, + "auxiliary_loss_mlp": 0.01045083, + "balance_loss_clip": 1.05244124, + "balance_loss_mlp": 1.02697527, + "epoch": 0.3127611603787765, + "flos": 15336047203200.0, + "grad_norm": 3.6217323271444037, + "language_loss": 0.6910159, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.71291375, + "num_input_tokens_seen": 111769585, + "step": 5202, + "time_per_iteration": 2.563164710998535 + }, + { + "auxiliary_loss_clip": 0.01146715, + "auxiliary_loss_mlp": 0.01041598, + "balance_loss_clip": 1.05293012, + "balance_loss_mlp": 1.02608871, + "epoch": 0.31282128363144446, + "flos": 17602980399360.0, + "grad_norm": 1.898082303559049, + "language_loss": 0.84124672, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.86312985, + "num_input_tokens_seen": 111787880, + "step": 5203, + "time_per_iteration": 2.6024506092071533 + }, + { + "auxiliary_loss_clip": 0.01086755, + "auxiliary_loss_mlp": 0.01049344, + "balance_loss_clip": 1.04461396, + "balance_loss_mlp": 1.03139079, + "epoch": 0.3128814068841124, + "flos": 26756753235840.0, + "grad_norm": 2.246749233698224, + "language_loss": 0.61165982, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.63302082, + "num_input_tokens_seen": 111805950, + "step": 5204, + "time_per_iteration": 2.748486042022705 + }, + { + "auxiliary_loss_clip": 0.01105223, + "auxiliary_loss_mlp": 0.01043537, + "balance_loss_clip": 1.04439998, + "balance_loss_mlp": 1.02722907, + "epoch": 0.3129415301367804, + "flos": 22272372806400.0, + "grad_norm": 1.6432390116063589, + "language_loss": 0.65875763, + "learning_rate": 3.217355486684887e-06, + "loss": 0.68024528, + "num_input_tokens_seen": 111826135, + "step": 5205, + "time_per_iteration": 2.717499256134033 + }, + { + "auxiliary_loss_clip": 0.01134026, + "auxiliary_loss_mlp": 0.01046734, + "balance_loss_clip": 1.05126929, + "balance_loss_mlp": 1.02849531, + "epoch": 0.31300165338944835, + "flos": 26464907232000.0, + "grad_norm": 1.6106510494401134, + "language_loss": 0.76811433, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.78992188, + "num_input_tokens_seen": 111844700, + "step": 5206, + "time_per_iteration": 2.642439603805542 + }, + { + "auxiliary_loss_clip": 0.01140688, + "auxiliary_loss_mlp": 0.01041131, + "balance_loss_clip": 1.04956853, + "balance_loss_mlp": 1.02448893, + "epoch": 0.3130617766421163, + "flos": 21944652094080.0, + "grad_norm": 2.214530025407602, + "language_loss": 0.83204615, + "learning_rate": 3.216737382911672e-06, + "loss": 0.85386431, + "num_input_tokens_seen": 111861585, + "step": 5207, + "time_per_iteration": 2.616652727127075 + }, + { + "auxiliary_loss_clip": 0.01127002, + "auxiliary_loss_mlp": 0.0104831, + "balance_loss_clip": 1.0502398, + "balance_loss_mlp": 1.0328126, + "epoch": 0.3131218998947843, + "flos": 23292774368640.0, + "grad_norm": 1.5207985149404841, + "language_loss": 0.71359724, + "learning_rate": 3.216428261810999e-06, + "loss": 0.73535037, + "num_input_tokens_seen": 111882950, + "step": 5208, + "time_per_iteration": 2.674813747406006 + }, + { + "auxiliary_loss_clip": 0.01120564, + "auxiliary_loss_mlp": 0.01045064, + "balance_loss_clip": 1.04862344, + "balance_loss_mlp": 1.02827978, + "epoch": 0.3131820231474523, + "flos": 21139642437120.0, + "grad_norm": 1.848256205390157, + "language_loss": 0.74558908, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.76724535, + "num_input_tokens_seen": 111901640, + "step": 5209, + "time_per_iteration": 2.7193644046783447 + }, + { + "auxiliary_loss_clip": 0.01140035, + "auxiliary_loss_mlp": 0.01045727, + "balance_loss_clip": 1.04733396, + "balance_loss_mlp": 1.02937174, + "epoch": 0.31324214640012027, + "flos": 23909863046400.0, + "grad_norm": 2.0633998475681135, + "language_loss": 0.77254915, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.79440677, + "num_input_tokens_seen": 111919615, + "step": 5210, + "time_per_iteration": 2.6212270259857178 + }, + { + "auxiliary_loss_clip": 0.01125553, + "auxiliary_loss_mlp": 0.01039925, + "balance_loss_clip": 1.047261, + "balance_loss_mlp": 1.02385592, + "epoch": 0.31330226965278823, + "flos": 22236929061120.0, + "grad_norm": 1.9577389211395706, + "language_loss": 0.79128736, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.81294215, + "num_input_tokens_seen": 111938485, + "step": 5211, + "time_per_iteration": 2.6618316173553467 + }, + { + "auxiliary_loss_clip": 0.01132257, + "auxiliary_loss_mlp": 0.01042587, + "balance_loss_clip": 1.05107522, + "balance_loss_mlp": 1.02768588, + "epoch": 0.3133623929054562, + "flos": 19753993428480.0, + "grad_norm": 2.4581961413264195, + "language_loss": 0.79612064, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.81786901, + "num_input_tokens_seen": 111956425, + "step": 5212, + "time_per_iteration": 2.81793475151062 + }, + { + "auxiliary_loss_clip": 0.01125931, + "auxiliary_loss_mlp": 0.01053393, + "balance_loss_clip": 1.05156052, + "balance_loss_mlp": 1.03576159, + "epoch": 0.31342251615812416, + "flos": 27162256849920.0, + "grad_norm": 2.69561664367352, + "language_loss": 0.71024299, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.73203623, + "num_input_tokens_seen": 111975915, + "step": 5213, + "time_per_iteration": 2.6739485263824463 + }, + { + "auxiliary_loss_clip": 0.01132672, + "auxiliary_loss_mlp": 0.01045903, + "balance_loss_clip": 1.05284989, + "balance_loss_mlp": 1.02961898, + "epoch": 0.31348263941079213, + "flos": 20229809915520.0, + "grad_norm": 1.9828215257111186, + "language_loss": 0.77684069, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.79862642, + "num_input_tokens_seen": 111995055, + "step": 5214, + "time_per_iteration": 2.6108171939849854 + }, + { + "auxiliary_loss_clip": 0.01099316, + "auxiliary_loss_mlp": 0.01038553, + "balance_loss_clip": 1.0522778, + "balance_loss_mlp": 1.02317524, + "epoch": 0.3135427626634601, + "flos": 24607643627520.0, + "grad_norm": 2.2634840816113075, + "language_loss": 0.8300609, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.8514396, + "num_input_tokens_seen": 112015830, + "step": 5215, + "time_per_iteration": 2.77897047996521 + }, + { + "auxiliary_loss_clip": 0.01131919, + "auxiliary_loss_mlp": 0.01040929, + "balance_loss_clip": 1.05089617, + "balance_loss_mlp": 1.02375078, + "epoch": 0.31360288591612806, + "flos": 20959873845120.0, + "grad_norm": 2.280765330466862, + "language_loss": 0.79540187, + "learning_rate": 3.213953633415686e-06, + "loss": 0.81713033, + "num_input_tokens_seen": 112035065, + "step": 5216, + "time_per_iteration": 2.675492763519287 + }, + { + "auxiliary_loss_clip": 0.01119434, + "auxiliary_loss_mlp": 0.01049814, + "balance_loss_clip": 1.04817545, + "balance_loss_mlp": 1.03174222, + "epoch": 0.313663009168796, + "flos": 26980513009920.0, + "grad_norm": 1.97082305961493, + "language_loss": 0.69007474, + "learning_rate": 3.213644097593477e-06, + "loss": 0.7117672, + "num_input_tokens_seen": 112058405, + "step": 5217, + "time_per_iteration": 2.7360196113586426 + }, + { + "auxiliary_loss_clip": 0.01121348, + "auxiliary_loss_mlp": 0.01038659, + "balance_loss_clip": 1.04833519, + "balance_loss_mlp": 1.02275062, + "epoch": 0.313723132421464, + "flos": 18040911016320.0, + "grad_norm": 1.7253432561329243, + "language_loss": 0.81228399, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.83388406, + "num_input_tokens_seen": 112076420, + "step": 5218, + "time_per_iteration": 4.393778562545776 + }, + { + "auxiliary_loss_clip": 0.01139073, + "auxiliary_loss_mlp": 0.01041023, + "balance_loss_clip": 1.04819143, + "balance_loss_mlp": 1.02422082, + "epoch": 0.31378325567413196, + "flos": 22488913946880.0, + "grad_norm": 2.6452768271158167, + "language_loss": 0.69128895, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.71308994, + "num_input_tokens_seen": 112090775, + "step": 5219, + "time_per_iteration": 4.162578344345093 + }, + { + "auxiliary_loss_clip": 0.01117748, + "auxiliary_loss_mlp": 0.01044298, + "balance_loss_clip": 1.04879618, + "balance_loss_mlp": 1.0287652, + "epoch": 0.3138433789267999, + "flos": 22419247518720.0, + "grad_norm": 5.057996341652072, + "language_loss": 0.80019122, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.82181168, + "num_input_tokens_seen": 112110980, + "step": 5220, + "time_per_iteration": 2.693300247192383 + }, + { + "auxiliary_loss_clip": 0.01133002, + "auxiliary_loss_mlp": 0.01038024, + "balance_loss_clip": 1.05214572, + "balance_loss_mlp": 1.0220139, + "epoch": 0.3139035021794679, + "flos": 13005912026880.0, + "grad_norm": 1.7918234828134079, + "language_loss": 0.72575235, + "learning_rate": 3.212405494206986e-06, + "loss": 0.74746263, + "num_input_tokens_seen": 112129020, + "step": 5221, + "time_per_iteration": 2.6918861865997314 + }, + { + "auxiliary_loss_clip": 0.01105754, + "auxiliary_loss_mlp": 0.0104005, + "balance_loss_clip": 1.04538214, + "balance_loss_mlp": 1.02435017, + "epoch": 0.31396362543213585, + "flos": 16945994689920.0, + "grad_norm": 1.7850671432610508, + "language_loss": 0.82097268, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.84243071, + "num_input_tokens_seen": 112147865, + "step": 5222, + "time_per_iteration": 4.193262100219727 + }, + { + "auxiliary_loss_clip": 0.01136096, + "auxiliary_loss_mlp": 0.01044943, + "balance_loss_clip": 1.05302894, + "balance_loss_mlp": 1.02764595, + "epoch": 0.31402374868480387, + "flos": 20156731695360.0, + "grad_norm": 2.3946225731958073, + "language_loss": 0.70159894, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.7234093, + "num_input_tokens_seen": 112166745, + "step": 5223, + "time_per_iteration": 2.642608642578125 + }, + { + "auxiliary_loss_clip": 0.01120375, + "auxiliary_loss_mlp": 0.00773089, + "balance_loss_clip": 1.04545665, + "balance_loss_mlp": 1.0012387, + "epoch": 0.31408387193747184, + "flos": 21251073404160.0, + "grad_norm": 1.5662600408509175, + "language_loss": 0.80818307, + "learning_rate": 3.211476058893379e-06, + "loss": 0.82711768, + "num_input_tokens_seen": 112185895, + "step": 5224, + "time_per_iteration": 4.334134101867676 + }, + { + "auxiliary_loss_clip": 0.0113849, + "auxiliary_loss_mlp": 0.01044903, + "balance_loss_clip": 1.05376673, + "balance_loss_mlp": 1.02807033, + "epoch": 0.3141439951901398, + "flos": 27484267299840.0, + "grad_norm": 2.581635190586104, + "language_loss": 0.57647121, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.59830517, + "num_input_tokens_seen": 112204465, + "step": 5225, + "time_per_iteration": 2.680227041244507 + }, + { + "auxiliary_loss_clip": 0.01086502, + "auxiliary_loss_mlp": 0.01032759, + "balance_loss_clip": 1.04252625, + "balance_loss_mlp": 1.0179472, + "epoch": 0.31420411844280777, + "flos": 17852235851520.0, + "grad_norm": 2.0500851879408577, + "language_loss": 0.81726074, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.83845341, + "num_input_tokens_seen": 112221635, + "step": 5226, + "time_per_iteration": 2.8080878257751465 + }, + { + "auxiliary_loss_clip": 0.01123539, + "auxiliary_loss_mlp": 0.01053238, + "balance_loss_clip": 1.04718053, + "balance_loss_mlp": 1.03557122, + "epoch": 0.31426424169547573, + "flos": 21616967295360.0, + "grad_norm": 1.8156350578732643, + "language_loss": 0.7435357, + "learning_rate": 3.210546210126141e-06, + "loss": 0.76530349, + "num_input_tokens_seen": 112241240, + "step": 5227, + "time_per_iteration": 2.6420040130615234 + }, + { + "auxiliary_loss_clip": 0.01128154, + "auxiliary_loss_mlp": 0.01036288, + "balance_loss_clip": 1.05315053, + "balance_loss_mlp": 1.01981306, + "epoch": 0.3143243649481437, + "flos": 30920631586560.0, + "grad_norm": 1.9798889840887306, + "language_loss": 0.6779027, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.69954711, + "num_input_tokens_seen": 112262350, + "step": 5228, + "time_per_iteration": 2.6904454231262207 + }, + { + "auxiliary_loss_clip": 0.01116854, + "auxiliary_loss_mlp": 0.01042698, + "balance_loss_clip": 1.04812217, + "balance_loss_mlp": 1.02755868, + "epoch": 0.31438448820081166, + "flos": 22821411168000.0, + "grad_norm": 2.2592581290101648, + "language_loss": 0.802086, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.82368147, + "num_input_tokens_seen": 112283710, + "step": 5229, + "time_per_iteration": 2.720972776412964 + }, + { + "auxiliary_loss_clip": 0.01116185, + "auxiliary_loss_mlp": 0.01034979, + "balance_loss_clip": 1.04888391, + "balance_loss_mlp": 1.01917148, + "epoch": 0.3144446114534796, + "flos": 23292127923840.0, + "grad_norm": 2.206396959728329, + "language_loss": 0.69972271, + "learning_rate": 3.209615948222611e-06, + "loss": 0.72123438, + "num_input_tokens_seen": 112304285, + "step": 5230, + "time_per_iteration": 2.69555401802063 + }, + { + "auxiliary_loss_clip": 0.01094216, + "auxiliary_loss_mlp": 0.01051308, + "balance_loss_clip": 1.042889, + "balance_loss_mlp": 1.03331971, + "epoch": 0.3145047347061476, + "flos": 31355976424320.0, + "grad_norm": 11.083232715551919, + "language_loss": 0.79441226, + "learning_rate": 3.209305769168239e-06, + "loss": 0.81586754, + "num_input_tokens_seen": 112325110, + "step": 5231, + "time_per_iteration": 2.742414712905884 + }, + { + "auxiliary_loss_clip": 0.01111136, + "auxiliary_loss_mlp": 0.01044032, + "balance_loss_clip": 1.05004621, + "balance_loss_mlp": 1.02751017, + "epoch": 0.31456485795881556, + "flos": 10889552643840.0, + "grad_norm": 68.21693219117104, + "language_loss": 0.84846044, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.87001216, + "num_input_tokens_seen": 112339855, + "step": 5232, + "time_per_iteration": 2.681541919708252 + }, + { + "auxiliary_loss_clip": 0.01082351, + "auxiliary_loss_mlp": 0.01063678, + "balance_loss_clip": 1.04169703, + "balance_loss_mlp": 1.04589176, + "epoch": 0.3146249812114835, + "flos": 17092438439040.0, + "grad_norm": 1.732593505271442, + "language_loss": 0.79899549, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.82045579, + "num_input_tokens_seen": 112358480, + "step": 5233, + "time_per_iteration": 2.7261524200439453 + }, + { + "auxiliary_loss_clip": 0.01095476, + "auxiliary_loss_mlp": 0.01043701, + "balance_loss_clip": 1.04795146, + "balance_loss_mlp": 1.02775121, + "epoch": 0.3146851044641515, + "flos": 55291442889600.0, + "grad_norm": 1.8884411146751285, + "language_loss": 0.71124369, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.73263544, + "num_input_tokens_seen": 112382350, + "step": 5234, + "time_per_iteration": 3.0071427822113037 + }, + { + "auxiliary_loss_clip": 0.01105209, + "auxiliary_loss_mlp": 0.01036666, + "balance_loss_clip": 1.05008078, + "balance_loss_mlp": 1.02060878, + "epoch": 0.31474522771681945, + "flos": 27015884928000.0, + "grad_norm": 2.1537517260325396, + "language_loss": 0.72106552, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.74248433, + "num_input_tokens_seen": 112400260, + "step": 5235, + "time_per_iteration": 2.7347464561462402 + }, + { + "auxiliary_loss_clip": 0.011281, + "auxiliary_loss_mlp": 0.0103842, + "balance_loss_clip": 1.0479089, + "balance_loss_mlp": 1.0225656, + "epoch": 0.3148053509694875, + "flos": 21251935330560.0, + "grad_norm": 2.047935998004664, + "language_loss": 0.78640145, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.80806667, + "num_input_tokens_seen": 112419400, + "step": 5236, + "time_per_iteration": 2.6480181217193604 + }, + { + "auxiliary_loss_clip": 0.01142531, + "auxiliary_loss_mlp": 0.0104222, + "balance_loss_clip": 1.04929006, + "balance_loss_mlp": 1.02536416, + "epoch": 0.31486547422215544, + "flos": 31248675521280.0, + "grad_norm": 1.8469097199945863, + "language_loss": 0.75903904, + "learning_rate": 3.207443732256881e-06, + "loss": 0.78088653, + "num_input_tokens_seen": 112440825, + "step": 5237, + "time_per_iteration": 2.7113847732543945 + }, + { + "auxiliary_loss_clip": 0.01133953, + "auxiliary_loss_mlp": 0.01035749, + "balance_loss_clip": 1.04817045, + "balance_loss_mlp": 1.02128255, + "epoch": 0.3149255974748234, + "flos": 19828615933440.0, + "grad_norm": 2.176202072112168, + "language_loss": 0.79725033, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.81894737, + "num_input_tokens_seen": 112459180, + "step": 5238, + "time_per_iteration": 2.649968147277832 + }, + { + "auxiliary_loss_clip": 0.01046118, + "auxiliary_loss_mlp": 0.01018852, + "balance_loss_clip": 1.02561212, + "balance_loss_mlp": 1.01676548, + "epoch": 0.31498572072749137, + "flos": 67683965339520.0, + "grad_norm": 0.8324046464960934, + "language_loss": 0.67913729, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.69978696, + "num_input_tokens_seen": 112516680, + "step": 5239, + "time_per_iteration": 3.130643606185913 + }, + { + "auxiliary_loss_clip": 0.01121581, + "auxiliary_loss_mlp": 0.01043617, + "balance_loss_clip": 1.04828835, + "balance_loss_mlp": 1.02528274, + "epoch": 0.31504584398015933, + "flos": 19793136274560.0, + "grad_norm": 2.4702861290170235, + "language_loss": 0.82906926, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.85072124, + "num_input_tokens_seen": 112535895, + "step": 5240, + "time_per_iteration": 2.6314027309417725 + }, + { + "auxiliary_loss_clip": 0.0111196, + "auxiliary_loss_mlp": 0.0077379, + "balance_loss_clip": 1.04708409, + "balance_loss_mlp": 1.00132334, + "epoch": 0.3151059672328273, + "flos": 26615409217920.0, + "grad_norm": 1.6854261536361361, + "language_loss": 0.81405544, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.83291298, + "num_input_tokens_seen": 112557490, + "step": 5241, + "time_per_iteration": 2.7245657444000244 + }, + { + "auxiliary_loss_clip": 0.01138561, + "auxiliary_loss_mlp": 0.01038584, + "balance_loss_clip": 1.05094576, + "balance_loss_mlp": 1.0230633, + "epoch": 0.31516609048549526, + "flos": 24204438483840.0, + "grad_norm": 1.7554610875937957, + "language_loss": 0.74513441, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.7669059, + "num_input_tokens_seen": 112577075, + "step": 5242, + "time_per_iteration": 2.5925803184509277 + }, + { + "auxiliary_loss_clip": 0.01106752, + "auxiliary_loss_mlp": 0.01039069, + "balance_loss_clip": 1.04686832, + "balance_loss_mlp": 1.02230775, + "epoch": 0.31522621373816323, + "flos": 25958710817280.0, + "grad_norm": 12.905078117761404, + "language_loss": 0.73457384, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.75603199, + "num_input_tokens_seen": 112597620, + "step": 5243, + "time_per_iteration": 2.721261739730835 + }, + { + "auxiliary_loss_clip": 0.01126602, + "auxiliary_loss_mlp": 0.01041378, + "balance_loss_clip": 1.04783881, + "balance_loss_mlp": 1.02524936, + "epoch": 0.3152863369908312, + "flos": 21908813299200.0, + "grad_norm": 2.079273463581607, + "language_loss": 0.6462577, + "learning_rate": 3.205269272758513e-06, + "loss": 0.66793752, + "num_input_tokens_seen": 112617150, + "step": 5244, + "time_per_iteration": 2.6753153800964355 + }, + { + "auxiliary_loss_clip": 0.01087107, + "auxiliary_loss_mlp": 0.01037472, + "balance_loss_clip": 1.04454994, + "balance_loss_mlp": 1.02158141, + "epoch": 0.31534646024349916, + "flos": 16281072074880.0, + "grad_norm": 2.126512737541558, + "language_loss": 0.91117549, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.93242127, + "num_input_tokens_seen": 112631090, + "step": 5245, + "time_per_iteration": 2.717316150665283 + }, + { + "auxiliary_loss_clip": 0.01129236, + "auxiliary_loss_mlp": 0.01046116, + "balance_loss_clip": 1.04892504, + "balance_loss_mlp": 1.02911687, + "epoch": 0.3154065834961671, + "flos": 24717243000960.0, + "grad_norm": 2.0341104694483296, + "language_loss": 0.75199413, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.77374756, + "num_input_tokens_seen": 112651220, + "step": 5246, + "time_per_iteration": 2.738969564437866 + }, + { + "auxiliary_loss_clip": 0.01139621, + "auxiliary_loss_mlp": 0.01044826, + "balance_loss_clip": 1.04860735, + "balance_loss_mlp": 1.027946, + "epoch": 0.3154667067488351, + "flos": 35371148469120.0, + "grad_norm": 1.7161631839732394, + "language_loss": 0.61524433, + "learning_rate": 3.204336675750321e-06, + "loss": 0.63708878, + "num_input_tokens_seen": 112671560, + "step": 5247, + "time_per_iteration": 2.714258909225464 + }, + { + "auxiliary_loss_clip": 0.01129569, + "auxiliary_loss_mlp": 0.0104508, + "balance_loss_clip": 1.04842138, + "balance_loss_mlp": 1.0283072, + "epoch": 0.31552683000150306, + "flos": 17456464823040.0, + "grad_norm": 2.438581052681848, + "language_loss": 0.82096362, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.84271014, + "num_input_tokens_seen": 112689790, + "step": 5248, + "time_per_iteration": 2.6235198974609375 + }, + { + "auxiliary_loss_clip": 0.01121718, + "auxiliary_loss_mlp": 0.01047358, + "balance_loss_clip": 1.04964209, + "balance_loss_mlp": 1.0292145, + "epoch": 0.3155869532541711, + "flos": 18405763413120.0, + "grad_norm": 5.654706808285272, + "language_loss": 0.84601712, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.86770785, + "num_input_tokens_seen": 112708265, + "step": 5249, + "time_per_iteration": 2.664454698562622 + }, + { + "auxiliary_loss_clip": 0.01105599, + "auxiliary_loss_mlp": 0.01040266, + "balance_loss_clip": 1.04724038, + "balance_loss_mlp": 1.02252758, + "epoch": 0.31564707650683904, + "flos": 21579763783680.0, + "grad_norm": 2.1333510394712034, + "language_loss": 0.85412121, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.87557989, + "num_input_tokens_seen": 112727820, + "step": 5250, + "time_per_iteration": 2.7892768383026123 + }, + { + "auxiliary_loss_clip": 0.01110748, + "auxiliary_loss_mlp": 0.01044305, + "balance_loss_clip": 1.04626083, + "balance_loss_mlp": 1.02721059, + "epoch": 0.315707199759507, + "flos": 21030976817280.0, + "grad_norm": 3.250818956981283, + "language_loss": 0.68651402, + "learning_rate": 3.203092573767835e-06, + "loss": 0.70806456, + "num_input_tokens_seen": 112743140, + "step": 5251, + "time_per_iteration": 2.660738468170166 + }, + { + "auxiliary_loss_clip": 0.01141131, + "auxiliary_loss_mlp": 0.01040852, + "balance_loss_clip": 1.05063367, + "balance_loss_mlp": 1.02374566, + "epoch": 0.31576732301217497, + "flos": 26828861788800.0, + "grad_norm": 1.6959923935223091, + "language_loss": 0.79367268, + "learning_rate": 3.202781434189246e-06, + "loss": 0.81549257, + "num_input_tokens_seen": 112764705, + "step": 5252, + "time_per_iteration": 2.6600146293640137 + }, + { + "auxiliary_loss_clip": 0.01123952, + "auxiliary_loss_mlp": 0.01055554, + "balance_loss_clip": 1.04919744, + "balance_loss_mlp": 1.03742182, + "epoch": 0.31582744626484294, + "flos": 22711165349760.0, + "grad_norm": 1.5850214403847396, + "language_loss": 0.74167955, + "learning_rate": 3.202470249001066e-06, + "loss": 0.76347458, + "num_input_tokens_seen": 112785310, + "step": 5253, + "time_per_iteration": 2.6831557750701904 + }, + { + "auxiliary_loss_clip": 0.01117625, + "auxiliary_loss_mlp": 0.01042879, + "balance_loss_clip": 1.04685211, + "balance_loss_mlp": 1.02571261, + "epoch": 0.3158875695175109, + "flos": 23951914894080.0, + "grad_norm": 1.8578399335985847, + "language_loss": 0.73295557, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.75456059, + "num_input_tokens_seen": 112802905, + "step": 5254, + "time_per_iteration": 2.664445161819458 + }, + { + "auxiliary_loss_clip": 0.0112999, + "auxiliary_loss_mlp": 0.0104166, + "balance_loss_clip": 1.04998255, + "balance_loss_mlp": 1.02442837, + "epoch": 0.31594769277017887, + "flos": 13261883322240.0, + "grad_norm": 1.9116991379626416, + "language_loss": 0.77497417, + "learning_rate": 3.201847741843128e-06, + "loss": 0.7966907, + "num_input_tokens_seen": 112820305, + "step": 5255, + "time_per_iteration": 2.5817084312438965 + }, + { + "auxiliary_loss_clip": 0.01116092, + "auxiliary_loss_mlp": 0.01045862, + "balance_loss_clip": 1.0481391, + "balance_loss_mlp": 1.02718151, + "epoch": 0.31600781602284683, + "flos": 23368258800000.0, + "grad_norm": 2.396272573281143, + "language_loss": 0.7821492, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.80376875, + "num_input_tokens_seen": 112841185, + "step": 5256, + "time_per_iteration": 2.6798577308654785 + }, + { + "auxiliary_loss_clip": 0.0109858, + "auxiliary_loss_mlp": 0.01042238, + "balance_loss_clip": 1.04874921, + "balance_loss_mlp": 1.02676511, + "epoch": 0.3160679392755148, + "flos": 19828580019840.0, + "grad_norm": 1.575034121408654, + "language_loss": 0.71175283, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.73316103, + "num_input_tokens_seen": 112860570, + "step": 5257, + "time_per_iteration": 4.252342462539673 + }, + { + "auxiliary_loss_clip": 0.01132481, + "auxiliary_loss_mlp": 0.01043271, + "balance_loss_clip": 1.05120182, + "balance_loss_mlp": 1.02524674, + "epoch": 0.31612806252818276, + "flos": 20193216935040.0, + "grad_norm": 2.0196036815267036, + "language_loss": 0.76539034, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.78714788, + "num_input_tokens_seen": 112877975, + "step": 5258, + "time_per_iteration": 4.240477085113525 + }, + { + "auxiliary_loss_clip": 0.01110908, + "auxiliary_loss_mlp": 0.01047088, + "balance_loss_clip": 1.04727268, + "balance_loss_mlp": 1.02917099, + "epoch": 0.31618818578085073, + "flos": 24235967646720.0, + "grad_norm": 3.2354010090655403, + "language_loss": 0.72901475, + "learning_rate": 3.200602180731467e-06, + "loss": 0.75059474, + "num_input_tokens_seen": 112896170, + "step": 5259, + "time_per_iteration": 2.726944923400879 + }, + { + "auxiliary_loss_clip": 0.01117115, + "auxiliary_loss_mlp": 0.00776982, + "balance_loss_clip": 1.04983401, + "balance_loss_mlp": 1.0013001, + "epoch": 0.3162483090335187, + "flos": 25081844002560.0, + "grad_norm": 2.1961272089612307, + "language_loss": 0.66124642, + "learning_rate": 3.20029067660664e-06, + "loss": 0.68018734, + "num_input_tokens_seen": 112916180, + "step": 5260, + "time_per_iteration": 2.7605621814727783 + }, + { + "auxiliary_loss_clip": 0.01130372, + "auxiliary_loss_mlp": 0.01037108, + "balance_loss_clip": 1.04645884, + "balance_loss_mlp": 1.02016842, + "epoch": 0.31630843228618666, + "flos": 26323383646080.0, + "grad_norm": 1.8277182943015604, + "language_loss": 0.71989, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.74156475, + "num_input_tokens_seen": 112936745, + "step": 5261, + "time_per_iteration": 4.231431484222412 + }, + { + "auxiliary_loss_clip": 0.01044321, + "auxiliary_loss_mlp": 0.01007323, + "balance_loss_clip": 1.02311194, + "balance_loss_mlp": 1.00424767, + "epoch": 0.3163685555388547, + "flos": 66758441552640.0, + "grad_norm": 0.7429950107461195, + "language_loss": 0.50646758, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.5269841, + "num_input_tokens_seen": 112994845, + "step": 5262, + "time_per_iteration": 3.232384443283081 + }, + { + "auxiliary_loss_clip": 0.01131333, + "auxiliary_loss_mlp": 0.01046761, + "balance_loss_clip": 1.05222106, + "balance_loss_mlp": 1.02932084, + "epoch": 0.31642867879152264, + "flos": 25995662933760.0, + "grad_norm": 1.5863649349069382, + "language_loss": 0.85187083, + "learning_rate": 3.19935589118856e-06, + "loss": 0.8736518, + "num_input_tokens_seen": 113015125, + "step": 5263, + "time_per_iteration": 4.33522629737854 + }, + { + "auxiliary_loss_clip": 0.01112644, + "auxiliary_loss_mlp": 0.01048382, + "balance_loss_clip": 1.04875994, + "balance_loss_mlp": 1.03256297, + "epoch": 0.3164888020441906, + "flos": 25774955815680.0, + "grad_norm": 1.550008856477613, + "language_loss": 0.81648135, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.83809161, + "num_input_tokens_seen": 113035535, + "step": 5264, + "time_per_iteration": 2.8155312538146973 + }, + { + "auxiliary_loss_clip": 0.01121259, + "auxiliary_loss_mlp": 0.0104222, + "balance_loss_clip": 1.04812968, + "balance_loss_mlp": 1.02431464, + "epoch": 0.3165489252968586, + "flos": 19756220071680.0, + "grad_norm": 2.234025317189389, + "language_loss": 0.78969181, + "learning_rate": 3.19873247349167e-06, + "loss": 0.81132656, + "num_input_tokens_seen": 113052720, + "step": 5265, + "time_per_iteration": 2.6533524990081787 + }, + { + "auxiliary_loss_clip": 0.0113452, + "auxiliary_loss_mlp": 0.01049591, + "balance_loss_clip": 1.05209899, + "balance_loss_mlp": 1.03144741, + "epoch": 0.31660904854952654, + "flos": 23183929180800.0, + "grad_norm": 1.789116232573577, + "language_loss": 0.74705631, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.76889741, + "num_input_tokens_seen": 113071435, + "step": 5266, + "time_per_iteration": 2.66683292388916 + }, + { + "auxiliary_loss_clip": 0.01108402, + "auxiliary_loss_mlp": 0.0104338, + "balance_loss_clip": 1.04636073, + "balance_loss_mlp": 1.02660751, + "epoch": 0.3166691718021945, + "flos": 20408501099520.0, + "grad_norm": 2.507852328081816, + "language_loss": 0.79178059, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.81329834, + "num_input_tokens_seen": 113088645, + "step": 5267, + "time_per_iteration": 2.6870310306549072 + }, + { + "auxiliary_loss_clip": 0.0103642, + "auxiliary_loss_mlp": 0.01002482, + "balance_loss_clip": 1.02563763, + "balance_loss_mlp": 1.00002623, + "epoch": 0.31672929505486247, + "flos": 70144781172480.0, + "grad_norm": 0.7343006553516018, + "language_loss": 0.57840127, + "learning_rate": 3.197797006055478e-06, + "loss": 0.59879029, + "num_input_tokens_seen": 113152775, + "step": 5268, + "time_per_iteration": 3.211494207382202 + }, + { + "auxiliary_loss_clip": 0.01144761, + "auxiliary_loss_mlp": 0.01044165, + "balance_loss_clip": 1.0517385, + "balance_loss_mlp": 1.02729666, + "epoch": 0.31678941830753043, + "flos": 14355758154240.0, + "grad_norm": 2.2657818682072146, + "language_loss": 0.73009932, + "learning_rate": 3.197485092719815e-06, + "loss": 0.75198865, + "num_input_tokens_seen": 113171410, + "step": 5269, + "time_per_iteration": 2.5840115547180176 + }, + { + "auxiliary_loss_clip": 0.01108492, + "auxiliary_loss_mlp": 0.01049824, + "balance_loss_clip": 1.0489136, + "balance_loss_mlp": 1.03283644, + "epoch": 0.3168495415601984, + "flos": 22747722416640.0, + "grad_norm": 2.2273308320264995, + "language_loss": 0.79972744, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.82131052, + "num_input_tokens_seen": 113189965, + "step": 5270, + "time_per_iteration": 2.858154535293579 + }, + { + "auxiliary_loss_clip": 0.01146892, + "auxiliary_loss_mlp": 0.01050124, + "balance_loss_clip": 1.05206418, + "balance_loss_mlp": 1.03207529, + "epoch": 0.31690966481286637, + "flos": 20115254465280.0, + "grad_norm": 9.25747726986636, + "language_loss": 0.7941646, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.81613475, + "num_input_tokens_seen": 113206355, + "step": 5271, + "time_per_iteration": 2.6510884761810303 + }, + { + "auxiliary_loss_clip": 0.01144344, + "auxiliary_loss_mlp": 0.01040088, + "balance_loss_clip": 1.05230093, + "balance_loss_mlp": 1.02269578, + "epoch": 0.31696978806553433, + "flos": 21178928937600.0, + "grad_norm": 1.806612869692892, + "language_loss": 0.72429144, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.74613577, + "num_input_tokens_seen": 113225440, + "step": 5272, + "time_per_iteration": 2.6807363033294678 + }, + { + "auxiliary_loss_clip": 0.01123855, + "auxiliary_loss_mlp": 0.01052611, + "balance_loss_clip": 1.04942703, + "balance_loss_mlp": 1.03365636, + "epoch": 0.3170299113182023, + "flos": 42997030439040.0, + "grad_norm": 2.241731745129767, + "language_loss": 0.69146693, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.71323156, + "num_input_tokens_seen": 113248840, + "step": 5273, + "time_per_iteration": 2.9202728271484375 + }, + { + "auxiliary_loss_clip": 0.01128467, + "auxiliary_loss_mlp": 0.00775845, + "balance_loss_clip": 1.04869509, + "balance_loss_mlp": 1.00146461, + "epoch": 0.31709003457087026, + "flos": 24460158384000.0, + "grad_norm": 1.872718303622414, + "language_loss": 0.67764306, + "learning_rate": 3.195924845146795e-06, + "loss": 0.69668615, + "num_input_tokens_seen": 113269630, + "step": 5274, + "time_per_iteration": 2.6541714668273926 + }, + { + "auxiliary_loss_clip": 0.01092683, + "auxiliary_loss_mlp": 0.0106112, + "balance_loss_clip": 1.04346347, + "balance_loss_mlp": 1.04305935, + "epoch": 0.3171501578235382, + "flos": 24135310759680.0, + "grad_norm": 1.7402048894999724, + "language_loss": 0.80815518, + "learning_rate": 3.195612659536081e-06, + "loss": 0.8296932, + "num_input_tokens_seen": 113291200, + "step": 5275, + "time_per_iteration": 2.840696096420288 + }, + { + "auxiliary_loss_clip": 0.0113287, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_clip": 1.04862475, + "balance_loss_mlp": 1.02979279, + "epoch": 0.31721028107620625, + "flos": 18879712392960.0, + "grad_norm": 2.28886723118271, + "language_loss": 0.72418922, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.74599648, + "num_input_tokens_seen": 113310170, + "step": 5276, + "time_per_iteration": 2.6426591873168945 + }, + { + "auxiliary_loss_clip": 0.01122606, + "auxiliary_loss_mlp": 0.01041381, + "balance_loss_clip": 1.05439019, + "balance_loss_mlp": 1.02588356, + "epoch": 0.3172704043288742, + "flos": 23147874904320.0, + "grad_norm": 1.4542936031710312, + "language_loss": 0.77923822, + "learning_rate": 3.194988152313236e-06, + "loss": 0.80087811, + "num_input_tokens_seen": 113331140, + "step": 5277, + "time_per_iteration": 2.7192864418029785 + }, + { + "auxiliary_loss_clip": 0.01113098, + "auxiliary_loss_mlp": 0.01054598, + "balance_loss_clip": 1.04708886, + "balance_loss_mlp": 1.03432024, + "epoch": 0.3173305275815422, + "flos": 17858520731520.0, + "grad_norm": 2.071832444797603, + "language_loss": 0.79029107, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.81196797, + "num_input_tokens_seen": 113350030, + "step": 5278, + "time_per_iteration": 2.606973648071289 + }, + { + "auxiliary_loss_clip": 0.01041198, + "auxiliary_loss_mlp": 0.01006121, + "balance_loss_clip": 1.02207565, + "balance_loss_mlp": 1.00391531, + "epoch": 0.31739065083421014, + "flos": 59973476883840.0, + "grad_norm": 0.8783580735908582, + "language_loss": 0.62817574, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.64864898, + "num_input_tokens_seen": 113395820, + "step": 5279, + "time_per_iteration": 2.998594284057617 + }, + { + "auxiliary_loss_clip": 0.01146927, + "auxiliary_loss_mlp": 0.01055699, + "balance_loss_clip": 1.05080009, + "balance_loss_mlp": 1.03651857, + "epoch": 0.3174507740868781, + "flos": 23800981944960.0, + "grad_norm": 1.4881688285488497, + "language_loss": 0.80855167, + "learning_rate": 3.194051051653053e-06, + "loss": 0.83057791, + "num_input_tokens_seen": 113416835, + "step": 5280, + "time_per_iteration": 2.662240743637085 + }, + { + "auxiliary_loss_clip": 0.0110603, + "auxiliary_loss_mlp": 0.01050191, + "balance_loss_clip": 1.04850507, + "balance_loss_mlp": 1.0339663, + "epoch": 0.31751089733954607, + "flos": 27638899349760.0, + "grad_norm": 1.6411021360183768, + "language_loss": 0.77964067, + "learning_rate": 3.19373859419346e-06, + "loss": 0.80120289, + "num_input_tokens_seen": 113440850, + "step": 5281, + "time_per_iteration": 2.8303840160369873 + }, + { + "auxiliary_loss_clip": 0.01119054, + "auxiliary_loss_mlp": 0.0103955, + "balance_loss_clip": 1.04812443, + "balance_loss_mlp": 1.02194262, + "epoch": 0.31757102059221404, + "flos": 23769273214080.0, + "grad_norm": 2.6184534699054116, + "language_loss": 0.78539747, + "learning_rate": 3.193426091467179e-06, + "loss": 0.80698353, + "num_input_tokens_seen": 113461000, + "step": 5282, + "time_per_iteration": 2.75915265083313 + }, + { + "auxiliary_loss_clip": 0.01122553, + "auxiliary_loss_mlp": 0.01050996, + "balance_loss_clip": 1.0517695, + "balance_loss_mlp": 1.03284001, + "epoch": 0.317631143844882, + "flos": 25264521596160.0, + "grad_norm": 1.8901773671102746, + "language_loss": 0.67857707, + "learning_rate": 3.193113543486061e-06, + "loss": 0.70031261, + "num_input_tokens_seen": 113480820, + "step": 5283, + "time_per_iteration": 2.710601329803467 + }, + { + "auxiliary_loss_clip": 0.01039071, + "auxiliary_loss_mlp": 0.01003581, + "balance_loss_clip": 1.02084279, + "balance_loss_mlp": 1.00145948, + "epoch": 0.31769126709754997, + "flos": 55825939221120.0, + "grad_norm": 0.7284643981615322, + "language_loss": 0.52787578, + "learning_rate": 3.192800950261958e-06, + "loss": 0.54830229, + "num_input_tokens_seen": 113536910, + "step": 5284, + "time_per_iteration": 3.1312994956970215 + }, + { + "auxiliary_loss_clip": 0.01123508, + "auxiliary_loss_mlp": 0.01041652, + "balance_loss_clip": 1.05256152, + "balance_loss_mlp": 1.02529633, + "epoch": 0.31775139035021793, + "flos": 16690562098560.0, + "grad_norm": 1.6358492252526933, + "language_loss": 0.70703542, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.72868699, + "num_input_tokens_seen": 113555480, + "step": 5285, + "time_per_iteration": 2.66414213180542 + }, + { + "auxiliary_loss_clip": 0.01051594, + "auxiliary_loss_mlp": 0.01001353, + "balance_loss_clip": 1.02112103, + "balance_loss_mlp": 0.99919558, + "epoch": 0.3178115136028859, + "flos": 64227241019520.0, + "grad_norm": 0.8795363824150627, + "language_loss": 0.60495377, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.62548316, + "num_input_tokens_seen": 113616790, + "step": 5286, + "time_per_iteration": 3.1636195182800293 + }, + { + "auxiliary_loss_clip": 0.01145219, + "auxiliary_loss_mlp": 0.01047411, + "balance_loss_clip": 1.05137587, + "balance_loss_mlp": 1.02995849, + "epoch": 0.31787163685555386, + "flos": 18697465762560.0, + "grad_norm": 10.257300688850748, + "language_loss": 0.72160053, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.74352682, + "num_input_tokens_seen": 113635320, + "step": 5287, + "time_per_iteration": 2.628863573074341 + }, + { + "auxiliary_loss_clip": 0.01132987, + "auxiliary_loss_mlp": 0.0105662, + "balance_loss_clip": 1.04966712, + "balance_loss_mlp": 1.03823805, + "epoch": 0.31793176010822183, + "flos": 21324762155520.0, + "grad_norm": 2.3229849512265126, + "language_loss": 0.75706261, + "learning_rate": 3.191550125172792e-06, + "loss": 0.77895868, + "num_input_tokens_seen": 113654000, + "step": 5288, + "time_per_iteration": 2.7565319538116455 + }, + { + "auxiliary_loss_clip": 0.01128698, + "auxiliary_loss_mlp": 0.01037369, + "balance_loss_clip": 1.04913831, + "balance_loss_mlp": 1.02223587, + "epoch": 0.31799188336088985, + "flos": 20958688696320.0, + "grad_norm": 3.550043827117326, + "language_loss": 0.87827504, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.89993572, + "num_input_tokens_seen": 113672375, + "step": 5289, + "time_per_iteration": 2.6671485900878906 + }, + { + "auxiliary_loss_clip": 0.01126628, + "auxiliary_loss_mlp": 0.01039655, + "balance_loss_clip": 1.05225897, + "balance_loss_mlp": 1.02443218, + "epoch": 0.3180520066135578, + "flos": 22491930689280.0, + "grad_norm": 1.767762146387748, + "language_loss": 0.68103814, + "learning_rate": 3.190924441478572e-06, + "loss": 0.70270097, + "num_input_tokens_seen": 113692385, + "step": 5290, + "time_per_iteration": 2.6986947059631348 + }, + { + "auxiliary_loss_clip": 0.01120385, + "auxiliary_loss_mlp": 0.01046806, + "balance_loss_clip": 1.04791737, + "balance_loss_mlp": 1.02924609, + "epoch": 0.3181121298662258, + "flos": 27235335070080.0, + "grad_norm": 2.1353951835610303, + "language_loss": 0.80298805, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.82465994, + "num_input_tokens_seen": 113712145, + "step": 5291, + "time_per_iteration": 2.67692494392395 + }, + { + "auxiliary_loss_clip": 0.01112404, + "auxiliary_loss_mlp": 0.01038285, + "balance_loss_clip": 1.05768418, + "balance_loss_mlp": 1.02066636, + "epoch": 0.31817225311889374, + "flos": 23180158252800.0, + "grad_norm": 4.0426741537939614, + "language_loss": 0.79877901, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.82028592, + "num_input_tokens_seen": 113731435, + "step": 5292, + "time_per_iteration": 2.8386974334716797 + }, + { + "auxiliary_loss_clip": 0.01126783, + "auxiliary_loss_mlp": 0.01037968, + "balance_loss_clip": 1.05076253, + "balance_loss_mlp": 1.0233407, + "epoch": 0.3182323763715617, + "flos": 23258803080960.0, + "grad_norm": 1.5696258430885255, + "language_loss": 0.74754488, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.7691924, + "num_input_tokens_seen": 113750825, + "step": 5293, + "time_per_iteration": 2.651566982269287 + }, + { + "auxiliary_loss_clip": 0.01129161, + "auxiliary_loss_mlp": 0.01045458, + "balance_loss_clip": 1.05253696, + "balance_loss_mlp": 1.03027081, + "epoch": 0.3182924996242297, + "flos": 29016683280000.0, + "grad_norm": 1.9205945835079516, + "language_loss": 0.74100351, + "learning_rate": 3.189672532265379e-06, + "loss": 0.76274973, + "num_input_tokens_seen": 113770010, + "step": 5294, + "time_per_iteration": 2.6593024730682373 + }, + { + "auxiliary_loss_clip": 0.01145372, + "auxiliary_loss_mlp": 0.01038723, + "balance_loss_clip": 1.05254447, + "balance_loss_mlp": 1.02166462, + "epoch": 0.31835262287689764, + "flos": 20449188230400.0, + "grad_norm": 3.618714545146935, + "language_loss": 0.76019043, + "learning_rate": 3.189359442151152e-06, + "loss": 0.78203136, + "num_input_tokens_seen": 113788640, + "step": 5295, + "time_per_iteration": 2.597567558288574 + }, + { + "auxiliary_loss_clip": 0.01110615, + "auxiliary_loss_mlp": 0.01046432, + "balance_loss_clip": 1.04994202, + "balance_loss_mlp": 1.02979052, + "epoch": 0.3184127461295656, + "flos": 25119478477440.0, + "grad_norm": 2.278908740959458, + "language_loss": 0.69146252, + "learning_rate": 3.189046306936296e-06, + "loss": 0.71303296, + "num_input_tokens_seen": 113809515, + "step": 5296, + "time_per_iteration": 4.286029100418091 + }, + { + "auxiliary_loss_clip": 0.01115954, + "auxiliary_loss_mlp": 0.01043279, + "balance_loss_clip": 1.04866266, + "balance_loss_mlp": 1.02709007, + "epoch": 0.31847286938223357, + "flos": 25551231955200.0, + "grad_norm": 1.7786470593469696, + "language_loss": 0.77374327, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.79533565, + "num_input_tokens_seen": 113829770, + "step": 5297, + "time_per_iteration": 4.164870023727417 + }, + { + "auxiliary_loss_clip": 0.0111312, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.05341816, + "balance_loss_mlp": 1.01857328, + "epoch": 0.31853299263490154, + "flos": 27782470010880.0, + "grad_norm": 2.4185702861431104, + "language_loss": 0.79294181, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.81443709, + "num_input_tokens_seen": 113849320, + "step": 5298, + "time_per_iteration": 2.761035919189453 + }, + { + "auxiliary_loss_clip": 0.01127152, + "auxiliary_loss_mlp": 0.01052383, + "balance_loss_clip": 1.05250955, + "balance_loss_mlp": 1.0361588, + "epoch": 0.3185931158875695, + "flos": 22706747976960.0, + "grad_norm": 2.109744523678234, + "language_loss": 0.74082595, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.76262128, + "num_input_tokens_seen": 113867860, + "step": 5299, + "time_per_iteration": 2.6674296855926514 + }, + { + "auxiliary_loss_clip": 0.01133842, + "auxiliary_loss_mlp": 0.01048899, + "balance_loss_clip": 1.05652189, + "balance_loss_mlp": 1.03213775, + "epoch": 0.31865323914023747, + "flos": 24571517523840.0, + "grad_norm": 2.0125699214837627, + "language_loss": 0.78636098, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.80818832, + "num_input_tokens_seen": 113886375, + "step": 5300, + "time_per_iteration": 2.721202850341797 + }, + { + "auxiliary_loss_clip": 0.01119633, + "auxiliary_loss_mlp": 0.01050293, + "balance_loss_clip": 1.04830885, + "balance_loss_mlp": 1.03297138, + "epoch": 0.31871336239290543, + "flos": 18186564666240.0, + "grad_norm": 1.8639511619571896, + "language_loss": 0.83660495, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.8583042, + "num_input_tokens_seen": 113904065, + "step": 5301, + "time_per_iteration": 4.22704291343689 + }, + { + "auxiliary_loss_clip": 0.01131996, + "auxiliary_loss_mlp": 0.01049945, + "balance_loss_clip": 1.05371821, + "balance_loss_mlp": 1.03263569, + "epoch": 0.31877348564557345, + "flos": 21826756679040.0, + "grad_norm": 2.3173946845583444, + "language_loss": 0.77328432, + "learning_rate": 3.187166549199015e-06, + "loss": 0.79510373, + "num_input_tokens_seen": 113918415, + "step": 5302, + "time_per_iteration": 2.6678919792175293 + }, + { + "auxiliary_loss_clip": 0.011364, + "auxiliary_loss_mlp": 0.01039827, + "balance_loss_clip": 1.04891157, + "balance_loss_mlp": 1.02270818, + "epoch": 0.3188336088982414, + "flos": 22015252275840.0, + "grad_norm": 2.352282677018458, + "language_loss": 0.79816842, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.81993073, + "num_input_tokens_seen": 113938135, + "step": 5303, + "time_per_iteration": 4.289660453796387 + }, + { + "auxiliary_loss_clip": 0.0113563, + "auxiliary_loss_mlp": 0.01045445, + "balance_loss_clip": 1.05256605, + "balance_loss_mlp": 1.02739668, + "epoch": 0.3188937321509094, + "flos": 20047886507520.0, + "grad_norm": 2.03328242361333, + "language_loss": 0.72914493, + "learning_rate": 3.186539603020047e-06, + "loss": 0.7509557, + "num_input_tokens_seen": 113957125, + "step": 5304, + "time_per_iteration": 2.6123225688934326 + }, + { + "auxiliary_loss_clip": 0.01106707, + "auxiliary_loss_mlp": 0.01038113, + "balance_loss_clip": 1.04701817, + "balance_loss_mlp": 1.02234125, + "epoch": 0.31895385540357735, + "flos": 25848105863040.0, + "grad_norm": 2.816339992135166, + "language_loss": 0.71918428, + "learning_rate": 3.186226062434068e-06, + "loss": 0.74063241, + "num_input_tokens_seen": 113974875, + "step": 5305, + "time_per_iteration": 2.7341108322143555 + }, + { + "auxiliary_loss_clip": 0.01120594, + "auxiliary_loss_mlp": 0.01042646, + "balance_loss_clip": 1.05007052, + "balance_loss_mlp": 1.0271126, + "epoch": 0.3190139786562453, + "flos": 23477714519040.0, + "grad_norm": 2.1368418928112067, + "language_loss": 0.64082253, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.66245496, + "num_input_tokens_seen": 113994450, + "step": 5306, + "time_per_iteration": 2.678497791290283 + }, + { + "auxiliary_loss_clip": 0.01113987, + "auxiliary_loss_mlp": 0.01046306, + "balance_loss_clip": 1.04777002, + "balance_loss_mlp": 1.02913976, + "epoch": 0.3190741019089133, + "flos": 29095543589760.0, + "grad_norm": 2.249856956834014, + "language_loss": 0.7981708, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.81977379, + "num_input_tokens_seen": 114013945, + "step": 5307, + "time_per_iteration": 2.684825897216797 + }, + { + "auxiliary_loss_clip": 0.01110939, + "auxiliary_loss_mlp": 0.01046246, + "balance_loss_clip": 1.04708028, + "balance_loss_mlp": 1.02869821, + "epoch": 0.31913422516158124, + "flos": 17129534209920.0, + "grad_norm": 1.891192054321282, + "language_loss": 0.77413881, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.79571068, + "num_input_tokens_seen": 114031375, + "step": 5308, + "time_per_iteration": 2.62485408782959 + }, + { + "auxiliary_loss_clip": 0.01142071, + "auxiliary_loss_mlp": 0.01050679, + "balance_loss_clip": 1.05399549, + "balance_loss_mlp": 1.03109312, + "epoch": 0.3191943484142492, + "flos": 16069846147200.0, + "grad_norm": 3.6914677983836586, + "language_loss": 0.73960984, + "learning_rate": 3.184971450390961e-06, + "loss": 0.76153737, + "num_input_tokens_seen": 114048465, + "step": 5309, + "time_per_iteration": 2.6268463134765625 + }, + { + "auxiliary_loss_clip": 0.01134349, + "auxiliary_loss_mlp": 0.01035267, + "balance_loss_clip": 1.05286658, + "balance_loss_mlp": 1.01932931, + "epoch": 0.3192544716669172, + "flos": 22966166977920.0, + "grad_norm": 1.9182514579370458, + "language_loss": 0.82652342, + "learning_rate": 3.184657685014856e-06, + "loss": 0.84821963, + "num_input_tokens_seen": 114068415, + "step": 5310, + "time_per_iteration": 2.649099111557007 + }, + { + "auxiliary_loss_clip": 0.01116653, + "auxiliary_loss_mlp": 0.01039176, + "balance_loss_clip": 1.04808259, + "balance_loss_mlp": 1.02340484, + "epoch": 0.31931459491958514, + "flos": 26870339018880.0, + "grad_norm": 2.200225110342558, + "language_loss": 0.78296745, + "learning_rate": 3.184343874716412e-06, + "loss": 0.80452585, + "num_input_tokens_seen": 114088565, + "step": 5311, + "time_per_iteration": 2.7054250240325928 + }, + { + "auxiliary_loss_clip": 0.01106724, + "auxiliary_loss_mlp": 0.01036895, + "balance_loss_clip": 1.04822886, + "balance_loss_mlp": 1.01952648, + "epoch": 0.3193747181722531, + "flos": 21836525178240.0, + "grad_norm": 2.0057857548781883, + "language_loss": 0.84169972, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.86313581, + "num_input_tokens_seen": 114107160, + "step": 5312, + "time_per_iteration": 2.749263048171997 + }, + { + "auxiliary_loss_clip": 0.01093899, + "auxiliary_loss_mlp": 0.01053441, + "balance_loss_clip": 1.04266024, + "balance_loss_mlp": 1.03477311, + "epoch": 0.31943484142492107, + "flos": 18324999682560.0, + "grad_norm": 3.6700749085790063, + "language_loss": 0.78648412, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.80795753, + "num_input_tokens_seen": 114123420, + "step": 5313, + "time_per_iteration": 2.720930814743042 + }, + { + "auxiliary_loss_clip": 0.01130677, + "auxiliary_loss_mlp": 0.01038161, + "balance_loss_clip": 1.05141878, + "balance_loss_mlp": 1.0219605, + "epoch": 0.31949496467758903, + "flos": 21615818060160.0, + "grad_norm": 2.386195329240294, + "language_loss": 0.86217451, + "learning_rate": 3.183402174406057e-06, + "loss": 0.88386285, + "num_input_tokens_seen": 114139230, + "step": 5314, + "time_per_iteration": 2.6785764694213867 + }, + { + "auxiliary_loss_clip": 0.01116655, + "auxiliary_loss_mlp": 0.01050856, + "balance_loss_clip": 1.04983997, + "balance_loss_mlp": 1.03231871, + "epoch": 0.31955508793025705, + "flos": 21760214734080.0, + "grad_norm": 1.996028492072791, + "language_loss": 0.79866767, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.82034278, + "num_input_tokens_seen": 114159290, + "step": 5315, + "time_per_iteration": 2.723097085952759 + }, + { + "auxiliary_loss_clip": 0.0110521, + "auxiliary_loss_mlp": 0.01063258, + "balance_loss_clip": 1.04667854, + "balance_loss_mlp": 1.04386258, + "epoch": 0.319615211182925, + "flos": 17164331510400.0, + "grad_norm": 2.2633227615123275, + "language_loss": 0.67312729, + "learning_rate": 3.18277414980567e-06, + "loss": 0.69481194, + "num_input_tokens_seen": 114177655, + "step": 5316, + "time_per_iteration": 2.7841827869415283 + }, + { + "auxiliary_loss_clip": 0.01131119, + "auxiliary_loss_mlp": 0.01046731, + "balance_loss_clip": 1.05015874, + "balance_loss_mlp": 1.03126907, + "epoch": 0.319675334435593, + "flos": 28112812416000.0, + "grad_norm": 1.540647016415601, + "language_loss": 0.69375229, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.71553081, + "num_input_tokens_seen": 114200880, + "step": 5317, + "time_per_iteration": 2.7080705165863037 + }, + { + "auxiliary_loss_clip": 0.01036788, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.02571428, + "balance_loss_mlp": 1.03117692, + "epoch": 0.31973545768826095, + "flos": 69501119408640.0, + "grad_norm": 0.7974882454120521, + "language_loss": 0.53049421, + "learning_rate": 3.182145945801628e-06, + "loss": 0.55119646, + "num_input_tokens_seen": 114267145, + "step": 5318, + "time_per_iteration": 3.5072765350341797 + }, + { + "auxiliary_loss_clip": 0.0114058, + "auxiliary_loss_mlp": 0.01041014, + "balance_loss_clip": 1.05322218, + "balance_loss_mlp": 1.02509975, + "epoch": 0.3197955809409289, + "flos": 13699203408000.0, + "grad_norm": 3.679429868734815, + "language_loss": 0.84239668, + "learning_rate": 3.181831776553012e-06, + "loss": 0.86421257, + "num_input_tokens_seen": 114284630, + "step": 5319, + "time_per_iteration": 2.6148228645324707 + }, + { + "auxiliary_loss_clip": 0.0112589, + "auxiliary_loss_mlp": 0.01041338, + "balance_loss_clip": 1.04876614, + "balance_loss_mlp": 1.02552485, + "epoch": 0.3198557041935969, + "flos": 33218124278400.0, + "grad_norm": 1.684363339069699, + "language_loss": 0.63463295, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.65630519, + "num_input_tokens_seen": 114305830, + "step": 5320, + "time_per_iteration": 2.7444913387298584 + }, + { + "auxiliary_loss_clip": 0.01120865, + "auxiliary_loss_mlp": 0.01042926, + "balance_loss_clip": 1.05072045, + "balance_loss_mlp": 1.02682114, + "epoch": 0.31991582744626484, + "flos": 23732033788800.0, + "grad_norm": 2.113040492667506, + "language_loss": 0.70552826, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.72716618, + "num_input_tokens_seen": 114325165, + "step": 5321, + "time_per_iteration": 2.7078404426574707 + }, + { + "auxiliary_loss_clip": 0.01151862, + "auxiliary_loss_mlp": 0.00776802, + "balance_loss_clip": 1.05639851, + "balance_loss_mlp": 1.00126243, + "epoch": 0.3199759506989328, + "flos": 18550842445440.0, + "grad_norm": 2.699319417691227, + "language_loss": 0.8659147, + "learning_rate": 3.180888999963749e-06, + "loss": 0.88520133, + "num_input_tokens_seen": 114341310, + "step": 5322, + "time_per_iteration": 2.5562047958374023 + }, + { + "auxiliary_loss_clip": 0.01119411, + "auxiliary_loss_mlp": 0.01038951, + "balance_loss_clip": 1.05106568, + "balance_loss_mlp": 1.02265561, + "epoch": 0.3200360739516008, + "flos": 22418888382720.0, + "grad_norm": 1.7451682184714292, + "language_loss": 0.83021653, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.85180014, + "num_input_tokens_seen": 114360355, + "step": 5323, + "time_per_iteration": 2.6323180198669434 + }, + { + "auxiliary_loss_clip": 0.01129356, + "auxiliary_loss_mlp": 0.01041616, + "balance_loss_clip": 1.05092812, + "balance_loss_mlp": 1.02440214, + "epoch": 0.32009619720426874, + "flos": 20595236929920.0, + "grad_norm": 1.6785162629315, + "language_loss": 0.77686846, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.79857814, + "num_input_tokens_seen": 114379220, + "step": 5324, + "time_per_iteration": 2.6361289024353027 + }, + { + "auxiliary_loss_clip": 0.01115575, + "auxiliary_loss_mlp": 0.01035772, + "balance_loss_clip": 1.04754376, + "balance_loss_mlp": 1.01861751, + "epoch": 0.3201563204569367, + "flos": 18147637301760.0, + "grad_norm": 1.9010400542588533, + "language_loss": 0.80500418, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.82651764, + "num_input_tokens_seen": 114396365, + "step": 5325, + "time_per_iteration": 2.681349277496338 + }, + { + "auxiliary_loss_clip": 0.01133585, + "auxiliary_loss_mlp": 0.01039966, + "balance_loss_clip": 1.05378425, + "balance_loss_mlp": 1.02394414, + "epoch": 0.32021644370960467, + "flos": 31684235840640.0, + "grad_norm": 1.7412856997403743, + "language_loss": 0.74817789, + "learning_rate": 3.179631337655037e-06, + "loss": 0.76991343, + "num_input_tokens_seen": 114416780, + "step": 5326, + "time_per_iteration": 2.6932616233825684 + }, + { + "auxiliary_loss_clip": 0.01103829, + "auxiliary_loss_mlp": 0.0104309, + "balance_loss_clip": 1.05045807, + "balance_loss_mlp": 1.02659154, + "epoch": 0.32027656696227264, + "flos": 26865921646080.0, + "grad_norm": 1.642662123916105, + "language_loss": 0.80796289, + "learning_rate": 3.179316810218701e-06, + "loss": 0.82943213, + "num_input_tokens_seen": 114437405, + "step": 5327, + "time_per_iteration": 2.7527899742126465 + }, + { + "auxiliary_loss_clip": 0.01115203, + "auxiliary_loss_mlp": 0.01038297, + "balance_loss_clip": 1.05185604, + "balance_loss_mlp": 1.02162015, + "epoch": 0.32033669021494066, + "flos": 24169928492160.0, + "grad_norm": 1.846540372387515, + "language_loss": 0.77796161, + "learning_rate": 3.179002238062554e-06, + "loss": 0.79949659, + "num_input_tokens_seen": 114458505, + "step": 5328, + "time_per_iteration": 2.7631096839904785 + }, + { + "auxiliary_loss_clip": 0.01087281, + "auxiliary_loss_mlp": 0.01043102, + "balance_loss_clip": 1.0453198, + "balance_loss_mlp": 1.0245527, + "epoch": 0.3203968134676086, + "flos": 24460768915200.0, + "grad_norm": 1.6837826518335735, + "language_loss": 0.74184239, + "learning_rate": 3.178687621198524e-06, + "loss": 0.76314622, + "num_input_tokens_seen": 114479050, + "step": 5329, + "time_per_iteration": 2.7749221324920654 + }, + { + "auxiliary_loss_clip": 0.01110066, + "auxiliary_loss_mlp": 0.01036662, + "balance_loss_clip": 1.04650402, + "balance_loss_mlp": 1.02133203, + "epoch": 0.3204569367202766, + "flos": 18004713085440.0, + "grad_norm": 1.7163505659405243, + "language_loss": 0.71138644, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.73285371, + "num_input_tokens_seen": 114497415, + "step": 5330, + "time_per_iteration": 2.655578136444092 + }, + { + "auxiliary_loss_clip": 0.01093261, + "auxiliary_loss_mlp": 0.01053955, + "balance_loss_clip": 1.05082417, + "balance_loss_mlp": 1.03379714, + "epoch": 0.32051705997294455, + "flos": 30589678650240.0, + "grad_norm": 1.6854796065505788, + "language_loss": 0.80175424, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.82322645, + "num_input_tokens_seen": 114518785, + "step": 5331, + "time_per_iteration": 2.851639747619629 + }, + { + "auxiliary_loss_clip": 0.01040347, + "auxiliary_loss_mlp": 0.01008357, + "balance_loss_clip": 1.02573299, + "balance_loss_mlp": 1.0059495, + "epoch": 0.3205771832256125, + "flos": 68417979765120.0, + "grad_norm": 0.8321512232204817, + "language_loss": 0.57821107, + "learning_rate": 3.177743502478447e-06, + "loss": 0.59869808, + "num_input_tokens_seen": 114577710, + "step": 5332, + "time_per_iteration": 3.1104307174682617 + }, + { + "auxiliary_loss_clip": 0.01104131, + "auxiliary_loss_mlp": 0.01038271, + "balance_loss_clip": 1.04842329, + "balance_loss_mlp": 1.02194548, + "epoch": 0.3206373064782805, + "flos": 30443953173120.0, + "grad_norm": 1.7127909178457088, + "language_loss": 0.72918129, + "learning_rate": 3.177428706902205e-06, + "loss": 0.75060534, + "num_input_tokens_seen": 114598640, + "step": 5333, + "time_per_iteration": 2.7683963775634766 + }, + { + "auxiliary_loss_clip": 0.01118957, + "auxiliary_loss_mlp": 0.01043487, + "balance_loss_clip": 1.04778981, + "balance_loss_mlp": 1.02685761, + "epoch": 0.32069742973094845, + "flos": 22054502862720.0, + "grad_norm": 2.1728626414536767, + "language_loss": 0.70592654, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.72755098, + "num_input_tokens_seen": 114618780, + "step": 5334, + "time_per_iteration": 2.6861116886138916 + }, + { + "auxiliary_loss_clip": 0.01100969, + "auxiliary_loss_mlp": 0.01041644, + "balance_loss_clip": 1.04742825, + "balance_loss_mlp": 1.02536023, + "epoch": 0.3207575529836164, + "flos": 22054000072320.0, + "grad_norm": 2.526978692505362, + "language_loss": 0.77161503, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.79304117, + "num_input_tokens_seen": 114637525, + "step": 5335, + "time_per_iteration": 4.33164381980896 + }, + { + "auxiliary_loss_clip": 0.01130469, + "auxiliary_loss_mlp": 0.01038297, + "balance_loss_clip": 1.05087018, + "balance_loss_mlp": 1.02213204, + "epoch": 0.3208176762362844, + "flos": 34057536186240.0, + "grad_norm": 1.6997548644452432, + "language_loss": 0.68414462, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.7058323, + "num_input_tokens_seen": 114659705, + "step": 5336, + "time_per_iteration": 2.840373992919922 + }, + { + "auxiliary_loss_clip": 0.01102432, + "auxiliary_loss_mlp": 0.01055244, + "balance_loss_clip": 1.04495001, + "balance_loss_mlp": 1.03862596, + "epoch": 0.32087779948895234, + "flos": 21798711135360.0, + "grad_norm": 1.733261513029939, + "language_loss": 0.78828537, + "learning_rate": 3.176169078234487e-06, + "loss": 0.8098622, + "num_input_tokens_seen": 114678340, + "step": 5337, + "time_per_iteration": 4.268811464309692 + }, + { + "auxiliary_loss_clip": 0.01121282, + "auxiliary_loss_mlp": 0.01039712, + "balance_loss_clip": 1.04696417, + "balance_loss_mlp": 1.02512085, + "epoch": 0.3209379227416203, + "flos": 21434110133760.0, + "grad_norm": 2.1583979373304194, + "language_loss": 0.74322718, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.76483715, + "num_input_tokens_seen": 114696980, + "step": 5338, + "time_per_iteration": 2.6442766189575195 + }, + { + "auxiliary_loss_clip": 0.01119062, + "auxiliary_loss_mlp": 0.01047297, + "balance_loss_clip": 1.04633641, + "balance_loss_mlp": 1.03078675, + "epoch": 0.3209980459942883, + "flos": 25849075530240.0, + "grad_norm": 2.118549362741933, + "language_loss": 0.62622869, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.64789224, + "num_input_tokens_seen": 114717330, + "step": 5339, + "time_per_iteration": 2.684843063354492 + }, + { + "auxiliary_loss_clip": 0.01141698, + "auxiliary_loss_mlp": 0.01046177, + "balance_loss_clip": 1.05127931, + "balance_loss_mlp": 1.02954674, + "epoch": 0.32105816924695624, + "flos": 19099162535040.0, + "grad_norm": 2.480509085809345, + "language_loss": 0.81685597, + "learning_rate": 3.175223888387192e-06, + "loss": 0.83873475, + "num_input_tokens_seen": 114736320, + "step": 5340, + "time_per_iteration": 4.130942344665527 + }, + { + "auxiliary_loss_clip": 0.01110441, + "auxiliary_loss_mlp": 0.01050741, + "balance_loss_clip": 1.04820514, + "balance_loss_mlp": 1.03462362, + "epoch": 0.3211182924996242, + "flos": 16581860565120.0, + "grad_norm": 2.326860742494733, + "language_loss": 0.76571834, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.78733015, + "num_input_tokens_seen": 114754575, + "step": 5341, + "time_per_iteration": 2.7302300930023193 + }, + { + "auxiliary_loss_clip": 0.01101828, + "auxiliary_loss_mlp": 0.01044591, + "balance_loss_clip": 1.04797173, + "balance_loss_mlp": 1.02840281, + "epoch": 0.3211784157522922, + "flos": 22672202071680.0, + "grad_norm": 1.680960149410583, + "language_loss": 0.79268491, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.81414914, + "num_input_tokens_seen": 114773590, + "step": 5342, + "time_per_iteration": 4.462036609649658 + }, + { + "auxiliary_loss_clip": 0.01118478, + "auxiliary_loss_mlp": 0.01045941, + "balance_loss_clip": 1.05000186, + "balance_loss_mlp": 1.02876329, + "epoch": 0.3212385390049602, + "flos": 20558787603840.0, + "grad_norm": 3.232512085646521, + "language_loss": 0.74449253, + "learning_rate": 3.174278297458438e-06, + "loss": 0.76613677, + "num_input_tokens_seen": 114790775, + "step": 5343, + "time_per_iteration": 2.7057244777679443 + }, + { + "auxiliary_loss_clip": 0.01080228, + "auxiliary_loss_mlp": 0.0104431, + "balance_loss_clip": 1.04317784, + "balance_loss_mlp": 1.02704811, + "epoch": 0.32129866225762815, + "flos": 24791147233920.0, + "grad_norm": 1.672847320129023, + "language_loss": 0.82661629, + "learning_rate": 3.173963011408748e-06, + "loss": 0.84786165, + "num_input_tokens_seen": 114809835, + "step": 5344, + "time_per_iteration": 2.801013231277466 + }, + { + "auxiliary_loss_clip": 0.01088811, + "auxiliary_loss_mlp": 0.01042568, + "balance_loss_clip": 1.04556143, + "balance_loss_mlp": 1.02565217, + "epoch": 0.3213587855102961, + "flos": 18366871962240.0, + "grad_norm": 22.33494793204904, + "language_loss": 0.79863501, + "learning_rate": 3.173647680842262e-06, + "loss": 0.81994879, + "num_input_tokens_seen": 114826505, + "step": 5345, + "time_per_iteration": 2.743778944015503 + }, + { + "auxiliary_loss_clip": 0.01114864, + "auxiliary_loss_mlp": 0.01041047, + "balance_loss_clip": 1.04774046, + "balance_loss_mlp": 1.02507281, + "epoch": 0.3214189087629641, + "flos": 27015992668800.0, + "grad_norm": 2.095379605818748, + "language_loss": 0.83340824, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.85496742, + "num_input_tokens_seen": 114846140, + "step": 5346, + "time_per_iteration": 2.8187026977539062 + }, + { + "auxiliary_loss_clip": 0.01110187, + "auxiliary_loss_mlp": 0.01045041, + "balance_loss_clip": 1.04783988, + "balance_loss_mlp": 1.02797008, + "epoch": 0.32147903201563205, + "flos": 23148269953920.0, + "grad_norm": 1.6371928172660764, + "language_loss": 0.81853002, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.84008235, + "num_input_tokens_seen": 114866660, + "step": 5347, + "time_per_iteration": 2.724003553390503 + }, + { + "auxiliary_loss_clip": 0.0112676, + "auxiliary_loss_mlp": 0.01047135, + "balance_loss_clip": 1.048388, + "balance_loss_mlp": 1.02891994, + "epoch": 0.3215391552683, + "flos": 16580747243520.0, + "grad_norm": 4.152516057334243, + "language_loss": 0.80263776, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.8243767, + "num_input_tokens_seen": 114882820, + "step": 5348, + "time_per_iteration": 2.6249122619628906 + }, + { + "auxiliary_loss_clip": 0.01113488, + "auxiliary_loss_mlp": 0.0105622, + "balance_loss_clip": 1.04640627, + "balance_loss_mlp": 1.03931606, + "epoch": 0.321599278520968, + "flos": 17821820010240.0, + "grad_norm": 2.570277900111974, + "language_loss": 0.85020632, + "learning_rate": 3.172385913647542e-06, + "loss": 0.87190342, + "num_input_tokens_seen": 114900745, + "step": 5349, + "time_per_iteration": 2.6685211658477783 + }, + { + "auxiliary_loss_clip": 0.01113139, + "auxiliary_loss_mlp": 0.0104332, + "balance_loss_clip": 1.04840457, + "balance_loss_mlp": 1.02644002, + "epoch": 0.32165940177363594, + "flos": 16251769555200.0, + "grad_norm": 2.7209437086115282, + "language_loss": 0.80619532, + "learning_rate": 3.172070360676475e-06, + "loss": 0.82775992, + "num_input_tokens_seen": 114917940, + "step": 5350, + "time_per_iteration": 2.6857874393463135 + }, + { + "auxiliary_loss_clip": 0.01128309, + "auxiliary_loss_mlp": 0.01045442, + "balance_loss_clip": 1.05025196, + "balance_loss_mlp": 1.02955103, + "epoch": 0.3217195250263039, + "flos": 27599900158080.0, + "grad_norm": 5.5112684101117395, + "language_loss": 0.80060112, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.82233858, + "num_input_tokens_seen": 114937735, + "step": 5351, + "time_per_iteration": 2.68406081199646 + }, + { + "auxiliary_loss_clip": 0.01104774, + "auxiliary_loss_mlp": 0.01045518, + "balance_loss_clip": 1.04905438, + "balance_loss_mlp": 1.02811348, + "epoch": 0.3217796482789719, + "flos": 21470595373440.0, + "grad_norm": 2.189681121413186, + "language_loss": 0.75826663, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.7797696, + "num_input_tokens_seen": 114956630, + "step": 5352, + "time_per_iteration": 2.7035396099090576 + }, + { + "auxiliary_loss_clip": 0.0109763, + "auxiliary_loss_mlp": 0.01043305, + "balance_loss_clip": 1.04897571, + "balance_loss_mlp": 1.02579308, + "epoch": 0.32183977153163984, + "flos": 21215593745280.0, + "grad_norm": 2.4508783518814807, + "language_loss": 0.81992233, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.84133166, + "num_input_tokens_seen": 114976470, + "step": 5353, + "time_per_iteration": 2.731339931488037 + }, + { + "auxiliary_loss_clip": 0.01074627, + "auxiliary_loss_mlp": 0.0104331, + "balance_loss_clip": 1.04917347, + "balance_loss_mlp": 1.02605999, + "epoch": 0.3218998947843078, + "flos": 24608182331520.0, + "grad_norm": 2.2390857397461246, + "language_loss": 0.73474252, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.75592184, + "num_input_tokens_seen": 114996710, + "step": 5354, + "time_per_iteration": 2.8337595462799072 + }, + { + "auxiliary_loss_clip": 0.01103547, + "auxiliary_loss_mlp": 0.01039731, + "balance_loss_clip": 1.04475546, + "balance_loss_mlp": 1.02428102, + "epoch": 0.3219600180369758, + "flos": 22270577126400.0, + "grad_norm": 1.8690515367544651, + "language_loss": 0.83792925, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.85936201, + "num_input_tokens_seen": 115015775, + "step": 5355, + "time_per_iteration": 2.7299652099609375 + }, + { + "auxiliary_loss_clip": 0.01146025, + "auxiliary_loss_mlp": 0.01046795, + "balance_loss_clip": 1.05450225, + "balance_loss_mlp": 1.03032064, + "epoch": 0.3220201412896438, + "flos": 14939126939520.0, + "grad_norm": 1.9705527058452093, + "language_loss": 0.70895493, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.73088312, + "num_input_tokens_seen": 115034265, + "step": 5356, + "time_per_iteration": 2.638268232345581 + }, + { + "auxiliary_loss_clip": 0.01102103, + "auxiliary_loss_mlp": 0.01040751, + "balance_loss_clip": 1.04954576, + "balance_loss_mlp": 1.02245283, + "epoch": 0.32208026454231176, + "flos": 22667389649280.0, + "grad_norm": 2.5241040535813095, + "language_loss": 0.67760962, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.69903815, + "num_input_tokens_seen": 115051945, + "step": 5357, + "time_per_iteration": 2.7816576957702637 + }, + { + "auxiliary_loss_clip": 0.01037625, + "auxiliary_loss_mlp": 0.01029071, + "balance_loss_clip": 1.0279882, + "balance_loss_mlp": 1.02722347, + "epoch": 0.3221403877949797, + "flos": 64605130053120.0, + "grad_norm": 0.7244200234208643, + "language_loss": 0.58319688, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.60386384, + "num_input_tokens_seen": 115119090, + "step": 5358, + "time_per_iteration": 3.3341448307037354 + }, + { + "auxiliary_loss_clip": 0.01076802, + "auxiliary_loss_mlp": 0.01044493, + "balance_loss_clip": 1.04142976, + "balance_loss_mlp": 1.0270052, + "epoch": 0.3222005110476477, + "flos": 20157019004160.0, + "grad_norm": 2.2322811787478427, + "language_loss": 0.83184302, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.85305595, + "num_input_tokens_seen": 115137755, + "step": 5359, + "time_per_iteration": 2.836543083190918 + }, + { + "auxiliary_loss_clip": 0.01129966, + "auxiliary_loss_mlp": 0.01035598, + "balance_loss_clip": 1.04800034, + "balance_loss_mlp": 1.01938617, + "epoch": 0.32226063430031565, + "flos": 22674177319680.0, + "grad_norm": 2.0261007556732964, + "language_loss": 0.79563689, + "learning_rate": 3.168912388464595e-06, + "loss": 0.81729257, + "num_input_tokens_seen": 115158150, + "step": 5360, + "time_per_iteration": 2.66043758392334 + }, + { + "auxiliary_loss_clip": 0.01045199, + "auxiliary_loss_mlp": 0.01009155, + "balance_loss_clip": 1.02352595, + "balance_loss_mlp": 1.00706911, + "epoch": 0.3223207575529836, + "flos": 63828525075840.0, + "grad_norm": 0.6569282603798298, + "language_loss": 0.56928504, + "learning_rate": 3.168596347256737e-06, + "loss": 0.58982855, + "num_input_tokens_seen": 115212755, + "step": 5361, + "time_per_iteration": 3.007119655609131 + }, + { + "auxiliary_loss_clip": 0.01078785, + "auxiliary_loss_mlp": 0.01049092, + "balance_loss_clip": 1.04366553, + "balance_loss_mlp": 1.03166366, + "epoch": 0.3223808808056516, + "flos": 26870123537280.0, + "grad_norm": 3.2787914187636495, + "language_loss": 0.71563178, + "learning_rate": 3.168280261735588e-06, + "loss": 0.73691058, + "num_input_tokens_seen": 115233090, + "step": 5362, + "time_per_iteration": 2.8345048427581787 + }, + { + "auxiliary_loss_clip": 0.0112485, + "auxiliary_loss_mlp": 0.01053523, + "balance_loss_clip": 1.04899716, + "balance_loss_mlp": 1.03670287, + "epoch": 0.32244100405831955, + "flos": 26761350176640.0, + "grad_norm": 2.1292104037374773, + "language_loss": 0.74106693, + "learning_rate": 3.167964131913135e-06, + "loss": 0.76285076, + "num_input_tokens_seen": 115252645, + "step": 5363, + "time_per_iteration": 2.70552659034729 + }, + { + "auxiliary_loss_clip": 0.01134941, + "auxiliary_loss_mlp": 0.01042612, + "balance_loss_clip": 1.05024791, + "balance_loss_mlp": 1.02637601, + "epoch": 0.3225011273109875, + "flos": 23803029020160.0, + "grad_norm": 3.812297759050374, + "language_loss": 0.77379405, + "learning_rate": 3.167647957801365e-06, + "loss": 0.7955696, + "num_input_tokens_seen": 115269085, + "step": 5364, + "time_per_iteration": 2.66058087348938 + }, + { + "auxiliary_loss_clip": 0.01120766, + "auxiliary_loss_mlp": 0.01042612, + "balance_loss_clip": 1.05058861, + "balance_loss_mlp": 1.02468252, + "epoch": 0.3225612505636555, + "flos": 17274505501440.0, + "grad_norm": 3.514939630870356, + "language_loss": 0.76727009, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.78890389, + "num_input_tokens_seen": 115286470, + "step": 5365, + "time_per_iteration": 2.6493194103240967 + }, + { + "auxiliary_loss_clip": 0.01124156, + "auxiliary_loss_mlp": 0.01048476, + "balance_loss_clip": 1.05429566, + "balance_loss_mlp": 1.03201342, + "epoch": 0.32262137381632344, + "flos": 23366247638400.0, + "grad_norm": 7.419360933702927, + "language_loss": 0.76938248, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.79110885, + "num_input_tokens_seen": 115307000, + "step": 5366, + "time_per_iteration": 2.6984689235687256 + }, + { + "auxiliary_loss_clip": 0.01110868, + "auxiliary_loss_mlp": 0.01044399, + "balance_loss_clip": 1.04554594, + "balance_loss_mlp": 1.02792382, + "epoch": 0.3226814970689914, + "flos": 23258803080960.0, + "grad_norm": 2.2843777844497453, + "language_loss": 0.71972823, + "learning_rate": 3.166699169850055e-06, + "loss": 0.74128091, + "num_input_tokens_seen": 115325925, + "step": 5367, + "time_per_iteration": 2.6944496631622314 + }, + { + "auxiliary_loss_clip": 0.01138096, + "auxiliary_loss_mlp": 0.01043716, + "balance_loss_clip": 1.05035067, + "balance_loss_mlp": 1.0286001, + "epoch": 0.32274162032165943, + "flos": 16395196561920.0, + "grad_norm": 13.04054524246424, + "language_loss": 0.74414504, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.76596308, + "num_input_tokens_seen": 115343705, + "step": 5368, + "time_per_iteration": 2.670567750930786 + }, + { + "auxiliary_loss_clip": 0.01103298, + "auxiliary_loss_mlp": 0.01049074, + "balance_loss_clip": 1.04370904, + "balance_loss_mlp": 1.0322659, + "epoch": 0.3228017435743274, + "flos": 27855081354240.0, + "grad_norm": 1.655769512058306, + "language_loss": 0.78693509, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.80845881, + "num_input_tokens_seen": 115364170, + "step": 5369, + "time_per_iteration": 2.777437448501587 + }, + { + "auxiliary_loss_clip": 0.01099309, + "auxiliary_loss_mlp": 0.01037821, + "balance_loss_clip": 1.04874706, + "balance_loss_mlp": 1.0222764, + "epoch": 0.32286186682699536, + "flos": 19608770741760.0, + "grad_norm": 13.189929997499553, + "language_loss": 0.83189309, + "learning_rate": 3.16574998372661e-06, + "loss": 0.85326445, + "num_input_tokens_seen": 115382495, + "step": 5370, + "time_per_iteration": 2.734342336654663 + }, + { + "auxiliary_loss_clip": 0.01141788, + "auxiliary_loss_mlp": 0.01044735, + "balance_loss_clip": 1.05202413, + "balance_loss_mlp": 1.0291779, + "epoch": 0.3229219900796633, + "flos": 24134017870080.0, + "grad_norm": 3.3293058605981614, + "language_loss": 0.8288244, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.85068965, + "num_input_tokens_seen": 115399450, + "step": 5371, + "time_per_iteration": 2.620091676712036 + }, + { + "auxiliary_loss_clip": 0.01133164, + "auxiliary_loss_mlp": 0.00776239, + "balance_loss_clip": 1.05046356, + "balance_loss_mlp": 1.00122416, + "epoch": 0.3229821133323313, + "flos": 17748705876480.0, + "grad_norm": 3.1117013800624993, + "language_loss": 0.8852632, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.90435725, + "num_input_tokens_seen": 115417700, + "step": 5372, + "time_per_iteration": 2.673567056655884 + }, + { + "auxiliary_loss_clip": 0.01140269, + "auxiliary_loss_mlp": 0.01049295, + "balance_loss_clip": 1.05098414, + "balance_loss_mlp": 1.03341591, + "epoch": 0.32304223658499925, + "flos": 22346025644160.0, + "grad_norm": 2.7114986433136727, + "language_loss": 0.73388374, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.75577939, + "num_input_tokens_seen": 115435840, + "step": 5373, + "time_per_iteration": 2.6910293102264404 + }, + { + "auxiliary_loss_clip": 0.0110976, + "auxiliary_loss_mlp": 0.01044756, + "balance_loss_clip": 1.04653084, + "balance_loss_mlp": 1.02873468, + "epoch": 0.3231023598376672, + "flos": 18478302929280.0, + "grad_norm": 2.3161305262959573, + "language_loss": 0.81114149, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.83268672, + "num_input_tokens_seen": 115454210, + "step": 5374, + "time_per_iteration": 2.666707992553711 + }, + { + "auxiliary_loss_clip": 0.01095169, + "auxiliary_loss_mlp": 0.01038679, + "balance_loss_clip": 1.0438931, + "balance_loss_mlp": 1.02254975, + "epoch": 0.3231624830903352, + "flos": 27636313570560.0, + "grad_norm": 2.1309099752285863, + "language_loss": 0.87817222, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.89951062, + "num_input_tokens_seen": 115471785, + "step": 5375, + "time_per_iteration": 4.252593994140625 + }, + { + "auxiliary_loss_clip": 0.01140942, + "auxiliary_loss_mlp": 0.01036182, + "balance_loss_clip": 1.04865098, + "balance_loss_mlp": 1.01960015, + "epoch": 0.32322260634300315, + "flos": 21726423014400.0, + "grad_norm": 2.12002794330764, + "language_loss": 0.75837636, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.78014749, + "num_input_tokens_seen": 115491405, + "step": 5376, + "time_per_iteration": 2.64569091796875 + }, + { + "auxiliary_loss_clip": 0.01100111, + "auxiliary_loss_mlp": 0.01037893, + "balance_loss_clip": 1.04745007, + "balance_loss_mlp": 1.0227654, + "epoch": 0.3232827295956711, + "flos": 22637656166400.0, + "grad_norm": 16.356053535517315, + "language_loss": 0.66570163, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.68708175, + "num_input_tokens_seen": 115511555, + "step": 5377, + "time_per_iteration": 4.228315591812134 + }, + { + "auxiliary_loss_clip": 0.01103406, + "auxiliary_loss_mlp": 0.01059488, + "balance_loss_clip": 1.04591548, + "balance_loss_mlp": 1.04070055, + "epoch": 0.3233428528483391, + "flos": 26322593546880.0, + "grad_norm": 1.5026052482517693, + "language_loss": 0.72276354, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.74439251, + "num_input_tokens_seen": 115532860, + "step": 5378, + "time_per_iteration": 2.7754812240600586 + }, + { + "auxiliary_loss_clip": 0.0112205, + "auxiliary_loss_mlp": 0.0103656, + "balance_loss_clip": 1.04869092, + "balance_loss_mlp": 1.0214678, + "epoch": 0.32340297610100704, + "flos": 28585217111040.0, + "grad_norm": 2.7898138283200344, + "language_loss": 0.82221997, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.84380603, + "num_input_tokens_seen": 115553850, + "step": 5379, + "time_per_iteration": 2.672743320465088 + }, + { + "auxiliary_loss_clip": 0.01130962, + "auxiliary_loss_mlp": 0.01035985, + "balance_loss_clip": 1.04864693, + "balance_loss_mlp": 1.02083325, + "epoch": 0.323463099353675, + "flos": 30773792787840.0, + "grad_norm": 1.5555457678220286, + "language_loss": 0.78895414, + "learning_rate": 3.162583158454388e-06, + "loss": 0.81062359, + "num_input_tokens_seen": 115575530, + "step": 5380, + "time_per_iteration": 4.130786180496216 + }, + { + "auxiliary_loss_clip": 0.01124956, + "auxiliary_loss_mlp": 0.01044026, + "balance_loss_clip": 1.04988194, + "balance_loss_mlp": 1.0286541, + "epoch": 0.32352322260634303, + "flos": 25228610974080.0, + "grad_norm": 1.7365933554134192, + "language_loss": 0.76877856, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.79046834, + "num_input_tokens_seen": 115594885, + "step": 5381, + "time_per_iteration": 2.6297740936279297 + }, + { + "auxiliary_loss_clip": 0.01122723, + "auxiliary_loss_mlp": 0.0103758, + "balance_loss_clip": 1.0485673, + "balance_loss_mlp": 1.02333474, + "epoch": 0.323583345859011, + "flos": 23330480670720.0, + "grad_norm": 1.9510545380996942, + "language_loss": 0.71868116, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.7402842, + "num_input_tokens_seen": 115614080, + "step": 5382, + "time_per_iteration": 4.239168167114258 + }, + { + "auxiliary_loss_clip": 0.01114051, + "auxiliary_loss_mlp": 0.01051511, + "balance_loss_clip": 1.0454843, + "balance_loss_mlp": 1.03392792, + "epoch": 0.32364346911167896, + "flos": 26207499392640.0, + "grad_norm": 2.5669193665709815, + "language_loss": 0.70947385, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.73112947, + "num_input_tokens_seen": 115632820, + "step": 5383, + "time_per_iteration": 2.701462507247925 + }, + { + "auxiliary_loss_clip": 0.01123558, + "auxiliary_loss_mlp": 0.01038956, + "balance_loss_clip": 1.04770291, + "balance_loss_mlp": 1.02382779, + "epoch": 0.3237035923643469, + "flos": 23695764030720.0, + "grad_norm": 1.9442688765107798, + "language_loss": 0.78333974, + "learning_rate": 3.161315193285283e-06, + "loss": 0.8049649, + "num_input_tokens_seen": 115652860, + "step": 5384, + "time_per_iteration": 2.6939637660980225 + }, + { + "auxiliary_loss_clip": 0.01078749, + "auxiliary_loss_mlp": 0.01050129, + "balance_loss_clip": 1.04298878, + "balance_loss_mlp": 1.03203273, + "epoch": 0.3237637156170149, + "flos": 14428728633600.0, + "grad_norm": 2.1298780259276575, + "language_loss": 0.75396919, + "learning_rate": 3.16099809186998e-06, + "loss": 0.77525795, + "num_input_tokens_seen": 115670940, + "step": 5385, + "time_per_iteration": 2.7813403606414795 + }, + { + "auxiliary_loss_clip": 0.0111287, + "auxiliary_loss_mlp": 0.01040739, + "balance_loss_clip": 1.04995322, + "balance_loss_mlp": 1.0248363, + "epoch": 0.32382383886968286, + "flos": 31062981185280.0, + "grad_norm": 2.042597717530735, + "language_loss": 0.71488941, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.73642552, + "num_input_tokens_seen": 115691155, + "step": 5386, + "time_per_iteration": 2.754636526107788 + }, + { + "auxiliary_loss_clip": 0.01142583, + "auxiliary_loss_mlp": 0.01040273, + "balance_loss_clip": 1.0499016, + "balance_loss_mlp": 1.02334547, + "epoch": 0.3238839621223508, + "flos": 23256935573760.0, + "grad_norm": 5.057227062214219, + "language_loss": 0.94889075, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.97071928, + "num_input_tokens_seen": 115710340, + "step": 5387, + "time_per_iteration": 2.6547048091888428 + }, + { + "auxiliary_loss_clip": 0.01133488, + "auxiliary_loss_mlp": 0.01044118, + "balance_loss_clip": 1.05193102, + "balance_loss_mlp": 1.02696419, + "epoch": 0.3239440853750188, + "flos": 22964658606720.0, + "grad_norm": 10.717385990424205, + "language_loss": 0.77620786, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.79798394, + "num_input_tokens_seen": 115726745, + "step": 5388, + "time_per_iteration": 2.657205820083618 + }, + { + "auxiliary_loss_clip": 0.01111832, + "auxiliary_loss_mlp": 0.01036701, + "balance_loss_clip": 1.04523969, + "balance_loss_mlp": 1.01978493, + "epoch": 0.32400420862768675, + "flos": 36246614653440.0, + "grad_norm": 2.237731185409586, + "language_loss": 0.71233571, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.73382103, + "num_input_tokens_seen": 115749385, + "step": 5389, + "time_per_iteration": 2.799731731414795 + }, + { + "auxiliary_loss_clip": 0.01099836, + "auxiliary_loss_mlp": 0.01038996, + "balance_loss_clip": 1.04759645, + "balance_loss_mlp": 1.02302158, + "epoch": 0.3240643318803547, + "flos": 21616500418560.0, + "grad_norm": 1.8547230503773184, + "language_loss": 0.80461568, + "learning_rate": 3.159411924656557e-06, + "loss": 0.82600403, + "num_input_tokens_seen": 115768105, + "step": 5390, + "time_per_iteration": 2.703913450241089 + }, + { + "auxiliary_loss_clip": 0.01112322, + "auxiliary_loss_mlp": 0.01050073, + "balance_loss_clip": 1.04881656, + "balance_loss_mlp": 1.0330621, + "epoch": 0.3241244551330227, + "flos": 23295611543040.0, + "grad_norm": 4.514534114801655, + "language_loss": 0.72674775, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.74837172, + "num_input_tokens_seen": 115787340, + "step": 5391, + "time_per_iteration": 2.8789660930633545 + }, + { + "auxiliary_loss_clip": 0.01110171, + "auxiliary_loss_mlp": 0.01040459, + "balance_loss_clip": 1.04422975, + "balance_loss_mlp": 1.02517664, + "epoch": 0.32418457838569065, + "flos": 14097236993280.0, + "grad_norm": 2.092129040046021, + "language_loss": 0.77347648, + "learning_rate": 3.158777149931855e-06, + "loss": 0.79498285, + "num_input_tokens_seen": 115805565, + "step": 5392, + "time_per_iteration": 2.6689188480377197 + }, + { + "auxiliary_loss_clip": 0.01112252, + "auxiliary_loss_mlp": 0.01051929, + "balance_loss_clip": 1.04517519, + "balance_loss_mlp": 1.03289127, + "epoch": 0.3242447016383586, + "flos": 29752672953600.0, + "grad_norm": 1.9207699243041063, + "language_loss": 0.62606925, + "learning_rate": 3.158459696652067e-06, + "loss": 0.6477111, + "num_input_tokens_seen": 115826725, + "step": 5393, + "time_per_iteration": 2.758423328399658 + }, + { + "auxiliary_loss_clip": 0.01122257, + "auxiliary_loss_mlp": 0.01043934, + "balance_loss_clip": 1.04730856, + "balance_loss_mlp": 1.02770925, + "epoch": 0.3243048248910266, + "flos": 24351205455360.0, + "grad_norm": 1.583732116281239, + "language_loss": 0.82284617, + "learning_rate": 3.158142199443371e-06, + "loss": 0.84450811, + "num_input_tokens_seen": 115846955, + "step": 5394, + "time_per_iteration": 2.6715636253356934 + }, + { + "auxiliary_loss_clip": 0.01111969, + "auxiliary_loss_mlp": 0.01045824, + "balance_loss_clip": 1.04729748, + "balance_loss_mlp": 1.03120947, + "epoch": 0.3243649481436946, + "flos": 24353037048960.0, + "grad_norm": 1.873068954405441, + "language_loss": 0.817029, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.83860689, + "num_input_tokens_seen": 115865975, + "step": 5395, + "time_per_iteration": 2.7120518684387207 + }, + { + "auxiliary_loss_clip": 0.01126983, + "auxiliary_loss_mlp": 0.01039478, + "balance_loss_clip": 1.0519104, + "balance_loss_mlp": 1.02413607, + "epoch": 0.32442507139636256, + "flos": 22925228451840.0, + "grad_norm": 1.8441183317386671, + "language_loss": 0.83172363, + "learning_rate": 3.157507073287417e-06, + "loss": 0.85338825, + "num_input_tokens_seen": 115884950, + "step": 5396, + "time_per_iteration": 2.6589252948760986 + }, + { + "auxiliary_loss_clip": 0.0110371, + "auxiliary_loss_mlp": 0.01053141, + "balance_loss_clip": 1.04818082, + "balance_loss_mlp": 1.03462827, + "epoch": 0.32448519464903053, + "flos": 22200192426240.0, + "grad_norm": 2.3735724483298553, + "language_loss": 0.75721765, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.77878618, + "num_input_tokens_seen": 115904170, + "step": 5397, + "time_per_iteration": 2.7118513584136963 + }, + { + "auxiliary_loss_clip": 0.01104001, + "auxiliary_loss_mlp": 0.0104059, + "balance_loss_clip": 1.04970932, + "balance_loss_mlp": 1.02504468, + "epoch": 0.3245453179016985, + "flos": 18838450644480.0, + "grad_norm": 7.349892433890134, + "language_loss": 0.67359912, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.69504505, + "num_input_tokens_seen": 115919255, + "step": 5398, + "time_per_iteration": 2.690317153930664 + }, + { + "auxiliary_loss_clip": 0.01111486, + "auxiliary_loss_mlp": 0.01033579, + "balance_loss_clip": 1.04846239, + "balance_loss_mlp": 1.01784301, + "epoch": 0.32460544115436646, + "flos": 21178390233600.0, + "grad_norm": 1.692830304346276, + "language_loss": 0.73074687, + "learning_rate": 3.156554054887718e-06, + "loss": 0.7521975, + "num_input_tokens_seen": 115938535, + "step": 5399, + "time_per_iteration": 2.754539728164673 + }, + { + "auxiliary_loss_clip": 0.01101582, + "auxiliary_loss_mlp": 0.01036858, + "balance_loss_clip": 1.04522848, + "balance_loss_mlp": 1.02056217, + "epoch": 0.3246655644070344, + "flos": 21981137333760.0, + "grad_norm": 2.780796864612311, + "language_loss": 0.71580744, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.7371918, + "num_input_tokens_seen": 115955005, + "step": 5400, + "time_per_iteration": 2.707712173461914 + }, + { + "auxiliary_loss_clip": 0.01127225, + "auxiliary_loss_mlp": 0.01040347, + "balance_loss_clip": 1.0472424, + "balance_loss_mlp": 1.02469516, + "epoch": 0.3247256876597024, + "flos": 32159729105280.0, + "grad_norm": 2.1905750946262805, + "language_loss": 0.79769576, + "learning_rate": 3.155918489984614e-06, + "loss": 0.81937146, + "num_input_tokens_seen": 115975305, + "step": 5401, + "time_per_iteration": 2.7813303470611572 + }, + { + "auxiliary_loss_clip": 0.01109499, + "auxiliary_loss_mlp": 0.01041329, + "balance_loss_clip": 1.04414558, + "balance_loss_mlp": 1.02341187, + "epoch": 0.32478581091237035, + "flos": 20997544233600.0, + "grad_norm": 4.743153882711402, + "language_loss": 0.87785316, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.89936143, + "num_input_tokens_seen": 115994810, + "step": 5402, + "time_per_iteration": 2.7685606479644775 + }, + { + "auxiliary_loss_clip": 0.01078796, + "auxiliary_loss_mlp": 0.01044786, + "balance_loss_clip": 1.03948891, + "balance_loss_mlp": 1.02792931, + "epoch": 0.3248459341650383, + "flos": 17924990849280.0, + "grad_norm": 4.964706141121962, + "language_loss": 0.84572911, + "learning_rate": 3.155282749751332e-06, + "loss": 0.86696494, + "num_input_tokens_seen": 116011095, + "step": 5403, + "time_per_iteration": 2.7299063205718994 + }, + { + "auxiliary_loss_clip": 0.01104053, + "auxiliary_loss_mlp": 0.01045074, + "balance_loss_clip": 1.04597795, + "balance_loss_mlp": 1.03049469, + "epoch": 0.3249060574177063, + "flos": 24535606901760.0, + "grad_norm": 3.7265891750540785, + "language_loss": 0.87614954, + "learning_rate": 3.154964813916007e-06, + "loss": 0.89764082, + "num_input_tokens_seen": 116028805, + "step": 5404, + "time_per_iteration": 2.7740931510925293 + }, + { + "auxiliary_loss_clip": 0.01125798, + "auxiliary_loss_mlp": 0.01043439, + "balance_loss_clip": 1.04930234, + "balance_loss_mlp": 1.02685738, + "epoch": 0.32496618067037425, + "flos": 25994765093760.0, + "grad_norm": 2.5497237434599964, + "language_loss": 0.72717422, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.74886656, + "num_input_tokens_seen": 116047765, + "step": 5405, + "time_per_iteration": 2.6756839752197266 + }, + { + "auxiliary_loss_clip": 0.01098309, + "auxiliary_loss_mlp": 0.01039466, + "balance_loss_clip": 1.04964566, + "balance_loss_mlp": 1.02390265, + "epoch": 0.3250263039230422, + "flos": 19573757959680.0, + "grad_norm": 1.6968031771183532, + "language_loss": 0.82927752, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.8506552, + "num_input_tokens_seen": 116068385, + "step": 5406, + "time_per_iteration": 2.728217124938965 + }, + { + "auxiliary_loss_clip": 0.01136878, + "auxiliary_loss_mlp": 0.01032192, + "balance_loss_clip": 1.05117011, + "balance_loss_mlp": 1.01728487, + "epoch": 0.3250864271757102, + "flos": 16763640318720.0, + "grad_norm": 1.9312900503750694, + "language_loss": 0.87836796, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.90005869, + "num_input_tokens_seen": 116085350, + "step": 5407, + "time_per_iteration": 2.5519261360168457 + }, + { + "auxiliary_loss_clip": 0.01112002, + "auxiliary_loss_mlp": 0.01040482, + "balance_loss_clip": 1.04575169, + "balance_loss_mlp": 1.02506793, + "epoch": 0.3251465504283782, + "flos": 27819458040960.0, + "grad_norm": 1.6044550363094983, + "language_loss": 0.69804603, + "learning_rate": 3.153692632731479e-06, + "loss": 0.71957088, + "num_input_tokens_seen": 116107560, + "step": 5408, + "time_per_iteration": 2.7141807079315186 + }, + { + "auxiliary_loss_clip": 0.01131975, + "auxiliary_loss_mlp": 0.01035871, + "balance_loss_clip": 1.05021083, + "balance_loss_mlp": 1.01977742, + "epoch": 0.32520667368104617, + "flos": 19063144172160.0, + "grad_norm": 10.423580562540607, + "language_loss": 0.77558911, + "learning_rate": 3.153374478034841e-06, + "loss": 0.79726762, + "num_input_tokens_seen": 116125980, + "step": 5409, + "time_per_iteration": 2.644792318344116 + }, + { + "auxiliary_loss_clip": 0.01079567, + "auxiliary_loss_mlp": 0.01043858, + "balance_loss_clip": 1.03893065, + "balance_loss_mlp": 1.0280745, + "epoch": 0.32526679693371413, + "flos": 29382146208000.0, + "grad_norm": 2.0524453166640146, + "language_loss": 0.83282518, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.85405946, + "num_input_tokens_seen": 116146530, + "step": 5410, + "time_per_iteration": 2.846480131149292 + }, + { + "auxiliary_loss_clip": 0.01086095, + "auxiliary_loss_mlp": 0.01037636, + "balance_loss_clip": 1.04789686, + "balance_loss_mlp": 1.02272296, + "epoch": 0.3253269201863821, + "flos": 20704513080960.0, + "grad_norm": 1.6475099523255856, + "language_loss": 0.7081182, + "learning_rate": 3.152738037445405e-06, + "loss": 0.72935545, + "num_input_tokens_seen": 116165695, + "step": 5411, + "time_per_iteration": 2.779330253601074 + }, + { + "auxiliary_loss_clip": 0.0108148, + "auxiliary_loss_mlp": 0.01041588, + "balance_loss_clip": 1.04331398, + "balance_loss_mlp": 1.02688956, + "epoch": 0.32538704343905006, + "flos": 29094142959360.0, + "grad_norm": 1.6354124554173295, + "language_loss": 0.82894456, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.85017526, + "num_input_tokens_seen": 116185375, + "step": 5412, + "time_per_iteration": 2.7841992378234863 + }, + { + "auxiliary_loss_clip": 0.01106895, + "auxiliary_loss_mlp": 0.01041599, + "balance_loss_clip": 1.04730868, + "balance_loss_mlp": 1.02430189, + "epoch": 0.325447166691718, + "flos": 24676124906880.0, + "grad_norm": 1.867437266565155, + "language_loss": 0.80913842, + "learning_rate": 3.152101422008203e-06, + "loss": 0.83062339, + "num_input_tokens_seen": 116204335, + "step": 5413, + "time_per_iteration": 2.7533957958221436 + }, + { + "auxiliary_loss_clip": 0.01115005, + "auxiliary_loss_mlp": 0.0103855, + "balance_loss_clip": 1.04923081, + "balance_loss_mlp": 1.02155089, + "epoch": 0.325507289944386, + "flos": 21543134889600.0, + "grad_norm": 3.355430774898342, + "language_loss": 0.76891947, + "learning_rate": 3.151783048751864e-06, + "loss": 0.79045498, + "num_input_tokens_seen": 116222840, + "step": 5414, + "time_per_iteration": 4.331217527389526 + }, + { + "auxiliary_loss_clip": 0.01030644, + "auxiliary_loss_mlp": 0.01012699, + "balance_loss_clip": 1.02726388, + "balance_loss_mlp": 1.01063681, + "epoch": 0.32556741319705396, + "flos": 71518722347520.0, + "grad_norm": 0.9066964616955783, + "language_loss": 0.63865513, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.65908855, + "num_input_tokens_seen": 116274940, + "step": 5415, + "time_per_iteration": 3.172816753387451 + }, + { + "auxiliary_loss_clip": 0.01088465, + "auxiliary_loss_mlp": 0.01038606, + "balance_loss_clip": 1.04119301, + "balance_loss_mlp": 1.02279866, + "epoch": 0.3256275364497219, + "flos": 23732428838400.0, + "grad_norm": 1.52454367487569, + "language_loss": 0.74014068, + "learning_rate": 3.151146171224075e-06, + "loss": 0.76141143, + "num_input_tokens_seen": 116297300, + "step": 5416, + "time_per_iteration": 4.326166868209839 + }, + { + "auxiliary_loss_clip": 0.01062287, + "auxiliary_loss_mlp": 0.0100407, + "balance_loss_clip": 1.03045964, + "balance_loss_mlp": 1.00160217, + "epoch": 0.3256876597023899, + "flos": 67289199891840.0, + "grad_norm": 0.7686966052914506, + "language_loss": 0.57851374, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.59917736, + "num_input_tokens_seen": 116362370, + "step": 5417, + "time_per_iteration": 3.2102463245391846 + }, + { + "auxiliary_loss_clip": 0.01040835, + "auxiliary_loss_mlp": 0.01012103, + "balance_loss_clip": 1.02768993, + "balance_loss_mlp": 1.00975466, + "epoch": 0.32574778295505785, + "flos": 71282323964160.0, + "grad_norm": 0.7997987203444133, + "language_loss": 0.63392216, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65445155, + "num_input_tokens_seen": 116430365, + "step": 5418, + "time_per_iteration": 4.847350120544434 + }, + { + "auxiliary_loss_clip": 0.01110249, + "auxiliary_loss_mlp": 0.01043458, + "balance_loss_clip": 1.05171919, + "balance_loss_mlp": 1.02794838, + "epoch": 0.3258079062077258, + "flos": 20776370238720.0, + "grad_norm": 2.0985111563442325, + "language_loss": 0.69086784, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.71240497, + "num_input_tokens_seen": 116447525, + "step": 5419, + "time_per_iteration": 2.6837174892425537 + }, + { + "auxiliary_loss_clip": 0.0112744, + "auxiliary_loss_mlp": 0.01037157, + "balance_loss_clip": 1.05152702, + "balance_loss_mlp": 1.02099252, + "epoch": 0.3258680294603938, + "flos": 22235456603520.0, + "grad_norm": 1.6553118170887535, + "language_loss": 0.77041519, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.79206121, + "num_input_tokens_seen": 116466310, + "step": 5420, + "time_per_iteration": 2.690243721008301 + }, + { + "auxiliary_loss_clip": 0.01124221, + "auxiliary_loss_mlp": 0.00774579, + "balance_loss_clip": 1.04583097, + "balance_loss_mlp": 1.00118852, + "epoch": 0.3259281527130618, + "flos": 26979974305920.0, + "grad_norm": 1.6758047570714483, + "language_loss": 0.8033973, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.82238531, + "num_input_tokens_seen": 116487825, + "step": 5421, + "time_per_iteration": 4.346652984619141 + }, + { + "auxiliary_loss_clip": 0.01133401, + "auxiliary_loss_mlp": 0.0103494, + "balance_loss_clip": 1.04982162, + "balance_loss_mlp": 1.0212909, + "epoch": 0.32598827596572977, + "flos": 26214251149440.0, + "grad_norm": 1.7368751669124027, + "language_loss": 0.75101721, + "learning_rate": 3.149234491389381e-06, + "loss": 0.77270067, + "num_input_tokens_seen": 116509950, + "step": 5422, + "time_per_iteration": 2.698486566543579 + }, + { + "auxiliary_loss_clip": 0.01104722, + "auxiliary_loss_mlp": 0.00773675, + "balance_loss_clip": 1.04894829, + "balance_loss_mlp": 1.00120938, + "epoch": 0.32604839921839773, + "flos": 17639752947840.0, + "grad_norm": 2.1580318636917384, + "language_loss": 0.63323581, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.65201974, + "num_input_tokens_seen": 116527695, + "step": 5423, + "time_per_iteration": 2.7364964485168457 + }, + { + "auxiliary_loss_clip": 0.01098661, + "auxiliary_loss_mlp": 0.01032454, + "balance_loss_clip": 1.04357564, + "balance_loss_mlp": 1.01884615, + "epoch": 0.3261085224710657, + "flos": 23622721724160.0, + "grad_norm": 1.5676988826806029, + "language_loss": 0.74530792, + "learning_rate": 3.148596916016224e-06, + "loss": 0.76661909, + "num_input_tokens_seen": 116547800, + "step": 5424, + "time_per_iteration": 2.695530652999878 + }, + { + "auxiliary_loss_clip": 0.0110482, + "auxiliary_loss_mlp": 0.01035713, + "balance_loss_clip": 1.04803681, + "balance_loss_mlp": 1.02199221, + "epoch": 0.32616864572373366, + "flos": 23260455106560.0, + "grad_norm": 1.6667522289255576, + "language_loss": 0.77194774, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.79335308, + "num_input_tokens_seen": 116568460, + "step": 5425, + "time_per_iteration": 2.6649699211120605 + }, + { + "auxiliary_loss_clip": 0.01106187, + "auxiliary_loss_mlp": 0.01040306, + "balance_loss_clip": 1.04740202, + "balance_loss_mlp": 1.02368808, + "epoch": 0.32622876897640163, + "flos": 25593427457280.0, + "grad_norm": 2.8883064562409744, + "language_loss": 0.78262472, + "learning_rate": 3.147959166423428e-06, + "loss": 0.80408967, + "num_input_tokens_seen": 116588705, + "step": 5426, + "time_per_iteration": 2.7820892333984375 + }, + { + "auxiliary_loss_clip": 0.01088898, + "auxiliary_loss_mlp": 0.01035243, + "balance_loss_clip": 1.04331303, + "balance_loss_mlp": 1.01889908, + "epoch": 0.3262888922290696, + "flos": 22418996123520.0, + "grad_norm": 1.9267107865215556, + "language_loss": 0.74485052, + "learning_rate": 3.147640226324893e-06, + "loss": 0.76609194, + "num_input_tokens_seen": 116608845, + "step": 5427, + "time_per_iteration": 2.7831003665924072 + }, + { + "auxiliary_loss_clip": 0.01103791, + "auxiliary_loss_mlp": 0.01041786, + "balance_loss_clip": 1.04539597, + "balance_loss_mlp": 1.02549028, + "epoch": 0.32634901548173756, + "flos": 19718908819200.0, + "grad_norm": 6.869638277775165, + "language_loss": 0.79136658, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.81282234, + "num_input_tokens_seen": 116628145, + "step": 5428, + "time_per_iteration": 2.7186481952667236 + }, + { + "auxiliary_loss_clip": 0.01121911, + "auxiliary_loss_mlp": 0.01040908, + "balance_loss_clip": 1.04629314, + "balance_loss_mlp": 1.02576876, + "epoch": 0.3264091387344055, + "flos": 16142924367360.0, + "grad_norm": 5.016107817785842, + "language_loss": 0.71130025, + "learning_rate": 3.147002215584023e-06, + "loss": 0.7329284, + "num_input_tokens_seen": 116646920, + "step": 5429, + "time_per_iteration": 2.6733968257904053 + }, + { + "auxiliary_loss_clip": 0.01098408, + "auxiliary_loss_mlp": 0.01035827, + "balance_loss_clip": 1.04658663, + "balance_loss_mlp": 1.0212121, + "epoch": 0.3264692619870735, + "flos": 16399075230720.0, + "grad_norm": 1.7379615094125744, + "language_loss": 0.78620625, + "learning_rate": 3.146683144965881e-06, + "loss": 0.80754858, + "num_input_tokens_seen": 116665100, + "step": 5430, + "time_per_iteration": 2.7313849925994873 + }, + { + "auxiliary_loss_clip": 0.01084979, + "auxiliary_loss_mlp": 0.01043143, + "balance_loss_clip": 1.04809749, + "balance_loss_mlp": 1.02660871, + "epoch": 0.32652938523974145, + "flos": 22382331315840.0, + "grad_norm": 3.4420441965814477, + "language_loss": 0.84279943, + "learning_rate": 3.146364030865399e-06, + "loss": 0.86408061, + "num_input_tokens_seen": 116682205, + "step": 5431, + "time_per_iteration": 2.720797300338745 + }, + { + "auxiliary_loss_clip": 0.01117845, + "auxiliary_loss_mlp": 0.01034908, + "balance_loss_clip": 1.04730058, + "balance_loss_mlp": 1.02067482, + "epoch": 0.3265895084924094, + "flos": 21908059113600.0, + "grad_norm": 1.9482899767939774, + "language_loss": 0.70736587, + "learning_rate": 3.146044873294678e-06, + "loss": 0.7288934, + "num_input_tokens_seen": 116702575, + "step": 5432, + "time_per_iteration": 2.6805124282836914 + }, + { + "auxiliary_loss_clip": 0.01073417, + "auxiliary_loss_mlp": 0.01042634, + "balance_loss_clip": 1.04051948, + "balance_loss_mlp": 1.02625418, + "epoch": 0.3266496317450774, + "flos": 16067152627200.0, + "grad_norm": 1.6263283854003907, + "language_loss": 0.84160507, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.86276555, + "num_input_tokens_seen": 116720885, + "step": 5433, + "time_per_iteration": 2.733450174331665 + }, + { + "auxiliary_loss_clip": 0.01110224, + "auxiliary_loss_mlp": 0.01031776, + "balance_loss_clip": 1.04831946, + "balance_loss_mlp": 1.01733375, + "epoch": 0.3267097549977454, + "flos": 22528236360960.0, + "grad_norm": 1.8752055231309104, + "language_loss": 0.860237, + "learning_rate": 3.145406427790931e-06, + "loss": 0.881657, + "num_input_tokens_seen": 116740395, + "step": 5434, + "time_per_iteration": 2.6711690425872803 + }, + { + "auxiliary_loss_clip": 0.01115762, + "auxiliary_loss_mlp": 0.0104022, + "balance_loss_clip": 1.04894018, + "balance_loss_mlp": 1.02460361, + "epoch": 0.32676987825041337, + "flos": 27270419679360.0, + "grad_norm": 2.089345873834278, + "language_loss": 0.87845808, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.90001786, + "num_input_tokens_seen": 116758870, + "step": 5435, + "time_per_iteration": 2.7342183589935303 + }, + { + "auxiliary_loss_clip": 0.01137287, + "auxiliary_loss_mlp": 0.01037617, + "balance_loss_clip": 1.05190301, + "balance_loss_mlp": 1.02256095, + "epoch": 0.32683000150308134, + "flos": 11508257433600.0, + "grad_norm": 3.0926239838125595, + "language_loss": 0.7645883, + "learning_rate": 3.144767808551479e-06, + "loss": 0.78633732, + "num_input_tokens_seen": 116773440, + "step": 5436, + "time_per_iteration": 2.648062229156494 + }, + { + "auxiliary_loss_clip": 0.01137346, + "auxiliary_loss_mlp": 0.01034933, + "balance_loss_clip": 1.0532552, + "balance_loss_mlp": 1.02046728, + "epoch": 0.3268901247557493, + "flos": 25630200005760.0, + "grad_norm": 1.7720337367532448, + "language_loss": 0.71802473, + "learning_rate": 3.144448433811134e-06, + "loss": 0.73974752, + "num_input_tokens_seen": 116794375, + "step": 5437, + "time_per_iteration": 2.680525541305542 + }, + { + "auxiliary_loss_clip": 0.01095966, + "auxiliary_loss_mlp": 0.0104222, + "balance_loss_clip": 1.04542243, + "balance_loss_mlp": 1.02445781, + "epoch": 0.32695024800841727, + "flos": 24860849575680.0, + "grad_norm": 1.7134236857074348, + "language_loss": 0.63728261, + "learning_rate": 3.144129015673189e-06, + "loss": 0.65866441, + "num_input_tokens_seen": 116815095, + "step": 5438, + "time_per_iteration": 2.7343454360961914 + }, + { + "auxiliary_loss_clip": 0.01128746, + "auxiliary_loss_mlp": 0.01039734, + "balance_loss_clip": 1.05383801, + "balance_loss_mlp": 1.02468967, + "epoch": 0.32701037126108523, + "flos": 28839249072000.0, + "grad_norm": 3.854723832885701, + "language_loss": 0.74629039, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.76797515, + "num_input_tokens_seen": 116836630, + "step": 5439, + "time_per_iteration": 2.6859002113342285 + }, + { + "auxiliary_loss_clip": 0.0113034, + "auxiliary_loss_mlp": 0.0104413, + "balance_loss_clip": 1.05407321, + "balance_loss_mlp": 1.02773881, + "epoch": 0.3270704945137532, + "flos": 27965075777280.0, + "grad_norm": 3.9922367032947634, + "language_loss": 0.74743968, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.76918435, + "num_input_tokens_seen": 116856880, + "step": 5440, + "time_per_iteration": 2.6785733699798584 + }, + { + "auxiliary_loss_clip": 0.01124529, + "auxiliary_loss_mlp": 0.00773254, + "balance_loss_clip": 1.05180979, + "balance_loss_mlp": 1.00108397, + "epoch": 0.32713061776642116, + "flos": 23690700213120.0, + "grad_norm": 2.2888111794693033, + "language_loss": 0.84642965, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.86540747, + "num_input_tokens_seen": 116873770, + "step": 5441, + "time_per_iteration": 2.692375421524048 + }, + { + "auxiliary_loss_clip": 0.01126517, + "auxiliary_loss_mlp": 0.01042941, + "balance_loss_clip": 1.05065203, + "balance_loss_mlp": 1.02715778, + "epoch": 0.3271907410190891, + "flos": 22455625017600.0, + "grad_norm": 3.048730330719705, + "language_loss": 0.86782062, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.88951516, + "num_input_tokens_seen": 116891225, + "step": 5442, + "time_per_iteration": 2.6678872108459473 + }, + { + "auxiliary_loss_clip": 0.01105154, + "auxiliary_loss_mlp": 0.01041235, + "balance_loss_clip": 1.05088091, + "balance_loss_mlp": 1.02450991, + "epoch": 0.3272508642717571, + "flos": 22820118278400.0, + "grad_norm": 2.240879974234663, + "language_loss": 0.77471602, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.79617989, + "num_input_tokens_seen": 116912300, + "step": 5443, + "time_per_iteration": 2.715407133102417 + }, + { + "auxiliary_loss_clip": 0.01109692, + "auxiliary_loss_mlp": 0.00773391, + "balance_loss_clip": 1.05144906, + "balance_loss_mlp": 1.00102162, + "epoch": 0.32731098752442506, + "flos": 11801360413440.0, + "grad_norm": 2.595112113661144, + "language_loss": 0.81782895, + "learning_rate": 3.142211596174343e-06, + "loss": 0.83665979, + "num_input_tokens_seen": 116929425, + "step": 5444, + "time_per_iteration": 2.7483620643615723 + }, + { + "auxiliary_loss_clip": 0.0109768, + "auxiliary_loss_mlp": 0.01042359, + "balance_loss_clip": 1.05127132, + "balance_loss_mlp": 1.02671897, + "epoch": 0.327371110777093, + "flos": 21027780506880.0, + "grad_norm": 2.0540771727134786, + "language_loss": 0.59668452, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.61808491, + "num_input_tokens_seen": 116948255, + "step": 5445, + "time_per_iteration": 2.7937049865722656 + }, + { + "auxiliary_loss_clip": 0.01134371, + "auxiliary_loss_mlp": 0.01045479, + "balance_loss_clip": 1.05779314, + "balance_loss_mlp": 1.02935553, + "epoch": 0.327431234029761, + "flos": 19062102677760.0, + "grad_norm": 2.705344105300375, + "language_loss": 0.88343978, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.90523833, + "num_input_tokens_seen": 116964905, + "step": 5446, + "time_per_iteration": 2.586451292037964 + }, + { + "auxiliary_loss_clip": 0.01135097, + "auxiliary_loss_mlp": 0.01041409, + "balance_loss_clip": 1.0612191, + "balance_loss_mlp": 1.02387285, + "epoch": 0.32749135728242895, + "flos": 25849219184640.0, + "grad_norm": 2.2697780368090883, + "language_loss": 0.79279661, + "learning_rate": 3.141252301538802e-06, + "loss": 0.81456167, + "num_input_tokens_seen": 116983650, + "step": 5447, + "time_per_iteration": 2.744072198867798 + }, + { + "auxiliary_loss_clip": 0.01107571, + "auxiliary_loss_mlp": 0.00773964, + "balance_loss_clip": 1.04747021, + "balance_loss_mlp": 1.00110793, + "epoch": 0.327551480535097, + "flos": 20120533764480.0, + "grad_norm": 1.8015667711206929, + "language_loss": 0.73182315, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.75063848, + "num_input_tokens_seen": 117003265, + "step": 5448, + "time_per_iteration": 2.6825077533721924 + }, + { + "auxiliary_loss_clip": 0.01142648, + "auxiliary_loss_mlp": 0.01042295, + "balance_loss_clip": 1.05620432, + "balance_loss_mlp": 1.02694106, + "epoch": 0.32761160378776494, + "flos": 28803553931520.0, + "grad_norm": 1.4660761852129829, + "language_loss": 0.67103487, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.69288433, + "num_input_tokens_seen": 117025370, + "step": 5449, + "time_per_iteration": 2.682499885559082 + }, + { + "auxiliary_loss_clip": 0.0110995, + "auxiliary_loss_mlp": 0.010411, + "balance_loss_clip": 1.0542469, + "balance_loss_mlp": 1.02627623, + "epoch": 0.3276717270404329, + "flos": 26937778803840.0, + "grad_norm": 3.4023702964270943, + "language_loss": 0.65110958, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.67262006, + "num_input_tokens_seen": 117044350, + "step": 5450, + "time_per_iteration": 2.7582857608795166 + }, + { + "auxiliary_loss_clip": 0.0113136, + "auxiliary_loss_mlp": 0.01045713, + "balance_loss_clip": 1.05517817, + "balance_loss_mlp": 1.03021002, + "epoch": 0.32773185029310087, + "flos": 25338425829120.0, + "grad_norm": 1.5880234750249043, + "language_loss": 0.77630055, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.79807132, + "num_input_tokens_seen": 117064450, + "step": 5451, + "time_per_iteration": 2.6543071269989014 + }, + { + "auxiliary_loss_clip": 0.01131184, + "auxiliary_loss_mlp": 0.01044056, + "balance_loss_clip": 1.05428064, + "balance_loss_mlp": 1.02809358, + "epoch": 0.32779197354576883, + "flos": 26391721271040.0, + "grad_norm": 1.913131066587778, + "language_loss": 0.70510584, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.7268582, + "num_input_tokens_seen": 117083060, + "step": 5452, + "time_per_iteration": 2.6963608264923096 + }, + { + "auxiliary_loss_clip": 0.01112229, + "auxiliary_loss_mlp": 0.01036592, + "balance_loss_clip": 1.048841, + "balance_loss_mlp": 1.02223349, + "epoch": 0.3278520967984368, + "flos": 24899381890560.0, + "grad_norm": 2.6287596248848013, + "language_loss": 0.78730083, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.80878907, + "num_input_tokens_seen": 117101860, + "step": 5453, + "time_per_iteration": 4.197263479232788 + }, + { + "auxiliary_loss_clip": 0.01130585, + "auxiliary_loss_mlp": 0.01035536, + "balance_loss_clip": 1.0526675, + "balance_loss_mlp": 1.02026486, + "epoch": 0.32791222005110476, + "flos": 29752996176000.0, + "grad_norm": 5.184832608635382, + "language_loss": 0.75771177, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.77937293, + "num_input_tokens_seen": 117123100, + "step": 5454, + "time_per_iteration": 2.7643721103668213 + }, + { + "auxiliary_loss_clip": 0.01070253, + "auxiliary_loss_mlp": 0.01047697, + "balance_loss_clip": 1.03818846, + "balance_loss_mlp": 1.03363037, + "epoch": 0.32797234330377273, + "flos": 16508064072960.0, + "grad_norm": 2.8017119157252703, + "language_loss": 0.76891404, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.79009354, + "num_input_tokens_seen": 117140515, + "step": 5455, + "time_per_iteration": 4.402290105819702 + }, + { + "auxiliary_loss_clip": 0.01131084, + "auxiliary_loss_mlp": 0.01042542, + "balance_loss_clip": 1.05241477, + "balance_loss_mlp": 1.02624655, + "epoch": 0.3280324665564407, + "flos": 26577918397440.0, + "grad_norm": 1.6426536912861747, + "language_loss": 0.74021912, + "learning_rate": 3.138372082016768e-06, + "loss": 0.76195538, + "num_input_tokens_seen": 117161485, + "step": 5456, + "time_per_iteration": 2.821965217590332 + }, + { + "auxiliary_loss_clip": 0.01140062, + "auxiliary_loss_mlp": 0.01047408, + "balance_loss_clip": 1.05334985, + "balance_loss_mlp": 1.03212523, + "epoch": 0.32809258980910866, + "flos": 22929969047040.0, + "grad_norm": 1.7597936582740754, + "language_loss": 0.78038168, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.80225635, + "num_input_tokens_seen": 117181870, + "step": 5457, + "time_per_iteration": 2.703756093978882 + }, + { + "auxiliary_loss_clip": 0.01104649, + "auxiliary_loss_mlp": 0.01042509, + "balance_loss_clip": 1.04943132, + "balance_loss_mlp": 1.02752471, + "epoch": 0.3281527130617766, + "flos": 22783848520320.0, + "grad_norm": 5.102364490559591, + "language_loss": 0.79493362, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.81640518, + "num_input_tokens_seen": 117201380, + "step": 5458, + "time_per_iteration": 4.307415962219238 + }, + { + "auxiliary_loss_clip": 0.01124323, + "auxiliary_loss_mlp": 0.01039216, + "balance_loss_clip": 1.05467916, + "balance_loss_mlp": 1.02362311, + "epoch": 0.3282128363144446, + "flos": 21250678354560.0, + "grad_norm": 1.6160363150508943, + "language_loss": 0.73029429, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.7519297, + "num_input_tokens_seen": 117221040, + "step": 5459, + "time_per_iteration": 2.678131341934204 + }, + { + "auxiliary_loss_clip": 0.01118921, + "auxiliary_loss_mlp": 0.01041188, + "balance_loss_clip": 1.05190325, + "balance_loss_mlp": 1.02591753, + "epoch": 0.32827295956711255, + "flos": 30843064166400.0, + "grad_norm": 2.011905165126453, + "language_loss": 0.84018445, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.86178553, + "num_input_tokens_seen": 117241395, + "step": 5460, + "time_per_iteration": 5.767046213150024 + }, + { + "auxiliary_loss_clip": 0.01138817, + "auxiliary_loss_mlp": 0.01035204, + "balance_loss_clip": 1.05174541, + "balance_loss_mlp": 1.02029121, + "epoch": 0.3283330828197806, + "flos": 25915006944000.0, + "grad_norm": 1.9959413021835115, + "language_loss": 0.76553524, + "learning_rate": 3.136770448642288e-06, + "loss": 0.78727543, + "num_input_tokens_seen": 117259340, + "step": 5461, + "time_per_iteration": 2.673659086227417 + }, + { + "auxiliary_loss_clip": 0.01121607, + "auxiliary_loss_mlp": 0.01042243, + "balance_loss_clip": 1.05065536, + "balance_loss_mlp": 1.02489805, + "epoch": 0.32839320607244854, + "flos": 38582065042560.0, + "grad_norm": 2.148112131584704, + "language_loss": 0.62898672, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.65062523, + "num_input_tokens_seen": 117282375, + "step": 5462, + "time_per_iteration": 2.789217472076416 + }, + { + "auxiliary_loss_clip": 0.01136727, + "auxiliary_loss_mlp": 0.0077334, + "balance_loss_clip": 1.05279326, + "balance_loss_mlp": 1.00113511, + "epoch": 0.3284533293251165, + "flos": 26650888876800.0, + "grad_norm": 2.4415591889879056, + "language_loss": 0.7805075, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.79960817, + "num_input_tokens_seen": 117303830, + "step": 5463, + "time_per_iteration": 2.6797146797180176 + }, + { + "auxiliary_loss_clip": 0.01109773, + "auxiliary_loss_mlp": 0.01040868, + "balance_loss_clip": 1.05036163, + "balance_loss_mlp": 1.02523983, + "epoch": 0.32851345257778447, + "flos": 15304158904320.0, + "grad_norm": 1.8407799027990368, + "language_loss": 0.70095646, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.72246289, + "num_input_tokens_seen": 117320665, + "step": 5464, + "time_per_iteration": 2.7286477088928223 + }, + { + "auxiliary_loss_clip": 0.01130175, + "auxiliary_loss_mlp": 0.01038523, + "balance_loss_clip": 1.05659711, + "balance_loss_mlp": 1.02327609, + "epoch": 0.32857357583045244, + "flos": 23513732881920.0, + "grad_norm": 1.976060055551124, + "language_loss": 0.72474623, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.74643314, + "num_input_tokens_seen": 117339795, + "step": 5465, + "time_per_iteration": 2.6666364669799805 + }, + { + "auxiliary_loss_clip": 0.01113042, + "auxiliary_loss_mlp": 0.01049431, + "balance_loss_clip": 1.05094242, + "balance_loss_mlp": 1.03334332, + "epoch": 0.3286336990831204, + "flos": 20995209849600.0, + "grad_norm": 1.953344541818443, + "language_loss": 0.832214, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.8538388, + "num_input_tokens_seen": 117359525, + "step": 5466, + "time_per_iteration": 2.7432901859283447 + }, + { + "auxiliary_loss_clip": 0.01113455, + "auxiliary_loss_mlp": 0.01041029, + "balance_loss_clip": 1.04729056, + "balance_loss_mlp": 1.02577055, + "epoch": 0.32869382233578837, + "flos": 23658811914240.0, + "grad_norm": 1.7893036060845653, + "language_loss": 0.79221183, + "learning_rate": 3.134847066213879e-06, + "loss": 0.8137567, + "num_input_tokens_seen": 117380320, + "step": 5467, + "time_per_iteration": 2.701490879058838 + }, + { + "auxiliary_loss_clip": 0.0111678, + "auxiliary_loss_mlp": 0.0103291, + "balance_loss_clip": 1.05045676, + "balance_loss_mlp": 1.01759124, + "epoch": 0.32875394558845633, + "flos": 25336522408320.0, + "grad_norm": 1.5411251384559923, + "language_loss": 0.74338531, + "learning_rate": 3.134526351787587e-06, + "loss": 0.76488233, + "num_input_tokens_seen": 117400695, + "step": 5468, + "time_per_iteration": 2.6820507049560547 + }, + { + "auxiliary_loss_clip": 0.0111552, + "auxiliary_loss_mlp": 0.01042549, + "balance_loss_clip": 1.05065966, + "balance_loss_mlp": 1.02476263, + "epoch": 0.3288140688411243, + "flos": 14903108576640.0, + "grad_norm": 1.9818058078172698, + "language_loss": 0.7869612, + "learning_rate": 3.134205594339942e-06, + "loss": 0.80854189, + "num_input_tokens_seen": 117418800, + "step": 5469, + "time_per_iteration": 2.6281590461730957 + }, + { + "auxiliary_loss_clip": 0.01104752, + "auxiliary_loss_mlp": 0.01033111, + "balance_loss_clip": 1.04863441, + "balance_loss_mlp": 1.01838851, + "epoch": 0.32887419209379226, + "flos": 18551345235840.0, + "grad_norm": 1.9383846382167882, + "language_loss": 0.81744516, + "learning_rate": 3.133884793883107e-06, + "loss": 0.8388238, + "num_input_tokens_seen": 117438220, + "step": 5470, + "time_per_iteration": 2.8643784523010254 + }, + { + "auxiliary_loss_clip": 0.01140563, + "auxiliary_loss_mlp": 0.01045939, + "balance_loss_clip": 1.05232358, + "balance_loss_mlp": 1.03021562, + "epoch": 0.3289343153464602, + "flos": 48105610439040.0, + "grad_norm": 2.0914054865715768, + "language_loss": 0.67699564, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.69886065, + "num_input_tokens_seen": 117462560, + "step": 5471, + "time_per_iteration": 2.851717948913574 + }, + { + "auxiliary_loss_clip": 0.01148136, + "auxiliary_loss_mlp": 0.01043561, + "balance_loss_clip": 1.05701339, + "balance_loss_mlp": 1.02594161, + "epoch": 0.3289944385991282, + "flos": 27600295207680.0, + "grad_norm": 2.097557855250848, + "language_loss": 0.64926231, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.67117929, + "num_input_tokens_seen": 117483665, + "step": 5472, + "time_per_iteration": 2.6586108207702637 + }, + { + "auxiliary_loss_clip": 0.01128351, + "auxiliary_loss_mlp": 0.01045454, + "balance_loss_clip": 1.05333138, + "balance_loss_mlp": 1.02850199, + "epoch": 0.32905456185179616, + "flos": 20120318282880.0, + "grad_norm": 3.4668570750263155, + "language_loss": 0.88257217, + "learning_rate": 3.13292213457912e-06, + "loss": 0.90431023, + "num_input_tokens_seen": 117503565, + "step": 5473, + "time_per_iteration": 2.6792144775390625 + }, + { + "auxiliary_loss_clip": 0.01103479, + "auxiliary_loss_mlp": 0.01038881, + "balance_loss_clip": 1.04814398, + "balance_loss_mlp": 1.02123809, + "epoch": 0.3291146851044642, + "flos": 23180230080000.0, + "grad_norm": 1.8710184691373295, + "language_loss": 0.78193343, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.80335701, + "num_input_tokens_seen": 117521460, + "step": 5474, + "time_per_iteration": 2.739057779312134 + }, + { + "auxiliary_loss_clip": 0.01038022, + "auxiliary_loss_mlp": 0.01029239, + "balance_loss_clip": 1.02788568, + "balance_loss_mlp": 1.02673554, + "epoch": 0.32917480835713214, + "flos": 67621912594560.0, + "grad_norm": 0.8109823017171686, + "language_loss": 0.6018818, + "learning_rate": 3.132280146886911e-06, + "loss": 0.62255442, + "num_input_tokens_seen": 117580550, + "step": 5475, + "time_per_iteration": 3.196384906768799 + }, + { + "auxiliary_loss_clip": 0.01091837, + "auxiliary_loss_mlp": 0.01057279, + "balance_loss_clip": 1.04454446, + "balance_loss_mlp": 1.03726411, + "epoch": 0.3292349316098001, + "flos": 27964537073280.0, + "grad_norm": 4.962450920257536, + "language_loss": 0.76504046, + "learning_rate": 3.131959088630455e-06, + "loss": 0.78653169, + "num_input_tokens_seen": 117600645, + "step": 5476, + "time_per_iteration": 2.7369961738586426 + }, + { + "auxiliary_loss_clip": 0.01100541, + "auxiliary_loss_mlp": 0.01044762, + "balance_loss_clip": 1.04824603, + "balance_loss_mlp": 1.02946782, + "epoch": 0.3292950548624681, + "flos": 20263673462400.0, + "grad_norm": 2.5019671735892937, + "language_loss": 0.74746907, + "learning_rate": 3.131637987449997e-06, + "loss": 0.76892209, + "num_input_tokens_seen": 117618880, + "step": 5477, + "time_per_iteration": 2.814467430114746 + }, + { + "auxiliary_loss_clip": 0.01135692, + "auxiliary_loss_mlp": 0.01042652, + "balance_loss_clip": 1.05235898, + "balance_loss_mlp": 1.02838814, + "epoch": 0.32935517811513604, + "flos": 20812999132800.0, + "grad_norm": 3.9065130557825234, + "language_loss": 0.75539625, + "learning_rate": 3.131316843357713e-06, + "loss": 0.77717972, + "num_input_tokens_seen": 117636445, + "step": 5478, + "time_per_iteration": 2.730445384979248 + }, + { + "auxiliary_loss_clip": 0.0112467, + "auxiliary_loss_mlp": 0.01042056, + "balance_loss_clip": 1.04921985, + "balance_loss_mlp": 1.02750051, + "epoch": 0.329415301367804, + "flos": 18441853603200.0, + "grad_norm": 2.855777191383278, + "language_loss": 0.80462509, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.82629234, + "num_input_tokens_seen": 117653105, + "step": 5479, + "time_per_iteration": 2.6443796157836914 + }, + { + "auxiliary_loss_clip": 0.01037863, + "auxiliary_loss_mlp": 0.01000413, + "balance_loss_clip": 1.02671266, + "balance_loss_mlp": 0.99823159, + "epoch": 0.32947542462047197, + "flos": 66323024887680.0, + "grad_norm": 0.7530723778079996, + "language_loss": 0.56519568, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.58557844, + "num_input_tokens_seen": 117719225, + "step": 5480, + "time_per_iteration": 3.213240146636963 + }, + { + "auxiliary_loss_clip": 0.01124019, + "auxiliary_loss_mlp": 0.00774449, + "balance_loss_clip": 1.04898739, + "balance_loss_mlp": 1.00116146, + "epoch": 0.32953554787313993, + "flos": 23221599569280.0, + "grad_norm": 1.7923941739082951, + "language_loss": 0.77444887, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.79343355, + "num_input_tokens_seen": 117738725, + "step": 5481, + "time_per_iteration": 2.6905598640441895 + }, + { + "auxiliary_loss_clip": 0.01119194, + "auxiliary_loss_mlp": 0.01050738, + "balance_loss_clip": 1.05167091, + "balance_loss_mlp": 1.03557408, + "epoch": 0.3295956711258079, + "flos": 27009492307200.0, + "grad_norm": 1.5874205685036498, + "language_loss": 0.78222132, + "learning_rate": 3.130031838113899e-06, + "loss": 0.80392069, + "num_input_tokens_seen": 117757765, + "step": 5482, + "time_per_iteration": 2.765235424041748 + }, + { + "auxiliary_loss_clip": 0.01130055, + "auxiliary_loss_mlp": 0.01052605, + "balance_loss_clip": 1.05121589, + "balance_loss_mlp": 1.03674388, + "epoch": 0.32965579437847586, + "flos": 19171702051200.0, + "grad_norm": 2.9405789595849385, + "language_loss": 0.73674762, + "learning_rate": 3.129710479645185e-06, + "loss": 0.75857425, + "num_input_tokens_seen": 117776810, + "step": 5483, + "time_per_iteration": 2.624969005584717 + }, + { + "auxiliary_loss_clip": 0.01122896, + "auxiliary_loss_mlp": 0.01054419, + "balance_loss_clip": 1.05069685, + "balance_loss_mlp": 1.03886831, + "epoch": 0.32971591763114383, + "flos": 30482521401600.0, + "grad_norm": 1.8706124903497952, + "language_loss": 0.75649381, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.77826691, + "num_input_tokens_seen": 117797730, + "step": 5484, + "time_per_iteration": 2.7650864124298096 + }, + { + "auxiliary_loss_clip": 0.01141223, + "auxiliary_loss_mlp": 0.01053478, + "balance_loss_clip": 1.05515027, + "balance_loss_mlp": 1.03807664, + "epoch": 0.3297760408838118, + "flos": 16289583598080.0, + "grad_norm": 72.4202789440072, + "language_loss": 0.71719176, + "learning_rate": 3.129067634203742e-06, + "loss": 0.73913872, + "num_input_tokens_seen": 117815365, + "step": 5485, + "time_per_iteration": 2.603039264678955 + }, + { + "auxiliary_loss_clip": 0.01081054, + "auxiliary_loss_mlp": 0.01052335, + "balance_loss_clip": 1.04921818, + "balance_loss_mlp": 1.03822041, + "epoch": 0.32983616413647976, + "flos": 29530924341120.0, + "grad_norm": 1.6108204077161399, + "language_loss": 0.80275488, + "learning_rate": 3.128746147255388e-06, + "loss": 0.82408869, + "num_input_tokens_seen": 117836095, + "step": 5486, + "time_per_iteration": 2.8364202976226807 + }, + { + "auxiliary_loss_clip": 0.01106188, + "auxiliary_loss_mlp": 0.01053006, + "balance_loss_clip": 1.04739475, + "balance_loss_mlp": 1.03650784, + "epoch": 0.3298962873891478, + "flos": 20631398947200.0, + "grad_norm": 2.173231613182175, + "language_loss": 0.84374005, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.86533195, + "num_input_tokens_seen": 117854655, + "step": 5487, + "time_per_iteration": 2.7796428203582764 + }, + { + "auxiliary_loss_clip": 0.01087509, + "auxiliary_loss_mlp": 0.01055173, + "balance_loss_clip": 1.04317069, + "balance_loss_mlp": 1.0379355, + "epoch": 0.32995641064181574, + "flos": 14976007228800.0, + "grad_norm": 2.633362688401157, + "language_loss": 0.74667275, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.76809955, + "num_input_tokens_seen": 117873300, + "step": 5488, + "time_per_iteration": 2.7173233032226562 + }, + { + "auxiliary_loss_clip": 0.01143363, + "auxiliary_loss_mlp": 0.01051325, + "balance_loss_clip": 1.05679107, + "balance_loss_mlp": 1.03563726, + "epoch": 0.3300165338944837, + "flos": 18661447399680.0, + "grad_norm": 2.518818086418956, + "language_loss": 0.71718305, + "learning_rate": 3.127781429646098e-06, + "loss": 0.7391299, + "num_input_tokens_seen": 117891540, + "step": 5489, + "time_per_iteration": 2.6647188663482666 + }, + { + "auxiliary_loss_clip": 0.01137372, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_clip": 1.05154073, + "balance_loss_mlp": 1.02973497, + "epoch": 0.3300766571471517, + "flos": 25583730785280.0, + "grad_norm": 6.067113992727344, + "language_loss": 0.88346136, + "learning_rate": 3.127459771562238e-06, + "loss": 0.90527773, + "num_input_tokens_seen": 117907690, + "step": 5490, + "time_per_iteration": 2.594193696975708 + }, + { + "auxiliary_loss_clip": 0.01127009, + "auxiliary_loss_mlp": 0.0103878, + "balance_loss_clip": 1.05081856, + "balance_loss_mlp": 1.02396214, + "epoch": 0.33013678039981964, + "flos": 11363501623680.0, + "grad_norm": 5.091693260582257, + "language_loss": 0.83396459, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.85562241, + "num_input_tokens_seen": 117925640, + "step": 5491, + "time_per_iteration": 2.6124439239501953 + }, + { + "auxiliary_loss_clip": 0.01111643, + "auxiliary_loss_mlp": 0.01048849, + "balance_loss_clip": 1.05066538, + "balance_loss_mlp": 1.03372788, + "epoch": 0.3301969036524876, + "flos": 24821203939200.0, + "grad_norm": 1.9936853829327341, + "language_loss": 0.77453989, + "learning_rate": 3.126816327146554e-06, + "loss": 0.79614484, + "num_input_tokens_seen": 117944525, + "step": 5492, + "time_per_iteration": 4.26681923866272 + }, + { + "auxiliary_loss_clip": 0.01144384, + "auxiliary_loss_mlp": 0.01046422, + "balance_loss_clip": 1.05559993, + "balance_loss_mlp": 1.02987576, + "epoch": 0.33025702690515557, + "flos": 15961144613760.0, + "grad_norm": 2.586093125227841, + "language_loss": 0.74295127, + "learning_rate": 3.12649454083913e-06, + "loss": 0.76485932, + "num_input_tokens_seen": 117962515, + "step": 5493, + "time_per_iteration": 2.572657585144043 + }, + { + "auxiliary_loss_clip": 0.01007495, + "auxiliary_loss_mlp": 0.01051184, + "balance_loss_clip": 1.0238874, + "balance_loss_mlp": 1.0491215, + "epoch": 0.33031715015782354, + "flos": 59416755989760.0, + "grad_norm": 0.7952972655943692, + "language_loss": 0.53981996, + "learning_rate": 3.12617271181492e-06, + "loss": 0.5604068, + "num_input_tokens_seen": 118018780, + "step": 5494, + "time_per_iteration": 3.2123944759368896 + }, + { + "auxiliary_loss_clip": 0.01114646, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.04879999, + "balance_loss_mlp": 1.02241075, + "epoch": 0.3303772734104915, + "flos": 23184360144000.0, + "grad_norm": 1.4867113292626302, + "language_loss": 0.87236047, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.89388549, + "num_input_tokens_seen": 118038610, + "step": 5495, + "time_per_iteration": 4.180245637893677 + }, + { + "auxiliary_loss_clip": 0.01104415, + "auxiliary_loss_mlp": 0.0104461, + "balance_loss_clip": 1.0520072, + "balance_loss_mlp": 1.02813482, + "epoch": 0.33043739666315947, + "flos": 33071896010880.0, + "grad_norm": 2.0634169818588157, + "language_loss": 0.73468459, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.7561748, + "num_input_tokens_seen": 118055905, + "step": 5496, + "time_per_iteration": 2.816849946975708 + }, + { + "auxiliary_loss_clip": 0.01107244, + "auxiliary_loss_mlp": 0.01039897, + "balance_loss_clip": 1.04852057, + "balance_loss_mlp": 1.02469766, + "epoch": 0.33049751991582743, + "flos": 24895431394560.0, + "grad_norm": 2.430684839051296, + "language_loss": 0.72464252, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.74611384, + "num_input_tokens_seen": 118073695, + "step": 5497, + "time_per_iteration": 4.314718961715698 + }, + { + "auxiliary_loss_clip": 0.01111966, + "auxiliary_loss_mlp": 0.01038015, + "balance_loss_clip": 1.05051875, + "balance_loss_mlp": 1.02313733, + "epoch": 0.3305576431684954, + "flos": 29460575554560.0, + "grad_norm": 1.9082848646705384, + "language_loss": 0.804672, + "learning_rate": 3.124884968794321e-06, + "loss": 0.82617176, + "num_input_tokens_seen": 118094030, + "step": 5498, + "time_per_iteration": 2.831347942352295 + }, + { + "auxiliary_loss_clip": 0.01121599, + "auxiliary_loss_mlp": 0.01041664, + "balance_loss_clip": 1.04826963, + "balance_loss_mlp": 1.02467656, + "epoch": 0.33061776642116336, + "flos": 22632305040000.0, + "grad_norm": 2.0593804502858823, + "language_loss": 0.75822198, + "learning_rate": 3.12456292636927e-06, + "loss": 0.77985466, + "num_input_tokens_seen": 118111665, + "step": 5499, + "time_per_iteration": 4.880478858947754 + }, + { + "auxiliary_loss_clip": 0.01119724, + "auxiliary_loss_mlp": 0.01035684, + "balance_loss_clip": 1.05307007, + "balance_loss_mlp": 1.02016318, + "epoch": 0.3306778896738313, + "flos": 25776320532480.0, + "grad_norm": 2.088317081581358, + "language_loss": 0.78981787, + "learning_rate": 3.124240841300681e-06, + "loss": 0.81137192, + "num_input_tokens_seen": 118132435, + "step": 5500, + "time_per_iteration": 2.7601048946380615 + }, + { + "auxiliary_loss_clip": 0.01131843, + "auxiliary_loss_mlp": 0.0103364, + "balance_loss_clip": 1.0540576, + "balance_loss_mlp": 1.01751041, + "epoch": 0.33073801292649935, + "flos": 36940552479360.0, + "grad_norm": 8.499573931934933, + "language_loss": 0.6655246, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.68717939, + "num_input_tokens_seen": 118155255, + "step": 5501, + "time_per_iteration": 2.7880568504333496 + }, + { + "auxiliary_loss_clip": 0.01130024, + "auxiliary_loss_mlp": 0.01044854, + "balance_loss_clip": 1.05215073, + "balance_loss_mlp": 1.02766418, + "epoch": 0.3307981361791673, + "flos": 12967738848000.0, + "grad_norm": 2.417495150038941, + "language_loss": 0.77221018, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.79395902, + "num_input_tokens_seen": 118169865, + "step": 5502, + "time_per_iteration": 2.621891736984253 + }, + { + "auxiliary_loss_clip": 0.01120279, + "auxiliary_loss_mlp": 0.01041312, + "balance_loss_clip": 1.05816746, + "balance_loss_mlp": 1.02508807, + "epoch": 0.3308582594318353, + "flos": 25374372364800.0, + "grad_norm": 1.6870244228079128, + "language_loss": 0.72882998, + "learning_rate": 3.123274330355824e-06, + "loss": 0.75044584, + "num_input_tokens_seen": 118190760, + "step": 5503, + "time_per_iteration": 2.731391191482544 + }, + { + "auxiliary_loss_clip": 0.01107126, + "auxiliary_loss_mlp": 0.01042991, + "balance_loss_clip": 1.04483843, + "balance_loss_mlp": 1.02543116, + "epoch": 0.33091838268450324, + "flos": 26468570419200.0, + "grad_norm": 1.6983408951831631, + "language_loss": 0.75341403, + "learning_rate": 3.12295207483523e-06, + "loss": 0.77491516, + "num_input_tokens_seen": 118213620, + "step": 5504, + "time_per_iteration": 2.734440565109253 + }, + { + "auxiliary_loss_clip": 0.01116159, + "auxiliary_loss_mlp": 0.01038384, + "balance_loss_clip": 1.05076432, + "balance_loss_mlp": 1.02267826, + "epoch": 0.3309785059371712, + "flos": 24971167221120.0, + "grad_norm": 1.5921827086772462, + "language_loss": 0.69537103, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.71691644, + "num_input_tokens_seen": 118235010, + "step": 5505, + "time_per_iteration": 2.7224769592285156 + }, + { + "auxiliary_loss_clip": 0.0112242, + "auxiliary_loss_mlp": 0.01050735, + "balance_loss_clip": 1.04997373, + "balance_loss_mlp": 1.03454661, + "epoch": 0.3310386291898392, + "flos": 20446710192000.0, + "grad_norm": 1.6566524839278514, + "language_loss": 0.81701219, + "learning_rate": 3.122307436058899e-06, + "loss": 0.83874375, + "num_input_tokens_seen": 118255820, + "step": 5506, + "time_per_iteration": 2.6608633995056152 + }, + { + "auxiliary_loss_clip": 0.01126393, + "auxiliary_loss_mlp": 0.01036938, + "balance_loss_clip": 1.05129898, + "balance_loss_mlp": 1.02032042, + "epoch": 0.33109875244250714, + "flos": 23182672204800.0, + "grad_norm": 2.1165262291534663, + "language_loss": 0.7961843, + "learning_rate": 3.121985052827606e-06, + "loss": 0.81781757, + "num_input_tokens_seen": 118274160, + "step": 5507, + "time_per_iteration": 2.6279826164245605 + }, + { + "auxiliary_loss_clip": 0.01115407, + "auxiliary_loss_mlp": 0.0104488, + "balance_loss_clip": 1.04948068, + "balance_loss_mlp": 1.02901316, + "epoch": 0.3311588756951751, + "flos": 24168384207360.0, + "grad_norm": 1.8252383106416188, + "language_loss": 0.71632457, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.73792744, + "num_input_tokens_seen": 118294385, + "step": 5508, + "time_per_iteration": 2.666274070739746 + }, + { + "auxiliary_loss_clip": 0.01105407, + "auxiliary_loss_mlp": 0.01035431, + "balance_loss_clip": 1.04841506, + "balance_loss_mlp": 1.02048194, + "epoch": 0.33121899894784307, + "flos": 28145742209280.0, + "grad_norm": 2.0681023318662053, + "language_loss": 0.71877921, + "learning_rate": 3.12134015873989e-06, + "loss": 0.74018759, + "num_input_tokens_seen": 118313105, + "step": 5509, + "time_per_iteration": 2.9805185794830322 + }, + { + "auxiliary_loss_clip": 0.01123913, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.05431342, + "balance_loss_mlp": 1.02019095, + "epoch": 0.33127912220051103, + "flos": 29567660976000.0, + "grad_norm": 1.690455092128618, + "language_loss": 0.72850806, + "learning_rate": 3.121017647907921e-06, + "loss": 0.75010473, + "num_input_tokens_seen": 118335250, + "step": 5510, + "time_per_iteration": 2.7012648582458496 + }, + { + "auxiliary_loss_clip": 0.01097101, + "auxiliary_loss_mlp": 0.01036395, + "balance_loss_clip": 1.04754674, + "balance_loss_mlp": 1.02099323, + "epoch": 0.331339245453179, + "flos": 14428836374400.0, + "grad_norm": 2.529653220973509, + "language_loss": 0.87842733, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.89976227, + "num_input_tokens_seen": 118351470, + "step": 5511, + "time_per_iteration": 2.699303150177002 + }, + { + "auxiliary_loss_clip": 0.01077351, + "auxiliary_loss_mlp": 0.0103825, + "balance_loss_clip": 1.04569423, + "balance_loss_mlp": 1.0232892, + "epoch": 0.33139936870584696, + "flos": 20887118847360.0, + "grad_norm": 2.0800696693803404, + "language_loss": 0.73301774, + "learning_rate": 3.12037249872891e-06, + "loss": 0.7541737, + "num_input_tokens_seen": 118370970, + "step": 5512, + "time_per_iteration": 2.773071765899658 + }, + { + "auxiliary_loss_clip": 0.01092657, + "auxiliary_loss_mlp": 0.01037164, + "balance_loss_clip": 1.04608238, + "balance_loss_mlp": 1.02226281, + "epoch": 0.33145949195851493, + "flos": 36284356869120.0, + "grad_norm": 28.686212163123738, + "language_loss": 0.7188127, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.74011087, + "num_input_tokens_seen": 118393125, + "step": 5513, + "time_per_iteration": 2.832712411880493 + }, + { + "auxiliary_loss_clip": 0.0110331, + "auxiliary_loss_mlp": 0.01037016, + "balance_loss_clip": 1.0480994, + "balance_loss_mlp": 1.02052951, + "epoch": 0.33151961521118295, + "flos": 14279735018880.0, + "grad_norm": 1.9100766123367274, + "language_loss": 0.68260789, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.70401114, + "num_input_tokens_seen": 118410860, + "step": 5514, + "time_per_iteration": 2.62347674369812 + }, + { + "auxiliary_loss_clip": 0.01111479, + "auxiliary_loss_mlp": 0.01042546, + "balance_loss_clip": 1.04936767, + "balance_loss_mlp": 1.02481997, + "epoch": 0.3315797384638509, + "flos": 20774323163520.0, + "grad_norm": 1.9179680687741931, + "language_loss": 0.65994096, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.68148118, + "num_input_tokens_seen": 118429570, + "step": 5515, + "time_per_iteration": 2.6913952827453613 + }, + { + "auxiliary_loss_clip": 0.01121539, + "auxiliary_loss_mlp": 0.01039988, + "balance_loss_clip": 1.04903245, + "balance_loss_mlp": 1.02393019, + "epoch": 0.3316398617165189, + "flos": 24679464871680.0, + "grad_norm": 1.8088538037879305, + "language_loss": 0.69273043, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.71434575, + "num_input_tokens_seen": 118450285, + "step": 5516, + "time_per_iteration": 2.6469173431396484 + }, + { + "auxiliary_loss_clip": 0.011287, + "auxiliary_loss_mlp": 0.01039737, + "balance_loss_clip": 1.05089724, + "balance_loss_mlp": 1.02339315, + "epoch": 0.33169998496918685, + "flos": 18587974129920.0, + "grad_norm": 3.871010712989623, + "language_loss": 0.79914033, + "learning_rate": 3.118758882514359e-06, + "loss": 0.82082474, + "num_input_tokens_seen": 118468270, + "step": 5517, + "time_per_iteration": 2.6387667655944824 + }, + { + "auxiliary_loss_clip": 0.01113973, + "auxiliary_loss_mlp": 0.01040442, + "balance_loss_clip": 1.04587924, + "balance_loss_mlp": 1.02412271, + "epoch": 0.3317601082218548, + "flos": 20193647898240.0, + "grad_norm": 1.7856922866156533, + "language_loss": 0.74043357, + "learning_rate": 3.118436031952143e-06, + "loss": 0.76197767, + "num_input_tokens_seen": 118486615, + "step": 5518, + "time_per_iteration": 2.6136653423309326 + }, + { + "auxiliary_loss_clip": 0.01035845, + "auxiliary_loss_mlp": 0.0100663, + "balance_loss_clip": 1.02549803, + "balance_loss_mlp": 1.00447261, + "epoch": 0.3318202314745228, + "flos": 68974703637120.0, + "grad_norm": 0.6165261089589951, + "language_loss": 0.54330659, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.56373143, + "num_input_tokens_seen": 118553580, + "step": 5519, + "time_per_iteration": 3.3124027252197266 + }, + { + "auxiliary_loss_clip": 0.01129225, + "auxiliary_loss_mlp": 0.01042237, + "balance_loss_clip": 1.05353975, + "balance_loss_mlp": 1.02483273, + "epoch": 0.33188035472719074, + "flos": 21500113374720.0, + "grad_norm": 2.4445902922344342, + "language_loss": 0.78693354, + "learning_rate": 3.117790203606336e-06, + "loss": 0.80864823, + "num_input_tokens_seen": 118570280, + "step": 5520, + "time_per_iteration": 2.680413246154785 + }, + { + "auxiliary_loss_clip": 0.0111174, + "auxiliary_loss_mlp": 0.01034453, + "balance_loss_clip": 1.04981971, + "balance_loss_mlp": 1.01946807, + "epoch": 0.3319404779798587, + "flos": 28870490926080.0, + "grad_norm": 2.1205551001068645, + "language_loss": 0.76597643, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.78743839, + "num_input_tokens_seen": 118590455, + "step": 5521, + "time_per_iteration": 2.7977516651153564 + }, + { + "auxiliary_loss_clip": 0.01128356, + "auxiliary_loss_mlp": 0.0104906, + "balance_loss_clip": 1.0500772, + "balance_loss_mlp": 1.0320611, + "epoch": 0.33200060123252667, + "flos": 23076915586560.0, + "grad_norm": 5.546447388917159, + "language_loss": 0.70404172, + "learning_rate": 3.117144205713664e-06, + "loss": 0.72581589, + "num_input_tokens_seen": 118609495, + "step": 5522, + "time_per_iteration": 2.7343335151672363 + }, + { + "auxiliary_loss_clip": 0.01112615, + "auxiliary_loss_mlp": 0.01039333, + "balance_loss_clip": 1.04872596, + "balance_loss_mlp": 1.02413392, + "epoch": 0.33206072448519464, + "flos": 21142479611520.0, + "grad_norm": 2.5717643633026133, + "language_loss": 0.7406925, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.76221192, + "num_input_tokens_seen": 118628720, + "step": 5523, + "time_per_iteration": 2.6910529136657715 + }, + { + "auxiliary_loss_clip": 0.01108522, + "auxiliary_loss_mlp": 0.01039859, + "balance_loss_clip": 1.04778576, + "balance_loss_mlp": 1.02415287, + "epoch": 0.3321208477378626, + "flos": 13079097987840.0, + "grad_norm": 1.7441145490896364, + "language_loss": 0.82432246, + "learning_rate": 3.116498038372114e-06, + "loss": 0.8458063, + "num_input_tokens_seen": 118645955, + "step": 5524, + "time_per_iteration": 2.747279405593872 + }, + { + "auxiliary_loss_clip": 0.01094215, + "auxiliary_loss_mlp": 0.00773366, + "balance_loss_clip": 1.04763544, + "balance_loss_mlp": 1.000983, + "epoch": 0.33218097099053057, + "flos": 21215414177280.0, + "grad_norm": 1.8821817398487202, + "language_loss": 0.83040905, + "learning_rate": 3.116174891188636e-06, + "loss": 0.84908485, + "num_input_tokens_seen": 118665605, + "step": 5525, + "time_per_iteration": 2.7802865505218506 + }, + { + "auxiliary_loss_clip": 0.01051991, + "auxiliary_loss_mlp": 0.01009126, + "balance_loss_clip": 1.02309918, + "balance_loss_mlp": 1.00730228, + "epoch": 0.33224109424319853, + "flos": 64348979189760.0, + "grad_norm": 0.7599038914172829, + "language_loss": 0.52588648, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.54649764, + "num_input_tokens_seen": 118728155, + "step": 5526, + "time_per_iteration": 3.1430625915527344 + }, + { + "auxiliary_loss_clip": 0.01100912, + "auxiliary_loss_mlp": 0.00775153, + "balance_loss_clip": 1.05235875, + "balance_loss_mlp": 1.00101066, + "epoch": 0.33230121749586655, + "flos": 17346003523200.0, + "grad_norm": 1.9434005693126541, + "language_loss": 0.77540255, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.79416323, + "num_input_tokens_seen": 118743955, + "step": 5527, + "time_per_iteration": 2.779862403869629 + }, + { + "auxiliary_loss_clip": 0.01095485, + "auxiliary_loss_mlp": 0.01045396, + "balance_loss_clip": 1.05338502, + "balance_loss_mlp": 1.02997637, + "epoch": 0.3323613407485345, + "flos": 20997041443200.0, + "grad_norm": 2.507974613956182, + "language_loss": 0.7222321, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.7436409, + "num_input_tokens_seen": 118763275, + "step": 5528, + "time_per_iteration": 2.7340548038482666 + }, + { + "auxiliary_loss_clip": 0.01112677, + "auxiliary_loss_mlp": 0.01037789, + "balance_loss_clip": 1.04796624, + "balance_loss_mlp": 1.02333462, + "epoch": 0.3324214640012025, + "flos": 13152535344000.0, + "grad_norm": 1.86583443755271, + "language_loss": 0.82796729, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.84947193, + "num_input_tokens_seen": 118781110, + "step": 5529, + "time_per_iteration": 2.6532175540924072 + }, + { + "auxiliary_loss_clip": 0.01113738, + "auxiliary_loss_mlp": 0.00775289, + "balance_loss_clip": 1.04990721, + "balance_loss_mlp": 1.00095487, + "epoch": 0.33248158725387045, + "flos": 22273522041600.0, + "grad_norm": 2.91854332756289, + "language_loss": 0.69676769, + "learning_rate": 3.114558520634423e-06, + "loss": 0.71565795, + "num_input_tokens_seen": 118800620, + "step": 5530, + "time_per_iteration": 2.708841323852539 + }, + { + "auxiliary_loss_clip": 0.01126266, + "auxiliary_loss_mlp": 0.01050268, + "balance_loss_clip": 1.05040276, + "balance_loss_mlp": 1.03394794, + "epoch": 0.3325417105065384, + "flos": 20740998320640.0, + "grad_norm": 2.896961644373142, + "language_loss": 0.75989115, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.7816565, + "num_input_tokens_seen": 118818725, + "step": 5531, + "time_per_iteration": 2.672736167907715 + }, + { + "auxiliary_loss_clip": 0.01118495, + "auxiliary_loss_mlp": 0.0104264, + "balance_loss_clip": 1.05284333, + "balance_loss_mlp": 1.0260222, + "epoch": 0.3326018337592064, + "flos": 24790536702720.0, + "grad_norm": 2.0175366752259465, + "language_loss": 0.73189509, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.75350642, + "num_input_tokens_seen": 118839390, + "step": 5532, + "time_per_iteration": 4.367426156997681 + }, + { + "auxiliary_loss_clip": 0.0111545, + "auxiliary_loss_mlp": 0.0103097, + "balance_loss_clip": 1.0523479, + "balance_loss_mlp": 1.01623583, + "epoch": 0.33266195701187434, + "flos": 14501699112960.0, + "grad_norm": 2.031596721272471, + "language_loss": 0.65847003, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.67993426, + "num_input_tokens_seen": 118856275, + "step": 5533, + "time_per_iteration": 2.66029691696167 + }, + { + "auxiliary_loss_clip": 0.01080696, + "auxiliary_loss_mlp": 0.01037858, + "balance_loss_clip": 1.04513919, + "balance_loss_mlp": 1.02147257, + "epoch": 0.3327220802645423, + "flos": 15304410299520.0, + "grad_norm": 2.349847054242377, + "language_loss": 0.71297956, + "learning_rate": 3.113264663362451e-06, + "loss": 0.73416501, + "num_input_tokens_seen": 118873830, + "step": 5534, + "time_per_iteration": 4.27457070350647 + }, + { + "auxiliary_loss_clip": 0.0109151, + "auxiliary_loss_mlp": 0.01041219, + "balance_loss_clip": 1.04982436, + "balance_loss_mlp": 1.02534652, + "epoch": 0.3327822035172103, + "flos": 23477534951040.0, + "grad_norm": 2.0777718313633997, + "language_loss": 0.6718514, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.69317865, + "num_input_tokens_seen": 118891560, + "step": 5535, + "time_per_iteration": 2.774434804916382 + }, + { + "auxiliary_loss_clip": 0.01126643, + "auxiliary_loss_mlp": 0.00774026, + "balance_loss_clip": 1.04974341, + "balance_loss_mlp": 1.00099397, + "epoch": 0.33284232676987824, + "flos": 25374516019200.0, + "grad_norm": 4.4518317449354905, + "language_loss": 0.72757089, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.74657756, + "num_input_tokens_seen": 118910260, + "step": 5536, + "time_per_iteration": 4.211881399154663 + }, + { + "auxiliary_loss_clip": 0.0112639, + "auxiliary_loss_mlp": 0.01042922, + "balance_loss_clip": 1.05097485, + "balance_loss_mlp": 1.02740741, + "epoch": 0.3329024500225462, + "flos": 23694363400320.0, + "grad_norm": 1.6494647990025764, + "language_loss": 0.81951326, + "learning_rate": 3.112293827106917e-06, + "loss": 0.84120637, + "num_input_tokens_seen": 118929985, + "step": 5537, + "time_per_iteration": 2.723938465118408 + }, + { + "auxiliary_loss_clip": 0.01130953, + "auxiliary_loss_mlp": 0.01041699, + "balance_loss_clip": 1.05334187, + "balance_loss_mlp": 1.02568924, + "epoch": 0.33296257327521417, + "flos": 31723163205120.0, + "grad_norm": 2.0361349610506987, + "language_loss": 0.71549797, + "learning_rate": 3.111970130648789e-06, + "loss": 0.73722446, + "num_input_tokens_seen": 118951355, + "step": 5538, + "time_per_iteration": 4.913949489593506 + }, + { + "auxiliary_loss_clip": 0.01120461, + "auxiliary_loss_mlp": 0.01037376, + "balance_loss_clip": 1.04746032, + "balance_loss_mlp": 1.02189124, + "epoch": 0.33302269652788213, + "flos": 22744705674240.0, + "grad_norm": 1.8849765474814903, + "language_loss": 0.74648041, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.76805872, + "num_input_tokens_seen": 118970910, + "step": 5539, + "time_per_iteration": 2.7290310859680176 + }, + { + "auxiliary_loss_clip": 0.01142521, + "auxiliary_loss_mlp": 0.01045266, + "balance_loss_clip": 1.05175686, + "balance_loss_mlp": 1.02844524, + "epoch": 0.33308281978055015, + "flos": 11473747441920.0, + "grad_norm": 1.7887365250144445, + "language_loss": 0.71008205, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.73195994, + "num_input_tokens_seen": 118989200, + "step": 5540, + "time_per_iteration": 2.6340630054473877 + }, + { + "auxiliary_loss_clip": 0.01121672, + "auxiliary_loss_mlp": 0.01037813, + "balance_loss_clip": 1.04614174, + "balance_loss_mlp": 1.02212477, + "epoch": 0.3331429430332181, + "flos": 38213693112960.0, + "grad_norm": 2.2050863595265535, + "language_loss": 0.60332179, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.62491661, + "num_input_tokens_seen": 119011030, + "step": 5541, + "time_per_iteration": 2.9001681804656982 + }, + { + "auxiliary_loss_clip": 0.01116142, + "auxiliary_loss_mlp": 0.01045386, + "balance_loss_clip": 1.04896498, + "balance_loss_mlp": 1.02827907, + "epoch": 0.3332030662858861, + "flos": 22528667324160.0, + "grad_norm": 1.8682676496278656, + "language_loss": 0.68843257, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.7100479, + "num_input_tokens_seen": 119030620, + "step": 5542, + "time_per_iteration": 2.7336552143096924 + }, + { + "auxiliary_loss_clip": 0.01125827, + "auxiliary_loss_mlp": 0.01039479, + "balance_loss_clip": 1.04983997, + "balance_loss_mlp": 1.0241369, + "epoch": 0.33326318953855405, + "flos": 15997773507840.0, + "grad_norm": 1.7424785130645766, + "language_loss": 0.75545055, + "learning_rate": 3.110351016113414e-06, + "loss": 0.7771036, + "num_input_tokens_seen": 119048015, + "step": 5543, + "time_per_iteration": 2.7098708152770996 + }, + { + "auxiliary_loss_clip": 0.01059952, + "auxiliary_loss_mlp": 0.01049723, + "balance_loss_clip": 1.04679465, + "balance_loss_mlp": 1.03153133, + "epoch": 0.333323312791222, + "flos": 25593535198080.0, + "grad_norm": 1.720313350609618, + "language_loss": 0.75207818, + "learning_rate": 3.110027066843348e-06, + "loss": 0.77317488, + "num_input_tokens_seen": 119066280, + "step": 5544, + "time_per_iteration": 2.8580381870269775 + }, + { + "auxiliary_loss_clip": 0.01131382, + "auxiliary_loss_mlp": 0.01034467, + "balance_loss_clip": 1.0470835, + "balance_loss_mlp": 1.01900601, + "epoch": 0.33338343604389, + "flos": 25119550304640.0, + "grad_norm": 1.8195187872515122, + "language_loss": 0.70631826, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.7279768, + "num_input_tokens_seen": 119087680, + "step": 5545, + "time_per_iteration": 2.6675262451171875 + }, + { + "auxiliary_loss_clip": 0.01090227, + "auxiliary_loss_mlp": 0.01038197, + "balance_loss_clip": 1.04591393, + "balance_loss_mlp": 1.0225687, + "epoch": 0.33344355929655795, + "flos": 16947287579520.0, + "grad_norm": 2.0475528286172615, + "language_loss": 0.68962657, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.7109108, + "num_input_tokens_seen": 119105820, + "step": 5546, + "time_per_iteration": 2.6620733737945557 + }, + { + "auxiliary_loss_clip": 0.01099462, + "auxiliary_loss_mlp": 0.01039292, + "balance_loss_clip": 1.04328573, + "balance_loss_mlp": 1.02330589, + "epoch": 0.3335036825492259, + "flos": 27889591345920.0, + "grad_norm": 1.6439201248410251, + "language_loss": 0.64893299, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.67032051, + "num_input_tokens_seen": 119126630, + "step": 5547, + "time_per_iteration": 2.7897326946258545 + }, + { + "auxiliary_loss_clip": 0.0111514, + "auxiliary_loss_mlp": 0.0103407, + "balance_loss_clip": 1.05108774, + "balance_loss_mlp": 1.01957989, + "epoch": 0.3335638058018939, + "flos": 16179553261440.0, + "grad_norm": 2.7266915889905765, + "language_loss": 0.85475278, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.8762449, + "num_input_tokens_seen": 119143375, + "step": 5548, + "time_per_iteration": 2.691776990890503 + }, + { + "auxiliary_loss_clip": 0.0112443, + "auxiliary_loss_mlp": 0.01038689, + "balance_loss_clip": 1.04759526, + "balance_loss_mlp": 1.02190423, + "epoch": 0.33362392905456184, + "flos": 39896108288640.0, + "grad_norm": 2.1593805374763466, + "language_loss": 0.74996036, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.77159154, + "num_input_tokens_seen": 119166450, + "step": 5549, + "time_per_iteration": 2.778918743133545 + }, + { + "auxiliary_loss_clip": 0.01129114, + "auxiliary_loss_mlp": 0.01040153, + "balance_loss_clip": 1.0509795, + "balance_loss_mlp": 1.02330887, + "epoch": 0.3336840523072298, + "flos": 44271212567040.0, + "grad_norm": 2.0942861782322577, + "language_loss": 0.6826036, + "learning_rate": 3.108082487713921e-06, + "loss": 0.70429623, + "num_input_tokens_seen": 119189645, + "step": 5550, + "time_per_iteration": 2.8417065143585205 + }, + { + "auxiliary_loss_clip": 0.01094461, + "auxiliary_loss_mlp": 0.01050862, + "balance_loss_clip": 1.04752803, + "balance_loss_mlp": 1.03398156, + "epoch": 0.33374417555989777, + "flos": 15085678429440.0, + "grad_norm": 3.079168539029832, + "language_loss": 0.60630679, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.62776005, + "num_input_tokens_seen": 119208045, + "step": 5551, + "time_per_iteration": 2.7206614017486572 + }, + { + "auxiliary_loss_clip": 0.01096001, + "auxiliary_loss_mlp": 0.01040976, + "balance_loss_clip": 1.04871941, + "balance_loss_mlp": 1.02429891, + "epoch": 0.33380429881256574, + "flos": 15849174942720.0, + "grad_norm": 5.115117677651213, + "language_loss": 0.70642906, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.72779882, + "num_input_tokens_seen": 119224910, + "step": 5552, + "time_per_iteration": 2.7452614307403564 + }, + { + "auxiliary_loss_clip": 0.0109902, + "auxiliary_loss_mlp": 0.01036983, + "balance_loss_clip": 1.04360175, + "balance_loss_mlp": 1.02150989, + "epoch": 0.33386442206523376, + "flos": 13480327883520.0, + "grad_norm": 2.544991024269762, + "language_loss": 0.82464319, + "learning_rate": 3.107109630732192e-06, + "loss": 0.84600323, + "num_input_tokens_seen": 119243290, + "step": 5553, + "time_per_iteration": 2.755664110183716 + }, + { + "auxiliary_loss_clip": 0.01115353, + "auxiliary_loss_mlp": 0.00774656, + "balance_loss_clip": 1.05034745, + "balance_loss_mlp": 1.00092673, + "epoch": 0.3339245453179017, + "flos": 16690669839360.0, + "grad_norm": 2.0139615227647343, + "language_loss": 0.80920005, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.82810014, + "num_input_tokens_seen": 119261195, + "step": 5554, + "time_per_iteration": 2.701960563659668 + }, + { + "auxiliary_loss_clip": 0.01127546, + "auxiliary_loss_mlp": 0.01043388, + "balance_loss_clip": 1.05171227, + "balance_loss_mlp": 1.02820015, + "epoch": 0.3339846685705697, + "flos": 24610624456320.0, + "grad_norm": 1.6473304910242343, + "language_loss": 0.81187713, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.83358645, + "num_input_tokens_seen": 119282845, + "step": 5555, + "time_per_iteration": 2.697605609893799 + }, + { + "auxiliary_loss_clip": 0.01120953, + "auxiliary_loss_mlp": 0.01039289, + "balance_loss_clip": 1.04721272, + "balance_loss_mlp": 1.02425706, + "epoch": 0.33404479182323765, + "flos": 30953812775040.0, + "grad_norm": 1.6543240081497628, + "language_loss": 0.74369228, + "learning_rate": 3.106136395915099e-06, + "loss": 0.76529467, + "num_input_tokens_seen": 119304430, + "step": 5556, + "time_per_iteration": 2.7341341972351074 + }, + { + "auxiliary_loss_clip": 0.01124745, + "auxiliary_loss_mlp": 0.0103615, + "balance_loss_clip": 1.05016208, + "balance_loss_mlp": 1.02102232, + "epoch": 0.3341049150759056, + "flos": 23513301918720.0, + "grad_norm": 1.6367363007204896, + "language_loss": 0.82058722, + "learning_rate": 3.105811900403391e-06, + "loss": 0.84219617, + "num_input_tokens_seen": 119323830, + "step": 5557, + "time_per_iteration": 2.6798059940338135 + }, + { + "auxiliary_loss_clip": 0.01115524, + "auxiliary_loss_mlp": 0.01038861, + "balance_loss_clip": 1.04990697, + "balance_loss_mlp": 1.02333987, + "epoch": 0.3341650383285736, + "flos": 24026824707840.0, + "grad_norm": 1.4529426900334401, + "language_loss": 0.80220526, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.82374907, + "num_input_tokens_seen": 119346340, + "step": 5558, + "time_per_iteration": 2.760270118713379 + }, + { + "auxiliary_loss_clip": 0.01108428, + "auxiliary_loss_mlp": 0.01040994, + "balance_loss_clip": 1.04822016, + "balance_loss_mlp": 1.02628982, + "epoch": 0.33422516158124155, + "flos": 24901967669760.0, + "grad_norm": 1.5625296304307381, + "language_loss": 0.8137213, + "learning_rate": 3.105162783594788e-06, + "loss": 0.83521557, + "num_input_tokens_seen": 119367285, + "step": 5559, + "time_per_iteration": 2.7685365676879883 + }, + { + "auxiliary_loss_clip": 0.01096895, + "auxiliary_loss_mlp": 0.01042951, + "balance_loss_clip": 1.04609013, + "balance_loss_mlp": 1.02726293, + "epoch": 0.3342852848339095, + "flos": 18333403464960.0, + "grad_norm": 2.3834321283612003, + "language_loss": 0.7164095, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.73780799, + "num_input_tokens_seen": 119385370, + "step": 5560, + "time_per_iteration": 2.721888780593872 + }, + { + "auxiliary_loss_clip": 0.011201, + "auxiliary_loss_mlp": 0.01043409, + "balance_loss_clip": 1.05215085, + "balance_loss_mlp": 1.02716064, + "epoch": 0.3343454080865775, + "flos": 30046530119040.0, + "grad_norm": 2.1203222418546015, + "language_loss": 0.75029516, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.77193022, + "num_input_tokens_seen": 119409150, + "step": 5561, + "time_per_iteration": 2.8445487022399902 + }, + { + "auxiliary_loss_clip": 0.01115063, + "auxiliary_loss_mlp": 0.01036711, + "balance_loss_clip": 1.05170679, + "balance_loss_mlp": 1.02177453, + "epoch": 0.33440553133924544, + "flos": 16398823835520.0, + "grad_norm": 1.6036143049019338, + "language_loss": 0.69467896, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.71619672, + "num_input_tokens_seen": 119426475, + "step": 5562, + "time_per_iteration": 2.664062023162842 + }, + { + "auxiliary_loss_clip": 0.01125323, + "auxiliary_loss_mlp": 0.01042082, + "balance_loss_clip": 1.05125499, + "balance_loss_mlp": 1.02763367, + "epoch": 0.3344656545919134, + "flos": 24242072958720.0, + "grad_norm": 3.5139835262543504, + "language_loss": 0.65094876, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.67262286, + "num_input_tokens_seen": 119446900, + "step": 5563, + "time_per_iteration": 2.70878529548645 + }, + { + "auxiliary_loss_clip": 0.01078552, + "auxiliary_loss_mlp": 0.01045974, + "balance_loss_clip": 1.04751515, + "balance_loss_mlp": 1.0296303, + "epoch": 0.3345257778445814, + "flos": 52118843149440.0, + "grad_norm": 1.4983314251487456, + "language_loss": 0.74106556, + "learning_rate": 3.103539258400766e-06, + "loss": 0.76231086, + "num_input_tokens_seen": 119470945, + "step": 5564, + "time_per_iteration": 3.0751025676727295 + }, + { + "auxiliary_loss_clip": 0.01035298, + "auxiliary_loss_mlp": 0.01009529, + "balance_loss_clip": 1.03294694, + "balance_loss_mlp": 1.00762165, + "epoch": 0.33458590109724934, + "flos": 68048602254720.0, + "grad_norm": 0.7758359845819034, + "language_loss": 0.555296, + "learning_rate": 3.103214427773745e-06, + "loss": 0.57574433, + "num_input_tokens_seen": 119529925, + "step": 5565, + "time_per_iteration": 3.2246947288513184 + }, + { + "auxiliary_loss_clip": 0.01134316, + "auxiliary_loss_mlp": 0.01036162, + "balance_loss_clip": 1.05123055, + "balance_loss_mlp": 1.02145183, + "epoch": 0.3346460243499173, + "flos": 37414788768000.0, + "grad_norm": 2.332924120890769, + "language_loss": 0.65000319, + "learning_rate": 3.102889555312721e-06, + "loss": 0.67170799, + "num_input_tokens_seen": 119550700, + "step": 5566, + "time_per_iteration": 2.8920817375183105 + }, + { + "auxiliary_loss_clip": 0.01115876, + "auxiliary_loss_mlp": 0.0103757, + "balance_loss_clip": 1.05134845, + "balance_loss_mlp": 1.02252626, + "epoch": 0.3347061476025853, + "flos": 18697358021760.0, + "grad_norm": 2.3005222539878436, + "language_loss": 0.77525175, + "learning_rate": 3.102564641030016e-06, + "loss": 0.79678619, + "num_input_tokens_seen": 119569295, + "step": 5567, + "time_per_iteration": 2.82244610786438 + }, + { + "auxiliary_loss_clip": 0.01112911, + "auxiliary_loss_mlp": 0.01037105, + "balance_loss_clip": 1.0479182, + "balance_loss_mlp": 1.02079725, + "epoch": 0.3347662708552533, + "flos": 13917827537280.0, + "grad_norm": 1.7148039320536435, + "language_loss": 0.76432139, + "learning_rate": 3.102239684937949e-06, + "loss": 0.78582156, + "num_input_tokens_seen": 119587375, + "step": 5568, + "time_per_iteration": 2.689354181289673 + }, + { + "auxiliary_loss_clip": 0.01099358, + "auxiliary_loss_mlp": 0.01048314, + "balance_loss_clip": 1.04898834, + "balance_loss_mlp": 1.03163624, + "epoch": 0.33482639410792125, + "flos": 19750402068480.0, + "grad_norm": 3.260707250765708, + "language_loss": 0.70965171, + "learning_rate": 3.101914687048842e-06, + "loss": 0.73112851, + "num_input_tokens_seen": 119604530, + "step": 5569, + "time_per_iteration": 2.747023344039917 + }, + { + "auxiliary_loss_clip": 0.01099669, + "auxiliary_loss_mlp": 0.01034787, + "balance_loss_clip": 1.04569411, + "balance_loss_mlp": 1.01819277, + "epoch": 0.3348865173605892, + "flos": 16102991422080.0, + "grad_norm": 2.127450904564192, + "language_loss": 0.89788258, + "learning_rate": 3.10158964737502e-06, + "loss": 0.91922712, + "num_input_tokens_seen": 119621025, + "step": 5570, + "time_per_iteration": 2.810328960418701 + }, + { + "auxiliary_loss_clip": 0.01098742, + "auxiliary_loss_mlp": 0.01034906, + "balance_loss_clip": 1.04593182, + "balance_loss_mlp": 1.01970696, + "epoch": 0.3349466406132572, + "flos": 25008945350400.0, + "grad_norm": 2.0196203016458245, + "language_loss": 0.79848439, + "learning_rate": 3.101264565928808e-06, + "loss": 0.81982088, + "num_input_tokens_seen": 119641725, + "step": 5571, + "time_per_iteration": 4.5300047397613525 + }, + { + "auxiliary_loss_clip": 0.01052126, + "auxiliary_loss_mlp": 0.00754598, + "balance_loss_clip": 1.02251923, + "balance_loss_mlp": 1.0014987, + "epoch": 0.33500676386592515, + "flos": 54319991564160.0, + "grad_norm": 0.8956854098175919, + "language_loss": 0.5596205, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.57768774, + "num_input_tokens_seen": 119693560, + "step": 5572, + "time_per_iteration": 3.0931503772735596 + }, + { + "auxiliary_loss_clip": 0.01137277, + "auxiliary_loss_mlp": 0.01047626, + "balance_loss_clip": 1.05220318, + "balance_loss_mlp": 1.03196192, + "epoch": 0.3350668871185931, + "flos": 26797332625920.0, + "grad_norm": 2.019282888464976, + "language_loss": 0.78090006, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.8027491, + "num_input_tokens_seen": 119712935, + "step": 5573, + "time_per_iteration": 2.710340738296509 + }, + { + "auxiliary_loss_clip": 0.01105804, + "auxiliary_loss_mlp": 0.01046551, + "balance_loss_clip": 1.05004358, + "balance_loss_mlp": 1.02974284, + "epoch": 0.3351270103712611, + "flos": 33510508986240.0, + "grad_norm": 3.3664569303363834, + "language_loss": 0.7253201, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.74684364, + "num_input_tokens_seen": 119731680, + "step": 5574, + "time_per_iteration": 4.390132427215576 + }, + { + "auxiliary_loss_clip": 0.01119913, + "auxiliary_loss_mlp": 0.01033586, + "balance_loss_clip": 1.04622221, + "balance_loss_mlp": 1.01882839, + "epoch": 0.33518713362392905, + "flos": 26506240807680.0, + "grad_norm": 1.806126996337021, + "language_loss": 0.87605375, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.89758873, + "num_input_tokens_seen": 119752155, + "step": 5575, + "time_per_iteration": 2.6650984287261963 + }, + { + "auxiliary_loss_clip": 0.01119423, + "auxiliary_loss_mlp": 0.01044892, + "balance_loss_clip": 1.05073953, + "balance_loss_mlp": 1.02783298, + "epoch": 0.335247256876597, + "flos": 17232345912960.0, + "grad_norm": 2.5292682388354404, + "language_loss": 0.82834053, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.84998369, + "num_input_tokens_seen": 119769195, + "step": 5576, + "time_per_iteration": 4.143759727478027 + }, + { + "auxiliary_loss_clip": 0.01126035, + "auxiliary_loss_mlp": 0.01042249, + "balance_loss_clip": 1.04928613, + "balance_loss_mlp": 1.02584612, + "epoch": 0.335307380129265, + "flos": 25629373992960.0, + "grad_norm": 2.62081807641563, + "language_loss": 0.72970062, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.75138342, + "num_input_tokens_seen": 119786810, + "step": 5577, + "time_per_iteration": 4.264250755310059 + }, + { + "auxiliary_loss_clip": 0.01102749, + "auxiliary_loss_mlp": 0.01040193, + "balance_loss_clip": 1.05250812, + "balance_loss_mlp": 1.02409935, + "epoch": 0.33536750338193294, + "flos": 19680089195520.0, + "grad_norm": 2.2461501835528255, + "language_loss": 0.8147049, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.83613431, + "num_input_tokens_seen": 119805395, + "step": 5578, + "time_per_iteration": 2.748187780380249 + }, + { + "auxiliary_loss_clip": 0.01072311, + "auxiliary_loss_mlp": 0.0077377, + "balance_loss_clip": 1.04737353, + "balance_loss_mlp": 1.00086129, + "epoch": 0.3354276266346009, + "flos": 18332613365760.0, + "grad_norm": 2.081067644088489, + "language_loss": 0.72135395, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.73981476, + "num_input_tokens_seen": 119823135, + "step": 5579, + "time_per_iteration": 2.797891616821289 + }, + { + "auxiliary_loss_clip": 0.0108369, + "auxiliary_loss_mlp": 0.01042635, + "balance_loss_clip": 1.04664183, + "balance_loss_mlp": 1.02608919, + "epoch": 0.3354877498872689, + "flos": 17858556645120.0, + "grad_norm": 2.1516301629227255, + "language_loss": 0.81264424, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.83390749, + "num_input_tokens_seen": 119842265, + "step": 5580, + "time_per_iteration": 2.76359224319458 + }, + { + "auxiliary_loss_clip": 0.01112891, + "auxiliary_loss_mlp": 0.01034758, + "balance_loss_clip": 1.04777932, + "balance_loss_mlp": 1.01918936, + "epoch": 0.3355478731399369, + "flos": 24717745791360.0, + "grad_norm": 1.787418199208594, + "language_loss": 0.78071463, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.80219114, + "num_input_tokens_seen": 119862500, + "step": 5581, + "time_per_iteration": 2.6893699169158936 + }, + { + "auxiliary_loss_clip": 0.01102381, + "auxiliary_loss_mlp": 0.01044533, + "balance_loss_clip": 1.04555583, + "balance_loss_mlp": 1.02674723, + "epoch": 0.33560799639260486, + "flos": 16873886136960.0, + "grad_norm": 3.5541134032025528, + "language_loss": 0.74734783, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.76881701, + "num_input_tokens_seen": 119880160, + "step": 5582, + "time_per_iteration": 2.750110149383545 + }, + { + "auxiliary_loss_clip": 0.01109205, + "auxiliary_loss_mlp": 0.0104468, + "balance_loss_clip": 1.04334664, + "balance_loss_mlp": 1.02793145, + "epoch": 0.3356681196452728, + "flos": 18333511205760.0, + "grad_norm": 2.0738327777636574, + "language_loss": 0.82039702, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.84193587, + "num_input_tokens_seen": 119899040, + "step": 5583, + "time_per_iteration": 2.629065990447998 + }, + { + "auxiliary_loss_clip": 0.01113126, + "auxiliary_loss_mlp": 0.01047702, + "balance_loss_clip": 1.04719925, + "balance_loss_mlp": 1.0322051, + "epoch": 0.3357282428979408, + "flos": 34750612085760.0, + "grad_norm": 2.1437775006956814, + "language_loss": 0.77524137, + "learning_rate": 3.097034711451581e-06, + "loss": 0.79684973, + "num_input_tokens_seen": 119921120, + "step": 5584, + "time_per_iteration": 2.9303438663482666 + }, + { + "auxiliary_loss_clip": 0.01115168, + "auxiliary_loss_mlp": 0.01043431, + "balance_loss_clip": 1.04803944, + "balance_loss_mlp": 1.02755225, + "epoch": 0.33578836615060875, + "flos": 21580087006080.0, + "grad_norm": 1.8068970963649096, + "language_loss": 0.76473475, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.78632081, + "num_input_tokens_seen": 119940165, + "step": 5585, + "time_per_iteration": 2.7168867588043213 + }, + { + "auxiliary_loss_clip": 0.01120824, + "auxiliary_loss_mlp": 0.01040676, + "balance_loss_clip": 1.04579937, + "balance_loss_mlp": 1.02442741, + "epoch": 0.3358484894032767, + "flos": 24530291688960.0, + "grad_norm": 1.8490215812193886, + "language_loss": 0.77754235, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.79915732, + "num_input_tokens_seen": 119959730, + "step": 5586, + "time_per_iteration": 2.88452410697937 + }, + { + "auxiliary_loss_clip": 0.01100333, + "auxiliary_loss_mlp": 0.01057166, + "balance_loss_clip": 1.0484302, + "balance_loss_mlp": 1.03673398, + "epoch": 0.3359086126559447, + "flos": 22455589104000.0, + "grad_norm": 1.6698470723885088, + "language_loss": 0.810045, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.8316201, + "num_input_tokens_seen": 119979315, + "step": 5587, + "time_per_iteration": 2.7335522174835205 + }, + { + "auxiliary_loss_clip": 0.01130777, + "auxiliary_loss_mlp": 0.01042735, + "balance_loss_clip": 1.04809558, + "balance_loss_mlp": 1.02837586, + "epoch": 0.33596873590861265, + "flos": 16543687386240.0, + "grad_norm": 1.8626695130182664, + "language_loss": 0.67307252, + "learning_rate": 3.095731802118677e-06, + "loss": 0.69480765, + "num_input_tokens_seen": 119996140, + "step": 5588, + "time_per_iteration": 2.5910611152648926 + }, + { + "auxiliary_loss_clip": 0.01113468, + "auxiliary_loss_mlp": 0.00774774, + "balance_loss_clip": 1.04702032, + "balance_loss_mlp": 1.0007664, + "epoch": 0.3360288591612806, + "flos": 31175812782720.0, + "grad_norm": 2.758181662666948, + "language_loss": 0.70459288, + "learning_rate": 3.095405970878919e-06, + "loss": 0.72347522, + "num_input_tokens_seen": 120017720, + "step": 5589, + "time_per_iteration": 2.7966625690460205 + }, + { + "auxiliary_loss_clip": 0.01110605, + "auxiliary_loss_mlp": 0.01046945, + "balance_loss_clip": 1.04478765, + "balance_loss_mlp": 1.02951634, + "epoch": 0.3360889824139486, + "flos": 23696913265920.0, + "grad_norm": 6.820816752821097, + "language_loss": 0.6717155, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.69329101, + "num_input_tokens_seen": 120036335, + "step": 5590, + "time_per_iteration": 2.804384231567383 + }, + { + "auxiliary_loss_clip": 0.01107091, + "auxiliary_loss_mlp": 0.01044113, + "balance_loss_clip": 1.05176187, + "balance_loss_mlp": 1.02741194, + "epoch": 0.33614910566661654, + "flos": 19318109886720.0, + "grad_norm": 2.108159500929249, + "language_loss": 0.731767, + "learning_rate": 3.094754183798047e-06, + "loss": 0.75327909, + "num_input_tokens_seen": 120056120, + "step": 5591, + "time_per_iteration": 2.7423245906829834 + }, + { + "auxiliary_loss_clip": 0.01132777, + "auxiliary_loss_mlp": 0.01043438, + "balance_loss_clip": 1.04753232, + "balance_loss_mlp": 1.02802432, + "epoch": 0.3362092289192845, + "flos": 16472261191680.0, + "grad_norm": 2.4812698890164238, + "language_loss": 0.6978277, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.71958983, + "num_input_tokens_seen": 120073650, + "step": 5592, + "time_per_iteration": 2.624565362930298 + }, + { + "auxiliary_loss_clip": 0.01109265, + "auxiliary_loss_mlp": 0.01035799, + "balance_loss_clip": 1.0459764, + "balance_loss_mlp": 1.02034986, + "epoch": 0.33626935217195253, + "flos": 24243581329920.0, + "grad_norm": 2.2034044743639676, + "language_loss": 0.76362681, + "learning_rate": 3.094102230664423e-06, + "loss": 0.78507739, + "num_input_tokens_seen": 120093260, + "step": 5593, + "time_per_iteration": 2.7709946632385254 + }, + { + "auxiliary_loss_clip": 0.01100555, + "auxiliary_loss_mlp": 0.00775613, + "balance_loss_clip": 1.04247713, + "balance_loss_mlp": 1.00074506, + "epoch": 0.3363294754246205, + "flos": 19718765164800.0, + "grad_norm": 2.2856177577930876, + "language_loss": 0.7229932, + "learning_rate": 3.093776191858731e-06, + "loss": 0.74175489, + "num_input_tokens_seen": 120111830, + "step": 5594, + "time_per_iteration": 2.7880120277404785 + }, + { + "auxiliary_loss_clip": 0.01079557, + "auxiliary_loss_mlp": 0.00778898, + "balance_loss_clip": 1.04157269, + "balance_loss_mlp": 1.00079668, + "epoch": 0.33638959867728846, + "flos": 22596286677120.0, + "grad_norm": 3.2295215673950293, + "language_loss": 0.79940557, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.81799006, + "num_input_tokens_seen": 120130470, + "step": 5595, + "time_per_iteration": 2.8623924255371094 + }, + { + "auxiliary_loss_clip": 0.01111225, + "auxiliary_loss_mlp": 0.01039348, + "balance_loss_clip": 1.04694319, + "balance_loss_mlp": 1.02456045, + "epoch": 0.3364497219299564, + "flos": 20994742972800.0, + "grad_norm": 3.201033356603963, + "language_loss": 0.81473815, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.83624387, + "num_input_tokens_seen": 120150735, + "step": 5596, + "time_per_iteration": 2.900319814682007 + }, + { + "auxiliary_loss_clip": 0.01113286, + "auxiliary_loss_mlp": 0.01044516, + "balance_loss_clip": 1.04682481, + "balance_loss_mlp": 1.02877986, + "epoch": 0.3365098451826244, + "flos": 25228610974080.0, + "grad_norm": 1.642499178477658, + "language_loss": 0.75647599, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.778054, + "num_input_tokens_seen": 120173230, + "step": 5597, + "time_per_iteration": 2.8402984142303467 + }, + { + "auxiliary_loss_clip": 0.0112326, + "auxiliary_loss_mlp": 0.01034747, + "balance_loss_clip": 1.04734445, + "balance_loss_mlp": 1.01902318, + "epoch": 0.33656996843529235, + "flos": 24571697091840.0, + "grad_norm": 1.910742765655482, + "language_loss": 0.78611934, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.80769938, + "num_input_tokens_seen": 120191860, + "step": 5598, + "time_per_iteration": 2.7380945682525635 + }, + { + "auxiliary_loss_clip": 0.01141013, + "auxiliary_loss_mlp": 0.01041333, + "balance_loss_clip": 1.04969454, + "balance_loss_mlp": 1.0235827, + "epoch": 0.3366300916879603, + "flos": 44091120752640.0, + "grad_norm": 1.511676842650176, + "language_loss": 0.6446076, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.66643113, + "num_input_tokens_seen": 120219195, + "step": 5599, + "time_per_iteration": 2.845017433166504 + }, + { + "auxiliary_loss_clip": 0.01103042, + "auxiliary_loss_mlp": 0.01054079, + "balance_loss_clip": 1.04571164, + "balance_loss_mlp": 1.03408813, + "epoch": 0.3366902149406283, + "flos": 13879869840000.0, + "grad_norm": 3.0475721260430486, + "language_loss": 0.8262403, + "learning_rate": 3.091819088459249e-06, + "loss": 0.84781146, + "num_input_tokens_seen": 120232950, + "step": 5600, + "time_per_iteration": 2.690335512161255 + }, + { + "auxiliary_loss_clip": 0.01128117, + "auxiliary_loss_mlp": 0.01045257, + "balance_loss_clip": 1.04780042, + "balance_loss_mlp": 1.02822232, + "epoch": 0.33675033819329625, + "flos": 16253098358400.0, + "grad_norm": 2.4530209101601037, + "language_loss": 0.83457136, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.856305, + "num_input_tokens_seen": 120248865, + "step": 5601, + "time_per_iteration": 2.760735034942627 + }, + { + "auxiliary_loss_clip": 0.01122256, + "auxiliary_loss_mlp": 0.0103673, + "balance_loss_clip": 1.04873729, + "balance_loss_mlp": 1.02092862, + "epoch": 0.3368104614459642, + "flos": 17055809544960.0, + "grad_norm": 2.1704904083215903, + "language_loss": 0.83173311, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.85332292, + "num_input_tokens_seen": 120267820, + "step": 5602, + "time_per_iteration": 2.6818981170654297 + }, + { + "auxiliary_loss_clip": 0.0113558, + "auxiliary_loss_mlp": 0.01053921, + "balance_loss_clip": 1.04765427, + "balance_loss_mlp": 1.03756535, + "epoch": 0.3368705846986322, + "flos": 17858628472320.0, + "grad_norm": 3.8525391607572477, + "language_loss": 0.69046748, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.71236247, + "num_input_tokens_seen": 120286540, + "step": 5603, + "time_per_iteration": 2.6086158752441406 + }, + { + "auxiliary_loss_clip": 0.01116527, + "auxiliary_loss_mlp": 0.01042678, + "balance_loss_clip": 1.04876411, + "balance_loss_mlp": 1.02617979, + "epoch": 0.33693070795130015, + "flos": 22929502170240.0, + "grad_norm": 1.5388557517073465, + "language_loss": 0.83146536, + "learning_rate": 3.090513524656898e-06, + "loss": 0.85305738, + "num_input_tokens_seen": 120307305, + "step": 5604, + "time_per_iteration": 2.7269375324249268 + }, + { + "auxiliary_loss_clip": 0.01095396, + "auxiliary_loss_mlp": 0.01043597, + "balance_loss_clip": 1.04384422, + "balance_loss_mlp": 1.02708673, + "epoch": 0.3369908312039681, + "flos": 22017443005440.0, + "grad_norm": 1.634462052702842, + "language_loss": 0.73473096, + "learning_rate": 3.090187030294409e-06, + "loss": 0.75612092, + "num_input_tokens_seen": 120327845, + "step": 5605, + "time_per_iteration": 2.712197780609131 + }, + { + "auxiliary_loss_clip": 0.0111786, + "auxiliary_loss_mlp": 0.01038834, + "balance_loss_clip": 1.04761815, + "balance_loss_mlp": 1.02235925, + "epoch": 0.33705095445663613, + "flos": 11801970944640.0, + "grad_norm": 3.8834830456250913, + "language_loss": 0.83444858, + "learning_rate": 3.089860494591919e-06, + "loss": 0.85601556, + "num_input_tokens_seen": 120343255, + "step": 5606, + "time_per_iteration": 2.6680989265441895 + }, + { + "auxiliary_loss_clip": 0.01108557, + "auxiliary_loss_mlp": 0.01039061, + "balance_loss_clip": 1.04293787, + "balance_loss_mlp": 1.02370059, + "epoch": 0.3371110777093041, + "flos": 25046400257280.0, + "grad_norm": 2.0409696956182946, + "language_loss": 0.67694759, + "learning_rate": 3.089533917561809e-06, + "loss": 0.69842374, + "num_input_tokens_seen": 120361745, + "step": 5607, + "time_per_iteration": 2.8172407150268555 + }, + { + "auxiliary_loss_clip": 0.01121964, + "auxiliary_loss_mlp": 0.01053243, + "balance_loss_clip": 1.04604626, + "balance_loss_mlp": 1.03458667, + "epoch": 0.33717120096197206, + "flos": 26579031719040.0, + "grad_norm": 1.9822534609557965, + "language_loss": 0.70618403, + "learning_rate": 3.089207299216464e-06, + "loss": 0.72793615, + "num_input_tokens_seen": 120380565, + "step": 5608, + "time_per_iteration": 2.669027090072632 + }, + { + "auxiliary_loss_clip": 0.01055328, + "auxiliary_loss_mlp": 0.01040575, + "balance_loss_clip": 1.03931713, + "balance_loss_mlp": 1.02449393, + "epoch": 0.33723132421464, + "flos": 15158541168000.0, + "grad_norm": 1.931960515128334, + "language_loss": 0.79290974, + "learning_rate": 3.088880639568269e-06, + "loss": 0.81386876, + "num_input_tokens_seen": 120399235, + "step": 5609, + "time_per_iteration": 2.7859673500061035 + }, + { + "auxiliary_loss_clip": 0.01124996, + "auxiliary_loss_mlp": 0.01041459, + "balance_loss_clip": 1.04914641, + "balance_loss_mlp": 1.02387619, + "epoch": 0.337291447467308, + "flos": 23436093634560.0, + "grad_norm": 1.7580059679361764, + "language_loss": 0.82490408, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.8465687, + "num_input_tokens_seen": 120420095, + "step": 5610, + "time_per_iteration": 4.319208145141602 + }, + { + "auxiliary_loss_clip": 0.01123032, + "auxiliary_loss_mlp": 0.0104256, + "balance_loss_clip": 1.0486002, + "balance_loss_mlp": 1.02448845, + "epoch": 0.33735157071997596, + "flos": 17238163916160.0, + "grad_norm": 2.0228863025134824, + "language_loss": 0.82122159, + "learning_rate": 3.088227196412879e-06, + "loss": 0.84287751, + "num_input_tokens_seen": 120437690, + "step": 5611, + "time_per_iteration": 2.6127841472625732 + }, + { + "auxiliary_loss_clip": 0.01116485, + "auxiliary_loss_mlp": 0.01045036, + "balance_loss_clip": 1.04920387, + "balance_loss_mlp": 1.02683246, + "epoch": 0.3374116939726439, + "flos": 28257388657920.0, + "grad_norm": 2.0856936331065037, + "language_loss": 0.79704899, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.81866419, + "num_input_tokens_seen": 120459240, + "step": 5612, + "time_per_iteration": 2.7237493991851807 + }, + { + "auxiliary_loss_clip": 0.01076712, + "auxiliary_loss_mlp": 0.01040315, + "balance_loss_clip": 1.04079247, + "balance_loss_mlp": 1.02410221, + "epoch": 0.3374718172253119, + "flos": 35919396731520.0, + "grad_norm": 2.390785367991082, + "language_loss": 0.70200634, + "learning_rate": 3.087573588194753e-06, + "loss": 0.7231766, + "num_input_tokens_seen": 120481090, + "step": 5613, + "time_per_iteration": 4.43415379524231 + }, + { + "auxiliary_loss_clip": 0.01118495, + "auxiliary_loss_mlp": 0.01037291, + "balance_loss_clip": 1.04903054, + "balance_loss_mlp": 1.02097178, + "epoch": 0.33753194047797985, + "flos": 18186672407040.0, + "grad_norm": 2.1929626699857585, + "language_loss": 0.79407388, + "learning_rate": 3.087246722218144e-06, + "loss": 0.81563175, + "num_input_tokens_seen": 120500045, + "step": 5614, + "time_per_iteration": 2.6484436988830566 + }, + { + "auxiliary_loss_clip": 0.01105902, + "auxiliary_loss_mlp": 0.01046863, + "balance_loss_clip": 1.04512811, + "balance_loss_mlp": 1.02796841, + "epoch": 0.3375920637306478, + "flos": 23148916398720.0, + "grad_norm": 1.967540834348034, + "language_loss": 0.91201901, + "learning_rate": 3.086919815013031e-06, + "loss": 0.93354666, + "num_input_tokens_seen": 120521125, + "step": 5615, + "time_per_iteration": 4.486853361129761 + }, + { + "auxiliary_loss_clip": 0.01119294, + "auxiliary_loss_mlp": 0.01042109, + "balance_loss_clip": 1.04542458, + "balance_loss_mlp": 1.0265168, + "epoch": 0.3376521869833158, + "flos": 23112215677440.0, + "grad_norm": 2.688104519924193, + "language_loss": 0.80865037, + "learning_rate": 3.086592866591809e-06, + "loss": 0.83026439, + "num_input_tokens_seen": 120539180, + "step": 5616, + "time_per_iteration": 2.693419933319092 + }, + { + "auxiliary_loss_clip": 0.01132102, + "auxiliary_loss_mlp": 0.00776249, + "balance_loss_clip": 1.04987526, + "balance_loss_mlp": 1.00074387, + "epoch": 0.33771231023598375, + "flos": 19274585581440.0, + "grad_norm": 5.641479508637021, + "language_loss": 0.83967853, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.85876203, + "num_input_tokens_seen": 120556280, + "step": 5617, + "time_per_iteration": 4.261611461639404 + }, + { + "auxiliary_loss_clip": 0.01065047, + "auxiliary_loss_mlp": 0.01048039, + "balance_loss_clip": 1.0423851, + "balance_loss_mlp": 1.030074, + "epoch": 0.3377724334886517, + "flos": 18150187167360.0, + "grad_norm": 2.2609860925126117, + "language_loss": 0.80159199, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.82272285, + "num_input_tokens_seen": 120575395, + "step": 5618, + "time_per_iteration": 2.8115389347076416 + }, + { + "auxiliary_loss_clip": 0.01092947, + "auxiliary_loss_mlp": 0.01037796, + "balance_loss_clip": 1.04605365, + "balance_loss_mlp": 1.02121353, + "epoch": 0.3378325567413197, + "flos": 25775997310080.0, + "grad_norm": 1.9598490702889584, + "language_loss": 0.7111814, + "learning_rate": 3.085611774155481e-06, + "loss": 0.73248887, + "num_input_tokens_seen": 120596075, + "step": 5619, + "time_per_iteration": 2.86958909034729 + }, + { + "auxiliary_loss_clip": 0.01116213, + "auxiliary_loss_mlp": 0.01047745, + "balance_loss_clip": 1.04749656, + "balance_loss_mlp": 1.03167593, + "epoch": 0.3378926799939877, + "flos": 21317112558720.0, + "grad_norm": 2.630730252639156, + "language_loss": 0.70144761, + "learning_rate": 3.085284660993821e-06, + "loss": 0.72308713, + "num_input_tokens_seen": 120614195, + "step": 5620, + "time_per_iteration": 2.6953368186950684 + }, + { + "auxiliary_loss_clip": 0.01136416, + "auxiliary_loss_mlp": 0.01047216, + "balance_loss_clip": 1.05076015, + "balance_loss_mlp": 1.03201699, + "epoch": 0.33795280324665566, + "flos": 24900028335360.0, + "grad_norm": 1.8373178803043773, + "language_loss": 0.67899036, + "learning_rate": 3.084957506678058e-06, + "loss": 0.70082676, + "num_input_tokens_seen": 120634475, + "step": 5621, + "time_per_iteration": 2.6531872749328613 + }, + { + "auxiliary_loss_clip": 0.0110792, + "auxiliary_loss_mlp": 0.01044445, + "balance_loss_clip": 1.04716897, + "balance_loss_mlp": 1.02814865, + "epoch": 0.33801292649932363, + "flos": 24753943722240.0, + "grad_norm": 1.7693089540657438, + "language_loss": 0.82862681, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.85015041, + "num_input_tokens_seen": 120654980, + "step": 5622, + "time_per_iteration": 2.7764267921447754 + }, + { + "auxiliary_loss_clip": 0.01097036, + "auxiliary_loss_mlp": 0.01041227, + "balance_loss_clip": 1.043239, + "balance_loss_mlp": 1.02565813, + "epoch": 0.3380730497519916, + "flos": 26723967096960.0, + "grad_norm": 7.015051283901371, + "language_loss": 0.73815429, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.75953692, + "num_input_tokens_seen": 120676245, + "step": 5623, + "time_per_iteration": 2.7962961196899414 + }, + { + "auxiliary_loss_clip": 0.0104645, + "auxiliary_loss_mlp": 0.01031816, + "balance_loss_clip": 1.03514934, + "balance_loss_mlp": 1.0298605, + "epoch": 0.33813317300465956, + "flos": 70035756416640.0, + "grad_norm": 0.757644747116446, + "language_loss": 0.55002284, + "learning_rate": 3.083975796930215e-06, + "loss": 0.57080543, + "num_input_tokens_seen": 120741965, + "step": 5624, + "time_per_iteration": 3.3495559692382812 + }, + { + "auxiliary_loss_clip": 0.01091887, + "auxiliary_loss_mlp": 0.01055525, + "balance_loss_clip": 1.04508519, + "balance_loss_mlp": 1.03704786, + "epoch": 0.3381932962573275, + "flos": 24097317148800.0, + "grad_norm": 3.1490866232839876, + "language_loss": 0.73299229, + "learning_rate": 3.083648478122111e-06, + "loss": 0.75446641, + "num_input_tokens_seen": 120760410, + "step": 5625, + "time_per_iteration": 2.7474253177642822 + }, + { + "auxiliary_loss_clip": 0.01127839, + "auxiliary_loss_mlp": 0.01045252, + "balance_loss_clip": 1.04838002, + "balance_loss_mlp": 1.02828884, + "epoch": 0.3382534195099955, + "flos": 19278248768640.0, + "grad_norm": 5.828984180477566, + "language_loss": 0.70578009, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.72751105, + "num_input_tokens_seen": 120777705, + "step": 5626, + "time_per_iteration": 2.6597115993499756 + }, + { + "auxiliary_loss_clip": 0.01108172, + "auxiliary_loss_mlp": 0.01041744, + "balance_loss_clip": 1.04509664, + "balance_loss_mlp": 1.02605569, + "epoch": 0.33831354276266346, + "flos": 25226240676480.0, + "grad_norm": 3.2927176036830574, + "language_loss": 0.80853224, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.83003139, + "num_input_tokens_seen": 120798660, + "step": 5627, + "time_per_iteration": 2.730774402618408 + }, + { + "auxiliary_loss_clip": 0.01131612, + "auxiliary_loss_mlp": 0.0077564, + "balance_loss_clip": 1.05286694, + "balance_loss_mlp": 1.00064421, + "epoch": 0.3383736660153314, + "flos": 23112000195840.0, + "grad_norm": 2.306116347111899, + "language_loss": 0.80454439, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.82361686, + "num_input_tokens_seen": 120816705, + "step": 5628, + "time_per_iteration": 2.691471576690674 + }, + { + "auxiliary_loss_clip": 0.01080566, + "auxiliary_loss_mlp": 0.01046147, + "balance_loss_clip": 1.04250276, + "balance_loss_mlp": 1.02787185, + "epoch": 0.3384337892679994, + "flos": 23477139901440.0, + "grad_norm": 3.64262689820424, + "language_loss": 0.77174091, + "learning_rate": 3.082338792093254e-06, + "loss": 0.79300809, + "num_input_tokens_seen": 120835375, + "step": 5629, + "time_per_iteration": 2.7564992904663086 + }, + { + "auxiliary_loss_clip": 0.01116368, + "auxiliary_loss_mlp": 0.01046104, + "balance_loss_clip": 1.04699719, + "balance_loss_mlp": 1.02819836, + "epoch": 0.33849391252066735, + "flos": 19425805839360.0, + "grad_norm": 4.669184863549949, + "language_loss": 0.84738326, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.86900795, + "num_input_tokens_seen": 120854260, + "step": 5630, + "time_per_iteration": 2.7284910678863525 + }, + { + "auxiliary_loss_clip": 0.0108732, + "auxiliary_loss_mlp": 0.01055965, + "balance_loss_clip": 1.04692125, + "balance_loss_mlp": 1.03889382, + "epoch": 0.3385540357733353, + "flos": 21064840364160.0, + "grad_norm": 2.0951078731071204, + "language_loss": 0.71627271, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.73770559, + "num_input_tokens_seen": 120871590, + "step": 5631, + "time_per_iteration": 2.7423501014709473 + }, + { + "auxiliary_loss_clip": 0.01036653, + "auxiliary_loss_mlp": 0.01008716, + "balance_loss_clip": 1.02691352, + "balance_loss_mlp": 1.00683236, + "epoch": 0.3386141590260033, + "flos": 69208013450880.0, + "grad_norm": 0.8383263502294551, + "language_loss": 0.56103444, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58148813, + "num_input_tokens_seen": 120925550, + "step": 5632, + "time_per_iteration": 3.24780535697937 + }, + { + "auxiliary_loss_clip": 0.01122742, + "auxiliary_loss_mlp": 0.01038822, + "balance_loss_clip": 1.05064476, + "balance_loss_mlp": 1.02198935, + "epoch": 0.3386742822786713, + "flos": 25519487310720.0, + "grad_norm": 1.5341010429525646, + "language_loss": 0.80410492, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.82572055, + "num_input_tokens_seen": 120947620, + "step": 5633, + "time_per_iteration": 2.6492738723754883 + }, + { + "auxiliary_loss_clip": 0.01099799, + "auxiliary_loss_mlp": 0.01044702, + "balance_loss_clip": 1.04435778, + "balance_loss_mlp": 1.02854943, + "epoch": 0.33873440553133927, + "flos": 23623116773760.0, + "grad_norm": 2.1401050060877997, + "language_loss": 0.59013391, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.61157894, + "num_input_tokens_seen": 120965205, + "step": 5634, + "time_per_iteration": 2.7261369228363037 + }, + { + "auxiliary_loss_clip": 0.01106157, + "auxiliary_loss_mlp": 0.01040516, + "balance_loss_clip": 1.04877985, + "balance_loss_mlp": 1.02482784, + "epoch": 0.33879452878400723, + "flos": 17088882992640.0, + "grad_norm": 1.8243057386875807, + "language_loss": 0.92440355, + "learning_rate": 3.080373032026589e-06, + "loss": 0.94587028, + "num_input_tokens_seen": 120983560, + "step": 5635, + "time_per_iteration": 2.627788782119751 + }, + { + "auxiliary_loss_clip": 0.01091476, + "auxiliary_loss_mlp": 0.01039192, + "balance_loss_clip": 1.05005646, + "balance_loss_mlp": 1.02288401, + "epoch": 0.3388546520366752, + "flos": 15742053607680.0, + "grad_norm": 2.00681285666687, + "language_loss": 0.75539577, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.7767024, + "num_input_tokens_seen": 121001400, + "step": 5636, + "time_per_iteration": 2.706772565841675 + }, + { + "auxiliary_loss_clip": 0.0112617, + "auxiliary_loss_mlp": 0.01044921, + "balance_loss_clip": 1.05089188, + "balance_loss_mlp": 1.02866137, + "epoch": 0.33891477528934316, + "flos": 22418744728320.0, + "grad_norm": 1.7127540900641318, + "language_loss": 0.83448696, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.85619783, + "num_input_tokens_seen": 121021760, + "step": 5637, + "time_per_iteration": 2.6864166259765625 + }, + { + "auxiliary_loss_clip": 0.0109052, + "auxiliary_loss_mlp": 0.01051499, + "balance_loss_clip": 1.04899251, + "balance_loss_mlp": 1.03193665, + "epoch": 0.3389748985420111, + "flos": 17274828723840.0, + "grad_norm": 1.650296659926583, + "language_loss": 0.70123053, + "learning_rate": 3.079389598759495e-06, + "loss": 0.72265071, + "num_input_tokens_seen": 121041070, + "step": 5638, + "time_per_iteration": 2.7513418197631836 + }, + { + "auxiliary_loss_clip": 0.01107421, + "auxiliary_loss_mlp": 0.01049541, + "balance_loss_clip": 1.0486834, + "balance_loss_mlp": 1.0325892, + "epoch": 0.3390350217946791, + "flos": 27744979190400.0, + "grad_norm": 3.471125425253904, + "language_loss": 0.80819786, + "learning_rate": 3.079061705792765e-06, + "loss": 0.82976747, + "num_input_tokens_seen": 121060890, + "step": 5639, + "time_per_iteration": 2.8025810718536377 + }, + { + "auxiliary_loss_clip": 0.01143398, + "auxiliary_loss_mlp": 0.01048836, + "balance_loss_clip": 1.0533762, + "balance_loss_mlp": 1.03158689, + "epoch": 0.33909514504734706, + "flos": 20339804338560.0, + "grad_norm": 8.162571098362656, + "language_loss": 0.67619336, + "learning_rate": 3.078733771907907e-06, + "loss": 0.69811565, + "num_input_tokens_seen": 121079135, + "step": 5640, + "time_per_iteration": 2.662127733230591 + }, + { + "auxiliary_loss_clip": 0.01114186, + "auxiliary_loss_mlp": 0.01038526, + "balance_loss_clip": 1.04930854, + "balance_loss_mlp": 1.02196789, + "epoch": 0.339155268300015, + "flos": 14830030356480.0, + "grad_norm": 1.6687164879604648, + "language_loss": 0.69589841, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.71742553, + "num_input_tokens_seen": 121097685, + "step": 5641, + "time_per_iteration": 2.6596109867095947 + }, + { + "auxiliary_loss_clip": 0.01142481, + "auxiliary_loss_mlp": 0.0104296, + "balance_loss_clip": 1.05451512, + "balance_loss_mlp": 1.02698565, + "epoch": 0.339215391552683, + "flos": 26067951054720.0, + "grad_norm": 2.4357287647671266, + "language_loss": 0.87591994, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.89777428, + "num_input_tokens_seen": 121115640, + "step": 5642, + "time_per_iteration": 2.6347198486328125 + }, + { + "auxiliary_loss_clip": 0.01117312, + "auxiliary_loss_mlp": 0.01034931, + "balance_loss_clip": 1.04759669, + "balance_loss_mlp": 1.02112639, + "epoch": 0.33927551480535095, + "flos": 14574705505920.0, + "grad_norm": 1.860184080586481, + "language_loss": 0.83900917, + "learning_rate": 3.077749724868924e-06, + "loss": 0.86053157, + "num_input_tokens_seen": 121132485, + "step": 5643, + "time_per_iteration": 2.678086042404175 + }, + { + "auxiliary_loss_clip": 0.01107188, + "auxiliary_loss_mlp": 0.01049417, + "balance_loss_clip": 1.04616475, + "balance_loss_mlp": 1.03295422, + "epoch": 0.3393356380580189, + "flos": 23805578885760.0, + "grad_norm": 4.293096130940915, + "language_loss": 0.76897138, + "learning_rate": 3.077421627435922e-06, + "loss": 0.79053748, + "num_input_tokens_seen": 121152935, + "step": 5644, + "time_per_iteration": 2.6681976318359375 + }, + { + "auxiliary_loss_clip": 0.01123, + "auxiliary_loss_mlp": 0.01046638, + "balance_loss_clip": 1.05055666, + "balance_loss_mlp": 1.02978194, + "epoch": 0.3393957613106869, + "flos": 17347871030400.0, + "grad_norm": 8.889141309374795, + "language_loss": 0.62855232, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.65024871, + "num_input_tokens_seen": 121169835, + "step": 5645, + "time_per_iteration": 2.5976576805114746 + }, + { + "auxiliary_loss_clip": 0.01123901, + "auxiliary_loss_mlp": 0.01042398, + "balance_loss_clip": 1.04963613, + "balance_loss_mlp": 1.0272944, + "epoch": 0.3394558845633549, + "flos": 28433960939520.0, + "grad_norm": 1.8158202042065192, + "language_loss": 0.76223624, + "learning_rate": 3.076765310014552e-06, + "loss": 0.78389925, + "num_input_tokens_seen": 121190290, + "step": 5646, + "time_per_iteration": 2.674058437347412 + }, + { + "auxiliary_loss_clip": 0.01128511, + "auxiliary_loss_mlp": 0.01049927, + "balance_loss_clip": 1.05314088, + "balance_loss_mlp": 1.03245091, + "epoch": 0.33951600781602287, + "flos": 22086929865600.0, + "grad_norm": 2.6597837481337256, + "language_loss": 0.78888249, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.81066692, + "num_input_tokens_seen": 121209060, + "step": 5647, + "time_per_iteration": 2.636462688446045 + }, + { + "auxiliary_loss_clip": 0.01113432, + "auxiliary_loss_mlp": 0.0077397, + "balance_loss_clip": 1.05254745, + "balance_loss_mlp": 1.00053275, + "epoch": 0.33957613106869083, + "flos": 23878262056320.0, + "grad_norm": 2.0563114900155037, + "language_loss": 0.77694631, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.7958203, + "num_input_tokens_seen": 121227480, + "step": 5648, + "time_per_iteration": 2.704535484313965 + }, + { + "auxiliary_loss_clip": 0.00999132, + "auxiliary_loss_mlp": 0.01023587, + "balance_loss_clip": 1.03748918, + "balance_loss_mlp": 1.02168012, + "epoch": 0.3396362543213588, + "flos": 71242642414080.0, + "grad_norm": 0.7822172669689142, + "language_loss": 0.56281364, + "learning_rate": 3.075780527680754e-06, + "loss": 0.58304083, + "num_input_tokens_seen": 121291305, + "step": 5649, + "time_per_iteration": 3.6428561210632324 + }, + { + "auxiliary_loss_clip": 0.01109513, + "auxiliary_loss_mlp": 0.00776659, + "balance_loss_clip": 1.04886901, + "balance_loss_mlp": 1.00053644, + "epoch": 0.33969637757402676, + "flos": 25921615046400.0, + "grad_norm": 1.4990429944851429, + "language_loss": 0.85522908, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.87409085, + "num_input_tokens_seen": 121312740, + "step": 5650, + "time_per_iteration": 4.6250996589660645 + }, + { + "auxiliary_loss_clip": 0.01125063, + "auxiliary_loss_mlp": 0.01029114, + "balance_loss_clip": 1.04845572, + "balance_loss_mlp": 1.01392674, + "epoch": 0.33975650082669473, + "flos": 35261728663680.0, + "grad_norm": 1.7009103293103713, + "language_loss": 0.70462626, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.7261681, + "num_input_tokens_seen": 121334220, + "step": 5651, + "time_per_iteration": 3.0873425006866455 + }, + { + "auxiliary_loss_clip": 0.01088353, + "auxiliary_loss_mlp": 0.01041459, + "balance_loss_clip": 1.04718101, + "balance_loss_mlp": 1.02539587, + "epoch": 0.3398166240793627, + "flos": 16647001879680.0, + "grad_norm": 2.657059560006321, + "language_loss": 0.80932343, + "learning_rate": 3.074795378203616e-06, + "loss": 0.83062148, + "num_input_tokens_seen": 121351870, + "step": 5652, + "time_per_iteration": 2.957105875015259 + }, + { + "auxiliary_loss_clip": 0.01143187, + "auxiliary_loss_mlp": 0.0104477, + "balance_loss_clip": 1.05543184, + "balance_loss_mlp": 1.0275445, + "epoch": 0.33987674733203066, + "flos": 24062196625920.0, + "grad_norm": 2.181969038816262, + "language_loss": 0.76847494, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.79035449, + "num_input_tokens_seen": 121373400, + "step": 5653, + "time_per_iteration": 4.277743816375732 + }, + { + "auxiliary_loss_clip": 0.01117346, + "auxiliary_loss_mlp": 0.01041107, + "balance_loss_clip": 1.04708898, + "balance_loss_mlp": 1.02475142, + "epoch": 0.3399368705846986, + "flos": 13250678279040.0, + "grad_norm": 2.9108557214850217, + "language_loss": 0.85412633, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.8757109, + "num_input_tokens_seen": 121385225, + "step": 5654, + "time_per_iteration": 4.243285179138184 + }, + { + "auxiliary_loss_clip": 0.01118111, + "auxiliary_loss_mlp": 0.01041226, + "balance_loss_clip": 1.04521537, + "balance_loss_mlp": 1.02490664, + "epoch": 0.3399969938373666, + "flos": 27012832272000.0, + "grad_norm": 5.5024852924346765, + "language_loss": 0.64919531, + "learning_rate": 3.073809861919351e-06, + "loss": 0.67078876, + "num_input_tokens_seen": 121404735, + "step": 5655, + "time_per_iteration": 2.793121576309204 + }, + { + "auxiliary_loss_clip": 0.01129599, + "auxiliary_loss_mlp": 0.01043607, + "balance_loss_clip": 1.05404055, + "balance_loss_mlp": 1.02828872, + "epoch": 0.34005711709003456, + "flos": 28550096588160.0, + "grad_norm": 1.7231624830718477, + "language_loss": 0.7624622, + "learning_rate": 3.073481275036697e-06, + "loss": 0.78419423, + "num_input_tokens_seen": 121426780, + "step": 5656, + "time_per_iteration": 2.739227056503296 + }, + { + "auxiliary_loss_clip": 0.01102847, + "auxiliary_loss_mlp": 0.01040319, + "balance_loss_clip": 1.0458467, + "balance_loss_mlp": 1.02364159, + "epoch": 0.3401172403427025, + "flos": 21617003208960.0, + "grad_norm": 8.964185236965056, + "language_loss": 0.82842731, + "learning_rate": 3.073152647447525e-06, + "loss": 0.849859, + "num_input_tokens_seen": 121447245, + "step": 5657, + "time_per_iteration": 5.179774761199951 + }, + { + "auxiliary_loss_clip": 0.01113742, + "auxiliary_loss_mlp": 0.01048481, + "balance_loss_clip": 1.05169284, + "balance_loss_mlp": 1.03313899, + "epoch": 0.3401773635953705, + "flos": 25885776251520.0, + "grad_norm": 1.8385093437954252, + "language_loss": 0.85050905, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.87213123, + "num_input_tokens_seen": 121468165, + "step": 5658, + "time_per_iteration": 2.776137351989746 + }, + { + "auxiliary_loss_clip": 0.01053106, + "auxiliary_loss_mlp": 0.01016184, + "balance_loss_clip": 1.03449082, + "balance_loss_mlp": 1.01424086, + "epoch": 0.3402374868480385, + "flos": 65507995336320.0, + "grad_norm": 0.825209949556337, + "language_loss": 0.59988189, + "learning_rate": 3.072495270199477e-06, + "loss": 0.62057471, + "num_input_tokens_seen": 121523795, + "step": 5659, + "time_per_iteration": 3.272684335708618 + }, + { + "auxiliary_loss_clip": 0.01137862, + "auxiliary_loss_mlp": 0.01036085, + "balance_loss_clip": 1.05531621, + "balance_loss_mlp": 1.02102888, + "epoch": 0.34029761010070647, + "flos": 24060580513920.0, + "grad_norm": 2.521681543348545, + "language_loss": 0.67763948, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.69937897, + "num_input_tokens_seen": 121542950, + "step": 5660, + "time_per_iteration": 2.699267864227295 + }, + { + "auxiliary_loss_clip": 0.01142235, + "auxiliary_loss_mlp": 0.010443, + "balance_loss_clip": 1.05695057, + "balance_loss_mlp": 1.02787328, + "epoch": 0.34035773335337444, + "flos": 27599720590080.0, + "grad_norm": 1.9299535220965447, + "language_loss": 0.67668259, + "learning_rate": 3.071837730274918e-06, + "loss": 0.69854796, + "num_input_tokens_seen": 121562765, + "step": 5661, + "time_per_iteration": 2.647101402282715 + }, + { + "auxiliary_loss_clip": 0.01119112, + "auxiliary_loss_mlp": 0.01041902, + "balance_loss_clip": 1.05479288, + "balance_loss_mlp": 1.02634561, + "epoch": 0.3404178566060424, + "flos": 20812783651200.0, + "grad_norm": 2.0521689983251954, + "language_loss": 0.78806192, + "learning_rate": 3.071508899340113e-06, + "loss": 0.80967206, + "num_input_tokens_seen": 121581610, + "step": 5662, + "time_per_iteration": 2.847168207168579 + }, + { + "auxiliary_loss_clip": 0.01103563, + "auxiliary_loss_mlp": 0.01041962, + "balance_loss_clip": 1.05163002, + "balance_loss_mlp": 1.02498698, + "epoch": 0.34047797985871037, + "flos": 26833566470400.0, + "grad_norm": 2.226848836482441, + "language_loss": 0.73531127, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.75676656, + "num_input_tokens_seen": 121601885, + "step": 5663, + "time_per_iteration": 2.8581340312957764 + }, + { + "auxiliary_loss_clip": 0.01090462, + "auxiliary_loss_mlp": 0.01035271, + "balance_loss_clip": 1.04631042, + "balance_loss_mlp": 1.02079868, + "epoch": 0.34053810311137833, + "flos": 19682639061120.0, + "grad_norm": 1.7108226041633658, + "language_loss": 0.86297357, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.88423085, + "num_input_tokens_seen": 121621335, + "step": 5664, + "time_per_iteration": 2.778038501739502 + }, + { + "auxiliary_loss_clip": 0.01139377, + "auxiliary_loss_mlp": 0.0103938, + "balance_loss_clip": 1.05399597, + "balance_loss_mlp": 1.0245564, + "epoch": 0.3405982263640463, + "flos": 21725740656000.0, + "grad_norm": 2.2398696420560675, + "language_loss": 0.68712831, + "learning_rate": 3.070522162795235e-06, + "loss": 0.70891583, + "num_input_tokens_seen": 121641310, + "step": 5665, + "time_per_iteration": 2.688643217086792 + }, + { + "auxiliary_loss_clip": 0.01138662, + "auxiliary_loss_mlp": 0.01039766, + "balance_loss_clip": 1.05278993, + "balance_loss_mlp": 1.0229218, + "epoch": 0.34065834961671426, + "flos": 18041629288320.0, + "grad_norm": 2.716291820837314, + "language_loss": 0.73084486, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.7526291, + "num_input_tokens_seen": 121659625, + "step": 5666, + "time_per_iteration": 2.7325544357299805 + }, + { + "auxiliary_loss_clip": 0.01128915, + "auxiliary_loss_mlp": 0.01039671, + "balance_loss_clip": 1.05135012, + "balance_loss_mlp": 1.0244832, + "epoch": 0.3407184728693822, + "flos": 21397337585280.0, + "grad_norm": 2.363121461769924, + "language_loss": 0.72947341, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.75115931, + "num_input_tokens_seen": 121679205, + "step": 5667, + "time_per_iteration": 2.7143874168395996 + }, + { + "auxiliary_loss_clip": 0.01042137, + "auxiliary_loss_mlp": 0.01008076, + "balance_loss_clip": 1.02401757, + "balance_loss_mlp": 1.00638342, + "epoch": 0.3407785961220502, + "flos": 68688101018880.0, + "grad_norm": 0.8313790259289849, + "language_loss": 0.63259363, + "learning_rate": 3.069535060901597e-06, + "loss": 0.65309572, + "num_input_tokens_seen": 121751085, + "step": 5668, + "time_per_iteration": 3.3907217979431152 + }, + { + "auxiliary_loss_clip": 0.01036989, + "auxiliary_loss_mlp": 0.01045108, + "balance_loss_clip": 1.03961444, + "balance_loss_mlp": 1.02808475, + "epoch": 0.34083871937471816, + "flos": 14064379027200.0, + "grad_norm": 2.2447075161594365, + "language_loss": 0.71795446, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.73877549, + "num_input_tokens_seen": 121768565, + "step": 5669, + "time_per_iteration": 2.941349983215332 + }, + { + "auxiliary_loss_clip": 0.0110323, + "auxiliary_loss_mlp": 0.00773367, + "balance_loss_clip": 1.04966998, + "balance_loss_mlp": 1.00054646, + "epoch": 0.3408988426273861, + "flos": 17085435287040.0, + "grad_norm": 1.973306725053756, + "language_loss": 0.80678529, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.82555127, + "num_input_tokens_seen": 121784925, + "step": 5670, + "time_per_iteration": 2.8877930641174316 + }, + { + "auxiliary_loss_clip": 0.01088488, + "auxiliary_loss_mlp": 0.01037182, + "balance_loss_clip": 1.04484558, + "balance_loss_mlp": 1.02111244, + "epoch": 0.3409589658800541, + "flos": 24024562151040.0, + "grad_norm": 1.926244069219147, + "language_loss": 0.77521646, + "learning_rate": 3.068547593996078e-06, + "loss": 0.79647315, + "num_input_tokens_seen": 121804425, + "step": 5671, + "time_per_iteration": 2.886425256729126 + }, + { + "auxiliary_loss_clip": 0.01138739, + "auxiliary_loss_mlp": 0.0077388, + "balance_loss_clip": 1.05301285, + "balance_loss_mlp": 1.00052333, + "epoch": 0.34101908913272205, + "flos": 21142012734720.0, + "grad_norm": 3.7152219569219427, + "language_loss": 0.74220848, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.76133466, + "num_input_tokens_seen": 121825145, + "step": 5672, + "time_per_iteration": 2.751692056655884 + }, + { + "auxiliary_loss_clip": 0.01121109, + "auxiliary_loss_mlp": 0.01047405, + "balance_loss_clip": 1.04886246, + "balance_loss_mlp": 1.03089476, + "epoch": 0.3410792123853901, + "flos": 15702012921600.0, + "grad_norm": 1.8011032028958165, + "language_loss": 0.73721337, + "learning_rate": 3.06788908010777e-06, + "loss": 0.7588985, + "num_input_tokens_seen": 121842185, + "step": 5673, + "time_per_iteration": 2.6628050804138184 + }, + { + "auxiliary_loss_clip": 0.01126244, + "auxiliary_loss_mlp": 0.01038975, + "balance_loss_clip": 1.05143654, + "balance_loss_mlp": 1.02362132, + "epoch": 0.34113933563805804, + "flos": 23036012974080.0, + "grad_norm": 1.7591090628800392, + "language_loss": 0.79972708, + "learning_rate": 3.067559762415682e-06, + "loss": 0.8213793, + "num_input_tokens_seen": 121862260, + "step": 5674, + "time_per_iteration": 2.6803476810455322 + }, + { + "auxiliary_loss_clip": 0.01054856, + "auxiliary_loss_mlp": 0.01001466, + "balance_loss_clip": 1.0258925, + "balance_loss_mlp": 0.9994635, + "epoch": 0.341199458890726, + "flos": 69614235336960.0, + "grad_norm": 0.7875282266281167, + "language_loss": 0.56080592, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.5813691, + "num_input_tokens_seen": 121923560, + "step": 5675, + "time_per_iteration": 3.3068313598632812 + }, + { + "auxiliary_loss_clip": 0.01115956, + "auxiliary_loss_mlp": 0.00773448, + "balance_loss_clip": 1.052145, + "balance_loss_mlp": 1.0006851, + "epoch": 0.34125958214339397, + "flos": 22346348866560.0, + "grad_norm": 1.6444328441844458, + "language_loss": 0.78795338, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.80684733, + "num_input_tokens_seen": 121943515, + "step": 5676, + "time_per_iteration": 2.7983739376068115 + }, + { + "auxiliary_loss_clip": 0.01120251, + "auxiliary_loss_mlp": 0.01036846, + "balance_loss_clip": 1.04593658, + "balance_loss_mlp": 1.02024043, + "epoch": 0.34131970539606193, + "flos": 21871933009920.0, + "grad_norm": 1.8897537275348075, + "language_loss": 0.85468972, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.8762607, + "num_input_tokens_seen": 121962540, + "step": 5677, + "time_per_iteration": 2.698751449584961 + }, + { + "auxiliary_loss_clip": 0.01109896, + "auxiliary_loss_mlp": 0.01042182, + "balance_loss_clip": 1.04772925, + "balance_loss_mlp": 1.02586842, + "epoch": 0.3413798286487299, + "flos": 24935723475840.0, + "grad_norm": 1.7514589696636707, + "language_loss": 0.79352021, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.81504107, + "num_input_tokens_seen": 121979830, + "step": 5678, + "time_per_iteration": 2.731834650039673 + }, + { + "auxiliary_loss_clip": 0.01123477, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.04799783, + "balance_loss_mlp": 1.01833677, + "epoch": 0.34143995190139786, + "flos": 25374372364800.0, + "grad_norm": 1.8765190883227818, + "language_loss": 0.74821675, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.76978606, + "num_input_tokens_seen": 121999055, + "step": 5679, + "time_per_iteration": 2.7362489700317383 + }, + { + "auxiliary_loss_clip": 0.01044772, + "auxiliary_loss_mlp": 0.01004164, + "balance_loss_clip": 1.02617037, + "balance_loss_mlp": 1.00210214, + "epoch": 0.34150007515406583, + "flos": 67782578129280.0, + "grad_norm": 0.716476818724812, + "language_loss": 0.59445524, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.61494464, + "num_input_tokens_seen": 122067015, + "step": 5680, + "time_per_iteration": 3.241750955581665 + }, + { + "auxiliary_loss_clip": 0.01108333, + "auxiliary_loss_mlp": 0.01032851, + "balance_loss_clip": 1.04563892, + "balance_loss_mlp": 1.01804543, + "epoch": 0.3415601984067338, + "flos": 20302421258880.0, + "grad_norm": 1.760771174406363, + "language_loss": 0.72054088, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.74195278, + "num_input_tokens_seen": 122085295, + "step": 5681, + "time_per_iteration": 2.7306556701660156 + }, + { + "auxiliary_loss_clip": 0.01109003, + "auxiliary_loss_mlp": 0.0104301, + "balance_loss_clip": 1.0462265, + "balance_loss_mlp": 1.02786994, + "epoch": 0.34162032165940176, + "flos": 26031178506240.0, + "grad_norm": 2.2327180896030443, + "language_loss": 0.71463466, + "learning_rate": 3.064923764577233e-06, + "loss": 0.73615474, + "num_input_tokens_seen": 122104020, + "step": 5682, + "time_per_iteration": 2.825296640396118 + }, + { + "auxiliary_loss_clip": 0.01132395, + "auxiliary_loss_mlp": 0.0104079, + "balance_loss_clip": 1.04721618, + "balance_loss_mlp": 1.02507806, + "epoch": 0.3416804449120697, + "flos": 28803338449920.0, + "grad_norm": 1.5426603390069147, + "language_loss": 0.84101224, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.86274409, + "num_input_tokens_seen": 122125080, + "step": 5683, + "time_per_iteration": 2.6654412746429443 + }, + { + "auxiliary_loss_clip": 0.01112942, + "auxiliary_loss_mlp": 0.01047099, + "balance_loss_clip": 1.04768562, + "balance_loss_mlp": 1.03113699, + "epoch": 0.3417405681647377, + "flos": 22601601889920.0, + "grad_norm": 4.046428716645244, + "language_loss": 0.70964772, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.73124808, + "num_input_tokens_seen": 122146350, + "step": 5684, + "time_per_iteration": 2.724592924118042 + }, + { + "auxiliary_loss_clip": 0.01132202, + "auxiliary_loss_mlp": 0.01038054, + "balance_loss_clip": 1.04905093, + "balance_loss_mlp": 1.02367699, + "epoch": 0.34180069141740566, + "flos": 24716237420160.0, + "grad_norm": 1.9204482618269598, + "language_loss": 0.74832582, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.77002841, + "num_input_tokens_seen": 122168085, + "step": 5685, + "time_per_iteration": 2.7046890258789062 + }, + { + "auxiliary_loss_clip": 0.01114777, + "auxiliary_loss_mlp": 0.0104831, + "balance_loss_clip": 1.04522872, + "balance_loss_mlp": 1.03261042, + "epoch": 0.3418608146700737, + "flos": 30518755246080.0, + "grad_norm": 1.9200820074556442, + "language_loss": 0.70611888, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.72774971, + "num_input_tokens_seen": 122191040, + "step": 5686, + "time_per_iteration": 2.7390410900115967 + }, + { + "auxiliary_loss_clip": 0.01123208, + "auxiliary_loss_mlp": 0.01044107, + "balance_loss_clip": 1.04809284, + "balance_loss_mlp": 1.02819252, + "epoch": 0.34192093792274164, + "flos": 15122343237120.0, + "grad_norm": 2.0197354521106563, + "language_loss": 0.77240539, + "learning_rate": 3.06327495310661e-06, + "loss": 0.79407853, + "num_input_tokens_seen": 122209225, + "step": 5687, + "time_per_iteration": 2.6381263732910156 + }, + { + "auxiliary_loss_clip": 0.01106353, + "auxiliary_loss_mlp": 0.01040255, + "balance_loss_clip": 1.04849195, + "balance_loss_mlp": 1.02412593, + "epoch": 0.3419810611754096, + "flos": 13187799521280.0, + "grad_norm": 3.7332163528162385, + "language_loss": 0.8676976, + "learning_rate": 3.062945069803981e-06, + "loss": 0.88916373, + "num_input_tokens_seen": 122226160, + "step": 5688, + "time_per_iteration": 2.647320508956909 + }, + { + "auxiliary_loss_clip": 0.01119843, + "auxiliary_loss_mlp": 0.01042145, + "balance_loss_clip": 1.04928863, + "balance_loss_mlp": 1.0255394, + "epoch": 0.34204118442807757, + "flos": 19536267139200.0, + "grad_norm": 1.870477619822585, + "language_loss": 0.79564822, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.81726807, + "num_input_tokens_seen": 122243115, + "step": 5689, + "time_per_iteration": 4.1660990715026855 + }, + { + "auxiliary_loss_clip": 0.0112576, + "auxiliary_loss_mlp": 0.01042306, + "balance_loss_clip": 1.04875994, + "balance_loss_mlp": 1.02580786, + "epoch": 0.34210130768074554, + "flos": 15194846839680.0, + "grad_norm": 1.7530560995380315, + "language_loss": 0.73215616, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.75383675, + "num_input_tokens_seen": 122261105, + "step": 5690, + "time_per_iteration": 2.699846029281616 + }, + { + "auxiliary_loss_clip": 0.01115188, + "auxiliary_loss_mlp": 0.01047594, + "balance_loss_clip": 1.04381919, + "balance_loss_mlp": 1.03121471, + "epoch": 0.3421614309334135, + "flos": 24936226266240.0, + "grad_norm": 2.1339055209058184, + "language_loss": 0.76036334, + "learning_rate": 3.061955178104237e-06, + "loss": 0.78199112, + "num_input_tokens_seen": 122279995, + "step": 5691, + "time_per_iteration": 2.707598924636841 + }, + { + "auxiliary_loss_clip": 0.01119412, + "auxiliary_loss_mlp": 0.01042889, + "balance_loss_clip": 1.04769242, + "balance_loss_mlp": 1.02878046, + "epoch": 0.34222155418608147, + "flos": 21908633731200.0, + "grad_norm": 1.9419180569645556, + "language_loss": 0.68321705, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.70484006, + "num_input_tokens_seen": 122299070, + "step": 5692, + "time_per_iteration": 2.6876816749572754 + }, + { + "auxiliary_loss_clip": 0.01123804, + "auxiliary_loss_mlp": 0.01042902, + "balance_loss_clip": 1.0481621, + "balance_loss_mlp": 1.02660608, + "epoch": 0.34228167743874943, + "flos": 18114061063680.0, + "grad_norm": 2.8342834288415504, + "language_loss": 0.72458065, + "learning_rate": 3.06129504893632e-06, + "loss": 0.74624765, + "num_input_tokens_seen": 122316800, + "step": 5693, + "time_per_iteration": 5.672837018966675 + }, + { + "auxiliary_loss_clip": 0.01090312, + "auxiliary_loss_mlp": 0.01043466, + "balance_loss_clip": 1.0433774, + "balance_loss_mlp": 1.02832651, + "epoch": 0.3423418006914174, + "flos": 21288600138240.0, + "grad_norm": 1.9009541760697364, + "language_loss": 0.75556326, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.77690107, + "num_input_tokens_seen": 122335275, + "step": 5694, + "time_per_iteration": 2.713236093521118 + }, + { + "auxiliary_loss_clip": 0.01093804, + "auxiliary_loss_mlp": 0.01036832, + "balance_loss_clip": 1.04769742, + "balance_loss_mlp": 1.02205038, + "epoch": 0.34240192394408536, + "flos": 19823480288640.0, + "grad_norm": 2.1810058063417608, + "language_loss": 0.79590774, + "learning_rate": 3.060634758790747e-06, + "loss": 0.81721413, + "num_input_tokens_seen": 122353215, + "step": 5695, + "time_per_iteration": 2.7206506729125977 + }, + { + "auxiliary_loss_clip": 0.01077977, + "auxiliary_loss_mlp": 0.01043311, + "balance_loss_clip": 1.04183137, + "balance_loss_mlp": 1.02764642, + "epoch": 0.3424620471967533, + "flos": 24535535074560.0, + "grad_norm": 1.8643380844369803, + "language_loss": 0.73428202, + "learning_rate": 3.060304553382635e-06, + "loss": 0.75549489, + "num_input_tokens_seen": 122372495, + "step": 5696, + "time_per_iteration": 4.777001857757568 + }, + { + "auxiliary_loss_clip": 0.01088152, + "auxiliary_loss_mlp": 0.01052674, + "balance_loss_clip": 1.0424118, + "balance_loss_mlp": 1.03569841, + "epoch": 0.3425221704494213, + "flos": 25848895962240.0, + "grad_norm": 5.815439398629578, + "language_loss": 0.71460104, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.73600936, + "num_input_tokens_seen": 122394600, + "step": 5697, + "time_per_iteration": 2.7620668411254883 + }, + { + "auxiliary_loss_clip": 0.01108783, + "auxiliary_loss_mlp": 0.01032533, + "balance_loss_clip": 1.04925871, + "balance_loss_mlp": 1.01740503, + "epoch": 0.34258229370208926, + "flos": 21540513196800.0, + "grad_norm": 2.6993537181180316, + "language_loss": 0.82170486, + "learning_rate": 3.05964402195837e-06, + "loss": 0.84311801, + "num_input_tokens_seen": 122414700, + "step": 5698, + "time_per_iteration": 2.6930580139160156 + }, + { + "auxiliary_loss_clip": 0.01077965, + "auxiliary_loss_mlp": 0.01049711, + "balance_loss_clip": 1.0451839, + "balance_loss_mlp": 1.03073311, + "epoch": 0.3426424169547573, + "flos": 23652778429440.0, + "grad_norm": 2.492082875954734, + "language_loss": 0.68941295, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.71068972, + "num_input_tokens_seen": 122432760, + "step": 5699, + "time_per_iteration": 2.8604705333709717 + }, + { + "auxiliary_loss_clip": 0.01113381, + "auxiliary_loss_mlp": 0.01042187, + "balance_loss_clip": 1.05009818, + "balance_loss_mlp": 1.02698755, + "epoch": 0.34270254020742524, + "flos": 24644883052800.0, + "grad_norm": 2.4799642493365046, + "language_loss": 0.72708368, + "learning_rate": 3.058983329806877e-06, + "loss": 0.74863935, + "num_input_tokens_seen": 122449105, + "step": 5700, + "time_per_iteration": 2.721219301223755 + }, + { + "auxiliary_loss_clip": 0.01107869, + "auxiliary_loss_mlp": 0.01033632, + "balance_loss_clip": 1.05173492, + "balance_loss_mlp": 1.01942825, + "epoch": 0.3427626634600932, + "flos": 20996754134400.0, + "grad_norm": 1.8907099352771195, + "language_loss": 0.81771016, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.83912516, + "num_input_tokens_seen": 122468700, + "step": 5701, + "time_per_iteration": 2.668776273727417 + }, + { + "auxiliary_loss_clip": 0.01122749, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.05318427, + "balance_loss_mlp": 1.02137566, + "epoch": 0.3428227867127612, + "flos": 21433786911360.0, + "grad_norm": 1.8540703451937275, + "language_loss": 0.71611702, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.73770893, + "num_input_tokens_seen": 122488160, + "step": 5702, + "time_per_iteration": 2.7413434982299805 + }, + { + "auxiliary_loss_clip": 0.01034072, + "auxiliary_loss_mlp": 0.0102117, + "balance_loss_clip": 1.02648544, + "balance_loss_mlp": 1.01936996, + "epoch": 0.34288290996542914, + "flos": 55731782695680.0, + "grad_norm": 0.8291151185510042, + "language_loss": 0.57455015, + "learning_rate": 3.057991990435309e-06, + "loss": 0.59510255, + "num_input_tokens_seen": 122542890, + "step": 5703, + "time_per_iteration": 3.123619318008423 + }, + { + "auxiliary_loss_clip": 0.01125899, + "auxiliary_loss_mlp": 0.01044546, + "balance_loss_clip": 1.05167961, + "balance_loss_mlp": 1.02754664, + "epoch": 0.3429430332180971, + "flos": 20156803522560.0, + "grad_norm": 2.054859273280662, + "language_loss": 0.75049305, + "learning_rate": 3.057661463723086e-06, + "loss": 0.77219748, + "num_input_tokens_seen": 122561770, + "step": 5704, + "time_per_iteration": 2.786344051361084 + }, + { + "auxiliary_loss_clip": 0.01103715, + "auxiliary_loss_mlp": 0.01039493, + "balance_loss_clip": 1.05234969, + "balance_loss_mlp": 1.02506232, + "epoch": 0.34300315647076507, + "flos": 17965857548160.0, + "grad_norm": 1.921400910299184, + "language_loss": 0.72367042, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.74510252, + "num_input_tokens_seen": 122580580, + "step": 5705, + "time_per_iteration": 2.7464826107025146 + }, + { + "auxiliary_loss_clip": 0.01099266, + "auxiliary_loss_mlp": 0.01035276, + "balance_loss_clip": 1.05201912, + "balance_loss_mlp": 1.01980281, + "epoch": 0.34306327972343303, + "flos": 22086822124800.0, + "grad_norm": 2.585473080189318, + "language_loss": 0.80016834, + "learning_rate": 3.057000289991289e-06, + "loss": 0.82151377, + "num_input_tokens_seen": 122599810, + "step": 5706, + "time_per_iteration": 2.83493971824646 + }, + { + "auxiliary_loss_clip": 0.01126183, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.05822873, + "balance_loss_mlp": 1.02111542, + "epoch": 0.343123402976101, + "flos": 18442679616000.0, + "grad_norm": 2.833985332828215, + "language_loss": 0.83001584, + "learning_rate": 3.056669642996787e-06, + "loss": 0.85165167, + "num_input_tokens_seen": 122616035, + "step": 5707, + "time_per_iteration": 2.6888725757598877 + }, + { + "auxiliary_loss_clip": 0.01130807, + "auxiliary_loss_mlp": 0.01038349, + "balance_loss_clip": 1.05664158, + "balance_loss_mlp": 1.02264881, + "epoch": 0.34318352622876896, + "flos": 17163685065600.0, + "grad_norm": 1.6733576562987098, + "language_loss": 0.75313264, + "learning_rate": 3.056338955933266e-06, + "loss": 0.7748242, + "num_input_tokens_seen": 122633785, + "step": 5708, + "time_per_iteration": 2.655061960220337 + }, + { + "auxiliary_loss_clip": 0.01105586, + "auxiliary_loss_mlp": 0.01039807, + "balance_loss_clip": 1.05063939, + "balance_loss_mlp": 1.02357078, + "epoch": 0.34324364948143693, + "flos": 26688164215680.0, + "grad_norm": 1.6008558791331946, + "language_loss": 0.81187862, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.83333254, + "num_input_tokens_seen": 122652100, + "step": 5709, + "time_per_iteration": 2.7354934215545654 + }, + { + "auxiliary_loss_clip": 0.01119071, + "auxiliary_loss_mlp": 0.01043385, + "balance_loss_clip": 1.0550828, + "balance_loss_mlp": 1.02581382, + "epoch": 0.3433037727341049, + "flos": 21251576194560.0, + "grad_norm": 2.1605529243452297, + "language_loss": 0.79441178, + "learning_rate": 3.055677461649329e-06, + "loss": 0.81603634, + "num_input_tokens_seen": 122669720, + "step": 5710, + "time_per_iteration": 2.757321834564209 + }, + { + "auxiliary_loss_clip": 0.01130524, + "auxiliary_loss_mlp": 0.01039861, + "balance_loss_clip": 1.05363941, + "balance_loss_mlp": 1.02329111, + "epoch": 0.34336389598677286, + "flos": 20629423699200.0, + "grad_norm": 1.8403881586839854, + "language_loss": 0.70303786, + "learning_rate": 3.055346654453996e-06, + "loss": 0.7247417, + "num_input_tokens_seen": 122688715, + "step": 5711, + "time_per_iteration": 2.6535775661468506 + }, + { + "auxiliary_loss_clip": 0.01106817, + "auxiliary_loss_mlp": 0.00774858, + "balance_loss_clip": 1.05299044, + "balance_loss_mlp": 1.00072622, + "epoch": 0.3434240192394409, + "flos": 14538579402240.0, + "grad_norm": 1.8401630077009354, + "language_loss": 0.67124939, + "learning_rate": 3.055015807239812e-06, + "loss": 0.69006616, + "num_input_tokens_seen": 122706970, + "step": 5712, + "time_per_iteration": 2.7115519046783447 + }, + { + "auxiliary_loss_clip": 0.01051163, + "auxiliary_loss_mlp": 0.01005713, + "balance_loss_clip": 1.0511148, + "balance_loss_mlp": 1.00409162, + "epoch": 0.34348414249210885, + "flos": 58051538841600.0, + "grad_norm": 0.846630151399307, + "language_loss": 0.58072996, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60129869, + "num_input_tokens_seen": 122758095, + "step": 5713, + "time_per_iteration": 3.3988189697265625 + }, + { + "auxiliary_loss_clip": 0.01142007, + "auxiliary_loss_mlp": 0.01043862, + "balance_loss_clip": 1.05782688, + "balance_loss_mlp": 1.02813852, + "epoch": 0.3435442657447768, + "flos": 20704441253760.0, + "grad_norm": 1.6506449407169241, + "language_loss": 0.8079257, + "learning_rate": 3.054353992805076e-06, + "loss": 0.82978439, + "num_input_tokens_seen": 122777815, + "step": 5714, + "time_per_iteration": 2.682537078857422 + }, + { + "auxiliary_loss_clip": 0.01142274, + "auxiliary_loss_mlp": 0.01042249, + "balance_loss_clip": 1.0581255, + "balance_loss_mlp": 1.02628696, + "epoch": 0.3436043889974448, + "flos": 22930256355840.0, + "grad_norm": 2.1462767477025055, + "language_loss": 0.72059911, + "learning_rate": 3.05402302560962e-06, + "loss": 0.74244434, + "num_input_tokens_seen": 122797555, + "step": 5715, + "time_per_iteration": 2.6535134315490723 + }, + { + "auxiliary_loss_clip": 0.01070037, + "auxiliary_loss_mlp": 0.01002865, + "balance_loss_clip": 1.0577507, + "balance_loss_mlp": 1.00051689, + "epoch": 0.34366451225011274, + "flos": 58403285752320.0, + "grad_norm": 0.9103705044251069, + "language_loss": 0.65885556, + "learning_rate": 3.053692018445505e-06, + "loss": 0.67958462, + "num_input_tokens_seen": 122863955, + "step": 5716, + "time_per_iteration": 3.205113172531128 + }, + { + "auxiliary_loss_clip": 0.01124236, + "auxiliary_loss_mlp": 0.0104266, + "balance_loss_clip": 1.05416417, + "balance_loss_mlp": 1.02718663, + "epoch": 0.3437246355027807, + "flos": 15596292216960.0, + "grad_norm": 2.101112668121384, + "language_loss": 0.74272031, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.76438928, + "num_input_tokens_seen": 122883000, + "step": 5717, + "time_per_iteration": 2.60300350189209 + }, + { + "auxiliary_loss_clip": 0.01084832, + "auxiliary_loss_mlp": 0.01039269, + "balance_loss_clip": 1.05195725, + "balance_loss_mlp": 1.02437937, + "epoch": 0.34378475875544867, + "flos": 27672260106240.0, + "grad_norm": 1.8405555467441777, + "language_loss": 0.75446129, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.7757023, + "num_input_tokens_seen": 122903265, + "step": 5718, + "time_per_iteration": 2.787687301635742 + }, + { + "auxiliary_loss_clip": 0.01097103, + "auxiliary_loss_mlp": 0.01043125, + "balance_loss_clip": 1.04837775, + "balance_loss_mlp": 1.02739501, + "epoch": 0.34384488200811664, + "flos": 31431496769280.0, + "grad_norm": 1.9369525419747404, + "language_loss": 0.63647246, + "learning_rate": 3.052698757266734e-06, + "loss": 0.65787476, + "num_input_tokens_seen": 122923860, + "step": 5719, + "time_per_iteration": 2.8138949871063232 + }, + { + "auxiliary_loss_clip": 0.01098152, + "auxiliary_loss_mlp": 0.01040429, + "balance_loss_clip": 1.05234158, + "balance_loss_mlp": 1.02310777, + "epoch": 0.3439050052607846, + "flos": 24899920594560.0, + "grad_norm": 1.8182809721987367, + "language_loss": 0.73785692, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.75924277, + "num_input_tokens_seen": 122945305, + "step": 5720, + "time_per_iteration": 2.761371612548828 + }, + { + "auxiliary_loss_clip": 0.01127909, + "auxiliary_loss_mlp": 0.01052147, + "balance_loss_clip": 1.056463, + "balance_loss_mlp": 1.03434944, + "epoch": 0.34396512851345257, + "flos": 18150079426560.0, + "grad_norm": 2.2267988645125896, + "language_loss": 0.74087942, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.76267999, + "num_input_tokens_seen": 122962535, + "step": 5721, + "time_per_iteration": 2.6139280796051025 + }, + { + "auxiliary_loss_clip": 0.0111919, + "auxiliary_loss_mlp": 0.0077563, + "balance_loss_clip": 1.05647993, + "balance_loss_mlp": 1.00063252, + "epoch": 0.34402525176612053, + "flos": 16034438315520.0, + "grad_norm": 2.313932715754647, + "language_loss": 0.80464351, + "learning_rate": 3.051705136821992e-06, + "loss": 0.82359171, + "num_input_tokens_seen": 122979750, + "step": 5722, + "time_per_iteration": 2.6886982917785645 + }, + { + "auxiliary_loss_clip": 0.01092207, + "auxiliary_loss_mlp": 0.01038868, + "balance_loss_clip": 1.05326557, + "balance_loss_mlp": 1.02348995, + "epoch": 0.3440853750187885, + "flos": 21178641628800.0, + "grad_norm": 2.5095280683984984, + "language_loss": 0.81647789, + "learning_rate": 3.051373850228801e-06, + "loss": 0.83778864, + "num_input_tokens_seen": 122998955, + "step": 5723, + "time_per_iteration": 2.7464921474456787 + }, + { + "auxiliary_loss_clip": 0.01099736, + "auxiliary_loss_mlp": 0.0105726, + "balance_loss_clip": 1.0488528, + "balance_loss_mlp": 1.04023743, + "epoch": 0.34414549827145646, + "flos": 12677868092160.0, + "grad_norm": 1.9897062128640133, + "language_loss": 0.81431544, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.83588541, + "num_input_tokens_seen": 123016165, + "step": 5724, + "time_per_iteration": 2.7447471618652344 + }, + { + "auxiliary_loss_clip": 0.01112954, + "auxiliary_loss_mlp": 0.01047765, + "balance_loss_clip": 1.05231178, + "balance_loss_mlp": 1.03056324, + "epoch": 0.3442056215241244, + "flos": 31284514316160.0, + "grad_norm": 1.858960952495153, + "language_loss": 0.68913317, + "learning_rate": 3.05071115745038e-06, + "loss": 0.71074033, + "num_input_tokens_seen": 123036900, + "step": 5725, + "time_per_iteration": 2.798987627029419 + }, + { + "auxiliary_loss_clip": 0.01132971, + "auxiliary_loss_mlp": 0.0105182, + "balance_loss_clip": 1.05775714, + "balance_loss_mlp": 1.03379524, + "epoch": 0.34426574477679245, + "flos": 23367289132800.0, + "grad_norm": 1.4701315954442116, + "language_loss": 0.6946882, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.71653616, + "num_input_tokens_seen": 123057480, + "step": 5726, + "time_per_iteration": 2.663766622543335 + }, + { + "auxiliary_loss_clip": 0.01111868, + "auxiliary_loss_mlp": 0.01038496, + "balance_loss_clip": 1.05667615, + "balance_loss_mlp": 1.02374983, + "epoch": 0.3443258680294604, + "flos": 24535427333760.0, + "grad_norm": 2.4860883718983873, + "language_loss": 0.73317868, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.7546823, + "num_input_tokens_seen": 123076890, + "step": 5727, + "time_per_iteration": 2.8002336025238037 + }, + { + "auxiliary_loss_clip": 0.01097058, + "auxiliary_loss_mlp": 0.01052204, + "balance_loss_clip": 1.05053401, + "balance_loss_mlp": 1.03590822, + "epoch": 0.3443859912821284, + "flos": 20230133137920.0, + "grad_norm": 2.2067060616784815, + "language_loss": 0.88451493, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.90600753, + "num_input_tokens_seen": 123092530, + "step": 5728, + "time_per_iteration": 2.703842878341675 + }, + { + "auxiliary_loss_clip": 0.01089582, + "auxiliary_loss_mlp": 0.01048379, + "balance_loss_clip": 1.04858351, + "balance_loss_mlp": 1.03266144, + "epoch": 0.34444611453479634, + "flos": 24316515895680.0, + "grad_norm": 2.2135571419735904, + "language_loss": 0.70018214, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.72156173, + "num_input_tokens_seen": 123110560, + "step": 5729, + "time_per_iteration": 4.360877275466919 + }, + { + "auxiliary_loss_clip": 0.01124088, + "auxiliary_loss_mlp": 0.01037772, + "balance_loss_clip": 1.0525502, + "balance_loss_mlp": 1.02208424, + "epoch": 0.3445062377874643, + "flos": 16983413683200.0, + "grad_norm": 1.9483871766944658, + "language_loss": 0.7435137, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.76513231, + "num_input_tokens_seen": 123128655, + "step": 5730, + "time_per_iteration": 2.6021499633789062 + }, + { + "auxiliary_loss_clip": 0.01099617, + "auxiliary_loss_mlp": 0.0105823, + "balance_loss_clip": 1.04880106, + "balance_loss_mlp": 1.04053974, + "epoch": 0.3445663610401323, + "flos": 20302708567680.0, + "grad_norm": 2.1142556114368314, + "language_loss": 0.7952323, + "learning_rate": 3.048722123283578e-06, + "loss": 0.81681079, + "num_input_tokens_seen": 123145130, + "step": 5731, + "time_per_iteration": 4.273399114608765 + }, + { + "auxiliary_loss_clip": 0.01130567, + "auxiliary_loss_mlp": 0.01043537, + "balance_loss_clip": 1.05617356, + "balance_loss_mlp": 1.02793896, + "epoch": 0.34462648429280024, + "flos": 15888102307200.0, + "grad_norm": 2.0299111477971334, + "language_loss": 0.78609502, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.80783606, + "num_input_tokens_seen": 123162265, + "step": 5732, + "time_per_iteration": 4.672218322753906 + }, + { + "auxiliary_loss_clip": 0.01037769, + "auxiliary_loss_mlp": 0.0101237, + "balance_loss_clip": 1.03788018, + "balance_loss_mlp": 1.0106411, + "epoch": 0.3446866075454682, + "flos": 59311035285120.0, + "grad_norm": 0.7456337544046427, + "language_loss": 0.53537595, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.55587733, + "num_input_tokens_seen": 123218620, + "step": 5733, + "time_per_iteration": 3.322802782058716 + }, + { + "auxiliary_loss_clip": 0.01122514, + "auxiliary_loss_mlp": 0.01042066, + "balance_loss_clip": 1.05675018, + "balance_loss_mlp": 1.02577019, + "epoch": 0.34474673079813617, + "flos": 22343799000960.0, + "grad_norm": 1.936820728476944, + "language_loss": 0.832178, + "learning_rate": 3.047727069167207e-06, + "loss": 0.85382378, + "num_input_tokens_seen": 123237325, + "step": 5734, + "time_per_iteration": 2.7426953315734863 + }, + { + "auxiliary_loss_clip": 0.01120142, + "auxiliary_loss_mlp": 0.0103601, + "balance_loss_clip": 1.05517805, + "balance_loss_mlp": 1.01988125, + "epoch": 0.34480685405080413, + "flos": 27670141203840.0, + "grad_norm": 2.7764640699074077, + "language_loss": 0.92655241, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.94811392, + "num_input_tokens_seen": 123258650, + "step": 5735, + "time_per_iteration": 4.536838054656982 + }, + { + "auxiliary_loss_clip": 0.0110302, + "auxiliary_loss_mlp": 0.01041265, + "balance_loss_clip": 1.05774188, + "balance_loss_mlp": 1.02492189, + "epoch": 0.3448669773034721, + "flos": 22456020067200.0, + "grad_norm": 1.7508294751665012, + "language_loss": 0.76571405, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.78715694, + "num_input_tokens_seen": 123277155, + "step": 5736, + "time_per_iteration": 2.784958600997925 + }, + { + "auxiliary_loss_clip": 0.01122912, + "auxiliary_loss_mlp": 0.0104053, + "balance_loss_clip": 1.05683184, + "balance_loss_mlp": 1.02396011, + "epoch": 0.34492710055614006, + "flos": 24936190352640.0, + "grad_norm": 1.7983696926456887, + "language_loss": 0.78327668, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.80491114, + "num_input_tokens_seen": 123297640, + "step": 5737, + "time_per_iteration": 2.709786891937256 + }, + { + "auxiliary_loss_clip": 0.01083721, + "auxiliary_loss_mlp": 0.0104406, + "balance_loss_clip": 1.04379368, + "balance_loss_mlp": 1.02520096, + "epoch": 0.34498722380880803, + "flos": 20120821073280.0, + "grad_norm": 2.0055780284948375, + "language_loss": 0.71544027, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.73671806, + "num_input_tokens_seen": 123314370, + "step": 5738, + "time_per_iteration": 2.779651165008545 + }, + { + "auxiliary_loss_clip": 0.0110112, + "auxiliary_loss_mlp": 0.01042892, + "balance_loss_clip": 1.04991913, + "balance_loss_mlp": 1.02520132, + "epoch": 0.34504734706147605, + "flos": 28438126917120.0, + "grad_norm": 2.7751951344870562, + "language_loss": 0.82324719, + "learning_rate": 3.046067851209389e-06, + "loss": 0.84468728, + "num_input_tokens_seen": 123336085, + "step": 5739, + "time_per_iteration": 2.7953522205352783 + }, + { + "auxiliary_loss_clip": 0.01104482, + "auxiliary_loss_mlp": 0.01037335, + "balance_loss_clip": 1.05071819, + "balance_loss_mlp": 1.02132511, + "epoch": 0.345107470314144, + "flos": 22674464628480.0, + "grad_norm": 1.8186717226973075, + "language_loss": 0.83071041, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.85212862, + "num_input_tokens_seen": 123354460, + "step": 5740, + "time_per_iteration": 2.7530486583709717 + }, + { + "auxiliary_loss_clip": 0.01130478, + "auxiliary_loss_mlp": 0.01035685, + "balance_loss_clip": 1.05699897, + "balance_loss_mlp": 1.01901984, + "epoch": 0.345167593566812, + "flos": 20630716588800.0, + "grad_norm": 2.1971165557092656, + "language_loss": 0.7704618, + "learning_rate": 3.045403886269181e-06, + "loss": 0.79212344, + "num_input_tokens_seen": 123373420, + "step": 5741, + "time_per_iteration": 2.6488983631134033 + }, + { + "auxiliary_loss_clip": 0.01116686, + "auxiliary_loss_mlp": 0.01038328, + "balance_loss_clip": 1.05202794, + "balance_loss_mlp": 1.02271724, + "epoch": 0.34522771681947995, + "flos": 26214358890240.0, + "grad_norm": 1.629760829576741, + "language_loss": 0.76972193, + "learning_rate": 3.045071844330053e-06, + "loss": 0.7912721, + "num_input_tokens_seen": 123394730, + "step": 5742, + "time_per_iteration": 2.7333807945251465 + }, + { + "auxiliary_loss_clip": 0.01133631, + "auxiliary_loss_mlp": 0.01040013, + "balance_loss_clip": 1.05862427, + "balance_loss_mlp": 1.02371693, + "epoch": 0.3452878400721479, + "flos": 19062354072960.0, + "grad_norm": 2.2460068376984523, + "language_loss": 0.76135588, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.78309238, + "num_input_tokens_seen": 123412895, + "step": 5743, + "time_per_iteration": 2.677682638168335 + }, + { + "auxiliary_loss_clip": 0.01128893, + "auxiliary_loss_mlp": 0.01037178, + "balance_loss_clip": 1.05570602, + "balance_loss_mlp": 1.02171636, + "epoch": 0.3453479633248159, + "flos": 27929739772800.0, + "grad_norm": 2.0501405423310097, + "language_loss": 0.70481914, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.72647989, + "num_input_tokens_seen": 123432320, + "step": 5744, + "time_per_iteration": 2.7430574893951416 + }, + { + "auxiliary_loss_clip": 0.01140382, + "auxiliary_loss_mlp": 0.01036281, + "balance_loss_clip": 1.05727339, + "balance_loss_mlp": 1.01959133, + "epoch": 0.34540808657748384, + "flos": 19606113135360.0, + "grad_norm": 2.271690731291802, + "language_loss": 0.79658759, + "learning_rate": 3.044075480787665e-06, + "loss": 0.81835419, + "num_input_tokens_seen": 123450980, + "step": 5745, + "time_per_iteration": 2.6587865352630615 + }, + { + "auxiliary_loss_clip": 0.01092128, + "auxiliary_loss_mlp": 0.01041398, + "balance_loss_clip": 1.0486573, + "balance_loss_mlp": 1.02435148, + "epoch": 0.3454682098301518, + "flos": 20411661496320.0, + "grad_norm": 1.8194779915280654, + "language_loss": 0.89049339, + "learning_rate": 3.043743280407182e-06, + "loss": 0.91182864, + "num_input_tokens_seen": 123469365, + "step": 5746, + "time_per_iteration": 2.7314908504486084 + }, + { + "auxiliary_loss_clip": 0.01133638, + "auxiliary_loss_mlp": 0.01038455, + "balance_loss_clip": 1.05554819, + "balance_loss_mlp": 1.02101421, + "epoch": 0.34552833308281977, + "flos": 21325121291520.0, + "grad_norm": 2.5554958969654136, + "language_loss": 0.64851058, + "learning_rate": 3.043411040447849e-06, + "loss": 0.67023152, + "num_input_tokens_seen": 123489425, + "step": 5747, + "time_per_iteration": 2.6858277320861816 + }, + { + "auxiliary_loss_clip": 0.01119459, + "auxiliary_loss_mlp": 0.01035118, + "balance_loss_clip": 1.05213308, + "balance_loss_mlp": 1.01928735, + "epoch": 0.34558845633548774, + "flos": 36243633824640.0, + "grad_norm": 1.5633023430662023, + "language_loss": 0.72855747, + "learning_rate": 3.043078760922264e-06, + "loss": 0.75010324, + "num_input_tokens_seen": 123509970, + "step": 5748, + "time_per_iteration": 2.805250406265259 + }, + { + "auxiliary_loss_clip": 0.01084714, + "auxiliary_loss_mlp": 0.01032651, + "balance_loss_clip": 1.05246413, + "balance_loss_mlp": 1.01832819, + "epoch": 0.3456485795881557, + "flos": 22450561200000.0, + "grad_norm": 1.6861475272665256, + "language_loss": 0.7584126, + "learning_rate": 3.042746441843029e-06, + "loss": 0.7795862, + "num_input_tokens_seen": 123531055, + "step": 5749, + "time_per_iteration": 2.8886258602142334 + }, + { + "auxiliary_loss_clip": 0.01061531, + "auxiliary_loss_mlp": 0.01002064, + "balance_loss_clip": 1.05058503, + "balance_loss_mlp": 1.00045478, + "epoch": 0.34570870284082367, + "flos": 62004299005440.0, + "grad_norm": 0.8852783380527953, + "language_loss": 0.62715566, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.64779162, + "num_input_tokens_seen": 123584720, + "step": 5750, + "time_per_iteration": 3.1283066272735596 + }, + { + "auxiliary_loss_clip": 0.01110881, + "auxiliary_loss_mlp": 0.01037788, + "balance_loss_clip": 1.05210388, + "balance_loss_mlp": 1.02242184, + "epoch": 0.34576882609349163, + "flos": 22782196494720.0, + "grad_norm": 2.239830827663745, + "language_loss": 0.80332017, + "learning_rate": 3.042081685074012e-06, + "loss": 0.82480681, + "num_input_tokens_seen": 123604465, + "step": 5751, + "time_per_iteration": 2.721344470977783 + }, + { + "auxiliary_loss_clip": 0.01135561, + "auxiliary_loss_mlp": 0.01045926, + "balance_loss_clip": 1.0536952, + "balance_loss_mlp": 1.03101254, + "epoch": 0.34582894934615965, + "flos": 12348818576640.0, + "grad_norm": 2.3847713847020744, + "language_loss": 0.84148252, + "learning_rate": 3.041749247409439e-06, + "loss": 0.86329746, + "num_input_tokens_seen": 123622320, + "step": 5752, + "time_per_iteration": 2.578984260559082 + }, + { + "auxiliary_loss_clip": 0.01047286, + "auxiliary_loss_mlp": 0.00754976, + "balance_loss_clip": 1.0380801, + "balance_loss_mlp": 1.00148225, + "epoch": 0.3458890725988276, + "flos": 70167691071360.0, + "grad_norm": 0.7284359747550926, + "language_loss": 0.6310631, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.64908576, + "num_input_tokens_seen": 123678010, + "step": 5753, + "time_per_iteration": 3.0907819271087646 + }, + { + "auxiliary_loss_clip": 0.01112695, + "auxiliary_loss_mlp": 0.01035981, + "balance_loss_clip": 1.05358505, + "balance_loss_mlp": 1.01956582, + "epoch": 0.3459491958514956, + "flos": 17092582093440.0, + "grad_norm": 1.9590865283999213, + "language_loss": 0.71000856, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.73149538, + "num_input_tokens_seen": 123696830, + "step": 5754, + "time_per_iteration": 2.7031564712524414 + }, + { + "auxiliary_loss_clip": 0.01127989, + "auxiliary_loss_mlp": 0.01038041, + "balance_loss_clip": 1.05300486, + "balance_loss_mlp": 1.02251959, + "epoch": 0.34600931910416355, + "flos": 16650952375680.0, + "grad_norm": 2.56305874029915, + "language_loss": 0.73286581, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.75452608, + "num_input_tokens_seen": 123714360, + "step": 5755, + "time_per_iteration": 2.656804084777832 + }, + { + "auxiliary_loss_clip": 0.01122508, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.0504849, + "balance_loss_mlp": 1.01791406, + "epoch": 0.3460694423568315, + "flos": 38546190334080.0, + "grad_norm": 1.7746130503339408, + "language_loss": 0.7232182, + "learning_rate": 3.040419101844869e-06, + "loss": 0.74477637, + "num_input_tokens_seen": 123739250, + "step": 5756, + "time_per_iteration": 2.8805603981018066 + }, + { + "auxiliary_loss_clip": 0.01055943, + "auxiliary_loss_mlp": 0.01012753, + "balance_loss_clip": 1.03647125, + "balance_loss_mlp": 1.01088166, + "epoch": 0.3461295656094995, + "flos": 72081479704320.0, + "grad_norm": 0.7176054236110851, + "language_loss": 0.62659568, + "learning_rate": 3.040086466790207e-06, + "loss": 0.64728266, + "num_input_tokens_seen": 123802845, + "step": 5757, + "time_per_iteration": 3.21248197555542 + }, + { + "auxiliary_loss_clip": 0.0103445, + "auxiliary_loss_mlp": 0.00755471, + "balance_loss_clip": 1.03495657, + "balance_loss_mlp": 1.0016396, + "epoch": 0.34618968886216744, + "flos": 65460089571840.0, + "grad_norm": 0.8171010225304897, + "language_loss": 0.59206927, + "learning_rate": 3.039753792295362e-06, + "loss": 0.60996854, + "num_input_tokens_seen": 123861805, + "step": 5758, + "time_per_iteration": 3.2514266967773438 + }, + { + "auxiliary_loss_clip": 0.01122832, + "auxiliary_loss_mlp": 0.01042223, + "balance_loss_clip": 1.05849838, + "balance_loss_mlp": 1.02783418, + "epoch": 0.3462498121148354, + "flos": 23472542960640.0, + "grad_norm": 1.8827972101732287, + "language_loss": 0.71806967, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.73972023, + "num_input_tokens_seen": 123881820, + "step": 5759, + "time_per_iteration": 2.943061351776123 + }, + { + "auxiliary_loss_clip": 0.0108272, + "auxiliary_loss_mlp": 0.01061154, + "balance_loss_clip": 1.0455631, + "balance_loss_mlp": 1.04352307, + "epoch": 0.3463099353675034, + "flos": 24170790418560.0, + "grad_norm": 1.9206924983950955, + "language_loss": 0.83097923, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.85241801, + "num_input_tokens_seen": 123903700, + "step": 5760, + "time_per_iteration": 2.8922929763793945 + }, + { + "auxiliary_loss_clip": 0.01029416, + "auxiliary_loss_mlp": 0.01010127, + "balance_loss_clip": 1.02909803, + "balance_loss_mlp": 1.00855386, + "epoch": 0.34637005862017134, + "flos": 63700609766400.0, + "grad_norm": 0.8149802448400086, + "language_loss": 0.56472003, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.58511543, + "num_input_tokens_seen": 123960075, + "step": 5761, + "time_per_iteration": 3.274470567703247 + }, + { + "auxiliary_loss_clip": 0.01122229, + "auxiliary_loss_mlp": 0.00773416, + "balance_loss_clip": 1.04931128, + "balance_loss_mlp": 1.00069964, + "epoch": 0.3464301818728393, + "flos": 13145532192000.0, + "grad_norm": 2.486389460519204, + "language_loss": 0.94996566, + "learning_rate": 3.038422700166474e-06, + "loss": 0.96892214, + "num_input_tokens_seen": 123975805, + "step": 5762, + "time_per_iteration": 2.636906623840332 + }, + { + "auxiliary_loss_clip": 0.01106692, + "auxiliary_loss_mlp": 0.0104127, + "balance_loss_clip": 1.04844642, + "balance_loss_mlp": 1.02467608, + "epoch": 0.34649030512550727, + "flos": 29315173299840.0, + "grad_norm": 1.8335548533403485, + "language_loss": 0.69540495, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.71688455, + "num_input_tokens_seen": 123997530, + "step": 5763, + "time_per_iteration": 2.8476505279541016 + }, + { + "auxiliary_loss_clip": 0.01125911, + "auxiliary_loss_mlp": 0.01051478, + "balance_loss_clip": 1.04963946, + "balance_loss_mlp": 1.03319085, + "epoch": 0.34655042837817523, + "flos": 23730884553600.0, + "grad_norm": 2.0043623648961195, + "language_loss": 0.83985734, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.86163127, + "num_input_tokens_seen": 124016375, + "step": 5764, + "time_per_iteration": 2.693847417831421 + }, + { + "auxiliary_loss_clip": 0.01103367, + "auxiliary_loss_mlp": 0.01039514, + "balance_loss_clip": 1.04989028, + "balance_loss_mlp": 1.02363563, + "epoch": 0.34661055163084326, + "flos": 22054215553920.0, + "grad_norm": 2.2905956292147045, + "language_loss": 0.6769501, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.69837892, + "num_input_tokens_seen": 124033975, + "step": 5765, + "time_per_iteration": 2.7656123638153076 + }, + { + "auxiliary_loss_clip": 0.01108658, + "auxiliary_loss_mlp": 0.01045242, + "balance_loss_clip": 1.05017447, + "balance_loss_mlp": 1.0279808, + "epoch": 0.3466706748835112, + "flos": 21799213925760.0, + "grad_norm": 2.7236728572511653, + "language_loss": 0.77394044, + "learning_rate": 3.03709097800413e-06, + "loss": 0.79547942, + "num_input_tokens_seen": 124051930, + "step": 5766, + "time_per_iteration": 2.7095906734466553 + }, + { + "auxiliary_loss_clip": 0.01078684, + "auxiliary_loss_mlp": 0.01035923, + "balance_loss_clip": 1.04552221, + "balance_loss_mlp": 1.02113521, + "epoch": 0.3467307981361792, + "flos": 19461680547840.0, + "grad_norm": 1.6543575607114767, + "language_loss": 0.73547316, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.75661922, + "num_input_tokens_seen": 124071220, + "step": 5767, + "time_per_iteration": 2.8161730766296387 + }, + { + "auxiliary_loss_clip": 0.01111822, + "auxiliary_loss_mlp": 0.01043875, + "balance_loss_clip": 1.05307102, + "balance_loss_mlp": 1.02734113, + "epoch": 0.34679092138884715, + "flos": 24827452905600.0, + "grad_norm": 2.2530154082607776, + "language_loss": 0.7832194, + "learning_rate": 3.036424880912893e-06, + "loss": 0.80477637, + "num_input_tokens_seen": 124090140, + "step": 5768, + "time_per_iteration": 4.265673875808716 + }, + { + "auxiliary_loss_clip": 0.01050543, + "auxiliary_loss_mlp": 0.01012109, + "balance_loss_clip": 1.0320363, + "balance_loss_mlp": 1.0104636, + "epoch": 0.3468510446415151, + "flos": 63236070149760.0, + "grad_norm": 0.7741250202123364, + "language_loss": 0.57502627, + "learning_rate": 3.036091773408956e-06, + "loss": 0.59565282, + "num_input_tokens_seen": 124152025, + "step": 5769, + "time_per_iteration": 3.2264139652252197 + }, + { + "auxiliary_loss_clip": 0.01107195, + "auxiliary_loss_mlp": 0.01044629, + "balance_loss_clip": 1.04818511, + "balance_loss_mlp": 1.02630615, + "epoch": 0.3469111678941831, + "flos": 12120713256960.0, + "grad_norm": 2.34841523993127, + "language_loss": 0.85575318, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.87727135, + "num_input_tokens_seen": 124165795, + "step": 5770, + "time_per_iteration": 2.7029645442962646 + }, + { + "auxiliary_loss_clip": 0.01034922, + "auxiliary_loss_mlp": 0.01007496, + "balance_loss_clip": 1.02998519, + "balance_loss_mlp": 1.00527906, + "epoch": 0.34697129114685105, + "flos": 65934110378880.0, + "grad_norm": 0.7677707974310557, + "language_loss": 0.59758615, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.6180104, + "num_input_tokens_seen": 124222925, + "step": 5771, + "time_per_iteration": 4.5523951053619385 + }, + { + "auxiliary_loss_clip": 0.01127175, + "auxiliary_loss_mlp": 0.01049141, + "balance_loss_clip": 1.05249262, + "balance_loss_mlp": 1.03320241, + "epoch": 0.347031414399519, + "flos": 34454205054720.0, + "grad_norm": 1.9048919633537342, + "language_loss": 0.71560407, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.73736715, + "num_input_tokens_seen": 124240915, + "step": 5772, + "time_per_iteration": 2.8108439445495605 + }, + { + "auxiliary_loss_clip": 0.01108886, + "auxiliary_loss_mlp": 0.0077423, + "balance_loss_clip": 1.05118012, + "balance_loss_mlp": 1.00077164, + "epoch": 0.347091537652187, + "flos": 26944135511040.0, + "grad_norm": 1.679823492532721, + "language_loss": 0.764898, + "learning_rate": 3.034758950632507e-06, + "loss": 0.78372908, + "num_input_tokens_seen": 124262770, + "step": 5773, + "time_per_iteration": 2.813775062561035 + }, + { + "auxiliary_loss_clip": 0.01128178, + "auxiliary_loss_mlp": 0.01043067, + "balance_loss_clip": 1.05019748, + "balance_loss_mlp": 1.02674699, + "epoch": 0.34715166090485494, + "flos": 21142228216320.0, + "grad_norm": 5.389351496516036, + "language_loss": 0.70094979, + "learning_rate": 3.034425646811396e-06, + "loss": 0.72266221, + "num_input_tokens_seen": 124280950, + "step": 5774, + "time_per_iteration": 4.167816162109375 + }, + { + "auxiliary_loss_clip": 0.01113209, + "auxiliary_loss_mlp": 0.00774032, + "balance_loss_clip": 1.05024052, + "balance_loss_mlp": 1.00071549, + "epoch": 0.3472117841575229, + "flos": 23478001827840.0, + "grad_norm": 1.6687380405540382, + "language_loss": 0.76013231, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.77900469, + "num_input_tokens_seen": 124299540, + "step": 5775, + "time_per_iteration": 2.739729404449463 + }, + { + "auxiliary_loss_clip": 0.01114926, + "auxiliary_loss_mlp": 0.01046919, + "balance_loss_clip": 1.0480268, + "balance_loss_mlp": 1.02965736, + "epoch": 0.34727190741019087, + "flos": 17492806408320.0, + "grad_norm": 2.598065011523741, + "language_loss": 0.77565503, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.79727352, + "num_input_tokens_seen": 124316285, + "step": 5776, + "time_per_iteration": 2.7339272499084473 + }, + { + "auxiliary_loss_clip": 0.01036494, + "auxiliary_loss_mlp": 0.01014475, + "balance_loss_clip": 1.02741766, + "balance_loss_mlp": 1.01280594, + "epoch": 0.34733203066285884, + "flos": 65265491640960.0, + "grad_norm": 0.8358378555600092, + "language_loss": 0.63272905, + "learning_rate": 3.033425500045478e-06, + "loss": 0.65323877, + "num_input_tokens_seen": 124376650, + "step": 5777, + "time_per_iteration": 3.257993459701538 + }, + { + "auxiliary_loss_clip": 0.01098381, + "auxiliary_loss_mlp": 0.01045801, + "balance_loss_clip": 1.04933393, + "balance_loss_mlp": 1.02975535, + "epoch": 0.3473921539155268, + "flos": 28658726294400.0, + "grad_norm": 3.5330364681008755, + "language_loss": 0.6504612, + "learning_rate": 3.033092039398119e-06, + "loss": 0.67190301, + "num_input_tokens_seen": 124396475, + "step": 5778, + "time_per_iteration": 2.775846481323242 + }, + { + "auxiliary_loss_clip": 0.01113961, + "auxiliary_loss_mlp": 0.01054607, + "balance_loss_clip": 1.04786038, + "balance_loss_mlp": 1.03903246, + "epoch": 0.3474522771681948, + "flos": 40836895355520.0, + "grad_norm": 2.3967507755094064, + "language_loss": 0.71278334, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.73446906, + "num_input_tokens_seen": 124416480, + "step": 5779, + "time_per_iteration": 2.7915873527526855 + }, + { + "auxiliary_loss_clip": 0.01142932, + "auxiliary_loss_mlp": 0.01053692, + "balance_loss_clip": 1.05395269, + "balance_loss_mlp": 1.03762269, + "epoch": 0.3475124004208628, + "flos": 24608577381120.0, + "grad_norm": 2.0452202029673043, + "language_loss": 0.62873107, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.65069735, + "num_input_tokens_seen": 124435950, + "step": 5780, + "time_per_iteration": 2.6743876934051514 + }, + { + "auxiliary_loss_clip": 0.01095736, + "auxiliary_loss_mlp": 0.01050069, + "balance_loss_clip": 1.04648292, + "balance_loss_mlp": 1.03446484, + "epoch": 0.34757252367353075, + "flos": 22711309004160.0, + "grad_norm": 1.6009150193459345, + "language_loss": 0.72167897, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.743137, + "num_input_tokens_seen": 124455410, + "step": 5781, + "time_per_iteration": 2.749302625656128 + }, + { + "auxiliary_loss_clip": 0.01073898, + "auxiliary_loss_mlp": 0.01052117, + "balance_loss_clip": 1.040519, + "balance_loss_mlp": 1.03405714, + "epoch": 0.3476326469261987, + "flos": 19828184970240.0, + "grad_norm": 2.5507599846278644, + "language_loss": 0.76966107, + "learning_rate": 3.031757805185612e-06, + "loss": 0.79092121, + "num_input_tokens_seen": 124474870, + "step": 5782, + "time_per_iteration": 2.801867723464966 + }, + { + "auxiliary_loss_clip": 0.01108825, + "auxiliary_loss_mlp": 0.01037018, + "balance_loss_clip": 1.05032897, + "balance_loss_mlp": 1.02193785, + "epoch": 0.3476927701788667, + "flos": 19938107566080.0, + "grad_norm": 2.367934041085959, + "language_loss": 0.62506068, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.64651906, + "num_input_tokens_seen": 124494105, + "step": 5783, + "time_per_iteration": 2.709778070449829 + }, + { + "auxiliary_loss_clip": 0.01092863, + "auxiliary_loss_mlp": 0.01031024, + "balance_loss_clip": 1.04997683, + "balance_loss_mlp": 1.0163672, + "epoch": 0.34775289343153465, + "flos": 20735108490240.0, + "grad_norm": 1.7498214415914104, + "language_loss": 0.88513505, + "learning_rate": 3.031090453282605e-06, + "loss": 0.90637398, + "num_input_tokens_seen": 124512030, + "step": 5784, + "time_per_iteration": 2.769317150115967 + }, + { + "auxiliary_loss_clip": 0.01089006, + "auxiliary_loss_mlp": 0.01036783, + "balance_loss_clip": 1.05206084, + "balance_loss_mlp": 1.02097547, + "epoch": 0.3478130166842026, + "flos": 19354846521600.0, + "grad_norm": 1.703369857104052, + "language_loss": 0.81740022, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.83865809, + "num_input_tokens_seen": 124530980, + "step": 5785, + "time_per_iteration": 2.791860818862915 + }, + { + "auxiliary_loss_clip": 0.01106676, + "auxiliary_loss_mlp": 0.01040592, + "balance_loss_clip": 1.04747128, + "balance_loss_mlp": 1.02563095, + "epoch": 0.3478731399368706, + "flos": 22051198811520.0, + "grad_norm": 1.689422515624071, + "language_loss": 0.80540836, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.82688099, + "num_input_tokens_seen": 124549330, + "step": 5786, + "time_per_iteration": 2.7547576427459717 + }, + { + "auxiliary_loss_clip": 0.0113505, + "auxiliary_loss_mlp": 0.00773369, + "balance_loss_clip": 1.05242872, + "balance_loss_mlp": 1.00073981, + "epoch": 0.34793326318953854, + "flos": 18041449720320.0, + "grad_norm": 2.7072955912962686, + "language_loss": 0.74945676, + "learning_rate": 3.030089132216836e-06, + "loss": 0.76854098, + "num_input_tokens_seen": 124567200, + "step": 5787, + "time_per_iteration": 2.592688798904419 + }, + { + "auxiliary_loss_clip": 0.01102822, + "auxiliary_loss_mlp": 0.00773627, + "balance_loss_clip": 1.04294109, + "balance_loss_mlp": 1.00074553, + "epoch": 0.3479933864422065, + "flos": 29314670509440.0, + "grad_norm": 1.9068485918966191, + "language_loss": 0.81542754, + "learning_rate": 3.029755280389203e-06, + "loss": 0.83419204, + "num_input_tokens_seen": 124587025, + "step": 5788, + "time_per_iteration": 2.84395694732666 + }, + { + "auxiliary_loss_clip": 0.01144785, + "auxiliary_loss_mlp": 0.01037478, + "balance_loss_clip": 1.0562067, + "balance_loss_mlp": 1.02140832, + "epoch": 0.3480535096948745, + "flos": 20120713332480.0, + "grad_norm": 2.2432452775203964, + "language_loss": 0.85701168, + "learning_rate": 3.029421389513147e-06, + "loss": 0.87883425, + "num_input_tokens_seen": 124605860, + "step": 5789, + "time_per_iteration": 2.630535125732422 + }, + { + "auxiliary_loss_clip": 0.01130136, + "auxiliary_loss_mlp": 0.01056162, + "balance_loss_clip": 1.05231345, + "balance_loss_mlp": 1.04007459, + "epoch": 0.34811363294754244, + "flos": 18548974938240.0, + "grad_norm": 5.008598067350991, + "language_loss": 0.8502599, + "learning_rate": 3.029087459601328e-06, + "loss": 0.87212288, + "num_input_tokens_seen": 124624270, + "step": 5790, + "time_per_iteration": 2.6052823066711426 + }, + { + "auxiliary_loss_clip": 0.01130643, + "auxiliary_loss_mlp": 0.01044731, + "balance_loss_clip": 1.05373776, + "balance_loss_mlp": 1.02904904, + "epoch": 0.3481737562002104, + "flos": 26870303105280.0, + "grad_norm": 1.9264082121319324, + "language_loss": 0.80832046, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.83007419, + "num_input_tokens_seen": 124644005, + "step": 5791, + "time_per_iteration": 2.7190260887145996 + }, + { + "auxiliary_loss_clip": 0.01125872, + "auxiliary_loss_mlp": 0.0104286, + "balance_loss_clip": 1.04968619, + "balance_loss_mlp": 1.02690983, + "epoch": 0.3482338794528784, + "flos": 28908664104960.0, + "grad_norm": 2.4373031068755022, + "language_loss": 0.77855796, + "learning_rate": 3.028419482721056e-06, + "loss": 0.80024529, + "num_input_tokens_seen": 124663020, + "step": 5792, + "time_per_iteration": 2.7223403453826904 + }, + { + "auxiliary_loss_clip": 0.01108923, + "auxiliary_loss_mlp": 0.01034893, + "balance_loss_clip": 1.04401517, + "balance_loss_mlp": 1.01922882, + "epoch": 0.3482940027055464, + "flos": 22200767043840.0, + "grad_norm": 1.6684091148270528, + "language_loss": 0.81824791, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.8396861, + "num_input_tokens_seen": 124682975, + "step": 5793, + "time_per_iteration": 2.84191632270813 + }, + { + "auxiliary_loss_clip": 0.01124823, + "auxiliary_loss_mlp": 0.01055766, + "balance_loss_clip": 1.05077863, + "balance_loss_mlp": 1.0392313, + "epoch": 0.34835412595821436, + "flos": 20302708567680.0, + "grad_norm": 1.8786694421525794, + "language_loss": 0.7607373, + "learning_rate": 3.027751349849706e-06, + "loss": 0.78254318, + "num_input_tokens_seen": 124701340, + "step": 5794, + "time_per_iteration": 2.707648515701294 + }, + { + "auxiliary_loss_clip": 0.01123664, + "auxiliary_loss_mlp": 0.01044013, + "balance_loss_clip": 1.04820764, + "balance_loss_mlp": 1.02735913, + "epoch": 0.3484142492108823, + "flos": 20449691020800.0, + "grad_norm": 2.79979085265216, + "language_loss": 0.57190084, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.59357756, + "num_input_tokens_seen": 124719165, + "step": 5795, + "time_per_iteration": 2.6533401012420654 + }, + { + "auxiliary_loss_clip": 0.01106011, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.04720807, + "balance_loss_mlp": 1.02177811, + "epoch": 0.3484743724635503, + "flos": 24352929308160.0, + "grad_norm": 2.0564463844351546, + "language_loss": 0.82218957, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.84361899, + "num_input_tokens_seen": 124738670, + "step": 5796, + "time_per_iteration": 2.6823246479034424 + }, + { + "auxiliary_loss_clip": 0.01120404, + "auxiliary_loss_mlp": 0.01034067, + "balance_loss_clip": 1.04927754, + "balance_loss_mlp": 1.0192616, + "epoch": 0.34853449571621825, + "flos": 24353001135360.0, + "grad_norm": 1.9927036097023587, + "language_loss": 0.83429003, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.85583472, + "num_input_tokens_seen": 124758760, + "step": 5797, + "time_per_iteration": 2.7048346996307373 + }, + { + "auxiliary_loss_clip": 0.01132676, + "auxiliary_loss_mlp": 0.01037057, + "balance_loss_clip": 1.05049801, + "balance_loss_mlp": 1.02151191, + "epoch": 0.3485946189688862, + "flos": 27267690245760.0, + "grad_norm": 1.9361964581914621, + "language_loss": 0.73449033, + "learning_rate": 3.026414616539167e-06, + "loss": 0.75618768, + "num_input_tokens_seen": 124777765, + "step": 5798, + "time_per_iteration": 2.6807782649993896 + }, + { + "auxiliary_loss_clip": 0.01135458, + "auxiliary_loss_mlp": 0.01044729, + "balance_loss_clip": 1.04995012, + "balance_loss_mlp": 1.02815914, + "epoch": 0.3486547422215542, + "flos": 20156695781760.0, + "grad_norm": 2.5738259800272725, + "language_loss": 0.76111758, + "learning_rate": 3.026080335875485e-06, + "loss": 0.78291941, + "num_input_tokens_seen": 124796775, + "step": 5799, + "time_per_iteration": 2.629671096801758 + }, + { + "auxiliary_loss_clip": 0.01073192, + "auxiliary_loss_mlp": 0.01035978, + "balance_loss_clip": 1.05208993, + "balance_loss_mlp": 1.02083826, + "epoch": 0.34871486547422215, + "flos": 20230348619520.0, + "grad_norm": 2.242229362705527, + "language_loss": 0.75801086, + "learning_rate": 3.025746016302734e-06, + "loss": 0.77910256, + "num_input_tokens_seen": 124815825, + "step": 5800, + "time_per_iteration": 3.047725200653076 + }, + { + "auxiliary_loss_clip": 0.01112927, + "auxiliary_loss_mlp": 0.00774006, + "balance_loss_clip": 1.04720354, + "balance_loss_mlp": 1.00079536, + "epoch": 0.3487749887268901, + "flos": 44053234882560.0, + "grad_norm": 2.6257316922509286, + "language_loss": 0.67468953, + "learning_rate": 3.025411657833591e-06, + "loss": 0.69355887, + "num_input_tokens_seen": 124838420, + "step": 5801, + "time_per_iteration": 3.2364816665649414 + }, + { + "auxiliary_loss_clip": 0.01103773, + "auxiliary_loss_mlp": 0.010448, + "balance_loss_clip": 1.04506934, + "balance_loss_mlp": 1.028754, + "epoch": 0.3488351119795581, + "flos": 23295144666240.0, + "grad_norm": 1.8428676315803219, + "language_loss": 0.76738638, + "learning_rate": 3.025077260480735e-06, + "loss": 0.78887206, + "num_input_tokens_seen": 124857320, + "step": 5802, + "time_per_iteration": 2.7959024906158447 + }, + { + "auxiliary_loss_clip": 0.01053855, + "auxiliary_loss_mlp": 0.01037371, + "balance_loss_clip": 1.03989601, + "balance_loss_mlp": 1.02219605, + "epoch": 0.34889523523222604, + "flos": 19934839428480.0, + "grad_norm": 1.7816673584343024, + "language_loss": 0.78991377, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.81082606, + "num_input_tokens_seen": 124875685, + "step": 5803, + "time_per_iteration": 2.8440747261047363 + }, + { + "auxiliary_loss_clip": 0.01111548, + "auxiliary_loss_mlp": 0.00774436, + "balance_loss_clip": 1.04601288, + "balance_loss_mlp": 1.00073576, + "epoch": 0.348955358484894, + "flos": 30446179816320.0, + "grad_norm": 6.169621760932873, + "language_loss": 0.67899323, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.69785309, + "num_input_tokens_seen": 124895960, + "step": 5804, + "time_per_iteration": 2.8011341094970703 + }, + { + "auxiliary_loss_clip": 0.01109039, + "auxiliary_loss_mlp": 0.01046207, + "balance_loss_clip": 1.05153811, + "balance_loss_mlp": 1.0306263, + "epoch": 0.349015481737562, + "flos": 17999972490240.0, + "grad_norm": 1.9366950093174176, + "language_loss": 0.75972986, + "learning_rate": 3.024073835246702e-06, + "loss": 0.78128237, + "num_input_tokens_seen": 124914140, + "step": 5805, + "time_per_iteration": 2.735410213470459 + }, + { + "auxiliary_loss_clip": 0.01085261, + "auxiliary_loss_mlp": 0.0103851, + "balance_loss_clip": 1.040416, + "balance_loss_mlp": 1.0230304, + "epoch": 0.34907560499023, + "flos": 27198490694400.0, + "grad_norm": 2.3089286954803194, + "language_loss": 0.67154014, + "learning_rate": 3.023739282485814e-06, + "loss": 0.69277781, + "num_input_tokens_seen": 124934180, + "step": 5806, + "time_per_iteration": 2.793893575668335 + }, + { + "auxiliary_loss_clip": 0.01122813, + "auxiliary_loss_mlp": 0.0104012, + "balance_loss_clip": 1.05324221, + "balance_loss_mlp": 1.02445614, + "epoch": 0.34913572824289796, + "flos": 30226873328640.0, + "grad_norm": 1.5212397526739, + "language_loss": 0.71703929, + "learning_rate": 3.023404690904629e-06, + "loss": 0.73866862, + "num_input_tokens_seen": 124956060, + "step": 5807, + "time_per_iteration": 2.7225730419158936 + }, + { + "auxiliary_loss_clip": 0.01135343, + "auxiliary_loss_mlp": 0.0103686, + "balance_loss_clip": 1.04923332, + "balance_loss_mlp": 1.02102923, + "epoch": 0.3491958514955659, + "flos": 29971907614080.0, + "grad_norm": 2.9062872704377125, + "language_loss": 0.7383548, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.76007676, + "num_input_tokens_seen": 124976070, + "step": 5808, + "time_per_iteration": 4.38737154006958 + }, + { + "auxiliary_loss_clip": 0.01133483, + "auxiliary_loss_mlp": 0.01047071, + "balance_loss_clip": 1.05228174, + "balance_loss_mlp": 1.03241384, + "epoch": 0.3492559747482339, + "flos": 22783273902720.0, + "grad_norm": 1.513097370663534, + "language_loss": 0.84501046, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.86681598, + "num_input_tokens_seen": 124996995, + "step": 5809, + "time_per_iteration": 2.629246711730957 + }, + { + "auxiliary_loss_clip": 0.01106316, + "auxiliary_loss_mlp": 0.01034055, + "balance_loss_clip": 1.04668331, + "balance_loss_mlp": 1.01995289, + "epoch": 0.34931609800090185, + "flos": 26068022881920.0, + "grad_norm": 2.856878325415132, + "language_loss": 0.80759805, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.82900178, + "num_input_tokens_seen": 125015600, + "step": 5810, + "time_per_iteration": 2.815232276916504 + }, + { + "auxiliary_loss_clip": 0.01134295, + "auxiliary_loss_mlp": 0.01039591, + "balance_loss_clip": 1.05105019, + "balance_loss_mlp": 1.02539277, + "epoch": 0.3493762212535698, + "flos": 29242023252480.0, + "grad_norm": 1.9587859815348794, + "language_loss": 0.75694251, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.7786814, + "num_input_tokens_seen": 125035290, + "step": 5811, + "time_per_iteration": 4.295617580413818 + }, + { + "auxiliary_loss_clip": 0.0111498, + "auxiliary_loss_mlp": 0.01040701, + "balance_loss_clip": 1.04791081, + "balance_loss_mlp": 1.02616942, + "epoch": 0.3494363445062378, + "flos": 27126058919040.0, + "grad_norm": 1.5951936061604581, + "language_loss": 0.80199474, + "learning_rate": 3.021731151138386e-06, + "loss": 0.82355154, + "num_input_tokens_seen": 125057130, + "step": 5812, + "time_per_iteration": 2.8571486473083496 + }, + { + "auxiliary_loss_clip": 0.0106966, + "auxiliary_loss_mlp": 0.01038506, + "balance_loss_clip": 1.04193187, + "balance_loss_mlp": 1.02299738, + "epoch": 0.34949646775890575, + "flos": 12276207233280.0, + "grad_norm": 1.932575417997546, + "language_loss": 0.69221139, + "learning_rate": 3.021396326901918e-06, + "loss": 0.71329308, + "num_input_tokens_seen": 125073720, + "step": 5813, + "time_per_iteration": 4.446147441864014 + }, + { + "auxiliary_loss_clip": 0.01101223, + "auxiliary_loss_mlp": 0.00772918, + "balance_loss_clip": 1.04168797, + "balance_loss_mlp": 1.00074911, + "epoch": 0.3495565910115737, + "flos": 17165516659200.0, + "grad_norm": 2.168508070197816, + "language_loss": 0.76586467, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.7846061, + "num_input_tokens_seen": 125090635, + "step": 5814, + "time_per_iteration": 2.698594331741333 + }, + { + "auxiliary_loss_clip": 0.01114737, + "auxiliary_loss_mlp": 0.00773337, + "balance_loss_clip": 1.05010188, + "balance_loss_mlp": 1.00060046, + "epoch": 0.3496167142642417, + "flos": 26465661417600.0, + "grad_norm": 1.9777422761312171, + "language_loss": 0.84760284, + "learning_rate": 3.020726562247328e-06, + "loss": 0.86648357, + "num_input_tokens_seen": 125110070, + "step": 5815, + "time_per_iteration": 2.7839486598968506 + }, + { + "auxiliary_loss_clip": 0.01117022, + "auxiliary_loss_mlp": 0.01031007, + "balance_loss_clip": 1.04850423, + "balance_loss_mlp": 1.01695168, + "epoch": 0.34967683751690964, + "flos": 17414843938560.0, + "grad_norm": 2.1137892099104674, + "language_loss": 0.77541941, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.79689968, + "num_input_tokens_seen": 125125730, + "step": 5816, + "time_per_iteration": 2.6244633197784424 + }, + { + "auxiliary_loss_clip": 0.01122041, + "auxiliary_loss_mlp": 0.01042966, + "balance_loss_clip": 1.05198002, + "balance_loss_mlp": 1.0282141, + "epoch": 0.3497369607695776, + "flos": 22600021691520.0, + "grad_norm": 2.2643435778821246, + "language_loss": 0.5898062, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.61145627, + "num_input_tokens_seen": 125146195, + "step": 5817, + "time_per_iteration": 2.676058530807495 + }, + { + "auxiliary_loss_clip": 0.01065616, + "auxiliary_loss_mlp": 0.01004328, + "balance_loss_clip": 1.03704262, + "balance_loss_mlp": 1.00290895, + "epoch": 0.34979708402224563, + "flos": 68529374818560.0, + "grad_norm": 0.8661744616347857, + "language_loss": 0.59915632, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.61985576, + "num_input_tokens_seen": 125207790, + "step": 5818, + "time_per_iteration": 3.2298331260681152 + }, + { + "auxiliary_loss_clip": 0.0109396, + "auxiliary_loss_mlp": 0.01044055, + "balance_loss_clip": 1.04599476, + "balance_loss_mlp": 1.02892733, + "epoch": 0.3498572072749136, + "flos": 18989634988800.0, + "grad_norm": 2.0582091611638713, + "language_loss": 0.83473527, + "learning_rate": 3.019386568567123e-06, + "loss": 0.85611546, + "num_input_tokens_seen": 125226220, + "step": 5819, + "time_per_iteration": 2.6558237075805664 + }, + { + "auxiliary_loss_clip": 0.01106439, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.04502416, + "balance_loss_mlp": 1.01987886, + "epoch": 0.34991733052758156, + "flos": 27818883423360.0, + "grad_norm": 1.848700539441483, + "language_loss": 0.7078613, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.72926915, + "num_input_tokens_seen": 125247485, + "step": 5820, + "time_per_iteration": 2.703023672103882 + }, + { + "auxiliary_loss_clip": 0.01122902, + "auxiliary_loss_mlp": 0.01036767, + "balance_loss_clip": 1.04821718, + "balance_loss_mlp": 1.02288496, + "epoch": 0.3499774537802495, + "flos": 33584197737600.0, + "grad_norm": 1.691680241057735, + "language_loss": 0.70418453, + "learning_rate": 3.018716339744759e-06, + "loss": 0.7257812, + "num_input_tokens_seen": 125268625, + "step": 5821, + "time_per_iteration": 2.7258172035217285 + }, + { + "auxiliary_loss_clip": 0.01128016, + "auxiliary_loss_mlp": 0.01045237, + "balance_loss_clip": 1.05040097, + "balance_loss_mlp": 1.02945328, + "epoch": 0.3500375770329175, + "flos": 23476744851840.0, + "grad_norm": 3.022669367007059, + "language_loss": 0.73552108, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.75725359, + "num_input_tokens_seen": 125287530, + "step": 5822, + "time_per_iteration": 2.6288442611694336 + }, + { + "auxiliary_loss_clip": 0.01111612, + "auxiliary_loss_mlp": 0.01034787, + "balance_loss_clip": 1.04867673, + "balance_loss_mlp": 1.0193131, + "epoch": 0.35009770028558546, + "flos": 19026048401280.0, + "grad_norm": 13.86145468617928, + "language_loss": 0.78286207, + "learning_rate": 3.018045956403094e-06, + "loss": 0.80432606, + "num_input_tokens_seen": 125307020, + "step": 5823, + "time_per_iteration": 2.585644245147705 + }, + { + "auxiliary_loss_clip": 0.01050549, + "auxiliary_loss_mlp": 0.01002993, + "balance_loss_clip": 1.03169346, + "balance_loss_mlp": 1.00141954, + "epoch": 0.3501578235382534, + "flos": 68351868783360.0, + "grad_norm": 0.7268668465066358, + "language_loss": 0.59232962, + "learning_rate": 3.017710706819298e-06, + "loss": 0.61286497, + "num_input_tokens_seen": 125370445, + "step": 5824, + "time_per_iteration": 3.2155251502990723 + }, + { + "auxiliary_loss_clip": 0.01110681, + "auxiliary_loss_mlp": 0.01041197, + "balance_loss_clip": 1.04737854, + "balance_loss_mlp": 1.02561092, + "epoch": 0.3502179467909214, + "flos": 21250893836160.0, + "grad_norm": 3.9873136748139126, + "language_loss": 0.84533477, + "learning_rate": 3.017375418643811e-06, + "loss": 0.86685359, + "num_input_tokens_seen": 125388900, + "step": 5825, + "time_per_iteration": 2.687849998474121 + }, + { + "auxiliary_loss_clip": 0.01123129, + "auxiliary_loss_mlp": 0.00772852, + "balance_loss_clip": 1.04982102, + "balance_loss_mlp": 1.00084817, + "epoch": 0.35027807004358935, + "flos": 11942955826560.0, + "grad_norm": 3.7970216760931654, + "language_loss": 0.83272213, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.85168195, + "num_input_tokens_seen": 125402675, + "step": 5826, + "time_per_iteration": 2.623713970184326 + }, + { + "auxiliary_loss_clip": 0.01108751, + "auxiliary_loss_mlp": 0.01045941, + "balance_loss_clip": 1.04680669, + "balance_loss_mlp": 1.0308249, + "epoch": 0.3503381932962573, + "flos": 21470918595840.0, + "grad_norm": 1.799644232020304, + "language_loss": 0.8068707, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.82841766, + "num_input_tokens_seen": 125421360, + "step": 5827, + "time_per_iteration": 2.7149739265441895 + }, + { + "auxiliary_loss_clip": 0.01080927, + "auxiliary_loss_mlp": 0.01041383, + "balance_loss_clip": 1.04276204, + "balance_loss_mlp": 1.02641606, + "epoch": 0.3503983165489253, + "flos": 21251109317760.0, + "grad_norm": 3.105536532024743, + "language_loss": 0.71077561, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.73199868, + "num_input_tokens_seen": 125440000, + "step": 5828, + "time_per_iteration": 2.7468550205230713 + }, + { + "auxiliary_loss_clip": 0.01126682, + "auxiliary_loss_mlp": 0.01050267, + "balance_loss_clip": 1.05060673, + "balance_loss_mlp": 1.0323143, + "epoch": 0.35045843980159325, + "flos": 27815723026560.0, + "grad_norm": 2.750124615693701, + "language_loss": 0.79695857, + "learning_rate": 3.016033880279248e-06, + "loss": 0.81872809, + "num_input_tokens_seen": 125460390, + "step": 5829, + "time_per_iteration": 2.6937646865844727 + }, + { + "auxiliary_loss_clip": 0.01096574, + "auxiliary_loss_mlp": 0.01044418, + "balance_loss_clip": 1.0481379, + "balance_loss_mlp": 1.02766919, + "epoch": 0.3505185630542612, + "flos": 25921148169600.0, + "grad_norm": 1.9090298023730403, + "language_loss": 0.72606629, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.74747616, + "num_input_tokens_seen": 125478410, + "step": 5830, + "time_per_iteration": 2.7369346618652344 + }, + { + "auxiliary_loss_clip": 0.01090166, + "auxiliary_loss_mlp": 0.01037306, + "balance_loss_clip": 1.04190445, + "balance_loss_mlp": 1.02131414, + "epoch": 0.35057868630692923, + "flos": 20521763660160.0, + "grad_norm": 2.5268343856675437, + "language_loss": 0.88473773, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.90601242, + "num_input_tokens_seen": 125495975, + "step": 5831, + "time_per_iteration": 2.716801166534424 + }, + { + "auxiliary_loss_clip": 0.01076431, + "auxiliary_loss_mlp": 0.01046131, + "balance_loss_clip": 1.04348278, + "balance_loss_mlp": 1.03036547, + "epoch": 0.3506388095595972, + "flos": 20448649526400.0, + "grad_norm": 2.8335622037275052, + "language_loss": 0.78706706, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.80829263, + "num_input_tokens_seen": 125515035, + "step": 5832, + "time_per_iteration": 2.719874143600464 + }, + { + "auxiliary_loss_clip": 0.01096023, + "auxiliary_loss_mlp": 0.01049214, + "balance_loss_clip": 1.04483593, + "balance_loss_mlp": 1.0303669, + "epoch": 0.35069893281226516, + "flos": 23109665811840.0, + "grad_norm": 2.771771323399588, + "language_loss": 0.71084702, + "learning_rate": 3.014691725465008e-06, + "loss": 0.73229945, + "num_input_tokens_seen": 125535555, + "step": 5833, + "time_per_iteration": 2.729029655456543 + }, + { + "auxiliary_loss_clip": 0.0111933, + "auxiliary_loss_mlp": 0.01035784, + "balance_loss_clip": 1.04690456, + "balance_loss_mlp": 1.02119827, + "epoch": 0.35075905606493313, + "flos": 27271999877760.0, + "grad_norm": 1.4652984704802052, + "language_loss": 0.80866987, + "learning_rate": 3.014356090536606e-06, + "loss": 0.830221, + "num_input_tokens_seen": 125558195, + "step": 5834, + "time_per_iteration": 2.6999855041503906 + }, + { + "auxiliary_loss_clip": 0.01086162, + "auxiliary_loss_mlp": 0.01041057, + "balance_loss_clip": 1.05142856, + "balance_loss_mlp": 1.02516639, + "epoch": 0.3508191793176011, + "flos": 19128608709120.0, + "grad_norm": 2.24398587431922, + "language_loss": 0.84067535, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.86194754, + "num_input_tokens_seen": 125575375, + "step": 5835, + "time_per_iteration": 2.7401607036590576 + }, + { + "auxiliary_loss_clip": 0.01072219, + "auxiliary_loss_mlp": 0.0104369, + "balance_loss_clip": 1.04324877, + "balance_loss_mlp": 1.02816927, + "epoch": 0.35087930257026906, + "flos": 25557588662400.0, + "grad_norm": 1.6286460178957367, + "language_loss": 0.76643491, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.78759408, + "num_input_tokens_seen": 125596745, + "step": 5836, + "time_per_iteration": 2.767824649810791 + }, + { + "auxiliary_loss_clip": 0.01095252, + "auxiliary_loss_mlp": 0.01044499, + "balance_loss_clip": 1.04785156, + "balance_loss_mlp": 1.02751756, + "epoch": 0.350939425822937, + "flos": 18004246208640.0, + "grad_norm": 2.0145924652365945, + "language_loss": 0.77402902, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.79542655, + "num_input_tokens_seen": 125613980, + "step": 5837, + "time_per_iteration": 2.684300661087036 + }, + { + "auxiliary_loss_clip": 0.01122261, + "auxiliary_loss_mlp": 0.01044889, + "balance_loss_clip": 1.04895687, + "balance_loss_mlp": 1.02941537, + "epoch": 0.350999549075605, + "flos": 22273198819200.0, + "grad_norm": 2.68275803808264, + "language_loss": 0.67695981, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.69863135, + "num_input_tokens_seen": 125632100, + "step": 5838, + "time_per_iteration": 2.6679129600524902 + }, + { + "auxiliary_loss_clip": 0.01133084, + "auxiliary_loss_mlp": 0.01041419, + "balance_loss_clip": 1.04808521, + "balance_loss_mlp": 1.02538526, + "epoch": 0.35105967232827295, + "flos": 14392279307520.0, + "grad_norm": 2.478699358378921, + "language_loss": 0.83575064, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.85749567, + "num_input_tokens_seen": 125649190, + "step": 5839, + "time_per_iteration": 2.7186849117279053 + }, + { + "auxiliary_loss_clip": 0.01125827, + "auxiliary_loss_mlp": 0.01045138, + "balance_loss_clip": 1.0484879, + "balance_loss_mlp": 1.02930689, + "epoch": 0.3511197955809409, + "flos": 25082346792960.0, + "grad_norm": 2.56286420283892, + "language_loss": 0.58882701, + "learning_rate": 3.012341473657572e-06, + "loss": 0.61053669, + "num_input_tokens_seen": 125668680, + "step": 5840, + "time_per_iteration": 2.7048165798187256 + }, + { + "auxiliary_loss_clip": 0.01093858, + "auxiliary_loss_mlp": 0.01043209, + "balance_loss_clip": 1.0449121, + "balance_loss_mlp": 1.02719963, + "epoch": 0.3511799188336089, + "flos": 25884160139520.0, + "grad_norm": 2.762376787670534, + "language_loss": 0.87442869, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.89579934, + "num_input_tokens_seen": 125686935, + "step": 5841, + "time_per_iteration": 2.763007402420044 + }, + { + "auxiliary_loss_clip": 0.01116677, + "auxiliary_loss_mlp": 0.01038697, + "balance_loss_clip": 1.04990196, + "balance_loss_mlp": 1.02083993, + "epoch": 0.35124004208627685, + "flos": 20083725302400.0, + "grad_norm": 1.9868500880648916, + "language_loss": 0.75116056, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.77271438, + "num_input_tokens_seen": 125707180, + "step": 5842, + "time_per_iteration": 2.703010082244873 + }, + { + "auxiliary_loss_clip": 0.01124735, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_clip": 1.04863322, + "balance_loss_mlp": 1.0302043, + "epoch": 0.3513001653389448, + "flos": 17783431349760.0, + "grad_norm": 2.134458584945634, + "language_loss": 0.68687361, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.70857882, + "num_input_tokens_seen": 125722780, + "step": 5843, + "time_per_iteration": 2.6459767818450928 + }, + { + "auxiliary_loss_clip": 0.01135637, + "auxiliary_loss_mlp": 0.01046534, + "balance_loss_clip": 1.05054379, + "balance_loss_mlp": 1.0305481, + "epoch": 0.3513602885916128, + "flos": 29387138198400.0, + "grad_norm": 2.0610262324560984, + "language_loss": 0.65392244, + "learning_rate": 3.010997627806655e-06, + "loss": 0.67574418, + "num_input_tokens_seen": 125742110, + "step": 5844, + "time_per_iteration": 2.6542131900787354 + }, + { + "auxiliary_loss_clip": 0.01119986, + "auxiliary_loss_mlp": 0.01042575, + "balance_loss_clip": 1.04791713, + "balance_loss_mlp": 1.02620745, + "epoch": 0.3514204118442808, + "flos": 16179876483840.0, + "grad_norm": 2.0120705985466394, + "language_loss": 0.75180912, + "learning_rate": 3.010661570469245e-06, + "loss": 0.77343476, + "num_input_tokens_seen": 125759980, + "step": 5845, + "time_per_iteration": 2.686753511428833 + }, + { + "auxiliary_loss_clip": 0.01122626, + "auxiliary_loss_mlp": 0.01043989, + "balance_loss_clip": 1.0485301, + "balance_loss_mlp": 1.02835488, + "epoch": 0.35148053509694877, + "flos": 23834665923840.0, + "grad_norm": 4.021226487899694, + "language_loss": 0.73548663, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.7571528, + "num_input_tokens_seen": 125772660, + "step": 5846, + "time_per_iteration": 2.67868971824646 + }, + { + "auxiliary_loss_clip": 0.01094187, + "auxiliary_loss_mlp": 0.01044379, + "balance_loss_clip": 1.04565465, + "balance_loss_mlp": 1.02834511, + "epoch": 0.35154065834961673, + "flos": 20991295267200.0, + "grad_norm": 1.687499817432144, + "language_loss": 0.756024, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.77740967, + "num_input_tokens_seen": 125791935, + "step": 5847, + "time_per_iteration": 2.749495267868042 + }, + { + "auxiliary_loss_clip": 0.011087, + "auxiliary_loss_mlp": 0.01034036, + "balance_loss_clip": 1.04465413, + "balance_loss_mlp": 1.01871789, + "epoch": 0.3516007816022847, + "flos": 33255471444480.0, + "grad_norm": 2.8847551511625675, + "language_loss": 0.71752924, + "learning_rate": 3.009653168561666e-06, + "loss": 0.73895657, + "num_input_tokens_seen": 125813455, + "step": 5848, + "time_per_iteration": 4.367843151092529 + }, + { + "auxiliary_loss_clip": 0.0111724, + "auxiliary_loss_mlp": 0.01051356, + "balance_loss_clip": 1.04754996, + "balance_loss_mlp": 1.03528619, + "epoch": 0.35166090485495266, + "flos": 11726953390080.0, + "grad_norm": 2.1303857634409455, + "language_loss": 0.89211285, + "learning_rate": 3.009316958003178e-06, + "loss": 0.91379881, + "num_input_tokens_seen": 125827660, + "step": 5849, + "time_per_iteration": 2.720156192779541 + }, + { + "auxiliary_loss_clip": 0.01112345, + "auxiliary_loss_mlp": 0.01035199, + "balance_loss_clip": 1.04670548, + "balance_loss_mlp": 1.01948714, + "epoch": 0.3517210281076206, + "flos": 22638446265600.0, + "grad_norm": 5.671837642447228, + "language_loss": 0.74645329, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.76792872, + "num_input_tokens_seen": 125846655, + "step": 5850, + "time_per_iteration": 5.769666910171509 + }, + { + "auxiliary_loss_clip": 0.01124277, + "auxiliary_loss_mlp": 0.01039165, + "balance_loss_clip": 1.05061293, + "balance_loss_mlp": 1.02304828, + "epoch": 0.3517811513602886, + "flos": 21322750993920.0, + "grad_norm": 4.453824391316201, + "language_loss": 0.75497609, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.77661049, + "num_input_tokens_seen": 125866290, + "step": 5851, + "time_per_iteration": 2.6903436183929443 + }, + { + "auxiliary_loss_clip": 0.01109028, + "auxiliary_loss_mlp": 0.01043585, + "balance_loss_clip": 1.047647, + "balance_loss_mlp": 1.02581048, + "epoch": 0.35184127461295656, + "flos": 21032880238080.0, + "grad_norm": 2.6842208339362714, + "language_loss": 0.8711859, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.892712, + "num_input_tokens_seen": 125884620, + "step": 5852, + "time_per_iteration": 4.37211275100708 + }, + { + "auxiliary_loss_clip": 0.01134086, + "auxiliary_loss_mlp": 0.01034974, + "balance_loss_clip": 1.05088282, + "balance_loss_mlp": 1.02020407, + "epoch": 0.3519013978656245, + "flos": 22455265881600.0, + "grad_norm": 4.894656899057391, + "language_loss": 0.67756367, + "learning_rate": 3.007971733162737e-06, + "loss": 0.69925427, + "num_input_tokens_seen": 125902430, + "step": 5853, + "time_per_iteration": 2.6657445430755615 + }, + { + "auxiliary_loss_clip": 0.0110992, + "auxiliary_loss_mlp": 0.01035315, + "balance_loss_clip": 1.04499912, + "balance_loss_mlp": 1.01943672, + "epoch": 0.3519615211182925, + "flos": 13115295918720.0, + "grad_norm": 1.9396695842158058, + "language_loss": 0.80834955, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.82980192, + "num_input_tokens_seen": 125920570, + "step": 5854, + "time_per_iteration": 2.741804361343384 + }, + { + "auxiliary_loss_clip": 0.0111683, + "auxiliary_loss_mlp": 0.01035573, + "balance_loss_clip": 1.05230534, + "balance_loss_mlp": 1.02117872, + "epoch": 0.35202164437096045, + "flos": 19135144984320.0, + "grad_norm": 2.236186864476635, + "language_loss": 0.73234653, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.75387061, + "num_input_tokens_seen": 125939800, + "step": 5855, + "time_per_iteration": 2.730731725692749 + }, + { + "auxiliary_loss_clip": 0.0113392, + "auxiliary_loss_mlp": 0.01038425, + "balance_loss_clip": 1.05024409, + "balance_loss_mlp": 1.02407861, + "epoch": 0.3520817676236284, + "flos": 26542187343360.0, + "grad_norm": 2.4482136775911427, + "language_loss": 0.71000826, + "learning_rate": 3.006962413152691e-06, + "loss": 0.73173165, + "num_input_tokens_seen": 125958720, + "step": 5856, + "time_per_iteration": 2.632906436920166 + }, + { + "auxiliary_loss_clip": 0.01121339, + "auxiliary_loss_mlp": 0.01047265, + "balance_loss_clip": 1.0479008, + "balance_loss_mlp": 1.03056359, + "epoch": 0.3521418908762964, + "flos": 44893472803200.0, + "grad_norm": 1.9582827204032656, + "language_loss": 0.61505377, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.63673985, + "num_input_tokens_seen": 125984310, + "step": 5857, + "time_per_iteration": 2.8992249965667725 + }, + { + "auxiliary_loss_clip": 0.01126198, + "auxiliary_loss_mlp": 0.0103782, + "balance_loss_clip": 1.05141187, + "balance_loss_mlp": 1.02197754, + "epoch": 0.3522020141289644, + "flos": 20187398931840.0, + "grad_norm": 2.047463358229584, + "language_loss": 0.73246485, + "learning_rate": 3.006289342204152e-06, + "loss": 0.75410509, + "num_input_tokens_seen": 126002410, + "step": 5858, + "time_per_iteration": 2.6754567623138428 + }, + { + "auxiliary_loss_clip": 0.01139705, + "auxiliary_loss_mlp": 0.01044718, + "balance_loss_clip": 1.05193448, + "balance_loss_mlp": 1.028947, + "epoch": 0.35226213738163237, + "flos": 27563917708800.0, + "grad_norm": 1.8174320112537778, + "language_loss": 0.7662344, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.78807867, + "num_input_tokens_seen": 126022490, + "step": 5859, + "time_per_iteration": 2.6464414596557617 + }, + { + "auxiliary_loss_clip": 0.01123734, + "auxiliary_loss_mlp": 0.0104748, + "balance_loss_clip": 1.05600715, + "balance_loss_mlp": 1.03037381, + "epoch": 0.35232226063430033, + "flos": 22966310632320.0, + "grad_norm": 2.0728265984729974, + "language_loss": 0.71452159, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.73623371, + "num_input_tokens_seen": 126042895, + "step": 5860, + "time_per_iteration": 2.7567954063415527 + }, + { + "auxiliary_loss_clip": 0.01107752, + "auxiliary_loss_mlp": 0.01042463, + "balance_loss_clip": 1.04505348, + "balance_loss_mlp": 1.02517724, + "epoch": 0.3523823838869683, + "flos": 19168290259200.0, + "grad_norm": 2.4820154826508896, + "language_loss": 0.66456246, + "learning_rate": 3.005279449623811e-06, + "loss": 0.6860646, + "num_input_tokens_seen": 126060130, + "step": 5861, + "time_per_iteration": 2.6954853534698486 + }, + { + "auxiliary_loss_clip": 0.01114832, + "auxiliary_loss_mlp": 0.01037396, + "balance_loss_clip": 1.05085611, + "balance_loss_mlp": 1.0220778, + "epoch": 0.35244250713963626, + "flos": 17930988420480.0, + "grad_norm": 2.552495084661914, + "language_loss": 0.66833258, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.68985492, + "num_input_tokens_seen": 126077850, + "step": 5862, + "time_per_iteration": 2.758626699447632 + }, + { + "auxiliary_loss_clip": 0.01111543, + "auxiliary_loss_mlp": 0.01046885, + "balance_loss_clip": 1.04932082, + "balance_loss_mlp": 1.02999306, + "epoch": 0.35250263039230423, + "flos": 21432529935360.0, + "grad_norm": 2.001922070828984, + "language_loss": 0.77027225, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.79185653, + "num_input_tokens_seen": 126095985, + "step": 5863, + "time_per_iteration": 2.692974328994751 + }, + { + "auxiliary_loss_clip": 0.01124448, + "auxiliary_loss_mlp": 0.01041257, + "balance_loss_clip": 1.05029762, + "balance_loss_mlp": 1.02602828, + "epoch": 0.3525627536449722, + "flos": 27416863428480.0, + "grad_norm": 2.204178263750967, + "language_loss": 0.75406265, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.77571976, + "num_input_tokens_seen": 126116070, + "step": 5864, + "time_per_iteration": 2.7303273677825928 + }, + { + "auxiliary_loss_clip": 0.01124417, + "auxiliary_loss_mlp": 0.01048097, + "balance_loss_clip": 1.04847336, + "balance_loss_mlp": 1.03237331, + "epoch": 0.35262287689764016, + "flos": 24789818430720.0, + "grad_norm": 2.3571129928423713, + "language_loss": 0.79312253, + "learning_rate": 3.003932392558793e-06, + "loss": 0.81484771, + "num_input_tokens_seen": 126135205, + "step": 5865, + "time_per_iteration": 2.6439075469970703 + }, + { + "auxiliary_loss_clip": 0.01136688, + "auxiliary_loss_mlp": 0.01047929, + "balance_loss_clip": 1.05626893, + "balance_loss_mlp": 1.03143001, + "epoch": 0.3526830001503081, + "flos": 17821604528640.0, + "grad_norm": 2.261768767041389, + "language_loss": 0.81215894, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.83400512, + "num_input_tokens_seen": 126151895, + "step": 5866, + "time_per_iteration": 2.649991035461426 + }, + { + "auxiliary_loss_clip": 0.01095064, + "auxiliary_loss_mlp": 0.01040513, + "balance_loss_clip": 1.04940605, + "balance_loss_mlp": 1.0227983, + "epoch": 0.3527431234029761, + "flos": 18078114528000.0, + "grad_norm": 2.4092573216113182, + "language_loss": 0.84224141, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.86359721, + "num_input_tokens_seen": 126168515, + "step": 5867, + "time_per_iteration": 2.7634172439575195 + }, + { + "auxiliary_loss_clip": 0.01142449, + "auxiliary_loss_mlp": 0.01051484, + "balance_loss_clip": 1.05421114, + "balance_loss_mlp": 1.03525996, + "epoch": 0.35280324665564405, + "flos": 19427350124160.0, + "grad_norm": 1.8115003163784764, + "language_loss": 0.74367464, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.76561391, + "num_input_tokens_seen": 126186460, + "step": 5868, + "time_per_iteration": 2.5986721515655518 + }, + { + "auxiliary_loss_clip": 0.01131163, + "auxiliary_loss_mlp": 0.01040977, + "balance_loss_clip": 1.05391645, + "balance_loss_mlp": 1.02457356, + "epoch": 0.352863369908312, + "flos": 21504027957120.0, + "grad_norm": 1.9536193185751474, + "language_loss": 0.6105355, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.63225693, + "num_input_tokens_seen": 126206170, + "step": 5869, + "time_per_iteration": 2.6737887859344482 + }, + { + "auxiliary_loss_clip": 0.0112854, + "auxiliary_loss_mlp": 0.01048512, + "balance_loss_clip": 1.05128717, + "balance_loss_mlp": 1.03254998, + "epoch": 0.35292349316098, + "flos": 22309504490880.0, + "grad_norm": 2.4234624332717347, + "language_loss": 0.74279565, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.76456618, + "num_input_tokens_seen": 126225605, + "step": 5870, + "time_per_iteration": 2.6921114921569824 + }, + { + "auxiliary_loss_clip": 0.01126478, + "auxiliary_loss_mlp": 0.01039703, + "balance_loss_clip": 1.05037582, + "balance_loss_mlp": 1.02376485, + "epoch": 0.352983616413648, + "flos": 33109745967360.0, + "grad_norm": 1.6641276231491144, + "language_loss": 0.71796882, + "learning_rate": 3.001910665140316e-06, + "loss": 0.73963058, + "num_input_tokens_seen": 126250230, + "step": 5871, + "time_per_iteration": 2.8457682132720947 + }, + { + "auxiliary_loss_clip": 0.01120204, + "auxiliary_loss_mlp": 0.01040363, + "balance_loss_clip": 1.04829907, + "balance_loss_mlp": 1.02547359, + "epoch": 0.35304373966631597, + "flos": 18696603836160.0, + "grad_norm": 2.0001362497177233, + "language_loss": 0.73279023, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.75439584, + "num_input_tokens_seen": 126268315, + "step": 5872, + "time_per_iteration": 2.6763055324554443 + }, + { + "auxiliary_loss_clip": 0.01114426, + "auxiliary_loss_mlp": 0.0077352, + "balance_loss_clip": 1.04808497, + "balance_loss_mlp": 1.00056779, + "epoch": 0.35310386291898394, + "flos": 23364954748800.0, + "grad_norm": 1.9067005964756008, + "language_loss": 0.82472706, + "learning_rate": 3.001236451924089e-06, + "loss": 0.84360659, + "num_input_tokens_seen": 126288390, + "step": 5873, + "time_per_iteration": 2.7487120628356934 + }, + { + "auxiliary_loss_clip": 0.0111852, + "auxiliary_loss_mlp": 0.01055173, + "balance_loss_clip": 1.04805684, + "balance_loss_mlp": 1.03743458, + "epoch": 0.3531639861716519, + "flos": 24461954064000.0, + "grad_norm": 2.0747562837168956, + "language_loss": 0.65867126, + "learning_rate": 3.000899288359104e-06, + "loss": 0.68040824, + "num_input_tokens_seen": 126305750, + "step": 5874, + "time_per_iteration": 2.717100143432617 + }, + { + "auxiliary_loss_clip": 0.01065517, + "auxiliary_loss_mlp": 0.01018804, + "balance_loss_clip": 1.04397154, + "balance_loss_mlp": 1.01712346, + "epoch": 0.35322410942431987, + "flos": 70312446881280.0, + "grad_norm": 0.7718710282270123, + "language_loss": 0.61513722, + "learning_rate": 3.000562086839917e-06, + "loss": 0.63598049, + "num_input_tokens_seen": 126362495, + "step": 5875, + "time_per_iteration": 3.1768009662628174 + }, + { + "auxiliary_loss_clip": 0.0106968, + "auxiliary_loss_mlp": 0.01053019, + "balance_loss_clip": 1.04069328, + "balance_loss_mlp": 1.03722405, + "epoch": 0.35328423267698783, + "flos": 19820894509440.0, + "grad_norm": 1.9274751499515825, + "language_loss": 0.79748046, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.81870747, + "num_input_tokens_seen": 126378320, + "step": 5876, + "time_per_iteration": 2.7911314964294434 + }, + { + "auxiliary_loss_clip": 0.01038976, + "auxiliary_loss_mlp": 0.00753375, + "balance_loss_clip": 1.03853297, + "balance_loss_mlp": 1.00146759, + "epoch": 0.3533443559296558, + "flos": 60826356391680.0, + "grad_norm": 0.6715924709851474, + "language_loss": 0.56771934, + "learning_rate": 2.999887569990088e-06, + "loss": 0.58564281, + "num_input_tokens_seen": 126442735, + "step": 5877, + "time_per_iteration": 3.3190126419067383 + }, + { + "auxiliary_loss_clip": 0.01106988, + "auxiliary_loss_mlp": 0.0103768, + "balance_loss_clip": 1.04755747, + "balance_loss_mlp": 1.02150357, + "epoch": 0.35340447918232376, + "flos": 24755775315840.0, + "grad_norm": 2.262624772342981, + "language_loss": 0.72041059, + "learning_rate": 2.999550254685024e-06, + "loss": 0.74185729, + "num_input_tokens_seen": 126463090, + "step": 5878, + "time_per_iteration": 2.769482135772705 + }, + { + "auxiliary_loss_clip": 0.01111223, + "auxiliary_loss_mlp": 0.01039233, + "balance_loss_clip": 1.0494144, + "balance_loss_mlp": 1.02333045, + "epoch": 0.3534646024349917, + "flos": 21796304924160.0, + "grad_norm": 1.9529875004972157, + "language_loss": 0.78282005, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.80432463, + "num_input_tokens_seen": 126482105, + "step": 5879, + "time_per_iteration": 2.7066614627838135 + }, + { + "auxiliary_loss_clip": 0.01111375, + "auxiliary_loss_mlp": 0.01046843, + "balance_loss_clip": 1.05344558, + "balance_loss_mlp": 1.0287354, + "epoch": 0.3535247256876597, + "flos": 20012119539840.0, + "grad_norm": 2.4774809869114547, + "language_loss": 0.63312674, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.65470898, + "num_input_tokens_seen": 126502125, + "step": 5880, + "time_per_iteration": 2.87187123298645 + }, + { + "auxiliary_loss_clip": 0.01116729, + "auxiliary_loss_mlp": 0.01037267, + "balance_loss_clip": 1.05014002, + "balance_loss_mlp": 1.02067327, + "epoch": 0.35358484894032766, + "flos": 18187929383040.0, + "grad_norm": 2.079670586085082, + "language_loss": 0.65503716, + "learning_rate": 2.998538081402727e-06, + "loss": 0.67657715, + "num_input_tokens_seen": 126521950, + "step": 5881, + "time_per_iteration": 2.701570510864258 + }, + { + "auxiliary_loss_clip": 0.01119778, + "auxiliary_loss_mlp": 0.01035576, + "balance_loss_clip": 1.05182576, + "balance_loss_mlp": 1.02047253, + "epoch": 0.3536449721929956, + "flos": 22820369673600.0, + "grad_norm": 1.437925300063569, + "language_loss": 0.75797737, + "learning_rate": 2.998200614562239e-06, + "loss": 0.77953088, + "num_input_tokens_seen": 126542445, + "step": 5882, + "time_per_iteration": 2.713350772857666 + }, + { + "auxiliary_loss_clip": 0.01112568, + "auxiliary_loss_mlp": 0.01044857, + "balance_loss_clip": 1.0485872, + "balance_loss_mlp": 1.02591491, + "epoch": 0.3537050954456636, + "flos": 26432336574720.0, + "grad_norm": 2.160470372067537, + "language_loss": 0.70095098, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.72252524, + "num_input_tokens_seen": 126560690, + "step": 5883, + "time_per_iteration": 2.77695631980896 + }, + { + "auxiliary_loss_clip": 0.01107169, + "auxiliary_loss_mlp": 0.01040706, + "balance_loss_clip": 1.04937398, + "balance_loss_mlp": 1.02364671, + "epoch": 0.3537652186983316, + "flos": 17197153562880.0, + "grad_norm": 3.3935912100169117, + "language_loss": 0.78052664, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.80200535, + "num_input_tokens_seen": 126577620, + "step": 5884, + "time_per_iteration": 2.8704800605773926 + }, + { + "auxiliary_loss_clip": 0.0111409, + "auxiliary_loss_mlp": 0.01036742, + "balance_loss_clip": 1.05093837, + "balance_loss_mlp": 1.02157819, + "epoch": 0.3538253419509996, + "flos": 19536769929600.0, + "grad_norm": 1.9052381201351025, + "language_loss": 0.7519542, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.77346253, + "num_input_tokens_seen": 126596235, + "step": 5885, + "time_per_iteration": 2.74930477142334 + }, + { + "auxiliary_loss_clip": 0.01088229, + "auxiliary_loss_mlp": 0.01040915, + "balance_loss_clip": 1.04355764, + "balance_loss_mlp": 1.02321255, + "epoch": 0.35388546520366754, + "flos": 12128578335360.0, + "grad_norm": 3.360136520151105, + "language_loss": 0.83904099, + "learning_rate": 2.996850368809606e-06, + "loss": 0.86033243, + "num_input_tokens_seen": 126612830, + "step": 5886, + "time_per_iteration": 2.9362361431121826 + }, + { + "auxiliary_loss_clip": 0.01139122, + "auxiliary_loss_mlp": 0.01039479, + "balance_loss_clip": 1.05223978, + "balance_loss_mlp": 1.02178788, + "epoch": 0.3539455884563355, + "flos": 19678149861120.0, + "grad_norm": 2.3342407880968765, + "language_loss": 0.78239143, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.8041774, + "num_input_tokens_seen": 126630910, + "step": 5887, + "time_per_iteration": 4.157519340515137 + }, + { + "auxiliary_loss_clip": 0.01079386, + "auxiliary_loss_mlp": 0.01047635, + "balance_loss_clip": 1.04380405, + "balance_loss_mlp": 1.03155398, + "epoch": 0.35400571170900347, + "flos": 18072045129600.0, + "grad_norm": 3.4693260211189614, + "language_loss": 0.65532601, + "learning_rate": 2.996175019078089e-06, + "loss": 0.67659628, + "num_input_tokens_seen": 126648365, + "step": 5888, + "time_per_iteration": 2.7693519592285156 + }, + { + "auxiliary_loss_clip": 0.01108859, + "auxiliary_loss_mlp": 0.01038745, + "balance_loss_clip": 1.04853678, + "balance_loss_mlp": 1.02278328, + "epoch": 0.35406583496167143, + "flos": 26068058795520.0, + "grad_norm": 2.324375134725136, + "language_loss": 0.77100271, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.7924788, + "num_input_tokens_seen": 126667500, + "step": 5889, + "time_per_iteration": 4.211338996887207 + }, + { + "auxiliary_loss_clip": 0.0110217, + "auxiliary_loss_mlp": 0.01041504, + "balance_loss_clip": 1.05017257, + "balance_loss_mlp": 1.0262332, + "epoch": 0.3541259582143394, + "flos": 19792453916160.0, + "grad_norm": 2.074151752869495, + "language_loss": 0.81132901, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.83276576, + "num_input_tokens_seen": 126686820, + "step": 5890, + "time_per_iteration": 4.248823642730713 + }, + { + "auxiliary_loss_clip": 0.01112591, + "auxiliary_loss_mlp": 0.01034659, + "balance_loss_clip": 1.04692972, + "balance_loss_mlp": 1.01979923, + "epoch": 0.35418608146700736, + "flos": 24022084112640.0, + "grad_norm": 1.8036187380252735, + "language_loss": 0.79384875, + "learning_rate": 2.99516171119991e-06, + "loss": 0.81532121, + "num_input_tokens_seen": 126706965, + "step": 5891, + "time_per_iteration": 4.335815668106079 + }, + { + "auxiliary_loss_clip": 0.01099264, + "auxiliary_loss_mlp": 0.01046084, + "balance_loss_clip": 1.04669261, + "balance_loss_mlp": 1.0285244, + "epoch": 0.35424620471967533, + "flos": 12385770693120.0, + "grad_norm": 2.015603194975926, + "language_loss": 0.73404211, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.75549555, + "num_input_tokens_seen": 126724015, + "step": 5892, + "time_per_iteration": 2.760498046875 + }, + { + "auxiliary_loss_clip": 0.01112321, + "auxiliary_loss_mlp": 0.01041472, + "balance_loss_clip": 1.04650092, + "balance_loss_mlp": 1.02434158, + "epoch": 0.3543063279723433, + "flos": 19673624747520.0, + "grad_norm": 2.094655212929219, + "language_loss": 0.6720162, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.6935541, + "num_input_tokens_seen": 126737565, + "step": 5893, + "time_per_iteration": 2.671706199645996 + }, + { + "auxiliary_loss_clip": 0.01084647, + "auxiliary_loss_mlp": 0.01041527, + "balance_loss_clip": 1.04317796, + "balance_loss_mlp": 1.02440834, + "epoch": 0.35436645122501126, + "flos": 21909208348800.0, + "grad_norm": 1.9115541405313234, + "language_loss": 0.69860309, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.71986485, + "num_input_tokens_seen": 126756095, + "step": 5894, + "time_per_iteration": 2.720066785812378 + }, + { + "auxiliary_loss_clip": 0.01111006, + "auxiliary_loss_mlp": 0.00773076, + "balance_loss_clip": 1.04764175, + "balance_loss_mlp": 1.00055242, + "epoch": 0.3544265744776792, + "flos": 21719527603200.0, + "grad_norm": 1.7998653616668008, + "language_loss": 0.74833035, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.76717114, + "num_input_tokens_seen": 126775455, + "step": 5895, + "time_per_iteration": 2.8295304775238037 + }, + { + "auxiliary_loss_clip": 0.011052, + "auxiliary_loss_mlp": 0.01040742, + "balance_loss_clip": 1.04288006, + "balance_loss_mlp": 1.02485108, + "epoch": 0.3544866977303472, + "flos": 21213223447680.0, + "grad_norm": 2.053997857318945, + "language_loss": 0.83762395, + "learning_rate": 2.993472110174491e-06, + "loss": 0.85908329, + "num_input_tokens_seen": 126792320, + "step": 5896, + "time_per_iteration": 2.723158836364746 + }, + { + "auxiliary_loss_clip": 0.01111237, + "auxiliary_loss_mlp": 0.00773671, + "balance_loss_clip": 1.04756641, + "balance_loss_mlp": 1.0005331, + "epoch": 0.35454682098301515, + "flos": 29311402371840.0, + "grad_norm": 1.7709518935889355, + "language_loss": 0.70033729, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.71918637, + "num_input_tokens_seen": 126813680, + "step": 5897, + "time_per_iteration": 2.744617223739624 + }, + { + "auxiliary_loss_clip": 0.01111293, + "auxiliary_loss_mlp": 0.01046033, + "balance_loss_clip": 1.04829669, + "balance_loss_mlp": 1.02830625, + "epoch": 0.3546069442356832, + "flos": 24316587722880.0, + "grad_norm": 3.0934933528513344, + "language_loss": 0.81546402, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.83703721, + "num_input_tokens_seen": 126834395, + "step": 5898, + "time_per_iteration": 2.77911376953125 + }, + { + "auxiliary_loss_clip": 0.0113395, + "auxiliary_loss_mlp": 0.01037456, + "balance_loss_clip": 1.04943967, + "balance_loss_mlp": 1.02232838, + "epoch": 0.35466706748835114, + "flos": 22857285876480.0, + "grad_norm": 5.100417261000322, + "language_loss": 0.73975331, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.7614674, + "num_input_tokens_seen": 126855145, + "step": 5899, + "time_per_iteration": 2.6566851139068604 + }, + { + "auxiliary_loss_clip": 0.0113747, + "auxiliary_loss_mlp": 0.00772565, + "balance_loss_clip": 1.04971743, + "balance_loss_mlp": 1.00056052, + "epoch": 0.3547271907410191, + "flos": 28330107742080.0, + "grad_norm": 1.7615083390778834, + "language_loss": 0.79458243, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.81368273, + "num_input_tokens_seen": 126873790, + "step": 5900, + "time_per_iteration": 2.6658642292022705 + }, + { + "auxiliary_loss_clip": 0.0111331, + "auxiliary_loss_mlp": 0.01044824, + "balance_loss_clip": 1.04659319, + "balance_loss_mlp": 1.0288384, + "epoch": 0.35478731399368707, + "flos": 23514092017920.0, + "grad_norm": 2.160550694830747, + "language_loss": 0.81303531, + "learning_rate": 2.991781567335093e-06, + "loss": 0.83461666, + "num_input_tokens_seen": 126892865, + "step": 5901, + "time_per_iteration": 2.711568593978882 + }, + { + "auxiliary_loss_clip": 0.01125037, + "auxiliary_loss_mlp": 0.00772744, + "balance_loss_clip": 1.05092883, + "balance_loss_mlp": 1.00049663, + "epoch": 0.35484743724635504, + "flos": 18624315715200.0, + "grad_norm": 2.0558354102165373, + "language_loss": 0.75869077, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.7776686, + "num_input_tokens_seen": 126911935, + "step": 5902, + "time_per_iteration": 2.6833012104034424 + }, + { + "auxiliary_loss_clip": 0.01123978, + "auxiliary_loss_mlp": 0.01036322, + "balance_loss_clip": 1.04852581, + "balance_loss_mlp": 1.02142096, + "epoch": 0.354907560499023, + "flos": 17384499924480.0, + "grad_norm": 2.534328384273088, + "language_loss": 0.70550704, + "learning_rate": 2.991105086850381e-06, + "loss": 0.72711003, + "num_input_tokens_seen": 126930040, + "step": 5903, + "time_per_iteration": 2.689303159713745 + }, + { + "auxiliary_loss_clip": 0.01128401, + "auxiliary_loss_mlp": 0.01036477, + "balance_loss_clip": 1.05025887, + "balance_loss_mlp": 1.02051437, + "epoch": 0.35496768375169097, + "flos": 19208546426880.0, + "grad_norm": 3.3775979872187203, + "language_loss": 0.7448622, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.76651096, + "num_input_tokens_seen": 126948390, + "step": 5904, + "time_per_iteration": 2.6360747814178467 + }, + { + "auxiliary_loss_clip": 0.01113034, + "auxiliary_loss_mlp": 0.00772738, + "balance_loss_clip": 1.04721618, + "balance_loss_mlp": 1.000543, + "epoch": 0.35502780700435893, + "flos": 18332792933760.0, + "grad_norm": 3.051840518778985, + "language_loss": 0.78653091, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.80538863, + "num_input_tokens_seen": 126964905, + "step": 5905, + "time_per_iteration": 2.8539419174194336 + }, + { + "auxiliary_loss_clip": 0.01101916, + "auxiliary_loss_mlp": 0.01038927, + "balance_loss_clip": 1.04842138, + "balance_loss_mlp": 1.02486014, + "epoch": 0.3550879302570269, + "flos": 15448555578240.0, + "grad_norm": 18.846860460510154, + "language_loss": 0.72740704, + "learning_rate": 2.990090084284356e-06, + "loss": 0.74881542, + "num_input_tokens_seen": 126982000, + "step": 5906, + "time_per_iteration": 2.7013392448425293 + }, + { + "auxiliary_loss_clip": 0.01109726, + "auxiliary_loss_mlp": 0.01039804, + "balance_loss_clip": 1.04908431, + "balance_loss_mlp": 1.02265012, + "epoch": 0.35514805350969486, + "flos": 21979197999360.0, + "grad_norm": 1.821131131528883, + "language_loss": 0.74746358, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.76895893, + "num_input_tokens_seen": 126998390, + "step": 5907, + "time_per_iteration": 2.7603847980499268 + }, + { + "auxiliary_loss_clip": 0.01062812, + "auxiliary_loss_mlp": 0.01042872, + "balance_loss_clip": 1.03682017, + "balance_loss_mlp": 1.02463293, + "epoch": 0.3552081767623628, + "flos": 29861949104640.0, + "grad_norm": 3.0473905008627775, + "language_loss": 0.7563526, + "learning_rate": 2.989413228164047e-06, + "loss": 0.77740943, + "num_input_tokens_seen": 127020220, + "step": 5908, + "time_per_iteration": 2.8653454780578613 + }, + { + "auxiliary_loss_clip": 0.01114185, + "auxiliary_loss_mlp": 0.01042445, + "balance_loss_clip": 1.05034626, + "balance_loss_mlp": 1.02736473, + "epoch": 0.3552683000150308, + "flos": 26432264747520.0, + "grad_norm": 2.926995842336842, + "language_loss": 0.68243527, + "learning_rate": 2.989074743819502e-06, + "loss": 0.70400161, + "num_input_tokens_seen": 127038585, + "step": 5909, + "time_per_iteration": 2.6967928409576416 + }, + { + "auxiliary_loss_clip": 0.01120713, + "auxiliary_loss_mlp": 0.01037454, + "balance_loss_clip": 1.0503571, + "balance_loss_mlp": 1.02271986, + "epoch": 0.35532842326769876, + "flos": 19785989468160.0, + "grad_norm": 2.2169711344959864, + "language_loss": 0.78605235, + "learning_rate": 2.988736221969144e-06, + "loss": 0.807634, + "num_input_tokens_seen": 127056215, + "step": 5910, + "time_per_iteration": 2.65592885017395 + }, + { + "auxiliary_loss_clip": 0.01111825, + "auxiliary_loss_mlp": 0.01044022, + "balance_loss_clip": 1.04383612, + "balance_loss_mlp": 1.02745175, + "epoch": 0.3553885465203668, + "flos": 17239277237760.0, + "grad_norm": 4.097628076705993, + "language_loss": 0.71322721, + "learning_rate": 2.98839766262581e-06, + "loss": 0.73478568, + "num_input_tokens_seen": 127075825, + "step": 5911, + "time_per_iteration": 2.6958134174346924 + }, + { + "auxiliary_loss_clip": 0.01122761, + "auxiliary_loss_mlp": 0.01041881, + "balance_loss_clip": 1.04820287, + "balance_loss_mlp": 1.02711153, + "epoch": 0.35544866977303474, + "flos": 14934350430720.0, + "grad_norm": 2.592685980990988, + "language_loss": 0.86703777, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.88868415, + "num_input_tokens_seen": 127091205, + "step": 5912, + "time_per_iteration": 2.615788221359253 + }, + { + "auxiliary_loss_clip": 0.01113661, + "auxiliary_loss_mlp": 0.01038659, + "balance_loss_clip": 1.04849911, + "balance_loss_mlp": 1.02413917, + "epoch": 0.3555087930257027, + "flos": 19756040503680.0, + "grad_norm": 1.9602305341473392, + "language_loss": 0.76948488, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.79100811, + "num_input_tokens_seen": 127109210, + "step": 5913, + "time_per_iteration": 2.7827799320220947 + }, + { + "auxiliary_loss_clip": 0.01098195, + "auxiliary_loss_mlp": 0.01036489, + "balance_loss_clip": 1.04796672, + "balance_loss_mlp": 1.02183783, + "epoch": 0.3555689162783707, + "flos": 21068252156160.0, + "grad_norm": 1.6272917241322848, + "language_loss": 0.82545209, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.8467989, + "num_input_tokens_seen": 127128400, + "step": 5914, + "time_per_iteration": 2.7242603302001953 + }, + { + "auxiliary_loss_clip": 0.01137835, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.05178475, + "balance_loss_mlp": 1.02247739, + "epoch": 0.35562903953103864, + "flos": 33069633454080.0, + "grad_norm": 2.9034799926536, + "language_loss": 0.70664769, + "learning_rate": 2.98704305057949e-06, + "loss": 0.72840279, + "num_input_tokens_seen": 127149965, + "step": 5915, + "time_per_iteration": 2.6785290241241455 + }, + { + "auxiliary_loss_clip": 0.01124956, + "auxiliary_loss_mlp": 0.01042738, + "balance_loss_clip": 1.04884696, + "balance_loss_mlp": 1.02823067, + "epoch": 0.3556891627837066, + "flos": 20557853850240.0, + "grad_norm": 1.7433450554379117, + "language_loss": 0.76387751, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.78555447, + "num_input_tokens_seen": 127169865, + "step": 5916, + "time_per_iteration": 2.646141529083252 + }, + { + "auxiliary_loss_clip": 0.01103991, + "auxiliary_loss_mlp": 0.01039438, + "balance_loss_clip": 1.04549897, + "balance_loss_mlp": 1.02451277, + "epoch": 0.35574928603637457, + "flos": 20703327932160.0, + "grad_norm": 1.7213233773991115, + "language_loss": 0.88551259, + "learning_rate": 2.986365519932332e-06, + "loss": 0.9069469, + "num_input_tokens_seen": 127188075, + "step": 5917, + "time_per_iteration": 2.735424757003784 + }, + { + "auxiliary_loss_clip": 0.01057648, + "auxiliary_loss_mlp": 0.01050179, + "balance_loss_clip": 1.03888357, + "balance_loss_mlp": 1.03190458, + "epoch": 0.35580940928904253, + "flos": 15194595444480.0, + "grad_norm": 2.1986231946039916, + "language_loss": 0.74800515, + "learning_rate": 2.98602669849771e-06, + "loss": 0.76908338, + "num_input_tokens_seen": 127206065, + "step": 5918, + "time_per_iteration": 2.759612798690796 + }, + { + "auxiliary_loss_clip": 0.01046226, + "auxiliary_loss_mlp": 0.01004318, + "balance_loss_clip": 1.03416467, + "balance_loss_mlp": 1.00212467, + "epoch": 0.3558695325417105, + "flos": 58639145431680.0, + "grad_norm": 0.9523078238877629, + "language_loss": 0.63871694, + "learning_rate": 2.985687839672857e-06, + "loss": 0.65922242, + "num_input_tokens_seen": 127257885, + "step": 5919, + "time_per_iteration": 2.974400281906128 + }, + { + "auxiliary_loss_clip": 0.01125949, + "auxiliary_loss_mlp": 0.01037737, + "balance_loss_clip": 1.05126309, + "balance_loss_mlp": 1.02168, + "epoch": 0.35592965579437846, + "flos": 22018233104640.0, + "grad_norm": 2.3466450300124952, + "language_loss": 0.73515332, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.75679016, + "num_input_tokens_seen": 127275550, + "step": 5920, + "time_per_iteration": 2.6402368545532227 + }, + { + "auxiliary_loss_clip": 0.01092607, + "auxiliary_loss_mlp": 0.01035798, + "balance_loss_clip": 1.0452888, + "balance_loss_mlp": 1.02082539, + "epoch": 0.35598977904704643, + "flos": 23367684182400.0, + "grad_norm": 2.020155019062759, + "language_loss": 0.76745147, + "learning_rate": 2.985010009903857e-06, + "loss": 0.78873557, + "num_input_tokens_seen": 127295110, + "step": 5921, + "time_per_iteration": 2.7224855422973633 + }, + { + "auxiliary_loss_clip": 0.01112186, + "auxiliary_loss_mlp": 0.01038012, + "balance_loss_clip": 1.04887438, + "balance_loss_mlp": 1.0231111, + "epoch": 0.3560499022997144, + "flos": 17785334770560.0, + "grad_norm": 2.0978128065546717, + "language_loss": 0.68095905, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.702461, + "num_input_tokens_seen": 127312865, + "step": 5922, + "time_per_iteration": 2.6849706172943115 + }, + { + "auxiliary_loss_clip": 0.01120912, + "auxiliary_loss_mlp": 0.01035687, + "balance_loss_clip": 1.04752564, + "balance_loss_mlp": 1.02032125, + "epoch": 0.35611002555238236, + "flos": 20740459616640.0, + "grad_norm": 3.470851899346702, + "language_loss": 0.79121947, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.81278539, + "num_input_tokens_seen": 127331710, + "step": 5923, + "time_per_iteration": 2.659977436065674 + }, + { + "auxiliary_loss_clip": 0.01118161, + "auxiliary_loss_mlp": 0.01042419, + "balance_loss_clip": 1.0530231, + "balance_loss_mlp": 1.02770221, + "epoch": 0.3561701488050504, + "flos": 19462219251840.0, + "grad_norm": 2.2084385051152946, + "language_loss": 0.85266459, + "learning_rate": 2.983992985144908e-06, + "loss": 0.87427044, + "num_input_tokens_seen": 127350950, + "step": 5924, + "time_per_iteration": 2.680994987487793 + }, + { + "auxiliary_loss_clip": 0.01109604, + "auxiliary_loss_mlp": 0.01046078, + "balance_loss_clip": 1.04669881, + "balance_loss_mlp": 1.02974653, + "epoch": 0.35623027205771834, + "flos": 30774942023040.0, + "grad_norm": 3.12021389910605, + "language_loss": 0.77619767, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.79775453, + "num_input_tokens_seen": 127369385, + "step": 5925, + "time_per_iteration": 2.854043960571289 + }, + { + "auxiliary_loss_clip": 0.01078608, + "auxiliary_loss_mlp": 0.01047631, + "balance_loss_clip": 1.04546142, + "balance_loss_mlp": 1.03274155, + "epoch": 0.3562903953103863, + "flos": 16981079299200.0, + "grad_norm": 2.0406100546628108, + "language_loss": 0.75402963, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.77529198, + "num_input_tokens_seen": 127386965, + "step": 5926, + "time_per_iteration": 4.347430467605591 + }, + { + "auxiliary_loss_clip": 0.01110536, + "auxiliary_loss_mlp": 0.00773423, + "balance_loss_clip": 1.04907203, + "balance_loss_mlp": 1.00041842, + "epoch": 0.3563505185630543, + "flos": 23839837482240.0, + "grad_norm": 2.7011184644215254, + "language_loss": 0.69563019, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.71446979, + "num_input_tokens_seen": 127406075, + "step": 5927, + "time_per_iteration": 2.8237216472625732 + }, + { + "auxiliary_loss_clip": 0.01136293, + "auxiliary_loss_mlp": 0.01040585, + "balance_loss_clip": 1.05083871, + "balance_loss_mlp": 1.0256958, + "epoch": 0.35641064181572224, + "flos": 22273450214400.0, + "grad_norm": 2.594343371199836, + "language_loss": 0.79681075, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.81857955, + "num_input_tokens_seen": 127425350, + "step": 5928, + "time_per_iteration": 4.171353340148926 + }, + { + "auxiliary_loss_clip": 0.01139765, + "auxiliary_loss_mlp": 0.01040338, + "balance_loss_clip": 1.05304861, + "balance_loss_mlp": 1.02473354, + "epoch": 0.3564707650683902, + "flos": 23001251587200.0, + "grad_norm": 1.4355701611092584, + "language_loss": 0.81758744, + "learning_rate": 2.982297197789215e-06, + "loss": 0.83938849, + "num_input_tokens_seen": 127446335, + "step": 5929, + "time_per_iteration": 4.3162572383880615 + }, + { + "auxiliary_loss_clip": 0.01120871, + "auxiliary_loss_mlp": 0.01037566, + "balance_loss_clip": 1.04776335, + "balance_loss_mlp": 1.02304602, + "epoch": 0.35653088832105817, + "flos": 14684268965760.0, + "grad_norm": 1.9323399136404307, + "language_loss": 0.70277226, + "learning_rate": 2.981957928520201e-06, + "loss": 0.72435665, + "num_input_tokens_seen": 127462795, + "step": 5930, + "time_per_iteration": 2.6527109146118164 + }, + { + "auxiliary_loss_clip": 0.01131875, + "auxiliary_loss_mlp": 0.01045641, + "balance_loss_clip": 1.05533779, + "balance_loss_mlp": 1.02960742, + "epoch": 0.35659101157372614, + "flos": 23477068074240.0, + "grad_norm": 2.2535070260025147, + "language_loss": 0.6758765, + "learning_rate": 2.981618622015244e-06, + "loss": 0.69765162, + "num_input_tokens_seen": 127482675, + "step": 5931, + "time_per_iteration": 4.3453147411346436 + }, + { + "auxiliary_loss_clip": 0.0112554, + "auxiliary_loss_mlp": 0.01040124, + "balance_loss_clip": 1.04992425, + "balance_loss_mlp": 1.02531803, + "epoch": 0.3566511348263941, + "flos": 26578672583040.0, + "grad_norm": 1.9436277425022137, + "language_loss": 0.67792088, + "learning_rate": 2.981279278287211e-06, + "loss": 0.69957745, + "num_input_tokens_seen": 127502275, + "step": 5932, + "time_per_iteration": 2.700096368789673 + }, + { + "auxiliary_loss_clip": 0.01082532, + "auxiliary_loss_mlp": 0.01033095, + "balance_loss_clip": 1.04578543, + "balance_loss_mlp": 1.01849222, + "epoch": 0.35671125807906207, + "flos": 13115008609920.0, + "grad_norm": 5.160615382495107, + "language_loss": 0.78454852, + "learning_rate": 2.980939897348969e-06, + "loss": 0.80570471, + "num_input_tokens_seen": 127520195, + "step": 5933, + "time_per_iteration": 2.6900391578674316 + }, + { + "auxiliary_loss_clip": 0.01121777, + "auxiliary_loss_mlp": 0.01052933, + "balance_loss_clip": 1.0480361, + "balance_loss_mlp": 1.03600574, + "epoch": 0.35677138133173003, + "flos": 33000577557120.0, + "grad_norm": 1.6861574442761758, + "language_loss": 0.69256425, + "learning_rate": 2.980600479213388e-06, + "loss": 0.7143113, + "num_input_tokens_seen": 127544495, + "step": 5934, + "time_per_iteration": 2.7415738105773926 + }, + { + "auxiliary_loss_clip": 0.01117054, + "auxiliary_loss_mlp": 0.0077763, + "balance_loss_clip": 1.05076528, + "balance_loss_mlp": 1.00057197, + "epoch": 0.356831504584398, + "flos": 20777842696320.0, + "grad_norm": 1.9577931058258786, + "language_loss": 0.70848507, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.72743189, + "num_input_tokens_seen": 127563810, + "step": 5935, + "time_per_iteration": 2.689974069595337 + }, + { + "auxiliary_loss_clip": 0.01105553, + "auxiliary_loss_mlp": 0.01040367, + "balance_loss_clip": 1.04790044, + "balance_loss_mlp": 1.02414298, + "epoch": 0.35689162783706596, + "flos": 12165566365440.0, + "grad_norm": 2.8406009493899567, + "language_loss": 0.7755211, + "learning_rate": 2.979921531401692e-06, + "loss": 0.79698032, + "num_input_tokens_seen": 127579065, + "step": 5936, + "time_per_iteration": 2.741913318634033 + }, + { + "auxiliary_loss_clip": 0.0112859, + "auxiliary_loss_mlp": 0.00773213, + "balance_loss_clip": 1.05281317, + "balance_loss_mlp": 1.00073922, + "epoch": 0.356951751089734, + "flos": 23841489507840.0, + "grad_norm": 1.4219917851433757, + "language_loss": 0.64282179, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.66183978, + "num_input_tokens_seen": 127599105, + "step": 5937, + "time_per_iteration": 2.698432207107544 + }, + { + "auxiliary_loss_clip": 0.011437, + "auxiliary_loss_mlp": 0.00773044, + "balance_loss_clip": 1.05475211, + "balance_loss_mlp": 1.00064254, + "epoch": 0.35701187434240195, + "flos": 11722176881280.0, + "grad_norm": 3.0634993604384744, + "language_loss": 0.78483748, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.80400497, + "num_input_tokens_seen": 127614940, + "step": 5938, + "time_per_iteration": 2.617074489593506 + }, + { + "auxiliary_loss_clip": 0.01104152, + "auxiliary_loss_mlp": 0.01042471, + "balance_loss_clip": 1.05522823, + "balance_loss_mlp": 1.0276773, + "epoch": 0.3570719975950699, + "flos": 24898879100160.0, + "grad_norm": 1.4921508018011957, + "language_loss": 0.8058449, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.82731104, + "num_input_tokens_seen": 127634960, + "step": 5939, + "time_per_iteration": 2.805285930633545 + }, + { + "auxiliary_loss_clip": 0.01119857, + "auxiliary_loss_mlp": 0.01039048, + "balance_loss_clip": 1.05386829, + "balance_loss_mlp": 1.02343178, + "epoch": 0.3571321208477379, + "flos": 25994836920960.0, + "grad_norm": 2.412769849050775, + "language_loss": 0.79263425, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.81422341, + "num_input_tokens_seen": 127654545, + "step": 5940, + "time_per_iteration": 2.729759693145752 + }, + { + "auxiliary_loss_clip": 0.01122797, + "auxiliary_loss_mlp": 0.0103573, + "balance_loss_clip": 1.05434561, + "balance_loss_mlp": 1.01836729, + "epoch": 0.35719224410040584, + "flos": 14501663199360.0, + "grad_norm": 2.99992676537861, + "language_loss": 0.72561693, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.74720228, + "num_input_tokens_seen": 127672320, + "step": 5941, + "time_per_iteration": 2.7407357692718506 + }, + { + "auxiliary_loss_clip": 0.01131761, + "auxiliary_loss_mlp": 0.01043456, + "balance_loss_clip": 1.0537883, + "balance_loss_mlp": 1.02636182, + "epoch": 0.3572523673530738, + "flos": 31175453646720.0, + "grad_norm": 4.524453853263744, + "language_loss": 0.64234614, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.66409832, + "num_input_tokens_seen": 127693315, + "step": 5942, + "time_per_iteration": 2.693835735321045 + }, + { + "auxiliary_loss_clip": 0.01125006, + "auxiliary_loss_mlp": 0.0104058, + "balance_loss_clip": 1.05074191, + "balance_loss_mlp": 1.02442718, + "epoch": 0.3573124906057418, + "flos": 15851976203520.0, + "grad_norm": 1.88999720959261, + "language_loss": 0.7433207, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.76497656, + "num_input_tokens_seen": 127711570, + "step": 5943, + "time_per_iteration": 2.6655383110046387 + }, + { + "auxiliary_loss_clip": 0.0107084, + "auxiliary_loss_mlp": 0.01002098, + "balance_loss_clip": 1.04128122, + "balance_loss_mlp": 1.000512, + "epoch": 0.35737261385840974, + "flos": 60822729118080.0, + "grad_norm": 0.7930578325967097, + "language_loss": 0.60739905, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.62812841, + "num_input_tokens_seen": 127772475, + "step": 5944, + "time_per_iteration": 3.257052421569824 + }, + { + "auxiliary_loss_clip": 0.01113544, + "auxiliary_loss_mlp": 0.01038819, + "balance_loss_clip": 1.05017304, + "balance_loss_mlp": 1.02329779, + "epoch": 0.3574327371110777, + "flos": 18843765857280.0, + "grad_norm": 2.0176419730945554, + "language_loss": 0.72310007, + "learning_rate": 2.976864428379655e-06, + "loss": 0.74462366, + "num_input_tokens_seen": 127790940, + "step": 5945, + "time_per_iteration": 2.6320457458496094 + }, + { + "auxiliary_loss_clip": 0.01113199, + "auxiliary_loss_mlp": 0.00773448, + "balance_loss_clip": 1.04710388, + "balance_loss_mlp": 1.00053716, + "epoch": 0.35749286036374567, + "flos": 23549679417600.0, + "grad_norm": 2.1873404124300655, + "language_loss": 0.81147355, + "learning_rate": 2.976524564880326e-06, + "loss": 0.83034003, + "num_input_tokens_seen": 127808275, + "step": 5946, + "time_per_iteration": 2.7045581340789795 + }, + { + "auxiliary_loss_clip": 0.01142015, + "auxiliary_loss_mlp": 0.01041839, + "balance_loss_clip": 1.05382085, + "balance_loss_mlp": 1.02568626, + "epoch": 0.35755298361641363, + "flos": 21105491581440.0, + "grad_norm": 1.5286248167474699, + "language_loss": 0.68842459, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.71026313, + "num_input_tokens_seen": 127828840, + "step": 5947, + "time_per_iteration": 2.6360325813293457 + }, + { + "auxiliary_loss_clip": 0.01107164, + "auxiliary_loss_mlp": 0.01039633, + "balance_loss_clip": 1.04598188, + "balance_loss_mlp": 1.02426696, + "epoch": 0.3576131068690816, + "flos": 19245031666560.0, + "grad_norm": 4.061535671212192, + "language_loss": 0.76024956, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.78171754, + "num_input_tokens_seen": 127846240, + "step": 5948, + "time_per_iteration": 2.6968884468078613 + }, + { + "auxiliary_loss_clip": 0.01081903, + "auxiliary_loss_mlp": 0.01043894, + "balance_loss_clip": 1.04692364, + "balance_loss_mlp": 1.0291121, + "epoch": 0.35767323012174956, + "flos": 28654703971200.0, + "grad_norm": 1.8353415788349725, + "language_loss": 0.70553362, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.72679162, + "num_input_tokens_seen": 127866880, + "step": 5949, + "time_per_iteration": 2.8849079608917236 + }, + { + "auxiliary_loss_clip": 0.01113321, + "auxiliary_loss_mlp": 0.01041031, + "balance_loss_clip": 1.04892492, + "balance_loss_mlp": 1.02688098, + "epoch": 0.35773335337441753, + "flos": 17085363459840.0, + "grad_norm": 2.820547719587591, + "language_loss": 0.77489066, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.79643422, + "num_input_tokens_seen": 127883560, + "step": 5950, + "time_per_iteration": 2.6595206260681152 + }, + { + "auxiliary_loss_clip": 0.0112732, + "auxiliary_loss_mlp": 0.01041981, + "balance_loss_clip": 1.04834211, + "balance_loss_mlp": 1.02592397, + "epoch": 0.35779347662708555, + "flos": 15888605097600.0, + "grad_norm": 1.7233867228761917, + "language_loss": 0.72746027, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.74915326, + "num_input_tokens_seen": 127902330, + "step": 5951, + "time_per_iteration": 2.6544554233551025 + }, + { + "auxiliary_loss_clip": 0.01129333, + "auxiliary_loss_mlp": 0.01041471, + "balance_loss_clip": 1.05047357, + "balance_loss_mlp": 1.0256753, + "epoch": 0.3578535998797535, + "flos": 28658834035200.0, + "grad_norm": 2.2344429074284693, + "language_loss": 0.69326741, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.71497542, + "num_input_tokens_seen": 127922325, + "step": 5952, + "time_per_iteration": 2.7666146755218506 + }, + { + "auxiliary_loss_clip": 0.01080716, + "auxiliary_loss_mlp": 0.01049645, + "balance_loss_clip": 1.04122877, + "balance_loss_mlp": 1.03411233, + "epoch": 0.3579137231324215, + "flos": 37852432076160.0, + "grad_norm": 4.791743787800428, + "language_loss": 0.69651616, + "learning_rate": 2.974144484269449e-06, + "loss": 0.71781975, + "num_input_tokens_seen": 127942635, + "step": 5953, + "time_per_iteration": 2.900196075439453 + }, + { + "auxiliary_loss_clip": 0.01113192, + "auxiliary_loss_mlp": 0.01034652, + "balance_loss_clip": 1.0476222, + "balance_loss_mlp": 1.0198822, + "epoch": 0.35797384638508944, + "flos": 22346851656960.0, + "grad_norm": 2.3015234956442394, + "language_loss": 0.6670965, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.68857497, + "num_input_tokens_seen": 127962520, + "step": 5954, + "time_per_iteration": 2.7609100341796875 + }, + { + "auxiliary_loss_clip": 0.011102, + "auxiliary_loss_mlp": 0.01040434, + "balance_loss_clip": 1.04845512, + "balance_loss_mlp": 1.02633798, + "epoch": 0.3580339696377574, + "flos": 13589711775360.0, + "grad_norm": 1.9332002852280215, + "language_loss": 0.74798024, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.76948655, + "num_input_tokens_seen": 127981180, + "step": 5955, + "time_per_iteration": 2.727787733078003 + }, + { + "auxiliary_loss_clip": 0.01114534, + "auxiliary_loss_mlp": 0.01039755, + "balance_loss_clip": 1.04827058, + "balance_loss_mlp": 1.02546179, + "epoch": 0.3580940928904254, + "flos": 23768231719680.0, + "grad_norm": 1.745052650810224, + "language_loss": 0.75871193, + "learning_rate": 2.973123895369182e-06, + "loss": 0.78025484, + "num_input_tokens_seen": 127999725, + "step": 5956, + "time_per_iteration": 2.685006856918335 + }, + { + "auxiliary_loss_clip": 0.01133387, + "auxiliary_loss_mlp": 0.01035002, + "balance_loss_clip": 1.05088747, + "balance_loss_mlp": 1.0211376, + "epoch": 0.35815421614309334, + "flos": 19463871277440.0, + "grad_norm": 4.15447674959345, + "language_loss": 0.73543882, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.75712276, + "num_input_tokens_seen": 128018885, + "step": 5957, + "time_per_iteration": 2.6640098094940186 + }, + { + "auxiliary_loss_clip": 0.01113163, + "auxiliary_loss_mlp": 0.01037962, + "balance_loss_clip": 1.04958355, + "balance_loss_mlp": 1.02395511, + "epoch": 0.3582143393957613, + "flos": 23368186972800.0, + "grad_norm": 3.3283201757671037, + "language_loss": 0.70960939, + "learning_rate": 2.972443318242726e-06, + "loss": 0.73112065, + "num_input_tokens_seen": 128037875, + "step": 5958, + "time_per_iteration": 2.6962838172912598 + }, + { + "auxiliary_loss_clip": 0.01093969, + "auxiliary_loss_mlp": 0.01038485, + "balance_loss_clip": 1.04454029, + "balance_loss_mlp": 1.02435875, + "epoch": 0.35827446264842927, + "flos": 26323275905280.0, + "grad_norm": 2.5438119471533494, + "language_loss": 0.88630176, + "learning_rate": 2.972102974360324e-06, + "loss": 0.90762633, + "num_input_tokens_seen": 128056045, + "step": 5959, + "time_per_iteration": 2.713508129119873 + }, + { + "auxiliary_loss_clip": 0.0113447, + "auxiliary_loss_mlp": 0.010399, + "balance_loss_clip": 1.05009389, + "balance_loss_mlp": 1.02511787, + "epoch": 0.35833458590109724, + "flos": 30446610779520.0, + "grad_norm": 2.2010810744211486, + "language_loss": 0.58033586, + "learning_rate": 2.971762593615679e-06, + "loss": 0.60207957, + "num_input_tokens_seen": 128077815, + "step": 5960, + "time_per_iteration": 2.685009479522705 + }, + { + "auxiliary_loss_clip": 0.0113445, + "auxiliary_loss_mlp": 0.01041748, + "balance_loss_clip": 1.04900908, + "balance_loss_mlp": 1.0255897, + "epoch": 0.3583947091537652, + "flos": 14829886702080.0, + "grad_norm": 2.9088839798225035, + "language_loss": 0.75860739, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.7803694, + "num_input_tokens_seen": 128095460, + "step": 5961, + "time_per_iteration": 2.591665506362915 + }, + { + "auxiliary_loss_clip": 0.01103629, + "auxiliary_loss_mlp": 0.01037452, + "balance_loss_clip": 1.04985154, + "balance_loss_mlp": 1.022223, + "epoch": 0.35845483240643317, + "flos": 34240644743040.0, + "grad_norm": 1.7962139278871543, + "language_loss": 0.70392656, + "learning_rate": 2.971081721591294e-06, + "loss": 0.72533739, + "num_input_tokens_seen": 128118605, + "step": 5962, + "time_per_iteration": 2.78696346282959 + }, + { + "auxiliary_loss_clip": 0.01116632, + "auxiliary_loss_mlp": 0.01038106, + "balance_loss_clip": 1.0513072, + "balance_loss_mlp": 1.02532077, + "epoch": 0.35851495565910113, + "flos": 20960089326720.0, + "grad_norm": 3.937600501619356, + "language_loss": 0.75052911, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.77207649, + "num_input_tokens_seen": 128139205, + "step": 5963, + "time_per_iteration": 2.779210090637207 + }, + { + "auxiliary_loss_clip": 0.01136067, + "auxiliary_loss_mlp": 0.01044967, + "balance_loss_clip": 1.05189323, + "balance_loss_mlp": 1.03017306, + "epoch": 0.35857507891176915, + "flos": 22309863626880.0, + "grad_norm": 3.7087256254692305, + "language_loss": 0.78717148, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.80898178, + "num_input_tokens_seen": 128158765, + "step": 5964, + "time_per_iteration": 2.598621368408203 + }, + { + "auxiliary_loss_clip": 0.01112011, + "auxiliary_loss_mlp": 0.01041333, + "balance_loss_clip": 1.05019569, + "balance_loss_mlp": 1.02534723, + "epoch": 0.3586352021644371, + "flos": 23367863750400.0, + "grad_norm": 2.0226045347569857, + "language_loss": 0.66572571, + "learning_rate": 2.970060137410626e-06, + "loss": 0.6872592, + "num_input_tokens_seen": 128177850, + "step": 5965, + "time_per_iteration": 2.684847116470337 + }, + { + "auxiliary_loss_clip": 0.01132652, + "auxiliary_loss_mlp": 0.0077213, + "balance_loss_clip": 1.04819942, + "balance_loss_mlp": 1.00052619, + "epoch": 0.3586953254171051, + "flos": 27849227437440.0, + "grad_norm": 2.180178648475794, + "language_loss": 0.79150963, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.81055743, + "num_input_tokens_seen": 128196925, + "step": 5966, + "time_per_iteration": 4.321925163269043 + }, + { + "auxiliary_loss_clip": 0.01076497, + "auxiliary_loss_mlp": 0.01042048, + "balance_loss_clip": 1.04272628, + "balance_loss_mlp": 1.02573991, + "epoch": 0.35875544866977305, + "flos": 19500500171520.0, + "grad_norm": 2.3639555115609663, + "language_loss": 0.91201752, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.93320298, + "num_input_tokens_seen": 128213955, + "step": 5967, + "time_per_iteration": 2.7455573081970215 + }, + { + "auxiliary_loss_clip": 0.01101026, + "auxiliary_loss_mlp": 0.01053293, + "balance_loss_clip": 1.04794097, + "balance_loss_mlp": 1.03494644, + "epoch": 0.358815571922441, + "flos": 21471134077440.0, + "grad_norm": 5.4514250686274695, + "language_loss": 0.80356693, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.82511014, + "num_input_tokens_seen": 128232980, + "step": 5968, + "time_per_iteration": 4.176758766174316 + }, + { + "auxiliary_loss_clip": 0.01109306, + "auxiliary_loss_mlp": 0.01052187, + "balance_loss_clip": 1.04507756, + "balance_loss_mlp": 1.03602266, + "epoch": 0.358875695175109, + "flos": 21835411856640.0, + "grad_norm": 2.18425096992674, + "language_loss": 0.8341769, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.85579193, + "num_input_tokens_seen": 128252795, + "step": 5969, + "time_per_iteration": 4.278231382369995 + }, + { + "auxiliary_loss_clip": 0.01089525, + "auxiliary_loss_mlp": 0.01034474, + "balance_loss_clip": 1.04389262, + "balance_loss_mlp": 1.0201571, + "epoch": 0.35893581842777694, + "flos": 32011633330560.0, + "grad_norm": 2.040075228447558, + "language_loss": 0.72608048, + "learning_rate": 2.968356761586202e-06, + "loss": 0.74732047, + "num_input_tokens_seen": 128273115, + "step": 5970, + "time_per_iteration": 2.7784154415130615 + }, + { + "auxiliary_loss_clip": 0.01110616, + "auxiliary_loss_mlp": 0.01033542, + "balance_loss_clip": 1.04673791, + "balance_loss_mlp": 1.01868832, + "epoch": 0.3589959416804449, + "flos": 20485817124480.0, + "grad_norm": 1.7975318028216438, + "language_loss": 0.79562962, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.8170712, + "num_input_tokens_seen": 128292220, + "step": 5971, + "time_per_iteration": 4.519066333770752 + }, + { + "auxiliary_loss_clip": 0.01098267, + "auxiliary_loss_mlp": 0.01043063, + "balance_loss_clip": 1.04956031, + "balance_loss_mlp": 1.02766144, + "epoch": 0.3590560649331129, + "flos": 16180666583040.0, + "grad_norm": 1.754965992567408, + "language_loss": 0.78217793, + "learning_rate": 2.967675154124696e-06, + "loss": 0.80359125, + "num_input_tokens_seen": 128310305, + "step": 5972, + "time_per_iteration": 2.7724227905273438 + }, + { + "auxiliary_loss_clip": 0.01092509, + "auxiliary_loss_mlp": 0.01035503, + "balance_loss_clip": 1.04198921, + "balance_loss_mlp": 1.02043509, + "epoch": 0.35911618818578084, + "flos": 20375391738240.0, + "grad_norm": 2.4812117519320287, + "language_loss": 0.8120966, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.83337677, + "num_input_tokens_seen": 128328305, + "step": 5973, + "time_per_iteration": 2.8266379833221436 + }, + { + "auxiliary_loss_clip": 0.01042329, + "auxiliary_loss_mlp": 0.01005341, + "balance_loss_clip": 1.03088689, + "balance_loss_mlp": 1.0036602, + "epoch": 0.3591763114384488, + "flos": 41236691685120.0, + "grad_norm": 0.9056618080123127, + "language_loss": 0.56743383, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.58791053, + "num_input_tokens_seen": 128378380, + "step": 5974, + "time_per_iteration": 3.0758044719696045 + }, + { + "auxiliary_loss_clip": 0.01126274, + "auxiliary_loss_mlp": 0.01037404, + "balance_loss_clip": 1.04946434, + "balance_loss_mlp": 1.02339661, + "epoch": 0.35923643469111677, + "flos": 18695454600960.0, + "grad_norm": 2.5569125412900022, + "language_loss": 0.68787563, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.70951241, + "num_input_tokens_seen": 128394315, + "step": 5975, + "time_per_iteration": 2.657576084136963 + }, + { + "auxiliary_loss_clip": 0.01134392, + "auxiliary_loss_mlp": 0.01038612, + "balance_loss_clip": 1.04914975, + "balance_loss_mlp": 1.02426553, + "epoch": 0.35929655794378473, + "flos": 25009950931200.0, + "grad_norm": 1.804443520579843, + "language_loss": 0.79982442, + "learning_rate": 2.96631149897303e-06, + "loss": 0.82155442, + "num_input_tokens_seen": 128414515, + "step": 5976, + "time_per_iteration": 2.6197311878204346 + }, + { + "auxiliary_loss_clip": 0.01074524, + "auxiliary_loss_mlp": 0.01040105, + "balance_loss_clip": 1.04337287, + "balance_loss_mlp": 1.02404785, + "epoch": 0.35935668119645275, + "flos": 14975576265600.0, + "grad_norm": 1.9714674470262432, + "language_loss": 0.78818405, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.8093304, + "num_input_tokens_seen": 128430615, + "step": 5977, + "time_per_iteration": 2.735844612121582 + }, + { + "auxiliary_loss_clip": 0.01094647, + "auxiliary_loss_mlp": 0.01041851, + "balance_loss_clip": 1.04511654, + "balance_loss_mlp": 1.02789736, + "epoch": 0.3594168044491207, + "flos": 21178138838400.0, + "grad_norm": 2.560014574379112, + "language_loss": 0.79859221, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.8199572, + "num_input_tokens_seen": 128449480, + "step": 5978, + "time_per_iteration": 2.704134941101074 + }, + { + "auxiliary_loss_clip": 0.01135434, + "auxiliary_loss_mlp": 0.00773692, + "balance_loss_clip": 1.04890609, + "balance_loss_mlp": 1.00073409, + "epoch": 0.3594769277017887, + "flos": 27672152365440.0, + "grad_norm": 4.868201977342703, + "language_loss": 0.68310702, + "learning_rate": 2.965288372816436e-06, + "loss": 0.70219827, + "num_input_tokens_seen": 128471465, + "step": 5979, + "time_per_iteration": 2.667222499847412 + }, + { + "auxiliary_loss_clip": 0.01105596, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_clip": 1.04548645, + "balance_loss_mlp": 1.01876652, + "epoch": 0.35953705095445665, + "flos": 23002328995200.0, + "grad_norm": 6.298210491387724, + "language_loss": 0.67445302, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.69584739, + "num_input_tokens_seen": 128490645, + "step": 5980, + "time_per_iteration": 2.6262974739074707 + }, + { + "auxiliary_loss_clip": 0.01113802, + "auxiliary_loss_mlp": 0.01040029, + "balance_loss_clip": 1.04725266, + "balance_loss_mlp": 1.02324414, + "epoch": 0.3595971742071246, + "flos": 25513992529920.0, + "grad_norm": 1.8251567017824133, + "language_loss": 0.71328801, + "learning_rate": 2.964606105671327e-06, + "loss": 0.73482633, + "num_input_tokens_seen": 128510225, + "step": 5981, + "time_per_iteration": 2.696676254272461 + }, + { + "auxiliary_loss_clip": 0.01109039, + "auxiliary_loss_mlp": 0.01041685, + "balance_loss_clip": 1.04872131, + "balance_loss_mlp": 1.02498353, + "epoch": 0.3596572974597926, + "flos": 29862559635840.0, + "grad_norm": 2.0089481436352767, + "language_loss": 0.71294796, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.73445523, + "num_input_tokens_seen": 128530195, + "step": 5982, + "time_per_iteration": 2.7264244556427 + }, + { + "auxiliary_loss_clip": 0.01114107, + "auxiliary_loss_mlp": 0.01046667, + "balance_loss_clip": 1.04542398, + "balance_loss_mlp": 1.03115773, + "epoch": 0.35971742071246054, + "flos": 23112538899840.0, + "grad_norm": 1.8520970942870048, + "language_loss": 0.75614822, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.77775598, + "num_input_tokens_seen": 128549990, + "step": 5983, + "time_per_iteration": 2.6827449798583984 + }, + { + "auxiliary_loss_clip": 0.01140239, + "auxiliary_loss_mlp": 0.01042697, + "balance_loss_clip": 1.0510025, + "balance_loss_mlp": 1.02626991, + "epoch": 0.3597775439651285, + "flos": 16725359399040.0, + "grad_norm": 17.088734777986428, + "language_loss": 0.76256114, + "learning_rate": 2.96358243065131e-06, + "loss": 0.78439057, + "num_input_tokens_seen": 128567925, + "step": 5984, + "time_per_iteration": 2.695389747619629 + }, + { + "auxiliary_loss_clip": 0.01117847, + "auxiliary_loss_mlp": 0.00772256, + "balance_loss_clip": 1.04583967, + "balance_loss_mlp": 1.00047541, + "epoch": 0.3598376672177965, + "flos": 19719483436800.0, + "grad_norm": 1.8513392555770956, + "language_loss": 0.86111921, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.88002026, + "num_input_tokens_seen": 128585655, + "step": 5985, + "time_per_iteration": 2.6440985202789307 + }, + { + "auxiliary_loss_clip": 0.01117958, + "auxiliary_loss_mlp": 0.01045892, + "balance_loss_clip": 1.04564977, + "balance_loss_mlp": 1.03012037, + "epoch": 0.35989779047046444, + "flos": 17311529445120.0, + "grad_norm": 2.5721307867834406, + "language_loss": 0.72770452, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.74934304, + "num_input_tokens_seen": 128604820, + "step": 5986, + "time_per_iteration": 2.6169698238372803 + }, + { + "auxiliary_loss_clip": 0.01100506, + "auxiliary_loss_mlp": 0.01039862, + "balance_loss_clip": 1.04264784, + "balance_loss_mlp": 1.02473474, + "epoch": 0.3599579137231324, + "flos": 22711237176960.0, + "grad_norm": 2.1943162754876497, + "language_loss": 0.73883474, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.76023847, + "num_input_tokens_seen": 128623070, + "step": 5987, + "time_per_iteration": 2.72385573387146 + }, + { + "auxiliary_loss_clip": 0.0114047, + "auxiliary_loss_mlp": 0.01040262, + "balance_loss_clip": 1.05135727, + "balance_loss_mlp": 1.02456188, + "epoch": 0.36001803697580037, + "flos": 20959873845120.0, + "grad_norm": 2.225645474388546, + "language_loss": 0.69665354, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.71846086, + "num_input_tokens_seen": 128642430, + "step": 5988, + "time_per_iteration": 2.6040101051330566 + }, + { + "auxiliary_loss_clip": 0.01127132, + "auxiliary_loss_mlp": 0.01043358, + "balance_loss_clip": 1.04819822, + "balance_loss_mlp": 1.0278132, + "epoch": 0.36007816022846834, + "flos": 20485565729280.0, + "grad_norm": 2.281223653114012, + "language_loss": 0.73300481, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.75470972, + "num_input_tokens_seen": 128661285, + "step": 5989, + "time_per_iteration": 2.6532981395721436 + }, + { + "auxiliary_loss_clip": 0.01089891, + "auxiliary_loss_mlp": 0.01037817, + "balance_loss_clip": 1.04161119, + "balance_loss_mlp": 1.02237916, + "epoch": 0.36013828348113636, + "flos": 28001237794560.0, + "grad_norm": 3.1935134184936156, + "language_loss": 0.79950285, + "learning_rate": 2.961534094403931e-06, + "loss": 0.82077992, + "num_input_tokens_seen": 128682210, + "step": 5990, + "time_per_iteration": 2.785142421722412 + }, + { + "auxiliary_loss_clip": 0.01123339, + "auxiliary_loss_mlp": 0.0103344, + "balance_loss_clip": 1.04714704, + "balance_loss_mlp": 1.01775789, + "epoch": 0.3601984067338043, + "flos": 20082181017600.0, + "grad_norm": 2.506195073342272, + "language_loss": 0.83875644, + "learning_rate": 2.961192577338698e-06, + "loss": 0.86032414, + "num_input_tokens_seen": 128700445, + "step": 5991, + "time_per_iteration": 2.6310808658599854 + }, + { + "auxiliary_loss_clip": 0.01111044, + "auxiliary_loss_mlp": 0.01045829, + "balance_loss_clip": 1.04896092, + "balance_loss_mlp": 1.03068912, + "epoch": 0.3602585299864723, + "flos": 18617599872000.0, + "grad_norm": 2.314320245159203, + "language_loss": 0.75628942, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.77785814, + "num_input_tokens_seen": 128716855, + "step": 5992, + "time_per_iteration": 2.6698272228240967 + }, + { + "auxiliary_loss_clip": 0.01134951, + "auxiliary_loss_mlp": 0.01039412, + "balance_loss_clip": 1.04993188, + "balance_loss_mlp": 1.02385557, + "epoch": 0.36031865323914025, + "flos": 19573003774080.0, + "grad_norm": 2.1820524355734072, + "language_loss": 0.76886415, + "learning_rate": 2.960509433875627e-06, + "loss": 0.79060775, + "num_input_tokens_seen": 128735835, + "step": 5993, + "time_per_iteration": 2.5999341011047363 + }, + { + "auxiliary_loss_clip": 0.01111748, + "auxiliary_loss_mlp": 0.01054388, + "balance_loss_clip": 1.04750419, + "balance_loss_mlp": 1.03762674, + "epoch": 0.3603787764918082, + "flos": 17490615678720.0, + "grad_norm": 1.8546706349055275, + "language_loss": 0.74672681, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.76838815, + "num_input_tokens_seen": 128752465, + "step": 5994, + "time_per_iteration": 2.6691155433654785 + }, + { + "auxiliary_loss_clip": 0.01095118, + "auxiliary_loss_mlp": 0.01038312, + "balance_loss_clip": 1.0480628, + "balance_loss_mlp": 1.02331567, + "epoch": 0.3604388997444762, + "flos": 15523393564800.0, + "grad_norm": 2.7696142346579666, + "language_loss": 0.68887782, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.71021217, + "num_input_tokens_seen": 128770865, + "step": 5995, + "time_per_iteration": 2.7497267723083496 + }, + { + "auxiliary_loss_clip": 0.01104395, + "auxiliary_loss_mlp": 0.01046311, + "balance_loss_clip": 1.04338932, + "balance_loss_mlp": 1.03031349, + "epoch": 0.36049902299714415, + "flos": 17310883000320.0, + "grad_norm": 2.2305093143222248, + "language_loss": 0.82564914, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.84715617, + "num_input_tokens_seen": 128789730, + "step": 5996, + "time_per_iteration": 2.7227983474731445 + }, + { + "auxiliary_loss_clip": 0.01135369, + "auxiliary_loss_mlp": 0.0103828, + "balance_loss_clip": 1.04974842, + "balance_loss_mlp": 1.02300954, + "epoch": 0.3605591462498121, + "flos": 17056025026560.0, + "grad_norm": 2.068995609090248, + "language_loss": 0.73795009, + "learning_rate": 2.959142709981763e-06, + "loss": 0.75968659, + "num_input_tokens_seen": 128806610, + "step": 5997, + "time_per_iteration": 2.572842836380005 + }, + { + "auxiliary_loss_clip": 0.01121916, + "auxiliary_loss_mlp": 0.01036628, + "balance_loss_clip": 1.0482775, + "balance_loss_mlp": 1.0226686, + "epoch": 0.3606192695024801, + "flos": 16836862193280.0, + "grad_norm": 2.7116535757300215, + "language_loss": 0.69209671, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.71368217, + "num_input_tokens_seen": 128824830, + "step": 5998, + "time_per_iteration": 2.604459047317505 + }, + { + "auxiliary_loss_clip": 0.01085406, + "auxiliary_loss_mlp": 0.01041904, + "balance_loss_clip": 1.04395008, + "balance_loss_mlp": 1.02565587, + "epoch": 0.36067939275514804, + "flos": 12129655743360.0, + "grad_norm": 2.6293691676304745, + "language_loss": 0.76580822, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.78708136, + "num_input_tokens_seen": 128838170, + "step": 5999, + "time_per_iteration": 2.6671667098999023 + }, + { + "auxiliary_loss_clip": 0.01098137, + "auxiliary_loss_mlp": 0.01040783, + "balance_loss_clip": 1.04674315, + "balance_loss_mlp": 1.02590609, + "epoch": 0.360739516007816, + "flos": 18041449720320.0, + "grad_norm": 1.8157116334206203, + "language_loss": 0.78264523, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.80403441, + "num_input_tokens_seen": 128855625, + "step": 6000, + "time_per_iteration": 2.743117332458496 + }, + { + "auxiliary_loss_clip": 0.01095162, + "auxiliary_loss_mlp": 0.01036289, + "balance_loss_clip": 1.04705954, + "balance_loss_mlp": 1.02203155, + "epoch": 0.360799639260484, + "flos": 18549800951040.0, + "grad_norm": 1.8701006971713747, + "language_loss": 0.78316295, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.80447751, + "num_input_tokens_seen": 128873540, + "step": 6001, + "time_per_iteration": 2.7342417240142822 + }, + { + "auxiliary_loss_clip": 0.01130356, + "auxiliary_loss_mlp": 0.0077146, + "balance_loss_clip": 1.04727733, + "balance_loss_mlp": 1.00072694, + "epoch": 0.36085976251315194, + "flos": 19682028529920.0, + "grad_norm": 3.3927220139250056, + "language_loss": 0.83151853, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.8505367, + "num_input_tokens_seen": 128889925, + "step": 6002, + "time_per_iteration": 2.6884238719940186 + }, + { + "auxiliary_loss_clip": 0.01101804, + "auxiliary_loss_mlp": 0.01033284, + "balance_loss_clip": 1.04249346, + "balance_loss_mlp": 1.02011156, + "epoch": 0.3609198857658199, + "flos": 24198943703040.0, + "grad_norm": 2.135208430409031, + "language_loss": 0.90677911, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.92812997, + "num_input_tokens_seen": 128906890, + "step": 6003, + "time_per_iteration": 2.666738986968994 + }, + { + "auxiliary_loss_clip": 0.01036783, + "auxiliary_loss_mlp": 0.0101378, + "balance_loss_clip": 1.03707922, + "balance_loss_mlp": 1.01194429, + "epoch": 0.3609800090184879, + "flos": 57115995160320.0, + "grad_norm": 0.8844533830179444, + "language_loss": 0.53396428, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55446988, + "num_input_tokens_seen": 128965940, + "step": 6004, + "time_per_iteration": 3.1421444416046143 + }, + { + "auxiliary_loss_clip": 0.01112391, + "auxiliary_loss_mlp": 0.00772771, + "balance_loss_clip": 1.04665363, + "balance_loss_mlp": 1.00050342, + "epoch": 0.3610401322711559, + "flos": 20811239366400.0, + "grad_norm": 2.085214899207264, + "language_loss": 0.77743608, + "learning_rate": 2.956407517225883e-06, + "loss": 0.79628766, + "num_input_tokens_seen": 128985835, + "step": 6005, + "time_per_iteration": 4.196998596191406 + }, + { + "auxiliary_loss_clip": 0.01114373, + "auxiliary_loss_mlp": 0.01043264, + "balance_loss_clip": 1.04545391, + "balance_loss_mlp": 1.02866125, + "epoch": 0.36110025552382385, + "flos": 13699167494400.0, + "grad_norm": 1.984756598411705, + "language_loss": 0.78795588, + "learning_rate": 2.956065454793429e-06, + "loss": 0.80953228, + "num_input_tokens_seen": 129003120, + "step": 6006, + "time_per_iteration": 2.642446517944336 + }, + { + "auxiliary_loss_clip": 0.01135515, + "auxiliary_loss_mlp": 0.01037404, + "balance_loss_clip": 1.04913247, + "balance_loss_mlp": 1.02116823, + "epoch": 0.3611603787764918, + "flos": 22455014486400.0, + "grad_norm": 3.6522767524231248, + "language_loss": 0.84766537, + "learning_rate": 2.955723356106876e-06, + "loss": 0.86939454, + "num_input_tokens_seen": 129021645, + "step": 6007, + "time_per_iteration": 4.38408637046814 + }, + { + "auxiliary_loss_clip": 0.01120706, + "auxiliary_loss_mlp": 0.01035853, + "balance_loss_clip": 1.05059266, + "balance_loss_mlp": 1.01940203, + "epoch": 0.3612205020291598, + "flos": 20886651970560.0, + "grad_norm": 2.20663208121776, + "language_loss": 0.72179425, + "learning_rate": 2.955381221179198e-06, + "loss": 0.7433598, + "num_input_tokens_seen": 129038375, + "step": 6008, + "time_per_iteration": 4.262283802032471 + }, + { + "auxiliary_loss_clip": 0.01118211, + "auxiliary_loss_mlp": 0.0103587, + "balance_loss_clip": 1.04345882, + "balance_loss_mlp": 1.02150559, + "epoch": 0.36128062528182775, + "flos": 15741981780480.0, + "grad_norm": 7.815944525258205, + "language_loss": 0.83056295, + "learning_rate": 2.955039050023368e-06, + "loss": 0.85210377, + "num_input_tokens_seen": 129056235, + "step": 6009, + "time_per_iteration": 2.643824577331543 + }, + { + "auxiliary_loss_clip": 0.01105662, + "auxiliary_loss_mlp": 0.01045676, + "balance_loss_clip": 1.04862237, + "balance_loss_mlp": 1.03013086, + "epoch": 0.3613407485344957, + "flos": 16764502245120.0, + "grad_norm": 2.1132167438001166, + "language_loss": 0.7616573, + "learning_rate": 2.954696842652362e-06, + "loss": 0.7831707, + "num_input_tokens_seen": 129072405, + "step": 6010, + "time_per_iteration": 4.361377000808716 + }, + { + "auxiliary_loss_clip": 0.01104786, + "auxiliary_loss_mlp": 0.01035576, + "balance_loss_clip": 1.04665053, + "balance_loss_mlp": 1.02091312, + "epoch": 0.3614008717871637, + "flos": 20371189847040.0, + "grad_norm": 1.759609272436165, + "language_loss": 0.83214396, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.85354757, + "num_input_tokens_seen": 129090225, + "step": 6011, + "time_per_iteration": 2.679145574569702 + }, + { + "auxiliary_loss_clip": 0.01141696, + "auxiliary_loss_mlp": 0.01041601, + "balance_loss_clip": 1.05070031, + "balance_loss_mlp": 1.02562666, + "epoch": 0.36146099503983165, + "flos": 22776665800320.0, + "grad_norm": 2.194420173883677, + "language_loss": 0.62446111, + "learning_rate": 2.954012319316727e-06, + "loss": 0.64629406, + "num_input_tokens_seen": 129107685, + "step": 6012, + "time_per_iteration": 2.6012516021728516 + }, + { + "auxiliary_loss_clip": 0.01106556, + "auxiliary_loss_mlp": 0.01038245, + "balance_loss_clip": 1.04518831, + "balance_loss_mlp": 1.02368951, + "epoch": 0.3615211182924996, + "flos": 22996654646400.0, + "grad_norm": 1.831524666449312, + "language_loss": 0.8381623, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.85961026, + "num_input_tokens_seen": 129125315, + "step": 6013, + "time_per_iteration": 2.7191901206970215 + }, + { + "auxiliary_loss_clip": 0.01131608, + "auxiliary_loss_mlp": 0.01040321, + "balance_loss_clip": 1.04590511, + "balance_loss_mlp": 1.02466893, + "epoch": 0.3615812415451676, + "flos": 16648079287680.0, + "grad_norm": 3.6755742539930285, + "language_loss": 0.91541535, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.93713462, + "num_input_tokens_seen": 129141600, + "step": 6014, + "time_per_iteration": 2.714121103286743 + }, + { + "auxiliary_loss_clip": 0.01131507, + "auxiliary_loss_mlp": 0.01042414, + "balance_loss_clip": 1.0463829, + "balance_loss_mlp": 1.0268693, + "epoch": 0.36164136479783554, + "flos": 21320093387520.0, + "grad_norm": 2.2181121985150094, + "language_loss": 0.73578274, + "learning_rate": 2.95298526302391e-06, + "loss": 0.75752199, + "num_input_tokens_seen": 129160665, + "step": 6015, + "time_per_iteration": 2.668600082397461 + }, + { + "auxiliary_loss_clip": 0.0105036, + "auxiliary_loss_mlp": 0.01047702, + "balance_loss_clip": 1.03610015, + "balance_loss_mlp": 1.02980912, + "epoch": 0.3617014880505035, + "flos": 24169569356160.0, + "grad_norm": 2.2662955263586158, + "language_loss": 0.64756966, + "learning_rate": 2.9526428386344e-06, + "loss": 0.66855025, + "num_input_tokens_seen": 129179220, + "step": 6016, + "time_per_iteration": 2.8753597736358643 + }, + { + "auxiliary_loss_clip": 0.01127577, + "auxiliary_loss_mlp": 0.01039172, + "balance_loss_clip": 1.05000329, + "balance_loss_mlp": 1.02170801, + "epoch": 0.3617616113031715, + "flos": 39014824101120.0, + "grad_norm": 2.0483319793753343, + "language_loss": 0.71927178, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.74093938, + "num_input_tokens_seen": 129200385, + "step": 6017, + "time_per_iteration": 2.8195903301239014 + }, + { + "auxiliary_loss_clip": 0.01123165, + "auxiliary_loss_mlp": 0.01043013, + "balance_loss_clip": 1.04506993, + "balance_loss_mlp": 1.02724147, + "epoch": 0.3618217345558395, + "flos": 12130840892160.0, + "grad_norm": 2.196881428409859, + "language_loss": 0.73543239, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.7570942, + "num_input_tokens_seen": 129217395, + "step": 6018, + "time_per_iteration": 2.6454639434814453 + }, + { + "auxiliary_loss_clip": 0.01088616, + "auxiliary_loss_mlp": 0.01036025, + "balance_loss_clip": 1.0470562, + "balance_loss_mlp": 1.02079058, + "epoch": 0.36188185780850746, + "flos": 24935005203840.0, + "grad_norm": 2.8373114264415222, + "language_loss": 0.69157374, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.71282017, + "num_input_tokens_seen": 129238940, + "step": 6019, + "time_per_iteration": 2.824361801147461 + }, + { + "auxiliary_loss_clip": 0.0111438, + "auxiliary_loss_mlp": 0.0103897, + "balance_loss_clip": 1.04542887, + "balance_loss_mlp": 1.02275765, + "epoch": 0.3619419810611754, + "flos": 20958832350720.0, + "grad_norm": 3.405770043894724, + "language_loss": 0.76428473, + "learning_rate": 2.95127277996311e-06, + "loss": 0.78581828, + "num_input_tokens_seen": 129258240, + "step": 6020, + "time_per_iteration": 2.6757993698120117 + }, + { + "auxiliary_loss_clip": 0.01124662, + "auxiliary_loss_mlp": 0.01041506, + "balance_loss_clip": 1.04899478, + "balance_loss_mlp": 1.02512705, + "epoch": 0.3620021043138434, + "flos": 22528882805760.0, + "grad_norm": 2.1413312386751606, + "language_loss": 0.73802006, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.7596817, + "num_input_tokens_seen": 129279040, + "step": 6021, + "time_per_iteration": 2.6422386169433594 + }, + { + "auxiliary_loss_clip": 0.01094575, + "auxiliary_loss_mlp": 0.01036086, + "balance_loss_clip": 1.04502845, + "balance_loss_mlp": 1.02170944, + "epoch": 0.36206222756651135, + "flos": 15596687266560.0, + "grad_norm": 8.65046906858069, + "language_loss": 0.80683851, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.82814515, + "num_input_tokens_seen": 129295415, + "step": 6022, + "time_per_iteration": 2.7069809436798096 + }, + { + "auxiliary_loss_clip": 0.0112144, + "auxiliary_loss_mlp": 0.01034482, + "balance_loss_clip": 1.04967427, + "balance_loss_mlp": 1.02036762, + "epoch": 0.3621223508191793, + "flos": 23587170238080.0, + "grad_norm": 1.6359940708258738, + "language_loss": 0.81630391, + "learning_rate": 2.950244857154417e-06, + "loss": 0.83786309, + "num_input_tokens_seen": 129312620, + "step": 6023, + "time_per_iteration": 2.676196575164795 + }, + { + "auxiliary_loss_clip": 0.01115391, + "auxiliary_loss_mlp": 0.01037931, + "balance_loss_clip": 1.04994166, + "balance_loss_mlp": 1.02266037, + "epoch": 0.3621824740718473, + "flos": 22309899540480.0, + "grad_norm": 2.238629896510925, + "language_loss": 0.79401833, + "learning_rate": 2.9499021441341e-06, + "loss": 0.81555158, + "num_input_tokens_seen": 129331825, + "step": 6024, + "time_per_iteration": 2.6479294300079346 + }, + { + "auxiliary_loss_clip": 0.01098352, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.04168642, + "balance_loss_mlp": 1.02567625, + "epoch": 0.36224259732451525, + "flos": 16763640318720.0, + "grad_norm": 2.1016508822119517, + "language_loss": 0.74409318, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.76549369, + "num_input_tokens_seen": 129350400, + "step": 6025, + "time_per_iteration": 2.720113515853882 + }, + { + "auxiliary_loss_clip": 0.01121634, + "auxiliary_loss_mlp": 0.00772492, + "balance_loss_clip": 1.04758501, + "balance_loss_mlp": 1.00045466, + "epoch": 0.3623027205771832, + "flos": 23149742411520.0, + "grad_norm": 1.7192758683210898, + "language_loss": 0.72363192, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.74257314, + "num_input_tokens_seen": 129371155, + "step": 6026, + "time_per_iteration": 2.647515296936035 + }, + { + "auxiliary_loss_clip": 0.01130763, + "auxiliary_loss_mlp": 0.01045791, + "balance_loss_clip": 1.05090141, + "balance_loss_mlp": 1.0300554, + "epoch": 0.3623628438298512, + "flos": 28549162834560.0, + "grad_norm": 3.1509295844270166, + "language_loss": 0.79584157, + "learning_rate": 2.948873789002833e-06, + "loss": 0.81760705, + "num_input_tokens_seen": 129391230, + "step": 6027, + "time_per_iteration": 2.666778802871704 + }, + { + "auxiliary_loss_clip": 0.01112806, + "auxiliary_loss_mlp": 0.01044567, + "balance_loss_clip": 1.04690945, + "balance_loss_mlp": 1.02730584, + "epoch": 0.36242296708251914, + "flos": 25484941405440.0, + "grad_norm": 2.036912075012155, + "language_loss": 0.67857373, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.70014751, + "num_input_tokens_seen": 129410065, + "step": 6028, + "time_per_iteration": 2.721635103225708 + }, + { + "auxiliary_loss_clip": 0.01093428, + "auxiliary_loss_mlp": 0.01039806, + "balance_loss_clip": 1.04534137, + "balance_loss_mlp": 1.02493429, + "epoch": 0.3624830903351871, + "flos": 16290373697280.0, + "grad_norm": 2.040296243102333, + "language_loss": 0.85588348, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.87721586, + "num_input_tokens_seen": 129428655, + "step": 6029, + "time_per_iteration": 2.768638849258423 + }, + { + "auxiliary_loss_clip": 0.01097178, + "auxiliary_loss_mlp": 0.01040472, + "balance_loss_clip": 1.04583371, + "balance_loss_mlp": 1.02534389, + "epoch": 0.36254321358785513, + "flos": 18296307694080.0, + "grad_norm": 1.826841085229912, + "language_loss": 0.72638077, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.74775726, + "num_input_tokens_seen": 129447845, + "step": 6030, + "time_per_iteration": 2.6222145557403564 + }, + { + "auxiliary_loss_clip": 0.01111443, + "auxiliary_loss_mlp": 0.0104401, + "balance_loss_clip": 1.0471518, + "balance_loss_mlp": 1.02635479, + "epoch": 0.3626033368405231, + "flos": 14865294533760.0, + "grad_norm": 2.682823168265615, + "language_loss": 0.74219912, + "learning_rate": 2.94750214514905e-06, + "loss": 0.76375365, + "num_input_tokens_seen": 129463275, + "step": 6031, + "time_per_iteration": 2.62003493309021 + }, + { + "auxiliary_loss_clip": 0.01090216, + "auxiliary_loss_mlp": 0.01046109, + "balance_loss_clip": 1.04174352, + "balance_loss_mlp": 1.03031349, + "epoch": 0.36266346009319106, + "flos": 22306595489280.0, + "grad_norm": 2.122404426395552, + "language_loss": 0.72930032, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.75066358, + "num_input_tokens_seen": 129483205, + "step": 6032, + "time_per_iteration": 2.7382266521453857 + }, + { + "auxiliary_loss_clip": 0.01089342, + "auxiliary_loss_mlp": 0.0104871, + "balance_loss_clip": 1.0457828, + "balance_loss_mlp": 1.03320134, + "epoch": 0.362723583345859, + "flos": 18222331633920.0, + "grad_norm": 2.0052695882675895, + "language_loss": 0.77577424, + "learning_rate": 2.946816107593884e-06, + "loss": 0.79715478, + "num_input_tokens_seen": 129499885, + "step": 6033, + "time_per_iteration": 2.712574005126953 + }, + { + "auxiliary_loss_clip": 0.01011518, + "auxiliary_loss_mlp": 0.01010455, + "balance_loss_clip": 1.02346182, + "balance_loss_mlp": 1.00881004, + "epoch": 0.362783706598527, + "flos": 68499174458880.0, + "grad_norm": 0.775881514372135, + "language_loss": 0.6472615, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.66748118, + "num_input_tokens_seen": 129561885, + "step": 6034, + "time_per_iteration": 3.33389949798584 + }, + { + "auxiliary_loss_clip": 0.0111586, + "auxiliary_loss_mlp": 0.01039589, + "balance_loss_clip": 1.04362679, + "balance_loss_mlp": 1.02373409, + "epoch": 0.36284382985119495, + "flos": 26576589594240.0, + "grad_norm": 2.348469757016237, + "language_loss": 0.89869213, + "learning_rate": 2.946129926425273e-06, + "loss": 0.9202466, + "num_input_tokens_seen": 129582325, + "step": 6035, + "time_per_iteration": 2.661137580871582 + }, + { + "auxiliary_loss_clip": 0.01112128, + "auxiliary_loss_mlp": 0.01040682, + "balance_loss_clip": 1.04810882, + "balance_loss_mlp": 1.02445734, + "epoch": 0.3629039531038629, + "flos": 20156767608960.0, + "grad_norm": 1.7965494412259506, + "language_loss": 0.73480749, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.75633562, + "num_input_tokens_seen": 129600350, + "step": 6036, + "time_per_iteration": 2.627746105194092 + }, + { + "auxiliary_loss_clip": 0.01118939, + "auxiliary_loss_mlp": 0.01034203, + "balance_loss_clip": 1.0476563, + "balance_loss_mlp": 1.01825309, + "epoch": 0.3629640763565309, + "flos": 18625716345600.0, + "grad_norm": 2.247638401714898, + "language_loss": 0.75895989, + "learning_rate": 2.945443601747297e-06, + "loss": 0.78049135, + "num_input_tokens_seen": 129618425, + "step": 6037, + "time_per_iteration": 2.6763134002685547 + }, + { + "auxiliary_loss_clip": 0.01117432, + "auxiliary_loss_mlp": 0.0105958, + "balance_loss_clip": 1.04722893, + "balance_loss_mlp": 1.04149556, + "epoch": 0.36302419960919885, + "flos": 19571459489280.0, + "grad_norm": 1.7641921793444904, + "language_loss": 0.78425813, + "learning_rate": 2.945100385624828e-06, + "loss": 0.80602825, + "num_input_tokens_seen": 129636750, + "step": 6038, + "time_per_iteration": 2.6576154232025146 + }, + { + "auxiliary_loss_clip": 0.01042272, + "auxiliary_loss_mlp": 0.01000075, + "balance_loss_clip": 1.02576721, + "balance_loss_mlp": 0.99842948, + "epoch": 0.3630843228618668, + "flos": 63797606444160.0, + "grad_norm": 0.8328343708327894, + "language_loss": 0.63371962, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.6541431, + "num_input_tokens_seen": 129699030, + "step": 6039, + "time_per_iteration": 3.268035650253296 + }, + { + "auxiliary_loss_clip": 0.01108663, + "auxiliary_loss_mlp": 0.01052032, + "balance_loss_clip": 1.04687905, + "balance_loss_mlp": 1.03485394, + "epoch": 0.3631444461145348, + "flos": 21835160461440.0, + "grad_norm": 2.83972356132426, + "language_loss": 0.71349055, + "learning_rate": 2.944413845878002e-06, + "loss": 0.73509747, + "num_input_tokens_seen": 129717135, + "step": 6040, + "time_per_iteration": 2.7468066215515137 + }, + { + "auxiliary_loss_clip": 0.01129452, + "auxiliary_loss_mlp": 0.01039721, + "balance_loss_clip": 1.05027485, + "balance_loss_mlp": 1.02372289, + "epoch": 0.36320456936720275, + "flos": 21722041555200.0, + "grad_norm": 1.6017927687359714, + "language_loss": 0.81615877, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.83785057, + "num_input_tokens_seen": 129735940, + "step": 6041, + "time_per_iteration": 2.6624767780303955 + }, + { + "auxiliary_loss_clip": 0.01116373, + "auxiliary_loss_mlp": 0.01037475, + "balance_loss_clip": 1.04789138, + "balance_loss_mlp": 1.02039289, + "epoch": 0.3632646926198707, + "flos": 17019072910080.0, + "grad_norm": 6.335898198250863, + "language_loss": 0.83848882, + "learning_rate": 2.943727162882107e-06, + "loss": 0.86002731, + "num_input_tokens_seen": 129752790, + "step": 6042, + "time_per_iteration": 2.6279616355895996 + }, + { + "auxiliary_loss_clip": 0.01113831, + "auxiliary_loss_mlp": 0.01045895, + "balance_loss_clip": 1.04817295, + "balance_loss_mlp": 1.03020668, + "epoch": 0.36332481587253873, + "flos": 23331163029120.0, + "grad_norm": 1.8194124872693949, + "language_loss": 0.78401059, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.80560786, + "num_input_tokens_seen": 129773655, + "step": 6043, + "time_per_iteration": 4.221862077713013 + }, + { + "auxiliary_loss_clip": 0.01111193, + "auxiliary_loss_mlp": 0.01036813, + "balance_loss_clip": 1.05454051, + "balance_loss_mlp": 1.02078581, + "epoch": 0.3633849391252067, + "flos": 10743539857920.0, + "grad_norm": 2.743973887678544, + "language_loss": 0.65664518, + "learning_rate": 2.943040336741298e-06, + "loss": 0.67812526, + "num_input_tokens_seen": 129791605, + "step": 6044, + "time_per_iteration": 2.7301173210144043 + }, + { + "auxiliary_loss_clip": 0.01109397, + "auxiliary_loss_mlp": 0.01034976, + "balance_loss_clip": 1.04838157, + "balance_loss_mlp": 1.02035475, + "epoch": 0.36344506237787466, + "flos": 25849147357440.0, + "grad_norm": 2.5365479968338187, + "language_loss": 0.81149542, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.83293915, + "num_input_tokens_seen": 129811075, + "step": 6045, + "time_per_iteration": 2.6896753311157227 + }, + { + "auxiliary_loss_clip": 0.0110304, + "auxiliary_loss_mlp": 0.01045503, + "balance_loss_clip": 1.04706383, + "balance_loss_mlp": 1.02923083, + "epoch": 0.3635051856305426, + "flos": 30154046503680.0, + "grad_norm": 2.400629400498793, + "language_loss": 0.65010375, + "learning_rate": 2.942353367559755e-06, + "loss": 0.67158914, + "num_input_tokens_seen": 129833755, + "step": 6046, + "time_per_iteration": 2.800321578979492 + }, + { + "auxiliary_loss_clip": 0.01102544, + "auxiliary_loss_mlp": 0.01038937, + "balance_loss_clip": 1.0467155, + "balance_loss_mlp": 1.02399993, + "epoch": 0.3635653088832106, + "flos": 22198396746240.0, + "grad_norm": 2.172977049503826, + "language_loss": 0.77142686, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.79284167, + "num_input_tokens_seen": 129854475, + "step": 6047, + "time_per_iteration": 4.274283170700073 + }, + { + "auxiliary_loss_clip": 0.01137356, + "auxiliary_loss_mlp": 0.01047564, + "balance_loss_clip": 1.05142486, + "balance_loss_mlp": 1.02983761, + "epoch": 0.36362543213587856, + "flos": 24787053083520.0, + "grad_norm": 1.922622021112015, + "language_loss": 0.79610157, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.81795079, + "num_input_tokens_seen": 129873530, + "step": 6048, + "time_per_iteration": 4.283480644226074 + }, + { + "auxiliary_loss_clip": 0.01037942, + "auxiliary_loss_mlp": 0.01005664, + "balance_loss_clip": 1.01860034, + "balance_loss_mlp": 1.00387573, + "epoch": 0.3636855553885465, + "flos": 62526369231360.0, + "grad_norm": 0.749844121463454, + "language_loss": 0.52550006, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.54593611, + "num_input_tokens_seen": 129940400, + "step": 6049, + "time_per_iteration": 3.2647299766540527 + }, + { + "auxiliary_loss_clip": 0.01105759, + "auxiliary_loss_mlp": 0.01042028, + "balance_loss_clip": 1.04831481, + "balance_loss_mlp": 1.02467084, + "epoch": 0.3637456786412145, + "flos": 24060652341120.0, + "grad_norm": 9.722138117523357, + "language_loss": 0.8628068, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.88428462, + "num_input_tokens_seen": 129958635, + "step": 6050, + "time_per_iteration": 2.744236469268799 + }, + { + "auxiliary_loss_clip": 0.01120328, + "auxiliary_loss_mlp": 0.00772785, + "balance_loss_clip": 1.04944158, + "balance_loss_mlp": 1.0004611, + "epoch": 0.36380580189388245, + "flos": 16691495852160.0, + "grad_norm": 3.109361789309709, + "language_loss": 0.78116536, + "learning_rate": 2.940635319486546e-06, + "loss": 0.80009651, + "num_input_tokens_seen": 129977685, + "step": 6051, + "time_per_iteration": 2.6305320262908936 + }, + { + "auxiliary_loss_clip": 0.01127196, + "auxiliary_loss_mlp": 0.01040856, + "balance_loss_clip": 1.04900503, + "balance_loss_mlp": 1.02559745, + "epoch": 0.3638659251465504, + "flos": 25114091437440.0, + "grad_norm": 1.9275322741448784, + "language_loss": 0.82526582, + "learning_rate": 2.940291602812822e-06, + "loss": 0.84694636, + "num_input_tokens_seen": 129997530, + "step": 6052, + "time_per_iteration": 2.711794853210449 + }, + { + "auxiliary_loss_clip": 0.01100415, + "auxiliary_loss_mlp": 0.01036967, + "balance_loss_clip": 1.04675376, + "balance_loss_mlp": 1.02270949, + "epoch": 0.3639260483992184, + "flos": 23003011353600.0, + "grad_norm": 1.7820298413079305, + "language_loss": 0.72085792, + "learning_rate": 2.939947850483145e-06, + "loss": 0.74223173, + "num_input_tokens_seen": 130017955, + "step": 6053, + "time_per_iteration": 2.725600481033325 + }, + { + "auxiliary_loss_clip": 0.01015406, + "auxiliary_loss_mlp": 0.01003631, + "balance_loss_clip": 1.0300014, + "balance_loss_mlp": 1.00155663, + "epoch": 0.36398617165188635, + "flos": 70716011160960.0, + "grad_norm": 0.7712310074836012, + "language_loss": 0.61214095, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.63233131, + "num_input_tokens_seen": 130074275, + "step": 6054, + "time_per_iteration": 3.3252007961273193 + }, + { + "auxiliary_loss_clip": 0.0111079, + "auxiliary_loss_mlp": 0.01038999, + "balance_loss_clip": 1.04735899, + "balance_loss_mlp": 1.02214301, + "epoch": 0.3640462949045543, + "flos": 22235456603520.0, + "grad_norm": 2.93078334140581, + "language_loss": 0.75820959, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.77970749, + "num_input_tokens_seen": 130091375, + "step": 6055, + "time_per_iteration": 2.656001091003418 + }, + { + "auxiliary_loss_clip": 0.0113529, + "auxiliary_loss_mlp": 0.01041525, + "balance_loss_clip": 1.04910016, + "balance_loss_mlp": 1.02581286, + "epoch": 0.3641064181572223, + "flos": 21543529939200.0, + "grad_norm": 1.6734377169093124, + "language_loss": 0.7533145, + "learning_rate": 2.938916379688765e-06, + "loss": 0.77508265, + "num_input_tokens_seen": 130111595, + "step": 6056, + "time_per_iteration": 2.654418468475342 + }, + { + "auxiliary_loss_clip": 0.01121707, + "auxiliary_loss_mlp": 0.01038714, + "balance_loss_clip": 1.055071, + "balance_loss_mlp": 1.02337217, + "epoch": 0.3641665414098903, + "flos": 22273306560000.0, + "grad_norm": 2.035168503846255, + "language_loss": 0.80473512, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.82633936, + "num_input_tokens_seen": 130131440, + "step": 6057, + "time_per_iteration": 2.7347753047943115 + }, + { + "auxiliary_loss_clip": 0.01107128, + "auxiliary_loss_mlp": 0.01039802, + "balance_loss_clip": 1.04495037, + "balance_loss_mlp": 1.02438855, + "epoch": 0.36422666466255826, + "flos": 28329676778880.0, + "grad_norm": 2.043030499006847, + "language_loss": 0.80264485, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.8241142, + "num_input_tokens_seen": 130151375, + "step": 6058, + "time_per_iteration": 2.695674180984497 + }, + { + "auxiliary_loss_clip": 0.01102831, + "auxiliary_loss_mlp": 0.00772601, + "balance_loss_clip": 1.04357934, + "balance_loss_mlp": 1.00046432, + "epoch": 0.36428678791522623, + "flos": 24170503109760.0, + "grad_norm": 2.032310914115462, + "language_loss": 0.84994543, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.86869979, + "num_input_tokens_seen": 130169960, + "step": 6059, + "time_per_iteration": 2.6912410259246826 + }, + { + "auxiliary_loss_clip": 0.01093721, + "auxiliary_loss_mlp": 0.01039242, + "balance_loss_clip": 1.04318213, + "balance_loss_mlp": 1.02287483, + "epoch": 0.3643469111678942, + "flos": 22528451842560.0, + "grad_norm": 5.903326132338396, + "language_loss": 0.87806225, + "learning_rate": 2.937540586903884e-06, + "loss": 0.89939183, + "num_input_tokens_seen": 130189800, + "step": 6060, + "time_per_iteration": 2.713115692138672 + }, + { + "auxiliary_loss_clip": 0.01125791, + "auxiliary_loss_mlp": 0.01040312, + "balance_loss_clip": 1.0498302, + "balance_loss_mlp": 1.02388453, + "epoch": 0.36440703442056216, + "flos": 19426595938560.0, + "grad_norm": 2.3521788015610805, + "language_loss": 0.66954017, + "learning_rate": 2.937196549795971e-06, + "loss": 0.69120121, + "num_input_tokens_seen": 130206370, + "step": 6061, + "time_per_iteration": 2.8435866832733154 + }, + { + "auxiliary_loss_clip": 0.0111942, + "auxiliary_loss_mlp": 0.01038694, + "balance_loss_clip": 1.05207086, + "balance_loss_mlp": 1.02260041, + "epoch": 0.3644671576732301, + "flos": 18040515966720.0, + "grad_norm": 2.5119296796020354, + "language_loss": 0.75012159, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.77170277, + "num_input_tokens_seen": 130224445, + "step": 6062, + "time_per_iteration": 2.659853935241699 + }, + { + "auxiliary_loss_clip": 0.01108402, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.04851866, + "balance_loss_mlp": 1.01628149, + "epoch": 0.3645272809258981, + "flos": 21542811667200.0, + "grad_norm": 2.568706719167558, + "language_loss": 0.72070628, + "learning_rate": 2.936508368977432e-06, + "loss": 0.74213189, + "num_input_tokens_seen": 130245380, + "step": 6063, + "time_per_iteration": 2.7098159790039062 + }, + { + "auxiliary_loss_clip": 0.01118768, + "auxiliary_loss_mlp": 0.010373, + "balance_loss_clip": 1.04472148, + "balance_loss_mlp": 1.02187479, + "epoch": 0.36458740417856605, + "flos": 22746860490240.0, + "grad_norm": 2.3511982692020936, + "language_loss": 0.68179435, + "learning_rate": 2.936164225292901e-06, + "loss": 0.70335501, + "num_input_tokens_seen": 130265575, + "step": 6064, + "time_per_iteration": 2.6513044834136963 + }, + { + "auxiliary_loss_clip": 0.01116627, + "auxiliary_loss_mlp": 0.01045789, + "balance_loss_clip": 1.04925466, + "balance_loss_mlp": 1.02988076, + "epoch": 0.364647527431234, + "flos": 26140670138880.0, + "grad_norm": 1.9840367281230236, + "language_loss": 0.74147421, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.76309836, + "num_input_tokens_seen": 130286195, + "step": 6065, + "time_per_iteration": 2.764556407928467 + }, + { + "auxiliary_loss_clip": 0.0111688, + "auxiliary_loss_mlp": 0.01040465, + "balance_loss_clip": 1.04924774, + "balance_loss_mlp": 1.02306008, + "epoch": 0.364707650683902, + "flos": 31029907737600.0, + "grad_norm": 2.0108238901766042, + "language_loss": 0.75444913, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.77602255, + "num_input_tokens_seen": 130306095, + "step": 6066, + "time_per_iteration": 2.749293088912964 + }, + { + "auxiliary_loss_clip": 0.01121102, + "auxiliary_loss_mlp": 0.01034674, + "balance_loss_clip": 1.04859555, + "balance_loss_mlp": 1.02010643, + "epoch": 0.36476777393656995, + "flos": 19572896033280.0, + "grad_norm": 2.8385875288429587, + "language_loss": 0.76480901, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.78636676, + "num_input_tokens_seen": 130324685, + "step": 6067, + "time_per_iteration": 2.619833469390869 + }, + { + "auxiliary_loss_clip": 0.01135088, + "auxiliary_loss_mlp": 0.0103807, + "balance_loss_clip": 1.05067635, + "balance_loss_mlp": 1.02401567, + "epoch": 0.3648278971892379, + "flos": 17748849530880.0, + "grad_norm": 2.2214902441228563, + "language_loss": 0.71036232, + "learning_rate": 2.934787295690886e-06, + "loss": 0.73209393, + "num_input_tokens_seen": 130343855, + "step": 6068, + "time_per_iteration": 2.633678674697876 + }, + { + "auxiliary_loss_clip": 0.01119276, + "auxiliary_loss_mlp": 0.01039471, + "balance_loss_clip": 1.0432384, + "balance_loss_mlp": 1.02402711, + "epoch": 0.3648880204419059, + "flos": 17931167988480.0, + "grad_norm": 2.184109901605664, + "language_loss": 0.74421692, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.76580441, + "num_input_tokens_seen": 130362320, + "step": 6069, + "time_per_iteration": 2.6463425159454346 + }, + { + "auxiliary_loss_clip": 0.01115147, + "auxiliary_loss_mlp": 0.0103807, + "balance_loss_clip": 1.04814148, + "balance_loss_mlp": 1.02237022, + "epoch": 0.3649481436945739, + "flos": 22638266697600.0, + "grad_norm": 1.8874088651190308, + "language_loss": 0.66247845, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.68401062, + "num_input_tokens_seen": 130383165, + "step": 6070, + "time_per_iteration": 2.70835280418396 + }, + { + "auxiliary_loss_clip": 0.01118852, + "auxiliary_loss_mlp": 0.01036547, + "balance_loss_clip": 1.04837227, + "balance_loss_mlp": 1.02161574, + "epoch": 0.36500826694724187, + "flos": 21579656042880.0, + "grad_norm": 1.882521473859371, + "language_loss": 0.74406028, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.76561427, + "num_input_tokens_seen": 130402425, + "step": 6071, + "time_per_iteration": 2.6479921340942383 + }, + { + "auxiliary_loss_clip": 0.0112348, + "auxiliary_loss_mlp": 0.01037332, + "balance_loss_clip": 1.04683149, + "balance_loss_mlp": 1.02142978, + "epoch": 0.36506839019990983, + "flos": 13772533023360.0, + "grad_norm": 1.9443656652026238, + "language_loss": 0.88592315, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.9075312, + "num_input_tokens_seen": 130419440, + "step": 6072, + "time_per_iteration": 2.641340732574463 + }, + { + "auxiliary_loss_clip": 0.01122637, + "auxiliary_loss_mlp": 0.01036427, + "balance_loss_clip": 1.0495832, + "balance_loss_mlp": 1.02225924, + "epoch": 0.3651285134525778, + "flos": 17274972378240.0, + "grad_norm": 2.382408041683643, + "language_loss": 0.72436309, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.7459538, + "num_input_tokens_seen": 130438495, + "step": 6073, + "time_per_iteration": 2.6814513206481934 + }, + { + "auxiliary_loss_clip": 0.01067321, + "auxiliary_loss_mlp": 0.01042007, + "balance_loss_clip": 1.04483008, + "balance_loss_mlp": 1.0249722, + "epoch": 0.36518863670524576, + "flos": 21907987286400.0, + "grad_norm": 3.1332797030940913, + "language_loss": 0.66850221, + "learning_rate": 2.932720838132236e-06, + "loss": 0.68959546, + "num_input_tokens_seen": 130455575, + "step": 6074, + "time_per_iteration": 2.7943460941314697 + }, + { + "auxiliary_loss_clip": 0.01103652, + "auxiliary_loss_mlp": 0.01037343, + "balance_loss_clip": 1.04833269, + "balance_loss_mlp": 1.02238262, + "epoch": 0.3652487599579137, + "flos": 27122180250240.0, + "grad_norm": 1.5371260958261816, + "language_loss": 0.72812623, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.74953616, + "num_input_tokens_seen": 130476385, + "step": 6075, + "time_per_iteration": 2.7581374645233154 + }, + { + "auxiliary_loss_clip": 0.01100578, + "auxiliary_loss_mlp": 0.01046604, + "balance_loss_clip": 1.04679585, + "balance_loss_mlp": 1.03011715, + "epoch": 0.3653088832105817, + "flos": 19755573626880.0, + "grad_norm": 2.1248471900324186, + "language_loss": 0.89377797, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.91524976, + "num_input_tokens_seen": 130493630, + "step": 6076, + "time_per_iteration": 2.7085182666778564 + }, + { + "auxiliary_loss_clip": 0.01125287, + "auxiliary_loss_mlp": 0.01043945, + "balance_loss_clip": 1.0504595, + "balance_loss_mlp": 1.02784586, + "epoch": 0.36536900646324966, + "flos": 13115008609920.0, + "grad_norm": 2.218138292044272, + "language_loss": 0.69377828, + "learning_rate": 2.931687131696872e-06, + "loss": 0.71547067, + "num_input_tokens_seen": 130510735, + "step": 6077, + "time_per_iteration": 2.6516926288604736 + }, + { + "auxiliary_loss_clip": 0.01063406, + "auxiliary_loss_mlp": 0.01003112, + "balance_loss_clip": 1.03200221, + "balance_loss_mlp": 1.00121677, + "epoch": 0.3654291297159176, + "flos": 71100472383360.0, + "grad_norm": 0.7484778409156561, + "language_loss": 0.61802375, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.63868892, + "num_input_tokens_seen": 130577050, + "step": 6078, + "time_per_iteration": 3.2192225456237793 + }, + { + "auxiliary_loss_clip": 0.01105852, + "auxiliary_loss_mlp": 0.01053011, + "balance_loss_clip": 1.04234397, + "balance_loss_mlp": 1.03565383, + "epoch": 0.3654892529685856, + "flos": 23617478338560.0, + "grad_norm": 2.6620805395927283, + "language_loss": 0.78445792, + "learning_rate": 2.930997817403173e-06, + "loss": 0.80604661, + "num_input_tokens_seen": 130593780, + "step": 6079, + "time_per_iteration": 2.6616902351379395 + }, + { + "auxiliary_loss_clip": 0.01129934, + "auxiliary_loss_mlp": 0.01040158, + "balance_loss_clip": 1.05226243, + "balance_loss_mlp": 1.02386224, + "epoch": 0.36554937622125355, + "flos": 43470799850880.0, + "grad_norm": 2.4767906644356037, + "language_loss": 0.62662333, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.64832425, + "num_input_tokens_seen": 130615510, + "step": 6080, + "time_per_iteration": 2.8651509284973145 + }, + { + "auxiliary_loss_clip": 0.01108292, + "auxiliary_loss_mlp": 0.01042236, + "balance_loss_clip": 1.04737091, + "balance_loss_mlp": 1.02529645, + "epoch": 0.3656094994739215, + "flos": 23294641875840.0, + "grad_norm": 3.1314387429818327, + "language_loss": 0.67686033, + "learning_rate": 2.930308361895352e-06, + "loss": 0.69836557, + "num_input_tokens_seen": 130635410, + "step": 6081, + "time_per_iteration": 2.707031011581421 + }, + { + "auxiliary_loss_clip": 0.01112746, + "auxiliary_loss_mlp": 0.00773158, + "balance_loss_clip": 1.04989302, + "balance_loss_mlp": 1.00033236, + "epoch": 0.3656696227265895, + "flos": 24571984400640.0, + "grad_norm": 1.5854068035466964, + "language_loss": 0.74755692, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.76641595, + "num_input_tokens_seen": 130657725, + "step": 6082, + "time_per_iteration": 2.7261881828308105 + }, + { + "auxiliary_loss_clip": 0.01072732, + "auxiliary_loss_mlp": 0.00772597, + "balance_loss_clip": 1.04222691, + "balance_loss_mlp": 1.00042963, + "epoch": 0.3657297459792575, + "flos": 27928375056000.0, + "grad_norm": 2.051480252043875, + "language_loss": 0.82956016, + "learning_rate": 2.929618765277987e-06, + "loss": 0.8480134, + "num_input_tokens_seen": 130678360, + "step": 6083, + "time_per_iteration": 4.360748529434204 + }, + { + "auxiliary_loss_clip": 0.01041394, + "auxiliary_loss_mlp": 0.01001412, + "balance_loss_clip": 1.02900386, + "balance_loss_mlp": 0.99936181, + "epoch": 0.36578986923192547, + "flos": 67392622126080.0, + "grad_norm": 0.8163771270511553, + "language_loss": 0.59314513, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.61357319, + "num_input_tokens_seen": 130742110, + "step": 6084, + "time_per_iteration": 3.3273561000823975 + }, + { + "auxiliary_loss_clip": 0.0109183, + "auxiliary_loss_mlp": 0.0104143, + "balance_loss_clip": 1.04496968, + "balance_loss_mlp": 1.02570593, + "epoch": 0.36584999248459343, + "flos": 20227511445120.0, + "grad_norm": 3.4329037043843478, + "language_loss": 0.72791892, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.74925154, + "num_input_tokens_seen": 130759870, + "step": 6085, + "time_per_iteration": 2.7221856117248535 + }, + { + "auxiliary_loss_clip": 0.01101549, + "auxiliary_loss_mlp": 0.01038512, + "balance_loss_clip": 1.04982924, + "balance_loss_mlp": 1.02383745, + "epoch": 0.3659101157372614, + "flos": 19062461813760.0, + "grad_norm": 2.636651052815632, + "language_loss": 0.77860379, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.80000436, + "num_input_tokens_seen": 130778510, + "step": 6086, + "time_per_iteration": 4.265977621078491 + }, + { + "auxiliary_loss_clip": 0.0111591, + "auxiliary_loss_mlp": 0.01032554, + "balance_loss_clip": 1.04616153, + "balance_loss_mlp": 1.01771855, + "epoch": 0.36597023898992936, + "flos": 30810708990720.0, + "grad_norm": 1.8562986050024126, + "language_loss": 0.76759315, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.78907776, + "num_input_tokens_seen": 130798535, + "step": 6087, + "time_per_iteration": 4.227373123168945 + }, + { + "auxiliary_loss_clip": 0.01081855, + "auxiliary_loss_mlp": 0.01042282, + "balance_loss_clip": 1.04556108, + "balance_loss_mlp": 1.02589023, + "epoch": 0.36603036224259733, + "flos": 20521799573760.0, + "grad_norm": 2.2476274891892474, + "language_loss": 0.71063232, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.73187363, + "num_input_tokens_seen": 130816655, + "step": 6088, + "time_per_iteration": 4.3080058097839355 + }, + { + "auxiliary_loss_clip": 0.01136094, + "auxiliary_loss_mlp": 0.01039702, + "balance_loss_clip": 1.05314517, + "balance_loss_mlp": 1.02267289, + "epoch": 0.3660904854952653, + "flos": 38329397798400.0, + "grad_norm": 1.6318023186273214, + "language_loss": 0.79717827, + "learning_rate": 2.92754912981472e-06, + "loss": 0.81893623, + "num_input_tokens_seen": 130841225, + "step": 6089, + "time_per_iteration": 2.782954216003418 + }, + { + "auxiliary_loss_clip": 0.01099767, + "auxiliary_loss_mlp": 0.01036428, + "balance_loss_clip": 1.04514015, + "balance_loss_mlp": 1.02220643, + "epoch": 0.36615060874793326, + "flos": 21835555511040.0, + "grad_norm": 2.0312735397290043, + "language_loss": 0.71617413, + "learning_rate": 2.927204067389884e-06, + "loss": 0.73753607, + "num_input_tokens_seen": 130861050, + "step": 6090, + "time_per_iteration": 2.7414958477020264 + }, + { + "auxiliary_loss_clip": 0.01105933, + "auxiliary_loss_mlp": 0.01047805, + "balance_loss_clip": 1.05133104, + "balance_loss_mlp": 1.03305852, + "epoch": 0.3662107320006012, + "flos": 16581537342720.0, + "grad_norm": 2.037307676788604, + "language_loss": 0.74434924, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.7658866, + "num_input_tokens_seen": 130879775, + "step": 6091, + "time_per_iteration": 2.628554344177246 + }, + { + "auxiliary_loss_clip": 0.01076087, + "auxiliary_loss_mlp": 0.01042935, + "balance_loss_clip": 1.04836047, + "balance_loss_mlp": 1.02728868, + "epoch": 0.3662708552532692, + "flos": 20958365473920.0, + "grad_norm": 2.1960531931019682, + "language_loss": 0.73387206, + "learning_rate": 2.926513837074284e-06, + "loss": 0.75506234, + "num_input_tokens_seen": 130898070, + "step": 6092, + "time_per_iteration": 2.7320556640625 + }, + { + "auxiliary_loss_clip": 0.01127006, + "auxiliary_loss_mlp": 0.01044139, + "balance_loss_clip": 1.04809344, + "balance_loss_mlp": 1.02796876, + "epoch": 0.36633097850593715, + "flos": 21902707987200.0, + "grad_norm": 1.9967925590844784, + "language_loss": 0.77662504, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.79833645, + "num_input_tokens_seen": 130915250, + "step": 6093, + "time_per_iteration": 2.721311092376709 + }, + { + "auxiliary_loss_clip": 0.01124005, + "auxiliary_loss_mlp": 0.01042053, + "balance_loss_clip": 1.04696584, + "balance_loss_mlp": 1.02686548, + "epoch": 0.3663911017586051, + "flos": 32854133808000.0, + "grad_norm": 1.926436620767835, + "language_loss": 0.7455743, + "learning_rate": 2.925823466224696e-06, + "loss": 0.76723486, + "num_input_tokens_seen": 130936995, + "step": 6094, + "time_per_iteration": 2.767188310623169 + }, + { + "auxiliary_loss_clip": 0.01142303, + "auxiliary_loss_mlp": 0.01055832, + "balance_loss_clip": 1.05334711, + "balance_loss_mlp": 1.03969133, + "epoch": 0.3664512250112731, + "flos": 27271748482560.0, + "grad_norm": 1.743331442809004, + "language_loss": 0.79444361, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.81642497, + "num_input_tokens_seen": 130957970, + "step": 6095, + "time_per_iteration": 2.718632459640503 + }, + { + "auxiliary_loss_clip": 0.01118218, + "auxiliary_loss_mlp": 0.00774719, + "balance_loss_clip": 1.05141842, + "balance_loss_mlp": 1.00037265, + "epoch": 0.3665113482639411, + "flos": 17784436930560.0, + "grad_norm": 3.4988865885900178, + "language_loss": 0.73592722, + "learning_rate": 2.925132954945834e-06, + "loss": 0.75485659, + "num_input_tokens_seen": 130974915, + "step": 6096, + "time_per_iteration": 2.674382448196411 + }, + { + "auxiliary_loss_clip": 0.01099743, + "auxiliary_loss_mlp": 0.01038971, + "balance_loss_clip": 1.04458702, + "balance_loss_mlp": 1.02355742, + "epoch": 0.36657147151660907, + "flos": 27854614477440.0, + "grad_norm": 2.41624095312735, + "language_loss": 0.67081815, + "learning_rate": 2.924787646678155e-06, + "loss": 0.69220531, + "num_input_tokens_seen": 130995745, + "step": 6097, + "time_per_iteration": 2.789118766784668 + }, + { + "auxiliary_loss_clip": 0.01077673, + "auxiliary_loss_mlp": 0.01038362, + "balance_loss_clip": 1.04489172, + "balance_loss_mlp": 1.02268624, + "epoch": 0.36663159476927704, + "flos": 25374013228800.0, + "grad_norm": 1.4796406838499911, + "language_loss": 0.77679402, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.79795432, + "num_input_tokens_seen": 131015545, + "step": 6098, + "time_per_iteration": 2.7803733348846436 + }, + { + "auxiliary_loss_clip": 0.01122346, + "auxiliary_loss_mlp": 0.01045291, + "balance_loss_clip": 1.04734826, + "balance_loss_mlp": 1.02987766, + "epoch": 0.366691718021945, + "flos": 21357225072000.0, + "grad_norm": 1.744595499322522, + "language_loss": 0.73707491, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.75875127, + "num_input_tokens_seen": 131033990, + "step": 6099, + "time_per_iteration": 2.6809163093566895 + }, + { + "auxiliary_loss_clip": 0.01111202, + "auxiliary_loss_mlp": 0.01044256, + "balance_loss_clip": 1.04759586, + "balance_loss_mlp": 1.02931285, + "epoch": 0.36675184127461297, + "flos": 16800376953600.0, + "grad_norm": 1.8475933970370078, + "language_loss": 0.84773195, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.86928654, + "num_input_tokens_seen": 131050710, + "step": 6100, + "time_per_iteration": 2.6730356216430664 + }, + { + "auxiliary_loss_clip": 0.01102438, + "auxiliary_loss_mlp": 0.01037575, + "balance_loss_clip": 1.04448223, + "balance_loss_mlp": 1.02181566, + "epoch": 0.36681196452728093, + "flos": 21906514828800.0, + "grad_norm": 3.9532097547953104, + "language_loss": 0.70893979, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.73033994, + "num_input_tokens_seen": 131071435, + "step": 6101, + "time_per_iteration": 2.7369589805603027 + }, + { + "auxiliary_loss_clip": 0.01111262, + "auxiliary_loss_mlp": 0.01052791, + "balance_loss_clip": 1.05096185, + "balance_loss_mlp": 1.0361371, + "epoch": 0.3668720877799489, + "flos": 17712436118400.0, + "grad_norm": 2.286737474315047, + "language_loss": 0.76634502, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.7879855, + "num_input_tokens_seen": 131088775, + "step": 6102, + "time_per_iteration": 2.7081708908081055 + }, + { + "auxiliary_loss_clip": 0.01131629, + "auxiliary_loss_mlp": 0.01037373, + "balance_loss_clip": 1.0524683, + "balance_loss_mlp": 1.02050483, + "epoch": 0.36693221103261686, + "flos": 47045455499520.0, + "grad_norm": 4.369253140908342, + "language_loss": 0.70019859, + "learning_rate": 2.922715061101625e-06, + "loss": 0.72188866, + "num_input_tokens_seen": 131112800, + "step": 6103, + "time_per_iteration": 2.8610281944274902 + }, + { + "auxiliary_loss_clip": 0.01093091, + "auxiliary_loss_mlp": 0.0103895, + "balance_loss_clip": 1.04730344, + "balance_loss_mlp": 1.02283263, + "epoch": 0.3669923342852848, + "flos": 15960929132160.0, + "grad_norm": 3.0883152470965842, + "language_loss": 0.72272754, + "learning_rate": 2.922369507632716e-06, + "loss": 0.744048, + "num_input_tokens_seen": 131131150, + "step": 6104, + "time_per_iteration": 2.7520432472229004 + }, + { + "auxiliary_loss_clip": 0.01127975, + "auxiliary_loss_mlp": 0.01036046, + "balance_loss_clip": 1.05017686, + "balance_loss_mlp": 1.01940393, + "epoch": 0.3670524575379528, + "flos": 19974485064960.0, + "grad_norm": 2.1608886453477947, + "language_loss": 0.81461251, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.83625269, + "num_input_tokens_seen": 131150365, + "step": 6105, + "time_per_iteration": 2.7565362453460693 + }, + { + "auxiliary_loss_clip": 0.0114363, + "auxiliary_loss_mlp": 0.01041522, + "balance_loss_clip": 1.05170739, + "balance_loss_mlp": 1.02526236, + "epoch": 0.36711258079062076, + "flos": 25702955003520.0, + "grad_norm": 1.7202629897198451, + "language_loss": 0.81035495, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.83220649, + "num_input_tokens_seen": 131169310, + "step": 6106, + "time_per_iteration": 2.73502779006958 + }, + { + "auxiliary_loss_clip": 0.01035121, + "auxiliary_loss_mlp": 0.00753905, + "balance_loss_clip": 1.03131676, + "balance_loss_mlp": 1.00104892, + "epoch": 0.3671727040432887, + "flos": 60772743342720.0, + "grad_norm": 0.6921927745874564, + "language_loss": 0.59176284, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.60965312, + "num_input_tokens_seen": 131232900, + "step": 6107, + "time_per_iteration": 3.2754647731781006 + }, + { + "auxiliary_loss_clip": 0.01111272, + "auxiliary_loss_mlp": 0.01035704, + "balance_loss_clip": 1.04770529, + "balance_loss_mlp": 1.02058864, + "epoch": 0.3672328272959567, + "flos": 18661303745280.0, + "grad_norm": 1.8102661289525128, + "language_loss": 0.74492711, + "learning_rate": 2.92098694412469e-06, + "loss": 0.76639688, + "num_input_tokens_seen": 131250920, + "step": 6108, + "time_per_iteration": 2.730562448501587 + }, + { + "auxiliary_loss_clip": 0.01129123, + "auxiliary_loss_mlp": 0.01037704, + "balance_loss_clip": 1.04957151, + "balance_loss_mlp": 1.02196801, + "epoch": 0.3672929505486247, + "flos": 15049049535360.0, + "grad_norm": 2.04949693656995, + "language_loss": 0.72790694, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.7495752, + "num_input_tokens_seen": 131267910, + "step": 6109, + "time_per_iteration": 2.6488542556762695 + }, + { + "auxiliary_loss_clip": 0.01065451, + "auxiliary_loss_mlp": 0.01040533, + "balance_loss_clip": 1.04156637, + "balance_loss_mlp": 1.02426052, + "epoch": 0.3673530738012927, + "flos": 20589347099520.0, + "grad_norm": 4.856830375229604, + "language_loss": 0.53295934, + "learning_rate": 2.920295452774744e-06, + "loss": 0.55401909, + "num_input_tokens_seen": 131287150, + "step": 6110, + "time_per_iteration": 2.8366596698760986 + }, + { + "auxiliary_loss_clip": 0.01123878, + "auxiliary_loss_mlp": 0.01039006, + "balance_loss_clip": 1.04783487, + "balance_loss_mlp": 1.02253747, + "epoch": 0.36741319705396064, + "flos": 21689830033920.0, + "grad_norm": 1.6516494205850427, + "language_loss": 0.80507129, + "learning_rate": 2.919949654746672e-06, + "loss": 0.82670015, + "num_input_tokens_seen": 131308225, + "step": 6111, + "time_per_iteration": 2.7537708282470703 + }, + { + "auxiliary_loss_clip": 0.01083524, + "auxiliary_loss_mlp": 0.01044306, + "balance_loss_clip": 1.04381704, + "balance_loss_mlp": 1.02897525, + "epoch": 0.3674733203066286, + "flos": 29862200499840.0, + "grad_norm": 1.7980410764958656, + "language_loss": 0.72401643, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.74529469, + "num_input_tokens_seen": 131332115, + "step": 6112, + "time_per_iteration": 2.80513858795166 + }, + { + "auxiliary_loss_clip": 0.0112775, + "auxiliary_loss_mlp": 0.01046215, + "balance_loss_clip": 1.05025816, + "balance_loss_mlp": 1.03102732, + "epoch": 0.36753344355929657, + "flos": 18257021193600.0, + "grad_norm": 1.6179233760027578, + "language_loss": 0.8539387, + "learning_rate": 2.919257954049892e-06, + "loss": 0.8756783, + "num_input_tokens_seen": 131351885, + "step": 6113, + "time_per_iteration": 2.6997315883636475 + }, + { + "auxiliary_loss_clip": 0.01128342, + "auxiliary_loss_mlp": 0.01041644, + "balance_loss_clip": 1.04813516, + "balance_loss_mlp": 1.02512193, + "epoch": 0.36759356681196453, + "flos": 25301150490240.0, + "grad_norm": 2.2420277636185872, + "language_loss": 0.78542709, + "learning_rate": 2.918912051407413e-06, + "loss": 0.807127, + "num_input_tokens_seen": 131370245, + "step": 6114, + "time_per_iteration": 2.694831609725952 + }, + { + "auxiliary_loss_clip": 0.01133627, + "auxiliary_loss_mlp": 0.01044455, + "balance_loss_clip": 1.05145383, + "balance_loss_mlp": 1.02612031, + "epoch": 0.3676536900646325, + "flos": 21032952065280.0, + "grad_norm": 1.6750895304816946, + "language_loss": 0.67368686, + "learning_rate": 2.918566113919698e-06, + "loss": 0.69546771, + "num_input_tokens_seen": 131388115, + "step": 6115, + "time_per_iteration": 2.6966724395751953 + }, + { + "auxiliary_loss_clip": 0.01104674, + "auxiliary_loss_mlp": 0.01037383, + "balance_loss_clip": 1.04332471, + "balance_loss_mlp": 1.02229142, + "epoch": 0.36771381331730046, + "flos": 16288506190080.0, + "grad_norm": 3.500949938115168, + "language_loss": 0.76685899, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.78827953, + "num_input_tokens_seen": 131404595, + "step": 6116, + "time_per_iteration": 2.6796109676361084 + }, + { + "auxiliary_loss_clip": 0.01088778, + "auxiliary_loss_mlp": 0.01043047, + "balance_loss_clip": 1.04433835, + "balance_loss_mlp": 1.02729988, + "epoch": 0.36777393656996843, + "flos": 22309971367680.0, + "grad_norm": 1.7533988300226562, + "language_loss": 0.62997502, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.65129328, + "num_input_tokens_seen": 131423760, + "step": 6117, + "time_per_iteration": 2.7784011363983154 + }, + { + "auxiliary_loss_clip": 0.01103848, + "auxiliary_loss_mlp": 0.01037351, + "balance_loss_clip": 1.04275632, + "balance_loss_mlp": 1.0210557, + "epoch": 0.3678340598226364, + "flos": 26834069260800.0, + "grad_norm": 1.9867834860772036, + "language_loss": 0.73087811, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.75229007, + "num_input_tokens_seen": 131444955, + "step": 6118, + "time_per_iteration": 2.734731674194336 + }, + { + "auxiliary_loss_clip": 0.01132746, + "auxiliary_loss_mlp": 0.01043898, + "balance_loss_clip": 1.05198336, + "balance_loss_mlp": 1.0266242, + "epoch": 0.36789418307530436, + "flos": 21761723105280.0, + "grad_norm": 2.319960114880422, + "language_loss": 0.72638988, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.74815631, + "num_input_tokens_seen": 131465720, + "step": 6119, + "time_per_iteration": 2.7073371410369873 + }, + { + "auxiliary_loss_clip": 0.0111183, + "auxiliary_loss_mlp": 0.01037904, + "balance_loss_clip": 1.04830384, + "balance_loss_mlp": 1.02101171, + "epoch": 0.3679543063279723, + "flos": 15924192497280.0, + "grad_norm": 1.9587818101138383, + "language_loss": 0.80524689, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.8267442, + "num_input_tokens_seen": 131483080, + "step": 6120, + "time_per_iteration": 2.679933547973633 + }, + { + "auxiliary_loss_clip": 0.01093981, + "auxiliary_loss_mlp": 0.0104441, + "balance_loss_clip": 1.04785204, + "balance_loss_mlp": 1.02894819, + "epoch": 0.3680144295806403, + "flos": 24275541456000.0, + "grad_norm": 2.4092121945194496, + "language_loss": 0.64745319, + "learning_rate": 2.916489757978126e-06, + "loss": 0.66883707, + "num_input_tokens_seen": 131502545, + "step": 6121, + "time_per_iteration": 2.7067880630493164 + }, + { + "auxiliary_loss_clip": 0.01126101, + "auxiliary_loss_mlp": 0.01043212, + "balance_loss_clip": 1.05021691, + "balance_loss_mlp": 1.02735114, + "epoch": 0.36807455283330826, + "flos": 26104148985600.0, + "grad_norm": 1.774708172393826, + "language_loss": 0.71686751, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.73856068, + "num_input_tokens_seen": 131522155, + "step": 6122, + "time_per_iteration": 4.026647329330444 + }, + { + "auxiliary_loss_clip": 0.01106964, + "auxiliary_loss_mlp": 0.01043545, + "balance_loss_clip": 1.04859734, + "balance_loss_mlp": 1.0265938, + "epoch": 0.3681346760859763, + "flos": 24644990793600.0, + "grad_norm": 5.6855406070233245, + "language_loss": 0.69653022, + "learning_rate": 2.915797361163875e-06, + "loss": 0.71803534, + "num_input_tokens_seen": 131543865, + "step": 6123, + "time_per_iteration": 2.7548627853393555 + }, + { + "auxiliary_loss_clip": 0.01128204, + "auxiliary_loss_mlp": 0.01040578, + "balance_loss_clip": 1.04822993, + "balance_loss_mlp": 1.02251744, + "epoch": 0.36819479933864424, + "flos": 23878369797120.0, + "grad_norm": 7.022932421262019, + "language_loss": 0.73640841, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.75809622, + "num_input_tokens_seen": 131562155, + "step": 6124, + "time_per_iteration": 2.6710870265960693 + }, + { + "auxiliary_loss_clip": 0.01116833, + "auxiliary_loss_mlp": 0.01045789, + "balance_loss_clip": 1.04977059, + "balance_loss_mlp": 1.02809882, + "epoch": 0.3682549225913122, + "flos": 25553997302400.0, + "grad_norm": 1.931714997280456, + "language_loss": 0.74334198, + "learning_rate": 2.915104825441114e-06, + "loss": 0.76496822, + "num_input_tokens_seen": 131581695, + "step": 6125, + "time_per_iteration": 4.175686359405518 + }, + { + "auxiliary_loss_clip": 0.01132649, + "auxiliary_loss_mlp": 0.01053205, + "balance_loss_clip": 1.05193818, + "balance_loss_mlp": 1.03514445, + "epoch": 0.36831504584398017, + "flos": 16946605221120.0, + "grad_norm": 1.8318884745506827, + "language_loss": 0.78127813, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.80313659, + "num_input_tokens_seen": 131599465, + "step": 6126, + "time_per_iteration": 2.6783266067504883 + }, + { + "auxiliary_loss_clip": 0.01128437, + "auxiliary_loss_mlp": 0.01045021, + "balance_loss_clip": 1.0490706, + "balance_loss_mlp": 1.02659082, + "epoch": 0.36837516909664814, + "flos": 19865065259520.0, + "grad_norm": 2.7490159956422575, + "language_loss": 0.66118228, + "learning_rate": 2.914412150914888e-06, + "loss": 0.68291688, + "num_input_tokens_seen": 131618330, + "step": 6127, + "time_per_iteration": 4.20530891418457 + }, + { + "auxiliary_loss_clip": 0.01120142, + "auxiliary_loss_mlp": 0.01046706, + "balance_loss_clip": 1.05205703, + "balance_loss_mlp": 1.02980185, + "epoch": 0.3684352923493161, + "flos": 37626984362880.0, + "grad_norm": 1.8515813315176315, + "language_loss": 0.70152593, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.72319436, + "num_input_tokens_seen": 131638960, + "step": 6128, + "time_per_iteration": 4.498606204986572 + }, + { + "auxiliary_loss_clip": 0.0112131, + "auxiliary_loss_mlp": 0.01046424, + "balance_loss_clip": 1.05264103, + "balance_loss_mlp": 1.02957964, + "epoch": 0.36849541560198407, + "flos": 14465501182080.0, + "grad_norm": 2.3245894967836698, + "language_loss": 0.75067866, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.77235603, + "num_input_tokens_seen": 131657440, + "step": 6129, + "time_per_iteration": 2.6874284744262695 + }, + { + "auxiliary_loss_clip": 0.01118674, + "auxiliary_loss_mlp": 0.01040759, + "balance_loss_clip": 1.04533887, + "balance_loss_mlp": 1.02403355, + "epoch": 0.36855553885465203, + "flos": 25770753924480.0, + "grad_norm": 1.6533761140504426, + "language_loss": 0.84758681, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.86918116, + "num_input_tokens_seen": 131678035, + "step": 6130, + "time_per_iteration": 2.729963541030884 + }, + { + "auxiliary_loss_clip": 0.0102639, + "auxiliary_loss_mlp": 0.01017875, + "balance_loss_clip": 1.02295637, + "balance_loss_mlp": 1.01620567, + "epoch": 0.36861566210732, + "flos": 65049417377280.0, + "grad_norm": 0.8481176099425293, + "language_loss": 0.60254776, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62299049, + "num_input_tokens_seen": 131742470, + "step": 6131, + "time_per_iteration": 3.2806124687194824 + }, + { + "auxiliary_loss_clip": 0.01097122, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.04542315, + "balance_loss_mlp": 1.01914179, + "epoch": 0.36867578535998796, + "flos": 30954495133440.0, + "grad_norm": 1.5587449528822306, + "language_loss": 0.73085582, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.75218356, + "num_input_tokens_seen": 131764570, + "step": 6132, + "time_per_iteration": 2.781385898590088 + }, + { + "auxiliary_loss_clip": 0.01127214, + "auxiliary_loss_mlp": 0.01039387, + "balance_loss_clip": 1.04795551, + "balance_loss_mlp": 1.02187514, + "epoch": 0.3687359086126559, + "flos": 28837956182400.0, + "grad_norm": 1.9292425463255205, + "language_loss": 0.74192035, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.76358628, + "num_input_tokens_seen": 131785720, + "step": 6133, + "time_per_iteration": 2.718660831451416 + }, + { + "auxiliary_loss_clip": 0.01072831, + "auxiliary_loss_mlp": 0.01049093, + "balance_loss_clip": 1.041502, + "balance_loss_mlp": 1.03042495, + "epoch": 0.3687960318653239, + "flos": 21396798881280.0, + "grad_norm": 1.8863128538280483, + "language_loss": 0.71522588, + "learning_rate": 2.911986698512874e-06, + "loss": 0.73644507, + "num_input_tokens_seen": 131804430, + "step": 6134, + "time_per_iteration": 2.8003294467926025 + }, + { + "auxiliary_loss_clip": 0.01102901, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.0472008, + "balance_loss_mlp": 1.01838863, + "epoch": 0.36885615511799186, + "flos": 20266043760000.0, + "grad_norm": 1.6906065874809195, + "language_loss": 0.75386798, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.77524465, + "num_input_tokens_seen": 131822060, + "step": 6135, + "time_per_iteration": 2.7916624546051025 + }, + { + "auxiliary_loss_clip": 0.01030435, + "auxiliary_loss_mlp": 0.01019879, + "balance_loss_clip": 1.0281316, + "balance_loss_mlp": 1.01760185, + "epoch": 0.3689162783706599, + "flos": 63088836301440.0, + "grad_norm": 0.8159837123545765, + "language_loss": 0.58766222, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.60816532, + "num_input_tokens_seen": 131880715, + "step": 6136, + "time_per_iteration": 3.2766408920288086 + }, + { + "auxiliary_loss_clip": 0.01106354, + "auxiliary_loss_mlp": 0.01043903, + "balance_loss_clip": 1.04497695, + "balance_loss_mlp": 1.02723718, + "epoch": 0.36897640162332784, + "flos": 10961984419200.0, + "grad_norm": 2.3780452593473393, + "language_loss": 0.79126394, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.81276655, + "num_input_tokens_seen": 131895850, + "step": 6137, + "time_per_iteration": 2.8411052227020264 + }, + { + "auxiliary_loss_clip": 0.011261, + "auxiliary_loss_mlp": 0.01043272, + "balance_loss_clip": 1.04803205, + "balance_loss_mlp": 1.02645159, + "epoch": 0.3690365248759958, + "flos": 20704297599360.0, + "grad_norm": 2.0312275113078337, + "language_loss": 0.7454071, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.76710081, + "num_input_tokens_seen": 131915775, + "step": 6138, + "time_per_iteration": 2.7210230827331543 + }, + { + "auxiliary_loss_clip": 0.01090918, + "auxiliary_loss_mlp": 0.01042472, + "balance_loss_clip": 1.04320955, + "balance_loss_mlp": 1.0259856, + "epoch": 0.3690966481286638, + "flos": 31826369957760.0, + "grad_norm": 2.0947758027881767, + "language_loss": 0.64676917, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.66810304, + "num_input_tokens_seen": 131935715, + "step": 6139, + "time_per_iteration": 2.8667304515838623 + }, + { + "auxiliary_loss_clip": 0.01095075, + "auxiliary_loss_mlp": 0.01042873, + "balance_loss_clip": 1.04443955, + "balance_loss_mlp": 1.02646971, + "epoch": 0.36915677138133174, + "flos": 13114936782720.0, + "grad_norm": 2.1146776737326998, + "language_loss": 0.71764016, + "learning_rate": 2.909906390418006e-06, + "loss": 0.73901963, + "num_input_tokens_seen": 131954120, + "step": 6140, + "time_per_iteration": 2.718100070953369 + }, + { + "auxiliary_loss_clip": 0.01017799, + "auxiliary_loss_mlp": 0.01004631, + "balance_loss_clip": 1.02079976, + "balance_loss_mlp": 1.00281894, + "epoch": 0.3692168946339997, + "flos": 68686879956480.0, + "grad_norm": 0.7503567012350645, + "language_loss": 0.59252203, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.61274636, + "num_input_tokens_seen": 132017485, + "step": 6141, + "time_per_iteration": 3.3003833293914795 + }, + { + "auxiliary_loss_clip": 0.01122088, + "auxiliary_loss_mlp": 0.01040836, + "balance_loss_clip": 1.04716861, + "balance_loss_mlp": 1.02458787, + "epoch": 0.36927701788666767, + "flos": 22017873968640.0, + "grad_norm": 1.900744005055956, + "language_loss": 0.75374687, + "learning_rate": 2.909212678216192e-06, + "loss": 0.77537608, + "num_input_tokens_seen": 132036760, + "step": 6142, + "time_per_iteration": 2.707676410675049 + }, + { + "auxiliary_loss_clip": 0.01122008, + "auxiliary_loss_mlp": 0.01037683, + "balance_loss_clip": 1.04708242, + "balance_loss_mlp": 1.02276349, + "epoch": 0.36933714113933563, + "flos": 21835591424640.0, + "grad_norm": 2.0868371024046346, + "language_loss": 0.77474618, + "learning_rate": 2.908865770392555e-06, + "loss": 0.79634303, + "num_input_tokens_seen": 132056935, + "step": 6143, + "time_per_iteration": 2.6308929920196533 + }, + { + "auxiliary_loss_clip": 0.01122961, + "auxiliary_loss_mlp": 0.01033227, + "balance_loss_clip": 1.04840302, + "balance_loss_mlp": 1.01860011, + "epoch": 0.3693972643920036, + "flos": 23691705793920.0, + "grad_norm": 2.7754530777388555, + "language_loss": 0.82127941, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.84284127, + "num_input_tokens_seen": 132077285, + "step": 6144, + "time_per_iteration": 2.7094409465789795 + }, + { + "auxiliary_loss_clip": 0.01126238, + "auxiliary_loss_mlp": 0.01040495, + "balance_loss_clip": 1.0479883, + "balance_loss_mlp": 1.02547419, + "epoch": 0.36945738764467156, + "flos": 22856747172480.0, + "grad_norm": 2.260022101229928, + "language_loss": 0.774791, + "learning_rate": 2.908171851365593e-06, + "loss": 0.79645836, + "num_input_tokens_seen": 132095520, + "step": 6145, + "time_per_iteration": 2.6951241493225098 + }, + { + "auxiliary_loss_clip": 0.01120499, + "auxiliary_loss_mlp": 0.01030806, + "balance_loss_clip": 1.04903388, + "balance_loss_mlp": 1.01503491, + "epoch": 0.36951751089733953, + "flos": 16615939593600.0, + "grad_norm": 2.2611713814894423, + "language_loss": 0.76861286, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.79012597, + "num_input_tokens_seen": 132112810, + "step": 6146, + "time_per_iteration": 2.6205246448516846 + }, + { + "auxiliary_loss_clip": 0.0110988, + "auxiliary_loss_mlp": 0.01042802, + "balance_loss_clip": 1.04717457, + "balance_loss_mlp": 1.02518249, + "epoch": 0.3695776341500075, + "flos": 18914545607040.0, + "grad_norm": 3.3549376840260394, + "language_loss": 0.80945081, + "learning_rate": 2.907477794586761e-06, + "loss": 0.83097762, + "num_input_tokens_seen": 132131615, + "step": 6147, + "time_per_iteration": 2.7176942825317383 + }, + { + "auxiliary_loss_clip": 0.01108097, + "auxiliary_loss_mlp": 0.00773519, + "balance_loss_clip": 1.05041718, + "balance_loss_mlp": 1.00029731, + "epoch": 0.36963775740267546, + "flos": 20808474019200.0, + "grad_norm": 1.8104892137163535, + "language_loss": 0.83325249, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.85206866, + "num_input_tokens_seen": 132149585, + "step": 6148, + "time_per_iteration": 2.7764229774475098 + }, + { + "auxiliary_loss_clip": 0.01121751, + "auxiliary_loss_mlp": 0.01033697, + "balance_loss_clip": 1.04946411, + "balance_loss_mlp": 1.01843238, + "epoch": 0.3696978806553435, + "flos": 26061881656320.0, + "grad_norm": 2.472295207741171, + "language_loss": 0.74167144, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.76322597, + "num_input_tokens_seen": 132165555, + "step": 6149, + "time_per_iteration": 2.729785680770874 + }, + { + "auxiliary_loss_clip": 0.01141043, + "auxiliary_loss_mlp": 0.01040796, + "balance_loss_clip": 1.0524776, + "balance_loss_mlp": 1.02347541, + "epoch": 0.36975800390801145, + "flos": 26833925606400.0, + "grad_norm": 2.18045381202803, + "language_loss": 0.71229833, + "learning_rate": 2.906436451364054e-06, + "loss": 0.73411667, + "num_input_tokens_seen": 132185100, + "step": 6150, + "time_per_iteration": 2.6558914184570312 + }, + { + "auxiliary_loss_clip": 0.01112432, + "auxiliary_loss_mlp": 0.0104236, + "balance_loss_clip": 1.04834723, + "balance_loss_mlp": 1.02634454, + "epoch": 0.3698181271606794, + "flos": 21142623265920.0, + "grad_norm": 2.1283605732632487, + "language_loss": 0.82001126, + "learning_rate": 2.906089268194611e-06, + "loss": 0.84155917, + "num_input_tokens_seen": 132203930, + "step": 6151, + "time_per_iteration": 2.811908483505249 + }, + { + "auxiliary_loss_clip": 0.0104085, + "auxiliary_loss_mlp": 0.01012111, + "balance_loss_clip": 1.02895284, + "balance_loss_mlp": 1.01035905, + "epoch": 0.3698782504133474, + "flos": 66742639568640.0, + "grad_norm": 0.8434423047890295, + "language_loss": 0.63103437, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.651564, + "num_input_tokens_seen": 132263845, + "step": 6152, + "time_per_iteration": 3.283348798751831 + }, + { + "auxiliary_loss_clip": 0.01083912, + "auxiliary_loss_mlp": 0.01046371, + "balance_loss_clip": 1.04603028, + "balance_loss_mlp": 1.02939606, + "epoch": 0.36993837366601534, + "flos": 24311523905280.0, + "grad_norm": 2.101714417244525, + "language_loss": 0.70249707, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.72379988, + "num_input_tokens_seen": 132282350, + "step": 6153, + "time_per_iteration": 2.776003837585449 + }, + { + "auxiliary_loss_clip": 0.01126735, + "auxiliary_loss_mlp": 0.01038393, + "balance_loss_clip": 1.04984677, + "balance_loss_mlp": 1.02176309, + "epoch": 0.3699984969186833, + "flos": 24349194293760.0, + "grad_norm": 1.5983560512083512, + "language_loss": 0.72364891, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.74530017, + "num_input_tokens_seen": 132301930, + "step": 6154, + "time_per_iteration": 2.7031455039978027 + }, + { + "auxiliary_loss_clip": 0.01108862, + "auxiliary_loss_mlp": 0.01038947, + "balance_loss_clip": 1.04792106, + "balance_loss_mlp": 1.02376008, + "epoch": 0.37005862017135127, + "flos": 19829154637440.0, + "grad_norm": 1.6579101756116525, + "language_loss": 0.67716074, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.6986388, + "num_input_tokens_seen": 132320915, + "step": 6155, + "time_per_iteration": 2.7716591358184814 + }, + { + "auxiliary_loss_clip": 0.01124062, + "auxiliary_loss_mlp": 0.01032665, + "balance_loss_clip": 1.04789114, + "balance_loss_mlp": 1.0171442, + "epoch": 0.37011874342401924, + "flos": 19573793873280.0, + "grad_norm": 1.797024775246088, + "language_loss": 0.68048114, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.70204842, + "num_input_tokens_seen": 132340415, + "step": 6156, + "time_per_iteration": 2.7830615043640137 + }, + { + "auxiliary_loss_clip": 0.01109781, + "auxiliary_loss_mlp": 0.0103684, + "balance_loss_clip": 1.04603815, + "balance_loss_mlp": 1.02202225, + "epoch": 0.3701788666766872, + "flos": 20374350243840.0, + "grad_norm": 1.8485807917443284, + "language_loss": 0.82232833, + "learning_rate": 2.904005448099916e-06, + "loss": 0.84379458, + "num_input_tokens_seen": 132358600, + "step": 6157, + "time_per_iteration": 2.676429033279419 + }, + { + "auxiliary_loss_clip": 0.01087924, + "auxiliary_loss_mlp": 0.01042208, + "balance_loss_clip": 1.04360199, + "balance_loss_mlp": 1.02474344, + "epoch": 0.37023898992935517, + "flos": 15340931452800.0, + "grad_norm": 2.2992188770836175, + "language_loss": 0.76899838, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.79029977, + "num_input_tokens_seen": 132373160, + "step": 6158, + "time_per_iteration": 2.7764365673065186 + }, + { + "auxiliary_loss_clip": 0.01138492, + "auxiliary_loss_mlp": 0.01037057, + "balance_loss_clip": 1.0489651, + "balance_loss_mlp": 1.01997483, + "epoch": 0.37029911318202313, + "flos": 19573937527680.0, + "grad_norm": 2.8360595009252196, + "language_loss": 0.68930852, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.71106398, + "num_input_tokens_seen": 132392345, + "step": 6159, + "time_per_iteration": 2.664858818054199 + }, + { + "auxiliary_loss_clip": 0.01110756, + "auxiliary_loss_mlp": 0.01035031, + "balance_loss_clip": 1.049088, + "balance_loss_mlp": 1.02067792, + "epoch": 0.3703592364346911, + "flos": 26213353309440.0, + "grad_norm": 2.9956624327703523, + "language_loss": 0.71067882, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.73213673, + "num_input_tokens_seen": 132412620, + "step": 6160, + "time_per_iteration": 2.757081985473633 + }, + { + "auxiliary_loss_clip": 0.01106906, + "auxiliary_loss_mlp": 0.01033059, + "balance_loss_clip": 1.04698467, + "balance_loss_mlp": 1.01918936, + "epoch": 0.37041935968735906, + "flos": 20048317470720.0, + "grad_norm": 2.0439504076987403, + "language_loss": 0.79205775, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.81345737, + "num_input_tokens_seen": 132431570, + "step": 6161, + "time_per_iteration": 2.8008711338043213 + }, + { + "auxiliary_loss_clip": 0.01136197, + "auxiliary_loss_mlp": 0.01038947, + "balance_loss_clip": 1.04960537, + "balance_loss_mlp": 1.02284193, + "epoch": 0.3704794829400271, + "flos": 24133802388480.0, + "grad_norm": 2.0425786778899058, + "language_loss": 0.79665029, + "learning_rate": 2.902267988534295e-06, + "loss": 0.81840169, + "num_input_tokens_seen": 132451525, + "step": 6162, + "time_per_iteration": 4.2554450035095215 + }, + { + "auxiliary_loss_clip": 0.01107039, + "auxiliary_loss_mlp": 0.00773743, + "balance_loss_clip": 1.0442729, + "balance_loss_mlp": 1.00038123, + "epoch": 0.37053960619269505, + "flos": 14866874732160.0, + "grad_norm": 2.0272159369395193, + "language_loss": 0.79314882, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.81195664, + "num_input_tokens_seen": 132469875, + "step": 6163, + "time_per_iteration": 2.753324508666992 + }, + { + "auxiliary_loss_clip": 0.0112147, + "auxiliary_loss_mlp": 0.01039825, + "balance_loss_clip": 1.04676855, + "balance_loss_mlp": 1.02351689, + "epoch": 0.370599729445363, + "flos": 21361498790400.0, + "grad_norm": 1.847799951808159, + "language_loss": 0.67843366, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.7000466, + "num_input_tokens_seen": 132488360, + "step": 6164, + "time_per_iteration": 2.7885541915893555 + }, + { + "auxiliary_loss_clip": 0.01109766, + "auxiliary_loss_mlp": 0.01045808, + "balance_loss_clip": 1.04918885, + "balance_loss_mlp": 1.02877307, + "epoch": 0.370659852698031, + "flos": 26829041356800.0, + "grad_norm": 2.0007288653084334, + "language_loss": 0.83441198, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.85596776, + "num_input_tokens_seen": 132508630, + "step": 6165, + "time_per_iteration": 4.3637871742248535 + }, + { + "auxiliary_loss_clip": 0.01115767, + "auxiliary_loss_mlp": 0.01037848, + "balance_loss_clip": 1.0473845, + "balance_loss_mlp": 1.02026439, + "epoch": 0.37071997595069894, + "flos": 19099018880640.0, + "grad_norm": 1.7502292049636352, + "language_loss": 0.69057518, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.71211129, + "num_input_tokens_seen": 132527465, + "step": 6166, + "time_per_iteration": 2.6754019260406494 + }, + { + "auxiliary_loss_clip": 0.01032616, + "auxiliary_loss_mlp": 0.01025464, + "balance_loss_clip": 1.03081024, + "balance_loss_mlp": 1.02362847, + "epoch": 0.3707800992033669, + "flos": 52178384920320.0, + "grad_norm": 0.8028866408552083, + "language_loss": 0.5688796, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.58946037, + "num_input_tokens_seen": 132579940, + "step": 6167, + "time_per_iteration": 6.357440233230591 + }, + { + "auxiliary_loss_clip": 0.01110244, + "auxiliary_loss_mlp": 0.01037896, + "balance_loss_clip": 1.04592001, + "balance_loss_mlp": 1.02284551, + "epoch": 0.3708402224560349, + "flos": 19901837808000.0, + "grad_norm": 2.0812394742982203, + "language_loss": 0.75159574, + "learning_rate": 2.900181908135584e-06, + "loss": 0.77307719, + "num_input_tokens_seen": 132598390, + "step": 6168, + "time_per_iteration": 2.7107198238372803 + }, + { + "auxiliary_loss_clip": 0.01117658, + "auxiliary_loss_mlp": 0.00773774, + "balance_loss_clip": 1.04381216, + "balance_loss_mlp": 1.00029826, + "epoch": 0.37090034570870284, + "flos": 20007630339840.0, + "grad_norm": 2.166706099657804, + "language_loss": 0.73690271, + "learning_rate": 2.899834108519755e-06, + "loss": 0.755817, + "num_input_tokens_seen": 132616920, + "step": 6169, + "time_per_iteration": 2.743741035461426 + }, + { + "auxiliary_loss_clip": 0.0113208, + "auxiliary_loss_mlp": 0.01038383, + "balance_loss_clip": 1.0476737, + "balance_loss_mlp": 1.02352989, + "epoch": 0.3709604689613708, + "flos": 24134700228480.0, + "grad_norm": 1.6724632615545945, + "language_loss": 0.79498589, + "learning_rate": 2.899486274782127e-06, + "loss": 0.81669056, + "num_input_tokens_seen": 132637660, + "step": 6170, + "time_per_iteration": 2.738492727279663 + }, + { + "auxiliary_loss_clip": 0.01122253, + "auxiliary_loss_mlp": 0.01045679, + "balance_loss_clip": 1.04780805, + "balance_loss_mlp": 1.02913237, + "epoch": 0.37102059221403877, + "flos": 23876071326720.0, + "grad_norm": 1.739457755704792, + "language_loss": 0.76506341, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.78674281, + "num_input_tokens_seen": 132657635, + "step": 6171, + "time_per_iteration": 2.6531472206115723 + }, + { + "auxiliary_loss_clip": 0.01112543, + "auxiliary_loss_mlp": 0.01041865, + "balance_loss_clip": 1.05081654, + "balance_loss_mlp": 1.02546144, + "epoch": 0.37108071546670673, + "flos": 14501268149760.0, + "grad_norm": 2.0084032146250608, + "language_loss": 0.80705774, + "learning_rate": 2.898790504994232e-06, + "loss": 0.82860184, + "num_input_tokens_seen": 132674455, + "step": 6172, + "time_per_iteration": 2.6587960720062256 + }, + { + "auxiliary_loss_clip": 0.01125694, + "auxiliary_loss_mlp": 0.01044257, + "balance_loss_clip": 1.0475564, + "balance_loss_mlp": 1.02747262, + "epoch": 0.3711408387193747, + "flos": 34562619279360.0, + "grad_norm": 2.410153405618026, + "language_loss": 0.59260982, + "learning_rate": 2.89844256897035e-06, + "loss": 0.61430931, + "num_input_tokens_seen": 132695140, + "step": 6173, + "time_per_iteration": 2.738430976867676 + }, + { + "auxiliary_loss_clip": 0.01110933, + "auxiliary_loss_mlp": 0.01044385, + "balance_loss_clip": 1.04549873, + "balance_loss_mlp": 1.02885222, + "epoch": 0.37120096197204266, + "flos": 17310703432320.0, + "grad_norm": 1.954423749693878, + "language_loss": 0.80869365, + "learning_rate": 2.898094598877435e-06, + "loss": 0.83024681, + "num_input_tokens_seen": 132712470, + "step": 6174, + "time_per_iteration": 2.7166690826416016 + }, + { + "auxiliary_loss_clip": 0.01129522, + "auxiliary_loss_mlp": 0.01045042, + "balance_loss_clip": 1.04628158, + "balance_loss_mlp": 1.03025961, + "epoch": 0.37126108522471063, + "flos": 30664049760000.0, + "grad_norm": 2.1592050046005, + "language_loss": 0.79910219, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.82084787, + "num_input_tokens_seen": 132732945, + "step": 6175, + "time_per_iteration": 2.6746280193328857 + }, + { + "auxiliary_loss_clip": 0.011267, + "auxiliary_loss_mlp": 0.01053826, + "balance_loss_clip": 1.05173898, + "balance_loss_mlp": 1.0380547, + "epoch": 0.37132120847737865, + "flos": 25155640494720.0, + "grad_norm": 2.2578092376668315, + "language_loss": 0.88735723, + "learning_rate": 2.89739855653729e-06, + "loss": 0.90916252, + "num_input_tokens_seen": 132752470, + "step": 6176, + "time_per_iteration": 2.6791093349456787 + }, + { + "auxiliary_loss_clip": 0.01124216, + "auxiliary_loss_mlp": 0.01042973, + "balance_loss_clip": 1.04811859, + "balance_loss_mlp": 1.02713037, + "epoch": 0.3713813317300466, + "flos": 21213474842880.0, + "grad_norm": 1.5716198978013565, + "language_loss": 0.73431349, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.75598538, + "num_input_tokens_seen": 132771485, + "step": 6177, + "time_per_iteration": 2.6808605194091797 + }, + { + "auxiliary_loss_clip": 0.01102086, + "auxiliary_loss_mlp": 0.01051929, + "balance_loss_clip": 1.04524541, + "balance_loss_mlp": 1.03575838, + "epoch": 0.3714414549827146, + "flos": 21616644072960.0, + "grad_norm": 2.0030850547718915, + "language_loss": 0.75349051, + "learning_rate": 2.896702378079374e-06, + "loss": 0.77503073, + "num_input_tokens_seen": 132791465, + "step": 6178, + "time_per_iteration": 2.7112066745758057 + }, + { + "auxiliary_loss_clip": 0.0107122, + "auxiliary_loss_mlp": 0.01050415, + "balance_loss_clip": 1.04323864, + "balance_loss_mlp": 1.03208089, + "epoch": 0.37150157823538255, + "flos": 19972294335360.0, + "grad_norm": 2.0305314414463136, + "language_loss": 0.72141892, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.74263525, + "num_input_tokens_seen": 132810160, + "step": 6179, + "time_per_iteration": 2.7965877056121826 + }, + { + "auxiliary_loss_clip": 0.01137504, + "auxiliary_loss_mlp": 0.01046799, + "balance_loss_clip": 1.05008841, + "balance_loss_mlp": 1.03018165, + "epoch": 0.3715617014880505, + "flos": 24860562266880.0, + "grad_norm": 2.387630814732786, + "language_loss": 0.6993162, + "learning_rate": 2.896006063609283e-06, + "loss": 0.72115916, + "num_input_tokens_seen": 132831265, + "step": 6180, + "time_per_iteration": 2.695232391357422 + }, + { + "auxiliary_loss_clip": 0.01113448, + "auxiliary_loss_mlp": 0.01037109, + "balance_loss_clip": 1.04914021, + "balance_loss_mlp": 1.02208257, + "epoch": 0.3716218247407185, + "flos": 20449080489600.0, + "grad_norm": 2.1080005695464243, + "language_loss": 0.77920252, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.80070812, + "num_input_tokens_seen": 132850005, + "step": 6181, + "time_per_iteration": 2.7087795734405518 + }, + { + "auxiliary_loss_clip": 0.01123157, + "auxiliary_loss_mlp": 0.01041815, + "balance_loss_clip": 1.05016994, + "balance_loss_mlp": 1.02525139, + "epoch": 0.37168194799338644, + "flos": 24133479166080.0, + "grad_norm": 2.570629027716188, + "language_loss": 0.79222846, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.81387818, + "num_input_tokens_seen": 132865790, + "step": 6182, + "time_per_iteration": 2.6541473865509033 + }, + { + "auxiliary_loss_clip": 0.01041849, + "auxiliary_loss_mlp": 0.01016945, + "balance_loss_clip": 1.03053021, + "balance_loss_mlp": 1.01533604, + "epoch": 0.3717420712460544, + "flos": 67408926900480.0, + "grad_norm": 0.7830434308203498, + "language_loss": 0.57445002, + "learning_rate": 2.894961337112362e-06, + "loss": 0.59503794, + "num_input_tokens_seen": 132921775, + "step": 6183, + "time_per_iteration": 3.191969633102417 + }, + { + "auxiliary_loss_clip": 0.01126783, + "auxiliary_loss_mlp": 0.00775242, + "balance_loss_clip": 1.04496169, + "balance_loss_mlp": 1.00043631, + "epoch": 0.37180219449872237, + "flos": 22376908362240.0, + "grad_norm": 1.9647478507461604, + "language_loss": 0.76617277, + "learning_rate": 2.894613027055066e-06, + "loss": 0.78519297, + "num_input_tokens_seen": 132941060, + "step": 6184, + "time_per_iteration": 2.7096588611602783 + }, + { + "auxiliary_loss_clip": 0.01090654, + "auxiliary_loss_mlp": 0.01039062, + "balance_loss_clip": 1.04084587, + "balance_loss_mlp": 1.02344596, + "epoch": 0.37186231775139034, + "flos": 21869885934720.0, + "grad_norm": 2.1021072738728717, + "language_loss": 0.7217713, + "learning_rate": 2.894264683073954e-06, + "loss": 0.74306846, + "num_input_tokens_seen": 132961850, + "step": 6185, + "time_per_iteration": 2.739130735397339 + }, + { + "auxiliary_loss_clip": 0.01081138, + "auxiliary_loss_mlp": 0.01034498, + "balance_loss_clip": 1.04156423, + "balance_loss_mlp": 1.01805878, + "epoch": 0.3719224410040583, + "flos": 22415225195520.0, + "grad_norm": 2.1871647895832496, + "language_loss": 0.76805776, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.78921413, + "num_input_tokens_seen": 132981625, + "step": 6186, + "time_per_iteration": 2.779510259628296 + }, + { + "auxiliary_loss_clip": 0.01131414, + "auxiliary_loss_mlp": 0.01042221, + "balance_loss_clip": 1.05090106, + "balance_loss_mlp": 1.02491212, + "epoch": 0.37198256425672627, + "flos": 25151223121920.0, + "grad_norm": 1.8929067887672733, + "language_loss": 0.84037393, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.86211032, + "num_input_tokens_seen": 133001225, + "step": 6187, + "time_per_iteration": 2.67541241645813 + }, + { + "auxiliary_loss_clip": 0.01120953, + "auxiliary_loss_mlp": 0.01040882, + "balance_loss_clip": 1.04474545, + "balance_loss_mlp": 1.02553999, + "epoch": 0.37204268750939423, + "flos": 21138313633920.0, + "grad_norm": 1.7194664317181616, + "language_loss": 0.84831274, + "learning_rate": 2.893219447719824e-06, + "loss": 0.86993104, + "num_input_tokens_seen": 133018820, + "step": 6188, + "time_per_iteration": 2.6241226196289062 + }, + { + "auxiliary_loss_clip": 0.01108827, + "auxiliary_loss_mlp": 0.01040814, + "balance_loss_clip": 1.04934168, + "balance_loss_mlp": 1.02501917, + "epoch": 0.37210281076206225, + "flos": 21506829217920.0, + "grad_norm": 2.498329305558477, + "language_loss": 0.65702367, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.67852014, + "num_input_tokens_seen": 133040205, + "step": 6189, + "time_per_iteration": 2.724707841873169 + }, + { + "auxiliary_loss_clip": 0.01112219, + "auxiliary_loss_mlp": 0.0104713, + "balance_loss_clip": 1.0451889, + "balance_loss_mlp": 1.03045225, + "epoch": 0.3721629340147302, + "flos": 17347835116800.0, + "grad_norm": 1.9571366893805608, + "language_loss": 0.84120989, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.86280334, + "num_input_tokens_seen": 133058095, + "step": 6190, + "time_per_iteration": 2.719454050064087 + }, + { + "auxiliary_loss_clip": 0.01109992, + "auxiliary_loss_mlp": 0.01041587, + "balance_loss_clip": 1.0465343, + "balance_loss_mlp": 1.02571416, + "epoch": 0.3722230572673982, + "flos": 16432400073600.0, + "grad_norm": 4.021000090429005, + "language_loss": 0.87807733, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.89959311, + "num_input_tokens_seen": 133071530, + "step": 6191, + "time_per_iteration": 2.7081027030944824 + }, + { + "auxiliary_loss_clip": 0.0108777, + "auxiliary_loss_mlp": 0.01037991, + "balance_loss_clip": 1.04300189, + "balance_loss_mlp": 1.01962125, + "epoch": 0.37228318052006615, + "flos": 22674716023680.0, + "grad_norm": 3.7199150853096508, + "language_loss": 0.74228656, + "learning_rate": 2.891825326449073e-06, + "loss": 0.7635442, + "num_input_tokens_seen": 133091410, + "step": 6192, + "time_per_iteration": 2.8161356449127197 + }, + { + "auxiliary_loss_clip": 0.01134777, + "auxiliary_loss_mlp": 0.0104013, + "balance_loss_clip": 1.04818201, + "balance_loss_mlp": 1.02497888, + "epoch": 0.3723433037727341, + "flos": 25265491263360.0, + "grad_norm": 2.31871347399746, + "language_loss": 0.80621845, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.82796752, + "num_input_tokens_seen": 133110365, + "step": 6193, + "time_per_iteration": 2.661550760269165 + }, + { + "auxiliary_loss_clip": 0.01101478, + "auxiliary_loss_mlp": 0.01041083, + "balance_loss_clip": 1.04354334, + "balance_loss_mlp": 1.02522826, + "epoch": 0.3724034270254021, + "flos": 10524664333440.0, + "grad_norm": 2.475173523724827, + "language_loss": 0.84729886, + "learning_rate": 2.891128062852194e-06, + "loss": 0.86872447, + "num_input_tokens_seen": 133128255, + "step": 6194, + "time_per_iteration": 2.711531400680542 + }, + { + "auxiliary_loss_clip": 0.0111161, + "auxiliary_loss_mlp": 0.010372, + "balance_loss_clip": 1.04650784, + "balance_loss_mlp": 1.02142286, + "epoch": 0.37246355027807004, + "flos": 20266223328000.0, + "grad_norm": 9.44838101604173, + "language_loss": 0.77016377, + "learning_rate": 2.890779380359646e-06, + "loss": 0.79165184, + "num_input_tokens_seen": 133143975, + "step": 6195, + "time_per_iteration": 2.6527512073516846 + }, + { + "auxiliary_loss_clip": 0.01112195, + "auxiliary_loss_mlp": 0.0103539, + "balance_loss_clip": 1.0468967, + "balance_loss_mlp": 1.02030444, + "epoch": 0.372523673530738, + "flos": 19500571998720.0, + "grad_norm": 1.7021548935758455, + "language_loss": 0.79216856, + "learning_rate": 2.890430664088655e-06, + "loss": 0.81364441, + "num_input_tokens_seen": 133162935, + "step": 6196, + "time_per_iteration": 2.6642892360687256 + }, + { + "auxiliary_loss_clip": 0.01124648, + "auxiliary_loss_mlp": 0.01038359, + "balance_loss_clip": 1.04975688, + "balance_loss_mlp": 1.0240953, + "epoch": 0.372583796783406, + "flos": 16764250849920.0, + "grad_norm": 2.570886031241156, + "language_loss": 0.83998835, + "learning_rate": 2.890081914052443e-06, + "loss": 0.8616184, + "num_input_tokens_seen": 133181180, + "step": 6197, + "time_per_iteration": 2.627305030822754 + }, + { + "auxiliary_loss_clip": 0.01131102, + "auxiliary_loss_mlp": 0.01040963, + "balance_loss_clip": 1.04697967, + "balance_loss_mlp": 1.02488184, + "epoch": 0.37264392003607394, + "flos": 22637979388800.0, + "grad_norm": 1.697216275583005, + "language_loss": 0.64450538, + "learning_rate": 2.889733130264237e-06, + "loss": 0.66622603, + "num_input_tokens_seen": 133199615, + "step": 6198, + "time_per_iteration": 2.606621503829956 + }, + { + "auxiliary_loss_clip": 0.01120059, + "auxiliary_loss_mlp": 0.01044451, + "balance_loss_clip": 1.04676938, + "balance_loss_mlp": 1.02959776, + "epoch": 0.3727040432887419, + "flos": 19973120348160.0, + "grad_norm": 1.4273324893736263, + "language_loss": 0.737185, + "learning_rate": 2.889384312737261e-06, + "loss": 0.75883007, + "num_input_tokens_seen": 133219650, + "step": 6199, + "time_per_iteration": 2.78157901763916 + }, + { + "auxiliary_loss_clip": 0.01105963, + "auxiliary_loss_mlp": 0.01037053, + "balance_loss_clip": 1.04564095, + "balance_loss_mlp": 1.02154374, + "epoch": 0.37276416654140987, + "flos": 63899122279680.0, + "grad_norm": 2.2948998309451905, + "language_loss": 0.80481982, + "learning_rate": 2.889035461484742e-06, + "loss": 0.82624996, + "num_input_tokens_seen": 133245675, + "step": 6200, + "time_per_iteration": 3.0623533725738525 + }, + { + "auxiliary_loss_clip": 0.0109608, + "auxiliary_loss_mlp": 0.01045798, + "balance_loss_clip": 1.04552174, + "balance_loss_mlp": 1.03016961, + "epoch": 0.37282428979407783, + "flos": 39785970211200.0, + "grad_norm": 2.0774746879263746, + "language_loss": 0.60494614, + "learning_rate": 2.88868657651991e-06, + "loss": 0.62636495, + "num_input_tokens_seen": 133266905, + "step": 6201, + "time_per_iteration": 2.8960700035095215 + }, + { + "auxiliary_loss_clip": 0.01125447, + "auxiliary_loss_mlp": 0.01039384, + "balance_loss_clip": 1.0489639, + "balance_loss_mlp": 1.02346373, + "epoch": 0.37288441304674586, + "flos": 22709046447360.0, + "grad_norm": 1.870117482164085, + "language_loss": 0.72692698, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.74857527, + "num_input_tokens_seen": 133286865, + "step": 6202, + "time_per_iteration": 4.202298402786255 + }, + { + "auxiliary_loss_clip": 0.01110741, + "auxiliary_loss_mlp": 0.01033326, + "balance_loss_clip": 1.04642594, + "balance_loss_mlp": 1.01800799, + "epoch": 0.3729445362994138, + "flos": 18770292587520.0, + "grad_norm": 2.0679450432005666, + "language_loss": 0.74148834, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.76292896, + "num_input_tokens_seen": 133305295, + "step": 6203, + "time_per_iteration": 2.7268033027648926 + }, + { + "auxiliary_loss_clip": 0.01106859, + "auxiliary_loss_mlp": 0.01038826, + "balance_loss_clip": 1.04595554, + "balance_loss_mlp": 1.02524805, + "epoch": 0.3730046595520818, + "flos": 22456199635200.0, + "grad_norm": 1.649450499506288, + "language_loss": 0.81921744, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.84067428, + "num_input_tokens_seen": 133324625, + "step": 6204, + "time_per_iteration": 4.347074747085571 + }, + { + "auxiliary_loss_clip": 0.01123916, + "auxiliary_loss_mlp": 0.01044159, + "balance_loss_clip": 1.04827762, + "balance_loss_mlp": 1.02794707, + "epoch": 0.37306478280474975, + "flos": 24316372241280.0, + "grad_norm": 1.675399556922802, + "language_loss": 0.74961317, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.77129394, + "num_input_tokens_seen": 133344625, + "step": 6205, + "time_per_iteration": 2.66701602935791 + }, + { + "auxiliary_loss_clip": 0.01117233, + "auxiliary_loss_mlp": 0.01045323, + "balance_loss_clip": 1.04337549, + "balance_loss_mlp": 1.02857447, + "epoch": 0.3731249060574177, + "flos": 15815167741440.0, + "grad_norm": 1.8318607259579, + "language_loss": 0.7815854, + "learning_rate": 2.886941646474128e-06, + "loss": 0.80321097, + "num_input_tokens_seen": 133363605, + "step": 6206, + "time_per_iteration": 4.202580451965332 + }, + { + "auxiliary_loss_clip": 0.01134488, + "auxiliary_loss_mlp": 0.01039926, + "balance_loss_clip": 1.04804325, + "balance_loss_mlp": 1.02317739, + "epoch": 0.3731850293100857, + "flos": 19828077229440.0, + "grad_norm": 2.3232535418166256, + "language_loss": 0.93322426, + "learning_rate": 2.886592559513283e-06, + "loss": 0.95496845, + "num_input_tokens_seen": 133379405, + "step": 6207, + "time_per_iteration": 4.318574666976929 + }, + { + "auxiliary_loss_clip": 0.01105421, + "auxiliary_loss_mlp": 0.0103386, + "balance_loss_clip": 1.0478878, + "balance_loss_mlp": 1.01876843, + "epoch": 0.37324515256275365, + "flos": 19062354072960.0, + "grad_norm": 3.0736568130228363, + "language_loss": 0.82651198, + "learning_rate": 2.886243438932759e-06, + "loss": 0.8479048, + "num_input_tokens_seen": 133397585, + "step": 6208, + "time_per_iteration": 2.749662160873413 + }, + { + "auxiliary_loss_clip": 0.01122225, + "auxiliary_loss_mlp": 0.0103968, + "balance_loss_clip": 1.04488516, + "balance_loss_mlp": 1.0223707, + "epoch": 0.3733052758154216, + "flos": 20704333512960.0, + "grad_norm": 2.0157740087962845, + "language_loss": 0.73122764, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.75284666, + "num_input_tokens_seen": 133415365, + "step": 6209, + "time_per_iteration": 2.6315791606903076 + }, + { + "auxiliary_loss_clip": 0.01095649, + "auxiliary_loss_mlp": 0.01037134, + "balance_loss_clip": 1.04820108, + "balance_loss_mlp": 1.02065969, + "epoch": 0.3733653990680896, + "flos": 20193504243840.0, + "grad_norm": 1.9650719997143145, + "language_loss": 0.70413053, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.72545838, + "num_input_tokens_seen": 133435700, + "step": 6210, + "time_per_iteration": 2.7484405040740967 + }, + { + "auxiliary_loss_clip": 0.01072484, + "auxiliary_loss_mlp": 0.01045611, + "balance_loss_clip": 1.03769457, + "balance_loss_mlp": 1.02674007, + "epoch": 0.37342552232075754, + "flos": 20339660684160.0, + "grad_norm": 2.0510282142916427, + "language_loss": 0.77773547, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.79891646, + "num_input_tokens_seen": 133455180, + "step": 6211, + "time_per_iteration": 2.706294536590576 + }, + { + "auxiliary_loss_clip": 0.01122999, + "auxiliary_loss_mlp": 0.01042393, + "balance_loss_clip": 1.04602683, + "balance_loss_mlp": 1.02645469, + "epoch": 0.3734856455734255, + "flos": 35517879527040.0, + "grad_norm": 1.675173432335243, + "language_loss": 0.73258781, + "learning_rate": 2.884846620678668e-06, + "loss": 0.7542417, + "num_input_tokens_seen": 133476715, + "step": 6212, + "time_per_iteration": 2.788787841796875 + }, + { + "auxiliary_loss_clip": 0.01131124, + "auxiliary_loss_mlp": 0.01047595, + "balance_loss_clip": 1.05055571, + "balance_loss_mlp": 1.03106034, + "epoch": 0.37354576882609347, + "flos": 21142300043520.0, + "grad_norm": 1.9808770110660865, + "language_loss": 0.81656909, + "learning_rate": 2.884497332198356e-06, + "loss": 0.83835626, + "num_input_tokens_seen": 133494550, + "step": 6213, + "time_per_iteration": 2.6829304695129395 + }, + { + "auxiliary_loss_clip": 0.01089374, + "auxiliary_loss_mlp": 0.01046172, + "balance_loss_clip": 1.0412662, + "balance_loss_mlp": 1.02843404, + "epoch": 0.37360589207876144, + "flos": 21506793304320.0, + "grad_norm": 2.223600899112558, + "language_loss": 0.78999674, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.81135225, + "num_input_tokens_seen": 133512640, + "step": 6214, + "time_per_iteration": 2.674373149871826 + }, + { + "auxiliary_loss_clip": 0.01109052, + "auxiliary_loss_mlp": 0.01044175, + "balance_loss_clip": 1.04420567, + "balance_loss_mlp": 1.02827835, + "epoch": 0.37366601533142946, + "flos": 38435800861440.0, + "grad_norm": 1.9266500277332215, + "language_loss": 0.84611148, + "learning_rate": 2.883798654630296e-06, + "loss": 0.86764371, + "num_input_tokens_seen": 133535540, + "step": 6215, + "time_per_iteration": 2.8276026248931885 + }, + { + "auxiliary_loss_clip": 0.01100197, + "auxiliary_loss_mlp": 0.01039814, + "balance_loss_clip": 1.04435837, + "balance_loss_mlp": 1.02298141, + "epoch": 0.3737261385840974, + "flos": 18441171244800.0, + "grad_norm": 1.8731663372997254, + "language_loss": 0.67690969, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.69830984, + "num_input_tokens_seen": 133555795, + "step": 6216, + "time_per_iteration": 2.724090576171875 + }, + { + "auxiliary_loss_clip": 0.01111654, + "auxiliary_loss_mlp": 0.01042601, + "balance_loss_clip": 1.045977, + "balance_loss_mlp": 1.02578092, + "epoch": 0.3737862618367654, + "flos": 22929861306240.0, + "grad_norm": 2.3172976096058853, + "language_loss": 0.65993899, + "learning_rate": 2.883099843007303e-06, + "loss": 0.68148154, + "num_input_tokens_seen": 133575905, + "step": 6217, + "time_per_iteration": 2.7126269340515137 + }, + { + "auxiliary_loss_clip": 0.01115905, + "auxiliary_loss_mlp": 0.01039702, + "balance_loss_clip": 1.0483315, + "balance_loss_mlp": 1.02264857, + "epoch": 0.37384638508943335, + "flos": 15409664127360.0, + "grad_norm": 2.0273109551694777, + "language_loss": 0.80449212, + "learning_rate": 2.88275038695833e-06, + "loss": 0.82604814, + "num_input_tokens_seen": 133592585, + "step": 6218, + "time_per_iteration": 2.680894374847412 + }, + { + "auxiliary_loss_clip": 0.01115539, + "auxiliary_loss_mlp": 0.0103289, + "balance_loss_clip": 1.04488862, + "balance_loss_mlp": 1.01760781, + "epoch": 0.3739065083421013, + "flos": 24280820755200.0, + "grad_norm": 1.5960804840892617, + "language_loss": 0.78692639, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.80841064, + "num_input_tokens_seen": 133615070, + "step": 6219, + "time_per_iteration": 2.6683976650238037 + }, + { + "auxiliary_loss_clip": 0.01107805, + "auxiliary_loss_mlp": 0.01040758, + "balance_loss_clip": 1.04602623, + "balance_loss_mlp": 1.0247364, + "epoch": 0.3739666315947693, + "flos": 23002831785600.0, + "grad_norm": 1.8103875982928064, + "language_loss": 0.77023458, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.79172027, + "num_input_tokens_seen": 133633490, + "step": 6220, + "time_per_iteration": 2.670686960220337 + }, + { + "auxiliary_loss_clip": 0.01105245, + "auxiliary_loss_mlp": 0.01041158, + "balance_loss_clip": 1.04717016, + "balance_loss_mlp": 1.02473164, + "epoch": 0.37402675484743725, + "flos": 19391116279680.0, + "grad_norm": 3.4153989861378204, + "language_loss": 0.8298834, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.85134745, + "num_input_tokens_seen": 133653425, + "step": 6221, + "time_per_iteration": 2.730738401412964 + }, + { + "auxiliary_loss_clip": 0.01108391, + "auxiliary_loss_mlp": 0.01043965, + "balance_loss_clip": 1.04499435, + "balance_loss_mlp": 1.02825367, + "epoch": 0.3740868781001052, + "flos": 17126158331520.0, + "grad_norm": 1.9668982067313725, + "language_loss": 0.75944567, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.78096926, + "num_input_tokens_seen": 133670220, + "step": 6222, + "time_per_iteration": 2.62052321434021 + }, + { + "auxiliary_loss_clip": 0.01103117, + "auxiliary_loss_mlp": 0.00772891, + "balance_loss_clip": 1.04785156, + "balance_loss_mlp": 1.00029564, + "epoch": 0.3741470013527732, + "flos": 20043505048320.0, + "grad_norm": 1.8881600065301847, + "language_loss": 0.70621789, + "learning_rate": 2.881002604868789e-06, + "loss": 0.72497797, + "num_input_tokens_seen": 133688910, + "step": 6223, + "time_per_iteration": 2.7686285972595215 + }, + { + "auxiliary_loss_clip": 0.01104752, + "auxiliary_loss_mlp": 0.01035203, + "balance_loss_clip": 1.05155015, + "balance_loss_mlp": 1.02057576, + "epoch": 0.37420712460544114, + "flos": 36897279569280.0, + "grad_norm": 2.1852519558340644, + "language_loss": 0.6875304, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.7089299, + "num_input_tokens_seen": 133708690, + "step": 6224, + "time_per_iteration": 2.817263126373291 + }, + { + "auxiliary_loss_clip": 0.01091747, + "auxiliary_loss_mlp": 0.01036393, + "balance_loss_clip": 1.04859614, + "balance_loss_mlp": 1.02059817, + "epoch": 0.3742672478581091, + "flos": 22201198007040.0, + "grad_norm": 2.246642459489035, + "language_loss": 0.70192593, + "learning_rate": 2.880303258086228e-06, + "loss": 0.72320735, + "num_input_tokens_seen": 133728095, + "step": 6225, + "time_per_iteration": 2.785083532333374 + }, + { + "auxiliary_loss_clip": 0.01088757, + "auxiliary_loss_mlp": 0.01048544, + "balance_loss_clip": 1.04366183, + "balance_loss_mlp": 1.03175974, + "epoch": 0.3743273711107771, + "flos": 24681547860480.0, + "grad_norm": 2.1768682992812236, + "language_loss": 0.7896018, + "learning_rate": 2.879953534616536e-06, + "loss": 0.81097472, + "num_input_tokens_seen": 133745590, + "step": 6226, + "time_per_iteration": 2.7403974533081055 + }, + { + "auxiliary_loss_clip": 0.01105293, + "auxiliary_loss_mlp": 0.01039029, + "balance_loss_clip": 1.04631484, + "balance_loss_mlp": 1.02303696, + "epoch": 0.37438749436344504, + "flos": 24459619680000.0, + "grad_norm": 1.7825799805329443, + "language_loss": 0.67965841, + "learning_rate": 2.879603777778917e-06, + "loss": 0.70110166, + "num_input_tokens_seen": 133766155, + "step": 6227, + "time_per_iteration": 2.6975693702697754 + }, + { + "auxiliary_loss_clip": 0.01099252, + "auxiliary_loss_mlp": 0.01034493, + "balance_loss_clip": 1.04493213, + "balance_loss_mlp": 1.01890039, + "epoch": 0.374447617616113, + "flos": 21798747048960.0, + "grad_norm": 1.9005486801766094, + "language_loss": 0.829476, + "learning_rate": 2.879253987586635e-06, + "loss": 0.85081351, + "num_input_tokens_seen": 133783185, + "step": 6228, + "time_per_iteration": 2.7754271030426025 + }, + { + "auxiliary_loss_clip": 0.01090082, + "auxiliary_loss_mlp": 0.01048677, + "balance_loss_clip": 1.04396605, + "balance_loss_mlp": 1.03159404, + "epoch": 0.374507740868781, + "flos": 17968191932160.0, + "grad_norm": 1.6406992237121778, + "language_loss": 0.74450547, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.76589304, + "num_input_tokens_seen": 133800975, + "step": 6229, + "time_per_iteration": 2.6378824710845947 + }, + { + "auxiliary_loss_clip": 0.0109707, + "auxiliary_loss_mlp": 0.01035996, + "balance_loss_clip": 1.0470053, + "balance_loss_mlp": 1.01971197, + "epoch": 0.374567864121449, + "flos": 16105828596480.0, + "grad_norm": 2.127994694324029, + "language_loss": 0.83782691, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.85915756, + "num_input_tokens_seen": 133818020, + "step": 6230, + "time_per_iteration": 2.6857657432556152 + }, + { + "auxiliary_loss_clip": 0.0112393, + "auxiliary_loss_mlp": 0.01041627, + "balance_loss_clip": 1.04905128, + "balance_loss_mlp": 1.02556968, + "epoch": 0.37462798737411696, + "flos": 25773160135680.0, + "grad_norm": 2.8382818326589145, + "language_loss": 0.735865, + "learning_rate": 2.878204417014456e-06, + "loss": 0.75752056, + "num_input_tokens_seen": 133840690, + "step": 6231, + "time_per_iteration": 2.7082016468048096 + }, + { + "auxiliary_loss_clip": 0.0112579, + "auxiliary_loss_mlp": 0.01046917, + "balance_loss_clip": 1.05376148, + "balance_loss_mlp": 1.03075266, + "epoch": 0.3746881106267849, + "flos": 16654507822080.0, + "grad_norm": 2.9683381932525665, + "language_loss": 0.7412858, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.76301289, + "num_input_tokens_seen": 133858350, + "step": 6232, + "time_per_iteration": 2.5764057636260986 + }, + { + "auxiliary_loss_clip": 0.01106131, + "auxiliary_loss_mlp": 0.01039245, + "balance_loss_clip": 1.04461622, + "balance_loss_mlp": 1.02237701, + "epoch": 0.3747482338794529, + "flos": 26177981391360.0, + "grad_norm": 2.121427790242168, + "language_loss": 0.77296579, + "learning_rate": 2.877504536769561e-06, + "loss": 0.79441959, + "num_input_tokens_seen": 133879775, + "step": 6233, + "time_per_iteration": 2.692286252975464 + }, + { + "auxiliary_loss_clip": 0.01118513, + "auxiliary_loss_mlp": 0.01040639, + "balance_loss_clip": 1.05093503, + "balance_loss_mlp": 1.024593, + "epoch": 0.37480835713212085, + "flos": 12021061950720.0, + "grad_norm": 1.8446337373318833, + "language_loss": 0.69493848, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.71652997, + "num_input_tokens_seen": 133898295, + "step": 6234, + "time_per_iteration": 2.658332586288452 + }, + { + "auxiliary_loss_clip": 0.01123531, + "auxiliary_loss_mlp": 0.01042963, + "balance_loss_clip": 1.04885483, + "balance_loss_mlp": 1.02833033, + "epoch": 0.3748684803847888, + "flos": 19679263182720.0, + "grad_norm": 1.9015387878630694, + "language_loss": 0.82462788, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.84629285, + "num_input_tokens_seen": 133915230, + "step": 6235, + "time_per_iteration": 2.591198682785034 + }, + { + "auxiliary_loss_clip": 0.01140927, + "auxiliary_loss_mlp": 0.0103602, + "balance_loss_clip": 1.05301189, + "balance_loss_mlp": 1.02021289, + "epoch": 0.3749286036374568, + "flos": 20521189042560.0, + "grad_norm": 1.8869628373328378, + "language_loss": 0.78439927, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.80616879, + "num_input_tokens_seen": 133934110, + "step": 6236, + "time_per_iteration": 2.6754372119903564 + }, + { + "auxiliary_loss_clip": 0.01118225, + "auxiliary_loss_mlp": 0.01050242, + "balance_loss_clip": 1.04519606, + "balance_loss_mlp": 1.03202713, + "epoch": 0.37498872689012475, + "flos": 20704620821760.0, + "grad_norm": 2.0770406770017242, + "language_loss": 0.74357057, + "learning_rate": 2.876104377085234e-06, + "loss": 0.76525521, + "num_input_tokens_seen": 133952395, + "step": 6237, + "time_per_iteration": 2.6760342121124268 + }, + { + "auxiliary_loss_clip": 0.01114513, + "auxiliary_loss_mlp": 0.00773766, + "balance_loss_clip": 1.04626942, + "balance_loss_mlp": 1.00036037, + "epoch": 0.3750488501427927, + "flos": 21574843620480.0, + "grad_norm": 2.0699756536584633, + "language_loss": 0.93258965, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.95147252, + "num_input_tokens_seen": 133969635, + "step": 6238, + "time_per_iteration": 2.6805243492126465 + }, + { + "auxiliary_loss_clip": 0.01137619, + "auxiliary_loss_mlp": 0.01037341, + "balance_loss_clip": 1.04995167, + "balance_loss_mlp": 1.02081275, + "epoch": 0.3751089733954607, + "flos": 15923869274880.0, + "grad_norm": 2.3841921025147284, + "language_loss": 0.70885909, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.73060858, + "num_input_tokens_seen": 133987215, + "step": 6239, + "time_per_iteration": 2.548285961151123 + }, + { + "auxiliary_loss_clip": 0.01068531, + "auxiliary_loss_mlp": 0.01040031, + "balance_loss_clip": 1.04656243, + "balance_loss_mlp": 1.02303219, + "epoch": 0.37516909664812864, + "flos": 36284644177920.0, + "grad_norm": 1.601808094344726, + "language_loss": 0.65752542, + "learning_rate": 2.875053908444895e-06, + "loss": 0.67861104, + "num_input_tokens_seen": 134009250, + "step": 6240, + "time_per_iteration": 3.016897201538086 + }, + { + "auxiliary_loss_clip": 0.01101858, + "auxiliary_loss_mlp": 0.00773445, + "balance_loss_clip": 1.04618907, + "balance_loss_mlp": 1.00033951, + "epoch": 0.3752292199007966, + "flos": 13515915283200.0, + "grad_norm": 2.721418670308367, + "language_loss": 0.75816065, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.7769137, + "num_input_tokens_seen": 134026875, + "step": 6241, + "time_per_iteration": 4.402552843093872 + }, + { + "auxiliary_loss_clip": 0.01103844, + "auxiliary_loss_mlp": 0.01044119, + "balance_loss_clip": 1.04654765, + "balance_loss_mlp": 1.0276264, + "epoch": 0.3752893431534646, + "flos": 27198095644800.0, + "grad_norm": 2.108703330368865, + "language_loss": 0.83791685, + "learning_rate": 2.874353430085213e-06, + "loss": 0.85939646, + "num_input_tokens_seen": 134047185, + "step": 6242, + "time_per_iteration": 2.7508704662323 + }, + { + "auxiliary_loss_clip": 0.01110348, + "auxiliary_loss_mlp": 0.01048171, + "balance_loss_clip": 1.04799628, + "balance_loss_mlp": 1.03319848, + "epoch": 0.3753494664061326, + "flos": 30007674581760.0, + "grad_norm": 2.4924519814208774, + "language_loss": 0.68438506, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.70597029, + "num_input_tokens_seen": 134067330, + "step": 6243, + "time_per_iteration": 2.7814478874206543 + }, + { + "auxiliary_loss_clip": 0.01056696, + "auxiliary_loss_mlp": 0.00776554, + "balance_loss_clip": 1.04175019, + "balance_loss_mlp": 1.00038528, + "epoch": 0.37540958965880056, + "flos": 24461954064000.0, + "grad_norm": 1.7699519682943652, + "language_loss": 0.84165168, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.85998416, + "num_input_tokens_seen": 134085525, + "step": 6244, + "time_per_iteration": 4.510041952133179 + }, + { + "auxiliary_loss_clip": 0.01074238, + "auxiliary_loss_mlp": 0.01042872, + "balance_loss_clip": 1.03981614, + "balance_loss_mlp": 1.02712417, + "epoch": 0.3754697129114685, + "flos": 16508387295360.0, + "grad_norm": 2.7453088605805616, + "language_loss": 0.82679987, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.84797096, + "num_input_tokens_seen": 134101855, + "step": 6245, + "time_per_iteration": 4.745215654373169 + }, + { + "auxiliary_loss_clip": 0.01096909, + "auxiliary_loss_mlp": 0.0104658, + "balance_loss_clip": 1.04049206, + "balance_loss_mlp": 1.0296756, + "epoch": 0.3755298361641365, + "flos": 19390900798080.0, + "grad_norm": 8.46557879021872, + "language_loss": 0.63902843, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.66046333, + "num_input_tokens_seen": 134119360, + "step": 6246, + "time_per_iteration": 4.33053731918335 + }, + { + "auxiliary_loss_clip": 0.01112093, + "auxiliary_loss_mlp": 0.0104355, + "balance_loss_clip": 1.04961443, + "balance_loss_mlp": 1.0264082, + "epoch": 0.37558995941680445, + "flos": 14720395069440.0, + "grad_norm": 2.0508038288587183, + "language_loss": 0.74467009, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.76622653, + "num_input_tokens_seen": 134137475, + "step": 6247, + "time_per_iteration": 2.688081979751587 + }, + { + "auxiliary_loss_clip": 0.01126872, + "auxiliary_loss_mlp": 0.01037368, + "balance_loss_clip": 1.05022037, + "balance_loss_mlp": 1.02133489, + "epoch": 0.3756500826694724, + "flos": 21689901861120.0, + "grad_norm": 2.703785960910372, + "language_loss": 0.5497098, + "learning_rate": 2.872251199697598e-06, + "loss": 0.57135224, + "num_input_tokens_seen": 134154580, + "step": 6248, + "time_per_iteration": 2.6308822631835938 + }, + { + "auxiliary_loss_clip": 0.01117073, + "auxiliary_loss_mlp": 0.01036379, + "balance_loss_clip": 1.04465234, + "balance_loss_mlp": 1.0200597, + "epoch": 0.3757102059221404, + "flos": 26505666190080.0, + "grad_norm": 4.209721572066423, + "language_loss": 0.84492457, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.86645913, + "num_input_tokens_seen": 134174285, + "step": 6249, + "time_per_iteration": 2.6539809703826904 + }, + { + "auxiliary_loss_clip": 0.01107733, + "auxiliary_loss_mlp": 0.01035495, + "balance_loss_clip": 1.04784632, + "balance_loss_mlp": 1.01956248, + "epoch": 0.37577032917480835, + "flos": 37338083274240.0, + "grad_norm": 1.546160982958922, + "language_loss": 0.67701882, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.69845104, + "num_input_tokens_seen": 134195940, + "step": 6250, + "time_per_iteration": 2.787398338317871 + }, + { + "auxiliary_loss_clip": 0.01117019, + "auxiliary_loss_mlp": 0.01044359, + "balance_loss_clip": 1.04946029, + "balance_loss_mlp": 1.0293386, + "epoch": 0.3758304524274763, + "flos": 21908597817600.0, + "grad_norm": 1.960309683567346, + "language_loss": 0.77824795, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.79986179, + "num_input_tokens_seen": 134212235, + "step": 6251, + "time_per_iteration": 2.7143123149871826 + }, + { + "auxiliary_loss_clip": 0.01121024, + "auxiliary_loss_mlp": 0.01039102, + "balance_loss_clip": 1.04994178, + "balance_loss_mlp": 1.0236522, + "epoch": 0.3758905756801443, + "flos": 36569343375360.0, + "grad_norm": 2.527016245081176, + "language_loss": 0.58002663, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.60162789, + "num_input_tokens_seen": 134233810, + "step": 6252, + "time_per_iteration": 2.716597557067871 + }, + { + "auxiliary_loss_clip": 0.01116459, + "auxiliary_loss_mlp": 0.01042556, + "balance_loss_clip": 1.05007291, + "balance_loss_mlp": 1.0260098, + "epoch": 0.37595069893281224, + "flos": 24528783317760.0, + "grad_norm": 4.856643583290163, + "language_loss": 0.89482141, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.91641152, + "num_input_tokens_seen": 134252020, + "step": 6253, + "time_per_iteration": 2.701361894607544 + }, + { + "auxiliary_loss_clip": 0.01098154, + "auxiliary_loss_mlp": 0.01040398, + "balance_loss_clip": 1.04815936, + "balance_loss_mlp": 1.02562761, + "epoch": 0.3760108221854802, + "flos": 16435021766400.0, + "grad_norm": 2.218099502464204, + "language_loss": 0.76568806, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.78707361, + "num_input_tokens_seen": 134269495, + "step": 6254, + "time_per_iteration": 2.6995303630828857 + }, + { + "auxiliary_loss_clip": 0.01096995, + "auxiliary_loss_mlp": 0.01043484, + "balance_loss_clip": 1.04379475, + "balance_loss_mlp": 1.02628207, + "epoch": 0.37607094543814823, + "flos": 13771742924160.0, + "grad_norm": 2.131769376763656, + "language_loss": 0.6180023, + "learning_rate": 2.869797092829169e-06, + "loss": 0.6394071, + "num_input_tokens_seen": 134287035, + "step": 6255, + "time_per_iteration": 2.7164864540100098 + }, + { + "auxiliary_loss_clip": 0.01127282, + "auxiliary_loss_mlp": 0.01036673, + "balance_loss_clip": 1.04883361, + "balance_loss_mlp": 1.02017426, + "epoch": 0.3761310686908162, + "flos": 19857918453120.0, + "grad_norm": 2.6629341180561545, + "language_loss": 0.74404681, + "learning_rate": 2.869446374096135e-06, + "loss": 0.76568639, + "num_input_tokens_seen": 134304840, + "step": 6256, + "time_per_iteration": 2.588169574737549 + }, + { + "auxiliary_loss_clip": 0.01127124, + "auxiliary_loss_mlp": 0.01046358, + "balance_loss_clip": 1.04913831, + "balance_loss_mlp": 1.02977645, + "epoch": 0.37619119194348416, + "flos": 12750802657920.0, + "grad_norm": 2.3087979716808937, + "language_loss": 0.702447, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.72418177, + "num_input_tokens_seen": 134323180, + "step": 6257, + "time_per_iteration": 2.701555013656616 + }, + { + "auxiliary_loss_clip": 0.01110787, + "auxiliary_loss_mlp": 0.01033343, + "balance_loss_clip": 1.04812109, + "balance_loss_mlp": 1.01796508, + "epoch": 0.3762513151961521, + "flos": 17530548624000.0, + "grad_norm": 1.673769537318751, + "language_loss": 0.84842372, + "learning_rate": 2.868744837734889e-06, + "loss": 0.86986494, + "num_input_tokens_seen": 134341390, + "step": 6258, + "time_per_iteration": 2.6336703300476074 + }, + { + "auxiliary_loss_clip": 0.01091689, + "auxiliary_loss_mlp": 0.01041654, + "balance_loss_clip": 1.04571128, + "balance_loss_mlp": 1.0271697, + "epoch": 0.3763114384488201, + "flos": 23617406511360.0, + "grad_norm": 1.4940028515654036, + "language_loss": 0.80920124, + "learning_rate": 2.868394020133277e-06, + "loss": 0.83053464, + "num_input_tokens_seen": 134360425, + "step": 6259, + "time_per_iteration": 2.752392053604126 + }, + { + "auxiliary_loss_clip": 0.01093234, + "auxiliary_loss_mlp": 0.01046443, + "balance_loss_clip": 1.04547083, + "balance_loss_mlp": 1.02969444, + "epoch": 0.37637156170148806, + "flos": 25406978935680.0, + "grad_norm": 2.4951694968605627, + "language_loss": 0.71285564, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.73425239, + "num_input_tokens_seen": 134379775, + "step": 6260, + "time_per_iteration": 2.782561779022217 + }, + { + "auxiliary_loss_clip": 0.01107136, + "auxiliary_loss_mlp": 0.01039319, + "balance_loss_clip": 1.04386747, + "balance_loss_mlp": 1.02305889, + "epoch": 0.376431684954156, + "flos": 23440906056960.0, + "grad_norm": 1.627422352949978, + "language_loss": 0.78342533, + "learning_rate": 2.867692286154594e-06, + "loss": 0.80488986, + "num_input_tokens_seen": 134400315, + "step": 6261, + "time_per_iteration": 2.6978867053985596 + }, + { + "auxiliary_loss_clip": 0.01112259, + "auxiliary_loss_mlp": 0.01048861, + "balance_loss_clip": 1.04744315, + "balance_loss_mlp": 1.0312773, + "epoch": 0.376491808206824, + "flos": 34204482725760.0, + "grad_norm": 2.418447947297228, + "language_loss": 0.80871278, + "learning_rate": 2.867341369804132e-06, + "loss": 0.83032399, + "num_input_tokens_seen": 134422875, + "step": 6262, + "time_per_iteration": 2.852675437927246 + }, + { + "auxiliary_loss_clip": 0.01115101, + "auxiliary_loss_mlp": 0.01038136, + "balance_loss_clip": 1.04584765, + "balance_loss_mlp": 1.02277565, + "epoch": 0.37655193145949195, + "flos": 35185669614720.0, + "grad_norm": 2.9875520790285774, + "language_loss": 0.80295742, + "learning_rate": 2.866990420563998e-06, + "loss": 0.82448983, + "num_input_tokens_seen": 134443025, + "step": 6263, + "time_per_iteration": 2.785395622253418 + }, + { + "auxiliary_loss_clip": 0.01140252, + "auxiliary_loss_mlp": 0.01045838, + "balance_loss_clip": 1.05247605, + "balance_loss_mlp": 1.0300312, + "epoch": 0.3766120547121599, + "flos": 16761844638720.0, + "grad_norm": 2.896352989954936, + "language_loss": 0.79601765, + "learning_rate": 2.866639438447501e-06, + "loss": 0.81787854, + "num_input_tokens_seen": 134460945, + "step": 6264, + "time_per_iteration": 2.581125497817993 + }, + { + "auxiliary_loss_clip": 0.01133548, + "auxiliary_loss_mlp": 0.0105155, + "balance_loss_clip": 1.04770851, + "balance_loss_mlp": 1.03557551, + "epoch": 0.3766721779648279, + "flos": 23550361776000.0, + "grad_norm": 2.0921625870578913, + "language_loss": 0.73808366, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.75993466, + "num_input_tokens_seen": 134480440, + "step": 6265, + "time_per_iteration": 2.6998226642608643 + }, + { + "auxiliary_loss_clip": 0.01123221, + "auxiliary_loss_mlp": 0.0103937, + "balance_loss_clip": 1.05005145, + "balance_loss_mlp": 1.02543402, + "epoch": 0.37673230121749585, + "flos": 29129191655040.0, + "grad_norm": 1.9744000825782282, + "language_loss": 0.68550873, + "learning_rate": 2.865937375638654e-06, + "loss": 0.70713472, + "num_input_tokens_seen": 134501110, + "step": 6266, + "time_per_iteration": 2.6934731006622314 + }, + { + "auxiliary_loss_clip": 0.01128105, + "auxiliary_loss_mlp": 0.01041187, + "balance_loss_clip": 1.04846668, + "balance_loss_mlp": 1.02536833, + "epoch": 0.3767924244701638, + "flos": 28146783703680.0, + "grad_norm": 3.437883319374573, + "language_loss": 0.63078731, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.65248024, + "num_input_tokens_seen": 134522460, + "step": 6267, + "time_per_iteration": 2.7006735801696777 + }, + { + "auxiliary_loss_clip": 0.01050407, + "auxiliary_loss_mlp": 0.01011452, + "balance_loss_clip": 1.02822745, + "balance_loss_mlp": 1.00960469, + "epoch": 0.37685254772283183, + "flos": 60797197526400.0, + "grad_norm": 0.7198108741876666, + "language_loss": 0.58852816, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.60914677, + "num_input_tokens_seen": 134589545, + "step": 6268, + "time_per_iteration": 3.355120897293091 + }, + { + "auxiliary_loss_clip": 0.011375, + "auxiliary_loss_mlp": 0.01043603, + "balance_loss_clip": 1.05033755, + "balance_loss_mlp": 1.02698505, + "epoch": 0.3769126709754998, + "flos": 26032543223040.0, + "grad_norm": 2.34128493463531, + "language_loss": 0.65263468, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.67444575, + "num_input_tokens_seen": 134610550, + "step": 6269, + "time_per_iteration": 2.656585931777954 + }, + { + "auxiliary_loss_clip": 0.01099912, + "auxiliary_loss_mlp": 0.01041008, + "balance_loss_clip": 1.04970932, + "balance_loss_mlp": 1.02536798, + "epoch": 0.37697279422816776, + "flos": 23579879777280.0, + "grad_norm": 1.5250715006737088, + "language_loss": 0.7069717, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.72838092, + "num_input_tokens_seen": 134630485, + "step": 6270, + "time_per_iteration": 2.7498419284820557 + }, + { + "auxiliary_loss_clip": 0.01059818, + "auxiliary_loss_mlp": 0.01007405, + "balance_loss_clip": 1.02900875, + "balance_loss_mlp": 1.00581956, + "epoch": 0.3770329174808357, + "flos": 64745935367040.0, + "grad_norm": 0.7193704591933474, + "language_loss": 0.56122422, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.58189648, + "num_input_tokens_seen": 134693510, + "step": 6271, + "time_per_iteration": 3.1569089889526367 + }, + { + "auxiliary_loss_clip": 0.01121208, + "auxiliary_loss_mlp": 0.01042721, + "balance_loss_clip": 1.04645681, + "balance_loss_mlp": 1.02609181, + "epoch": 0.3770930407335037, + "flos": 21835304115840.0, + "grad_norm": 2.1611051517344246, + "language_loss": 0.79855239, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.82019162, + "num_input_tokens_seen": 134713115, + "step": 6272, + "time_per_iteration": 2.628180742263794 + }, + { + "auxiliary_loss_clip": 0.01118748, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.0451988, + "balance_loss_mlp": 1.01934206, + "epoch": 0.37715316398617166, + "flos": 22747901984640.0, + "grad_norm": 2.0954681641544304, + "language_loss": 0.73789483, + "learning_rate": 2.863479122159103e-06, + "loss": 0.75941932, + "num_input_tokens_seen": 134732635, + "step": 6273, + "time_per_iteration": 2.7064390182495117 + }, + { + "auxiliary_loss_clip": 0.01117899, + "auxiliary_loss_mlp": 0.01044408, + "balance_loss_clip": 1.04745209, + "balance_loss_mlp": 1.02905381, + "epoch": 0.3772132872388396, + "flos": 18914581520640.0, + "grad_norm": 1.6440580648783938, + "language_loss": 0.71867502, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.74029803, + "num_input_tokens_seen": 134750695, + "step": 6274, + "time_per_iteration": 2.650559186935425 + }, + { + "auxiliary_loss_clip": 0.01105418, + "auxiliary_loss_mlp": 0.01040714, + "balance_loss_clip": 1.04509926, + "balance_loss_mlp": 1.02567029, + "epoch": 0.3772734104915076, + "flos": 17346219004800.0, + "grad_norm": 1.9251108643001593, + "language_loss": 0.83620244, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.85766381, + "num_input_tokens_seen": 134768935, + "step": 6275, + "time_per_iteration": 2.662346839904785 + }, + { + "auxiliary_loss_clip": 0.01077547, + "auxiliary_loss_mlp": 0.01035941, + "balance_loss_clip": 1.04383206, + "balance_loss_mlp": 1.02238655, + "epoch": 0.37733353374417555, + "flos": 32342370785280.0, + "grad_norm": 1.4850375213112275, + "language_loss": 0.75779188, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.77892679, + "num_input_tokens_seen": 134791260, + "step": 6276, + "time_per_iteration": 2.824374198913574 + }, + { + "auxiliary_loss_clip": 0.01109985, + "auxiliary_loss_mlp": 0.01039728, + "balance_loss_clip": 1.04301822, + "balance_loss_mlp": 1.02318192, + "epoch": 0.3773936569968435, + "flos": 23360681030400.0, + "grad_norm": 1.996464283971086, + "language_loss": 0.85758084, + "learning_rate": 2.862073685241366e-06, + "loss": 0.87907803, + "num_input_tokens_seen": 134808350, + "step": 6277, + "time_per_iteration": 2.6880812644958496 + }, + { + "auxiliary_loss_clip": 0.01123239, + "auxiliary_loss_mlp": 0.01035838, + "balance_loss_clip": 1.04981339, + "balance_loss_mlp": 1.02147365, + "epoch": 0.3774537802495115, + "flos": 21466788531840.0, + "grad_norm": 2.8692620956149613, + "language_loss": 0.78788501, + "learning_rate": 2.861722244253818e-06, + "loss": 0.80947578, + "num_input_tokens_seen": 134826005, + "step": 6278, + "time_per_iteration": 2.6566152572631836 + }, + { + "auxiliary_loss_clip": 0.01104603, + "auxiliary_loss_mlp": 0.01044359, + "balance_loss_clip": 1.04592609, + "balance_loss_mlp": 1.02740717, + "epoch": 0.37751390350217945, + "flos": 24973717086720.0, + "grad_norm": 2.420687530183356, + "language_loss": 0.8289634, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.85045302, + "num_input_tokens_seen": 134844995, + "step": 6279, + "time_per_iteration": 2.732966899871826 + }, + { + "auxiliary_loss_clip": 0.01110227, + "auxiliary_loss_mlp": 0.01039275, + "balance_loss_clip": 1.04498839, + "balance_loss_mlp": 1.02520263, + "epoch": 0.3775740267548474, + "flos": 27819098904960.0, + "grad_norm": 5.36242068768128, + "language_loss": 0.74968797, + "learning_rate": 2.861019264262269e-06, + "loss": 0.77118295, + "num_input_tokens_seen": 134865285, + "step": 6280, + "time_per_iteration": 4.266780376434326 + }, + { + "auxiliary_loss_clip": 0.01130932, + "auxiliary_loss_mlp": 0.01036032, + "balance_loss_clip": 1.04845715, + "balance_loss_mlp": 1.02235854, + "epoch": 0.3776341500075154, + "flos": 22565224391040.0, + "grad_norm": 1.4530407212668277, + "language_loss": 0.76169163, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.7833612, + "num_input_tokens_seen": 134886535, + "step": 6281, + "time_per_iteration": 2.649930477142334 + }, + { + "auxiliary_loss_clip": 0.01101629, + "auxiliary_loss_mlp": 0.01040327, + "balance_loss_clip": 1.04291892, + "balance_loss_mlp": 1.02471018, + "epoch": 0.3776942732601834, + "flos": 23077238808960.0, + "grad_norm": 2.430303484367767, + "language_loss": 0.83814883, + "learning_rate": 2.860316153670974e-06, + "loss": 0.85956836, + "num_input_tokens_seen": 134907435, + "step": 6282, + "time_per_iteration": 2.6882312297821045 + }, + { + "auxiliary_loss_clip": 0.0111945, + "auxiliary_loss_mlp": 0.0103679, + "balance_loss_clip": 1.04452085, + "balance_loss_mlp": 1.02134025, + "epoch": 0.37775439651285136, + "flos": 21724411852800.0, + "grad_norm": 2.5787880774083725, + "language_loss": 0.698241, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.71980345, + "num_input_tokens_seen": 134925360, + "step": 6283, + "time_per_iteration": 4.2020978927612305 + }, + { + "auxiliary_loss_clip": 0.01072442, + "auxiliary_loss_mlp": 0.01052062, + "balance_loss_clip": 1.04226279, + "balance_loss_mlp": 1.03394175, + "epoch": 0.37781451976551933, + "flos": 23987753688960.0, + "grad_norm": 2.007181392308561, + "language_loss": 0.76503819, + "learning_rate": 2.859612912586581e-06, + "loss": 0.78628325, + "num_input_tokens_seen": 134944205, + "step": 6284, + "time_per_iteration": 4.349794387817383 + }, + { + "auxiliary_loss_clip": 0.01142581, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.05249381, + "balance_loss_mlp": 1.01713097, + "epoch": 0.3778746430181873, + "flos": 13727967223680.0, + "grad_norm": 2.7318562260547554, + "language_loss": 0.85677552, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.87853491, + "num_input_tokens_seen": 134960255, + "step": 6285, + "time_per_iteration": 2.6949870586395264 + }, + { + "auxiliary_loss_clip": 0.01111269, + "auxiliary_loss_mlp": 0.01042933, + "balance_loss_clip": 1.04731882, + "balance_loss_mlp": 1.02694702, + "epoch": 0.37793476627085526, + "flos": 19460495399040.0, + "grad_norm": 1.8544385642750592, + "language_loss": 0.84419537, + "learning_rate": 2.858909541115758e-06, + "loss": 0.86573738, + "num_input_tokens_seen": 134978605, + "step": 6286, + "time_per_iteration": 4.541024684906006 + }, + { + "auxiliary_loss_clip": 0.01120151, + "auxiliary_loss_mlp": 0.01043503, + "balance_loss_clip": 1.05024576, + "balance_loss_mlp": 1.0280652, + "epoch": 0.3779948895235232, + "flos": 10707018704640.0, + "grad_norm": 2.400905995704231, + "language_loss": 0.81738019, + "learning_rate": 2.858557806518775e-06, + "loss": 0.83901674, + "num_input_tokens_seen": 134995020, + "step": 6287, + "time_per_iteration": 2.6611125469207764 + }, + { + "auxiliary_loss_clip": 0.01118978, + "auxiliary_loss_mlp": 0.01041796, + "balance_loss_clip": 1.04537022, + "balance_loss_mlp": 1.02645934, + "epoch": 0.3780550127761912, + "flos": 22310007281280.0, + "grad_norm": 3.0671932020533133, + "language_loss": 0.73071134, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.7523191, + "num_input_tokens_seen": 135012620, + "step": 6288, + "time_per_iteration": 2.6759073734283447 + }, + { + "auxiliary_loss_clip": 0.01124666, + "auxiliary_loss_mlp": 0.01036773, + "balance_loss_clip": 1.05113983, + "balance_loss_mlp": 1.02115071, + "epoch": 0.37811513602885916, + "flos": 28950644125440.0, + "grad_norm": 1.9644960153972613, + "language_loss": 0.75616127, + "learning_rate": 2.857854239668352e-06, + "loss": 0.77777576, + "num_input_tokens_seen": 135033365, + "step": 6289, + "time_per_iteration": 2.656367778778076 + }, + { + "auxiliary_loss_clip": 0.0112159, + "auxiliary_loss_mlp": 0.01035617, + "balance_loss_clip": 1.04737473, + "balance_loss_mlp": 1.02025056, + "epoch": 0.3781752592815271, + "flos": 23112933949440.0, + "grad_norm": 1.7941331023092641, + "language_loss": 0.73271513, + "learning_rate": 2.857502407441593e-06, + "loss": 0.75428718, + "num_input_tokens_seen": 135052185, + "step": 6290, + "time_per_iteration": 2.740370512008667 + }, + { + "auxiliary_loss_clip": 0.01098389, + "auxiliary_loss_mlp": 0.01041015, + "balance_loss_clip": 1.04425681, + "balance_loss_mlp": 1.023193, + "epoch": 0.3782353825341951, + "flos": 19755932762880.0, + "grad_norm": 8.943174604406142, + "language_loss": 0.79843229, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.81982636, + "num_input_tokens_seen": 135070425, + "step": 6291, + "time_per_iteration": 2.729116916656494 + }, + { + "auxiliary_loss_clip": 0.01101536, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.04736066, + "balance_loss_mlp": 1.01611638, + "epoch": 0.37829550578686305, + "flos": 22050839675520.0, + "grad_norm": 2.1381581001103203, + "language_loss": 0.76017123, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.78151298, + "num_input_tokens_seen": 135090525, + "step": 6292, + "time_per_iteration": 2.7115557193756104 + }, + { + "auxiliary_loss_clip": 0.0111659, + "auxiliary_loss_mlp": 0.01045333, + "balance_loss_clip": 1.04599166, + "balance_loss_mlp": 1.02922773, + "epoch": 0.378355629039531, + "flos": 16470357770880.0, + "grad_norm": 2.0329947363530616, + "language_loss": 0.69857049, + "learning_rate": 2.856446715715224e-06, + "loss": 0.72018969, + "num_input_tokens_seen": 135109575, + "step": 6293, + "time_per_iteration": 2.6687965393066406 + }, + { + "auxiliary_loss_clip": 0.01133204, + "auxiliary_loss_mlp": 0.01039264, + "balance_loss_clip": 1.04852223, + "balance_loss_mlp": 1.02307534, + "epoch": 0.378415752292199, + "flos": 19974844200960.0, + "grad_norm": 2.030259976194038, + "language_loss": 0.70870757, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.73043227, + "num_input_tokens_seen": 135127000, + "step": 6294, + "time_per_iteration": 2.600249767303467 + }, + { + "auxiliary_loss_clip": 0.01115678, + "auxiliary_loss_mlp": 0.01040569, + "balance_loss_clip": 1.04706097, + "balance_loss_mlp": 1.02365303, + "epoch": 0.378475875544867, + "flos": 14647388676480.0, + "grad_norm": 4.788626069957177, + "language_loss": 0.82803214, + "learning_rate": 2.855742758826011e-06, + "loss": 0.84959471, + "num_input_tokens_seen": 135145285, + "step": 6295, + "time_per_iteration": 2.656090497970581 + }, + { + "auxiliary_loss_clip": 0.0111937, + "auxiliary_loss_mlp": 0.0103653, + "balance_loss_clip": 1.04782999, + "balance_loss_mlp": 1.02058005, + "epoch": 0.37853599879753497, + "flos": 26650996617600.0, + "grad_norm": 9.577751233202987, + "language_loss": 0.71744889, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.73900783, + "num_input_tokens_seen": 135165240, + "step": 6296, + "time_per_iteration": 2.6698925495147705 + }, + { + "auxiliary_loss_clip": 0.01134516, + "auxiliary_loss_mlp": 0.01043376, + "balance_loss_clip": 1.05133939, + "balance_loss_mlp": 1.02771211, + "epoch": 0.37859612205020293, + "flos": 17311960408320.0, + "grad_norm": 3.288847845161644, + "language_loss": 0.76889098, + "learning_rate": 2.855038672137396e-06, + "loss": 0.79066986, + "num_input_tokens_seen": 135184045, + "step": 6297, + "time_per_iteration": 2.629037380218506 + }, + { + "auxiliary_loss_clip": 0.01109354, + "auxiliary_loss_mlp": 0.01038115, + "balance_loss_clip": 1.04526067, + "balance_loss_mlp": 1.02226055, + "epoch": 0.3786562453028709, + "flos": 18220392299520.0, + "grad_norm": 1.9191527971099975, + "language_loss": 0.79743183, + "learning_rate": 2.854686580151684e-06, + "loss": 0.81890655, + "num_input_tokens_seen": 135202365, + "step": 6298, + "time_per_iteration": 2.673081874847412 + }, + { + "auxiliary_loss_clip": 0.01075918, + "auxiliary_loss_mlp": 0.01051187, + "balance_loss_clip": 1.04113722, + "balance_loss_mlp": 1.03267384, + "epoch": 0.37871636855553886, + "flos": 21214875473280.0, + "grad_norm": 1.8248163373816215, + "language_loss": 0.84369445, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.86496556, + "num_input_tokens_seen": 135220955, + "step": 6299, + "time_per_iteration": 2.748072862625122 + }, + { + "auxiliary_loss_clip": 0.01104171, + "auxiliary_loss_mlp": 0.01036156, + "balance_loss_clip": 1.0473597, + "balance_loss_mlp": 1.02021194, + "epoch": 0.3787764918082068, + "flos": 20952727038720.0, + "grad_norm": 2.2683019346862587, + "language_loss": 0.76286763, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.78427088, + "num_input_tokens_seen": 135239715, + "step": 6300, + "time_per_iteration": 2.742335796356201 + }, + { + "auxiliary_loss_clip": 0.01118244, + "auxiliary_loss_mlp": 0.01037884, + "balance_loss_clip": 1.04743147, + "balance_loss_mlp": 1.01999068, + "epoch": 0.3788366150608748, + "flos": 17308009912320.0, + "grad_norm": 2.2544575031135863, + "language_loss": 0.82409781, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.84565908, + "num_input_tokens_seen": 135257035, + "step": 6301, + "time_per_iteration": 2.6785736083984375 + }, + { + "auxiliary_loss_clip": 0.01120863, + "auxiliary_loss_mlp": 0.01039969, + "balance_loss_clip": 1.04765666, + "balance_loss_mlp": 1.02410781, + "epoch": 0.37889673831354276, + "flos": 24311092942080.0, + "grad_norm": 2.7886341766039466, + "language_loss": 0.67584914, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.69745743, + "num_input_tokens_seen": 135275720, + "step": 6302, + "time_per_iteration": 2.677690029144287 + }, + { + "auxiliary_loss_clip": 0.01090953, + "auxiliary_loss_mlp": 0.01043064, + "balance_loss_clip": 1.04460323, + "balance_loss_mlp": 1.02736425, + "epoch": 0.3789568615662107, + "flos": 26683603188480.0, + "grad_norm": 1.752291551629032, + "language_loss": 0.68745166, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.70879185, + "num_input_tokens_seen": 135294140, + "step": 6303, + "time_per_iteration": 2.8387813568115234 + }, + { + "auxiliary_loss_clip": 0.01133092, + "auxiliary_loss_mlp": 0.01039166, + "balance_loss_clip": 1.04745388, + "balance_loss_mlp": 1.02412772, + "epoch": 0.3790169848188787, + "flos": 23585194990080.0, + "grad_norm": 1.8875159078783896, + "language_loss": 0.77695227, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.79867482, + "num_input_tokens_seen": 135314845, + "step": 6304, + "time_per_iteration": 2.673499584197998 + }, + { + "auxiliary_loss_clip": 0.01145067, + "auxiliary_loss_mlp": 0.01040581, + "balance_loss_clip": 1.05417812, + "balance_loss_mlp": 1.02412987, + "epoch": 0.37907710807154665, + "flos": 18437436230400.0, + "grad_norm": 2.779181085633227, + "language_loss": 0.79659361, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.81845009, + "num_input_tokens_seen": 135333055, + "step": 6305, + "time_per_iteration": 2.5770838260650635 + }, + { + "auxiliary_loss_clip": 0.01046795, + "auxiliary_loss_mlp": 0.01001141, + "balance_loss_clip": 1.02554131, + "balance_loss_mlp": 0.99954396, + "epoch": 0.3791372313242146, + "flos": 50107165954560.0, + "grad_norm": 0.9814261912828969, + "language_loss": 0.64473259, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66521198, + "num_input_tokens_seen": 135387865, + "step": 6306, + "time_per_iteration": 3.0782721042633057 + }, + { + "auxiliary_loss_clip": 0.01111605, + "auxiliary_loss_mlp": 0.01058558, + "balance_loss_clip": 1.04987538, + "balance_loss_mlp": 1.03932941, + "epoch": 0.3791973545768826, + "flos": 24316551809280.0, + "grad_norm": 3.4757923579383343, + "language_loss": 0.73271245, + "learning_rate": 2.851516295441817e-06, + "loss": 0.75441408, + "num_input_tokens_seen": 135409095, + "step": 6307, + "time_per_iteration": 2.756335973739624 + }, + { + "auxiliary_loss_clip": 0.01112868, + "auxiliary_loss_mlp": 0.01041837, + "balance_loss_clip": 1.04757965, + "balance_loss_mlp": 1.02545738, + "epoch": 0.3792574778295506, + "flos": 21579907438080.0, + "grad_norm": 1.5984922637838355, + "language_loss": 0.78426826, + "learning_rate": 2.851163879959112e-06, + "loss": 0.80581522, + "num_input_tokens_seen": 135429585, + "step": 6308, + "time_per_iteration": 2.7782399654388428 + }, + { + "auxiliary_loss_clip": 0.01099815, + "auxiliary_loss_mlp": 0.01047567, + "balance_loss_clip": 1.04646075, + "balance_loss_mlp": 1.03061557, + "epoch": 0.37931760108221857, + "flos": 22272731942400.0, + "grad_norm": 30.20771720098995, + "language_loss": 0.72349942, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.74497324, + "num_input_tokens_seen": 135446320, + "step": 6309, + "time_per_iteration": 2.779332399368286 + }, + { + "auxiliary_loss_clip": 0.0107726, + "auxiliary_loss_mlp": 0.01047463, + "balance_loss_clip": 1.04217935, + "balance_loss_mlp": 1.03061867, + "epoch": 0.37937772433488653, + "flos": 19682998197120.0, + "grad_norm": 1.3823910789919382, + "language_loss": 0.78832853, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.8095758, + "num_input_tokens_seen": 135465720, + "step": 6310, + "time_per_iteration": 2.771423101425171 + }, + { + "auxiliary_loss_clip": 0.01125039, + "auxiliary_loss_mlp": 0.0077385, + "balance_loss_clip": 1.04667282, + "balance_loss_mlp": 1.00038886, + "epoch": 0.3794378475875545, + "flos": 19099378016640.0, + "grad_norm": 2.0276391959107687, + "language_loss": 0.76350379, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.78249264, + "num_input_tokens_seen": 135485155, + "step": 6311, + "time_per_iteration": 2.6458020210266113 + }, + { + "auxiliary_loss_clip": 0.01111162, + "auxiliary_loss_mlp": 0.01038798, + "balance_loss_clip": 1.04782593, + "balance_loss_mlp": 1.02345526, + "epoch": 0.37949797084022246, + "flos": 20339660684160.0, + "grad_norm": 1.662830094695082, + "language_loss": 0.7082535, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.72975308, + "num_input_tokens_seen": 135502675, + "step": 6312, + "time_per_iteration": 2.719555377960205 + }, + { + "auxiliary_loss_clip": 0.01023104, + "auxiliary_loss_mlp": 0.01013837, + "balance_loss_clip": 1.02154779, + "balance_loss_mlp": 1.0123291, + "epoch": 0.37955809409289043, + "flos": 63972203477760.0, + "grad_norm": 0.7865225154891, + "language_loss": 0.56087357, + "learning_rate": 2.849401318669608e-06, + "loss": 0.58124298, + "num_input_tokens_seen": 135562005, + "step": 6313, + "time_per_iteration": 3.2287843227386475 + }, + { + "auxiliary_loss_clip": 0.01096229, + "auxiliary_loss_mlp": 0.01051812, + "balance_loss_clip": 1.04299724, + "balance_loss_mlp": 1.03592694, + "epoch": 0.3796182173455584, + "flos": 31540665179520.0, + "grad_norm": 1.6673731637282567, + "language_loss": 0.71260917, + "learning_rate": 2.849048709730083e-06, + "loss": 0.73408955, + "num_input_tokens_seen": 135582600, + "step": 6314, + "time_per_iteration": 2.7842931747436523 + }, + { + "auxiliary_loss_clip": 0.01129376, + "auxiliary_loss_mlp": 0.01048605, + "balance_loss_clip": 1.04880047, + "balance_loss_mlp": 1.03201127, + "epoch": 0.37967834059822636, + "flos": 12130804978560.0, + "grad_norm": 2.0299747539506408, + "language_loss": 0.73270208, + "learning_rate": 2.848696068594545e-06, + "loss": 0.75448191, + "num_input_tokens_seen": 135600280, + "step": 6315, + "time_per_iteration": 2.6785545349121094 + }, + { + "auxiliary_loss_clip": 0.01122054, + "auxiliary_loss_mlp": 0.01048691, + "balance_loss_clip": 1.0479691, + "balance_loss_mlp": 1.03326535, + "epoch": 0.3797384638508943, + "flos": 39348578298240.0, + "grad_norm": 2.0273248392275645, + "language_loss": 0.71108794, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.73279542, + "num_input_tokens_seen": 135621560, + "step": 6316, + "time_per_iteration": 2.7634074687957764 + }, + { + "auxiliary_loss_clip": 0.01099766, + "auxiliary_loss_mlp": 0.01041876, + "balance_loss_clip": 1.04686475, + "balance_loss_mlp": 1.02733219, + "epoch": 0.3797985871035623, + "flos": 34054016653440.0, + "grad_norm": 6.091183487708486, + "language_loss": 0.6551193, + "learning_rate": 2.847990689788923e-06, + "loss": 0.67653567, + "num_input_tokens_seen": 135641745, + "step": 6317, + "time_per_iteration": 2.8334715366363525 + }, + { + "auxiliary_loss_clip": 0.01119227, + "auxiliary_loss_mlp": 0.01036315, + "balance_loss_clip": 1.04556906, + "balance_loss_mlp": 1.02204525, + "epoch": 0.37985871035623026, + "flos": 23222174186880.0, + "grad_norm": 2.5588148844770364, + "language_loss": 0.85254991, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.87410533, + "num_input_tokens_seen": 135660650, + "step": 6318, + "time_per_iteration": 2.6611499786376953 + }, + { + "auxiliary_loss_clip": 0.01113843, + "auxiliary_loss_mlp": 0.01046062, + "balance_loss_clip": 1.04669976, + "balance_loss_mlp": 1.02933645, + "epoch": 0.3799188336088982, + "flos": 18114958903680.0, + "grad_norm": 2.5013130494780254, + "language_loss": 0.75813186, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.77973092, + "num_input_tokens_seen": 135679980, + "step": 6319, + "time_per_iteration": 2.643206834793091 + }, + { + "auxiliary_loss_clip": 0.01136645, + "auxiliary_loss_mlp": 0.01043703, + "balance_loss_clip": 1.04961717, + "balance_loss_mlp": 1.02813435, + "epoch": 0.3799789568615662, + "flos": 21871897096320.0, + "grad_norm": 1.6614251909537696, + "language_loss": 0.64298296, + "learning_rate": 2.846932380444744e-06, + "loss": 0.66478646, + "num_input_tokens_seen": 135699400, + "step": 6320, + "time_per_iteration": 4.031519174575806 + }, + { + "auxiliary_loss_clip": 0.01102323, + "auxiliary_loss_mlp": 0.01046665, + "balance_loss_clip": 1.05175698, + "balance_loss_mlp": 1.03132319, + "epoch": 0.3800390801142342, + "flos": 32962943082240.0, + "grad_norm": 2.289587921641626, + "language_loss": 0.713642, + "learning_rate": 2.846579546413992e-06, + "loss": 0.73513186, + "num_input_tokens_seen": 135723455, + "step": 6321, + "time_per_iteration": 2.8465514183044434 + }, + { + "auxiliary_loss_clip": 0.01096183, + "auxiliary_loss_mlp": 0.01042053, + "balance_loss_clip": 1.04067016, + "balance_loss_mlp": 1.02673435, + "epoch": 0.38009920336690217, + "flos": 26907075653760.0, + "grad_norm": 1.7413772853733611, + "language_loss": 0.74461544, + "learning_rate": 2.846226680280859e-06, + "loss": 0.76599777, + "num_input_tokens_seen": 135744335, + "step": 6322, + "time_per_iteration": 4.407487630844116 + }, + { + "auxiliary_loss_clip": 0.01122719, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.0462966, + "balance_loss_mlp": 1.02587986, + "epoch": 0.38015932661957014, + "flos": 22488913946880.0, + "grad_norm": 3.5770930684707527, + "language_loss": 0.84908414, + "learning_rate": 2.845873782058725e-06, + "loss": 0.87071967, + "num_input_tokens_seen": 135761440, + "step": 6323, + "time_per_iteration": 2.6349892616271973 + }, + { + "auxiliary_loss_clip": 0.01111414, + "auxiliary_loss_mlp": 0.01037556, + "balance_loss_clip": 1.04454303, + "balance_loss_mlp": 1.02075982, + "epoch": 0.3802194498722381, + "flos": 21980993679360.0, + "grad_norm": 5.3693824839272954, + "language_loss": 0.73171353, + "learning_rate": 2.845520851760973e-06, + "loss": 0.75320327, + "num_input_tokens_seen": 135779955, + "step": 6324, + "time_per_iteration": 4.240839958190918 + }, + { + "auxiliary_loss_clip": 0.01105568, + "auxiliary_loss_mlp": 0.01038696, + "balance_loss_clip": 1.04704404, + "balance_loss_mlp": 1.02263856, + "epoch": 0.38027957312490607, + "flos": 21324869896320.0, + "grad_norm": 1.716026134262254, + "language_loss": 0.83859229, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.86003488, + "num_input_tokens_seen": 135799840, + "step": 6325, + "time_per_iteration": 2.72074818611145 + }, + { + "auxiliary_loss_clip": 0.01110489, + "auxiliary_loss_mlp": 0.01035658, + "balance_loss_clip": 1.04811895, + "balance_loss_mlp": 1.02094209, + "epoch": 0.38033969637757403, + "flos": 16691244456960.0, + "grad_norm": 2.0321742163093264, + "language_loss": 0.80093408, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.82239556, + "num_input_tokens_seen": 135817880, + "step": 6326, + "time_per_iteration": 4.313997030258179 + }, + { + "auxiliary_loss_clip": 0.01119893, + "auxiliary_loss_mlp": 0.01038876, + "balance_loss_clip": 1.04593146, + "balance_loss_mlp": 1.02497053, + "epoch": 0.380399819630242, + "flos": 36210847685760.0, + "grad_norm": 1.80559395505396, + "language_loss": 0.72578084, + "learning_rate": 2.844461868547842e-06, + "loss": 0.74736857, + "num_input_tokens_seen": 135838940, + "step": 6327, + "time_per_iteration": 2.7500593662261963 + }, + { + "auxiliary_loss_clip": 0.01134332, + "auxiliary_loss_mlp": 0.00772576, + "balance_loss_clip": 1.04898763, + "balance_loss_mlp": 1.00039506, + "epoch": 0.38045994288290996, + "flos": 21288851533440.0, + "grad_norm": 1.9791898832174752, + "language_loss": 0.83074433, + "learning_rate": 2.844108810081459e-06, + "loss": 0.84981334, + "num_input_tokens_seen": 135858325, + "step": 6328, + "time_per_iteration": 2.7503418922424316 + }, + { + "auxiliary_loss_clip": 0.01119735, + "auxiliary_loss_mlp": 0.01029986, + "balance_loss_clip": 1.04522514, + "balance_loss_mlp": 1.01522779, + "epoch": 0.38052006613557793, + "flos": 20922885815040.0, + "grad_norm": 1.5313878449465446, + "language_loss": 0.61713332, + "learning_rate": 2.843755719606385e-06, + "loss": 0.63863051, + "num_input_tokens_seen": 135878430, + "step": 6329, + "time_per_iteration": 2.682016134262085 + }, + { + "auxiliary_loss_clip": 0.01103557, + "auxiliary_loss_mlp": 0.01040275, + "balance_loss_clip": 1.04332185, + "balance_loss_mlp": 1.02436066, + "epoch": 0.3805801893882459, + "flos": 20990720649600.0, + "grad_norm": 1.9096594726999414, + "language_loss": 0.56007183, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.58151013, + "num_input_tokens_seen": 135894755, + "step": 6330, + "time_per_iteration": 2.6704044342041016 + }, + { + "auxiliary_loss_clip": 0.01088801, + "auxiliary_loss_mlp": 0.01035148, + "balance_loss_clip": 1.04801345, + "balance_loss_mlp": 1.02142704, + "epoch": 0.38064031264091386, + "flos": 25558594243200.0, + "grad_norm": 3.9882905607247046, + "language_loss": 0.65945244, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.6806919, + "num_input_tokens_seen": 135918275, + "step": 6331, + "time_per_iteration": 2.750293731689453 + }, + { + "auxiliary_loss_clip": 0.01120934, + "auxiliary_loss_mlp": 0.01042908, + "balance_loss_clip": 1.05122471, + "balance_loss_mlp": 1.02723169, + "epoch": 0.3807004358935818, + "flos": 15085857997440.0, + "grad_norm": 2.769340057272882, + "language_loss": 0.7601527, + "learning_rate": 2.842696256262919e-06, + "loss": 0.78179109, + "num_input_tokens_seen": 135937430, + "step": 6332, + "time_per_iteration": 2.64774227142334 + }, + { + "auxiliary_loss_clip": 0.01073508, + "auxiliary_loss_mlp": 0.00772959, + "balance_loss_clip": 1.04594767, + "balance_loss_mlp": 1.00029111, + "epoch": 0.3807605591462498, + "flos": 16399398453120.0, + "grad_norm": 2.059894273755589, + "language_loss": 0.8224051, + "learning_rate": 2.842343037886987e-06, + "loss": 0.84086972, + "num_input_tokens_seen": 135954210, + "step": 6333, + "time_per_iteration": 2.7650275230407715 + }, + { + "auxiliary_loss_clip": 0.01121534, + "auxiliary_loss_mlp": 0.01033205, + "balance_loss_clip": 1.04730785, + "balance_loss_mlp": 1.01878643, + "epoch": 0.3808206823989178, + "flos": 29057083102080.0, + "grad_norm": 1.5368445040683132, + "language_loss": 0.8620519, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.88359934, + "num_input_tokens_seen": 135974425, + "step": 6334, + "time_per_iteration": 2.7348363399505615 + }, + { + "auxiliary_loss_clip": 0.01123412, + "auxiliary_loss_mlp": 0.01038067, + "balance_loss_clip": 1.04626036, + "balance_loss_mlp": 1.02280819, + "epoch": 0.3808808056515858, + "flos": 15705855676800.0, + "grad_norm": 1.7714454860846107, + "language_loss": 0.79359698, + "learning_rate": 2.841636505323321e-06, + "loss": 0.81521177, + "num_input_tokens_seen": 135991985, + "step": 6335, + "time_per_iteration": 2.7020695209503174 + }, + { + "auxiliary_loss_clip": 0.01121693, + "auxiliary_loss_mlp": 0.01033758, + "balance_loss_clip": 1.04490542, + "balance_loss_mlp": 1.01847494, + "epoch": 0.38094092890425374, + "flos": 20704584908160.0, + "grad_norm": 1.872444579903983, + "language_loss": 0.72939491, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.75094938, + "num_input_tokens_seen": 136010015, + "step": 6336, + "time_per_iteration": 2.7088463306427 + }, + { + "auxiliary_loss_clip": 0.01117324, + "auxiliary_loss_mlp": 0.01033417, + "balance_loss_clip": 1.04605365, + "balance_loss_mlp": 1.01930285, + "epoch": 0.3810010521569217, + "flos": 20667956014080.0, + "grad_norm": 2.014308937626889, + "language_loss": 0.69164217, + "learning_rate": 2.840929845099894e-06, + "loss": 0.71314949, + "num_input_tokens_seen": 136028440, + "step": 6337, + "time_per_iteration": 2.6832611560821533 + }, + { + "auxiliary_loss_clip": 0.01111033, + "auxiliary_loss_mlp": 0.01036513, + "balance_loss_clip": 1.04483473, + "balance_loss_mlp": 1.02133763, + "epoch": 0.38106117540958967, + "flos": 31827626933760.0, + "grad_norm": 1.9800177042646252, + "language_loss": 0.63416338, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.65563887, + "num_input_tokens_seen": 136048360, + "step": 6338, + "time_per_iteration": 2.8045074939727783 + }, + { + "auxiliary_loss_clip": 0.01112594, + "auxiliary_loss_mlp": 0.01041591, + "balance_loss_clip": 1.04514265, + "balance_loss_mlp": 1.02520001, + "epoch": 0.38112129866225763, + "flos": 16902757693440.0, + "grad_norm": 2.42049576026076, + "language_loss": 0.69146717, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.713009, + "num_input_tokens_seen": 136065500, + "step": 6339, + "time_per_iteration": 2.6873764991760254 + }, + { + "auxiliary_loss_clip": 0.01107753, + "auxiliary_loss_mlp": 0.01047128, + "balance_loss_clip": 1.04493856, + "balance_loss_mlp": 1.03165436, + "epoch": 0.3811814219149256, + "flos": 20887226588160.0, + "grad_norm": 2.484915003961603, + "language_loss": 0.68283296, + "learning_rate": 2.839869615637177e-06, + "loss": 0.70438182, + "num_input_tokens_seen": 136084060, + "step": 6340, + "time_per_iteration": 2.730966567993164 + }, + { + "auxiliary_loss_clip": 0.01098909, + "auxiliary_loss_mlp": 0.01040765, + "balance_loss_clip": 1.0444243, + "balance_loss_mlp": 1.02449322, + "epoch": 0.38124154516759357, + "flos": 16690813493760.0, + "grad_norm": 2.645956512625022, + "language_loss": 0.89689833, + "learning_rate": 2.839516142102522e-06, + "loss": 0.91829509, + "num_input_tokens_seen": 136102310, + "step": 6341, + "time_per_iteration": 2.7552878856658936 + }, + { + "auxiliary_loss_clip": 0.01127861, + "auxiliary_loss_mlp": 0.01042909, + "balance_loss_clip": 1.04863834, + "balance_loss_mlp": 1.02668464, + "epoch": 0.38130166842026153, + "flos": 19681956702720.0, + "grad_norm": 2.1539523414578103, + "language_loss": 0.75359344, + "learning_rate": 2.83916263673333e-06, + "loss": 0.7753011, + "num_input_tokens_seen": 136120725, + "step": 6342, + "time_per_iteration": 2.6937670707702637 + }, + { + "auxiliary_loss_clip": 0.01109868, + "auxiliary_loss_mlp": 0.01035797, + "balance_loss_clip": 1.04506934, + "balance_loss_mlp": 1.02071738, + "epoch": 0.3813617916729295, + "flos": 22198432659840.0, + "grad_norm": 1.797512240627555, + "language_loss": 0.8348105, + "learning_rate": 2.838809099543007e-06, + "loss": 0.85626709, + "num_input_tokens_seen": 136139105, + "step": 6343, + "time_per_iteration": 2.6647467613220215 + }, + { + "auxiliary_loss_clip": 0.01073856, + "auxiliary_loss_mlp": 0.01047466, + "balance_loss_clip": 1.04339314, + "balance_loss_mlp": 1.03099144, + "epoch": 0.38142191492559746, + "flos": 19096899978240.0, + "grad_norm": 1.8507846773973766, + "language_loss": 0.76930642, + "learning_rate": 2.838455530544959e-06, + "loss": 0.7905196, + "num_input_tokens_seen": 136158265, + "step": 6344, + "time_per_iteration": 2.807464838027954 + }, + { + "auxiliary_loss_clip": 0.01099031, + "auxiliary_loss_mlp": 0.01049913, + "balance_loss_clip": 1.04580665, + "balance_loss_mlp": 1.03225255, + "epoch": 0.3814820381782654, + "flos": 24097748112000.0, + "grad_norm": 2.0591822661314847, + "language_loss": 0.73010087, + "learning_rate": 2.838101929752593e-06, + "loss": 0.75159037, + "num_input_tokens_seen": 136176100, + "step": 6345, + "time_per_iteration": 2.756462574005127 + }, + { + "auxiliary_loss_clip": 0.01094565, + "auxiliary_loss_mlp": 0.00771987, + "balance_loss_clip": 1.04568338, + "balance_loss_mlp": 1.00028944, + "epoch": 0.3815421614309334, + "flos": 15778502933760.0, + "grad_norm": 1.8320535118847152, + "language_loss": 0.69709373, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.71575922, + "num_input_tokens_seen": 136195125, + "step": 6346, + "time_per_iteration": 2.7221782207489014 + }, + { + "auxiliary_loss_clip": 0.01124746, + "auxiliary_loss_mlp": 0.01038046, + "balance_loss_clip": 1.04819, + "balance_loss_mlp": 1.02297819, + "epoch": 0.38160228468360136, + "flos": 19899754819200.0, + "grad_norm": 1.9952986193352877, + "language_loss": 0.75480664, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.77643454, + "num_input_tokens_seen": 136213885, + "step": 6347, + "time_per_iteration": 2.646730422973633 + }, + { + "auxiliary_loss_clip": 0.0112204, + "auxiliary_loss_mlp": 0.01039786, + "balance_loss_clip": 1.04638994, + "balance_loss_mlp": 1.0253861, + "epoch": 0.3816624079362694, + "flos": 19281050029440.0, + "grad_norm": 3.670871038619067, + "language_loss": 0.74398822, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.76560652, + "num_input_tokens_seen": 136232700, + "step": 6348, + "time_per_iteration": 2.651153802871704 + }, + { + "auxiliary_loss_clip": 0.01109969, + "auxiliary_loss_mlp": 0.01037685, + "balance_loss_clip": 1.04792547, + "balance_loss_mlp": 1.0233444, + "epoch": 0.38172253118893734, + "flos": 21177564220800.0, + "grad_norm": 2.7978232906816665, + "language_loss": 0.87172502, + "learning_rate": 2.836687208908142e-06, + "loss": 0.89320159, + "num_input_tokens_seen": 136248975, + "step": 6349, + "time_per_iteration": 2.693459987640381 + }, + { + "auxiliary_loss_clip": 0.0112098, + "auxiliary_loss_mlp": 0.01037146, + "balance_loss_clip": 1.04788637, + "balance_loss_mlp": 1.02244771, + "epoch": 0.3817826544416053, + "flos": 17529219820800.0, + "grad_norm": 1.7341599494512197, + "language_loss": 0.76554048, + "learning_rate": 2.836333449345341e-06, + "loss": 0.78712171, + "num_input_tokens_seen": 136266710, + "step": 6350, + "time_per_iteration": 2.6194076538085938 + }, + { + "auxiliary_loss_clip": 0.01104228, + "auxiliary_loss_mlp": 0.01032221, + "balance_loss_clip": 1.04922175, + "balance_loss_mlp": 1.01640153, + "epoch": 0.38184277769427327, + "flos": 16326535714560.0, + "grad_norm": 2.525722230514251, + "language_loss": 0.75608248, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.77744693, + "num_input_tokens_seen": 136284445, + "step": 6351, + "time_per_iteration": 2.723487138748169 + }, + { + "auxiliary_loss_clip": 0.01122109, + "auxiliary_loss_mlp": 0.01037028, + "balance_loss_clip": 1.04607773, + "balance_loss_mlp": 1.02048135, + "epoch": 0.38190290094694124, + "flos": 30443450382720.0, + "grad_norm": 2.201358799690427, + "language_loss": 0.74001205, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.76160336, + "num_input_tokens_seen": 136305730, + "step": 6352, + "time_per_iteration": 2.6779909133911133 + }, + { + "auxiliary_loss_clip": 0.01093469, + "auxiliary_loss_mlp": 0.01035075, + "balance_loss_clip": 1.04185915, + "balance_loss_mlp": 1.02093625, + "epoch": 0.3819630241996092, + "flos": 14209924936320.0, + "grad_norm": 1.7014377772216425, + "language_loss": 0.64249897, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.66378438, + "num_input_tokens_seen": 136323850, + "step": 6353, + "time_per_iteration": 2.731860399246216 + }, + { + "auxiliary_loss_clip": 0.01133265, + "auxiliary_loss_mlp": 0.01039549, + "balance_loss_clip": 1.04809213, + "balance_loss_mlp": 1.02529204, + "epoch": 0.38202314745227717, + "flos": 25009699536000.0, + "grad_norm": 2.7523604394748644, + "language_loss": 0.83447051, + "learning_rate": 2.834918094089816e-06, + "loss": 0.85619861, + "num_input_tokens_seen": 136344880, + "step": 6354, + "time_per_iteration": 2.665891170501709 + }, + { + "auxiliary_loss_clip": 0.01132291, + "auxiliary_loss_mlp": 0.01034862, + "balance_loss_clip": 1.04866302, + "balance_loss_mlp": 1.02162409, + "epoch": 0.38208327070494513, + "flos": 20814507504000.0, + "grad_norm": 16.091226432139102, + "language_loss": 0.80633152, + "learning_rate": 2.834564176091943e-06, + "loss": 0.82800299, + "num_input_tokens_seen": 136366060, + "step": 6355, + "time_per_iteration": 2.6580965518951416 + }, + { + "auxiliary_loss_clip": 0.01092469, + "auxiliary_loss_mlp": 0.01037027, + "balance_loss_clip": 1.04551625, + "balance_loss_mlp": 1.02263832, + "epoch": 0.3821433939576131, + "flos": 22637727993600.0, + "grad_norm": 1.8508447811900344, + "language_loss": 0.75970227, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.78099722, + "num_input_tokens_seen": 136385625, + "step": 6356, + "time_per_iteration": 2.7381057739257812 + }, + { + "auxiliary_loss_clip": 0.01123851, + "auxiliary_loss_mlp": 0.00772749, + "balance_loss_clip": 1.04802036, + "balance_loss_mlp": 1.00034022, + "epoch": 0.38220351721028106, + "flos": 26869872142080.0, + "grad_norm": 2.3854964939919188, + "language_loss": 0.81208009, + "learning_rate": 2.833856245169348e-06, + "loss": 0.8310461, + "num_input_tokens_seen": 136405750, + "step": 6357, + "time_per_iteration": 2.8209376335144043 + }, + { + "auxiliary_loss_clip": 0.01118527, + "auxiliary_loss_mlp": 0.01044748, + "balance_loss_clip": 1.05246222, + "balance_loss_mlp": 1.02842796, + "epoch": 0.38226364046294903, + "flos": 23367468700800.0, + "grad_norm": 2.215929075758269, + "language_loss": 0.77378345, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.79541618, + "num_input_tokens_seen": 136426085, + "step": 6358, + "time_per_iteration": 2.7004640102386475 + }, + { + "auxiliary_loss_clip": 0.01115504, + "auxiliary_loss_mlp": 0.01047061, + "balance_loss_clip": 1.0469476, + "balance_loss_mlp": 1.03118849, + "epoch": 0.382323763715617, + "flos": 19646225648640.0, + "grad_norm": 3.6635579737055837, + "language_loss": 0.78477705, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.80640268, + "num_input_tokens_seen": 136442670, + "step": 6359, + "time_per_iteration": 4.184551954269409 + }, + { + "auxiliary_loss_clip": 0.01065181, + "auxiliary_loss_mlp": 0.01052018, + "balance_loss_clip": 1.03820515, + "balance_loss_mlp": 1.03462481, + "epoch": 0.38238388696828496, + "flos": 54124741232640.0, + "grad_norm": 1.6779400536158158, + "language_loss": 0.69735414, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.71852612, + "num_input_tokens_seen": 136465730, + "step": 6360, + "time_per_iteration": 3.1072845458984375 + }, + { + "auxiliary_loss_clip": 0.01102455, + "auxiliary_loss_mlp": 0.01037366, + "balance_loss_clip": 1.04502857, + "balance_loss_mlp": 1.02189279, + "epoch": 0.382444010220953, + "flos": 24936190352640.0, + "grad_norm": 1.5790785802582266, + "language_loss": 0.79362941, + "learning_rate": 2.83244000399261e-06, + "loss": 0.81502759, + "num_input_tokens_seen": 136487215, + "step": 6361, + "time_per_iteration": 4.285314559936523 + }, + { + "auxiliary_loss_clip": 0.01111113, + "auxiliary_loss_mlp": 0.01043827, + "balance_loss_clip": 1.04649949, + "balance_loss_mlp": 1.02906859, + "epoch": 0.38250413347362094, + "flos": 42337351209600.0, + "grad_norm": 1.9067122847602551, + "language_loss": 0.65606177, + "learning_rate": 2.832085864749337e-06, + "loss": 0.67761117, + "num_input_tokens_seen": 136510365, + "step": 6362, + "time_per_iteration": 2.8447117805480957 + }, + { + "auxiliary_loss_clip": 0.0113439, + "auxiliary_loss_mlp": 0.01035947, + "balance_loss_clip": 1.0483737, + "balance_loss_mlp": 1.01978207, + "epoch": 0.3825642567262889, + "flos": 16289224462080.0, + "grad_norm": 2.3383155012254284, + "language_loss": 0.82138497, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.84308833, + "num_input_tokens_seen": 136527100, + "step": 6363, + "time_per_iteration": 4.166736602783203 + }, + { + "auxiliary_loss_clip": 0.01075728, + "auxiliary_loss_mlp": 0.01042552, + "balance_loss_clip": 1.04349709, + "balance_loss_mlp": 1.02707291, + "epoch": 0.3826243799789569, + "flos": 45654778586880.0, + "grad_norm": 2.1311203141010835, + "language_loss": 0.59044886, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.61163169, + "num_input_tokens_seen": 136550870, + "step": 6364, + "time_per_iteration": 3.006801128387451 + }, + { + "auxiliary_loss_clip": 0.01122076, + "auxiliary_loss_mlp": 0.01041213, + "balance_loss_clip": 1.05097353, + "balance_loss_mlp": 1.02542353, + "epoch": 0.38268450323162484, + "flos": 25301581453440.0, + "grad_norm": 1.9239689491626994, + "language_loss": 0.68903065, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.7106635, + "num_input_tokens_seen": 136569895, + "step": 6365, + "time_per_iteration": 2.695068597793579 + }, + { + "auxiliary_loss_clip": 0.01123716, + "auxiliary_loss_mlp": 0.01039809, + "balance_loss_clip": 1.04955769, + "balance_loss_mlp": 1.02366817, + "epoch": 0.3827446264842928, + "flos": 21836022387840.0, + "grad_norm": 2.0334034116186137, + "language_loss": 0.73193848, + "learning_rate": 2.830668992382758e-06, + "loss": 0.75357372, + "num_input_tokens_seen": 136588585, + "step": 6366, + "time_per_iteration": 4.418980598449707 + }, + { + "auxiliary_loss_clip": 0.01115964, + "auxiliary_loss_mlp": 0.01038347, + "balance_loss_clip": 1.04846239, + "balance_loss_mlp": 1.02265882, + "epoch": 0.38280474973696077, + "flos": 25734591907200.0, + "grad_norm": 2.4539991484931645, + "language_loss": 0.68623614, + "learning_rate": 2.830314695509902e-06, + "loss": 0.70777929, + "num_input_tokens_seen": 136606640, + "step": 6367, + "time_per_iteration": 2.6878082752227783 + }, + { + "auxiliary_loss_clip": 0.01125961, + "auxiliary_loss_mlp": 0.01037618, + "balance_loss_clip": 1.05120409, + "balance_loss_mlp": 1.02256823, + "epoch": 0.38286487298962874, + "flos": 24895934184960.0, + "grad_norm": 2.196344444241347, + "language_loss": 0.64423102, + "learning_rate": 2.82996036715143e-06, + "loss": 0.66586685, + "num_input_tokens_seen": 136624940, + "step": 6368, + "time_per_iteration": 2.6698646545410156 + }, + { + "auxiliary_loss_clip": 0.01139795, + "auxiliary_loss_mlp": 0.01040116, + "balance_loss_clip": 1.05269098, + "balance_loss_mlp": 1.02390361, + "epoch": 0.3829249962422967, + "flos": 28543703967360.0, + "grad_norm": 1.346024597035963, + "language_loss": 0.684017, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.70581615, + "num_input_tokens_seen": 136645540, + "step": 6369, + "time_per_iteration": 2.7156169414520264 + }, + { + "auxiliary_loss_clip": 0.01084469, + "auxiliary_loss_mlp": 0.01039929, + "balance_loss_clip": 1.04267466, + "balance_loss_mlp": 1.02391946, + "epoch": 0.38298511949496467, + "flos": 21471205904640.0, + "grad_norm": 1.7824237306329542, + "language_loss": 0.78701794, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.80826187, + "num_input_tokens_seen": 136664530, + "step": 6370, + "time_per_iteration": 2.7351901531219482 + }, + { + "auxiliary_loss_clip": 0.01121027, + "auxiliary_loss_mlp": 0.01050163, + "balance_loss_clip": 1.04909503, + "balance_loss_mlp": 1.03279376, + "epoch": 0.38304524274763263, + "flos": 31679998035840.0, + "grad_norm": 2.5095706519371794, + "language_loss": 0.65098304, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.67269492, + "num_input_tokens_seen": 136682315, + "step": 6371, + "time_per_iteration": 2.739689350128174 + }, + { + "auxiliary_loss_clip": 0.01110581, + "auxiliary_loss_mlp": 0.01041968, + "balance_loss_clip": 1.04938042, + "balance_loss_mlp": 1.02471852, + "epoch": 0.3831053660003006, + "flos": 25076816098560.0, + "grad_norm": 3.269308088463154, + "language_loss": 0.7304002, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.75192571, + "num_input_tokens_seen": 136701185, + "step": 6372, + "time_per_iteration": 2.7497966289520264 + }, + { + "auxiliary_loss_clip": 0.01127864, + "auxiliary_loss_mlp": 0.01034223, + "balance_loss_clip": 1.05050421, + "balance_loss_mlp": 1.01848698, + "epoch": 0.38316548925296856, + "flos": 23259018562560.0, + "grad_norm": 1.83316702621751, + "language_loss": 0.8491025, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.87072337, + "num_input_tokens_seen": 136721265, + "step": 6373, + "time_per_iteration": 2.6510777473449707 + }, + { + "auxiliary_loss_clip": 0.01084717, + "auxiliary_loss_mlp": 0.01048262, + "balance_loss_clip": 1.0416218, + "balance_loss_mlp": 1.0316565, + "epoch": 0.3832256125056366, + "flos": 34423465991040.0, + "grad_norm": 2.287485479433922, + "language_loss": 0.74893212, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.770262, + "num_input_tokens_seen": 136741885, + "step": 6374, + "time_per_iteration": 2.8658056259155273 + }, + { + "auxiliary_loss_clip": 0.01130215, + "auxiliary_loss_mlp": 0.01042427, + "balance_loss_clip": 1.05264366, + "balance_loss_mlp": 1.02613068, + "epoch": 0.38328573575830455, + "flos": 21762764599680.0, + "grad_norm": 7.5426595342284735, + "language_loss": 0.75737238, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.77909875, + "num_input_tokens_seen": 136760905, + "step": 6375, + "time_per_iteration": 2.6622958183288574 + }, + { + "auxiliary_loss_clip": 0.01126708, + "auxiliary_loss_mlp": 0.01039776, + "balance_loss_clip": 1.05043924, + "balance_loss_mlp": 1.0244453, + "epoch": 0.3833458590109725, + "flos": 17380010724480.0, + "grad_norm": 2.1246389624552435, + "language_loss": 0.72777182, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.74943662, + "num_input_tokens_seen": 136777240, + "step": 6376, + "time_per_iteration": 2.6562421321868896 + }, + { + "auxiliary_loss_clip": 0.01122147, + "auxiliary_loss_mlp": 0.01039822, + "balance_loss_clip": 1.04791379, + "balance_loss_mlp": 1.02381194, + "epoch": 0.3834059822636405, + "flos": 29424557191680.0, + "grad_norm": 1.7414598633373413, + "language_loss": 0.67441249, + "learning_rate": 2.826769997289796e-06, + "loss": 0.69603217, + "num_input_tokens_seen": 136801040, + "step": 6377, + "time_per_iteration": 2.779766798019409 + }, + { + "auxiliary_loss_clip": 0.01110002, + "auxiliary_loss_mlp": 0.01041228, + "balance_loss_clip": 1.05152845, + "balance_loss_mlp": 1.02421689, + "epoch": 0.38346610551630844, + "flos": 21470739027840.0, + "grad_norm": 2.377659826482013, + "language_loss": 0.73287642, + "learning_rate": 2.826415354814344e-06, + "loss": 0.75438869, + "num_input_tokens_seen": 136819495, + "step": 6378, + "time_per_iteration": 2.7345829010009766 + }, + { + "auxiliary_loss_clip": 0.01085335, + "auxiliary_loss_mlp": 0.01042694, + "balance_loss_clip": 1.0479784, + "balance_loss_mlp": 1.02707767, + "epoch": 0.3835262287689764, + "flos": 27561224188800.0, + "grad_norm": 2.283576437082984, + "language_loss": 0.69473612, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.71601641, + "num_input_tokens_seen": 136838840, + "step": 6379, + "time_per_iteration": 2.7592358589172363 + }, + { + "auxiliary_loss_clip": 0.01124706, + "auxiliary_loss_mlp": 0.01036177, + "balance_loss_clip": 1.0516969, + "balance_loss_mlp": 1.02094209, + "epoch": 0.3835863520216444, + "flos": 15523716787200.0, + "grad_norm": 1.8393672130560537, + "language_loss": 0.83356249, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.85517132, + "num_input_tokens_seen": 136854425, + "step": 6380, + "time_per_iteration": 2.6572370529174805 + }, + { + "auxiliary_loss_clip": 0.01135434, + "auxiliary_loss_mlp": 0.01035321, + "balance_loss_clip": 1.05187774, + "balance_loss_mlp": 1.02010989, + "epoch": 0.38364647527431234, + "flos": 21904934630400.0, + "grad_norm": 1.5891747666862521, + "language_loss": 0.8141042, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.83581179, + "num_input_tokens_seen": 136874355, + "step": 6381, + "time_per_iteration": 2.7251663208007812 + }, + { + "auxiliary_loss_clip": 0.01057344, + "auxiliary_loss_mlp": 0.01005901, + "balance_loss_clip": 1.02759361, + "balance_loss_mlp": 1.00418437, + "epoch": 0.3837065985269803, + "flos": 65534927558400.0, + "grad_norm": 0.7954141143291842, + "language_loss": 0.60376751, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.62440002, + "num_input_tokens_seen": 136937475, + "step": 6382, + "time_per_iteration": 3.1750948429107666 + }, + { + "auxiliary_loss_clip": 0.01139607, + "auxiliary_loss_mlp": 0.0103679, + "balance_loss_clip": 1.05060625, + "balance_loss_mlp": 1.02099442, + "epoch": 0.38376672177964827, + "flos": 28256598558720.0, + "grad_norm": 3.8324285625149925, + "language_loss": 0.66432369, + "learning_rate": 2.824641672639794e-06, + "loss": 0.68608773, + "num_input_tokens_seen": 136955805, + "step": 6383, + "time_per_iteration": 2.7543957233428955 + }, + { + "auxiliary_loss_clip": 0.01103794, + "auxiliary_loss_mlp": 0.01039577, + "balance_loss_clip": 1.04783142, + "balance_loss_mlp": 1.02375221, + "epoch": 0.38382684503231623, + "flos": 20631363033600.0, + "grad_norm": 2.110615575498957, + "language_loss": 0.75144917, + "learning_rate": 2.824286842339587e-06, + "loss": 0.77288288, + "num_input_tokens_seen": 136975240, + "step": 6384, + "time_per_iteration": 2.7796735763549805 + }, + { + "auxiliary_loss_clip": 0.01122869, + "auxiliary_loss_mlp": 0.01040365, + "balance_loss_clip": 1.05156231, + "balance_loss_mlp": 1.02510643, + "epoch": 0.3838869682849842, + "flos": 19605825826560.0, + "grad_norm": 1.5394774946197278, + "language_loss": 0.76096714, + "learning_rate": 2.823931980782341e-06, + "loss": 0.78259945, + "num_input_tokens_seen": 136994985, + "step": 6385, + "time_per_iteration": 2.6831300258636475 + }, + { + "auxiliary_loss_clip": 0.01046831, + "auxiliary_loss_mlp": 0.01001133, + "balance_loss_clip": 1.02648735, + "balance_loss_mlp": 0.99943984, + "epoch": 0.38394709153765216, + "flos": 56556110891520.0, + "grad_norm": 0.9063295744618779, + "language_loss": 0.66955769, + "learning_rate": 2.82357708798151e-06, + "loss": 0.69003725, + "num_input_tokens_seen": 137046290, + "step": 6386, + "time_per_iteration": 3.0693411827087402 + }, + { + "auxiliary_loss_clip": 0.0109652, + "auxiliary_loss_mlp": 0.01041859, + "balance_loss_clip": 1.04551756, + "balance_loss_mlp": 1.02686286, + "epoch": 0.3840072147903202, + "flos": 15888748752000.0, + "grad_norm": 1.7986188221191803, + "language_loss": 0.7215755, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.74295932, + "num_input_tokens_seen": 137064725, + "step": 6387, + "time_per_iteration": 2.736774206161499 + }, + { + "auxiliary_loss_clip": 0.01134624, + "auxiliary_loss_mlp": 0.01044946, + "balance_loss_clip": 1.05156994, + "balance_loss_mlp": 1.03039086, + "epoch": 0.38406733804298815, + "flos": 28218030330240.0, + "grad_norm": 1.6374516085838389, + "language_loss": 0.8088249, + "learning_rate": 2.822867208702932e-06, + "loss": 0.83062065, + "num_input_tokens_seen": 137086030, + "step": 6388, + "time_per_iteration": 2.782958507537842 + }, + { + "auxiliary_loss_clip": 0.01103471, + "auxiliary_loss_mlp": 0.01047592, + "balance_loss_clip": 1.04727554, + "balance_loss_mlp": 1.03298843, + "epoch": 0.3841274612956561, + "flos": 18223588609920.0, + "grad_norm": 1.7872750649564642, + "language_loss": 0.76085746, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.78236812, + "num_input_tokens_seen": 137105400, + "step": 6389, + "time_per_iteration": 2.6644833087921143 + }, + { + "auxiliary_loss_clip": 0.01119906, + "auxiliary_loss_mlp": 0.0104877, + "balance_loss_clip": 1.05389404, + "balance_loss_mlp": 1.03203344, + "epoch": 0.3841875845483241, + "flos": 19792884879360.0, + "grad_norm": 4.9507505317589775, + "language_loss": 0.76550084, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.78718758, + "num_input_tokens_seen": 137124985, + "step": 6390, + "time_per_iteration": 2.825714588165283 + }, + { + "auxiliary_loss_clip": 0.01090482, + "auxiliary_loss_mlp": 0.01048203, + "balance_loss_clip": 1.04517913, + "balance_loss_mlp": 1.03196096, + "epoch": 0.38424770780099204, + "flos": 29898829393920.0, + "grad_norm": 1.7614871223783444, + "language_loss": 0.70377523, + "learning_rate": 2.821802155794668e-06, + "loss": 0.72516215, + "num_input_tokens_seen": 137146745, + "step": 6391, + "time_per_iteration": 2.918065309524536 + }, + { + "auxiliary_loss_clip": 0.01125443, + "auxiliary_loss_mlp": 0.01036977, + "balance_loss_clip": 1.04874265, + "balance_loss_mlp": 1.02158153, + "epoch": 0.38430783105366, + "flos": 20813717404800.0, + "grad_norm": 1.7948670510085722, + "language_loss": 0.84005457, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.86167878, + "num_input_tokens_seen": 137163195, + "step": 6392, + "time_per_iteration": 2.679427146911621 + }, + { + "auxiliary_loss_clip": 0.01122701, + "auxiliary_loss_mlp": 0.01037128, + "balance_loss_clip": 1.04846168, + "balance_loss_mlp": 1.0227809, + "epoch": 0.384367954306328, + "flos": 10998577399680.0, + "grad_norm": 2.3141685884805145, + "language_loss": 0.6062203, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.62781858, + "num_input_tokens_seen": 137179330, + "step": 6393, + "time_per_iteration": 2.6622374057769775 + }, + { + "auxiliary_loss_clip": 0.01110672, + "auxiliary_loss_mlp": 0.01036894, + "balance_loss_clip": 1.04954767, + "balance_loss_mlp": 1.02025223, + "epoch": 0.38442807755899594, + "flos": 25338030779520.0, + "grad_norm": 1.7908313499382054, + "language_loss": 0.70639426, + "learning_rate": 2.820736822421029e-06, + "loss": 0.72786993, + "num_input_tokens_seen": 137198655, + "step": 6394, + "time_per_iteration": 2.7460365295410156 + }, + { + "auxiliary_loss_clip": 0.01123613, + "auxiliary_loss_mlp": 0.01035163, + "balance_loss_clip": 1.04763663, + "balance_loss_mlp": 1.01871169, + "epoch": 0.3844882008116639, + "flos": 21069760527360.0, + "grad_norm": 2.646318489707099, + "language_loss": 0.81774974, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.83933747, + "num_input_tokens_seen": 137217120, + "step": 6395, + "time_per_iteration": 2.676023006439209 + }, + { + "auxiliary_loss_clip": 0.01129196, + "auxiliary_loss_mlp": 0.01046949, + "balance_loss_clip": 1.05485177, + "balance_loss_mlp": 1.03209007, + "epoch": 0.38454832406433187, + "flos": 17963235855360.0, + "grad_norm": 1.9755185808990787, + "language_loss": 0.71031433, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.73207581, + "num_input_tokens_seen": 137234410, + "step": 6396, + "time_per_iteration": 2.7082455158233643 + }, + { + "auxiliary_loss_clip": 0.01044031, + "auxiliary_loss_mlp": 0.0100801, + "balance_loss_clip": 1.02689695, + "balance_loss_mlp": 1.00657308, + "epoch": 0.38460844731699984, + "flos": 67924999555200.0, + "grad_norm": 0.8839433118134116, + "language_loss": 0.59671199, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.61723238, + "num_input_tokens_seen": 137294940, + "step": 6397, + "time_per_iteration": 3.2412428855895996 + }, + { + "auxiliary_loss_clip": 0.01137376, + "auxiliary_loss_mlp": 0.01035554, + "balance_loss_clip": 1.05209756, + "balance_loss_mlp": 1.02044976, + "epoch": 0.3846685705696678, + "flos": 25849075530240.0, + "grad_norm": 2.648974669995796, + "language_loss": 0.85017276, + "learning_rate": 2.819315942271794e-06, + "loss": 0.87190199, + "num_input_tokens_seen": 137315035, + "step": 6398, + "time_per_iteration": 2.7374656200408936 + }, + { + "auxiliary_loss_clip": 0.01136492, + "auxiliary_loss_mlp": 0.01030698, + "balance_loss_clip": 1.0517211, + "balance_loss_mlp": 1.0165, + "epoch": 0.38472869382233577, + "flos": 16290194129280.0, + "grad_norm": 2.1032431430060075, + "language_loss": 0.79989493, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.82156688, + "num_input_tokens_seen": 137333155, + "step": 6399, + "time_per_iteration": 4.446218729019165 + }, + { + "auxiliary_loss_clip": 0.0113807, + "auxiliary_loss_mlp": 0.00773562, + "balance_loss_clip": 1.05109119, + "balance_loss_mlp": 1.00025833, + "epoch": 0.38478881707500373, + "flos": 19353122668800.0, + "grad_norm": 3.0376300513317416, + "language_loss": 0.67328328, + "learning_rate": 2.818605315732038e-06, + "loss": 0.69239962, + "num_input_tokens_seen": 137351515, + "step": 6400, + "time_per_iteration": 2.6920905113220215 + }, + { + "auxiliary_loss_clip": 0.01122811, + "auxiliary_loss_mlp": 0.01042029, + "balance_loss_clip": 1.05546772, + "balance_loss_mlp": 1.0264008, + "epoch": 0.38484894032767175, + "flos": 24860849575680.0, + "grad_norm": 11.158483612058907, + "language_loss": 0.73623443, + "learning_rate": 2.81824995589303e-06, + "loss": 0.75788283, + "num_input_tokens_seen": 137371255, + "step": 6401, + "time_per_iteration": 4.2371673583984375 + }, + { + "auxiliary_loss_clip": 0.01102005, + "auxiliary_loss_mlp": 0.01039851, + "balance_loss_clip": 1.04852486, + "balance_loss_mlp": 1.02387738, + "epoch": 0.3849090635803397, + "flos": 14501806853760.0, + "grad_norm": 2.0006804524577233, + "language_loss": 0.72059876, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.74201727, + "num_input_tokens_seen": 137388980, + "step": 6402, + "time_per_iteration": 2.686413288116455 + }, + { + "auxiliary_loss_clip": 0.0113478, + "auxiliary_loss_mlp": 0.01035082, + "balance_loss_clip": 1.05094552, + "balance_loss_mlp": 1.02016854, + "epoch": 0.3849691868330077, + "flos": 18515865576960.0, + "grad_norm": 2.094788133183166, + "language_loss": 0.82884681, + "learning_rate": 2.817539143144128e-06, + "loss": 0.85054541, + "num_input_tokens_seen": 137406885, + "step": 6403, + "time_per_iteration": 4.234680891036987 + }, + { + "auxiliary_loss_clip": 0.01078109, + "auxiliary_loss_mlp": 0.01040581, + "balance_loss_clip": 1.04205656, + "balance_loss_mlp": 1.02466702, + "epoch": 0.38502931008567565, + "flos": 21616392677760.0, + "grad_norm": 4.587008789601206, + "language_loss": 0.82845348, + "learning_rate": 2.817183690261189e-06, + "loss": 0.84964037, + "num_input_tokens_seen": 137425535, + "step": 6404, + "time_per_iteration": 2.777756452560425 + }, + { + "auxiliary_loss_clip": 0.0111195, + "auxiliary_loss_mlp": 0.01034861, + "balance_loss_clip": 1.04970074, + "balance_loss_mlp": 1.02046084, + "epoch": 0.3850894333383436, + "flos": 25415346804480.0, + "grad_norm": 2.6287869212560646, + "language_loss": 0.69417107, + "learning_rate": 2.816828206390563e-06, + "loss": 0.71563923, + "num_input_tokens_seen": 137447700, + "step": 6405, + "time_per_iteration": 4.478301286697388 + }, + { + "auxiliary_loss_clip": 0.01102381, + "auxiliary_loss_mlp": 0.01038086, + "balance_loss_clip": 1.0438571, + "balance_loss_mlp": 1.02414417, + "epoch": 0.3851495565910116, + "flos": 20227870581120.0, + "grad_norm": 1.9306681180439358, + "language_loss": 0.79248095, + "learning_rate": 2.816472691545729e-06, + "loss": 0.81388557, + "num_input_tokens_seen": 137462245, + "step": 6406, + "time_per_iteration": 2.7157816886901855 + }, + { + "auxiliary_loss_clip": 0.01129296, + "auxiliary_loss_mlp": 0.01040841, + "balance_loss_clip": 1.05465746, + "balance_loss_mlp": 1.02483082, + "epoch": 0.38520967984367954, + "flos": 16508459122560.0, + "grad_norm": 5.929375109580111, + "language_loss": 0.84107637, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.86277771, + "num_input_tokens_seen": 137476455, + "step": 6407, + "time_per_iteration": 2.6058037281036377 + }, + { + "auxiliary_loss_clip": 0.01049614, + "auxiliary_loss_mlp": 0.00999678, + "balance_loss_clip": 1.03001904, + "balance_loss_mlp": 0.99828893, + "epoch": 0.3852698030963475, + "flos": 61313772971520.0, + "grad_norm": 0.845548946049954, + "language_loss": 0.64919412, + "learning_rate": 2.815761568987365e-06, + "loss": 0.66968703, + "num_input_tokens_seen": 137539845, + "step": 6408, + "time_per_iteration": 3.2015879154205322 + }, + { + "auxiliary_loss_clip": 0.01110915, + "auxiliary_loss_mlp": 0.01042045, + "balance_loss_clip": 1.05201948, + "balance_loss_mlp": 1.02547526, + "epoch": 0.3853299263490155, + "flos": 22893016930560.0, + "grad_norm": 1.5734517214124462, + "language_loss": 0.73444313, + "learning_rate": 2.8154059613008e-06, + "loss": 0.75597274, + "num_input_tokens_seen": 137559880, + "step": 6409, + "time_per_iteration": 2.683310031890869 + }, + { + "auxiliary_loss_clip": 0.01099042, + "auxiliary_loss_mlp": 0.01052587, + "balance_loss_clip": 1.05162942, + "balance_loss_mlp": 1.03458679, + "epoch": 0.38539004960168344, + "flos": 20047491457920.0, + "grad_norm": 3.095928763270071, + "language_loss": 0.70505756, + "learning_rate": 2.81505032269396e-06, + "loss": 0.72657388, + "num_input_tokens_seen": 137578225, + "step": 6410, + "time_per_iteration": 2.7694053649902344 + }, + { + "auxiliary_loss_clip": 0.01018797, + "auxiliary_loss_mlp": 0.00754046, + "balance_loss_clip": 1.02754462, + "balance_loss_mlp": 1.00070059, + "epoch": 0.3854501728543514, + "flos": 68730691570560.0, + "grad_norm": 0.6824056349925876, + "language_loss": 0.6019417, + "learning_rate": 2.81469465318033e-06, + "loss": 0.61967015, + "num_input_tokens_seen": 137645770, + "step": 6411, + "time_per_iteration": 3.3692543506622314 + }, + { + "auxiliary_loss_clip": 0.01091571, + "auxiliary_loss_mlp": 0.01029185, + "balance_loss_clip": 1.04337883, + "balance_loss_mlp": 1.01451063, + "epoch": 0.38551029610701937, + "flos": 20485027025280.0, + "grad_norm": 2.4386958956664344, + "language_loss": 0.78219938, + "learning_rate": 2.814338952773397e-06, + "loss": 0.80340695, + "num_input_tokens_seen": 137664090, + "step": 6412, + "time_per_iteration": 2.7462196350097656 + }, + { + "auxiliary_loss_clip": 0.01097982, + "auxiliary_loss_mlp": 0.01037754, + "balance_loss_clip": 1.04309821, + "balance_loss_mlp": 1.01995587, + "epoch": 0.38557041935968733, + "flos": 23471788775040.0, + "grad_norm": 2.0249224045322802, + "language_loss": 0.78112727, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.80248463, + "num_input_tokens_seen": 137683190, + "step": 6413, + "time_per_iteration": 2.768624782562256 + }, + { + "auxiliary_loss_clip": 0.01056912, + "auxiliary_loss_mlp": 0.01003998, + "balance_loss_clip": 1.02733278, + "balance_loss_mlp": 1.00254369, + "epoch": 0.38563054261235535, + "flos": 63966636869760.0, + "grad_norm": 0.8082958368118873, + "language_loss": 0.61342072, + "learning_rate": 2.813627459333576e-06, + "loss": 0.63402981, + "num_input_tokens_seen": 137737315, + "step": 6414, + "time_per_iteration": 2.983466625213623 + }, + { + "auxiliary_loss_clip": 0.01103716, + "auxiliary_loss_mlp": 0.01038577, + "balance_loss_clip": 1.05065155, + "balance_loss_mlp": 1.02302015, + "epoch": 0.3856906658650233, + "flos": 23987789602560.0, + "grad_norm": 2.2111312580879106, + "language_loss": 0.77225536, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.79367828, + "num_input_tokens_seen": 137753535, + "step": 6415, + "time_per_iteration": 2.7486205101013184 + }, + { + "auxiliary_loss_clip": 0.01109368, + "auxiliary_loss_mlp": 0.01030786, + "balance_loss_clip": 1.04894936, + "balance_loss_mlp": 1.01676726, + "epoch": 0.3857507891176913, + "flos": 25007436979200.0, + "grad_norm": 1.644505635534703, + "language_loss": 0.80036473, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.82176626, + "num_input_tokens_seen": 137773405, + "step": 6416, + "time_per_iteration": 2.709200859069824 + }, + { + "auxiliary_loss_clip": 0.0112133, + "auxiliary_loss_mlp": 0.00771665, + "balance_loss_clip": 1.04777813, + "balance_loss_mlp": 1.00020468, + "epoch": 0.38581091237035925, + "flos": 21536778182400.0, + "grad_norm": 1.8974153334913886, + "language_loss": 0.78746861, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.80639857, + "num_input_tokens_seen": 137790810, + "step": 6417, + "time_per_iteration": 2.6839869022369385 + }, + { + "auxiliary_loss_clip": 0.01106617, + "auxiliary_loss_mlp": 0.0103795, + "balance_loss_clip": 1.04771507, + "balance_loss_mlp": 1.02424121, + "epoch": 0.3858710356230272, + "flos": 17383889393280.0, + "grad_norm": 1.8492847143532247, + "language_loss": 0.80066824, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.82211387, + "num_input_tokens_seen": 137810265, + "step": 6418, + "time_per_iteration": 2.709463119506836 + }, + { + "auxiliary_loss_clip": 0.01106426, + "auxiliary_loss_mlp": 0.01035927, + "balance_loss_clip": 1.04606509, + "balance_loss_mlp": 1.02115691, + "epoch": 0.3859311588756952, + "flos": 20339588856960.0, + "grad_norm": 2.0121704661475524, + "language_loss": 0.79591382, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.81733727, + "num_input_tokens_seen": 137828580, + "step": 6419, + "time_per_iteration": 2.687030553817749 + }, + { + "auxiliary_loss_clip": 0.01109367, + "auxiliary_loss_mlp": 0.01035627, + "balance_loss_clip": 1.04662013, + "balance_loss_mlp": 1.0194031, + "epoch": 0.38599128212836314, + "flos": 26321157002880.0, + "grad_norm": 2.202509680177809, + "language_loss": 0.67581224, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.69726223, + "num_input_tokens_seen": 137846145, + "step": 6420, + "time_per_iteration": 2.7517049312591553 + }, + { + "auxiliary_loss_clip": 0.01089731, + "auxiliary_loss_mlp": 0.01053637, + "balance_loss_clip": 1.04479241, + "balance_loss_mlp": 1.03771043, + "epoch": 0.3860514053810311, + "flos": 13553837066880.0, + "grad_norm": 2.406147976104497, + "language_loss": 0.81137526, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.83280897, + "num_input_tokens_seen": 137863705, + "step": 6421, + "time_per_iteration": 2.970040798187256 + }, + { + "auxiliary_loss_clip": 0.01108309, + "auxiliary_loss_mlp": 0.01040046, + "balance_loss_clip": 1.04625583, + "balance_loss_mlp": 1.02510345, + "epoch": 0.3861115286336991, + "flos": 20954271323520.0, + "grad_norm": 2.6092074943148797, + "language_loss": 0.71989834, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.74138188, + "num_input_tokens_seen": 137880285, + "step": 6422, + "time_per_iteration": 2.690490961074829 + }, + { + "auxiliary_loss_clip": 0.01104575, + "auxiliary_loss_mlp": 0.0104152, + "balance_loss_clip": 1.04663455, + "balance_loss_mlp": 1.02759588, + "epoch": 0.38617165188636704, + "flos": 16362697731840.0, + "grad_norm": 1.6942063430957965, + "language_loss": 0.66644311, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.687904, + "num_input_tokens_seen": 137898335, + "step": 6423, + "time_per_iteration": 2.6189329624176025 + }, + { + "auxiliary_loss_clip": 0.01128312, + "auxiliary_loss_mlp": 0.01042786, + "balance_loss_clip": 1.05139875, + "balance_loss_mlp": 1.02860618, + "epoch": 0.386231775139035, + "flos": 34787276893440.0, + "grad_norm": 2.1536039728580394, + "language_loss": 0.68359423, + "learning_rate": 2.810068143123449e-06, + "loss": 0.70530522, + "num_input_tokens_seen": 137918605, + "step": 6424, + "time_per_iteration": 2.7609992027282715 + }, + { + "auxiliary_loss_clip": 0.01098796, + "auxiliary_loss_mlp": 0.01038309, + "balance_loss_clip": 1.04750848, + "balance_loss_mlp": 1.02387285, + "epoch": 0.38629189839170297, + "flos": 21726171619200.0, + "grad_norm": 1.4481478329406698, + "language_loss": 0.72367114, + "learning_rate": 2.809712042331429e-06, + "loss": 0.7450422, + "num_input_tokens_seen": 137938245, + "step": 6425, + "time_per_iteration": 2.7069387435913086 + }, + { + "auxiliary_loss_clip": 0.01099551, + "auxiliary_loss_mlp": 0.00773141, + "balance_loss_clip": 1.0428803, + "balance_loss_mlp": 1.00013173, + "epoch": 0.38635202164437094, + "flos": 27923634460800.0, + "grad_norm": 2.52438881915832, + "language_loss": 0.80258477, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.82131171, + "num_input_tokens_seen": 137956770, + "step": 6426, + "time_per_iteration": 2.8976056575775146 + }, + { + "auxiliary_loss_clip": 0.01125602, + "auxiliary_loss_mlp": 0.0103515, + "balance_loss_clip": 1.04929447, + "balance_loss_mlp": 1.02013016, + "epoch": 0.38641214489703896, + "flos": 23586631534080.0, + "grad_norm": 2.2578291383073825, + "language_loss": 0.7536087, + "learning_rate": 2.80899974864781e-06, + "loss": 0.77521622, + "num_input_tokens_seen": 137977040, + "step": 6427, + "time_per_iteration": 2.7281436920166016 + }, + { + "auxiliary_loss_clip": 0.01075932, + "auxiliary_loss_mlp": 0.01057335, + "balance_loss_clip": 1.04142189, + "balance_loss_mlp": 1.04013276, + "epoch": 0.3864722681497069, + "flos": 12641239198080.0, + "grad_norm": 2.0875975256988055, + "language_loss": 0.69435054, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.71568322, + "num_input_tokens_seen": 137993545, + "step": 6428, + "time_per_iteration": 2.7289116382598877 + }, + { + "auxiliary_loss_clip": 0.01113154, + "auxiliary_loss_mlp": 0.01042018, + "balance_loss_clip": 1.04947257, + "balance_loss_mlp": 1.02729535, + "epoch": 0.3865323914023749, + "flos": 17598922162560.0, + "grad_norm": 2.847119477349317, + "language_loss": 0.8444519, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.86600363, + "num_input_tokens_seen": 138010140, + "step": 6429, + "time_per_iteration": 2.7385170459747314 + }, + { + "auxiliary_loss_clip": 0.01110797, + "auxiliary_loss_mlp": 0.01038599, + "balance_loss_clip": 1.04555535, + "balance_loss_mlp": 1.02423429, + "epoch": 0.38659251465504285, + "flos": 18478949374080.0, + "grad_norm": 2.174010980525696, + "language_loss": 0.80673695, + "learning_rate": 2.807931078076015e-06, + "loss": 0.82823092, + "num_input_tokens_seen": 138028880, + "step": 6430, + "time_per_iteration": 2.660228967666626 + }, + { + "auxiliary_loss_clip": 0.0102628, + "auxiliary_loss_mlp": 0.01015101, + "balance_loss_clip": 1.02508974, + "balance_loss_mlp": 1.01382565, + "epoch": 0.3866526379077108, + "flos": 64165726978560.0, + "grad_norm": 0.719429045650031, + "language_loss": 0.58803207, + "learning_rate": 2.807574793260416e-06, + "loss": 0.60844588, + "num_input_tokens_seen": 138098090, + "step": 6431, + "time_per_iteration": 3.2772469520568848 + }, + { + "auxiliary_loss_clip": 0.01086398, + "auxiliary_loss_mlp": 0.01039293, + "balance_loss_clip": 1.04541588, + "balance_loss_mlp": 1.02296114, + "epoch": 0.3867127611603788, + "flos": 14388292897920.0, + "grad_norm": 2.1660589654497424, + "language_loss": 0.79041815, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.81167507, + "num_input_tokens_seen": 138114735, + "step": 6432, + "time_per_iteration": 2.7949061393737793 + }, + { + "auxiliary_loss_clip": 0.01125593, + "auxiliary_loss_mlp": 0.01048624, + "balance_loss_clip": 1.04708362, + "balance_loss_mlp": 1.03231645, + "epoch": 0.38677288441304675, + "flos": 20010754823040.0, + "grad_norm": 2.0695366497364294, + "language_loss": 0.80186564, + "learning_rate": 2.806862131772779e-06, + "loss": 0.82360786, + "num_input_tokens_seen": 138130480, + "step": 6433, + "time_per_iteration": 2.6526312828063965 + }, + { + "auxiliary_loss_clip": 0.01111087, + "auxiliary_loss_mlp": 0.01037922, + "balance_loss_clip": 1.04934025, + "balance_loss_mlp": 1.02162611, + "epoch": 0.3868330076657147, + "flos": 22236893147520.0, + "grad_norm": 1.6267030007711512, + "language_loss": 0.70441496, + "learning_rate": 2.806505755127765e-06, + "loss": 0.72590506, + "num_input_tokens_seen": 138150640, + "step": 6434, + "time_per_iteration": 2.6985394954681396 + }, + { + "auxiliary_loss_clip": 0.01097728, + "auxiliary_loss_mlp": 0.01047403, + "balance_loss_clip": 1.04536152, + "balance_loss_mlp": 1.03008235, + "epoch": 0.3868931309183827, + "flos": 16727442387840.0, + "grad_norm": 1.7348790517482282, + "language_loss": 0.77462173, + "learning_rate": 2.806149347899972e-06, + "loss": 0.79607308, + "num_input_tokens_seen": 138169700, + "step": 6435, + "time_per_iteration": 2.7326719760894775 + }, + { + "auxiliary_loss_clip": 0.01119609, + "auxiliary_loss_mlp": 0.01035834, + "balance_loss_clip": 1.04651809, + "balance_loss_mlp": 1.0208497, + "epoch": 0.38695325417105064, + "flos": 22674716023680.0, + "grad_norm": 2.3575842278582813, + "language_loss": 0.79599082, + "learning_rate": 2.805792910102915e-06, + "loss": 0.81754529, + "num_input_tokens_seen": 138185835, + "step": 6436, + "time_per_iteration": 2.6643154621124268 + }, + { + "auxiliary_loss_clip": 0.01107099, + "auxiliary_loss_mlp": 0.01036499, + "balance_loss_clip": 1.04809546, + "balance_loss_mlp": 1.0215621, + "epoch": 0.3870133774237186, + "flos": 23112036109440.0, + "grad_norm": 1.9038851888933561, + "language_loss": 0.76043606, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.78187203, + "num_input_tokens_seen": 138204080, + "step": 6437, + "time_per_iteration": 2.701834201812744 + }, + { + "auxiliary_loss_clip": 0.01110073, + "auxiliary_loss_mlp": 0.01037115, + "balance_loss_clip": 1.04696321, + "balance_loss_mlp": 1.02374589, + "epoch": 0.3870735006763866, + "flos": 17675699483520.0, + "grad_norm": 2.022501790448194, + "language_loss": 0.81817484, + "learning_rate": 2.805079942855074e-06, + "loss": 0.8396467, + "num_input_tokens_seen": 138220710, + "step": 6438, + "time_per_iteration": 4.327820539474487 + }, + { + "auxiliary_loss_clip": 0.01111326, + "auxiliary_loss_mlp": 0.0077319, + "balance_loss_clip": 1.04504764, + "balance_loss_mlp": 1.00027561, + "epoch": 0.38713362392905454, + "flos": 23295791111040.0, + "grad_norm": 1.7517226143139228, + "language_loss": 0.75388491, + "learning_rate": 2.804723413431326e-06, + "loss": 0.77273011, + "num_input_tokens_seen": 138241720, + "step": 6439, + "time_per_iteration": 2.797830104827881 + }, + { + "auxiliary_loss_clip": 0.01131277, + "auxiliary_loss_mlp": 0.01037901, + "balance_loss_clip": 1.04915833, + "balance_loss_mlp": 1.0235002, + "epoch": 0.38719374718172256, + "flos": 21031192298880.0, + "grad_norm": 1.7565856090832077, + "language_loss": 0.74071443, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.76240611, + "num_input_tokens_seen": 138261885, + "step": 6440, + "time_per_iteration": 4.2160422801971436 + }, + { + "auxiliary_loss_clip": 0.01125111, + "auxiliary_loss_mlp": 0.01034995, + "balance_loss_clip": 1.04927301, + "balance_loss_mlp": 1.01949763, + "epoch": 0.3872538704343905, + "flos": 19609776322560.0, + "grad_norm": 2.101028456947384, + "language_loss": 0.82017142, + "learning_rate": 2.804010263051774e-06, + "loss": 0.84177244, + "num_input_tokens_seen": 138280255, + "step": 6441, + "time_per_iteration": 4.199851036071777 + }, + { + "auxiliary_loss_clip": 0.0113476, + "auxiliary_loss_mlp": 0.01039285, + "balance_loss_clip": 1.05011272, + "balance_loss_mlp": 1.02490842, + "epoch": 0.3873139936870585, + "flos": 17530045833600.0, + "grad_norm": 2.8802239922493147, + "language_loss": 0.80824792, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.82998842, + "num_input_tokens_seen": 138296675, + "step": 6442, + "time_per_iteration": 2.6942524909973145 + }, + { + "auxiliary_loss_clip": 0.01090073, + "auxiliary_loss_mlp": 0.01032275, + "balance_loss_clip": 1.04431343, + "balance_loss_mlp": 1.01747537, + "epoch": 0.38737411693972645, + "flos": 17786555832960.0, + "grad_norm": 2.1593394156288044, + "language_loss": 0.84054118, + "learning_rate": 2.803296990719624e-06, + "loss": 0.86176467, + "num_input_tokens_seen": 138314985, + "step": 6443, + "time_per_iteration": 2.6660094261169434 + }, + { + "auxiliary_loss_clip": 0.01033878, + "auxiliary_loss_mlp": 0.01000185, + "balance_loss_clip": 1.02513885, + "balance_loss_mlp": 0.99879646, + "epoch": 0.3874342401923944, + "flos": 58304637048960.0, + "grad_norm": 0.7605185654135588, + "language_loss": 0.50208193, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.52242255, + "num_input_tokens_seen": 138373275, + "step": 6444, + "time_per_iteration": 4.807433128356934 + }, + { + "auxiliary_loss_clip": 0.01086333, + "auxiliary_loss_mlp": 0.00773648, + "balance_loss_clip": 1.04187298, + "balance_loss_mlp": 1.00033963, + "epoch": 0.3874943634450624, + "flos": 17711933328000.0, + "grad_norm": 1.4666177781563792, + "language_loss": 0.78874767, + "learning_rate": 2.802583596543065e-06, + "loss": 0.80734754, + "num_input_tokens_seen": 138391145, + "step": 6445, + "time_per_iteration": 2.689142942428589 + }, + { + "auxiliary_loss_clip": 0.0111426, + "auxiliary_loss_mlp": 0.0103959, + "balance_loss_clip": 1.04754841, + "balance_loss_mlp": 1.02445602, + "epoch": 0.38755448669773035, + "flos": 19244852098560.0, + "grad_norm": 2.4274750437973958, + "language_loss": 0.81207073, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.83360916, + "num_input_tokens_seen": 138409875, + "step": 6446, + "time_per_iteration": 2.6582860946655273 + }, + { + "auxiliary_loss_clip": 0.01107394, + "auxiliary_loss_mlp": 0.01037275, + "balance_loss_clip": 1.04530001, + "balance_loss_mlp": 1.02277923, + "epoch": 0.3876146099503983, + "flos": 20594267262720.0, + "grad_norm": 3.0137556994939887, + "language_loss": 0.77366996, + "learning_rate": 2.801870080630306e-06, + "loss": 0.79511666, + "num_input_tokens_seen": 138428965, + "step": 6447, + "time_per_iteration": 2.727285146713257 + }, + { + "auxiliary_loss_clip": 0.01108854, + "auxiliary_loss_mlp": 0.01037807, + "balance_loss_clip": 1.04590762, + "balance_loss_mlp": 1.02378821, + "epoch": 0.3876747332030663, + "flos": 19281121856640.0, + "grad_norm": 2.4450461903172562, + "language_loss": 0.76364803, + "learning_rate": 2.801513277056671e-06, + "loss": 0.78511459, + "num_input_tokens_seen": 138448090, + "step": 6448, + "time_per_iteration": 2.663989543914795 + }, + { + "auxiliary_loss_clip": 0.01102873, + "auxiliary_loss_mlp": 0.01038866, + "balance_loss_clip": 1.04449654, + "balance_loss_mlp": 1.02322626, + "epoch": 0.38773485645573424, + "flos": 18945895201920.0, + "grad_norm": 1.6490971101368535, + "language_loss": 0.76146352, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.7828809, + "num_input_tokens_seen": 138466105, + "step": 6449, + "time_per_iteration": 2.806537628173828 + }, + { + "auxiliary_loss_clip": 0.01098531, + "auxiliary_loss_mlp": 0.00772575, + "balance_loss_clip": 1.04406381, + "balance_loss_mlp": 1.00027394, + "epoch": 0.3877949797084022, + "flos": 23071348978560.0, + "grad_norm": 2.0995234377866985, + "language_loss": 0.78572172, + "learning_rate": 2.800799578742542e-06, + "loss": 0.80443275, + "num_input_tokens_seen": 138485160, + "step": 6450, + "time_per_iteration": 2.7541351318359375 + }, + { + "auxiliary_loss_clip": 0.01137663, + "auxiliary_loss_mlp": 0.01039948, + "balance_loss_clip": 1.04827702, + "balance_loss_mlp": 1.02452803, + "epoch": 0.3878551029610702, + "flos": 29095543589760.0, + "grad_norm": 2.5655640440870946, + "language_loss": 0.78046334, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.80223942, + "num_input_tokens_seen": 138504135, + "step": 6451, + "time_per_iteration": 2.6868700981140137 + }, + { + "auxiliary_loss_clip": 0.01126689, + "auxiliary_loss_mlp": 0.01031637, + "balance_loss_clip": 1.04576159, + "balance_loss_mlp": 1.01696229, + "epoch": 0.38791522621373814, + "flos": 20996394998400.0, + "grad_norm": 2.633183178462793, + "language_loss": 0.76404589, + "learning_rate": 2.800085758962812e-06, + "loss": 0.78562915, + "num_input_tokens_seen": 138523955, + "step": 6452, + "time_per_iteration": 2.708750009536743 + }, + { + "auxiliary_loss_clip": 0.01103834, + "auxiliary_loss_mlp": 0.01042785, + "balance_loss_clip": 1.04665875, + "balance_loss_mlp": 1.0285815, + "epoch": 0.3879753494664061, + "flos": 15486836497920.0, + "grad_norm": 1.5878969811553463, + "language_loss": 0.79534453, + "learning_rate": 2.799728803557182e-06, + "loss": 0.81681073, + "num_input_tokens_seen": 138541655, + "step": 6453, + "time_per_iteration": 2.7226593494415283 + }, + { + "auxiliary_loss_clip": 0.0112782, + "auxiliary_loss_mlp": 0.0104096, + "balance_loss_clip": 1.04889584, + "balance_loss_mlp": 1.02560616, + "epoch": 0.3880354727190741, + "flos": 22053964158720.0, + "grad_norm": 19.957823861770734, + "language_loss": 0.71643323, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.73812103, + "num_input_tokens_seen": 138560860, + "step": 6454, + "time_per_iteration": 2.7265548706054688 + }, + { + "auxiliary_loss_clip": 0.01137183, + "auxiliary_loss_mlp": 0.01043076, + "balance_loss_clip": 1.04976404, + "balance_loss_mlp": 1.02693522, + "epoch": 0.3880955959717421, + "flos": 20340307128960.0, + "grad_norm": 2.029110970619929, + "language_loss": 0.77489239, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.79669499, + "num_input_tokens_seen": 138580200, + "step": 6455, + "time_per_iteration": 2.7688205242156982 + }, + { + "auxiliary_loss_clip": 0.01131496, + "auxiliary_loss_mlp": 0.01043781, + "balance_loss_clip": 1.04975748, + "balance_loss_mlp": 1.02897501, + "epoch": 0.38815571922441006, + "flos": 23075407215360.0, + "grad_norm": 1.8133016626985128, + "language_loss": 0.76193333, + "learning_rate": 2.798657755439662e-06, + "loss": 0.78368604, + "num_input_tokens_seen": 138598315, + "step": 6456, + "time_per_iteration": 2.6894283294677734 + }, + { + "auxiliary_loss_clip": 0.01059894, + "auxiliary_loss_mlp": 0.01038862, + "balance_loss_clip": 1.04251969, + "balance_loss_mlp": 1.02365136, + "epoch": 0.388215842477078, + "flos": 20776944856320.0, + "grad_norm": 9.416859416659493, + "language_loss": 0.59422505, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.61521268, + "num_input_tokens_seen": 138615695, + "step": 6457, + "time_per_iteration": 2.8189444541931152 + }, + { + "auxiliary_loss_clip": 0.01136561, + "auxiliary_loss_mlp": 0.01039144, + "balance_loss_clip": 1.04989612, + "balance_loss_mlp": 1.02262187, + "epoch": 0.388275965729746, + "flos": 20448182649600.0, + "grad_norm": 2.336997181985419, + "language_loss": 0.79927063, + "learning_rate": 2.797943571912841e-06, + "loss": 0.82102776, + "num_input_tokens_seen": 138633180, + "step": 6458, + "time_per_iteration": 2.66198992729187 + }, + { + "auxiliary_loss_clip": 0.01081764, + "auxiliary_loss_mlp": 0.0104529, + "balance_loss_clip": 1.04428816, + "balance_loss_mlp": 1.02855277, + "epoch": 0.38833608898241395, + "flos": 27892392606720.0, + "grad_norm": 2.218973373394608, + "language_loss": 0.81497735, + "learning_rate": 2.797586434755509e-06, + "loss": 0.83624792, + "num_input_tokens_seen": 138654785, + "step": 6459, + "time_per_iteration": 2.780120611190796 + }, + { + "auxiliary_loss_clip": 0.01105714, + "auxiliary_loss_mlp": 0.01037251, + "balance_loss_clip": 1.04633725, + "balance_loss_mlp": 1.0236907, + "epoch": 0.3883962122350819, + "flos": 18076390675200.0, + "grad_norm": 1.942341955564712, + "language_loss": 0.62001127, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.64144087, + "num_input_tokens_seen": 138673330, + "step": 6460, + "time_per_iteration": 2.625399112701416 + }, + { + "auxiliary_loss_clip": 0.01120569, + "auxiliary_loss_mlp": 0.01032391, + "balance_loss_clip": 1.04955411, + "balance_loss_mlp": 1.01920033, + "epoch": 0.3884563354877499, + "flos": 23622254847360.0, + "grad_norm": 1.928823237011181, + "language_loss": 0.86226058, + "learning_rate": 2.796872069720717e-06, + "loss": 0.88379019, + "num_input_tokens_seen": 138694185, + "step": 6461, + "time_per_iteration": 2.6901583671569824 + }, + { + "auxiliary_loss_clip": 0.0111976, + "auxiliary_loss_mlp": 0.01038779, + "balance_loss_clip": 1.04810238, + "balance_loss_mlp": 1.0244205, + "epoch": 0.38851645874041785, + "flos": 27453528236160.0, + "grad_norm": 4.963760229824091, + "language_loss": 0.70659202, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.72817743, + "num_input_tokens_seen": 138714625, + "step": 6462, + "time_per_iteration": 2.7463371753692627 + }, + { + "auxiliary_loss_clip": 0.01086013, + "auxiliary_loss_mlp": 0.01043745, + "balance_loss_clip": 1.04045033, + "balance_loss_mlp": 1.02786636, + "epoch": 0.3885765819930858, + "flos": 25228072270080.0, + "grad_norm": 2.747031306466439, + "language_loss": 0.76228201, + "learning_rate": 2.796157583816052e-06, + "loss": 0.78357965, + "num_input_tokens_seen": 138733585, + "step": 6463, + "time_per_iteration": 2.7231578826904297 + }, + { + "auxiliary_loss_clip": 0.01103201, + "auxiliary_loss_mlp": 0.0104459, + "balance_loss_clip": 1.05013013, + "balance_loss_mlp": 1.02841353, + "epoch": 0.3886367052457538, + "flos": 16946605221120.0, + "grad_norm": 3.605418601568306, + "language_loss": 0.70244539, + "learning_rate": 2.795800295571382e-06, + "loss": 0.72392333, + "num_input_tokens_seen": 138752335, + "step": 6464, + "time_per_iteration": 2.773066759109497 + }, + { + "auxiliary_loss_clip": 0.01110861, + "auxiliary_loss_mlp": 0.01037039, + "balance_loss_clip": 1.04950786, + "balance_loss_mlp": 1.02211452, + "epoch": 0.38869682849842174, + "flos": 27154140376320.0, + "grad_norm": 2.8184770761764777, + "language_loss": 0.69632983, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.71780872, + "num_input_tokens_seen": 138768450, + "step": 6465, + "time_per_iteration": 2.7013487815856934 + }, + { + "auxiliary_loss_clip": 0.01097351, + "auxiliary_loss_mlp": 0.01041748, + "balance_loss_clip": 1.04837847, + "balance_loss_mlp": 1.02645373, + "epoch": 0.3887569517510897, + "flos": 21063619301760.0, + "grad_norm": 2.665243237814177, + "language_loss": 0.78489739, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.80628836, + "num_input_tokens_seen": 138786775, + "step": 6466, + "time_per_iteration": 2.736819267272949 + }, + { + "auxiliary_loss_clip": 0.01095374, + "auxiliary_loss_mlp": 0.01037568, + "balance_loss_clip": 1.0463171, + "balance_loss_mlp": 1.02242851, + "epoch": 0.38881707500375773, + "flos": 29497384016640.0, + "grad_norm": 1.6522613533538497, + "language_loss": 0.69341898, + "learning_rate": 2.794728249830611e-06, + "loss": 0.71474838, + "num_input_tokens_seen": 138810100, + "step": 6467, + "time_per_iteration": 2.778083324432373 + }, + { + "auxiliary_loss_clip": 0.01098114, + "auxiliary_loss_mlp": 0.01048152, + "balance_loss_clip": 1.04706931, + "balance_loss_mlp": 1.0326246, + "epoch": 0.3888771982564257, + "flos": 17488281294720.0, + "grad_norm": 3.2276382920067817, + "language_loss": 0.84199375, + "learning_rate": 2.794370840959936e-06, + "loss": 0.86345637, + "num_input_tokens_seen": 138825140, + "step": 6468, + "time_per_iteration": 2.6842098236083984 + }, + { + "auxiliary_loss_clip": 0.01108569, + "auxiliary_loss_mlp": 0.01036235, + "balance_loss_clip": 1.048172, + "balance_loss_mlp": 1.0227766, + "epoch": 0.38893732150909366, + "flos": 21942425450880.0, + "grad_norm": 1.8219377355536144, + "language_loss": 0.84232908, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.86377716, + "num_input_tokens_seen": 138844115, + "step": 6469, + "time_per_iteration": 2.7538135051727295 + }, + { + "auxiliary_loss_clip": 0.0109067, + "auxiliary_loss_mlp": 0.01048288, + "balance_loss_clip": 1.04416847, + "balance_loss_mlp": 1.03205132, + "epoch": 0.3889974447617616, + "flos": 24276367468800.0, + "grad_norm": 2.339210402911935, + "language_loss": 0.75173676, + "learning_rate": 2.793655932864273e-06, + "loss": 0.7731263, + "num_input_tokens_seen": 138860860, + "step": 6470, + "time_per_iteration": 2.7425949573516846 + }, + { + "auxiliary_loss_clip": 0.01095528, + "auxiliary_loss_mlp": 0.00772188, + "balance_loss_clip": 1.0480423, + "balance_loss_mlp": 1.00016475, + "epoch": 0.3890575680144296, + "flos": 25667116208640.0, + "grad_norm": 1.5943716760052937, + "language_loss": 0.74977577, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.76845288, + "num_input_tokens_seen": 138881910, + "step": 6471, + "time_per_iteration": 2.8880369663238525 + }, + { + "auxiliary_loss_clip": 0.01077518, + "auxiliary_loss_mlp": 0.01049277, + "balance_loss_clip": 1.03879571, + "balance_loss_mlp": 1.03336215, + "epoch": 0.38911769126709755, + "flos": 22855274714880.0, + "grad_norm": 2.421548050110463, + "language_loss": 0.67984551, + "learning_rate": 2.792940904386562e-06, + "loss": 0.70111346, + "num_input_tokens_seen": 138900975, + "step": 6472, + "time_per_iteration": 2.7776875495910645 + }, + { + "auxiliary_loss_clip": 0.01103596, + "auxiliary_loss_mlp": 0.01043152, + "balance_loss_clip": 1.04819107, + "balance_loss_mlp": 1.02974129, + "epoch": 0.3891778145197655, + "flos": 25447522412160.0, + "grad_norm": 1.8102352941433608, + "language_loss": 0.76068687, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.78215432, + "num_input_tokens_seen": 138920795, + "step": 6473, + "time_per_iteration": 2.7568469047546387 + }, + { + "auxiliary_loss_clip": 0.01113975, + "auxiliary_loss_mlp": 0.01046096, + "balance_loss_clip": 1.05217087, + "balance_loss_mlp": 1.03031242, + "epoch": 0.3892379377724335, + "flos": 14027965614720.0, + "grad_norm": 2.045216735434868, + "language_loss": 0.70959115, + "learning_rate": 2.792225755635257e-06, + "loss": 0.73119187, + "num_input_tokens_seen": 138938770, + "step": 6474, + "time_per_iteration": 2.6930696964263916 + }, + { + "auxiliary_loss_clip": 0.01135028, + "auxiliary_loss_mlp": 0.01042055, + "balance_loss_clip": 1.05145836, + "balance_loss_mlp": 1.02861369, + "epoch": 0.38929806102510145, + "flos": 20157449967360.0, + "grad_norm": 1.5519949793695216, + "language_loss": 0.69049072, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.71226156, + "num_input_tokens_seen": 138958880, + "step": 6475, + "time_per_iteration": 2.670830011367798 + }, + { + "auxiliary_loss_clip": 0.01110637, + "auxiliary_loss_mlp": 0.01057592, + "balance_loss_clip": 1.04578567, + "balance_loss_mlp": 1.03981757, + "epoch": 0.3893581842777694, + "flos": 22163958581760.0, + "grad_norm": 1.9596553320764234, + "language_loss": 0.75820196, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.77988434, + "num_input_tokens_seen": 138977240, + "step": 6476, + "time_per_iteration": 2.683980941772461 + }, + { + "auxiliary_loss_clip": 0.01039888, + "auxiliary_loss_mlp": 0.01002183, + "balance_loss_clip": 1.02862918, + "balance_loss_mlp": 1.00084782, + "epoch": 0.3894183075304374, + "flos": 67301877392640.0, + "grad_norm": 0.7759740468574157, + "language_loss": 0.58146399, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.60188472, + "num_input_tokens_seen": 139039035, + "step": 6477, + "time_per_iteration": 3.2430496215820312 + }, + { + "auxiliary_loss_clip": 0.01092497, + "auxiliary_loss_mlp": 0.01040603, + "balance_loss_clip": 1.04780793, + "balance_loss_mlp": 1.02428961, + "epoch": 0.38947843078310534, + "flos": 18547502480640.0, + "grad_norm": 1.9073891309950948, + "language_loss": 0.78554142, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.80687243, + "num_input_tokens_seen": 139055560, + "step": 6478, + "time_per_iteration": 4.241156339645386 + }, + { + "auxiliary_loss_clip": 0.01116081, + "auxiliary_loss_mlp": 0.0103975, + "balance_loss_clip": 1.04505491, + "balance_loss_mlp": 1.02545047, + "epoch": 0.3895385540357733, + "flos": 14605875532800.0, + "grad_norm": 2.6992371438810783, + "language_loss": 0.82647753, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.84803581, + "num_input_tokens_seen": 139071865, + "step": 6479, + "time_per_iteration": 4.1569294929504395 + }, + { + "auxiliary_loss_clip": 0.01131381, + "auxiliary_loss_mlp": 0.01036344, + "balance_loss_clip": 1.04886651, + "balance_loss_mlp": 1.02161551, + "epoch": 0.38959867728844133, + "flos": 19975203336960.0, + "grad_norm": 2.334048099077096, + "language_loss": 0.79657412, + "learning_rate": 2.790079588824617e-06, + "loss": 0.81825137, + "num_input_tokens_seen": 139089640, + "step": 6480, + "time_per_iteration": 4.170635938644409 + }, + { + "auxiliary_loss_clip": 0.0110471, + "auxiliary_loss_mlp": 0.01032466, + "balance_loss_clip": 1.04561472, + "balance_loss_mlp": 1.01822066, + "epoch": 0.3896588005411093, + "flos": 22672130244480.0, + "grad_norm": 6.364109786330533, + "language_loss": 0.83021134, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.85158312, + "num_input_tokens_seen": 139109365, + "step": 6481, + "time_per_iteration": 2.638821840286255 + }, + { + "auxiliary_loss_clip": 0.01102815, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.04740214, + "balance_loss_mlp": 1.02503228, + "epoch": 0.38971892379377726, + "flos": 20996035862400.0, + "grad_norm": 1.7002276765936415, + "language_loss": 0.75389051, + "learning_rate": 2.789363960063863e-06, + "loss": 0.77530706, + "num_input_tokens_seen": 139128260, + "step": 6482, + "time_per_iteration": 2.5737624168395996 + }, + { + "auxiliary_loss_clip": 0.01100553, + "auxiliary_loss_mlp": 0.01035815, + "balance_loss_clip": 1.04781246, + "balance_loss_mlp": 1.02164662, + "epoch": 0.3897790470464452, + "flos": 22528487756160.0, + "grad_norm": 2.0703094316503554, + "language_loss": 0.78786725, + "learning_rate": 2.78900610077756e-06, + "loss": 0.80923092, + "num_input_tokens_seen": 139147315, + "step": 6483, + "time_per_iteration": 2.6177117824554443 + }, + { + "auxiliary_loss_clip": 0.01121516, + "auxiliary_loss_mlp": 0.01030702, + "balance_loss_clip": 1.04790664, + "balance_loss_mlp": 1.01487088, + "epoch": 0.3898391702991132, + "flos": 26209905603840.0, + "grad_norm": 1.6677367088018817, + "language_loss": 0.79871929, + "learning_rate": 2.788648211572067e-06, + "loss": 0.82024151, + "num_input_tokens_seen": 139167270, + "step": 6484, + "time_per_iteration": 4.221461534500122 + }, + { + "auxiliary_loss_clip": 0.01119394, + "auxiliary_loss_mlp": 0.01051487, + "balance_loss_clip": 1.05063844, + "balance_loss_mlp": 1.03472662, + "epoch": 0.38989929355178116, + "flos": 21065558636160.0, + "grad_norm": 2.1008000508061104, + "language_loss": 0.77901775, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.80072653, + "num_input_tokens_seen": 139185970, + "step": 6485, + "time_per_iteration": 2.664097785949707 + }, + { + "auxiliary_loss_clip": 0.01085813, + "auxiliary_loss_mlp": 0.01036912, + "balance_loss_clip": 1.0427084, + "balance_loss_mlp": 1.02207613, + "epoch": 0.3899594168044491, + "flos": 25484115392640.0, + "grad_norm": 6.223818029706007, + "language_loss": 0.85190272, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.87312996, + "num_input_tokens_seen": 139203730, + "step": 6486, + "time_per_iteration": 2.8325467109680176 + }, + { + "auxiliary_loss_clip": 0.01111569, + "auxiliary_loss_mlp": 0.01033392, + "balance_loss_clip": 1.04786611, + "balance_loss_mlp": 1.01883638, + "epoch": 0.3900195400571171, + "flos": 31139363456640.0, + "grad_norm": 2.4250185390770618, + "language_loss": 0.85333234, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.87478197, + "num_input_tokens_seen": 139222560, + "step": 6487, + "time_per_iteration": 2.8390486240386963 + }, + { + "auxiliary_loss_clip": 0.01103222, + "auxiliary_loss_mlp": 0.01032994, + "balance_loss_clip": 1.04449213, + "balance_loss_mlp": 1.01793766, + "epoch": 0.39007966330978505, + "flos": 20229917656320.0, + "grad_norm": 1.5390409302603854, + "language_loss": 0.72954559, + "learning_rate": 2.787216355829633e-06, + "loss": 0.75090778, + "num_input_tokens_seen": 139242165, + "step": 6488, + "time_per_iteration": 2.7613236904144287 + }, + { + "auxiliary_loss_clip": 0.01096805, + "auxiliary_loss_mlp": 0.01044873, + "balance_loss_clip": 1.04673266, + "balance_loss_mlp": 1.02771914, + "epoch": 0.390139786562453, + "flos": 22528739151360.0, + "grad_norm": 2.6420160637986383, + "language_loss": 0.68467176, + "learning_rate": 2.786858317231779e-06, + "loss": 0.70608854, + "num_input_tokens_seen": 139262525, + "step": 6489, + "time_per_iteration": 2.746307849884033 + }, + { + "auxiliary_loss_clip": 0.01108111, + "auxiliary_loss_mlp": 0.01041602, + "balance_loss_clip": 1.04793715, + "balance_loss_mlp": 1.02673674, + "epoch": 0.390199909815121, + "flos": 26432911192320.0, + "grad_norm": 1.6912118236512272, + "language_loss": 0.80629271, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.82778984, + "num_input_tokens_seen": 139282835, + "step": 6490, + "time_per_iteration": 2.7116847038269043 + }, + { + "auxiliary_loss_clip": 0.01124963, + "auxiliary_loss_mlp": 0.01033045, + "balance_loss_clip": 1.04856181, + "balance_loss_mlp": 1.0187161, + "epoch": 0.39026003306778895, + "flos": 17274577328640.0, + "grad_norm": 3.073568327903315, + "language_loss": 0.89115125, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.91273135, + "num_input_tokens_seen": 139299490, + "step": 6491, + "time_per_iteration": 2.6211190223693848 + }, + { + "auxiliary_loss_clip": 0.01092029, + "auxiliary_loss_mlp": 0.01045074, + "balance_loss_clip": 1.04406416, + "balance_loss_mlp": 1.02952874, + "epoch": 0.3903201563204569, + "flos": 24532841554560.0, + "grad_norm": 1.8064559635296407, + "language_loss": 0.78637981, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.80775088, + "num_input_tokens_seen": 139317865, + "step": 6492, + "time_per_iteration": 2.7505667209625244 + }, + { + "auxiliary_loss_clip": 0.01108778, + "auxiliary_loss_mlp": 0.01041967, + "balance_loss_clip": 1.04486537, + "balance_loss_mlp": 1.02735257, + "epoch": 0.39038027957312493, + "flos": 23767944410880.0, + "grad_norm": 1.7227367696506604, + "language_loss": 0.74431908, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.76582652, + "num_input_tokens_seen": 139339840, + "step": 6493, + "time_per_iteration": 2.7200233936309814 + }, + { + "auxiliary_loss_clip": 0.01091358, + "auxiliary_loss_mlp": 0.01040258, + "balance_loss_clip": 1.04613161, + "balance_loss_mlp": 1.02549398, + "epoch": 0.3904404028257929, + "flos": 14100612871680.0, + "grad_norm": 2.9656676182999395, + "language_loss": 0.7637316, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.78504777, + "num_input_tokens_seen": 139357555, + "step": 6494, + "time_per_iteration": 2.6818442344665527 + }, + { + "auxiliary_loss_clip": 0.01131498, + "auxiliary_loss_mlp": 0.01048378, + "balance_loss_clip": 1.0500524, + "balance_loss_mlp": 1.03182006, + "epoch": 0.39050052607846086, + "flos": 16910048154240.0, + "grad_norm": 2.1152980497113782, + "language_loss": 0.74208486, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.76388359, + "num_input_tokens_seen": 139374455, + "step": 6495, + "time_per_iteration": 2.6432337760925293 + }, + { + "auxiliary_loss_clip": 0.01137243, + "auxiliary_loss_mlp": 0.01045454, + "balance_loss_clip": 1.05153751, + "balance_loss_mlp": 1.02913451, + "epoch": 0.39056064933112883, + "flos": 25915761129600.0, + "grad_norm": 2.402575660392066, + "language_loss": 0.67757058, + "learning_rate": 2.784351212350352e-06, + "loss": 0.69939756, + "num_input_tokens_seen": 139394770, + "step": 6496, + "time_per_iteration": 2.762009859085083 + }, + { + "auxiliary_loss_clip": 0.01023856, + "auxiliary_loss_mlp": 0.01010625, + "balance_loss_clip": 1.02393842, + "balance_loss_mlp": 1.00925446, + "epoch": 0.3906207725837968, + "flos": 60028421713920.0, + "grad_norm": 0.6655460592599327, + "language_loss": 0.53920811, + "learning_rate": 2.783992935430775e-06, + "loss": 0.55955297, + "num_input_tokens_seen": 139454760, + "step": 6497, + "time_per_iteration": 3.351006507873535 + }, + { + "auxiliary_loss_clip": 0.01094838, + "auxiliary_loss_mlp": 0.00772151, + "balance_loss_clip": 1.0476501, + "balance_loss_mlp": 1.00038421, + "epoch": 0.39068089583646476, + "flos": 21068683119360.0, + "grad_norm": 2.7558428999232847, + "language_loss": 0.6865977, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.70526755, + "num_input_tokens_seen": 139472645, + "step": 6498, + "time_per_iteration": 2.7838692665100098 + }, + { + "auxiliary_loss_clip": 0.01022021, + "auxiliary_loss_mlp": 0.01009741, + "balance_loss_clip": 1.02064919, + "balance_loss_mlp": 1.00839996, + "epoch": 0.3907410190891327, + "flos": 70445677403520.0, + "grad_norm": 0.7248596102007157, + "language_loss": 0.51767612, + "learning_rate": 2.783276292417936e-06, + "loss": 0.53799379, + "num_input_tokens_seen": 139536730, + "step": 6499, + "time_per_iteration": 3.2980377674102783 + }, + { + "auxiliary_loss_clip": 0.01122618, + "auxiliary_loss_mlp": 0.01044387, + "balance_loss_clip": 1.04676056, + "balance_loss_mlp": 1.02793658, + "epoch": 0.3908011423418007, + "flos": 27962454084480.0, + "grad_norm": 1.973185164339423, + "language_loss": 0.73842579, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.76009583, + "num_input_tokens_seen": 139557540, + "step": 6500, + "time_per_iteration": 2.7239198684692383 + }, + { + "auxiliary_loss_clip": 0.01125366, + "auxiliary_loss_mlp": 0.01037256, + "balance_loss_clip": 1.05035591, + "balance_loss_mlp": 1.02246249, + "epoch": 0.39086126559446865, + "flos": 24462097718400.0, + "grad_norm": 2.6021512056662814, + "language_loss": 0.68837166, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.70999795, + "num_input_tokens_seen": 139576875, + "step": 6501, + "time_per_iteration": 2.6926429271698 + }, + { + "auxiliary_loss_clip": 0.01122637, + "auxiliary_loss_mlp": 0.0103859, + "balance_loss_clip": 1.04832482, + "balance_loss_mlp": 1.02442181, + "epoch": 0.3909213888471366, + "flos": 16941541403520.0, + "grad_norm": 2.1384909443348246, + "language_loss": 0.78875881, + "learning_rate": 2.782201105168287e-06, + "loss": 0.8103711, + "num_input_tokens_seen": 139594295, + "step": 6502, + "time_per_iteration": 2.647021770477295 + }, + { + "auxiliary_loss_clip": 0.01109811, + "auxiliary_loss_mlp": 0.01035328, + "balance_loss_clip": 1.04876852, + "balance_loss_mlp": 1.02171457, + "epoch": 0.3909815120998046, + "flos": 29278400751360.0, + "grad_norm": 3.671996146003432, + "language_loss": 0.80537987, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.82683128, + "num_input_tokens_seen": 139614080, + "step": 6503, + "time_per_iteration": 2.7318384647369385 + }, + { + "auxiliary_loss_clip": 0.0110371, + "auxiliary_loss_mlp": 0.01031248, + "balance_loss_clip": 1.04387689, + "balance_loss_mlp": 1.01760423, + "epoch": 0.39104163535247255, + "flos": 18951246328320.0, + "grad_norm": 1.848076786389183, + "language_loss": 0.71439689, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.7357465, + "num_input_tokens_seen": 139632755, + "step": 6504, + "time_per_iteration": 2.6983554363250732 + }, + { + "auxiliary_loss_clip": 0.01130195, + "auxiliary_loss_mlp": 0.01034576, + "balance_loss_clip": 1.0459981, + "balance_loss_mlp": 1.0199374, + "epoch": 0.3911017586051405, + "flos": 26323347732480.0, + "grad_norm": 1.4848516480735832, + "language_loss": 0.83245611, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.8541038, + "num_input_tokens_seen": 139654205, + "step": 6505, + "time_per_iteration": 2.6663267612457275 + }, + { + "auxiliary_loss_clip": 0.01131259, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.04880178, + "balance_loss_mlp": 1.02123427, + "epoch": 0.3911618818578085, + "flos": 21835770992640.0, + "grad_norm": 1.9330872564568533, + "language_loss": 0.71352887, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.73521107, + "num_input_tokens_seen": 139673595, + "step": 6506, + "time_per_iteration": 2.6168534755706787 + }, + { + "auxiliary_loss_clip": 0.01105925, + "auxiliary_loss_mlp": 0.01036209, + "balance_loss_clip": 1.04536867, + "balance_loss_mlp": 1.02267289, + "epoch": 0.3912220051104765, + "flos": 16359680989440.0, + "grad_norm": 2.106647299507305, + "language_loss": 0.75086504, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.77228636, + "num_input_tokens_seen": 139690565, + "step": 6507, + "time_per_iteration": 2.8207101821899414 + }, + { + "auxiliary_loss_clip": 0.01053146, + "auxiliary_loss_mlp": 0.01002126, + "balance_loss_clip": 1.02403712, + "balance_loss_mlp": 1.00068331, + "epoch": 0.39128212836314447, + "flos": 71050986420480.0, + "grad_norm": 0.9386901837185221, + "language_loss": 0.56488812, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.58544087, + "num_input_tokens_seen": 139749420, + "step": 6508, + "time_per_iteration": 3.3985793590545654 + }, + { + "auxiliary_loss_clip": 0.01121659, + "auxiliary_loss_mlp": 0.01038464, + "balance_loss_clip": 1.05045915, + "balance_loss_mlp": 1.02476096, + "epoch": 0.39134225161581243, + "flos": 20331975173760.0, + "grad_norm": 2.0207920703954936, + "language_loss": 0.76855135, + "learning_rate": 2.779691297413471e-06, + "loss": 0.79015261, + "num_input_tokens_seen": 139766265, + "step": 6509, + "time_per_iteration": 2.6667048931121826 + }, + { + "auxiliary_loss_clip": 0.01101334, + "auxiliary_loss_mlp": 0.01043985, + "balance_loss_clip": 1.04298568, + "balance_loss_mlp": 1.02731967, + "epoch": 0.3914023748684804, + "flos": 17018390551680.0, + "grad_norm": 5.905968065437354, + "language_loss": 0.82739937, + "learning_rate": 2.779332635075825e-06, + "loss": 0.84885252, + "num_input_tokens_seen": 139782400, + "step": 6510, + "time_per_iteration": 2.933931589126587 + }, + { + "auxiliary_loss_clip": 0.0112259, + "auxiliary_loss_mlp": 0.01038677, + "balance_loss_clip": 1.04712081, + "balance_loss_mlp": 1.02406788, + "epoch": 0.39146249812114836, + "flos": 18405224709120.0, + "grad_norm": 5.781106582003857, + "language_loss": 0.76999253, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.79160517, + "num_input_tokens_seen": 139801435, + "step": 6511, + "time_per_iteration": 2.6926233768463135 + }, + { + "auxiliary_loss_clip": 0.01035867, + "auxiliary_loss_mlp": 0.01006458, + "balance_loss_clip": 1.02583003, + "balance_loss_mlp": 1.00515223, + "epoch": 0.3915226213738163, + "flos": 67637355442560.0, + "grad_norm": 0.716551875912138, + "language_loss": 0.57749176, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.59791505, + "num_input_tokens_seen": 139869700, + "step": 6512, + "time_per_iteration": 3.3695731163024902 + }, + { + "auxiliary_loss_clip": 0.01135844, + "auxiliary_loss_mlp": 0.01035397, + "balance_loss_clip": 1.05013657, + "balance_loss_mlp": 1.02001858, + "epoch": 0.3915827446264843, + "flos": 26359330181760.0, + "grad_norm": 1.8014676974175234, + "language_loss": 0.69625974, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.71797216, + "num_input_tokens_seen": 139890140, + "step": 6513, + "time_per_iteration": 2.8037526607513428 + }, + { + "auxiliary_loss_clip": 0.01095461, + "auxiliary_loss_mlp": 0.01038913, + "balance_loss_clip": 1.04791474, + "balance_loss_mlp": 1.02376771, + "epoch": 0.39164286787915226, + "flos": 21943897908480.0, + "grad_norm": 8.577901504868834, + "language_loss": 0.75566119, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.77700496, + "num_input_tokens_seen": 139908020, + "step": 6514, + "time_per_iteration": 2.8419485092163086 + }, + { + "auxiliary_loss_clip": 0.01094835, + "auxiliary_loss_mlp": 0.01040327, + "balance_loss_clip": 1.04639578, + "balance_loss_mlp": 1.02636766, + "epoch": 0.3917029911318202, + "flos": 16399829416320.0, + "grad_norm": 2.188170768945522, + "language_loss": 0.77334291, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.79469454, + "num_input_tokens_seen": 139926180, + "step": 6515, + "time_per_iteration": 2.7894155979156494 + }, + { + "auxiliary_loss_clip": 0.01087017, + "auxiliary_loss_mlp": 0.0105158, + "balance_loss_clip": 1.03979194, + "balance_loss_mlp": 1.03763223, + "epoch": 0.3917631143844882, + "flos": 26211701283840.0, + "grad_norm": 1.5088395946363757, + "language_loss": 0.79678488, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.81817091, + "num_input_tokens_seen": 139947420, + "step": 6516, + "time_per_iteration": 2.902660608291626 + }, + { + "auxiliary_loss_clip": 0.01092649, + "auxiliary_loss_mlp": 0.01042434, + "balance_loss_clip": 1.04691982, + "balance_loss_mlp": 1.02799749, + "epoch": 0.39182323763715615, + "flos": 18548364407040.0, + "grad_norm": 1.9539461980584907, + "language_loss": 0.70539331, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.72674412, + "num_input_tokens_seen": 139965800, + "step": 6517, + "time_per_iteration": 4.275412082672119 + }, + { + "auxiliary_loss_clip": 0.0108795, + "auxiliary_loss_mlp": 0.01045392, + "balance_loss_clip": 1.04107618, + "balance_loss_mlp": 1.03034759, + "epoch": 0.3918833608898241, + "flos": 34313543395200.0, + "grad_norm": 1.7270068216094292, + "language_loss": 0.72215492, + "learning_rate": 2.776462273631956e-06, + "loss": 0.74348831, + "num_input_tokens_seen": 139988140, + "step": 6518, + "time_per_iteration": 4.390907287597656 + }, + { + "auxiliary_loss_clip": 0.01124647, + "auxiliary_loss_mlp": 0.0104138, + "balance_loss_clip": 1.05179489, + "balance_loss_mlp": 1.02679503, + "epoch": 0.3919434841424921, + "flos": 36939582812160.0, + "grad_norm": 1.8265438315676477, + "language_loss": 0.61835045, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.64001071, + "num_input_tokens_seen": 140010060, + "step": 6519, + "time_per_iteration": 4.281017780303955 + }, + { + "auxiliary_loss_clip": 0.01142133, + "auxiliary_loss_mlp": 0.01043415, + "balance_loss_clip": 1.05199361, + "balance_loss_mlp": 1.02807307, + "epoch": 0.3920036073951601, + "flos": 23508956373120.0, + "grad_norm": 2.723028929016538, + "language_loss": 0.67084813, + "learning_rate": 2.775744388563563e-06, + "loss": 0.6927036, + "num_input_tokens_seen": 140029400, + "step": 6520, + "time_per_iteration": 2.6971800327301025 + }, + { + "auxiliary_loss_clip": 0.01130641, + "auxiliary_loss_mlp": 0.01040483, + "balance_loss_clip": 1.04749501, + "balance_loss_mlp": 1.02648759, + "epoch": 0.39206373064782807, + "flos": 18406086635520.0, + "grad_norm": 1.8214273138880266, + "language_loss": 0.78716481, + "learning_rate": 2.775385401898104e-06, + "loss": 0.80887604, + "num_input_tokens_seen": 140048940, + "step": 6521, + "time_per_iteration": 2.69966459274292 + }, + { + "auxiliary_loss_clip": 0.01128458, + "auxiliary_loss_mlp": 0.01040156, + "balance_loss_clip": 1.05050826, + "balance_loss_mlp": 1.02289462, + "epoch": 0.39212385390049603, + "flos": 12313051608960.0, + "grad_norm": 2.9673341059873897, + "language_loss": 0.70119011, + "learning_rate": 2.775026385829952e-06, + "loss": 0.72287625, + "num_input_tokens_seen": 140066380, + "step": 6522, + "time_per_iteration": 2.7100417613983154 + }, + { + "auxiliary_loss_clip": 0.0110971, + "auxiliary_loss_mlp": 0.01035612, + "balance_loss_clip": 1.0467701, + "balance_loss_mlp": 1.02100325, + "epoch": 0.392183977153164, + "flos": 19719160214400.0, + "grad_norm": 2.0481488550445595, + "language_loss": 0.76847959, + "learning_rate": 2.774667340372722e-06, + "loss": 0.78993279, + "num_input_tokens_seen": 140085275, + "step": 6523, + "time_per_iteration": 4.336375713348389 + }, + { + "auxiliary_loss_clip": 0.01111577, + "auxiliary_loss_mlp": 0.01040964, + "balance_loss_clip": 1.04617906, + "balance_loss_mlp": 1.02597904, + "epoch": 0.39224410040583196, + "flos": 33144902403840.0, + "grad_norm": 2.4064695780458254, + "language_loss": 0.62052447, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.64204991, + "num_input_tokens_seen": 140105105, + "step": 6524, + "time_per_iteration": 2.861999750137329 + }, + { + "auxiliary_loss_clip": 0.0113421, + "auxiliary_loss_mlp": 0.01041444, + "balance_loss_clip": 1.04792655, + "balance_loss_mlp": 1.02591681, + "epoch": 0.39230422365849993, + "flos": 27782434097280.0, + "grad_norm": 3.311294983146634, + "language_loss": 0.74027938, + "learning_rate": 2.773949161345489e-06, + "loss": 0.76203597, + "num_input_tokens_seen": 140125645, + "step": 6525, + "time_per_iteration": 2.6660265922546387 + }, + { + "auxiliary_loss_clip": 0.01111123, + "auxiliary_loss_mlp": 0.01038937, + "balance_loss_clip": 1.04621911, + "balance_loss_mlp": 1.02488267, + "epoch": 0.3923643469111679, + "flos": 17931634865280.0, + "grad_norm": 1.9378599466790423, + "language_loss": 0.81101322, + "learning_rate": 2.773590027802719e-06, + "loss": 0.83251387, + "num_input_tokens_seen": 140141925, + "step": 6526, + "time_per_iteration": 2.6949198246002197 + }, + { + "auxiliary_loss_clip": 0.01122115, + "auxiliary_loss_mlp": 0.01043128, + "balance_loss_clip": 1.04750228, + "balance_loss_mlp": 1.02844119, + "epoch": 0.39242447016383586, + "flos": 24059539019520.0, + "grad_norm": 2.21390394508072, + "language_loss": 0.69860446, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.72025692, + "num_input_tokens_seen": 140160965, + "step": 6527, + "time_per_iteration": 2.648738384246826 + }, + { + "auxiliary_loss_clip": 0.01093845, + "auxiliary_loss_mlp": 0.01034532, + "balance_loss_clip": 1.04563034, + "balance_loss_mlp": 1.01990485, + "epoch": 0.3924845934165038, + "flos": 10664069016960.0, + "grad_norm": 2.870547931880311, + "language_loss": 0.82659566, + "learning_rate": 2.772871672726965e-06, + "loss": 0.84787941, + "num_input_tokens_seen": 140177780, + "step": 6528, + "time_per_iteration": 2.7436537742614746 + }, + { + "auxiliary_loss_clip": 0.01105744, + "auxiliary_loss_mlp": 0.01032675, + "balance_loss_clip": 1.04709864, + "balance_loss_mlp": 1.01909113, + "epoch": 0.3925447166691718, + "flos": 31245910174080.0, + "grad_norm": 1.7012894335018593, + "language_loss": 0.68846285, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.70984709, + "num_input_tokens_seen": 140201660, + "step": 6529, + "time_per_iteration": 2.7932794094085693 + }, + { + "auxiliary_loss_clip": 0.01112194, + "auxiliary_loss_mlp": 0.01035865, + "balance_loss_clip": 1.04500198, + "balance_loss_mlp": 1.02043366, + "epoch": 0.39260483992183975, + "flos": 29415040087680.0, + "grad_norm": 2.4176127237752145, + "language_loss": 0.80461496, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.82609558, + "num_input_tokens_seen": 140218585, + "step": 6530, + "time_per_iteration": 2.7094242572784424 + }, + { + "auxiliary_loss_clip": 0.01119536, + "auxiliary_loss_mlp": 0.0104093, + "balance_loss_clip": 1.04586959, + "balance_loss_mlp": 1.0264107, + "epoch": 0.3926649631745077, + "flos": 22857788666880.0, + "grad_norm": 1.6828565274400475, + "language_loss": 0.75680822, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.77841288, + "num_input_tokens_seen": 140239905, + "step": 6531, + "time_per_iteration": 2.7238411903381348 + }, + { + "auxiliary_loss_clip": 0.01058847, + "auxiliary_loss_mlp": 0.01008064, + "balance_loss_clip": 1.03009987, + "balance_loss_mlp": 1.00663972, + "epoch": 0.3927250864271757, + "flos": 63893881872000.0, + "grad_norm": 0.8211432271778524, + "language_loss": 0.60317427, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.62384337, + "num_input_tokens_seen": 140293820, + "step": 6532, + "time_per_iteration": 3.047954797744751 + }, + { + "auxiliary_loss_clip": 0.01037233, + "auxiliary_loss_mlp": 0.01004719, + "balance_loss_clip": 1.02873898, + "balance_loss_mlp": 1.00334251, + "epoch": 0.3927852096798437, + "flos": 68909741890560.0, + "grad_norm": 0.7803139799058858, + "language_loss": 0.55459583, + "learning_rate": 2.771075272396981e-06, + "loss": 0.57501537, + "num_input_tokens_seen": 140360420, + "step": 6533, + "time_per_iteration": 3.306561231613159 + }, + { + "auxiliary_loss_clip": 0.01112553, + "auxiliary_loss_mlp": 0.01040733, + "balance_loss_clip": 1.04983759, + "balance_loss_mlp": 1.02614141, + "epoch": 0.39284533293251167, + "flos": 29715972232320.0, + "grad_norm": 2.2467248181922232, + "language_loss": 0.75955313, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.78108597, + "num_input_tokens_seen": 140381950, + "step": 6534, + "time_per_iteration": 2.7788329124450684 + }, + { + "auxiliary_loss_clip": 0.0112134, + "auxiliary_loss_mlp": 0.01045716, + "balance_loss_clip": 1.04698312, + "balance_loss_mlp": 1.02866912, + "epoch": 0.39290545618517964, + "flos": 18552027594240.0, + "grad_norm": 2.2080736338994944, + "language_loss": 0.78123498, + "learning_rate": 2.770356507494851e-06, + "loss": 0.80290556, + "num_input_tokens_seen": 140399410, + "step": 6535, + "time_per_iteration": 2.6949005126953125 + }, + { + "auxiliary_loss_clip": 0.0109337, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.04779291, + "balance_loss_mlp": 1.01950169, + "epoch": 0.3929655794378476, + "flos": 26249479413120.0, + "grad_norm": 1.9769476518607105, + "language_loss": 0.686719, + "learning_rate": 2.769997081218978e-06, + "loss": 0.7079792, + "num_input_tokens_seen": 140419055, + "step": 6536, + "time_per_iteration": 2.7684245109558105 + }, + { + "auxiliary_loss_clip": 0.01104946, + "auxiliary_loss_mlp": 0.01037851, + "balance_loss_clip": 1.04767156, + "balance_loss_mlp": 1.02469027, + "epoch": 0.39302570269051557, + "flos": 29277933874560.0, + "grad_norm": 1.8012856746153256, + "language_loss": 0.69048655, + "learning_rate": 2.769637625744738e-06, + "loss": 0.71191454, + "num_input_tokens_seen": 140438800, + "step": 6537, + "time_per_iteration": 2.7638440132141113 + }, + { + "auxiliary_loss_clip": 0.01122897, + "auxiliary_loss_mlp": 0.01040751, + "balance_loss_clip": 1.05155134, + "balance_loss_mlp": 1.02624357, + "epoch": 0.39308582594318353, + "flos": 17347440067200.0, + "grad_norm": 1.7514361880438423, + "language_loss": 0.78990901, + "learning_rate": 2.769278141085763e-06, + "loss": 0.81154549, + "num_input_tokens_seen": 140456880, + "step": 6538, + "time_per_iteration": 2.635075807571411 + }, + { + "auxiliary_loss_clip": 0.01003397, + "auxiliary_loss_mlp": 0.01017351, + "balance_loss_clip": 1.02259159, + "balance_loss_mlp": 1.01596797, + "epoch": 0.3931459491958515, + "flos": 61007094650880.0, + "grad_norm": 0.8098068956453415, + "language_loss": 0.6190061, + "learning_rate": 2.768918627255683e-06, + "loss": 0.63921356, + "num_input_tokens_seen": 140507510, + "step": 6539, + "time_per_iteration": 3.0673203468322754 + }, + { + "auxiliary_loss_clip": 0.01104217, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.04730296, + "balance_loss_mlp": 1.0206002, + "epoch": 0.39320607244851946, + "flos": 39016009249920.0, + "grad_norm": 3.0347619755245248, + "language_loss": 0.68405032, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.70544618, + "num_input_tokens_seen": 140528740, + "step": 6540, + "time_per_iteration": 2.7993643283843994 + }, + { + "auxiliary_loss_clip": 0.01105128, + "auxiliary_loss_mlp": 0.01030736, + "balance_loss_clip": 1.04439306, + "balance_loss_mlp": 1.01638293, + "epoch": 0.3932661957011874, + "flos": 24679752180480.0, + "grad_norm": 1.8325322608278536, + "language_loss": 0.7276125, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.74897116, + "num_input_tokens_seen": 140547560, + "step": 6541, + "time_per_iteration": 2.659224510192871 + }, + { + "auxiliary_loss_clip": 0.01054751, + "auxiliary_loss_mlp": 0.01009472, + "balance_loss_clip": 1.02648139, + "balance_loss_mlp": 1.0080775, + "epoch": 0.3933263189538554, + "flos": 70096552185600.0, + "grad_norm": 0.8313029932067456, + "language_loss": 0.60319722, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.6238395, + "num_input_tokens_seen": 140601175, + "step": 6542, + "time_per_iteration": 2.968062400817871 + }, + { + "auxiliary_loss_clip": 0.01121623, + "auxiliary_loss_mlp": 0.01038302, + "balance_loss_clip": 1.04764903, + "balance_loss_mlp": 1.0243547, + "epoch": 0.39338644220652336, + "flos": 22929071207040.0, + "grad_norm": 1.6209695943494522, + "language_loss": 0.82034504, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.84194422, + "num_input_tokens_seen": 140622200, + "step": 6543, + "time_per_iteration": 2.638796806335449 + }, + { + "auxiliary_loss_clip": 0.01103923, + "auxiliary_loss_mlp": 0.01034902, + "balance_loss_clip": 1.04355097, + "balance_loss_mlp": 1.02045417, + "epoch": 0.3934465654591913, + "flos": 30848163897600.0, + "grad_norm": 3.743075543188527, + "language_loss": 0.69100285, + "learning_rate": 2.767120621015908e-06, + "loss": 0.71239114, + "num_input_tokens_seen": 140643125, + "step": 6544, + "time_per_iteration": 2.7180936336517334 + }, + { + "auxiliary_loss_clip": 0.01112442, + "auxiliary_loss_mlp": 0.01047198, + "balance_loss_clip": 1.04659534, + "balance_loss_mlp": 1.0316174, + "epoch": 0.3935066887118593, + "flos": 29236528471680.0, + "grad_norm": 2.0996268311737976, + "language_loss": 0.76072371, + "learning_rate": 2.76676093244553e-06, + "loss": 0.78232014, + "num_input_tokens_seen": 140662500, + "step": 6545, + "time_per_iteration": 2.7429869174957275 + }, + { + "auxiliary_loss_clip": 0.01091051, + "auxiliary_loss_mlp": 0.01033724, + "balance_loss_clip": 1.04633403, + "balance_loss_mlp": 1.02104044, + "epoch": 0.3935668119645273, + "flos": 19135288638720.0, + "grad_norm": 1.7673371756448844, + "language_loss": 0.74672133, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.76796907, + "num_input_tokens_seen": 140681960, + "step": 6546, + "time_per_iteration": 2.6785295009613037 + }, + { + "auxiliary_loss_clip": 0.01109428, + "auxiliary_loss_mlp": 0.0103425, + "balance_loss_clip": 1.04903293, + "balance_loss_mlp": 1.01946843, + "epoch": 0.3936269352171953, + "flos": 18516116972160.0, + "grad_norm": 1.9230817449169166, + "language_loss": 0.81627518, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.83771199, + "num_input_tokens_seen": 140699170, + "step": 6547, + "time_per_iteration": 2.638214588165283 + }, + { + "auxiliary_loss_clip": 0.01114598, + "auxiliary_loss_mlp": 0.00772919, + "balance_loss_clip": 1.04404151, + "balance_loss_mlp": 1.00032711, + "epoch": 0.39368705846986324, + "flos": 15632813370240.0, + "grad_norm": 1.9821442562566327, + "language_loss": 0.84406352, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.86293864, + "num_input_tokens_seen": 140714920, + "step": 6548, + "time_per_iteration": 2.6490747928619385 + }, + { + "auxiliary_loss_clip": 0.01118074, + "auxiliary_loss_mlp": 0.00771091, + "balance_loss_clip": 1.04686236, + "balance_loss_mlp": 1.00034189, + "epoch": 0.3937471817225312, + "flos": 21325839563520.0, + "grad_norm": 1.7617733187332765, + "language_loss": 0.7311933, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.75008494, + "num_input_tokens_seen": 140734595, + "step": 6549, + "time_per_iteration": 2.635380983352661 + }, + { + "auxiliary_loss_clip": 0.01071621, + "auxiliary_loss_mlp": 0.01042928, + "balance_loss_clip": 1.0444963, + "balance_loss_mlp": 1.0259527, + "epoch": 0.39380730497519917, + "flos": 20776693461120.0, + "grad_norm": 2.774519883144605, + "language_loss": 0.77592897, + "learning_rate": 2.764962053731699e-06, + "loss": 0.7970745, + "num_input_tokens_seen": 140754050, + "step": 6550, + "time_per_iteration": 2.733921527862549 + }, + { + "auxiliary_loss_clip": 0.01095205, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.04455531, + "balance_loss_mlp": 1.01674485, + "epoch": 0.39386742822786713, + "flos": 21609784575360.0, + "grad_norm": 3.1837220930493517, + "language_loss": 0.81144142, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.83270073, + "num_input_tokens_seen": 140771440, + "step": 6551, + "time_per_iteration": 2.851475238800049 + }, + { + "auxiliary_loss_clip": 0.01117625, + "auxiliary_loss_mlp": 0.01036299, + "balance_loss_clip": 1.0443331, + "balance_loss_mlp": 1.02188659, + "epoch": 0.3939275514805351, + "flos": 12414642249600.0, + "grad_norm": 12.177431380433415, + "language_loss": 0.80449802, + "learning_rate": 2.764242299098596e-06, + "loss": 0.82603723, + "num_input_tokens_seen": 140786715, + "step": 6552, + "time_per_iteration": 2.667344570159912 + }, + { + "auxiliary_loss_clip": 0.01133223, + "auxiliary_loss_mlp": 0.01043273, + "balance_loss_clip": 1.04791522, + "balance_loss_mlp": 1.02883697, + "epoch": 0.39398767473320306, + "flos": 18552027594240.0, + "grad_norm": 2.002210962432939, + "language_loss": 0.71199149, + "learning_rate": 2.763882378305003e-06, + "loss": 0.73375642, + "num_input_tokens_seen": 140804950, + "step": 6553, + "time_per_iteration": 2.6329705715179443 + }, + { + "auxiliary_loss_clip": 0.0111827, + "auxiliary_loss_mlp": 0.0077145, + "balance_loss_clip": 1.04818738, + "balance_loss_mlp": 1.00036502, + "epoch": 0.39404779798587103, + "flos": 29308888419840.0, + "grad_norm": 4.200797737547303, + "language_loss": 0.64058566, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.65948284, + "num_input_tokens_seen": 140822800, + "step": 6554, + "time_per_iteration": 2.7190303802490234 + }, + { + "auxiliary_loss_clip": 0.01109713, + "auxiliary_loss_mlp": 0.01041117, + "balance_loss_clip": 1.04655266, + "balance_loss_mlp": 1.02747416, + "epoch": 0.394107921238539, + "flos": 34897055834880.0, + "grad_norm": 2.186636266316066, + "language_loss": 0.78957009, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.81107843, + "num_input_tokens_seen": 140842940, + "step": 6555, + "time_per_iteration": 2.7675819396972656 + }, + { + "auxiliary_loss_clip": 0.01102424, + "auxiliary_loss_mlp": 0.0104302, + "balance_loss_clip": 1.04469514, + "balance_loss_mlp": 1.02758873, + "epoch": 0.39416804449120696, + "flos": 25081413039360.0, + "grad_norm": 1.7945119387028163, + "language_loss": 0.71689165, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.7383461, + "num_input_tokens_seen": 140863060, + "step": 6556, + "time_per_iteration": 4.261122703552246 + }, + { + "auxiliary_loss_clip": 0.01129248, + "auxiliary_loss_mlp": 0.01031706, + "balance_loss_clip": 1.0445503, + "balance_loss_mlp": 1.01749015, + "epoch": 0.3942281677438749, + "flos": 32306639731200.0, + "grad_norm": 1.7970895618407805, + "language_loss": 0.84080362, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.86241317, + "num_input_tokens_seen": 140883795, + "step": 6557, + "time_per_iteration": 2.7031610012054443 + }, + { + "auxiliary_loss_clip": 0.01116561, + "auxiliary_loss_mlp": 0.01032116, + "balance_loss_clip": 1.04790783, + "balance_loss_mlp": 1.01810956, + "epoch": 0.3942882909965429, + "flos": 24936621315840.0, + "grad_norm": 3.8501140650976238, + "language_loss": 0.806759, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.82824582, + "num_input_tokens_seen": 140903055, + "step": 6558, + "time_per_iteration": 5.6523637771606445 + }, + { + "auxiliary_loss_clip": 0.01130051, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.04807055, + "balance_loss_mlp": 1.02238965, + "epoch": 0.39434841424921085, + "flos": 11874797769600.0, + "grad_norm": 1.8974962376031472, + "language_loss": 0.70930403, + "learning_rate": 2.761722245724792e-06, + "loss": 0.73096335, + "num_input_tokens_seen": 140920685, + "step": 6559, + "time_per_iteration": 2.6645302772521973 + }, + { + "auxiliary_loss_clip": 0.01113668, + "auxiliary_loss_mlp": 0.0104073, + "balance_loss_clip": 1.04660964, + "balance_loss_mlp": 1.02452326, + "epoch": 0.3944085375018789, + "flos": 16361620323840.0, + "grad_norm": 2.3002241644217865, + "language_loss": 0.80355662, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.82510054, + "num_input_tokens_seen": 140937320, + "step": 6560, + "time_per_iteration": 2.8372745513916016 + }, + { + "auxiliary_loss_clip": 0.01109469, + "auxiliary_loss_mlp": 0.0103941, + "balance_loss_clip": 1.04681468, + "balance_loss_mlp": 1.02334619, + "epoch": 0.39446866075454684, + "flos": 10633365866880.0, + "grad_norm": 2.2192317359233034, + "language_loss": 0.828062, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.84955078, + "num_input_tokens_seen": 140954855, + "step": 6561, + "time_per_iteration": 2.6724014282226562 + }, + { + "auxiliary_loss_clip": 0.01119263, + "auxiliary_loss_mlp": 0.01043889, + "balance_loss_clip": 1.04620779, + "balance_loss_mlp": 1.02972126, + "epoch": 0.3945287840072148, + "flos": 18187498419840.0, + "grad_norm": 2.478683034492453, + "language_loss": 0.80985552, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.83148706, + "num_input_tokens_seen": 140973250, + "step": 6562, + "time_per_iteration": 4.211291074752808 + }, + { + "auxiliary_loss_clip": 0.01100981, + "auxiliary_loss_mlp": 0.01040375, + "balance_loss_clip": 1.04367661, + "balance_loss_mlp": 1.02568245, + "epoch": 0.39458890725988277, + "flos": 23039891642880.0, + "grad_norm": 1.8396668644534004, + "language_loss": 0.81574059, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.83715415, + "num_input_tokens_seen": 140993050, + "step": 6563, + "time_per_iteration": 2.6933205127716064 + }, + { + "auxiliary_loss_clip": 0.01078578, + "auxiliary_loss_mlp": 0.01052866, + "balance_loss_clip": 1.03979552, + "balance_loss_mlp": 1.03385234, + "epoch": 0.39464903051255074, + "flos": 17159052211200.0, + "grad_norm": 2.4284687703059276, + "language_loss": 0.69678622, + "learning_rate": 2.759921340790127e-06, + "loss": 0.71810067, + "num_input_tokens_seen": 141010815, + "step": 6564, + "time_per_iteration": 2.7754619121551514 + }, + { + "auxiliary_loss_clip": 0.01119553, + "auxiliary_loss_mlp": 0.01037847, + "balance_loss_clip": 1.04547322, + "balance_loss_mlp": 1.02260029, + "epoch": 0.3947091537652187, + "flos": 15889000147200.0, + "grad_norm": 2.342409184231709, + "language_loss": 0.82842124, + "learning_rate": 2.759561073299676e-06, + "loss": 0.84999526, + "num_input_tokens_seen": 141028720, + "step": 6565, + "time_per_iteration": 2.652029037475586 + }, + { + "auxiliary_loss_clip": 0.01091527, + "auxiliary_loss_mlp": 0.01044097, + "balance_loss_clip": 1.04201448, + "balance_loss_mlp": 1.02794445, + "epoch": 0.39476927701788667, + "flos": 18545491319040.0, + "grad_norm": 1.8313066364371182, + "language_loss": 0.83458865, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.85594487, + "num_input_tokens_seen": 141046025, + "step": 6566, + "time_per_iteration": 2.6853299140930176 + }, + { + "auxiliary_loss_clip": 0.01137834, + "auxiliary_loss_mlp": 0.01036947, + "balance_loss_clip": 1.04882693, + "balance_loss_mlp": 1.02146816, + "epoch": 0.39482940027055463, + "flos": 22275712771200.0, + "grad_norm": 2.7953182854439973, + "language_loss": 0.77462149, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.79636931, + "num_input_tokens_seen": 141066865, + "step": 6567, + "time_per_iteration": 2.6695878505706787 + }, + { + "auxiliary_loss_clip": 0.01114738, + "auxiliary_loss_mlp": 0.01037774, + "balance_loss_clip": 1.04457474, + "balance_loss_mlp": 1.0235877, + "epoch": 0.3948895235232226, + "flos": 14757634494720.0, + "grad_norm": 3.2000391748281065, + "language_loss": 0.80752146, + "learning_rate": 2.758480098067182e-06, + "loss": 0.82904655, + "num_input_tokens_seen": 141084210, + "step": 6568, + "time_per_iteration": 2.6126980781555176 + }, + { + "auxiliary_loss_clip": 0.01100656, + "auxiliary_loss_mlp": 0.01035941, + "balance_loss_clip": 1.04693437, + "balance_loss_mlp": 1.02142143, + "epoch": 0.39494964677589056, + "flos": 22565763095040.0, + "grad_norm": 3.155903507973262, + "language_loss": 0.846977, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.868343, + "num_input_tokens_seen": 141103895, + "step": 6569, + "time_per_iteration": 2.731241464614868 + }, + { + "auxiliary_loss_clip": 0.01076285, + "auxiliary_loss_mlp": 0.01045444, + "balance_loss_clip": 1.046417, + "balance_loss_mlp": 1.03076911, + "epoch": 0.3950097700285585, + "flos": 22963186149120.0, + "grad_norm": 2.966787083651573, + "language_loss": 0.74931526, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.77053261, + "num_input_tokens_seen": 141124000, + "step": 6570, + "time_per_iteration": 2.816168785095215 + }, + { + "auxiliary_loss_clip": 0.01093382, + "auxiliary_loss_mlp": 0.01037489, + "balance_loss_clip": 1.04271865, + "balance_loss_mlp": 1.02224803, + "epoch": 0.3950698932812265, + "flos": 20595236929920.0, + "grad_norm": 3.807643490882315, + "language_loss": 0.80009687, + "learning_rate": 2.757398863979922e-06, + "loss": 0.82140559, + "num_input_tokens_seen": 141142535, + "step": 6571, + "time_per_iteration": 2.7444143295288086 + }, + { + "auxiliary_loss_clip": 0.0110309, + "auxiliary_loss_mlp": 0.01042438, + "balance_loss_clip": 1.046592, + "balance_loss_mlp": 1.02792382, + "epoch": 0.39513001653389446, + "flos": 20375786787840.0, + "grad_norm": 2.0513494110156105, + "language_loss": 0.77667749, + "learning_rate": 2.757038395157997e-06, + "loss": 0.79813272, + "num_input_tokens_seen": 141161575, + "step": 6572, + "time_per_iteration": 2.787951946258545 + }, + { + "auxiliary_loss_clip": 0.01096298, + "auxiliary_loss_mlp": 0.01039178, + "balance_loss_clip": 1.04524946, + "balance_loss_mlp": 1.02422285, + "epoch": 0.3951901397865625, + "flos": 26463650256000.0, + "grad_norm": 2.2233910711840092, + "language_loss": 0.74710405, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.76845872, + "num_input_tokens_seen": 141181150, + "step": 6573, + "time_per_iteration": 2.8065271377563477 + }, + { + "auxiliary_loss_clip": 0.01119667, + "auxiliary_loss_mlp": 0.01033875, + "balance_loss_clip": 1.04583275, + "balance_loss_mlp": 1.02073228, + "epoch": 0.39525026303923044, + "flos": 43838345767680.0, + "grad_norm": 1.5702623893020402, + "language_loss": 0.681665, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.7032004, + "num_input_tokens_seen": 141206310, + "step": 6574, + "time_per_iteration": 2.917938470840454 + }, + { + "auxiliary_loss_clip": 0.01066027, + "auxiliary_loss_mlp": 0.01046829, + "balance_loss_clip": 1.03601551, + "balance_loss_mlp": 1.02941298, + "epoch": 0.3953103862918984, + "flos": 18040803275520.0, + "grad_norm": 11.51359836007049, + "language_loss": 0.71934754, + "learning_rate": 2.755956816505072e-06, + "loss": 0.74047613, + "num_input_tokens_seen": 141223925, + "step": 6575, + "time_per_iteration": 2.8125574588775635 + }, + { + "auxiliary_loss_clip": 0.01106625, + "auxiliary_loss_mlp": 0.01044084, + "balance_loss_clip": 1.04328454, + "balance_loss_mlp": 1.02871156, + "epoch": 0.3953705095445664, + "flos": 16976015481600.0, + "grad_norm": 2.3082458130711276, + "language_loss": 0.73497486, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.75648189, + "num_input_tokens_seen": 141239010, + "step": 6576, + "time_per_iteration": 2.7072994709014893 + }, + { + "auxiliary_loss_clip": 0.01131853, + "auxiliary_loss_mlp": 0.01038072, + "balance_loss_clip": 1.04721868, + "balance_loss_mlp": 1.02482772, + "epoch": 0.39543063279723434, + "flos": 17411144837760.0, + "grad_norm": 2.584581612312142, + "language_loss": 0.83806884, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.85976809, + "num_input_tokens_seen": 141252255, + "step": 6577, + "time_per_iteration": 2.673980236053467 + }, + { + "auxiliary_loss_clip": 0.01108115, + "auxiliary_loss_mlp": 0.01038249, + "balance_loss_clip": 1.04473734, + "balance_loss_mlp": 1.02394366, + "epoch": 0.3954907560499023, + "flos": 22784207656320.0, + "grad_norm": 3.282604232183532, + "language_loss": 0.90597945, + "learning_rate": 2.75487497985853e-06, + "loss": 0.92744309, + "num_input_tokens_seen": 141269325, + "step": 6578, + "time_per_iteration": 2.8357715606689453 + }, + { + "auxiliary_loss_clip": 0.01113431, + "auxiliary_loss_mlp": 0.01038042, + "balance_loss_clip": 1.04971015, + "balance_loss_mlp": 1.0215559, + "epoch": 0.39555087930257027, + "flos": 21944400698880.0, + "grad_norm": 1.9328811386040925, + "language_loss": 0.77836883, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.7998836, + "num_input_tokens_seen": 141288505, + "step": 6579, + "time_per_iteration": 2.78900146484375 + }, + { + "auxiliary_loss_clip": 0.01080071, + "auxiliary_loss_mlp": 0.01037596, + "balance_loss_clip": 1.04296517, + "balance_loss_mlp": 1.02181292, + "epoch": 0.39561100255523823, + "flos": 20404622430720.0, + "grad_norm": 2.0515288557813705, + "language_loss": 0.68375254, + "learning_rate": 2.754153612280037e-06, + "loss": 0.70492923, + "num_input_tokens_seen": 141303680, + "step": 6580, + "time_per_iteration": 2.796602249145508 + }, + { + "auxiliary_loss_clip": 0.01119101, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.04687381, + "balance_loss_mlp": 1.01770234, + "epoch": 0.3956711258079062, + "flos": 27964572986880.0, + "grad_norm": 5.6192422063497425, + "language_loss": 0.58592093, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.60742974, + "num_input_tokens_seen": 141324090, + "step": 6581, + "time_per_iteration": 2.738732099533081 + }, + { + "auxiliary_loss_clip": 0.0110807, + "auxiliary_loss_mlp": 0.01047889, + "balance_loss_clip": 1.04554892, + "balance_loss_mlp": 1.03111625, + "epoch": 0.39573124906057416, + "flos": 14428297670400.0, + "grad_norm": 1.840388254222325, + "language_loss": 0.69687581, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.71843535, + "num_input_tokens_seen": 141342235, + "step": 6582, + "time_per_iteration": 2.74564790725708 + }, + { + "auxiliary_loss_clip": 0.01132763, + "auxiliary_loss_mlp": 0.0077198, + "balance_loss_clip": 1.04670966, + "balance_loss_mlp": 1.00066948, + "epoch": 0.39579137231324213, + "flos": 18733699607040.0, + "grad_norm": 2.093309053458098, + "language_loss": 0.76838243, + "learning_rate": 2.753071346464642e-06, + "loss": 0.78742981, + "num_input_tokens_seen": 141361195, + "step": 6583, + "time_per_iteration": 2.6127665042877197 + }, + { + "auxiliary_loss_clip": 0.01084294, + "auxiliary_loss_mlp": 0.00772199, + "balance_loss_clip": 1.04135418, + "balance_loss_mlp": 1.00058353, + "epoch": 0.3958514955659101, + "flos": 17676417755520.0, + "grad_norm": 2.422087879109688, + "language_loss": 0.66005278, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.67861772, + "num_input_tokens_seen": 141378275, + "step": 6584, + "time_per_iteration": 2.8412790298461914 + }, + { + "auxiliary_loss_clip": 0.0109769, + "auxiliary_loss_mlp": 0.01042803, + "balance_loss_clip": 1.04634333, + "balance_loss_mlp": 1.02687716, + "epoch": 0.39591161881857806, + "flos": 29309103901440.0, + "grad_norm": 7.452692779947077, + "language_loss": 0.72775561, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.74916053, + "num_input_tokens_seen": 141396960, + "step": 6585, + "time_per_iteration": 2.8504436016082764 + }, + { + "auxiliary_loss_clip": 0.0109915, + "auxiliary_loss_mlp": 0.01041099, + "balance_loss_clip": 1.04335117, + "balance_loss_mlp": 1.02628136, + "epoch": 0.3959717420712461, + "flos": 25771831332480.0, + "grad_norm": 1.8603715362450812, + "language_loss": 0.73381901, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.75522149, + "num_input_tokens_seen": 141417320, + "step": 6586, + "time_per_iteration": 2.8426311016082764 + }, + { + "auxiliary_loss_clip": 0.01101854, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.04255629, + "balance_loss_mlp": 1.02266693, + "epoch": 0.39603186532391405, + "flos": 20923783655040.0, + "grad_norm": 2.174728382433504, + "language_loss": 0.71447468, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.73587245, + "num_input_tokens_seen": 141435985, + "step": 6587, + "time_per_iteration": 2.7798478603363037 + }, + { + "auxiliary_loss_clip": 0.01007869, + "auxiliary_loss_mlp": 0.01003214, + "balance_loss_clip": 1.02249742, + "balance_loss_mlp": 1.00195026, + "epoch": 0.396091988576582, + "flos": 54880986176640.0, + "grad_norm": 0.9478406102040471, + "language_loss": 0.61186492, + "learning_rate": 2.751266999157285e-06, + "loss": 0.63197577, + "num_input_tokens_seen": 141486075, + "step": 6588, + "time_per_iteration": 3.1663742065429688 + }, + { + "auxiliary_loss_clip": 0.0110963, + "auxiliary_loss_mlp": 0.00772247, + "balance_loss_clip": 1.04547548, + "balance_loss_mlp": 1.0006907, + "epoch": 0.39615211182925, + "flos": 20702896968960.0, + "grad_norm": 3.1004380492305206, + "language_loss": 0.81686854, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.8356874, + "num_input_tokens_seen": 141505280, + "step": 6589, + "time_per_iteration": 2.7711055278778076 + }, + { + "auxiliary_loss_clip": 0.01106228, + "auxiliary_loss_mlp": 0.01038149, + "balance_loss_clip": 1.04562962, + "balance_loss_mlp": 1.02241325, + "epoch": 0.39621223508191794, + "flos": 20994312009600.0, + "grad_norm": 2.2429889858322802, + "language_loss": 0.69913912, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.72058284, + "num_input_tokens_seen": 141523930, + "step": 6590, + "time_per_iteration": 2.793330669403076 + }, + { + "auxiliary_loss_clip": 0.01117633, + "auxiliary_loss_mlp": 0.01056421, + "balance_loss_clip": 1.04669666, + "balance_loss_mlp": 1.03980339, + "epoch": 0.3962723583345859, + "flos": 23368833417600.0, + "grad_norm": 1.772211549409949, + "language_loss": 0.75809395, + "learning_rate": 2.750184048805956e-06, + "loss": 0.77983451, + "num_input_tokens_seen": 141541320, + "step": 6591, + "time_per_iteration": 2.7317981719970703 + }, + { + "auxiliary_loss_clip": 0.01043506, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_clip": 1.03802264, + "balance_loss_mlp": 1.03364813, + "epoch": 0.39633248158725387, + "flos": 25115599808640.0, + "grad_norm": 2.064980952243903, + "language_loss": 0.78466719, + "learning_rate": 2.749823008443152e-06, + "loss": 0.80559409, + "num_input_tokens_seen": 141561880, + "step": 6592, + "time_per_iteration": 3.192194700241089 + }, + { + "auxiliary_loss_clip": 0.01059924, + "auxiliary_loss_mlp": 0.01033868, + "balance_loss_clip": 1.03984666, + "balance_loss_mlp": 1.01872826, + "epoch": 0.39639260483992184, + "flos": 39787622236800.0, + "grad_norm": 1.9568402514544967, + "language_loss": 0.69690341, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.71784127, + "num_input_tokens_seen": 141586460, + "step": 6593, + "time_per_iteration": 3.365752696990967 + }, + { + "auxiliary_loss_clip": 0.01059564, + "auxiliary_loss_mlp": 0.01046377, + "balance_loss_clip": 1.03668404, + "balance_loss_mlp": 1.03035569, + "epoch": 0.3964527280925898, + "flos": 17347045017600.0, + "grad_norm": 1.624713370756075, + "language_loss": 0.77905881, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.80011821, + "num_input_tokens_seen": 141605955, + "step": 6594, + "time_per_iteration": 2.890626907348633 + }, + { + "auxiliary_loss_clip": 0.01025812, + "auxiliary_loss_mlp": 0.01003509, + "balance_loss_clip": 1.02550435, + "balance_loss_mlp": 1.00200129, + "epoch": 0.39651285134525777, + "flos": 71717848369920.0, + "grad_norm": 0.9363100872746896, + "language_loss": 0.6304667, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.65075988, + "num_input_tokens_seen": 141673140, + "step": 6595, + "time_per_iteration": 3.3955094814300537 + }, + { + "auxiliary_loss_clip": 0.01096586, + "auxiliary_loss_mlp": 0.01055368, + "balance_loss_clip": 1.0442034, + "balance_loss_mlp": 1.03774858, + "epoch": 0.39657297459792573, + "flos": 25775710001280.0, + "grad_norm": 2.5609780352809732, + "language_loss": 0.63787287, + "learning_rate": 2.748378562795223e-06, + "loss": 0.65939242, + "num_input_tokens_seen": 141692955, + "step": 6596, + "time_per_iteration": 4.60092568397522 + }, + { + "auxiliary_loss_clip": 0.01120147, + "auxiliary_loss_mlp": 0.010422, + "balance_loss_clip": 1.04657853, + "balance_loss_mlp": 1.02747798, + "epoch": 0.3966330978505937, + "flos": 20266115587200.0, + "grad_norm": 2.0315739566024567, + "language_loss": 0.79006839, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.81169188, + "num_input_tokens_seen": 141710680, + "step": 6597, + "time_per_iteration": 5.807824373245239 + }, + { + "auxiliary_loss_clip": 0.01099639, + "auxiliary_loss_mlp": 0.00773402, + "balance_loss_clip": 1.04352951, + "balance_loss_mlp": 1.00076032, + "epoch": 0.39669322110326166, + "flos": 20631183465600.0, + "grad_norm": 2.966898609781474, + "language_loss": 0.6772182, + "learning_rate": 2.747656169644941e-06, + "loss": 0.69594866, + "num_input_tokens_seen": 141729860, + "step": 6598, + "time_per_iteration": 2.786884307861328 + }, + { + "auxiliary_loss_clip": 0.01129462, + "auxiliary_loss_mlp": 0.01041455, + "balance_loss_clip": 1.04473436, + "balance_loss_mlp": 1.02785325, + "epoch": 0.3967533443559297, + "flos": 21726063878400.0, + "grad_norm": 2.1804433985902247, + "language_loss": 0.79342777, + "learning_rate": 2.747294930536157e-06, + "loss": 0.81513697, + "num_input_tokens_seen": 141749060, + "step": 6599, + "time_per_iteration": 2.6758370399475098 + }, + { + "auxiliary_loss_clip": 0.01091573, + "auxiliary_loss_mlp": 0.01041619, + "balance_loss_clip": 1.04314208, + "balance_loss_mlp": 1.02487051, + "epoch": 0.39681346760859765, + "flos": 25484151306240.0, + "grad_norm": 2.279505844275463, + "language_loss": 0.72878486, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.75011677, + "num_input_tokens_seen": 141769860, + "step": 6600, + "time_per_iteration": 2.7616889476776123 + }, + { + "auxiliary_loss_clip": 0.01083152, + "auxiliary_loss_mlp": 0.01037422, + "balance_loss_clip": 1.03626251, + "balance_loss_mlp": 1.0220201, + "epoch": 0.3968735908612656, + "flos": 20959586536320.0, + "grad_norm": 2.0515710245603938, + "language_loss": 0.85973942, + "learning_rate": 2.746572367319791e-06, + "loss": 0.88094509, + "num_input_tokens_seen": 141788465, + "step": 6601, + "time_per_iteration": 2.755791664123535 + }, + { + "auxiliary_loss_clip": 0.01095713, + "auxiliary_loss_mlp": 0.01041964, + "balance_loss_clip": 1.0429877, + "balance_loss_mlp": 1.02468467, + "epoch": 0.3969337141139336, + "flos": 10707090531840.0, + "grad_norm": 2.240549855963289, + "language_loss": 0.70372766, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.72510445, + "num_input_tokens_seen": 141804955, + "step": 6602, + "time_per_iteration": 4.643726348876953 + }, + { + "auxiliary_loss_clip": 0.01133428, + "auxiliary_loss_mlp": 0.01047809, + "balance_loss_clip": 1.04658508, + "balance_loss_mlp": 1.03230548, + "epoch": 0.39699383736660154, + "flos": 17593714690560.0, + "grad_norm": 3.7711392584572896, + "language_loss": 0.83248609, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.85429847, + "num_input_tokens_seen": 141820025, + "step": 6603, + "time_per_iteration": 2.8909716606140137 + }, + { + "auxiliary_loss_clip": 0.01112282, + "auxiliary_loss_mlp": 0.01034339, + "balance_loss_clip": 1.04651403, + "balance_loss_mlp": 1.02003431, + "epoch": 0.3970539606192695, + "flos": 17785945301760.0, + "grad_norm": 1.9508498227264648, + "language_loss": 0.73302728, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.75449347, + "num_input_tokens_seen": 141838735, + "step": 6604, + "time_per_iteration": 2.828908920288086 + }, + { + "auxiliary_loss_clip": 0.01105132, + "auxiliary_loss_mlp": 0.01038476, + "balance_loss_clip": 1.04384422, + "balance_loss_mlp": 1.02364659, + "epoch": 0.3971140838719375, + "flos": 24789495208320.0, + "grad_norm": 1.769953580879433, + "language_loss": 0.82582277, + "learning_rate": 2.745126901275491e-06, + "loss": 0.84725887, + "num_input_tokens_seen": 141858090, + "step": 6605, + "time_per_iteration": 2.6773502826690674 + }, + { + "auxiliary_loss_clip": 0.01128613, + "auxiliary_loss_mlp": 0.01033129, + "balance_loss_clip": 1.04549098, + "balance_loss_mlp": 1.01968801, + "epoch": 0.39717420712460544, + "flos": 24243581329920.0, + "grad_norm": 1.4871941413504006, + "language_loss": 0.73511499, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.75673246, + "num_input_tokens_seen": 141877540, + "step": 6606, + "time_per_iteration": 2.632805347442627 + }, + { + "auxiliary_loss_clip": 0.01089285, + "auxiliary_loss_mlp": 0.01048599, + "balance_loss_clip": 1.0436089, + "balance_loss_mlp": 1.03198171, + "epoch": 0.3972343303772734, + "flos": 25884698843520.0, + "grad_norm": 2.092571399644939, + "language_loss": 0.74296981, + "learning_rate": 2.744403998666805e-06, + "loss": 0.76434863, + "num_input_tokens_seen": 141897315, + "step": 6607, + "time_per_iteration": 2.7277770042419434 + }, + { + "auxiliary_loss_clip": 0.01124169, + "auxiliary_loss_mlp": 0.01037393, + "balance_loss_clip": 1.04697132, + "balance_loss_mlp": 1.02267027, + "epoch": 0.39729445362994137, + "flos": 45623716300800.0, + "grad_norm": 1.5196847129379933, + "language_loss": 0.6787042, + "learning_rate": 2.744042505013797e-06, + "loss": 0.70031989, + "num_input_tokens_seen": 141919580, + "step": 6608, + "time_per_iteration": 2.8229119777679443 + }, + { + "auxiliary_loss_clip": 0.01094928, + "auxiliary_loss_mlp": 0.01054175, + "balance_loss_clip": 1.04091311, + "balance_loss_mlp": 1.03580451, + "epoch": 0.39735457688260933, + "flos": 20193971120640.0, + "grad_norm": 7.314681050409252, + "language_loss": 0.74670005, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.7681911, + "num_input_tokens_seen": 141937045, + "step": 6609, + "time_per_iteration": 2.7502245903015137 + }, + { + "auxiliary_loss_clip": 0.01107217, + "auxiliary_loss_mlp": 0.01036058, + "balance_loss_clip": 1.04354, + "balance_loss_mlp": 1.02056026, + "epoch": 0.3974147001352773, + "flos": 23331163029120.0, + "grad_norm": 1.742454501323656, + "language_loss": 0.713238, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.73467076, + "num_input_tokens_seen": 141956695, + "step": 6610, + "time_per_iteration": 2.7225286960601807 + }, + { + "auxiliary_loss_clip": 0.01105851, + "auxiliary_loss_mlp": 0.01030068, + "balance_loss_clip": 1.03818822, + "balance_loss_mlp": 1.01509547, + "epoch": 0.39747482338794526, + "flos": 21688644885120.0, + "grad_norm": 1.7960063460415152, + "language_loss": 0.78151029, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.8028695, + "num_input_tokens_seen": 141975935, + "step": 6611, + "time_per_iteration": 2.6464622020721436 + }, + { + "auxiliary_loss_clip": 0.01121213, + "auxiliary_loss_mlp": 0.01038273, + "balance_loss_clip": 1.04614162, + "balance_loss_mlp": 1.0235095, + "epoch": 0.3975349466406133, + "flos": 30988717816320.0, + "grad_norm": 1.7937788130001704, + "language_loss": 0.7921629, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.81375778, + "num_input_tokens_seen": 141995750, + "step": 6612, + "time_per_iteration": 2.734950304031372 + }, + { + "auxiliary_loss_clip": 0.01018209, + "auxiliary_loss_mlp": 0.0100828, + "balance_loss_clip": 1.02113628, + "balance_loss_mlp": 1.00702214, + "epoch": 0.39759506989328125, + "flos": 63683948833920.0, + "grad_norm": 0.8423760762856193, + "language_loss": 0.64935088, + "learning_rate": 2.742234613810459e-06, + "loss": 0.66961575, + "num_input_tokens_seen": 142057655, + "step": 6613, + "time_per_iteration": 3.1294105052948 + }, + { + "auxiliary_loss_clip": 0.01097901, + "auxiliary_loss_mlp": 0.01042526, + "balance_loss_clip": 1.03916883, + "balance_loss_mlp": 1.02507401, + "epoch": 0.3976551931459492, + "flos": 23695835857920.0, + "grad_norm": 3.0444472336295636, + "language_loss": 0.71508956, + "learning_rate": 2.741872951078109e-06, + "loss": 0.73649383, + "num_input_tokens_seen": 142076020, + "step": 6614, + "time_per_iteration": 2.6479976177215576 + }, + { + "auxiliary_loss_clip": 0.01116106, + "auxiliary_loss_mlp": 0.01035284, + "balance_loss_clip": 1.04503131, + "balance_loss_mlp": 1.02034712, + "epoch": 0.3977153163986172, + "flos": 15669657745920.0, + "grad_norm": 2.2927333729520885, + "language_loss": 0.81362098, + "learning_rate": 2.741511260213862e-06, + "loss": 0.83513486, + "num_input_tokens_seen": 142093790, + "step": 6615, + "time_per_iteration": 2.6567723751068115 + }, + { + "auxiliary_loss_clip": 0.01094954, + "auxiliary_loss_mlp": 0.01034601, + "balance_loss_clip": 1.04491544, + "balance_loss_mlp": 1.02023649, + "epoch": 0.39777543965128515, + "flos": 14064702249600.0, + "grad_norm": 2.01024859105405, + "language_loss": 0.67510247, + "learning_rate": 2.741149541231434e-06, + "loss": 0.69639802, + "num_input_tokens_seen": 142110545, + "step": 6616, + "time_per_iteration": 2.6675400733947754 + }, + { + "auxiliary_loss_clip": 0.01133654, + "auxiliary_loss_mlp": 0.01043633, + "balance_loss_clip": 1.04658771, + "balance_loss_mlp": 1.02765918, + "epoch": 0.3978355629039531, + "flos": 23367468700800.0, + "grad_norm": 2.3086733420735785, + "language_loss": 0.83678514, + "learning_rate": 2.740787794144541e-06, + "loss": 0.85855806, + "num_input_tokens_seen": 142128695, + "step": 6617, + "time_per_iteration": 2.5879552364349365 + }, + { + "auxiliary_loss_clip": 0.01126085, + "auxiliary_loss_mlp": 0.01039432, + "balance_loss_clip": 1.04570735, + "balance_loss_mlp": 1.02563334, + "epoch": 0.3978956861566211, + "flos": 19062785036160.0, + "grad_norm": 1.7795732635152253, + "language_loss": 0.72766519, + "learning_rate": 2.7404260189669e-06, + "loss": 0.74932027, + "num_input_tokens_seen": 142148375, + "step": 6618, + "time_per_iteration": 2.613162040710449 + }, + { + "auxiliary_loss_clip": 0.01111951, + "auxiliary_loss_mlp": 0.01041983, + "balance_loss_clip": 1.04827428, + "balance_loss_mlp": 1.02544832, + "epoch": 0.39795580940928904, + "flos": 30227699341440.0, + "grad_norm": 1.6960793445061386, + "language_loss": 0.65858316, + "learning_rate": 2.740064215712231e-06, + "loss": 0.68012249, + "num_input_tokens_seen": 142169735, + "step": 6619, + "time_per_iteration": 2.7474000453948975 + }, + { + "auxiliary_loss_clip": 0.01052495, + "auxiliary_loss_mlp": 0.01004058, + "balance_loss_clip": 1.0230546, + "balance_loss_mlp": 1.00270545, + "epoch": 0.398015932661957, + "flos": 69847224906240.0, + "grad_norm": 0.7704475067145287, + "language_loss": 0.58246851, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.60303402, + "num_input_tokens_seen": 142229520, + "step": 6620, + "time_per_iteration": 3.1400091648101807 + }, + { + "auxiliary_loss_clip": 0.01113547, + "auxiliary_loss_mlp": 0.0103675, + "balance_loss_clip": 1.04998422, + "balance_loss_mlp": 1.02314794, + "epoch": 0.39807605591462497, + "flos": 20157773189760.0, + "grad_norm": 1.821199996328267, + "language_loss": 0.7925806, + "learning_rate": 2.739340525026686e-06, + "loss": 0.81408358, + "num_input_tokens_seen": 142247660, + "step": 6621, + "time_per_iteration": 2.7389161586761475 + }, + { + "auxiliary_loss_clip": 0.0110802, + "auxiliary_loss_mlp": 0.01034956, + "balance_loss_clip": 1.04590595, + "balance_loss_mlp": 1.02088952, + "epoch": 0.39813617916729294, + "flos": 21141761339520.0, + "grad_norm": 1.899291394170355, + "language_loss": 0.77800381, + "learning_rate": 2.738978637623252e-06, + "loss": 0.79943347, + "num_input_tokens_seen": 142266990, + "step": 6622, + "time_per_iteration": 2.7175779342651367 + }, + { + "auxiliary_loss_clip": 0.01101638, + "auxiliary_loss_mlp": 0.01038721, + "balance_loss_clip": 1.04108417, + "balance_loss_mlp": 1.02377844, + "epoch": 0.3981963024199609, + "flos": 18988485753600.0, + "grad_norm": 1.6278701941081761, + "language_loss": 0.7497921, + "learning_rate": 2.738616722197674e-06, + "loss": 0.77119565, + "num_input_tokens_seen": 142287170, + "step": 6623, + "time_per_iteration": 2.682567596435547 + }, + { + "auxiliary_loss_clip": 0.01088304, + "auxiliary_loss_mlp": 0.01040759, + "balance_loss_clip": 1.04280734, + "balance_loss_mlp": 1.02590537, + "epoch": 0.39825642567262887, + "flos": 16575108808320.0, + "grad_norm": 2.4968757127264465, + "language_loss": 0.79733497, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.81862563, + "num_input_tokens_seen": 142305405, + "step": 6624, + "time_per_iteration": 2.6878697872161865 + }, + { + "auxiliary_loss_clip": 0.01135858, + "auxiliary_loss_mlp": 0.01043783, + "balance_loss_clip": 1.04792297, + "balance_loss_mlp": 1.0270462, + "epoch": 0.39831654892529683, + "flos": 22199833290240.0, + "grad_norm": 2.0557211895564884, + "language_loss": 0.83616954, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.85796595, + "num_input_tokens_seen": 142322710, + "step": 6625, + "time_per_iteration": 2.5847036838531494 + }, + { + "auxiliary_loss_clip": 0.011152, + "auxiliary_loss_mlp": 0.01044436, + "balance_loss_clip": 1.04585958, + "balance_loss_mlp": 1.02948713, + "epoch": 0.39837667217796485, + "flos": 10487963612160.0, + "grad_norm": 2.4120237094780377, + "language_loss": 0.87324822, + "learning_rate": 2.737530807925321e-06, + "loss": 0.89484465, + "num_input_tokens_seen": 142338535, + "step": 6626, + "time_per_iteration": 2.5845320224761963 + }, + { + "auxiliary_loss_clip": 0.01067442, + "auxiliary_loss_mlp": 0.00775778, + "balance_loss_clip": 1.03995085, + "balance_loss_mlp": 1.00066137, + "epoch": 0.3984367954306328, + "flos": 17965282930560.0, + "grad_norm": 2.3324132294494797, + "language_loss": 0.83462882, + "learning_rate": 2.737168780548417e-06, + "loss": 0.85306096, + "num_input_tokens_seen": 142354570, + "step": 6627, + "time_per_iteration": 2.854428291320801 + }, + { + "auxiliary_loss_clip": 0.01087071, + "auxiliary_loss_mlp": 0.00771611, + "balance_loss_clip": 1.04081798, + "balance_loss_mlp": 1.00056684, + "epoch": 0.3984969186833008, + "flos": 22711057608960.0, + "grad_norm": 1.4575889504047923, + "language_loss": 0.82904339, + "learning_rate": 2.736806725217998e-06, + "loss": 0.84763026, + "num_input_tokens_seen": 142374395, + "step": 6628, + "time_per_iteration": 2.772620916366577 + }, + { + "auxiliary_loss_clip": 0.01092039, + "auxiliary_loss_mlp": 0.01062711, + "balance_loss_clip": 1.04402328, + "balance_loss_mlp": 1.04652882, + "epoch": 0.39855704193596875, + "flos": 23405785534080.0, + "grad_norm": 1.6631347026103094, + "language_loss": 0.71145642, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.73300385, + "num_input_tokens_seen": 142396040, + "step": 6629, + "time_per_iteration": 2.681969165802002 + }, + { + "auxiliary_loss_clip": 0.01097676, + "auxiliary_loss_mlp": 0.01035809, + "balance_loss_clip": 1.04695797, + "balance_loss_mlp": 1.02136111, + "epoch": 0.3986171651886367, + "flos": 21251935330560.0, + "grad_norm": 1.757569665448266, + "language_loss": 0.80513418, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.82646906, + "num_input_tokens_seen": 142415495, + "step": 6630, + "time_per_iteration": 2.7747275829315186 + }, + { + "auxiliary_loss_clip": 0.01072778, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.04495096, + "balance_loss_mlp": 1.01805389, + "epoch": 0.3986772884413047, + "flos": 12458705258880.0, + "grad_norm": 2.3833222910170857, + "language_loss": 0.74846494, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.76951796, + "num_input_tokens_seen": 142431865, + "step": 6631, + "time_per_iteration": 2.8098866939544678 + }, + { + "auxiliary_loss_clip": 0.01095184, + "auxiliary_loss_mlp": 0.01040404, + "balance_loss_clip": 1.04248333, + "balance_loss_mlp": 1.02500248, + "epoch": 0.39873741169397264, + "flos": 19646117907840.0, + "grad_norm": 2.096728163981437, + "language_loss": 0.7160908, + "learning_rate": 2.735358224635783e-06, + "loss": 0.73744667, + "num_input_tokens_seen": 142450595, + "step": 6632, + "time_per_iteration": 2.81479811668396 + }, + { + "auxiliary_loss_clip": 0.01063774, + "auxiliary_loss_mlp": 0.00771132, + "balance_loss_clip": 1.04164338, + "balance_loss_mlp": 1.00057721, + "epoch": 0.3987975349466406, + "flos": 21684766216320.0, + "grad_norm": 2.0680050346702945, + "language_loss": 0.7479074, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.76625645, + "num_input_tokens_seen": 142466650, + "step": 6633, + "time_per_iteration": 2.9533073902130127 + }, + { + "auxiliary_loss_clip": 0.01105798, + "auxiliary_loss_mlp": 0.01028668, + "balance_loss_clip": 1.0465138, + "balance_loss_mlp": 1.01509583, + "epoch": 0.3988576581993086, + "flos": 23914064937600.0, + "grad_norm": 1.7626671777587215, + "language_loss": 0.81420207, + "learning_rate": 2.7346338069806e-06, + "loss": 0.83554673, + "num_input_tokens_seen": 142486165, + "step": 6634, + "time_per_iteration": 2.760012626647949 + }, + { + "auxiliary_loss_clip": 0.0110458, + "auxiliary_loss_mlp": 0.0103091, + "balance_loss_clip": 1.04739153, + "balance_loss_mlp": 1.01618731, + "epoch": 0.39891778145197654, + "flos": 18149899858560.0, + "grad_norm": 2.495702621722643, + "language_loss": 0.74914795, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.77050287, + "num_input_tokens_seen": 142505035, + "step": 6635, + "time_per_iteration": 4.225152015686035 + }, + { + "auxiliary_loss_clip": 0.01101511, + "auxiliary_loss_mlp": 0.01039121, + "balance_loss_clip": 1.04791617, + "balance_loss_mlp": 1.02265239, + "epoch": 0.3989779047046445, + "flos": 22595281096320.0, + "grad_norm": 28.19463582214486, + "language_loss": 0.66373086, + "learning_rate": 2.733909277895868e-06, + "loss": 0.68513715, + "num_input_tokens_seen": 142521870, + "step": 6636, + "time_per_iteration": 4.455794811248779 + }, + { + "auxiliary_loss_clip": 0.01118899, + "auxiliary_loss_mlp": 0.01041724, + "balance_loss_clip": 1.04681683, + "balance_loss_mlp": 1.02687669, + "epoch": 0.39903802795731247, + "flos": 18077216688000.0, + "grad_norm": 2.0422591411720723, + "language_loss": 0.81318372, + "learning_rate": 2.733546971601763e-06, + "loss": 0.83478993, + "num_input_tokens_seen": 142540455, + "step": 6637, + "time_per_iteration": 4.3843090534210205 + }, + { + "auxiliary_loss_clip": 0.0102804, + "auxiliary_loss_mlp": 0.01018728, + "balance_loss_clip": 1.02743387, + "balance_loss_mlp": 1.01694012, + "epoch": 0.39909815120998043, + "flos": 70441367771520.0, + "grad_norm": 0.719892771757815, + "language_loss": 0.53119934, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55166698, + "num_input_tokens_seen": 142599665, + "step": 6638, + "time_per_iteration": 3.2910361289978027 + }, + { + "auxiliary_loss_clip": 0.01112783, + "auxiliary_loss_mlp": 0.00772668, + "balance_loss_clip": 1.04786587, + "balance_loss_mlp": 1.00065207, + "epoch": 0.39915827446264845, + "flos": 18549262247040.0, + "grad_norm": 1.6115719033099838, + "language_loss": 0.75487578, + "learning_rate": 2.732822275578769e-06, + "loss": 0.77373028, + "num_input_tokens_seen": 142618845, + "step": 6639, + "time_per_iteration": 2.7083969116210938 + }, + { + "auxiliary_loss_clip": 0.0105821, + "auxiliary_loss_mlp": 0.01036909, + "balance_loss_clip": 1.03856301, + "balance_loss_mlp": 1.022264, + "epoch": 0.3992183977153164, + "flos": 29897249195520.0, + "grad_norm": 2.505539025941121, + "language_loss": 0.76163709, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.78258824, + "num_input_tokens_seen": 142640885, + "step": 6640, + "time_per_iteration": 2.8801841735839844 + }, + { + "auxiliary_loss_clip": 0.01102565, + "auxiliary_loss_mlp": 0.01038995, + "balance_loss_clip": 1.04663992, + "balance_loss_mlp": 1.02430892, + "epoch": 0.3992785209679844, + "flos": 22565080736640.0, + "grad_norm": 2.779199402703341, + "language_loss": 0.81995392, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.84136951, + "num_input_tokens_seen": 142659340, + "step": 6641, + "time_per_iteration": 4.346608638763428 + }, + { + "auxiliary_loss_clip": 0.01136449, + "auxiliary_loss_mlp": 0.01038781, + "balance_loss_clip": 1.05189252, + "balance_loss_mlp": 1.02393353, + "epoch": 0.39933864422065235, + "flos": 19682674974720.0, + "grad_norm": 2.1545130527280985, + "language_loss": 0.76744998, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.78920233, + "num_input_tokens_seen": 142677085, + "step": 6642, + "time_per_iteration": 2.656057596206665 + }, + { + "auxiliary_loss_clip": 0.01106418, + "auxiliary_loss_mlp": 0.01034072, + "balance_loss_clip": 1.04871511, + "balance_loss_mlp": 1.0196898, + "epoch": 0.3993987674733203, + "flos": 23038491012480.0, + "grad_norm": 2.1744041926742788, + "language_loss": 0.72387367, + "learning_rate": 2.731372550178393e-06, + "loss": 0.7452786, + "num_input_tokens_seen": 142694595, + "step": 6643, + "time_per_iteration": 2.680995225906372 + }, + { + "auxiliary_loss_clip": 0.01123145, + "auxiliary_loss_mlp": 0.01040337, + "balance_loss_clip": 1.04840899, + "balance_loss_mlp": 1.02565074, + "epoch": 0.3994588907259883, + "flos": 19390828970880.0, + "grad_norm": 1.7059817149479597, + "language_loss": 0.6665355, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.68817025, + "num_input_tokens_seen": 142714175, + "step": 6644, + "time_per_iteration": 2.6378324031829834 + }, + { + "auxiliary_loss_clip": 0.01130779, + "auxiliary_loss_mlp": 0.0103839, + "balance_loss_clip": 1.04629064, + "balance_loss_mlp": 1.02349472, + "epoch": 0.39951901397865625, + "flos": 13734395758080.0, + "grad_norm": 2.1425296608964937, + "language_loss": 0.78164649, + "learning_rate": 2.730647521020907e-06, + "loss": 0.80333817, + "num_input_tokens_seen": 142730955, + "step": 6645, + "time_per_iteration": 2.6268746852874756 + }, + { + "auxiliary_loss_clip": 0.0112116, + "auxiliary_loss_mlp": 0.01037104, + "balance_loss_clip": 1.04624033, + "balance_loss_mlp": 1.02252507, + "epoch": 0.3995791372313242, + "flos": 23586451966080.0, + "grad_norm": 1.7492924628724136, + "language_loss": 0.69861412, + "learning_rate": 2.73028496487595e-06, + "loss": 0.72019678, + "num_input_tokens_seen": 142751200, + "step": 6646, + "time_per_iteration": 2.7350409030914307 + }, + { + "auxiliary_loss_clip": 0.0107684, + "auxiliary_loss_mlp": 0.01037342, + "balance_loss_clip": 1.03799927, + "balance_loss_mlp": 1.02223825, + "epoch": 0.3996392604839922, + "flos": 21355896268800.0, + "grad_norm": 1.7623657715359762, + "language_loss": 0.72017872, + "learning_rate": 2.729922381038513e-06, + "loss": 0.74132061, + "num_input_tokens_seen": 142770170, + "step": 6647, + "time_per_iteration": 2.7607529163360596 + }, + { + "auxiliary_loss_clip": 0.01093143, + "auxiliary_loss_mlp": 0.01043089, + "balance_loss_clip": 1.04529011, + "balance_loss_mlp": 1.02973795, + "epoch": 0.39969938373666014, + "flos": 26032255914240.0, + "grad_norm": 1.4563496549616326, + "language_loss": 0.74217343, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.7635358, + "num_input_tokens_seen": 142792680, + "step": 6648, + "time_per_iteration": 2.8048219680786133 + }, + { + "auxiliary_loss_clip": 0.01133606, + "auxiliary_loss_mlp": 0.01037616, + "balance_loss_clip": 1.04912674, + "balance_loss_mlp": 1.02281022, + "epoch": 0.3997595069893281, + "flos": 20116367786880.0, + "grad_norm": 2.0433040752683578, + "language_loss": 0.6589973, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.6807096, + "num_input_tokens_seen": 142810510, + "step": 6649, + "time_per_iteration": 2.6976583003997803 + }, + { + "auxiliary_loss_clip": 0.01103049, + "auxiliary_loss_mlp": 0.01042133, + "balance_loss_clip": 1.04713392, + "balance_loss_mlp": 1.02803016, + "epoch": 0.39981963024199607, + "flos": 27783403764480.0, + "grad_norm": 1.7319771659085785, + "language_loss": 0.75106388, + "learning_rate": 2.728834463508826e-06, + "loss": 0.77251565, + "num_input_tokens_seen": 142832455, + "step": 6650, + "time_per_iteration": 2.7441325187683105 + }, + { + "auxiliary_loss_clip": 0.01132922, + "auxiliary_loss_mlp": 0.01042591, + "balance_loss_clip": 1.04873252, + "balance_loss_mlp": 1.02803564, + "epoch": 0.39987975349466404, + "flos": 21944436612480.0, + "grad_norm": 1.5673208577322473, + "language_loss": 0.72102094, + "learning_rate": 2.728471769038975e-06, + "loss": 0.74277604, + "num_input_tokens_seen": 142852590, + "step": 6651, + "time_per_iteration": 2.6027066707611084 + }, + { + "auxiliary_loss_clip": 0.01132958, + "auxiliary_loss_mlp": 0.01045235, + "balance_loss_clip": 1.04850328, + "balance_loss_mlp": 1.03093004, + "epoch": 0.39993987674733206, + "flos": 20704405340160.0, + "grad_norm": 1.8492457158027382, + "language_loss": 0.73126423, + "learning_rate": 2.728109046945403e-06, + "loss": 0.75304615, + "num_input_tokens_seen": 142870595, + "step": 6652, + "time_per_iteration": 2.5880327224731445 + }, + { + "auxiliary_loss_clip": 0.01029168, + "auxiliary_loss_mlp": 0.01002764, + "balance_loss_clip": 1.02822125, + "balance_loss_mlp": 1.00134552, + "epoch": 0.4, + "flos": 61525429862400.0, + "grad_norm": 0.8458278780382239, + "language_loss": 0.60614997, + "learning_rate": 2.727746297241862e-06, + "loss": 0.62646931, + "num_input_tokens_seen": 142925805, + "step": 6653, + "time_per_iteration": 3.1626622676849365 + }, + { + "auxiliary_loss_clip": 0.01093219, + "auxiliary_loss_mlp": 0.01039197, + "balance_loss_clip": 1.04810715, + "balance_loss_mlp": 1.02577376, + "epoch": 0.400060123252668, + "flos": 14502309644160.0, + "grad_norm": 3.0617453661279788, + "language_loss": 0.66701174, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.6883359, + "num_input_tokens_seen": 142943145, + "step": 6654, + "time_per_iteration": 2.696179151535034 + }, + { + "auxiliary_loss_clip": 0.01119303, + "auxiliary_loss_mlp": 0.01043738, + "balance_loss_clip": 1.04738593, + "balance_loss_mlp": 1.03145993, + "epoch": 0.40012024650533595, + "flos": 19093308618240.0, + "grad_norm": 2.461149956206156, + "language_loss": 0.89818919, + "learning_rate": 2.7270207150599e-06, + "loss": 0.91981959, + "num_input_tokens_seen": 142956925, + "step": 6655, + "time_per_iteration": 2.601891279220581 + }, + { + "auxiliary_loss_clip": 0.01100614, + "auxiliary_loss_mlp": 0.01040322, + "balance_loss_clip": 1.04367936, + "balance_loss_mlp": 1.02693462, + "epoch": 0.4001803697580039, + "flos": 29351012094720.0, + "grad_norm": 1.7913118709828861, + "language_loss": 0.73551166, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.75692105, + "num_input_tokens_seen": 142978040, + "step": 6656, + "time_per_iteration": 2.705662727355957 + }, + { + "auxiliary_loss_clip": 0.01131953, + "auxiliary_loss_mlp": 0.01046856, + "balance_loss_clip": 1.04838896, + "balance_loss_mlp": 1.03224063, + "epoch": 0.4002404930106719, + "flos": 20920048640640.0, + "grad_norm": 1.6512050463613386, + "language_loss": 0.73344004, + "learning_rate": 2.726295022603144e-06, + "loss": 0.75522816, + "num_input_tokens_seen": 142998390, + "step": 6657, + "time_per_iteration": 2.7595558166503906 + }, + { + "auxiliary_loss_clip": 0.0113267, + "auxiliary_loss_mlp": 0.01046679, + "balance_loss_clip": 1.04887247, + "balance_loss_mlp": 1.03145635, + "epoch": 0.40030061626333985, + "flos": 28405735827840.0, + "grad_norm": 1.7318374723338787, + "language_loss": 0.79715288, + "learning_rate": 2.725932135056117e-06, + "loss": 0.81894636, + "num_input_tokens_seen": 143021505, + "step": 6658, + "time_per_iteration": 2.6718270778656006 + }, + { + "auxiliary_loss_clip": 0.01115521, + "auxiliary_loss_mlp": 0.01042275, + "balance_loss_clip": 1.04249525, + "balance_loss_mlp": 1.02865553, + "epoch": 0.4003607395160078, + "flos": 25921615046400.0, + "grad_norm": 2.0999446343296317, + "language_loss": 0.77464151, + "learning_rate": 2.72556921998167e-06, + "loss": 0.79621947, + "num_input_tokens_seen": 143041375, + "step": 6659, + "time_per_iteration": 2.7160539627075195 + }, + { + "auxiliary_loss_clip": 0.01118822, + "auxiliary_loss_mlp": 0.01028418, + "balance_loss_clip": 1.04276848, + "balance_loss_mlp": 1.01649117, + "epoch": 0.4004208627686758, + "flos": 20768648814720.0, + "grad_norm": 1.6781351315554156, + "language_loss": 0.72410327, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.74557567, + "num_input_tokens_seen": 143058725, + "step": 6660, + "time_per_iteration": 2.636833429336548 + }, + { + "auxiliary_loss_clip": 0.01101229, + "auxiliary_loss_mlp": 0.01041317, + "balance_loss_clip": 1.04196119, + "balance_loss_mlp": 1.02828765, + "epoch": 0.40048098602134374, + "flos": 24681224638080.0, + "grad_norm": 1.813091564393644, + "language_loss": 0.71008015, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.73150557, + "num_input_tokens_seen": 143076995, + "step": 6661, + "time_per_iteration": 2.6956517696380615 + }, + { + "auxiliary_loss_clip": 0.0113437, + "auxiliary_loss_mlp": 0.01042051, + "balance_loss_clip": 1.0506804, + "balance_loss_mlp": 1.02825832, + "epoch": 0.4005411092740117, + "flos": 23185688947200.0, + "grad_norm": 1.8086148623568068, + "language_loss": 0.75526643, + "learning_rate": 2.724480309731437e-06, + "loss": 0.77703071, + "num_input_tokens_seen": 143096780, + "step": 6662, + "time_per_iteration": 2.6232621669769287 + }, + { + "auxiliary_loss_clip": 0.01115634, + "auxiliary_loss_mlp": 0.01036997, + "balance_loss_clip": 1.04385805, + "balance_loss_mlp": 1.02194118, + "epoch": 0.4006012325266797, + "flos": 17522324409600.0, + "grad_norm": 2.00646115239694, + "language_loss": 0.66450548, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.68603182, + "num_input_tokens_seen": 143112590, + "step": 6663, + "time_per_iteration": 2.622520923614502 + }, + { + "auxiliary_loss_clip": 0.01112804, + "auxiliary_loss_mlp": 0.01042686, + "balance_loss_clip": 1.04327071, + "balance_loss_mlp": 1.02767718, + "epoch": 0.40066135577934764, + "flos": 19857200181120.0, + "grad_norm": 2.069962140682172, + "language_loss": 0.86383915, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.88539398, + "num_input_tokens_seen": 143130220, + "step": 6664, + "time_per_iteration": 2.575124979019165 + }, + { + "auxiliary_loss_clip": 0.01119355, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.04696763, + "balance_loss_mlp": 1.0227679, + "epoch": 0.40072147903201566, + "flos": 18150007599360.0, + "grad_norm": 16.441358853078547, + "language_loss": 0.84723455, + "learning_rate": 2.723391152229917e-06, + "loss": 0.86879396, + "num_input_tokens_seen": 143147160, + "step": 6665, + "time_per_iteration": 2.671715259552002 + }, + { + "auxiliary_loss_clip": 0.01119739, + "auxiliary_loss_mlp": 0.01037355, + "balance_loss_clip": 1.04976356, + "balance_loss_mlp": 1.02249575, + "epoch": 0.4007816022846836, + "flos": 18661267831680.0, + "grad_norm": 1.8896907519127706, + "language_loss": 0.78118432, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.80275524, + "num_input_tokens_seen": 143164605, + "step": 6666, + "time_per_iteration": 2.606566905975342 + }, + { + "auxiliary_loss_clip": 0.01120664, + "auxiliary_loss_mlp": 0.01038255, + "balance_loss_clip": 1.0485028, + "balance_loss_mlp": 1.02380657, + "epoch": 0.4008417255373516, + "flos": 25703170485120.0, + "grad_norm": 1.7955817814438895, + "language_loss": 0.73301423, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.75460339, + "num_input_tokens_seen": 143183965, + "step": 6667, + "time_per_iteration": 2.652503490447998 + }, + { + "auxiliary_loss_clip": 0.0111465, + "auxiliary_loss_mlp": 0.01054818, + "balance_loss_clip": 1.04516435, + "balance_loss_mlp": 1.03899896, + "epoch": 0.40090184879001955, + "flos": 22858614679680.0, + "grad_norm": 1.708550182183753, + "language_loss": 0.76022822, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.78192288, + "num_input_tokens_seen": 143204965, + "step": 6668, + "time_per_iteration": 2.6797566413879395 + }, + { + "auxiliary_loss_clip": 0.01096645, + "auxiliary_loss_mlp": 0.01046849, + "balance_loss_clip": 1.04792619, + "balance_loss_mlp": 1.0321629, + "epoch": 0.4009619720426875, + "flos": 29059848449280.0, + "grad_norm": 2.335244314112793, + "language_loss": 0.8221435, + "learning_rate": 2.721938558257248e-06, + "loss": 0.84357846, + "num_input_tokens_seen": 143225015, + "step": 6669, + "time_per_iteration": 2.7661361694335938 + }, + { + "auxiliary_loss_clip": 0.010311, + "auxiliary_loss_mlp": 0.01009516, + "balance_loss_clip": 1.02684975, + "balance_loss_mlp": 1.00805604, + "epoch": 0.4010220952953555, + "flos": 66059763131520.0, + "grad_norm": 0.69994773813092, + "language_loss": 0.53312683, + "learning_rate": 2.721575341289695e-06, + "loss": 0.55353302, + "num_input_tokens_seen": 143294925, + "step": 6670, + "time_per_iteration": 3.5547046661376953 + }, + { + "auxiliary_loss_clip": 0.01083638, + "auxiliary_loss_mlp": 0.01041448, + "balance_loss_clip": 1.04546833, + "balance_loss_mlp": 1.02720881, + "epoch": 0.40108221854802345, + "flos": 29642822184960.0, + "grad_norm": 1.626307597556219, + "language_loss": 0.88544351, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.90669441, + "num_input_tokens_seen": 143314170, + "step": 6671, + "time_per_iteration": 2.9112329483032227 + }, + { + "auxiliary_loss_clip": 0.01119533, + "auxiliary_loss_mlp": 0.01036462, + "balance_loss_clip": 1.04568124, + "balance_loss_mlp": 1.02137589, + "epoch": 0.4011423418006914, + "flos": 19929560129280.0, + "grad_norm": 3.0264857347014993, + "language_loss": 0.79105932, + "learning_rate": 2.720848825281736e-06, + "loss": 0.81261927, + "num_input_tokens_seen": 143330050, + "step": 6672, + "time_per_iteration": 2.789889335632324 + }, + { + "auxiliary_loss_clip": 0.01096186, + "auxiliary_loss_mlp": 0.01045513, + "balance_loss_clip": 1.04610085, + "balance_loss_mlp": 1.03012288, + "epoch": 0.4012024650533594, + "flos": 20084299920000.0, + "grad_norm": 4.192283777131793, + "language_loss": 0.6293034, + "learning_rate": 2.72048552626888e-06, + "loss": 0.65072036, + "num_input_tokens_seen": 143348650, + "step": 6673, + "time_per_iteration": 2.796834945678711 + }, + { + "auxiliary_loss_clip": 0.011055, + "auxiliary_loss_mlp": 0.00771502, + "balance_loss_clip": 1.04474831, + "balance_loss_mlp": 1.00076985, + "epoch": 0.40126258830602735, + "flos": 21695719864320.0, + "grad_norm": 1.5776272245666931, + "language_loss": 0.79948354, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.81825352, + "num_input_tokens_seen": 143370275, + "step": 6674, + "time_per_iteration": 4.298279523849487 + }, + { + "auxiliary_loss_clip": 0.0108893, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_clip": 1.04919565, + "balance_loss_mlp": 1.02610552, + "epoch": 0.4013227115586953, + "flos": 12020379592320.0, + "grad_norm": 6.494329221896898, + "language_loss": 0.82218468, + "learning_rate": 2.719758846294294e-06, + "loss": 0.84348273, + "num_input_tokens_seen": 143385390, + "step": 6675, + "time_per_iteration": 2.7607553005218506 + }, + { + "auxiliary_loss_clip": 0.01116053, + "auxiliary_loss_mlp": 0.01038994, + "balance_loss_clip": 1.04261947, + "balance_loss_mlp": 1.02364039, + "epoch": 0.4013828348113633, + "flos": 25447522412160.0, + "grad_norm": 2.205024073964141, + "language_loss": 0.93500578, + "learning_rate": 2.71939546536012e-06, + "loss": 0.95655626, + "num_input_tokens_seen": 143404215, + "step": 6676, + "time_per_iteration": 5.81420373916626 + }, + { + "auxiliary_loss_clip": 0.01126662, + "auxiliary_loss_mlp": 0.01041717, + "balance_loss_clip": 1.04832482, + "balance_loss_mlp": 1.02589226, + "epoch": 0.40144295806403124, + "flos": 18582946225920.0, + "grad_norm": 2.1287377468959727, + "language_loss": 0.79300511, + "learning_rate": 2.719032057146399e-06, + "loss": 0.81468892, + "num_input_tokens_seen": 143422245, + "step": 6677, + "time_per_iteration": 2.6485939025878906 + }, + { + "auxiliary_loss_clip": 0.01107812, + "auxiliary_loss_mlp": 0.01035756, + "balance_loss_clip": 1.04743207, + "balance_loss_mlp": 1.02122426, + "epoch": 0.4015030813166992, + "flos": 22930220442240.0, + "grad_norm": 2.404301700652251, + "language_loss": 0.83507645, + "learning_rate": 2.71866862166691e-06, + "loss": 0.85651207, + "num_input_tokens_seen": 143443130, + "step": 6678, + "time_per_iteration": 2.749229907989502 + }, + { + "auxiliary_loss_clip": 0.01127798, + "auxiliary_loss_mlp": 0.01039278, + "balance_loss_clip": 1.04660463, + "balance_loss_mlp": 1.02481759, + "epoch": 0.4015632045693672, + "flos": 20595057361920.0, + "grad_norm": 2.137342142944676, + "language_loss": 0.63547456, + "learning_rate": 2.718305158935434e-06, + "loss": 0.65714526, + "num_input_tokens_seen": 143461385, + "step": 6679, + "time_per_iteration": 4.272741794586182 + }, + { + "auxiliary_loss_clip": 0.01100371, + "auxiliary_loss_mlp": 0.01032462, + "balance_loss_clip": 1.04277802, + "balance_loss_mlp": 1.01852596, + "epoch": 0.4016233278220352, + "flos": 23438930808960.0, + "grad_norm": 2.2420809209281582, + "language_loss": 0.78955674, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.81088501, + "num_input_tokens_seen": 143481750, + "step": 6680, + "time_per_iteration": 2.6541543006896973 + }, + { + "auxiliary_loss_clip": 0.01099744, + "auxiliary_loss_mlp": 0.00773185, + "balance_loss_clip": 1.04565692, + "balance_loss_mlp": 1.0009259, + "epoch": 0.40168345107470316, + "flos": 21431057477760.0, + "grad_norm": 1.5474671150398438, + "language_loss": 0.75901389, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.77774316, + "num_input_tokens_seen": 143501540, + "step": 6681, + "time_per_iteration": 2.747549295425415 + }, + { + "auxiliary_loss_clip": 0.01092334, + "auxiliary_loss_mlp": 0.01031295, + "balance_loss_clip": 1.04743123, + "balance_loss_mlp": 1.01728785, + "epoch": 0.4017435743273711, + "flos": 22857214049280.0, + "grad_norm": 1.9537198932922564, + "language_loss": 0.64593118, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.66716748, + "num_input_tokens_seen": 143520530, + "step": 6682, + "time_per_iteration": 2.764676094055176 + }, + { + "auxiliary_loss_clip": 0.01084656, + "auxiliary_loss_mlp": 0.01040039, + "balance_loss_clip": 1.04031992, + "balance_loss_mlp": 1.025424, + "epoch": 0.4018036975800391, + "flos": 28622312881920.0, + "grad_norm": 8.033606907615594, + "language_loss": 0.72794902, + "learning_rate": 2.716851035765337e-06, + "loss": 0.74919599, + "num_input_tokens_seen": 143540210, + "step": 6683, + "time_per_iteration": 2.9106507301330566 + }, + { + "auxiliary_loss_clip": 0.01116481, + "auxiliary_loss_mlp": 0.01043307, + "balance_loss_clip": 1.04472065, + "balance_loss_mlp": 1.02844119, + "epoch": 0.40186382083270705, + "flos": 26651212099200.0, + "grad_norm": 1.6079104273266733, + "language_loss": 0.73560667, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.75720453, + "num_input_tokens_seen": 143560940, + "step": 6684, + "time_per_iteration": 2.814746141433716 + }, + { + "auxiliary_loss_clip": 0.01038178, + "auxiliary_loss_mlp": 0.01003165, + "balance_loss_clip": 1.02248073, + "balance_loss_mlp": 1.00177026, + "epoch": 0.401923944085375, + "flos": 59259969123840.0, + "grad_norm": 0.8040960642815781, + "language_loss": 0.6037817, + "learning_rate": 2.716123811026767e-06, + "loss": 0.6241951, + "num_input_tokens_seen": 143624015, + "step": 6685, + "time_per_iteration": 3.3159523010253906 + }, + { + "auxiliary_loss_clip": 0.01121727, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.04626095, + "balance_loss_mlp": 1.01806533, + "epoch": 0.401984067338043, + "flos": 16982803152000.0, + "grad_norm": 2.1640557725493563, + "language_loss": 0.69947135, + "learning_rate": 2.715760157917357e-06, + "loss": 0.7210151, + "num_input_tokens_seen": 143642750, + "step": 6686, + "time_per_iteration": 2.7339890003204346 + }, + { + "auxiliary_loss_clip": 0.01109024, + "auxiliary_loss_mlp": 0.01036336, + "balance_loss_clip": 1.04641056, + "balance_loss_mlp": 1.02213836, + "epoch": 0.40204419059071095, + "flos": 24972496024320.0, + "grad_norm": 1.482832144271372, + "language_loss": 0.74904519, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.77049881, + "num_input_tokens_seen": 143664515, + "step": 6687, + "time_per_iteration": 2.7403111457824707 + }, + { + "auxiliary_loss_clip": 0.01110823, + "auxiliary_loss_mlp": 0.01036282, + "balance_loss_clip": 1.04890549, + "balance_loss_mlp": 1.02179182, + "epoch": 0.4021043138433789, + "flos": 23477463123840.0, + "grad_norm": 1.9109413621033529, + "language_loss": 0.71165651, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.73312759, + "num_input_tokens_seen": 143683135, + "step": 6688, + "time_per_iteration": 2.7349321842193604 + }, + { + "auxiliary_loss_clip": 0.01105847, + "auxiliary_loss_mlp": 0.01043979, + "balance_loss_clip": 1.0426929, + "balance_loss_mlp": 1.02785039, + "epoch": 0.4021644370960469, + "flos": 25995806588160.0, + "grad_norm": 2.0144045301965248, + "language_loss": 0.64289308, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.66439128, + "num_input_tokens_seen": 143703985, + "step": 6689, + "time_per_iteration": 2.740938186645508 + }, + { + "auxiliary_loss_clip": 0.0112261, + "auxiliary_loss_mlp": 0.01032805, + "balance_loss_clip": 1.04519129, + "balance_loss_mlp": 1.01838636, + "epoch": 0.40222456034871484, + "flos": 13587987922560.0, + "grad_norm": 2.8658666003554147, + "language_loss": 0.7358911, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.75744528, + "num_input_tokens_seen": 143719245, + "step": 6690, + "time_per_iteration": 2.622920513153076 + }, + { + "auxiliary_loss_clip": 0.01099316, + "auxiliary_loss_mlp": 0.01037502, + "balance_loss_clip": 1.04444623, + "balance_loss_mlp": 1.0230422, + "epoch": 0.4022846836013828, + "flos": 24278019494400.0, + "grad_norm": 1.7112869735009542, + "language_loss": 0.74805617, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.76942438, + "num_input_tokens_seen": 143739575, + "step": 6691, + "time_per_iteration": 2.704138994216919 + }, + { + "auxiliary_loss_clip": 0.0111344, + "auxiliary_loss_mlp": 0.01040266, + "balance_loss_clip": 1.0485332, + "balance_loss_mlp": 1.02509689, + "epoch": 0.40234480685405083, + "flos": 20151596050560.0, + "grad_norm": 1.5633314974955987, + "language_loss": 0.7267946, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.74833167, + "num_input_tokens_seen": 143758515, + "step": 6692, + "time_per_iteration": 2.6782071590423584 + }, + { + "auxiliary_loss_clip": 0.01081716, + "auxiliary_loss_mlp": 0.0103731, + "balance_loss_clip": 1.04122448, + "balance_loss_mlp": 1.02274227, + "epoch": 0.4024049301067188, + "flos": 22930220442240.0, + "grad_norm": 2.743543242099247, + "language_loss": 0.84403068, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.8652209, + "num_input_tokens_seen": 143776770, + "step": 6693, + "time_per_iteration": 2.746689558029175 + }, + { + "auxiliary_loss_clip": 0.01092043, + "auxiliary_loss_mlp": 0.0104876, + "balance_loss_clip": 1.04803836, + "balance_loss_mlp": 1.03265464, + "epoch": 0.40246505335938676, + "flos": 36028421487360.0, + "grad_norm": 2.4363636021200716, + "language_loss": 0.70996636, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.73137438, + "num_input_tokens_seen": 143798450, + "step": 6694, + "time_per_iteration": 2.8071961402893066 + }, + { + "auxiliary_loss_clip": 0.01104186, + "auxiliary_loss_mlp": 0.01044295, + "balance_loss_clip": 1.04619551, + "balance_loss_mlp": 1.0292511, + "epoch": 0.4025251766120547, + "flos": 20594303176320.0, + "grad_norm": 2.4336892369471976, + "language_loss": 0.67823637, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.6997211, + "num_input_tokens_seen": 143816995, + "step": 6695, + "time_per_iteration": 2.628509283065796 + }, + { + "auxiliary_loss_clip": 0.01100807, + "auxiliary_loss_mlp": 0.01043105, + "balance_loss_clip": 1.04269171, + "balance_loss_mlp": 1.0272975, + "epoch": 0.4025852998647227, + "flos": 64523932381440.0, + "grad_norm": 2.090135381279502, + "language_loss": 0.79316044, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.81459951, + "num_input_tokens_seen": 143842090, + "step": 6696, + "time_per_iteration": 3.065619707107544 + }, + { + "auxiliary_loss_clip": 0.01107424, + "auxiliary_loss_mlp": 0.0105453, + "balance_loss_clip": 1.04772997, + "balance_loss_mlp": 1.03700638, + "epoch": 0.40264542311739066, + "flos": 20886292834560.0, + "grad_norm": 2.0469796766510164, + "language_loss": 0.71048194, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.73210156, + "num_input_tokens_seen": 143860800, + "step": 6697, + "time_per_iteration": 2.732112169265747 + }, + { + "auxiliary_loss_clip": 0.01119834, + "auxiliary_loss_mlp": 0.01045848, + "balance_loss_clip": 1.04644823, + "balance_loss_mlp": 1.03167999, + "epoch": 0.4027055463700586, + "flos": 26250197685120.0, + "grad_norm": 2.1595912992700725, + "language_loss": 0.6184175, + "learning_rate": 2.711394207496984e-06, + "loss": 0.64007437, + "num_input_tokens_seen": 143878950, + "step": 6698, + "time_per_iteration": 2.6853909492492676 + }, + { + "auxiliary_loss_clip": 0.01122685, + "auxiliary_loss_mlp": 0.01038255, + "balance_loss_clip": 1.04787982, + "balance_loss_mlp": 1.02309155, + "epoch": 0.4027656696227266, + "flos": 20631398947200.0, + "grad_norm": 2.043260848719272, + "language_loss": 0.76455128, + "learning_rate": 2.711030202621491e-06, + "loss": 0.78616071, + "num_input_tokens_seen": 143898385, + "step": 6699, + "time_per_iteration": 2.6033456325531006 + }, + { + "auxiliary_loss_clip": 0.01093615, + "auxiliary_loss_mlp": 0.01030999, + "balance_loss_clip": 1.04446507, + "balance_loss_mlp": 1.01700354, + "epoch": 0.40282579287539455, + "flos": 22346277039360.0, + "grad_norm": 1.6890007857677205, + "language_loss": 0.80442715, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.82567334, + "num_input_tokens_seen": 143918795, + "step": 6700, + "time_per_iteration": 2.777510404586792 + }, + { + "auxiliary_loss_clip": 0.01112643, + "auxiliary_loss_mlp": 0.01045016, + "balance_loss_clip": 1.04943717, + "balance_loss_mlp": 1.02808821, + "epoch": 0.4028859161280625, + "flos": 29274988959360.0, + "grad_norm": 2.176323872107602, + "language_loss": 0.74529326, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.7668699, + "num_input_tokens_seen": 143938245, + "step": 6701, + "time_per_iteration": 2.7424893379211426 + }, + { + "auxiliary_loss_clip": 0.01099003, + "auxiliary_loss_mlp": 0.01037912, + "balance_loss_clip": 1.04379773, + "balance_loss_mlp": 1.02355886, + "epoch": 0.4029460393807305, + "flos": 28622312881920.0, + "grad_norm": 1.8130604516939894, + "language_loss": 0.66064012, + "learning_rate": 2.709938026276208e-06, + "loss": 0.68200922, + "num_input_tokens_seen": 143960995, + "step": 6702, + "time_per_iteration": 2.7448410987854004 + }, + { + "auxiliary_loss_clip": 0.01105222, + "auxiliary_loss_mlp": 0.01045108, + "balance_loss_clip": 1.0470736, + "balance_loss_mlp": 1.02900267, + "epoch": 0.40300616263339845, + "flos": 22601925112320.0, + "grad_norm": 1.86356350955038, + "language_loss": 0.66031915, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.68182242, + "num_input_tokens_seen": 143979910, + "step": 6703, + "time_per_iteration": 2.679979085922241 + }, + { + "auxiliary_loss_clip": 0.01060539, + "auxiliary_loss_mlp": 0.01041946, + "balance_loss_clip": 1.04386449, + "balance_loss_mlp": 1.02445817, + "epoch": 0.4030662858860664, + "flos": 25520313323520.0, + "grad_norm": 2.0398618821746304, + "language_loss": 0.82689512, + "learning_rate": 2.709209774085071e-06, + "loss": 0.84792, + "num_input_tokens_seen": 144000095, + "step": 6704, + "time_per_iteration": 2.9296765327453613 + }, + { + "auxiliary_loss_clip": 0.01112771, + "auxiliary_loss_mlp": 0.01040131, + "balance_loss_clip": 1.04960763, + "balance_loss_mlp": 1.02517009, + "epoch": 0.40312640913873443, + "flos": 23586703361280.0, + "grad_norm": 1.6638111373196858, + "language_loss": 0.73759186, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.75912088, + "num_input_tokens_seen": 144019695, + "step": 6705, + "time_per_iteration": 3.0039970874786377 + }, + { + "auxiliary_loss_clip": 0.0111798, + "auxiliary_loss_mlp": 0.01039471, + "balance_loss_clip": 1.04735386, + "balance_loss_mlp": 1.02541077, + "epoch": 0.4031865323914024, + "flos": 20011042131840.0, + "grad_norm": 1.7718881662691552, + "language_loss": 0.65816283, + "learning_rate": 2.708481414320713e-06, + "loss": 0.67973745, + "num_input_tokens_seen": 144038525, + "step": 6706, + "time_per_iteration": 2.6920299530029297 + }, + { + "auxiliary_loss_clip": 0.01123098, + "auxiliary_loss_mlp": 0.01039977, + "balance_loss_clip": 1.05084229, + "balance_loss_mlp": 1.02508759, + "epoch": 0.40324665564407036, + "flos": 21871430219520.0, + "grad_norm": 1.5916886093016338, + "language_loss": 0.71493578, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.73656654, + "num_input_tokens_seen": 144059485, + "step": 6707, + "time_per_iteration": 2.6424286365509033 + }, + { + "auxiliary_loss_clip": 0.01104664, + "auxiliary_loss_mlp": 0.0103554, + "balance_loss_clip": 1.04652226, + "balance_loss_mlp": 1.0201261, + "epoch": 0.4033067788967383, + "flos": 23878728933120.0, + "grad_norm": 1.6049010195706548, + "language_loss": 0.79860801, + "learning_rate": 2.707752947093611e-06, + "loss": 0.82001007, + "num_input_tokens_seen": 144080265, + "step": 6708, + "time_per_iteration": 2.7476210594177246 + }, + { + "auxiliary_loss_clip": 0.01081311, + "auxiliary_loss_mlp": 0.01041497, + "balance_loss_clip": 1.04192591, + "balance_loss_mlp": 1.0254873, + "epoch": 0.4033669021494063, + "flos": 17419907756160.0, + "grad_norm": 2.2092970812397823, + "language_loss": 0.82527256, + "learning_rate": 2.70738867321606e-06, + "loss": 0.84650064, + "num_input_tokens_seen": 144098040, + "step": 6709, + "time_per_iteration": 2.6981422901153564 + }, + { + "auxiliary_loss_clip": 0.01126319, + "auxiliary_loss_mlp": 0.01037264, + "balance_loss_clip": 1.052701, + "balance_loss_mlp": 1.02168322, + "epoch": 0.40342702540207426, + "flos": 29600554855680.0, + "grad_norm": 3.462855853005799, + "language_loss": 0.71349508, + "learning_rate": 2.70702437251426e-06, + "loss": 0.73513091, + "num_input_tokens_seen": 144118265, + "step": 6710, + "time_per_iteration": 2.745234727859497 + }, + { + "auxiliary_loss_clip": 0.01100277, + "auxiliary_loss_mlp": 0.01040518, + "balance_loss_clip": 1.0461812, + "balance_loss_mlp": 1.02506852, + "epoch": 0.4034871486547422, + "flos": 11284605400320.0, + "grad_norm": 2.0008015592173285, + "language_loss": 0.8497777, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.8711856, + "num_input_tokens_seen": 144133865, + "step": 6711, + "time_per_iteration": 2.6388518810272217 + }, + { + "auxiliary_loss_clip": 0.01124865, + "auxiliary_loss_mlp": 0.01037377, + "balance_loss_clip": 1.04873466, + "balance_loss_mlp": 1.02192783, + "epoch": 0.4035472719074102, + "flos": 15552839738880.0, + "grad_norm": 1.9288958482484087, + "language_loss": 0.76210845, + "learning_rate": 2.706295690693168e-06, + "loss": 0.78373086, + "num_input_tokens_seen": 144150125, + "step": 6712, + "time_per_iteration": 2.617612838745117 + }, + { + "auxiliary_loss_clip": 0.0110296, + "auxiliary_loss_mlp": 0.01042297, + "balance_loss_clip": 1.0465771, + "balance_loss_mlp": 1.02682328, + "epoch": 0.40360739516007815, + "flos": 24674365140480.0, + "grad_norm": 2.8401310029686284, + "language_loss": 0.79334903, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.81480157, + "num_input_tokens_seen": 144169295, + "step": 6713, + "time_per_iteration": 4.2229533195495605 + }, + { + "auxiliary_loss_clip": 0.01096327, + "auxiliary_loss_mlp": 0.01040909, + "balance_loss_clip": 1.04259837, + "balance_loss_mlp": 1.02437484, + "epoch": 0.4036675184127461, + "flos": 17304095329920.0, + "grad_norm": 2.4269881355691867, + "language_loss": 0.88230258, + "learning_rate": 2.705566901740865e-06, + "loss": 0.90367496, + "num_input_tokens_seen": 144185790, + "step": 6714, + "time_per_iteration": 2.6861040592193604 + }, + { + "auxiliary_loss_clip": 0.0112277, + "auxiliary_loss_mlp": 0.01042461, + "balance_loss_clip": 1.04913116, + "balance_loss_mlp": 1.02755439, + "epoch": 0.4037276416654141, + "flos": 19864023765120.0, + "grad_norm": 1.685218394347131, + "language_loss": 0.69355965, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.71521199, + "num_input_tokens_seen": 144205190, + "step": 6715, + "time_per_iteration": 6.05805778503418 + }, + { + "auxiliary_loss_clip": 0.01085368, + "auxiliary_loss_mlp": 0.01039127, + "balance_loss_clip": 1.03982067, + "balance_loss_mlp": 1.02422547, + "epoch": 0.40378776491808205, + "flos": 18296271780480.0, + "grad_norm": 2.4042590138214543, + "language_loss": 0.7738142, + "learning_rate": 2.704838005767892e-06, + "loss": 0.7950592, + "num_input_tokens_seen": 144222705, + "step": 6716, + "time_per_iteration": 2.874701738357544 + }, + { + "auxiliary_loss_clip": 0.01084201, + "auxiliary_loss_mlp": 0.01039901, + "balance_loss_clip": 1.04515779, + "balance_loss_mlp": 1.02554834, + "epoch": 0.40384788817075, + "flos": 15049372757760.0, + "grad_norm": 1.8822370621315767, + "language_loss": 0.7590825, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.78032351, + "num_input_tokens_seen": 144239545, + "step": 6717, + "time_per_iteration": 2.806605339050293 + }, + { + "auxiliary_loss_clip": 0.01034573, + "auxiliary_loss_mlp": 0.01006348, + "balance_loss_clip": 1.03120637, + "balance_loss_mlp": 1.00481057, + "epoch": 0.40390801142341803, + "flos": 61929927895680.0, + "grad_norm": 0.9365934623644069, + "language_loss": 0.60732949, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.62773865, + "num_input_tokens_seen": 144288145, + "step": 6718, + "time_per_iteration": 4.683047771453857 + }, + { + "auxiliary_loss_clip": 0.01137275, + "auxiliary_loss_mlp": 0.01039366, + "balance_loss_clip": 1.04942691, + "balance_loss_mlp": 1.02322555, + "epoch": 0.403968134676086, + "flos": 22738779930240.0, + "grad_norm": 2.360676977629441, + "language_loss": 0.74748445, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.76925087, + "num_input_tokens_seen": 144302315, + "step": 6719, + "time_per_iteration": 2.6020865440368652 + }, + { + "auxiliary_loss_clip": 0.01122679, + "auxiliary_loss_mlp": 0.01042794, + "balance_loss_clip": 1.04766619, + "balance_loss_mlp": 1.02643895, + "epoch": 0.40402825792875396, + "flos": 19784409269760.0, + "grad_norm": 2.123342604077105, + "language_loss": 0.81516802, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.83682275, + "num_input_tokens_seen": 144318990, + "step": 6720, + "time_per_iteration": 2.6707048416137695 + }, + { + "auxiliary_loss_clip": 0.01106407, + "auxiliary_loss_mlp": 0.01033364, + "balance_loss_clip": 1.04365981, + "balance_loss_mlp": 1.01866555, + "epoch": 0.40408838118142193, + "flos": 19609273532160.0, + "grad_norm": 2.786601864332057, + "language_loss": 0.77150661, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.79290426, + "num_input_tokens_seen": 144335765, + "step": 6721, + "time_per_iteration": 2.648050546646118 + }, + { + "auxiliary_loss_clip": 0.01091711, + "auxiliary_loss_mlp": 0.0102956, + "balance_loss_clip": 1.04391122, + "balance_loss_mlp": 1.01643503, + "epoch": 0.4041485044340899, + "flos": 24426043441920.0, + "grad_norm": 2.012609049395132, + "language_loss": 0.72214961, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.74336231, + "num_input_tokens_seen": 144355825, + "step": 6722, + "time_per_iteration": 2.7598764896392822 + }, + { + "auxiliary_loss_clip": 0.01117849, + "auxiliary_loss_mlp": 0.01035294, + "balance_loss_clip": 1.04649532, + "balance_loss_mlp": 1.02137017, + "epoch": 0.40420862768675786, + "flos": 16760192613120.0, + "grad_norm": 2.003025152561758, + "language_loss": 0.66099858, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.68252993, + "num_input_tokens_seen": 144374320, + "step": 6723, + "time_per_iteration": 2.6525375843048096 + }, + { + "auxiliary_loss_clip": 0.0111764, + "auxiliary_loss_mlp": 0.01047962, + "balance_loss_clip": 1.04678059, + "balance_loss_mlp": 1.03247619, + "epoch": 0.4042687509394258, + "flos": 22491571553280.0, + "grad_norm": 1.6479262490520643, + "language_loss": 0.73566139, + "learning_rate": 2.701921353880734e-06, + "loss": 0.75731742, + "num_input_tokens_seen": 144394325, + "step": 6724, + "time_per_iteration": 2.6602234840393066 + }, + { + "auxiliary_loss_clip": 0.01096943, + "auxiliary_loss_mlp": 0.01034012, + "balance_loss_clip": 1.04471684, + "balance_loss_mlp": 1.02009475, + "epoch": 0.4043288741920938, + "flos": 30336149479680.0, + "grad_norm": 1.8514955948130458, + "language_loss": 0.74733102, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.76864064, + "num_input_tokens_seen": 144412765, + "step": 6725, + "time_per_iteration": 2.7086737155914307 + }, + { + "auxiliary_loss_clip": 0.01116531, + "auxiliary_loss_mlp": 0.01035939, + "balance_loss_clip": 1.04757476, + "balance_loss_mlp": 1.02062047, + "epoch": 0.40438899744476176, + "flos": 46348321363200.0, + "grad_norm": 2.3229573968410766, + "language_loss": 0.76987183, + "learning_rate": 2.701191924463126e-06, + "loss": 0.7913965, + "num_input_tokens_seen": 144435400, + "step": 6726, + "time_per_iteration": 2.880244493484497 + }, + { + "auxiliary_loss_clip": 0.01102844, + "auxiliary_loss_mlp": 0.00775301, + "balance_loss_clip": 1.04148483, + "balance_loss_mlp": 1.00105536, + "epoch": 0.4044491206974297, + "flos": 13333524998400.0, + "grad_norm": 2.125548317574291, + "language_loss": 0.8180182, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.83679968, + "num_input_tokens_seen": 144452925, + "step": 6727, + "time_per_iteration": 2.6953587532043457 + }, + { + "auxiliary_loss_clip": 0.01128783, + "auxiliary_loss_mlp": 0.01036901, + "balance_loss_clip": 1.04577255, + "balance_loss_mlp": 1.02264905, + "epoch": 0.4045092439500977, + "flos": 12093745121280.0, + "grad_norm": 2.0701087852414504, + "language_loss": 0.85462439, + "learning_rate": 2.700462388688447e-06, + "loss": 0.87628114, + "num_input_tokens_seen": 144470195, + "step": 6728, + "time_per_iteration": 2.5963056087493896 + }, + { + "auxiliary_loss_clip": 0.01095663, + "auxiliary_loss_mlp": 0.01043865, + "balance_loss_clip": 1.04611719, + "balance_loss_mlp": 1.029351, + "epoch": 0.40456936720276565, + "flos": 21179683123200.0, + "grad_norm": 1.739738235535384, + "language_loss": 0.81606215, + "learning_rate": 2.700097580951786e-06, + "loss": 0.83745748, + "num_input_tokens_seen": 144490320, + "step": 6729, + "time_per_iteration": 2.8157620429992676 + }, + { + "auxiliary_loss_clip": 0.01105665, + "auxiliary_loss_mlp": 0.01043945, + "balance_loss_clip": 1.0443244, + "balance_loss_mlp": 1.02993762, + "epoch": 0.4046294904554336, + "flos": 23915286000000.0, + "grad_norm": 1.917482865643355, + "language_loss": 0.73375344, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.75524956, + "num_input_tokens_seen": 144508990, + "step": 6730, + "time_per_iteration": 2.67053484916687 + }, + { + "auxiliary_loss_clip": 0.01113781, + "auxiliary_loss_mlp": 0.01041271, + "balance_loss_clip": 1.04319108, + "balance_loss_mlp": 1.02674532, + "epoch": 0.4046896137081016, + "flos": 38071235773440.0, + "grad_norm": 2.5953767613834673, + "language_loss": 0.67485142, + "learning_rate": 2.699367885848985e-06, + "loss": 0.69640195, + "num_input_tokens_seen": 144529550, + "step": 6731, + "time_per_iteration": 2.8106632232666016 + }, + { + "auxiliary_loss_clip": 0.01128909, + "auxiliary_loss_mlp": 0.01038653, + "balance_loss_clip": 1.04689097, + "balance_loss_mlp": 1.02531338, + "epoch": 0.4047497369607696, + "flos": 23617262856960.0, + "grad_norm": 1.5691591770044138, + "language_loss": 0.74245793, + "learning_rate": 2.699002998510517e-06, + "loss": 0.76413357, + "num_input_tokens_seen": 144549310, + "step": 6732, + "time_per_iteration": 2.6608641147613525 + }, + { + "auxiliary_loss_clip": 0.0110044, + "auxiliary_loss_mlp": 0.00770096, + "balance_loss_clip": 1.04635525, + "balance_loss_mlp": 1.00099349, + "epoch": 0.40480986021343757, + "flos": 12823593569280.0, + "grad_norm": 1.738611378800115, + "language_loss": 0.77579916, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.79450446, + "num_input_tokens_seen": 144567430, + "step": 6733, + "time_per_iteration": 2.648707151412964 + }, + { + "auxiliary_loss_clip": 0.01102753, + "auxiliary_loss_mlp": 0.01043236, + "balance_loss_clip": 1.04195142, + "balance_loss_mlp": 1.0276798, + "epoch": 0.40486998346610553, + "flos": 23768770423680.0, + "grad_norm": 1.875618790304424, + "language_loss": 0.76887047, + "learning_rate": 2.698273144328627e-06, + "loss": 0.79033035, + "num_input_tokens_seen": 144585975, + "step": 6734, + "time_per_iteration": 2.7222812175750732 + }, + { + "auxiliary_loss_clip": 0.01110956, + "auxiliary_loss_mlp": 0.01032993, + "balance_loss_clip": 1.04893517, + "balance_loss_mlp": 1.01923609, + "epoch": 0.4049301067187735, + "flos": 22856818999680.0, + "grad_norm": 2.463703641644531, + "language_loss": 0.64536786, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.66680741, + "num_input_tokens_seen": 144605225, + "step": 6735, + "time_per_iteration": 2.682111978530884 + }, + { + "auxiliary_loss_clip": 0.01088904, + "auxiliary_loss_mlp": 0.01039113, + "balance_loss_clip": 1.04142201, + "balance_loss_mlp": 1.0247122, + "epoch": 0.40499022997144146, + "flos": 22783992174720.0, + "grad_norm": 1.9621030422141739, + "language_loss": 0.83120507, + "learning_rate": 2.697543184232387e-06, + "loss": 0.85248524, + "num_input_tokens_seen": 144624145, + "step": 6736, + "time_per_iteration": 2.737946033477783 + }, + { + "auxiliary_loss_clip": 0.01103471, + "auxiliary_loss_mlp": 0.00773133, + "balance_loss_clip": 1.04903114, + "balance_loss_mlp": 1.00089931, + "epoch": 0.4050503532241094, + "flos": 23039352938880.0, + "grad_norm": 1.950757015883091, + "language_loss": 0.75173002, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.77049613, + "num_input_tokens_seen": 144644470, + "step": 6737, + "time_per_iteration": 2.7009494304656982 + }, + { + "auxiliary_loss_clip": 0.01119674, + "auxiliary_loss_mlp": 0.01042697, + "balance_loss_clip": 1.04876637, + "balance_loss_mlp": 1.02858806, + "epoch": 0.4051104764767774, + "flos": 16647756065280.0, + "grad_norm": 3.18955375846042, + "language_loss": 0.72142565, + "learning_rate": 2.696813118332519e-06, + "loss": 0.74304938, + "num_input_tokens_seen": 144661055, + "step": 6738, + "time_per_iteration": 2.63269305229187 + }, + { + "auxiliary_loss_clip": 0.01094776, + "auxiliary_loss_mlp": 0.01033432, + "balance_loss_clip": 1.04453516, + "balance_loss_mlp": 1.02065849, + "epoch": 0.40517059972944536, + "flos": 16358962717440.0, + "grad_norm": 1.9585661201522753, + "language_loss": 0.75113159, + "learning_rate": 2.696448045740828e-06, + "loss": 0.77241367, + "num_input_tokens_seen": 144677935, + "step": 6739, + "time_per_iteration": 2.678330421447754 + }, + { + "auxiliary_loss_clip": 0.01092708, + "auxiliary_loss_mlp": 0.01036695, + "balance_loss_clip": 1.04475963, + "balance_loss_mlp": 1.02244925, + "epoch": 0.4052307229821133, + "flos": 28803374363520.0, + "grad_norm": 2.0151914408481066, + "language_loss": 0.73516095, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.75645494, + "num_input_tokens_seen": 144697725, + "step": 6740, + "time_per_iteration": 2.821165084838867 + }, + { + "auxiliary_loss_clip": 0.01111182, + "auxiliary_loss_mlp": 0.01032908, + "balance_loss_clip": 1.04380143, + "balance_loss_mlp": 1.01927674, + "epoch": 0.4052908462347813, + "flos": 21397876289280.0, + "grad_norm": 1.5447802431592257, + "language_loss": 0.77149022, + "learning_rate": 2.695717821343153e-06, + "loss": 0.79293114, + "num_input_tokens_seen": 144718805, + "step": 6741, + "time_per_iteration": 2.639744758605957 + }, + { + "auxiliary_loss_clip": 0.01132474, + "auxiliary_loss_mlp": 0.01039418, + "balance_loss_clip": 1.04797888, + "balance_loss_mlp": 1.02415919, + "epoch": 0.40535096948744925, + "flos": 22419067950720.0, + "grad_norm": 2.3470472177782584, + "language_loss": 0.71132898, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.73304784, + "num_input_tokens_seen": 144737105, + "step": 6742, + "time_per_iteration": 2.566246509552002 + }, + { + "auxiliary_loss_clip": 0.01132445, + "auxiliary_loss_mlp": 0.01031417, + "balance_loss_clip": 1.04941666, + "balance_loss_mlp": 1.01739824, + "epoch": 0.4054110927401172, + "flos": 17010776868480.0, + "grad_norm": 2.3285032794047966, + "language_loss": 0.71915448, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.74079311, + "num_input_tokens_seen": 144751350, + "step": 6743, + "time_per_iteration": 2.7150700092315674 + }, + { + "auxiliary_loss_clip": 0.01109405, + "auxiliary_loss_mlp": 0.01036045, + "balance_loss_clip": 1.04626715, + "balance_loss_mlp": 1.0209291, + "epoch": 0.4054712159927852, + "flos": 21614848392960.0, + "grad_norm": 2.2533363543989053, + "language_loss": 0.70529258, + "learning_rate": 2.694622286918588e-06, + "loss": 0.72674704, + "num_input_tokens_seen": 144770030, + "step": 6744, + "time_per_iteration": 2.715900421142578 + }, + { + "auxiliary_loss_clip": 0.01118115, + "auxiliary_loss_mlp": 0.01036188, + "balance_loss_clip": 1.04826701, + "balance_loss_mlp": 1.02316439, + "epoch": 0.4055313392454532, + "flos": 25812554376960.0, + "grad_norm": 1.8071994834567642, + "language_loss": 0.80102956, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.82257259, + "num_input_tokens_seen": 144790965, + "step": 6745, + "time_per_iteration": 2.6989259719848633 + }, + { + "auxiliary_loss_clip": 0.01108583, + "auxiliary_loss_mlp": 0.01034972, + "balance_loss_clip": 1.04861784, + "balance_loss_mlp": 1.02049959, + "epoch": 0.40559146249812117, + "flos": 14137098111360.0, + "grad_norm": 1.8906308851954157, + "language_loss": 0.66942173, + "learning_rate": 2.693891798911731e-06, + "loss": 0.69085735, + "num_input_tokens_seen": 144807755, + "step": 6746, + "time_per_iteration": 2.7211005687713623 + }, + { + "auxiliary_loss_clip": 0.01092509, + "auxiliary_loss_mlp": 0.01033925, + "balance_loss_clip": 1.04508781, + "balance_loss_mlp": 1.02044201, + "epoch": 0.40565158575078913, + "flos": 41355481962240.0, + "grad_norm": 1.4960206584486848, + "language_loss": 0.57240731, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.59367168, + "num_input_tokens_seen": 144832405, + "step": 6747, + "time_per_iteration": 2.8735926151275635 + }, + { + "auxiliary_loss_clip": 0.0109681, + "auxiliary_loss_mlp": 0.01043537, + "balance_loss_clip": 1.04770565, + "balance_loss_mlp": 1.03084731, + "epoch": 0.4057117090034571, + "flos": 28544529980160.0, + "grad_norm": 1.7545120295248704, + "language_loss": 0.8468259, + "learning_rate": 2.693161205655089e-06, + "loss": 0.86822933, + "num_input_tokens_seen": 144853890, + "step": 6748, + "time_per_iteration": 2.7470786571502686 + }, + { + "auxiliary_loss_clip": 0.01107762, + "auxiliary_loss_mlp": 0.0104113, + "balance_loss_clip": 1.05110598, + "balance_loss_mlp": 1.02695, + "epoch": 0.40577183225612506, + "flos": 18004066640640.0, + "grad_norm": 2.5881063547984398, + "language_loss": 0.81445849, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.83594739, + "num_input_tokens_seen": 144871395, + "step": 6749, + "time_per_iteration": 2.677762746810913 + }, + { + "auxiliary_loss_clip": 0.01119763, + "auxiliary_loss_mlp": 0.00771508, + "balance_loss_clip": 1.04914761, + "balance_loss_mlp": 1.00084698, + "epoch": 0.40583195550879303, + "flos": 19536734016000.0, + "grad_norm": 1.7422987888005266, + "language_loss": 0.75235945, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.77127212, + "num_input_tokens_seen": 144890975, + "step": 6750, + "time_per_iteration": 2.6956052780151367 + }, + { + "auxiliary_loss_clip": 0.0111553, + "auxiliary_loss_mlp": 0.01041156, + "balance_loss_clip": 1.04812646, + "balance_loss_mlp": 1.02654123, + "epoch": 0.405892078761461, + "flos": 22309468577280.0, + "grad_norm": 2.479262216129207, + "language_loss": 0.73942888, + "learning_rate": 2.692065118669195e-06, + "loss": 0.76099575, + "num_input_tokens_seen": 144908170, + "step": 6751, + "time_per_iteration": 2.6845548152923584 + }, + { + "auxiliary_loss_clip": 0.01086462, + "auxiliary_loss_mlp": 0.01042521, + "balance_loss_clip": 1.04832053, + "balance_loss_mlp": 1.02627254, + "epoch": 0.40595220201412896, + "flos": 25484402701440.0, + "grad_norm": 1.7042707146701068, + "language_loss": 0.66690767, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.68819749, + "num_input_tokens_seen": 144928020, + "step": 6752, + "time_per_iteration": 4.372137784957886 + }, + { + "auxiliary_loss_clip": 0.01086822, + "auxiliary_loss_mlp": 0.0104486, + "balance_loss_clip": 1.04698646, + "balance_loss_mlp": 1.02896988, + "epoch": 0.4060123252667969, + "flos": 49856004103680.0, + "grad_norm": 2.0675680438490374, + "language_loss": 0.7062583, + "learning_rate": 2.691334262772948e-06, + "loss": 0.72757506, + "num_input_tokens_seen": 144951240, + "step": 6753, + "time_per_iteration": 2.954685688018799 + }, + { + "auxiliary_loss_clip": 0.0110904, + "auxiliary_loss_mlp": 0.01037036, + "balance_loss_clip": 1.04630709, + "balance_loss_mlp": 1.02162218, + "epoch": 0.4060724485194649, + "flos": 21135476459520.0, + "grad_norm": 1.6674578897026393, + "language_loss": 0.72053552, + "learning_rate": 2.690968795494699e-06, + "loss": 0.74199629, + "num_input_tokens_seen": 144969100, + "step": 6754, + "time_per_iteration": 5.758596420288086 + }, + { + "auxiliary_loss_clip": 0.01097183, + "auxiliary_loss_mlp": 0.01040377, + "balance_loss_clip": 1.04531932, + "balance_loss_mlp": 1.02634573, + "epoch": 0.40613257177213286, + "flos": 21758059918080.0, + "grad_norm": 1.655202233640458, + "language_loss": 0.8301084, + "learning_rate": 2.690603302014844e-06, + "loss": 0.851484, + "num_input_tokens_seen": 144987065, + "step": 6755, + "time_per_iteration": 2.7983388900756836 + }, + { + "auxiliary_loss_clip": 0.01086578, + "auxiliary_loss_mlp": 0.01041496, + "balance_loss_clip": 1.04638743, + "balance_loss_mlp": 1.02645206, + "epoch": 0.4061926950248008, + "flos": 25555074710400.0, + "grad_norm": 1.5597680276021608, + "language_loss": 0.71212381, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.73340452, + "num_input_tokens_seen": 145007310, + "step": 6756, + "time_per_iteration": 2.8140816688537598 + }, + { + "auxiliary_loss_clip": 0.01071802, + "auxiliary_loss_mlp": 0.00773633, + "balance_loss_clip": 1.04193711, + "balance_loss_mlp": 1.00074661, + "epoch": 0.4062528182774688, + "flos": 23695799944320.0, + "grad_norm": 2.0528550033278075, + "language_loss": 0.79103237, + "learning_rate": 2.689872236505755e-06, + "loss": 0.80948675, + "num_input_tokens_seen": 145026210, + "step": 6757, + "time_per_iteration": 4.472316741943359 + }, + { + "auxiliary_loss_clip": 0.01112634, + "auxiliary_loss_mlp": 0.0103161, + "balance_loss_clip": 1.05197811, + "balance_loss_mlp": 1.01777542, + "epoch": 0.4063129415301368, + "flos": 21726027964800.0, + "grad_norm": 1.8573345394429819, + "language_loss": 0.78500074, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.80644321, + "num_input_tokens_seen": 145045475, + "step": 6758, + "time_per_iteration": 2.732006072998047 + }, + { + "auxiliary_loss_clip": 0.01096195, + "auxiliary_loss_mlp": 0.0103788, + "balance_loss_clip": 1.05092061, + "balance_loss_mlp": 1.02355731, + "epoch": 0.40637306478280477, + "flos": 12787575206400.0, + "grad_norm": 2.1068114153090254, + "language_loss": 0.89142424, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.91276503, + "num_input_tokens_seen": 145062260, + "step": 6759, + "time_per_iteration": 2.768120288848877 + }, + { + "auxiliary_loss_clip": 0.0109872, + "auxiliary_loss_mlp": 0.01036325, + "balance_loss_clip": 1.04916096, + "balance_loss_mlp": 1.02241302, + "epoch": 0.40643318803547274, + "flos": 24024490323840.0, + "grad_norm": 1.8143975866028277, + "language_loss": 0.64272439, + "learning_rate": 2.688775442076598e-06, + "loss": 0.66407484, + "num_input_tokens_seen": 145082470, + "step": 6760, + "time_per_iteration": 2.724278211593628 + }, + { + "auxiliary_loss_clip": 0.01120642, + "auxiliary_loss_mlp": 0.01035863, + "balance_loss_clip": 1.04679084, + "balance_loss_mlp": 1.02100921, + "epoch": 0.4064933112881407, + "flos": 25592421876480.0, + "grad_norm": 1.9958038926303674, + "language_loss": 0.75134486, + "learning_rate": 2.688409791678193e-06, + "loss": 0.77290988, + "num_input_tokens_seen": 145105685, + "step": 6761, + "time_per_iteration": 2.81839919090271 + }, + { + "auxiliary_loss_clip": 0.01097139, + "auxiliary_loss_mlp": 0.0103932, + "balance_loss_clip": 1.04636633, + "balance_loss_mlp": 1.02598023, + "epoch": 0.40655343454080867, + "flos": 22054323294720.0, + "grad_norm": 1.6270794268543942, + "language_loss": 0.70070893, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.72207355, + "num_input_tokens_seen": 145125590, + "step": 6762, + "time_per_iteration": 2.6583070755004883 + }, + { + "auxiliary_loss_clip": 0.0111912, + "auxiliary_loss_mlp": 0.01032723, + "balance_loss_clip": 1.0519619, + "balance_loss_mlp": 1.01906157, + "epoch": 0.40661355779347663, + "flos": 26468893641600.0, + "grad_norm": 1.6183981098694702, + "language_loss": 0.73523986, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.75675833, + "num_input_tokens_seen": 145146810, + "step": 6763, + "time_per_iteration": 2.674830198287964 + }, + { + "auxiliary_loss_clip": 0.01090413, + "auxiliary_loss_mlp": 0.01035278, + "balance_loss_clip": 1.04031014, + "balance_loss_mlp": 1.0199244, + "epoch": 0.4066736810461446, + "flos": 13261129136640.0, + "grad_norm": 2.065371393903723, + "language_loss": 0.68689919, + "learning_rate": 2.687312683911033e-06, + "loss": 0.70815611, + "num_input_tokens_seen": 145163130, + "step": 6764, + "time_per_iteration": 2.7424631118774414 + }, + { + "auxiliary_loss_clip": 0.01104645, + "auxiliary_loss_mlp": 0.01045832, + "balance_loss_clip": 1.0461781, + "balance_loss_mlp": 1.02930999, + "epoch": 0.40673380429881256, + "flos": 28803625758720.0, + "grad_norm": 2.4553121190783, + "language_loss": 0.91144872, + "learning_rate": 2.686946929177557e-06, + "loss": 0.93295348, + "num_input_tokens_seen": 145181420, + "step": 6765, + "time_per_iteration": 2.705754280090332 + }, + { + "auxiliary_loss_clip": 0.01121713, + "auxiliary_loss_mlp": 0.01044564, + "balance_loss_clip": 1.04742265, + "balance_loss_mlp": 1.02876294, + "epoch": 0.4067939275514805, + "flos": 12495334152960.0, + "grad_norm": 3.5832481358362673, + "language_loss": 0.78673786, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.80840063, + "num_input_tokens_seen": 145198545, + "step": 6766, + "time_per_iteration": 2.6291732788085938 + }, + { + "auxiliary_loss_clip": 0.01137462, + "auxiliary_loss_mlp": 0.01042218, + "balance_loss_clip": 1.0502665, + "balance_loss_mlp": 1.02767396, + "epoch": 0.4068540508041485, + "flos": 18770508069120.0, + "grad_norm": 2.203846422574217, + "language_loss": 0.763403, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.78519982, + "num_input_tokens_seen": 145215835, + "step": 6767, + "time_per_iteration": 2.583494186401367 + }, + { + "auxiliary_loss_clip": 0.01124058, + "auxiliary_loss_mlp": 0.01038086, + "balance_loss_clip": 1.0510633, + "balance_loss_mlp": 1.02363229, + "epoch": 0.40691417405681646, + "flos": 28512821249280.0, + "grad_norm": 2.5264206630573827, + "language_loss": 0.77474844, + "learning_rate": 2.685849508738034e-06, + "loss": 0.79636991, + "num_input_tokens_seen": 145236555, + "step": 6768, + "time_per_iteration": 2.6851589679718018 + }, + { + "auxiliary_loss_clip": 0.01134023, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.05076826, + "balance_loss_mlp": 1.01887226, + "epoch": 0.4069742973094844, + "flos": 20814040627200.0, + "grad_norm": 1.8984102670150322, + "language_loss": 0.87523651, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.8969059, + "num_input_tokens_seen": 145254595, + "step": 6769, + "time_per_iteration": 2.7267651557922363 + }, + { + "auxiliary_loss_clip": 0.01105045, + "auxiliary_loss_mlp": 0.0104448, + "balance_loss_clip": 1.04947972, + "balance_loss_mlp": 1.03028178, + "epoch": 0.4070344205621524, + "flos": 21470272151040.0, + "grad_norm": 3.1498546640216234, + "language_loss": 0.80951393, + "learning_rate": 2.685117765051156e-06, + "loss": 0.83100921, + "num_input_tokens_seen": 145274005, + "step": 6770, + "time_per_iteration": 2.7272839546203613 + }, + { + "auxiliary_loss_clip": 0.01136551, + "auxiliary_loss_mlp": 0.0103334, + "balance_loss_clip": 1.05021751, + "balance_loss_mlp": 1.01781273, + "epoch": 0.4070945438148204, + "flos": 26830046937600.0, + "grad_norm": 1.9062828764414554, + "language_loss": 0.80237663, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.82407558, + "num_input_tokens_seen": 145294850, + "step": 6771, + "time_per_iteration": 2.5958163738250732 + }, + { + "auxiliary_loss_clip": 0.01097968, + "auxiliary_loss_mlp": 0.01044728, + "balance_loss_clip": 1.04523098, + "balance_loss_mlp": 1.02995801, + "epoch": 0.4071546670674884, + "flos": 26354158623360.0, + "grad_norm": 1.4305431390081056, + "language_loss": 0.76077241, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.78219938, + "num_input_tokens_seen": 145317050, + "step": 6772, + "time_per_iteration": 2.79603910446167 + }, + { + "auxiliary_loss_clip": 0.01110195, + "auxiliary_loss_mlp": 0.01043051, + "balance_loss_clip": 1.04724109, + "balance_loss_mlp": 1.0283401, + "epoch": 0.40721479032015634, + "flos": 17895401020800.0, + "grad_norm": 1.8845179488175254, + "language_loss": 0.81205189, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.83358431, + "num_input_tokens_seen": 145334480, + "step": 6773, + "time_per_iteration": 2.699221611022949 + }, + { + "auxiliary_loss_clip": 0.01044722, + "auxiliary_loss_mlp": 0.01025696, + "balance_loss_clip": 1.03283918, + "balance_loss_mlp": 1.02369332, + "epoch": 0.4072749135728243, + "flos": 49854570537600.0, + "grad_norm": 0.9856620885651128, + "language_loss": 0.64339805, + "learning_rate": 2.683653966031597e-06, + "loss": 0.6641022, + "num_input_tokens_seen": 145388695, + "step": 6774, + "time_per_iteration": 3.147400140762329 + }, + { + "auxiliary_loss_clip": 0.01089769, + "auxiliary_loss_mlp": 0.01034709, + "balance_loss_clip": 1.04686499, + "balance_loss_mlp": 1.02041602, + "epoch": 0.40733503682549227, + "flos": 27563630400000.0, + "grad_norm": 2.273542425652267, + "language_loss": 0.72560251, + "learning_rate": 2.683287951431446e-06, + "loss": 0.74684727, + "num_input_tokens_seen": 145408240, + "step": 6775, + "time_per_iteration": 2.787423849105835 + }, + { + "auxiliary_loss_clip": 0.01105468, + "auxiliary_loss_mlp": 0.00773431, + "balance_loss_clip": 1.04828203, + "balance_loss_mlp": 1.00090027, + "epoch": 0.40739516007816023, + "flos": 22126970551680.0, + "grad_norm": 1.407391884450963, + "language_loss": 0.77802348, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.79681242, + "num_input_tokens_seen": 145428395, + "step": 6776, + "time_per_iteration": 2.682548761367798 + }, + { + "auxiliary_loss_clip": 0.01126451, + "auxiliary_loss_mlp": 0.0104142, + "balance_loss_clip": 1.05063748, + "balance_loss_mlp": 1.02654302, + "epoch": 0.4074552833308282, + "flos": 23842243693440.0, + "grad_norm": 2.817997105966, + "language_loss": 0.79558617, + "learning_rate": 2.682555844513981e-06, + "loss": 0.81726491, + "num_input_tokens_seen": 145448290, + "step": 6777, + "time_per_iteration": 2.7163336277008057 + }, + { + "auxiliary_loss_clip": 0.01058602, + "auxiliary_loss_mlp": 0.01001315, + "balance_loss_clip": 1.02913916, + "balance_loss_mlp": 0.99987298, + "epoch": 0.40751540658349616, + "flos": 58000008781440.0, + "grad_norm": 0.6823534121540719, + "language_loss": 0.5315339, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.55213308, + "num_input_tokens_seen": 145509785, + "step": 6778, + "time_per_iteration": 3.1687095165252686 + }, + { + "auxiliary_loss_clip": 0.01135647, + "auxiliary_loss_mlp": 0.00772948, + "balance_loss_clip": 1.05136371, + "balance_loss_mlp": 1.0008893, + "epoch": 0.40757552983616413, + "flos": 21214659991680.0, + "grad_norm": 2.33935347330558, + "language_loss": 0.82312328, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.84220922, + "num_input_tokens_seen": 145528620, + "step": 6779, + "time_per_iteration": 2.584343194961548 + }, + { + "auxiliary_loss_clip": 0.0112113, + "auxiliary_loss_mlp": 0.01036902, + "balance_loss_clip": 1.04708278, + "balance_loss_mlp": 1.02178645, + "epoch": 0.4076356530888321, + "flos": 26833530556800.0, + "grad_norm": 1.5589663074171618, + "language_loss": 0.76523471, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.78681505, + "num_input_tokens_seen": 145547775, + "step": 6780, + "time_per_iteration": 2.6672446727752686 + }, + { + "auxiliary_loss_clip": 0.01117549, + "auxiliary_loss_mlp": 0.01034773, + "balance_loss_clip": 1.04889798, + "balance_loss_mlp": 1.0212667, + "epoch": 0.40769577634150006, + "flos": 12203021272320.0, + "grad_norm": 2.1749592638145123, + "language_loss": 0.65482175, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.67634493, + "num_input_tokens_seen": 145564465, + "step": 6781, + "time_per_iteration": 2.612326145172119 + }, + { + "auxiliary_loss_clip": 0.01107362, + "auxiliary_loss_mlp": 0.01034378, + "balance_loss_clip": 1.04472542, + "balance_loss_mlp": 1.01922643, + "epoch": 0.407755899594168, + "flos": 33655264796160.0, + "grad_norm": 1.5476514756078803, + "language_loss": 0.71028459, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.73170209, + "num_input_tokens_seen": 145585965, + "step": 6782, + "time_per_iteration": 2.7483837604522705 + }, + { + "auxiliary_loss_clip": 0.01124897, + "auxiliary_loss_mlp": 0.01032941, + "balance_loss_clip": 1.04813361, + "balance_loss_mlp": 1.01833797, + "epoch": 0.407816022846836, + "flos": 20157342226560.0, + "grad_norm": 1.9402515659282311, + "language_loss": 0.82272756, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.84430599, + "num_input_tokens_seen": 145605000, + "step": 6783, + "time_per_iteration": 2.6157009601593018 + }, + { + "auxiliary_loss_clip": 0.01117034, + "auxiliary_loss_mlp": 0.01038578, + "balance_loss_clip": 1.04744446, + "balance_loss_mlp": 1.0235455, + "epoch": 0.40787614609950396, + "flos": 21178821196800.0, + "grad_norm": 1.6713842677384587, + "language_loss": 0.81044209, + "learning_rate": 2.679992655730283e-06, + "loss": 0.83199817, + "num_input_tokens_seen": 145623740, + "step": 6784, + "time_per_iteration": 2.6054811477661133 + }, + { + "auxiliary_loss_clip": 0.01107175, + "auxiliary_loss_mlp": 0.01044009, + "balance_loss_clip": 1.05123401, + "balance_loss_mlp": 1.02725959, + "epoch": 0.407936269352172, + "flos": 20520650338560.0, + "grad_norm": 2.1708514595694655, + "language_loss": 0.65653902, + "learning_rate": 2.679626382651386e-06, + "loss": 0.67805088, + "num_input_tokens_seen": 145643515, + "step": 6785, + "time_per_iteration": 2.816330671310425 + }, + { + "auxiliary_loss_clip": 0.01115764, + "auxiliary_loss_mlp": 0.01038413, + "balance_loss_clip": 1.04758108, + "balance_loss_mlp": 1.02347052, + "epoch": 0.40799639260483994, + "flos": 20118809911680.0, + "grad_norm": 1.8523263348252557, + "language_loss": 0.79567587, + "learning_rate": 2.679260083800989e-06, + "loss": 0.81721765, + "num_input_tokens_seen": 145660890, + "step": 6786, + "time_per_iteration": 2.629009962081909 + }, + { + "auxiliary_loss_clip": 0.01132323, + "auxiliary_loss_mlp": 0.01042176, + "balance_loss_clip": 1.04911721, + "balance_loss_mlp": 1.02866411, + "epoch": 0.4080565158575079, + "flos": 20997328752000.0, + "grad_norm": 1.716981063220771, + "language_loss": 0.81870878, + "learning_rate": 2.678893759192982e-06, + "loss": 0.84045374, + "num_input_tokens_seen": 145680070, + "step": 6787, + "time_per_iteration": 2.6304709911346436 + }, + { + "auxiliary_loss_clip": 0.01117339, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.04705477, + "balance_loss_mlp": 1.02019691, + "epoch": 0.40811663911017587, + "flos": 19317714837120.0, + "grad_norm": 1.8408166150848957, + "language_loss": 0.67954206, + "learning_rate": 2.678527408841255e-06, + "loss": 0.70105749, + "num_input_tokens_seen": 145698010, + "step": 6788, + "time_per_iteration": 2.6314821243286133 + }, + { + "auxiliary_loss_clip": 0.01102044, + "auxiliary_loss_mlp": 0.01047553, + "balance_loss_clip": 1.04318452, + "balance_loss_mlp": 1.03095889, + "epoch": 0.40817676236284384, + "flos": 40625382119040.0, + "grad_norm": 2.0355882471601014, + "language_loss": 0.66265976, + "learning_rate": 2.678161032759701e-06, + "loss": 0.6841557, + "num_input_tokens_seen": 145722215, + "step": 6789, + "time_per_iteration": 2.8329808712005615 + }, + { + "auxiliary_loss_clip": 0.01084234, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.04236126, + "balance_loss_mlp": 1.021101, + "epoch": 0.4082368856155118, + "flos": 20522086882560.0, + "grad_norm": 1.7612282198179636, + "language_loss": 0.60220939, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.62341583, + "num_input_tokens_seen": 145741090, + "step": 6790, + "time_per_iteration": 2.705007791519165 + }, + { + "auxiliary_loss_clip": 0.01115221, + "auxiliary_loss_mlp": 0.0104035, + "balance_loss_clip": 1.04814339, + "balance_loss_mlp": 1.02482939, + "epoch": 0.40829700886817977, + "flos": 11427745098240.0, + "grad_norm": 2.946877052856992, + "language_loss": 0.69406867, + "learning_rate": 2.677428203462683e-06, + "loss": 0.71562433, + "num_input_tokens_seen": 145754985, + "step": 6791, + "time_per_iteration": 2.629746675491333 + }, + { + "auxiliary_loss_clip": 0.01047663, + "auxiliary_loss_mlp": 0.01005727, + "balance_loss_clip": 1.02732182, + "balance_loss_mlp": 1.00409365, + "epoch": 0.40835713212084773, + "flos": 67330677121920.0, + "grad_norm": 0.7512190297569652, + "language_loss": 0.59569383, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.61622775, + "num_input_tokens_seen": 145815260, + "step": 6792, + "time_per_iteration": 4.680825710296631 + }, + { + "auxiliary_loss_clip": 0.0113903, + "auxiliary_loss_mlp": 0.01043884, + "balance_loss_clip": 1.05271673, + "balance_loss_mlp": 1.02787423, + "epoch": 0.4084172553735157, + "flos": 21762010414080.0, + "grad_norm": 1.9475859316217028, + "language_loss": 0.80324817, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.8250773, + "num_input_tokens_seen": 145832665, + "step": 6793, + "time_per_iteration": 4.095003128051758 + }, + { + "auxiliary_loss_clip": 0.01124776, + "auxiliary_loss_mlp": 0.01044052, + "balance_loss_clip": 1.04916334, + "balance_loss_mlp": 1.02811408, + "epoch": 0.40847737862618366, + "flos": 27417258478080.0, + "grad_norm": 1.8631596367030567, + "language_loss": 0.84994531, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.87163359, + "num_input_tokens_seen": 145850240, + "step": 6794, + "time_per_iteration": 4.198231935501099 + }, + { + "auxiliary_loss_clip": 0.01100105, + "auxiliary_loss_mlp": 0.01040677, + "balance_loss_clip": 1.0469923, + "balance_loss_mlp": 1.02570391, + "epoch": 0.4085375018788516, + "flos": 18587255857920.0, + "grad_norm": 2.862264995792616, + "language_loss": 0.7989887, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.82039654, + "num_input_tokens_seen": 145869545, + "step": 6795, + "time_per_iteration": 2.7477807998657227 + }, + { + "auxiliary_loss_clip": 0.01121705, + "auxiliary_loss_mlp": 0.01039831, + "balance_loss_clip": 1.04831719, + "balance_loss_mlp": 1.02385116, + "epoch": 0.4085976251315196, + "flos": 15411783029760.0, + "grad_norm": 2.7561951150254633, + "language_loss": 0.70605052, + "learning_rate": 2.675595680920792e-06, + "loss": 0.7276659, + "num_input_tokens_seen": 145884025, + "step": 6796, + "time_per_iteration": 4.261413335800171 + }, + { + "auxiliary_loss_clip": 0.01116135, + "auxiliary_loss_mlp": 0.0077634, + "balance_loss_clip": 1.04606998, + "balance_loss_mlp": 1.00082135, + "epoch": 0.40865774838418756, + "flos": 21252222639360.0, + "grad_norm": 1.6356766399676357, + "language_loss": 0.78218019, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.80110496, + "num_input_tokens_seen": 145903210, + "step": 6797, + "time_per_iteration": 2.6453776359558105 + }, + { + "auxiliary_loss_clip": 0.01121906, + "auxiliary_loss_mlp": 0.0105076, + "balance_loss_clip": 1.04562223, + "balance_loss_mlp": 1.03619301, + "epoch": 0.4087178716368556, + "flos": 13772245714560.0, + "grad_norm": 2.2166943768421534, + "language_loss": 0.86117017, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.8828969, + "num_input_tokens_seen": 145920985, + "step": 6798, + "time_per_iteration": 2.67480731010437 + }, + { + "auxiliary_loss_clip": 0.01130307, + "auxiliary_loss_mlp": 0.01042728, + "balance_loss_clip": 1.04780984, + "balance_loss_mlp": 1.02931094, + "epoch": 0.40877799488952354, + "flos": 23621752056960.0, + "grad_norm": 1.473518761352831, + "language_loss": 0.84252232, + "learning_rate": 2.674495859860601e-06, + "loss": 0.86425269, + "num_input_tokens_seen": 145940350, + "step": 6799, + "time_per_iteration": 2.6273906230926514 + }, + { + "auxiliary_loss_clip": 0.01093085, + "auxiliary_loss_mlp": 0.01052249, + "balance_loss_clip": 1.04557848, + "balance_loss_mlp": 1.03427255, + "epoch": 0.4088381181421915, + "flos": 20918791664640.0, + "grad_norm": 2.1256660898165913, + "language_loss": 0.83567548, + "learning_rate": 2.6741292016681e-06, + "loss": 0.85712886, + "num_input_tokens_seen": 145957460, + "step": 6800, + "time_per_iteration": 2.7064268589019775 + }, + { + "auxiliary_loss_clip": 0.01119062, + "auxiliary_loss_mlp": 0.01043239, + "balance_loss_clip": 1.04534221, + "balance_loss_mlp": 1.02778912, + "epoch": 0.4088982413948595, + "flos": 13297578462720.0, + "grad_norm": 2.1612690472856353, + "language_loss": 0.74336559, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.76498854, + "num_input_tokens_seen": 145975285, + "step": 6801, + "time_per_iteration": 2.631030321121216 + }, + { + "auxiliary_loss_clip": 0.01122834, + "auxiliary_loss_mlp": 0.0104231, + "balance_loss_clip": 1.04511952, + "balance_loss_mlp": 1.02699137, + "epoch": 0.40895836464752744, + "flos": 15267673664640.0, + "grad_norm": 2.1715684147319907, + "language_loss": 0.80430126, + "learning_rate": 2.673395808607861e-06, + "loss": 0.82595277, + "num_input_tokens_seen": 145989150, + "step": 6802, + "time_per_iteration": 2.5802509784698486 + }, + { + "auxiliary_loss_clip": 0.0112096, + "auxiliary_loss_mlp": 0.01044934, + "balance_loss_clip": 1.04893684, + "balance_loss_mlp": 1.02843595, + "epoch": 0.4090184879001954, + "flos": 14501411804160.0, + "grad_norm": 2.2436343912353283, + "language_loss": 0.75734484, + "learning_rate": 2.673029073767934e-06, + "loss": 0.77900374, + "num_input_tokens_seen": 146006980, + "step": 6803, + "time_per_iteration": 2.609602689743042 + }, + { + "auxiliary_loss_clip": 0.0106898, + "auxiliary_loss_mlp": 0.00773774, + "balance_loss_clip": 1.04085743, + "balance_loss_mlp": 1.00086641, + "epoch": 0.40907861115286337, + "flos": 13881593692800.0, + "grad_norm": 1.8843395194203503, + "language_loss": 0.78824151, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.806669, + "num_input_tokens_seen": 146025125, + "step": 6804, + "time_per_iteration": 2.7654101848602295 + }, + { + "auxiliary_loss_clip": 0.01137979, + "auxiliary_loss_mlp": 0.01045985, + "balance_loss_clip": 1.04858065, + "balance_loss_mlp": 1.03147769, + "epoch": 0.40913873440553133, + "flos": 28037615293440.0, + "grad_norm": 2.2298994676504225, + "language_loss": 0.75672269, + "learning_rate": 2.672295527537998e-06, + "loss": 0.77856231, + "num_input_tokens_seen": 146044990, + "step": 6805, + "time_per_iteration": 2.680368185043335 + }, + { + "auxiliary_loss_clip": 0.01089569, + "auxiliary_loss_mlp": 0.01047964, + "balance_loss_clip": 1.04342198, + "balance_loss_mlp": 1.03309822, + "epoch": 0.4091988576581993, + "flos": 21618188357760.0, + "grad_norm": 1.8743994628433338, + "language_loss": 0.79440027, + "learning_rate": 2.671928716175804e-06, + "loss": 0.81577563, + "num_input_tokens_seen": 146066045, + "step": 6806, + "time_per_iteration": 2.8212954998016357 + }, + { + "auxiliary_loss_clip": 0.01126847, + "auxiliary_loss_mlp": 0.01038318, + "balance_loss_clip": 1.04977083, + "balance_loss_mlp": 1.02272499, + "epoch": 0.40925898091086726, + "flos": 25224085860480.0, + "grad_norm": 1.915245819215902, + "language_loss": 0.71779263, + "learning_rate": 2.671561879334007e-06, + "loss": 0.73944426, + "num_input_tokens_seen": 146086280, + "step": 6807, + "time_per_iteration": 2.7223496437072754 + }, + { + "auxiliary_loss_clip": 0.01034248, + "auxiliary_loss_mlp": 0.01005874, + "balance_loss_clip": 1.0356338, + "balance_loss_mlp": 1.00364494, + "epoch": 0.40931910416353523, + "flos": 68930568800640.0, + "grad_norm": 0.8232207365722912, + "language_loss": 0.58807027, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.60847151, + "num_input_tokens_seen": 146148840, + "step": 6808, + "time_per_iteration": 3.2951159477233887 + }, + { + "auxiliary_loss_clip": 0.01113663, + "auxiliary_loss_mlp": 0.01048693, + "balance_loss_clip": 1.04732299, + "balance_loss_mlp": 1.03419733, + "epoch": 0.4093792274162032, + "flos": 20189553747840.0, + "grad_norm": 1.705790136999867, + "language_loss": 0.54954052, + "learning_rate": 2.670828129267242e-06, + "loss": 0.57116413, + "num_input_tokens_seen": 146166195, + "step": 6809, + "time_per_iteration": 2.663210868835449 + }, + { + "auxiliary_loss_clip": 0.01108384, + "auxiliary_loss_mlp": 0.01031551, + "balance_loss_clip": 1.0446471, + "balance_loss_mlp": 1.01682281, + "epoch": 0.40943935066887116, + "flos": 25228754628480.0, + "grad_norm": 1.7788203343455933, + "language_loss": 0.83185786, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.85325718, + "num_input_tokens_seen": 146185045, + "step": 6810, + "time_per_iteration": 2.683969020843506 + }, + { + "auxiliary_loss_clip": 0.01105454, + "auxiliary_loss_mlp": 0.01053382, + "balance_loss_clip": 1.0451473, + "balance_loss_mlp": 1.03608489, + "epoch": 0.4094994739215392, + "flos": 23255319461760.0, + "grad_norm": 2.954085357706404, + "language_loss": 0.77419919, + "learning_rate": 2.670094277448999e-06, + "loss": 0.79578757, + "num_input_tokens_seen": 146204655, + "step": 6811, + "time_per_iteration": 2.6727347373962402 + }, + { + "auxiliary_loss_clip": 0.01135893, + "auxiliary_loss_mlp": 0.01036603, + "balance_loss_clip": 1.04917455, + "balance_loss_mlp": 1.02042687, + "epoch": 0.40955959717420715, + "flos": 17382165540480.0, + "grad_norm": 1.6058461501005727, + "language_loss": 0.70272696, + "learning_rate": 2.669727313417857e-06, + "loss": 0.72445196, + "num_input_tokens_seen": 146222000, + "step": 6812, + "time_per_iteration": 2.6267693042755127 + }, + { + "auxiliary_loss_clip": 0.01132783, + "auxiliary_loss_mlp": 0.01048088, + "balance_loss_clip": 1.04780114, + "balance_loss_mlp": 1.03210163, + "epoch": 0.4096197204268751, + "flos": 25082418620160.0, + "grad_norm": 1.9378136524882912, + "language_loss": 0.66298044, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.68478918, + "num_input_tokens_seen": 146242630, + "step": 6813, + "time_per_iteration": 2.6447062492370605 + }, + { + "auxiliary_loss_clip": 0.01117463, + "auxiliary_loss_mlp": 0.00774455, + "balance_loss_clip": 1.04784274, + "balance_loss_mlp": 1.0009681, + "epoch": 0.4096798436795431, + "flos": 30586769648640.0, + "grad_norm": 1.8922051995482987, + "language_loss": 0.73949504, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.75841421, + "num_input_tokens_seen": 146263070, + "step": 6814, + "time_per_iteration": 2.7325870990753174 + }, + { + "auxiliary_loss_clip": 0.0108334, + "auxiliary_loss_mlp": 0.01038435, + "balance_loss_clip": 1.04231858, + "balance_loss_mlp": 1.02281821, + "epoch": 0.40973996693221104, + "flos": 24133622820480.0, + "grad_norm": 2.0095509453801728, + "language_loss": 0.65957761, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.68079543, + "num_input_tokens_seen": 146282890, + "step": 6815, + "time_per_iteration": 2.780668258666992 + }, + { + "auxiliary_loss_clip": 0.01122383, + "auxiliary_loss_mlp": 0.01045791, + "balance_loss_clip": 1.05130887, + "balance_loss_mlp": 1.03100336, + "epoch": 0.409800090184879, + "flos": 23988974751360.0, + "grad_norm": 1.5903260932613887, + "language_loss": 0.76872814, + "learning_rate": 2.668259203471188e-06, + "loss": 0.79040992, + "num_input_tokens_seen": 146301755, + "step": 6816, + "time_per_iteration": 2.6901748180389404 + }, + { + "auxiliary_loss_clip": 0.01118517, + "auxiliary_loss_mlp": 0.0104269, + "balance_loss_clip": 1.05008173, + "balance_loss_mlp": 1.02716875, + "epoch": 0.40986021343754697, + "flos": 16143678552960.0, + "grad_norm": 2.2788575244766966, + "language_loss": 0.81621635, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.8378284, + "num_input_tokens_seen": 146316835, + "step": 6817, + "time_per_iteration": 2.6194167137145996 + }, + { + "auxiliary_loss_clip": 0.01114033, + "auxiliary_loss_mlp": 0.01046853, + "balance_loss_clip": 1.04633307, + "balance_loss_mlp": 1.02987719, + "epoch": 0.40992033669021494, + "flos": 24790824011520.0, + "grad_norm": 2.698849637369061, + "language_loss": 0.8016938, + "learning_rate": 2.667524996399444e-06, + "loss": 0.82330263, + "num_input_tokens_seen": 146336650, + "step": 6818, + "time_per_iteration": 2.8449223041534424 + }, + { + "auxiliary_loss_clip": 0.0111157, + "auxiliary_loss_mlp": 0.01039221, + "balance_loss_clip": 1.05212271, + "balance_loss_mlp": 1.02459419, + "epoch": 0.4099804599428829, + "flos": 29641888431360.0, + "grad_norm": 1.781955605236185, + "language_loss": 0.66531783, + "learning_rate": 2.66715785488769e-06, + "loss": 0.68682575, + "num_input_tokens_seen": 146357640, + "step": 6819, + "time_per_iteration": 2.8016393184661865 + }, + { + "auxiliary_loss_clip": 0.01118061, + "auxiliary_loss_mlp": 0.01052321, + "balance_loss_clip": 1.05068922, + "balance_loss_mlp": 1.03429687, + "epoch": 0.41004058319555087, + "flos": 24826590979200.0, + "grad_norm": 1.7017427969889725, + "language_loss": 0.85438228, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.87608612, + "num_input_tokens_seen": 146379325, + "step": 6820, + "time_per_iteration": 2.7182726860046387 + }, + { + "auxiliary_loss_clip": 0.01127803, + "auxiliary_loss_mlp": 0.01035666, + "balance_loss_clip": 1.05361152, + "balance_loss_mlp": 1.02019835, + "epoch": 0.41010070644821883, + "flos": 25737464995200.0, + "grad_norm": 1.8388824613750698, + "language_loss": 0.71235943, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.73399413, + "num_input_tokens_seen": 146398635, + "step": 6821, + "time_per_iteration": 2.6716413497924805 + }, + { + "auxiliary_loss_clip": 0.01123531, + "auxiliary_loss_mlp": 0.01036959, + "balance_loss_clip": 1.05253363, + "balance_loss_mlp": 1.02228427, + "epoch": 0.4101608297008868, + "flos": 22346061557760.0, + "grad_norm": 1.9657765704612085, + "language_loss": 0.74500406, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.76660895, + "num_input_tokens_seen": 146417585, + "step": 6822, + "time_per_iteration": 2.652270793914795 + }, + { + "auxiliary_loss_clip": 0.01118135, + "auxiliary_loss_mlp": 0.01038075, + "balance_loss_clip": 1.05201709, + "balance_loss_mlp": 1.02313757, + "epoch": 0.41022095295355476, + "flos": 21945083057280.0, + "grad_norm": 2.1947910409652116, + "language_loss": 0.75539672, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.77695882, + "num_input_tokens_seen": 146437035, + "step": 6823, + "time_per_iteration": 2.767306327819824 + }, + { + "auxiliary_loss_clip": 0.01095631, + "auxiliary_loss_mlp": 0.01044283, + "balance_loss_clip": 1.05394316, + "balance_loss_mlp": 1.02697372, + "epoch": 0.4102810762062228, + "flos": 27450511493760.0, + "grad_norm": 2.0691169068872086, + "language_loss": 0.73186851, + "learning_rate": 2.665321768127001e-06, + "loss": 0.75326765, + "num_input_tokens_seen": 146457370, + "step": 6824, + "time_per_iteration": 2.793712615966797 + }, + { + "auxiliary_loss_clip": 0.01110429, + "auxiliary_loss_mlp": 0.0103962, + "balance_loss_clip": 1.05025351, + "balance_loss_mlp": 1.02316904, + "epoch": 0.41034119945889075, + "flos": 24499265316480.0, + "grad_norm": 2.036284375586757, + "language_loss": 0.72426587, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.7457664, + "num_input_tokens_seen": 146478105, + "step": 6825, + "time_per_iteration": 2.764977216720581 + }, + { + "auxiliary_loss_clip": 0.01097265, + "auxiliary_loss_mlp": 0.01045464, + "balance_loss_clip": 1.04605746, + "balance_loss_mlp": 1.03027654, + "epoch": 0.4104013227115587, + "flos": 24352641999360.0, + "grad_norm": 1.8249289811640228, + "language_loss": 0.85226274, + "learning_rate": 2.664587156721768e-06, + "loss": 0.87369001, + "num_input_tokens_seen": 146497835, + "step": 6826, + "time_per_iteration": 2.7680137157440186 + }, + { + "auxiliary_loss_clip": 0.01115829, + "auxiliary_loss_mlp": 0.00775051, + "balance_loss_clip": 1.05372024, + "balance_loss_mlp": 1.00099707, + "epoch": 0.4104614459642267, + "flos": 23729340268800.0, + "grad_norm": 1.8772466232345664, + "language_loss": 0.66074443, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.67965323, + "num_input_tokens_seen": 146517735, + "step": 6827, + "time_per_iteration": 2.791212797164917 + }, + { + "auxiliary_loss_clip": 0.01113343, + "auxiliary_loss_mlp": 0.01033945, + "balance_loss_clip": 1.04942787, + "balance_loss_mlp": 1.01910365, + "epoch": 0.41052156921689464, + "flos": 22127976132480.0, + "grad_norm": 2.0535618692070914, + "language_loss": 0.72474444, + "learning_rate": 2.663852444511689e-06, + "loss": 0.74621731, + "num_input_tokens_seen": 146537640, + "step": 6828, + "time_per_iteration": 2.6675491333007812 + }, + { + "auxiliary_loss_clip": 0.01111113, + "auxiliary_loss_mlp": 0.01048054, + "balance_loss_clip": 1.04920423, + "balance_loss_mlp": 1.03068542, + "epoch": 0.4105816924695626, + "flos": 20084371747200.0, + "grad_norm": 2.67524304617312, + "language_loss": 0.83464897, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.85624069, + "num_input_tokens_seen": 146554695, + "step": 6829, + "time_per_iteration": 2.762298107147217 + }, + { + "auxiliary_loss_clip": 0.01124628, + "auxiliary_loss_mlp": 0.01039003, + "balance_loss_clip": 1.05062759, + "balance_loss_mlp": 1.02405417, + "epoch": 0.4106418157222306, + "flos": 18076785724800.0, + "grad_norm": 1.5363498208464375, + "language_loss": 0.89878875, + "learning_rate": 2.663117631608206e-06, + "loss": 0.92042506, + "num_input_tokens_seen": 146573740, + "step": 6830, + "time_per_iteration": 2.7726032733917236 + }, + { + "auxiliary_loss_clip": 0.01098336, + "auxiliary_loss_mlp": 0.01034169, + "balance_loss_clip": 1.04938424, + "balance_loss_mlp": 1.01833797, + "epoch": 0.41070193897489854, + "flos": 21647850013440.0, + "grad_norm": 1.7853690904757185, + "language_loss": 0.65810287, + "learning_rate": 2.662750187431268e-06, + "loss": 0.67942798, + "num_input_tokens_seen": 146592885, + "step": 6831, + "time_per_iteration": 4.213804244995117 + }, + { + "auxiliary_loss_clip": 0.01137663, + "auxiliary_loss_mlp": 0.01039058, + "balance_loss_clip": 1.05280805, + "balance_loss_mlp": 1.02361393, + "epoch": 0.4107620622275665, + "flos": 26648195356800.0, + "grad_norm": 1.7075421510763598, + "language_loss": 0.69710165, + "learning_rate": 2.662382718122776e-06, + "loss": 0.71886885, + "num_input_tokens_seen": 146611995, + "step": 6832, + "time_per_iteration": 4.146309852600098 + }, + { + "auxiliary_loss_clip": 0.01089843, + "auxiliary_loss_mlp": 0.01042117, + "balance_loss_clip": 1.05080116, + "balance_loss_mlp": 1.02703142, + "epoch": 0.41082218548023447, + "flos": 18734310138240.0, + "grad_norm": 2.3374205466797537, + "language_loss": 0.73910743, + "learning_rate": 2.662015223696666e-06, + "loss": 0.760427, + "num_input_tokens_seen": 146628045, + "step": 6833, + "time_per_iteration": 4.23652195930481 + }, + { + "auxiliary_loss_clip": 0.01083988, + "auxiliary_loss_mlp": 0.01045346, + "balance_loss_clip": 1.04393578, + "balance_loss_mlp": 1.02754784, + "epoch": 0.41088230873290243, + "flos": 22893771116160.0, + "grad_norm": 1.56012293193972, + "language_loss": 0.7299009, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.75119424, + "num_input_tokens_seen": 146648355, + "step": 6834, + "time_per_iteration": 2.72806453704834 + }, + { + "auxiliary_loss_clip": 0.0113018, + "auxiliary_loss_mlp": 0.01049062, + "balance_loss_clip": 1.05203891, + "balance_loss_mlp": 1.03320765, + "epoch": 0.4109424319855704, + "flos": 24276978000000.0, + "grad_norm": 1.7978087117059114, + "language_loss": 0.71254998, + "learning_rate": 2.661280159547329e-06, + "loss": 0.73434246, + "num_input_tokens_seen": 146668370, + "step": 6835, + "time_per_iteration": 4.406278133392334 + }, + { + "auxiliary_loss_clip": 0.01130021, + "auxiliary_loss_mlp": 0.01043294, + "balance_loss_clip": 1.05188155, + "balance_loss_mlp": 1.02630687, + "epoch": 0.41100255523823837, + "flos": 12969139478400.0, + "grad_norm": 1.9060780079348063, + "language_loss": 0.87366456, + "learning_rate": 2.660912589851978e-06, + "loss": 0.89539772, + "num_input_tokens_seen": 146686665, + "step": 6836, + "time_per_iteration": 2.6482133865356445 + }, + { + "auxiliary_loss_clip": 0.0112613, + "auxiliary_loss_mlp": 0.01040074, + "balance_loss_clip": 1.05334806, + "balance_loss_mlp": 1.02461267, + "epoch": 0.4110626784909064, + "flos": 23145648261120.0, + "grad_norm": 6.565804686602276, + "language_loss": 0.69167227, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.71333432, + "num_input_tokens_seen": 146706570, + "step": 6837, + "time_per_iteration": 2.682241916656494 + }, + { + "auxiliary_loss_clip": 0.0114114, + "auxiliary_loss_mlp": 0.01041377, + "balance_loss_clip": 1.0544312, + "balance_loss_mlp": 1.02540302, + "epoch": 0.41112280174357435, + "flos": 22747399194240.0, + "grad_norm": 1.8671169017141842, + "language_loss": 0.75408459, + "learning_rate": 2.660177375289599e-06, + "loss": 0.77590978, + "num_input_tokens_seen": 146723425, + "step": 6838, + "time_per_iteration": 2.625422239303589 + }, + { + "auxiliary_loss_clip": 0.0110141, + "auxiliary_loss_mlp": 0.01042257, + "balance_loss_clip": 1.0521034, + "balance_loss_mlp": 1.02617598, + "epoch": 0.4111829249962423, + "flos": 21102403011840.0, + "grad_norm": 2.061873935528421, + "language_loss": 0.82113552, + "learning_rate": 2.659809730450451e-06, + "loss": 0.84257221, + "num_input_tokens_seen": 146741640, + "step": 6839, + "time_per_iteration": 2.7850279808044434 + }, + { + "auxiliary_loss_clip": 0.01135439, + "auxiliary_loss_mlp": 0.0103927, + "balance_loss_clip": 1.05122948, + "balance_loss_mlp": 1.02421379, + "epoch": 0.4112430482489103, + "flos": 21505787723520.0, + "grad_norm": 5.701831641175022, + "language_loss": 0.80077577, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.82252288, + "num_input_tokens_seen": 146759195, + "step": 6840, + "time_per_iteration": 2.656494140625 + }, + { + "auxiliary_loss_clip": 0.01120054, + "auxiliary_loss_mlp": 0.01035027, + "balance_loss_clip": 1.0487783, + "balance_loss_mlp": 1.02117467, + "epoch": 0.41130317150157825, + "flos": 19570022945280.0, + "grad_norm": 1.862146821875906, + "language_loss": 0.6778084, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.69935924, + "num_input_tokens_seen": 146774990, + "step": 6841, + "time_per_iteration": 2.6612377166748047 + }, + { + "auxiliary_loss_clip": 0.01055489, + "auxiliary_loss_mlp": 0.01004436, + "balance_loss_clip": 1.03532803, + "balance_loss_mlp": 1.00270772, + "epoch": 0.4113632947542462, + "flos": 62383157706240.0, + "grad_norm": 0.8163554776107808, + "language_loss": 0.59717554, + "learning_rate": 2.65870664586847e-06, + "loss": 0.61777478, + "num_input_tokens_seen": 146839610, + "step": 6842, + "time_per_iteration": 3.2157862186431885 + }, + { + "auxiliary_loss_clip": 0.01120166, + "auxiliary_loss_mlp": 0.01038325, + "balance_loss_clip": 1.05330658, + "balance_loss_mlp": 1.02400184, + "epoch": 0.4114234180069142, + "flos": 13918617636480.0, + "grad_norm": 2.3538351775584156, + "language_loss": 0.70293331, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.72451818, + "num_input_tokens_seen": 146857360, + "step": 6843, + "time_per_iteration": 2.6172597408294678 + }, + { + "auxiliary_loss_clip": 0.01014929, + "auxiliary_loss_mlp": 0.01002572, + "balance_loss_clip": 1.01983762, + "balance_loss_mlp": 1.00047398, + "epoch": 0.41148354125958214, + "flos": 64928505219840.0, + "grad_norm": 0.7263883634768764, + "language_loss": 0.53593683, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.55611187, + "num_input_tokens_seen": 146917055, + "step": 6844, + "time_per_iteration": 3.21069598197937 + }, + { + "auxiliary_loss_clip": 0.01124589, + "auxiliary_loss_mlp": 0.01041114, + "balance_loss_clip": 1.05226612, + "balance_loss_mlp": 1.02679706, + "epoch": 0.4115436645122501, + "flos": 18728779443840.0, + "grad_norm": 1.870188515464334, + "language_loss": 0.66065252, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.68230951, + "num_input_tokens_seen": 146935215, + "step": 6845, + "time_per_iteration": 2.6289329528808594 + }, + { + "auxiliary_loss_clip": 0.01134084, + "auxiliary_loss_mlp": 0.01038479, + "balance_loss_clip": 1.05250192, + "balance_loss_mlp": 1.02355433, + "epoch": 0.41160378776491807, + "flos": 16252918790400.0, + "grad_norm": 2.0932374873894655, + "language_loss": 0.70088863, + "learning_rate": 2.657235516795808e-06, + "loss": 0.72261429, + "num_input_tokens_seen": 146951970, + "step": 6846, + "time_per_iteration": 2.578780174255371 + }, + { + "auxiliary_loss_clip": 0.01111001, + "auxiliary_loss_mlp": 0.01041074, + "balance_loss_clip": 1.04926157, + "balance_loss_mlp": 1.0254035, + "epoch": 0.41166391101758604, + "flos": 27970031854080.0, + "grad_norm": 1.8006459441278344, + "language_loss": 0.65271175, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.67423248, + "num_input_tokens_seen": 146975615, + "step": 6847, + "time_per_iteration": 2.7504281997680664 + }, + { + "auxiliary_loss_clip": 0.01111807, + "auxiliary_loss_mlp": 0.01046607, + "balance_loss_clip": 1.04943776, + "balance_loss_mlp": 1.03167558, + "epoch": 0.411724034270254, + "flos": 34131296764800.0, + "grad_norm": 1.371398558221349, + "language_loss": 0.70655453, + "learning_rate": 2.656499802669069e-06, + "loss": 0.72813869, + "num_input_tokens_seen": 146998855, + "step": 6848, + "time_per_iteration": 2.7842190265655518 + }, + { + "auxiliary_loss_clip": 0.01032604, + "auxiliary_loss_mlp": 0.00753743, + "balance_loss_clip": 1.02356267, + "balance_loss_mlp": 1.00076866, + "epoch": 0.41178415752292197, + "flos": 67923670752000.0, + "grad_norm": 0.9037714041830832, + "language_loss": 0.5627954, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.58065879, + "num_input_tokens_seen": 147062710, + "step": 6849, + "time_per_iteration": 3.3100218772888184 + }, + { + "auxiliary_loss_clip": 0.01115279, + "auxiliary_loss_mlp": 0.0104026, + "balance_loss_clip": 1.05035055, + "balance_loss_mlp": 1.0254786, + "epoch": 0.41184428077558993, + "flos": 34313938444800.0, + "grad_norm": 2.6235370790375767, + "language_loss": 0.76318872, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.78474414, + "num_input_tokens_seen": 147086075, + "step": 6850, + "time_per_iteration": 2.879258632659912 + }, + { + "auxiliary_loss_clip": 0.010812, + "auxiliary_loss_mlp": 0.01037976, + "balance_loss_clip": 1.0412885, + "balance_loss_mlp": 1.02356339, + "epoch": 0.41190440402825795, + "flos": 35444118948480.0, + "grad_norm": 1.5473555335002718, + "language_loss": 0.68093288, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.70212466, + "num_input_tokens_seen": 147107590, + "step": 6851, + "time_per_iteration": 2.931530237197876 + }, + { + "auxiliary_loss_clip": 0.01101431, + "auxiliary_loss_mlp": 0.01049233, + "balance_loss_clip": 1.0504117, + "balance_loss_mlp": 1.03207839, + "epoch": 0.4119645272809259, + "flos": 20849879422080.0, + "grad_norm": 2.1361960755807634, + "language_loss": 0.79698718, + "learning_rate": 2.655028075792743e-06, + "loss": 0.81849384, + "num_input_tokens_seen": 147123715, + "step": 6852, + "time_per_iteration": 2.6807408332824707 + }, + { + "auxiliary_loss_clip": 0.01141214, + "auxiliary_loss_mlp": 0.01043074, + "balance_loss_clip": 1.05327845, + "balance_loss_mlp": 1.02688491, + "epoch": 0.4120246505335939, + "flos": 27562050201600.0, + "grad_norm": 1.901908158264802, + "language_loss": 0.77750659, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.79934943, + "num_input_tokens_seen": 147144290, + "step": 6853, + "time_per_iteration": 2.699430227279663 + }, + { + "auxiliary_loss_clip": 0.01126437, + "auxiliary_loss_mlp": 0.01046106, + "balance_loss_clip": 1.04821801, + "balance_loss_mlp": 1.0298574, + "epoch": 0.41208477378626185, + "flos": 37815444046080.0, + "grad_norm": 1.8090743517086876, + "language_loss": 0.65556479, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.6772902, + "num_input_tokens_seen": 147166340, + "step": 6854, + "time_per_iteration": 2.8111729621887207 + }, + { + "auxiliary_loss_clip": 0.01104516, + "auxiliary_loss_mlp": 0.01052436, + "balance_loss_clip": 1.04534888, + "balance_loss_mlp": 1.03615212, + "epoch": 0.4121448970389298, + "flos": 23440762402560.0, + "grad_norm": 2.1224683572406917, + "language_loss": 0.8348515, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.85642099, + "num_input_tokens_seen": 147184025, + "step": 6855, + "time_per_iteration": 2.6698896884918213 + }, + { + "auxiliary_loss_clip": 0.01117307, + "auxiliary_loss_mlp": 0.01044081, + "balance_loss_clip": 1.04969764, + "balance_loss_mlp": 1.02976418, + "epoch": 0.4122050202915978, + "flos": 21325300859520.0, + "grad_norm": 2.1069107949142554, + "language_loss": 0.7929827, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.81459653, + "num_input_tokens_seen": 147202730, + "step": 6856, + "time_per_iteration": 2.6754775047302246 + }, + { + "auxiliary_loss_clip": 0.01098846, + "auxiliary_loss_mlp": 0.01042601, + "balance_loss_clip": 1.04761338, + "balance_loss_mlp": 1.02777684, + "epoch": 0.41226514354426574, + "flos": 17306286059520.0, + "grad_norm": 2.5035417030553018, + "language_loss": 0.80352724, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.82494175, + "num_input_tokens_seen": 147215315, + "step": 6857, + "time_per_iteration": 2.7415785789489746 + }, + { + "auxiliary_loss_clip": 0.01123756, + "auxiliary_loss_mlp": 0.0077359, + "balance_loss_clip": 1.04799688, + "balance_loss_mlp": 1.00088441, + "epoch": 0.4123252667969337, + "flos": 17638855107840.0, + "grad_norm": 2.1785137319374575, + "language_loss": 0.70367694, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.72265041, + "num_input_tokens_seen": 147233330, + "step": 6858, + "time_per_iteration": 2.6482796669006348 + }, + { + "auxiliary_loss_clip": 0.01123125, + "auxiliary_loss_mlp": 0.01046787, + "balance_loss_clip": 1.04916668, + "balance_loss_mlp": 1.03116488, + "epoch": 0.4123853900496017, + "flos": 46424811375360.0, + "grad_norm": 2.660424997773602, + "language_loss": 0.59025121, + "learning_rate": 2.652451598005391e-06, + "loss": 0.61195034, + "num_input_tokens_seen": 147257780, + "step": 6859, + "time_per_iteration": 2.8688454627990723 + }, + { + "auxiliary_loss_clip": 0.01132817, + "auxiliary_loss_mlp": 0.0104458, + "balance_loss_clip": 1.04658365, + "balance_loss_mlp": 1.0293684, + "epoch": 0.41244551330226964, + "flos": 17675160779520.0, + "grad_norm": 2.4672414929748863, + "language_loss": 0.73583943, + "learning_rate": 2.652083430674264e-06, + "loss": 0.75761342, + "num_input_tokens_seen": 147276055, + "step": 6860, + "time_per_iteration": 2.552107572555542 + }, + { + "auxiliary_loss_clip": 0.01058973, + "auxiliary_loss_mlp": 0.01038942, + "balance_loss_clip": 1.04514742, + "balance_loss_mlp": 1.024279, + "epoch": 0.4125056365549376, + "flos": 18693730748160.0, + "grad_norm": 1.7024014286117355, + "language_loss": 0.7499401, + "learning_rate": 2.651715238616068e-06, + "loss": 0.7709192, + "num_input_tokens_seen": 147293200, + "step": 6861, + "time_per_iteration": 2.8850560188293457 + }, + { + "auxiliary_loss_clip": 0.01110545, + "auxiliary_loss_mlp": 0.01044439, + "balance_loss_clip": 1.04591155, + "balance_loss_mlp": 1.03024721, + "epoch": 0.41256575980760557, + "flos": 17895293280000.0, + "grad_norm": 2.2415523494511467, + "language_loss": 0.79298902, + "learning_rate": 2.651347021844765e-06, + "loss": 0.8145389, + "num_input_tokens_seen": 147310640, + "step": 6862, + "time_per_iteration": 2.900341510772705 + }, + { + "auxiliary_loss_clip": 0.01101386, + "auxiliary_loss_mlp": 0.01041536, + "balance_loss_clip": 1.04071999, + "balance_loss_mlp": 1.02640843, + "epoch": 0.41262588306027354, + "flos": 21981316901760.0, + "grad_norm": 1.8032442507418176, + "language_loss": 0.7571404, + "learning_rate": 2.650978780374318e-06, + "loss": 0.77856958, + "num_input_tokens_seen": 147329435, + "step": 6863, + "time_per_iteration": 2.653726100921631 + }, + { + "auxiliary_loss_clip": 0.01042253, + "auxiliary_loss_mlp": 0.0101594, + "balance_loss_clip": 1.02186918, + "balance_loss_mlp": 1.01400852, + "epoch": 0.41268600631294156, + "flos": 53350006740480.0, + "grad_norm": 0.7071869047358454, + "language_loss": 0.52727556, + "learning_rate": 2.650610514218691e-06, + "loss": 0.54785752, + "num_input_tokens_seen": 147385805, + "step": 6864, + "time_per_iteration": 3.1097042560577393 + }, + { + "auxiliary_loss_clip": 0.01138053, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.04946339, + "balance_loss_mlp": 1.02124572, + "epoch": 0.4127461295656095, + "flos": 24385356311040.0, + "grad_norm": 2.542549123445174, + "language_loss": 0.72281235, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.74456495, + "num_input_tokens_seen": 147405160, + "step": 6865, + "time_per_iteration": 2.6489152908325195 + }, + { + "auxiliary_loss_clip": 0.01052076, + "auxiliary_loss_mlp": 0.01005202, + "balance_loss_clip": 1.02275848, + "balance_loss_mlp": 1.0035094, + "epoch": 0.4128062528182775, + "flos": 71705242696320.0, + "grad_norm": 0.9209058739863084, + "language_loss": 0.66585267, + "learning_rate": 2.649873907907753e-06, + "loss": 0.68642545, + "num_input_tokens_seen": 147460245, + "step": 6866, + "time_per_iteration": 3.062208890914917 + }, + { + "auxiliary_loss_clip": 0.01129627, + "auxiliary_loss_mlp": 0.01039004, + "balance_loss_clip": 1.04632759, + "balance_loss_mlp": 1.02420402, + "epoch": 0.41286637607094545, + "flos": 17849111368320.0, + "grad_norm": 2.3224691577841905, + "language_loss": 0.8131212, + "learning_rate": 2.649505567780375e-06, + "loss": 0.83480746, + "num_input_tokens_seen": 147476200, + "step": 6867, + "time_per_iteration": 2.6058406829833984 + }, + { + "auxiliary_loss_clip": 0.01114316, + "auxiliary_loss_mlp": 0.01036267, + "balance_loss_clip": 1.04773378, + "balance_loss_mlp": 1.02069843, + "epoch": 0.4129264993236134, + "flos": 25549544016000.0, + "grad_norm": 2.2632029728217913, + "language_loss": 0.78249037, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.80399621, + "num_input_tokens_seen": 147494315, + "step": 6868, + "time_per_iteration": 2.7882273197174072 + }, + { + "auxiliary_loss_clip": 0.0104195, + "auxiliary_loss_mlp": 0.01002347, + "balance_loss_clip": 1.02322721, + "balance_loss_mlp": 1.00078535, + "epoch": 0.4129866225762814, + "flos": 65414446364160.0, + "grad_norm": 0.8559261941349585, + "language_loss": 0.57746547, + "learning_rate": 2.64876881365164e-06, + "loss": 0.59790844, + "num_input_tokens_seen": 147543665, + "step": 6869, + "time_per_iteration": 2.9020984172821045 + }, + { + "auxiliary_loss_clip": 0.01116756, + "auxiliary_loss_mlp": 0.01037209, + "balance_loss_clip": 1.04666448, + "balance_loss_mlp": 1.02235568, + "epoch": 0.41304674582894935, + "flos": 28876991287680.0, + "grad_norm": 2.064989454661501, + "language_loss": 0.74957705, + "learning_rate": 2.64840039967822e-06, + "loss": 0.77111673, + "num_input_tokens_seen": 147564870, + "step": 6870, + "time_per_iteration": 4.271910667419434 + }, + { + "auxiliary_loss_clip": 0.01102765, + "auxiliary_loss_mlp": 0.01045795, + "balance_loss_clip": 1.04849434, + "balance_loss_mlp": 1.0301609, + "epoch": 0.4131068690816173, + "flos": 22891975436160.0, + "grad_norm": 1.7132239618858751, + "language_loss": 0.83188486, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.85337055, + "num_input_tokens_seen": 147584840, + "step": 6871, + "time_per_iteration": 2.7382373809814453 + }, + { + "auxiliary_loss_clip": 0.01102249, + "auxiliary_loss_mlp": 0.01042486, + "balance_loss_clip": 1.04694879, + "balance_loss_mlp": 1.02648854, + "epoch": 0.4131669923342853, + "flos": 26065185707520.0, + "grad_norm": 1.8588331523997874, + "language_loss": 0.68419731, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.70564461, + "num_input_tokens_seen": 147604635, + "step": 6872, + "time_per_iteration": 2.731513738632202 + }, + { + "auxiliary_loss_clip": 0.01116452, + "auxiliary_loss_mlp": 0.0103393, + "balance_loss_clip": 1.0480907, + "balance_loss_mlp": 1.01936865, + "epoch": 0.41322711558695324, + "flos": 19244564789760.0, + "grad_norm": 2.0600406966329468, + "language_loss": 0.75857317, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.78007692, + "num_input_tokens_seen": 147620700, + "step": 6873, + "time_per_iteration": 4.200350999832153 + }, + { + "auxiliary_loss_clip": 0.0110667, + "auxiliary_loss_mlp": 0.01041498, + "balance_loss_clip": 1.04465103, + "balance_loss_mlp": 1.02552366, + "epoch": 0.4132872388396212, + "flos": 22674464628480.0, + "grad_norm": 2.335780539187462, + "language_loss": 0.83409697, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.85557866, + "num_input_tokens_seen": 147639490, + "step": 6874, + "time_per_iteration": 2.677481174468994 + }, + { + "auxiliary_loss_clip": 0.01095645, + "auxiliary_loss_mlp": 0.01037651, + "balance_loss_clip": 1.04236686, + "balance_loss_mlp": 1.02203512, + "epoch": 0.4133473620922892, + "flos": 20150195420160.0, + "grad_norm": 2.13686316676373, + "language_loss": 0.71832943, + "learning_rate": 2.646557961279436e-06, + "loss": 0.73966241, + "num_input_tokens_seen": 147657205, + "step": 6875, + "time_per_iteration": 4.490081548690796 + }, + { + "auxiliary_loss_clip": 0.01099487, + "auxiliary_loss_mlp": 0.0104606, + "balance_loss_clip": 1.0442456, + "balance_loss_mlp": 1.03144503, + "epoch": 0.41340748534495714, + "flos": 24242755317120.0, + "grad_norm": 2.0421788997824164, + "language_loss": 0.82396001, + "learning_rate": 2.646189399991154e-06, + "loss": 0.84541547, + "num_input_tokens_seen": 147677005, + "step": 6876, + "time_per_iteration": 2.7446470260620117 + }, + { + "auxiliary_loss_clip": 0.01120566, + "auxiliary_loss_mlp": 0.01041258, + "balance_loss_clip": 1.04677415, + "balance_loss_mlp": 1.02511716, + "epoch": 0.41346760859762516, + "flos": 14392171566720.0, + "grad_norm": 2.56742905987435, + "language_loss": 0.64847958, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.67009783, + "num_input_tokens_seen": 147693435, + "step": 6877, + "time_per_iteration": 2.5988993644714355 + }, + { + "auxiliary_loss_clip": 0.01117576, + "auxiliary_loss_mlp": 0.01038622, + "balance_loss_clip": 1.04535675, + "balance_loss_mlp": 1.02366138, + "epoch": 0.4135277318502931, + "flos": 22492002516480.0, + "grad_norm": 1.9690610536683542, + "language_loss": 0.76823169, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.78979367, + "num_input_tokens_seen": 147714000, + "step": 6878, + "time_per_iteration": 2.6289098262786865 + }, + { + "auxiliary_loss_clip": 0.01120186, + "auxiliary_loss_mlp": 0.0077293, + "balance_loss_clip": 1.04670906, + "balance_loss_mlp": 1.00107956, + "epoch": 0.4135878551029611, + "flos": 22418744728320.0, + "grad_norm": 1.7550266496384528, + "language_loss": 0.80281323, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.82174444, + "num_input_tokens_seen": 147731010, + "step": 6879, + "time_per_iteration": 2.661945343017578 + }, + { + "auxiliary_loss_clip": 0.01130865, + "auxiliary_loss_mlp": 0.01039257, + "balance_loss_clip": 1.04709899, + "balance_loss_mlp": 1.02471972, + "epoch": 0.41364797835562905, + "flos": 27053232094080.0, + "grad_norm": 2.4786614895541312, + "language_loss": 0.84795272, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.869654, + "num_input_tokens_seen": 147750880, + "step": 6880, + "time_per_iteration": 2.6188430786132812 + }, + { + "auxiliary_loss_clip": 0.01111764, + "auxiliary_loss_mlp": 0.0102976, + "balance_loss_clip": 1.04788852, + "balance_loss_mlp": 1.01497793, + "epoch": 0.413708101608297, + "flos": 22967603521920.0, + "grad_norm": 3.387576232567814, + "language_loss": 0.70222247, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.72363776, + "num_input_tokens_seen": 147771360, + "step": 6881, + "time_per_iteration": 2.733462333679199 + }, + { + "auxiliary_loss_clip": 0.0112877, + "auxiliary_loss_mlp": 0.01037286, + "balance_loss_clip": 1.04717231, + "balance_loss_mlp": 1.02352309, + "epoch": 0.413768224860965, + "flos": 13333991875200.0, + "grad_norm": 2.043279627081185, + "language_loss": 0.81609744, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.837758, + "num_input_tokens_seen": 147787440, + "step": 6882, + "time_per_iteration": 2.6478219032287598 + }, + { + "auxiliary_loss_clip": 0.01107335, + "auxiliary_loss_mlp": 0.0104742, + "balance_loss_clip": 1.04388988, + "balance_loss_mlp": 1.02958596, + "epoch": 0.41382834811363295, + "flos": 20813968800000.0, + "grad_norm": 2.1226762712951195, + "language_loss": 0.69825858, + "learning_rate": 2.643608785656077e-06, + "loss": 0.71980608, + "num_input_tokens_seen": 147805720, + "step": 6883, + "time_per_iteration": 2.7219526767730713 + }, + { + "auxiliary_loss_clip": 0.01117809, + "auxiliary_loss_mlp": 0.01042891, + "balance_loss_clip": 1.04390156, + "balance_loss_mlp": 1.02804899, + "epoch": 0.4138884713663009, + "flos": 20667130001280.0, + "grad_norm": 1.778769139531053, + "language_loss": 0.76219916, + "learning_rate": 2.643240028730663e-06, + "loss": 0.7838062, + "num_input_tokens_seen": 147824605, + "step": 6884, + "time_per_iteration": 2.7255208492279053 + }, + { + "auxiliary_loss_clip": 0.01095169, + "auxiliary_loss_mlp": 0.01038756, + "balance_loss_clip": 1.04337394, + "balance_loss_mlp": 1.02405715, + "epoch": 0.4139485946189689, + "flos": 29056616225280.0, + "grad_norm": 1.442860134230448, + "language_loss": 0.75787425, + "learning_rate": 2.642871247413523e-06, + "loss": 0.77921343, + "num_input_tokens_seen": 147845445, + "step": 6885, + "time_per_iteration": 2.759103775024414 + }, + { + "auxiliary_loss_clip": 0.0113157, + "auxiliary_loss_mlp": 0.01040383, + "balance_loss_clip": 1.04593658, + "balance_loss_mlp": 1.0249809, + "epoch": 0.41400871787163684, + "flos": 24425720219520.0, + "grad_norm": 2.975461049679227, + "language_loss": 0.70157146, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.72329092, + "num_input_tokens_seen": 147865580, + "step": 6886, + "time_per_iteration": 2.5969202518463135 + }, + { + "auxiliary_loss_clip": 0.01130858, + "auxiliary_loss_mlp": 0.00772578, + "balance_loss_clip": 1.04714894, + "balance_loss_mlp": 1.00082159, + "epoch": 0.4140688411243048, + "flos": 19464050845440.0, + "grad_norm": 4.863732808232375, + "language_loss": 0.75765413, + "learning_rate": 2.642133611660002e-06, + "loss": 0.77668852, + "num_input_tokens_seen": 147885230, + "step": 6887, + "time_per_iteration": 2.6130294799804688 + }, + { + "auxiliary_loss_clip": 0.01115226, + "auxiliary_loss_mlp": 0.0103352, + "balance_loss_clip": 1.04343033, + "balance_loss_mlp": 1.01858318, + "epoch": 0.4141289643769728, + "flos": 19313656600320.0, + "grad_norm": 1.960325409954457, + "language_loss": 0.70337266, + "learning_rate": 2.641764757251592e-06, + "loss": 0.72486007, + "num_input_tokens_seen": 147903035, + "step": 6888, + "time_per_iteration": 2.616093635559082 + }, + { + "auxiliary_loss_clip": 0.01125875, + "auxiliary_loss_mlp": 0.01041471, + "balance_loss_clip": 1.04317069, + "balance_loss_mlp": 1.02698743, + "epoch": 0.41418908762964074, + "flos": 16726903683840.0, + "grad_norm": 2.06267801428711, + "language_loss": 0.76650596, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.7881794, + "num_input_tokens_seen": 147918745, + "step": 6889, + "time_per_iteration": 2.5624022483825684 + }, + { + "auxiliary_loss_clip": 0.01098507, + "auxiliary_loss_mlp": 0.00771883, + "balance_loss_clip": 1.05070317, + "balance_loss_mlp": 1.00089312, + "epoch": 0.41424921088230876, + "flos": 25296840858240.0, + "grad_norm": 2.7156921824995224, + "language_loss": 0.80554968, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.82425356, + "num_input_tokens_seen": 147938265, + "step": 6890, + "time_per_iteration": 2.796128273010254 + }, + { + "auxiliary_loss_clip": 0.0112736, + "auxiliary_loss_mlp": 0.01038801, + "balance_loss_clip": 1.04589438, + "balance_loss_mlp": 1.0235126, + "epoch": 0.4143093341349767, + "flos": 20960520289920.0, + "grad_norm": 1.7630713030967287, + "language_loss": 0.74180973, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.76347136, + "num_input_tokens_seen": 147957320, + "step": 6891, + "time_per_iteration": 2.6974401473999023 + }, + { + "auxiliary_loss_clip": 0.01092037, + "auxiliary_loss_mlp": 0.01043425, + "balance_loss_clip": 1.0482198, + "balance_loss_mlp": 1.02647936, + "epoch": 0.4143694573876447, + "flos": 22017694400640.0, + "grad_norm": 1.8611116210645706, + "language_loss": 0.84570521, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.86705983, + "num_input_tokens_seen": 147977045, + "step": 6892, + "time_per_iteration": 2.8065037727355957 + }, + { + "auxiliary_loss_clip": 0.01081139, + "auxiliary_loss_mlp": 0.00774401, + "balance_loss_clip": 1.04017556, + "balance_loss_mlp": 1.00088513, + "epoch": 0.41442958064031266, + "flos": 35697396723840.0, + "grad_norm": 1.7475313827364956, + "language_loss": 0.70824122, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.72679669, + "num_input_tokens_seen": 147996905, + "step": 6893, + "time_per_iteration": 2.865112543106079 + }, + { + "auxiliary_loss_clip": 0.01126872, + "auxiliary_loss_mlp": 0.01033016, + "balance_loss_clip": 1.04508913, + "balance_loss_mlp": 1.01873493, + "epoch": 0.4144897038929806, + "flos": 28293766156800.0, + "grad_norm": 1.5118367219903406, + "language_loss": 0.72955495, + "learning_rate": 2.639551120239279e-06, + "loss": 0.75115383, + "num_input_tokens_seen": 148017875, + "step": 6894, + "time_per_iteration": 2.6412105560302734 + }, + { + "auxiliary_loss_clip": 0.0111867, + "auxiliary_loss_mlp": 0.01032409, + "balance_loss_clip": 1.0444473, + "balance_loss_mlp": 1.01803279, + "epoch": 0.4145498271456486, + "flos": 11648093080320.0, + "grad_norm": 2.8699191887217697, + "language_loss": 0.63006961, + "learning_rate": 2.63918209577416e-06, + "loss": 0.65158045, + "num_input_tokens_seen": 148032300, + "step": 6895, + "time_per_iteration": 2.6429762840270996 + }, + { + "auxiliary_loss_clip": 0.01084496, + "auxiliary_loss_mlp": 0.01047641, + "balance_loss_clip": 1.04230917, + "balance_loss_mlp": 1.03178644, + "epoch": 0.41460995039831655, + "flos": 27235622378880.0, + "grad_norm": 1.395247516884051, + "language_loss": 0.7072767, + "learning_rate": 2.638813047071192e-06, + "loss": 0.728598, + "num_input_tokens_seen": 148053260, + "step": 6896, + "time_per_iteration": 2.754567861557007 + }, + { + "auxiliary_loss_clip": 0.01125613, + "auxiliary_loss_mlp": 0.0104596, + "balance_loss_clip": 1.04233313, + "balance_loss_mlp": 1.03083241, + "epoch": 0.4146700736509845, + "flos": 25922369232000.0, + "grad_norm": 1.6183082189069362, + "language_loss": 0.73234701, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.75406271, + "num_input_tokens_seen": 148072965, + "step": 6897, + "time_per_iteration": 2.737884759902954 + }, + { + "auxiliary_loss_clip": 0.01114786, + "auxiliary_loss_mlp": 0.01041831, + "balance_loss_clip": 1.04562593, + "balance_loss_mlp": 1.02713859, + "epoch": 0.4147301969036525, + "flos": 26833243248000.0, + "grad_norm": 1.834097351521641, + "language_loss": 0.84865111, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.87021732, + "num_input_tokens_seen": 148093240, + "step": 6898, + "time_per_iteration": 2.689467430114746 + }, + { + "auxiliary_loss_clip": 0.01079261, + "auxiliary_loss_mlp": 0.01035002, + "balance_loss_clip": 1.03853178, + "balance_loss_mlp": 1.02030301, + "epoch": 0.41479032015632045, + "flos": 20298291194880.0, + "grad_norm": 1.6538444757930724, + "language_loss": 0.74696559, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.76810819, + "num_input_tokens_seen": 148110925, + "step": 6899, + "time_per_iteration": 2.73575758934021 + }, + { + "auxiliary_loss_clip": 0.0109529, + "auxiliary_loss_mlp": 0.0104143, + "balance_loss_clip": 1.04097557, + "balance_loss_mlp": 1.02549219, + "epoch": 0.4148504434089884, + "flos": 25264988472960.0, + "grad_norm": 2.0028183144746254, + "language_loss": 0.75739181, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.778759, + "num_input_tokens_seen": 148130670, + "step": 6900, + "time_per_iteration": 2.7304093837738037 + }, + { + "auxiliary_loss_clip": 0.01112354, + "auxiliary_loss_mlp": 0.01038142, + "balance_loss_clip": 1.04515111, + "balance_loss_mlp": 1.02218616, + "epoch": 0.4149105666616564, + "flos": 12822300679680.0, + "grad_norm": 37.61175094058464, + "language_loss": 0.79667652, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.81818151, + "num_input_tokens_seen": 148148350, + "step": 6901, + "time_per_iteration": 2.6238512992858887 + }, + { + "auxiliary_loss_clip": 0.01085977, + "auxiliary_loss_mlp": 0.01046173, + "balance_loss_clip": 1.03959978, + "balance_loss_mlp": 1.0302825, + "epoch": 0.41497068991432434, + "flos": 16763891713920.0, + "grad_norm": 1.6395274695924928, + "language_loss": 0.69640017, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.7177217, + "num_input_tokens_seen": 148167550, + "step": 6902, + "time_per_iteration": 2.6854305267333984 + }, + { + "auxiliary_loss_clip": 0.01097592, + "auxiliary_loss_mlp": 0.00770925, + "balance_loss_clip": 1.04278207, + "balance_loss_mlp": 1.00099885, + "epoch": 0.4150308131669923, + "flos": 18000906243840.0, + "grad_norm": 2.384025861502229, + "language_loss": 0.83949161, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.85817683, + "num_input_tokens_seen": 148184740, + "step": 6903, + "time_per_iteration": 2.6454520225524902 + }, + { + "auxiliary_loss_clip": 0.01133263, + "auxiliary_loss_mlp": 0.01042035, + "balance_loss_clip": 1.04633808, + "balance_loss_mlp": 1.02569163, + "epoch": 0.41509093641966033, + "flos": 30044770352640.0, + "grad_norm": 1.9553359330266324, + "language_loss": 0.67639846, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.69815147, + "num_input_tokens_seen": 148204605, + "step": 6904, + "time_per_iteration": 2.7322065830230713 + }, + { + "auxiliary_loss_clip": 0.01130567, + "auxiliary_loss_mlp": 0.0077237, + "balance_loss_clip": 1.04620719, + "balance_loss_mlp": 1.00097251, + "epoch": 0.4151510596723283, + "flos": 24279994742400.0, + "grad_norm": 1.8757192691258513, + "language_loss": 0.77572656, + "learning_rate": 2.635490520350643e-06, + "loss": 0.79475594, + "num_input_tokens_seen": 148224675, + "step": 6905, + "time_per_iteration": 2.648400068283081 + }, + { + "auxiliary_loss_clip": 0.0113062, + "auxiliary_loss_mlp": 0.01033001, + "balance_loss_clip": 1.04648256, + "balance_loss_mlp": 1.01869583, + "epoch": 0.41521118292499626, + "flos": 23476206147840.0, + "grad_norm": 1.5608092182069806, + "language_loss": 0.68316001, + "learning_rate": 2.635121230039025e-06, + "loss": 0.7047962, + "num_input_tokens_seen": 148243375, + "step": 6906, + "time_per_iteration": 2.6084086894989014 + }, + { + "auxiliary_loss_clip": 0.01104219, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.04238176, + "balance_loss_mlp": 1.02167583, + "epoch": 0.4152713061776642, + "flos": 22125498094080.0, + "grad_norm": 2.313429051291415, + "language_loss": 0.67982537, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.70122576, + "num_input_tokens_seen": 148261140, + "step": 6907, + "time_per_iteration": 2.715506076812744 + }, + { + "auxiliary_loss_clip": 0.01100263, + "auxiliary_loss_mlp": 0.01038198, + "balance_loss_clip": 1.0479455, + "balance_loss_mlp": 1.02419686, + "epoch": 0.4153314294303322, + "flos": 21251396626560.0, + "grad_norm": 2.133321939860832, + "language_loss": 0.77338696, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.79477155, + "num_input_tokens_seen": 148279655, + "step": 6908, + "time_per_iteration": 2.699028253555298 + }, + { + "auxiliary_loss_clip": 0.01035537, + "auxiliary_loss_mlp": 0.01050035, + "balance_loss_clip": 1.02502179, + "balance_loss_mlp": 1.04800892, + "epoch": 0.41539155268300015, + "flos": 57920681594880.0, + "grad_norm": 0.8023457423545532, + "language_loss": 0.64889216, + "learning_rate": 2.634013214657026e-06, + "loss": 0.66974789, + "num_input_tokens_seen": 148339005, + "step": 6909, + "time_per_iteration": 3.174577474594116 + }, + { + "auxiliary_loss_clip": 0.01096348, + "auxiliary_loss_mlp": 0.0103783, + "balance_loss_clip": 1.04794037, + "balance_loss_mlp": 1.02368009, + "epoch": 0.4154516759356681, + "flos": 21903677654400.0, + "grad_norm": 3.1710005220016293, + "language_loss": 0.8712942, + "learning_rate": 2.633643828093996e-06, + "loss": 0.89263594, + "num_input_tokens_seen": 148358715, + "step": 6910, + "time_per_iteration": 4.24171257019043 + }, + { + "auxiliary_loss_clip": 0.01040831, + "auxiliary_loss_mlp": 0.01008541, + "balance_loss_clip": 1.02141929, + "balance_loss_mlp": 1.00702703, + "epoch": 0.4155117991883361, + "flos": 67833677226240.0, + "grad_norm": 0.8180681021689019, + "language_loss": 0.62115103, + "learning_rate": 2.633274417503128e-06, + "loss": 0.64164472, + "num_input_tokens_seen": 148417280, + "step": 6911, + "time_per_iteration": 3.171510696411133 + }, + { + "auxiliary_loss_clip": 0.01138851, + "auxiliary_loss_mlp": 0.01037606, + "balance_loss_clip": 1.05016613, + "balance_loss_mlp": 1.0219059, + "epoch": 0.41557192244100405, + "flos": 14282679934080.0, + "grad_norm": 2.4116200088670845, + "language_loss": 0.87474132, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.89650595, + "num_input_tokens_seen": 148432610, + "step": 6912, + "time_per_iteration": 5.576058864593506 + }, + { + "auxiliary_loss_clip": 0.01117561, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.04753387, + "balance_loss_mlp": 1.02098989, + "epoch": 0.415632045693672, + "flos": 24461954064000.0, + "grad_norm": 22.77173838310247, + "language_loss": 0.63224173, + "learning_rate": 2.632535524293914e-06, + "loss": 0.65376365, + "num_input_tokens_seen": 148451510, + "step": 6913, + "time_per_iteration": 2.702631711959839 + }, + { + "auxiliary_loss_clip": 0.01102511, + "auxiliary_loss_mlp": 0.00771597, + "balance_loss_clip": 1.04298615, + "balance_loss_mlp": 1.00093937, + "epoch": 0.41569216894634, + "flos": 20115290378880.0, + "grad_norm": 1.7272855093915238, + "language_loss": 0.74980754, + "learning_rate": 2.632166041703586e-06, + "loss": 0.76854861, + "num_input_tokens_seen": 148469945, + "step": 6914, + "time_per_iteration": 4.340964078903198 + }, + { + "auxiliary_loss_clip": 0.01077278, + "auxiliary_loss_mlp": 0.01044004, + "balance_loss_clip": 1.04201877, + "balance_loss_mlp": 1.02906704, + "epoch": 0.41575229219900794, + "flos": 23798827128960.0, + "grad_norm": 1.8325905436461942, + "language_loss": 0.87653631, + "learning_rate": 2.631796535141458e-06, + "loss": 0.89774919, + "num_input_tokens_seen": 148486655, + "step": 6915, + "time_per_iteration": 2.757596731185913 + }, + { + "auxiliary_loss_clip": 0.0109973, + "auxiliary_loss_mlp": 0.01041371, + "balance_loss_clip": 1.04447317, + "balance_loss_mlp": 1.02728081, + "epoch": 0.4158124154516759, + "flos": 23108229267840.0, + "grad_norm": 3.0600667343253214, + "language_loss": 0.70990372, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.73131478, + "num_input_tokens_seen": 148505035, + "step": 6916, + "time_per_iteration": 2.6894583702087402 + }, + { + "auxiliary_loss_clip": 0.01135969, + "auxiliary_loss_mlp": 0.01038621, + "balance_loss_clip": 1.04934418, + "balance_loss_mlp": 1.02361822, + "epoch": 0.41587253870434393, + "flos": 24242970798720.0, + "grad_norm": 1.53910679789622, + "language_loss": 0.71859491, + "learning_rate": 2.631057450157852e-06, + "loss": 0.74034083, + "num_input_tokens_seen": 148525575, + "step": 6917, + "time_per_iteration": 2.560401439666748 + }, + { + "auxiliary_loss_clip": 0.01104226, + "auxiliary_loss_mlp": 0.01032177, + "balance_loss_clip": 1.04427075, + "balance_loss_mlp": 1.01856291, + "epoch": 0.4159326619570119, + "flos": 23881602021120.0, + "grad_norm": 1.8609084037764254, + "language_loss": 0.80841225, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.82977629, + "num_input_tokens_seen": 148547270, + "step": 6918, + "time_per_iteration": 2.71455979347229 + }, + { + "auxiliary_loss_clip": 0.01122968, + "auxiliary_loss_mlp": 0.01038479, + "balance_loss_clip": 1.05033052, + "balance_loss_mlp": 1.02306533, + "epoch": 0.41599278520967986, + "flos": 40626531354240.0, + "grad_norm": 1.460873312199365, + "language_loss": 0.70399261, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.72560704, + "num_input_tokens_seen": 148572100, + "step": 6919, + "time_per_iteration": 2.784090518951416 + }, + { + "auxiliary_loss_clip": 0.01108371, + "auxiliary_loss_mlp": 0.0104095, + "balance_loss_clip": 1.04570937, + "balance_loss_mlp": 1.0255723, + "epoch": 0.4160529084623478, + "flos": 18222942165120.0, + "grad_norm": 1.8818708282287906, + "language_loss": 0.81701922, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.83851242, + "num_input_tokens_seen": 148591245, + "step": 6920, + "time_per_iteration": 2.644867181777954 + }, + { + "auxiliary_loss_clip": 0.01113217, + "auxiliary_loss_mlp": 0.01042119, + "balance_loss_clip": 1.04909408, + "balance_loss_mlp": 1.02627623, + "epoch": 0.4161130317150158, + "flos": 13661963982720.0, + "grad_norm": 2.168550443744471, + "language_loss": 0.65408564, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.67563891, + "num_input_tokens_seen": 148607980, + "step": 6921, + "time_per_iteration": 2.647270441055298 + }, + { + "auxiliary_loss_clip": 0.01108151, + "auxiliary_loss_mlp": 0.01042421, + "balance_loss_clip": 1.04479325, + "balance_loss_mlp": 1.02768648, + "epoch": 0.41617315496768376, + "flos": 16178511767040.0, + "grad_norm": 2.3873319200859004, + "language_loss": 0.80806041, + "learning_rate": 2.629209319173274e-06, + "loss": 0.82956612, + "num_input_tokens_seen": 148624490, + "step": 6922, + "time_per_iteration": 2.6521530151367188 + }, + { + "auxiliary_loss_clip": 0.01107722, + "auxiliary_loss_mlp": 0.01037357, + "balance_loss_clip": 1.04645085, + "balance_loss_mlp": 1.02304578, + "epoch": 0.4162332782203517, + "flos": 26213317395840.0, + "grad_norm": 1.6600188367705673, + "language_loss": 0.67455506, + "learning_rate": 2.628839621341247e-06, + "loss": 0.69600594, + "num_input_tokens_seen": 148646490, + "step": 6923, + "time_per_iteration": 2.6982760429382324 + }, + { + "auxiliary_loss_clip": 0.01100761, + "auxiliary_loss_mlp": 0.01052569, + "balance_loss_clip": 1.04614723, + "balance_loss_mlp": 1.03649926, + "epoch": 0.4162934014730197, + "flos": 28183987215360.0, + "grad_norm": 2.1905305361602676, + "language_loss": 0.75802875, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.77956206, + "num_input_tokens_seen": 148668580, + "step": 6924, + "time_per_iteration": 2.746675491333008 + }, + { + "auxiliary_loss_clip": 0.01134317, + "auxiliary_loss_mlp": 0.01042613, + "balance_loss_clip": 1.04869533, + "balance_loss_mlp": 1.02842665, + "epoch": 0.41635352472568765, + "flos": 19865316654720.0, + "grad_norm": 2.7384378444587774, + "language_loss": 0.73572767, + "learning_rate": 2.62810015415423e-06, + "loss": 0.75749695, + "num_input_tokens_seen": 148688410, + "step": 6925, + "time_per_iteration": 2.6443655490875244 + }, + { + "auxiliary_loss_clip": 0.01107096, + "auxiliary_loss_mlp": 0.01035039, + "balance_loss_clip": 1.04328012, + "balance_loss_mlp": 1.02092457, + "epoch": 0.4164136479783556, + "flos": 14935356011520.0, + "grad_norm": 2.2965796841293487, + "language_loss": 0.83732742, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.85874879, + "num_input_tokens_seen": 148704855, + "step": 6926, + "time_per_iteration": 2.688778877258301 + }, + { + "auxiliary_loss_clip": 0.01101563, + "auxiliary_loss_mlp": 0.0104323, + "balance_loss_clip": 1.04851913, + "balance_loss_mlp": 1.03019416, + "epoch": 0.4164737712310236, + "flos": 21757593041280.0, + "grad_norm": 1.7122304152619183, + "language_loss": 0.86459213, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.88604003, + "num_input_tokens_seen": 148723065, + "step": 6927, + "time_per_iteration": 2.6891677379608154 + }, + { + "auxiliary_loss_clip": 0.01123007, + "auxiliary_loss_mlp": 0.01048103, + "balance_loss_clip": 1.04902172, + "balance_loss_mlp": 1.03252852, + "epoch": 0.41653389448369155, + "flos": 20740136394240.0, + "grad_norm": 2.2496180093698555, + "language_loss": 0.72619522, + "learning_rate": 2.626990774776604e-06, + "loss": 0.74790633, + "num_input_tokens_seen": 148741780, + "step": 6928, + "time_per_iteration": 2.6853785514831543 + }, + { + "auxiliary_loss_clip": 0.01103421, + "auxiliary_loss_mlp": 0.01037571, + "balance_loss_clip": 1.04516923, + "balance_loss_mlp": 1.02305102, + "epoch": 0.4165940177363595, + "flos": 24972891073920.0, + "grad_norm": 2.3320684503004667, + "language_loss": 0.781192, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.80260193, + "num_input_tokens_seen": 148759795, + "step": 6929, + "time_per_iteration": 2.675412893295288 + }, + { + "auxiliary_loss_clip": 0.01130228, + "auxiliary_loss_mlp": 0.01034459, + "balance_loss_clip": 1.04634309, + "balance_loss_mlp": 1.02042162, + "epoch": 0.41665414098902753, + "flos": 20521727746560.0, + "grad_norm": 2.2076337971053897, + "language_loss": 0.70941442, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.73106134, + "num_input_tokens_seen": 148778680, + "step": 6930, + "time_per_iteration": 2.5896191596984863 + }, + { + "auxiliary_loss_clip": 0.0110378, + "auxiliary_loss_mlp": 0.01040113, + "balance_loss_clip": 1.04316616, + "balance_loss_mlp": 1.02566469, + "epoch": 0.4167142642416955, + "flos": 19682926369920.0, + "grad_norm": 1.7468000498396183, + "language_loss": 0.81265134, + "learning_rate": 2.625881181419007e-06, + "loss": 0.83409023, + "num_input_tokens_seen": 148796470, + "step": 6931, + "time_per_iteration": 2.693753719329834 + }, + { + "auxiliary_loss_clip": 0.01073611, + "auxiliary_loss_mlp": 0.01040047, + "balance_loss_clip": 1.03671885, + "balance_loss_mlp": 1.0253247, + "epoch": 0.41677438749436346, + "flos": 23763742519680.0, + "grad_norm": 1.7136797301427433, + "language_loss": 0.78969777, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.81083435, + "num_input_tokens_seen": 148815300, + "step": 6932, + "time_per_iteration": 2.900186061859131 + }, + { + "auxiliary_loss_clip": 0.01110051, + "auxiliary_loss_mlp": 0.00772641, + "balance_loss_clip": 1.04659891, + "balance_loss_mlp": 1.00109386, + "epoch": 0.41683451074703143, + "flos": 30410053712640.0, + "grad_norm": 1.8812444225834188, + "language_loss": 0.81995165, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.83877861, + "num_input_tokens_seen": 148834315, + "step": 6933, + "time_per_iteration": 2.815415143966675 + }, + { + "auxiliary_loss_clip": 0.01135077, + "auxiliary_loss_mlp": 0.01036525, + "balance_loss_clip": 1.04731107, + "balance_loss_mlp": 1.02077699, + "epoch": 0.4168946339996994, + "flos": 21506757390720.0, + "grad_norm": 2.9283724451949236, + "language_loss": 0.76852083, + "learning_rate": 2.624771374460121e-06, + "loss": 0.79023689, + "num_input_tokens_seen": 148852420, + "step": 6934, + "time_per_iteration": 2.7175137996673584 + }, + { + "auxiliary_loss_clip": 0.01122637, + "auxiliary_loss_mlp": 0.01034712, + "balance_loss_clip": 1.048594, + "balance_loss_mlp": 1.02038264, + "epoch": 0.41695475725236736, + "flos": 17638675539840.0, + "grad_norm": 1.7602525666099749, + "language_loss": 0.67555362, + "learning_rate": 2.624401391405668e-06, + "loss": 0.6971271, + "num_input_tokens_seen": 148869305, + "step": 6935, + "time_per_iteration": 2.740238666534424 + }, + { + "auxiliary_loss_clip": 0.01106934, + "auxiliary_loss_mlp": 0.01041015, + "balance_loss_clip": 1.04740202, + "balance_loss_mlp": 1.02606606, + "epoch": 0.4170148805050353, + "flos": 15668903560320.0, + "grad_norm": 2.0770148597671834, + "language_loss": 0.73310643, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.75458586, + "num_input_tokens_seen": 148886395, + "step": 6936, + "time_per_iteration": 2.71653413772583 + }, + { + "auxiliary_loss_clip": 0.01115958, + "auxiliary_loss_mlp": 0.01036656, + "balance_loss_clip": 1.04845905, + "balance_loss_mlp": 1.02274418, + "epoch": 0.4170750037577033, + "flos": 15159151699200.0, + "grad_norm": 2.3408521316198794, + "language_loss": 0.74009961, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.76162577, + "num_input_tokens_seen": 148905235, + "step": 6937, + "time_per_iteration": 2.627197265625 + }, + { + "auxiliary_loss_clip": 0.01105318, + "auxiliary_loss_mlp": 0.01038451, + "balance_loss_clip": 1.04543686, + "balance_loss_mlp": 1.02422357, + "epoch": 0.41713512701037125, + "flos": 28768289754240.0, + "grad_norm": 2.1407867738666977, + "language_loss": 0.84349155, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.8649292, + "num_input_tokens_seen": 148928130, + "step": 6938, + "time_per_iteration": 2.7512307167053223 + }, + { + "auxiliary_loss_clip": 0.01107641, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.04718804, + "balance_loss_mlp": 1.02217638, + "epoch": 0.4171952502630392, + "flos": 28256993608320.0, + "grad_norm": 1.985550471698889, + "language_loss": 0.7437641, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.76521742, + "num_input_tokens_seen": 148948790, + "step": 6939, + "time_per_iteration": 2.8480472564697266 + }, + { + "auxiliary_loss_clip": 0.01121822, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.0470984, + "balance_loss_mlp": 1.01803446, + "epoch": 0.4172553735157072, + "flos": 24571697091840.0, + "grad_norm": 2.560264252806934, + "language_loss": 0.74981248, + "learning_rate": 2.622551121253579e-06, + "loss": 0.77136433, + "num_input_tokens_seen": 148967690, + "step": 6940, + "time_per_iteration": 2.707803249359131 + }, + { + "auxiliary_loss_clip": 0.01132435, + "auxiliary_loss_mlp": 0.01040605, + "balance_loss_clip": 1.04839242, + "balance_loss_mlp": 1.0266397, + "epoch": 0.41731549676837515, + "flos": 27045797978880.0, + "grad_norm": 2.248952291582723, + "language_loss": 0.71683985, + "learning_rate": 2.622180996345424e-06, + "loss": 0.73857027, + "num_input_tokens_seen": 148987150, + "step": 6941, + "time_per_iteration": 2.6406352519989014 + }, + { + "auxiliary_loss_clip": 0.01119657, + "auxiliary_loss_mlp": 0.0103964, + "balance_loss_clip": 1.04871619, + "balance_loss_mlp": 1.02461994, + "epoch": 0.4173756200210431, + "flos": 28394063907840.0, + "grad_norm": 2.929963903641068, + "language_loss": 0.74062824, + "learning_rate": 2.621810847844104e-06, + "loss": 0.76222122, + "num_input_tokens_seen": 149004895, + "step": 6942, + "time_per_iteration": 2.7269139289855957 + }, + { + "auxiliary_loss_clip": 0.01096497, + "auxiliary_loss_mlp": 0.01046649, + "balance_loss_clip": 1.04605746, + "balance_loss_mlp": 1.03079462, + "epoch": 0.41743574327371114, + "flos": 22521556431360.0, + "grad_norm": 2.258418581580233, + "language_loss": 0.72607493, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.74750638, + "num_input_tokens_seen": 149020970, + "step": 6943, + "time_per_iteration": 2.8146276473999023 + }, + { + "auxiliary_loss_clip": 0.01100254, + "auxiliary_loss_mlp": 0.00772502, + "balance_loss_clip": 1.04520488, + "balance_loss_mlp": 1.00081825, + "epoch": 0.4174958665263791, + "flos": 30113431200000.0, + "grad_norm": 1.7970886758223585, + "language_loss": 0.63763773, + "learning_rate": 2.621070480118111e-06, + "loss": 0.65636539, + "num_input_tokens_seen": 149041795, + "step": 6944, + "time_per_iteration": 2.7709715366363525 + }, + { + "auxiliary_loss_clip": 0.0109928, + "auxiliary_loss_mlp": 0.01037535, + "balance_loss_clip": 1.03980803, + "balance_loss_mlp": 1.02262771, + "epoch": 0.41755598977904707, + "flos": 25263444188160.0, + "grad_norm": 1.5620596317333308, + "language_loss": 0.70201832, + "learning_rate": 2.620700260921513e-06, + "loss": 0.72338641, + "num_input_tokens_seen": 149063700, + "step": 6945, + "time_per_iteration": 2.7668464183807373 + }, + { + "auxiliary_loss_clip": 0.01086028, + "auxiliary_loss_mlp": 0.01052164, + "balance_loss_clip": 1.03888953, + "balance_loss_mlp": 1.03434181, + "epoch": 0.41761611303171503, + "flos": 19828580019840.0, + "grad_norm": 3.903492543127265, + "language_loss": 0.81313473, + "learning_rate": 2.620330018187899e-06, + "loss": 0.8345167, + "num_input_tokens_seen": 149082410, + "step": 6946, + "time_per_iteration": 2.7656164169311523 + }, + { + "auxiliary_loss_clip": 0.0111906, + "auxiliary_loss_mlp": 0.01033842, + "balance_loss_clip": 1.04820168, + "balance_loss_mlp": 1.01947689, + "epoch": 0.417676236284383, + "flos": 15523249910400.0, + "grad_norm": 3.3237502950686997, + "language_loss": 0.77819085, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.79971987, + "num_input_tokens_seen": 149098745, + "step": 6947, + "time_per_iteration": 2.6658904552459717 + }, + { + "auxiliary_loss_clip": 0.01131014, + "auxiliary_loss_mlp": 0.01035473, + "balance_loss_clip": 1.04678917, + "balance_loss_mlp": 1.020262, + "epoch": 0.41773635953705096, + "flos": 32524473761280.0, + "grad_norm": 4.535535573323162, + "language_loss": 0.72142154, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.7430864, + "num_input_tokens_seen": 149122255, + "step": 6948, + "time_per_iteration": 2.728604316711426 + }, + { + "auxiliary_loss_clip": 0.0111373, + "auxiliary_loss_mlp": 0.01035464, + "balance_loss_clip": 1.04416013, + "balance_loss_mlp": 1.02127814, + "epoch": 0.4177964827897189, + "flos": 23440941970560.0, + "grad_norm": 1.752796472610303, + "language_loss": 0.77020466, + "learning_rate": 2.619219148905362e-06, + "loss": 0.79169655, + "num_input_tokens_seen": 149142845, + "step": 6949, + "time_per_iteration": 4.2494752407073975 + }, + { + "auxiliary_loss_clip": 0.011131, + "auxiliary_loss_mlp": 0.01040025, + "balance_loss_clip": 1.05060196, + "balance_loss_mlp": 1.02523708, + "epoch": 0.4178566060423869, + "flos": 22748907565440.0, + "grad_norm": 1.637174584956538, + "language_loss": 0.8214075, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.84293878, + "num_input_tokens_seen": 149163375, + "step": 6950, + "time_per_iteration": 2.7383689880371094 + }, + { + "auxiliary_loss_clip": 0.01099413, + "auxiliary_loss_mlp": 0.00770849, + "balance_loss_clip": 1.04511857, + "balance_loss_mlp": 1.00090635, + "epoch": 0.41791672929505486, + "flos": 26032794618240.0, + "grad_norm": 1.501775844018401, + "language_loss": 0.7649653, + "learning_rate": 2.618478451956007e-06, + "loss": 0.78366792, + "num_input_tokens_seen": 149185610, + "step": 6951, + "time_per_iteration": 5.789496660232544 + }, + { + "auxiliary_loss_clip": 0.01088001, + "auxiliary_loss_mlp": 0.01034314, + "balance_loss_clip": 1.04565978, + "balance_loss_mlp": 1.01929939, + "epoch": 0.4179768525477228, + "flos": 19568694142080.0, + "grad_norm": 1.8438034417752391, + "language_loss": 0.73442549, + "learning_rate": 2.61810806829516e-06, + "loss": 0.75564867, + "num_input_tokens_seen": 149203990, + "step": 6952, + "time_per_iteration": 2.762404680252075 + }, + { + "auxiliary_loss_clip": 0.01116339, + "auxiliary_loss_mlp": 0.01038971, + "balance_loss_clip": 1.04836369, + "balance_loss_mlp": 1.0251013, + "epoch": 0.4180369758003908, + "flos": 17783826399360.0, + "grad_norm": 2.8847563198217667, + "language_loss": 0.7161783, + "learning_rate": 2.617737661195593e-06, + "loss": 0.73773146, + "num_input_tokens_seen": 149221385, + "step": 6953, + "time_per_iteration": 2.6514034271240234 + }, + { + "auxiliary_loss_clip": 0.01118442, + "auxiliary_loss_mlp": 0.01038634, + "balance_loss_clip": 1.04711723, + "balance_loss_mlp": 1.02363181, + "epoch": 0.41809709905305875, + "flos": 20960663944320.0, + "grad_norm": 1.7834717110535325, + "language_loss": 0.75982141, + "learning_rate": 2.617367230671353e-06, + "loss": 0.78139216, + "num_input_tokens_seen": 149241175, + "step": 6954, + "time_per_iteration": 4.3135082721710205 + }, + { + "auxiliary_loss_clip": 0.01092319, + "auxiliary_loss_mlp": 0.01046188, + "balance_loss_clip": 1.04647863, + "balance_loss_mlp": 1.02979708, + "epoch": 0.4181572223057267, + "flos": 22017622573440.0, + "grad_norm": 2.907950037168039, + "language_loss": 0.84492826, + "learning_rate": 2.616996776736485e-06, + "loss": 0.86631334, + "num_input_tokens_seen": 149259115, + "step": 6955, + "time_per_iteration": 2.7724356651306152 + }, + { + "auxiliary_loss_clip": 0.01121525, + "auxiliary_loss_mlp": 0.01040437, + "balance_loss_clip": 1.04870594, + "balance_loss_mlp": 1.02604234, + "epoch": 0.4182173455583947, + "flos": 26245528917120.0, + "grad_norm": 1.6794559400644542, + "language_loss": 0.83262718, + "learning_rate": 2.616626299405037e-06, + "loss": 0.8542468, + "num_input_tokens_seen": 149278705, + "step": 6956, + "time_per_iteration": 2.7260353565216064 + }, + { + "auxiliary_loss_clip": 0.01093652, + "auxiliary_loss_mlp": 0.01039325, + "balance_loss_clip": 1.04491091, + "balance_loss_mlp": 1.02423358, + "epoch": 0.4182774688110627, + "flos": 14791605782400.0, + "grad_norm": 2.3946498969788634, + "language_loss": 0.71788859, + "learning_rate": 2.616255798691059e-06, + "loss": 0.73921835, + "num_input_tokens_seen": 149294040, + "step": 6957, + "time_per_iteration": 2.6826114654541016 + }, + { + "auxiliary_loss_clip": 0.01099548, + "auxiliary_loss_mlp": 0.01043781, + "balance_loss_clip": 1.0462482, + "balance_loss_mlp": 1.02966106, + "epoch": 0.41833759206373067, + "flos": 20412020632320.0, + "grad_norm": 2.4781797095716276, + "language_loss": 0.75947559, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.78090888, + "num_input_tokens_seen": 149310385, + "step": 6958, + "time_per_iteration": 2.7528226375579834 + }, + { + "auxiliary_loss_clip": 0.01083285, + "auxiliary_loss_mlp": 0.00772338, + "balance_loss_clip": 1.04087532, + "balance_loss_mlp": 1.0007602, + "epoch": 0.41839771531639863, + "flos": 23656333875840.0, + "grad_norm": 1.8764496083097535, + "language_loss": 0.7693305, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.78788674, + "num_input_tokens_seen": 149328235, + "step": 6959, + "time_per_iteration": 2.7859151363372803 + }, + { + "auxiliary_loss_clip": 0.01089374, + "auxiliary_loss_mlp": 0.00772565, + "balance_loss_clip": 1.04304624, + "balance_loss_mlp": 1.00090861, + "epoch": 0.4184578385690666, + "flos": 19754137082880.0, + "grad_norm": 2.1131068778060498, + "language_loss": 0.77339065, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.79201001, + "num_input_tokens_seen": 149347465, + "step": 6960, + "time_per_iteration": 2.7497265338897705 + }, + { + "auxiliary_loss_clip": 0.01098942, + "auxiliary_loss_mlp": 0.01037539, + "balance_loss_clip": 1.04735017, + "balance_loss_mlp": 1.02385998, + "epoch": 0.41851796182173456, + "flos": 20193396503040.0, + "grad_norm": 1.8404962312042226, + "language_loss": 0.75842559, + "learning_rate": 2.614773562290835e-06, + "loss": 0.7797904, + "num_input_tokens_seen": 149366685, + "step": 6961, + "time_per_iteration": 2.6800267696380615 + }, + { + "auxiliary_loss_clip": 0.01038031, + "auxiliary_loss_mlp": 0.01001682, + "balance_loss_clip": 1.03925419, + "balance_loss_mlp": 0.99970287, + "epoch": 0.41857808507440253, + "flos": 59018794231680.0, + "grad_norm": 0.7827663866056928, + "language_loss": 0.54655838, + "learning_rate": 2.61440294487496e-06, + "loss": 0.56695551, + "num_input_tokens_seen": 149422925, + "step": 6962, + "time_per_iteration": 3.1537134647369385 + }, + { + "auxiliary_loss_clip": 0.01120288, + "auxiliary_loss_mlp": 0.0104634, + "balance_loss_clip": 1.04961705, + "balance_loss_mlp": 1.0318327, + "epoch": 0.4186382083270705, + "flos": 18478805719680.0, + "grad_norm": 1.960507757786237, + "language_loss": 0.85535777, + "learning_rate": 2.614032304160864e-06, + "loss": 0.87702405, + "num_input_tokens_seen": 149440820, + "step": 6963, + "time_per_iteration": 2.5925374031066895 + }, + { + "auxiliary_loss_clip": 0.01106535, + "auxiliary_loss_mlp": 0.01041093, + "balance_loss_clip": 1.04856253, + "balance_loss_mlp": 1.02657938, + "epoch": 0.41869833157973846, + "flos": 21578758202880.0, + "grad_norm": 1.6555227491445992, + "language_loss": 0.70422602, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.72570229, + "num_input_tokens_seen": 149461060, + "step": 6964, + "time_per_iteration": 2.675595760345459 + }, + { + "auxiliary_loss_clip": 0.01131013, + "auxiliary_loss_mlp": 0.01048168, + "balance_loss_clip": 1.04926276, + "balance_loss_mlp": 1.03433418, + "epoch": 0.4187584548324064, + "flos": 35517412650240.0, + "grad_norm": 2.107779734715906, + "language_loss": 0.71486962, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.73666137, + "num_input_tokens_seen": 149483115, + "step": 6965, + "time_per_iteration": 2.728795289993286 + }, + { + "auxiliary_loss_clip": 0.01081273, + "auxiliary_loss_mlp": 0.01038276, + "balance_loss_clip": 1.04315698, + "balance_loss_mlp": 1.02465594, + "epoch": 0.4188185780850744, + "flos": 18655880791680.0, + "grad_norm": 1.546256806673652, + "language_loss": 0.71920437, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.74039984, + "num_input_tokens_seen": 149501495, + "step": 6966, + "time_per_iteration": 2.9000282287597656 + }, + { + "auxiliary_loss_clip": 0.01127558, + "auxiliary_loss_mlp": 0.01037127, + "balance_loss_clip": 1.04965436, + "balance_loss_mlp": 1.02194023, + "epoch": 0.41887870133774235, + "flos": 40333428374400.0, + "grad_norm": 2.0539481091161664, + "language_loss": 0.71188843, + "learning_rate": 2.612549508603375e-06, + "loss": 0.73353529, + "num_input_tokens_seen": 149523170, + "step": 6967, + "time_per_iteration": 2.8494174480438232 + }, + { + "auxiliary_loss_clip": 0.01059483, + "auxiliary_loss_mlp": 0.01001432, + "balance_loss_clip": 1.039819, + "balance_loss_mlp": 0.99973947, + "epoch": 0.4189388245904103, + "flos": 61371336516480.0, + "grad_norm": 0.6719582962825281, + "language_loss": 0.46191829, + "learning_rate": 2.612178751609011e-06, + "loss": 0.48252743, + "num_input_tokens_seen": 149583955, + "step": 6968, + "time_per_iteration": 3.2362303733825684 + }, + { + "auxiliary_loss_clip": 0.01123461, + "auxiliary_loss_mlp": 0.01043061, + "balance_loss_clip": 1.04708195, + "balance_loss_mlp": 1.02722979, + "epoch": 0.4189989478430783, + "flos": 28215624119040.0, + "grad_norm": 2.2684151977061386, + "language_loss": 0.75044996, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.77211517, + "num_input_tokens_seen": 149604440, + "step": 6969, + "time_per_iteration": 2.836956739425659 + }, + { + "auxiliary_loss_clip": 0.01108551, + "auxiliary_loss_mlp": 0.01045091, + "balance_loss_clip": 1.0470643, + "balance_loss_mlp": 1.03178096, + "epoch": 0.4190590710957463, + "flos": 24565879088640.0, + "grad_norm": 1.9985372976124152, + "language_loss": 0.8083396, + "learning_rate": 2.611437167992705e-06, + "loss": 0.82987607, + "num_input_tokens_seen": 149623745, + "step": 6970, + "time_per_iteration": 2.7209956645965576 + }, + { + "auxiliary_loss_clip": 0.01119916, + "auxiliary_loss_mlp": 0.0104141, + "balance_loss_clip": 1.04898238, + "balance_loss_mlp": 1.02689075, + "epoch": 0.41911919434841427, + "flos": 21726027964800.0, + "grad_norm": 2.2489196165322713, + "language_loss": 0.82699662, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.84860986, + "num_input_tokens_seen": 149643025, + "step": 6971, + "time_per_iteration": 2.6844992637634277 + }, + { + "auxiliary_loss_clip": 0.01105807, + "auxiliary_loss_mlp": 0.01047014, + "balance_loss_clip": 1.0493474, + "balance_loss_mlp": 1.03207135, + "epoch": 0.41917931760108224, + "flos": 17601543855360.0, + "grad_norm": 1.6553402405348427, + "language_loss": 0.74262661, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.76415479, + "num_input_tokens_seen": 149660695, + "step": 6972, + "time_per_iteration": 2.6240105628967285 + }, + { + "auxiliary_loss_clip": 0.01102199, + "auxiliary_loss_mlp": 0.01040482, + "balance_loss_clip": 1.0421176, + "balance_loss_mlp": 1.02589083, + "epoch": 0.4192394408537502, + "flos": 37816701022080.0, + "grad_norm": 1.5708676830874608, + "language_loss": 0.72811258, + "learning_rate": 2.610324618710212e-06, + "loss": 0.74953938, + "num_input_tokens_seen": 149682040, + "step": 6973, + "time_per_iteration": 2.8109309673309326 + }, + { + "auxiliary_loss_clip": 0.01101478, + "auxiliary_loss_mlp": 0.01038786, + "balance_loss_clip": 1.05107093, + "balance_loss_mlp": 1.02461183, + "epoch": 0.41929956410641817, + "flos": 23107726477440.0, + "grad_norm": 1.8294609220169469, + "language_loss": 0.74864107, + "learning_rate": 2.609953722643489e-06, + "loss": 0.77004373, + "num_input_tokens_seen": 149700855, + "step": 6974, + "time_per_iteration": 2.7036855220794678 + }, + { + "auxiliary_loss_clip": 0.01117361, + "auxiliary_loss_mlp": 0.01037617, + "balance_loss_clip": 1.04402697, + "balance_loss_mlp": 1.02359784, + "epoch": 0.41935968735908613, + "flos": 22524537260160.0, + "grad_norm": 1.843462386151443, + "language_loss": 0.7271533, + "learning_rate": 2.609582803447259e-06, + "loss": 0.748703, + "num_input_tokens_seen": 149717360, + "step": 6975, + "time_per_iteration": 2.632661819458008 + }, + { + "auxiliary_loss_clip": 0.01113766, + "auxiliary_loss_mlp": 0.01042513, + "balance_loss_clip": 1.04679942, + "balance_loss_mlp": 1.02849412, + "epoch": 0.4194198106117541, + "flos": 26870446759680.0, + "grad_norm": 1.580698900699299, + "language_loss": 0.80874467, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.83030754, + "num_input_tokens_seen": 149738975, + "step": 6976, + "time_per_iteration": 2.68833327293396 + }, + { + "auxiliary_loss_clip": 0.01098184, + "auxiliary_loss_mlp": 0.01042179, + "balance_loss_clip": 1.04087496, + "balance_loss_mlp": 1.02671123, + "epoch": 0.41947993386442206, + "flos": 19902412425600.0, + "grad_norm": 4.6015574144833264, + "language_loss": 0.6767152, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.69811881, + "num_input_tokens_seen": 149757055, + "step": 6977, + "time_per_iteration": 2.6453959941864014 + }, + { + "auxiliary_loss_clip": 0.01122702, + "auxiliary_loss_mlp": 0.0104277, + "balance_loss_clip": 1.04980922, + "balance_loss_mlp": 1.02926338, + "epoch": 0.41954005711709, + "flos": 17383889393280.0, + "grad_norm": 2.3946463459425966, + "language_loss": 0.80506754, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.82672226, + "num_input_tokens_seen": 149772885, + "step": 6978, + "time_per_iteration": 2.596269369125366 + }, + { + "auxiliary_loss_clip": 0.01133146, + "auxiliary_loss_mlp": 0.0103908, + "balance_loss_clip": 1.04677558, + "balance_loss_mlp": 1.02459598, + "epoch": 0.419600180369758, + "flos": 25003306915200.0, + "grad_norm": 1.7226002389356767, + "language_loss": 0.82708085, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.84880304, + "num_input_tokens_seen": 149791515, + "step": 6979, + "time_per_iteration": 2.588383197784424 + }, + { + "auxiliary_loss_clip": 0.01129014, + "auxiliary_loss_mlp": 0.01037351, + "balance_loss_clip": 1.04659355, + "balance_loss_mlp": 1.02302253, + "epoch": 0.41966030362242596, + "flos": 17383781652480.0, + "grad_norm": 2.4214608222579206, + "language_loss": 0.83723533, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.85889894, + "num_input_tokens_seen": 149807250, + "step": 6980, + "time_per_iteration": 2.5890002250671387 + }, + { + "auxiliary_loss_clip": 0.01132913, + "auxiliary_loss_mlp": 0.01043925, + "balance_loss_clip": 1.04753232, + "balance_loss_mlp": 1.02994215, + "epoch": 0.4197204268750939, + "flos": 22156165330560.0, + "grad_norm": 2.919161771051539, + "language_loss": 0.7951659, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.81693423, + "num_input_tokens_seen": 149821640, + "step": 6981, + "time_per_iteration": 2.6015915870666504 + }, + { + "auxiliary_loss_clip": 0.01096505, + "auxiliary_loss_mlp": 0.01037263, + "balance_loss_clip": 1.04636097, + "balance_loss_mlp": 1.02382243, + "epoch": 0.4197805501277619, + "flos": 22084128604800.0, + "grad_norm": 2.285836698514787, + "language_loss": 0.84386683, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.86520445, + "num_input_tokens_seen": 149840545, + "step": 6982, + "time_per_iteration": 2.755657434463501 + }, + { + "auxiliary_loss_clip": 0.01120032, + "auxiliary_loss_mlp": 0.01038775, + "balance_loss_clip": 1.04708028, + "balance_loss_mlp": 1.02419019, + "epoch": 0.4198406733804299, + "flos": 26432192920320.0, + "grad_norm": 2.941579449236281, + "language_loss": 0.57212174, + "learning_rate": 2.606614618903214e-06, + "loss": 0.59370977, + "num_input_tokens_seen": 149860375, + "step": 6983, + "time_per_iteration": 2.699927568435669 + }, + { + "auxiliary_loss_clip": 0.01120799, + "auxiliary_loss_mlp": 0.01037958, + "balance_loss_clip": 1.05017662, + "balance_loss_mlp": 1.02513719, + "epoch": 0.4199007966330979, + "flos": 12531029293440.0, + "grad_norm": 1.788715678149628, + "language_loss": 0.82569104, + "learning_rate": 2.606243492174471e-06, + "loss": 0.84727859, + "num_input_tokens_seen": 149877850, + "step": 6984, + "time_per_iteration": 2.6608574390411377 + }, + { + "auxiliary_loss_clip": 0.01110821, + "auxiliary_loss_mlp": 0.01031336, + "balance_loss_clip": 1.04403567, + "balance_loss_mlp": 1.01740074, + "epoch": 0.41996091988576584, + "flos": 21762944167680.0, + "grad_norm": 1.8578510762238896, + "language_loss": 0.79251826, + "learning_rate": 2.605872342456914e-06, + "loss": 0.81393987, + "num_input_tokens_seen": 149896110, + "step": 6985, + "time_per_iteration": 2.6915009021759033 + }, + { + "auxiliary_loss_clip": 0.01134356, + "auxiliary_loss_mlp": 0.01037444, + "balance_loss_clip": 1.04694271, + "balance_loss_mlp": 1.02278078, + "epoch": 0.4200210431384338, + "flos": 26541935948160.0, + "grad_norm": 1.6735394330256788, + "language_loss": 0.78439772, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.80611569, + "num_input_tokens_seen": 149916495, + "step": 6986, + "time_per_iteration": 2.6553595066070557 + }, + { + "auxiliary_loss_clip": 0.01108367, + "auxiliary_loss_mlp": 0.01032308, + "balance_loss_clip": 1.04705167, + "balance_loss_mlp": 1.01957011, + "epoch": 0.42008116639110177, + "flos": 26795824254720.0, + "grad_norm": 1.6966099884396408, + "language_loss": 0.72624969, + "learning_rate": 2.605129974111655e-06, + "loss": 0.7476564, + "num_input_tokens_seen": 149936445, + "step": 6987, + "time_per_iteration": 2.7428104877471924 + }, + { + "auxiliary_loss_clip": 0.01105896, + "auxiliary_loss_mlp": 0.00774749, + "balance_loss_clip": 1.04440594, + "balance_loss_mlp": 1.00098395, + "epoch": 0.42014128964376973, + "flos": 32087333243520.0, + "grad_norm": 1.4394465087417463, + "language_loss": 0.74992245, + "learning_rate": 2.604758755512104e-06, + "loss": 0.76872891, + "num_input_tokens_seen": 149959430, + "step": 6988, + "time_per_iteration": 4.499454975128174 + }, + { + "auxiliary_loss_clip": 0.01124153, + "auxiliary_loss_mlp": 0.01040193, + "balance_loss_clip": 1.04908502, + "balance_loss_mlp": 1.02585781, + "epoch": 0.4202014128964377, + "flos": 26467133875200.0, + "grad_norm": 1.6029470393888554, + "language_loss": 0.73995304, + "learning_rate": 2.60438751398004e-06, + "loss": 0.76159656, + "num_input_tokens_seen": 149980365, + "step": 6989, + "time_per_iteration": 2.6979968547821045 + }, + { + "auxiliary_loss_clip": 0.01109104, + "auxiliary_loss_mlp": 0.01037728, + "balance_loss_clip": 1.04531431, + "balance_loss_mlp": 1.02353013, + "epoch": 0.42026153614910566, + "flos": 13401216178560.0, + "grad_norm": 2.8939358842188043, + "language_loss": 0.70562875, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.72709703, + "num_input_tokens_seen": 149997375, + "step": 6990, + "time_per_iteration": 4.2269814014434814 + }, + { + "auxiliary_loss_clip": 0.01052428, + "auxiliary_loss_mlp": 0.00753318, + "balance_loss_clip": 1.03888559, + "balance_loss_mlp": 1.00109041, + "epoch": 0.42032165940177363, + "flos": 60250457635200.0, + "grad_norm": 1.417771869116233, + "language_loss": 0.60470819, + "learning_rate": 2.603644962174685e-06, + "loss": 0.62276566, + "num_input_tokens_seen": 150051230, + "step": 6991, + "time_per_iteration": 4.600361585617065 + }, + { + "auxiliary_loss_clip": 0.01135512, + "auxiliary_loss_mlp": 0.01038632, + "balance_loss_clip": 1.05044973, + "balance_loss_mlp": 1.02417135, + "epoch": 0.4203817826544416, + "flos": 24535211852160.0, + "grad_norm": 1.4766426515770763, + "language_loss": 0.832901, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.85464245, + "num_input_tokens_seen": 150071135, + "step": 6992, + "time_per_iteration": 2.688693046569824 + }, + { + "auxiliary_loss_clip": 0.01058225, + "auxiliary_loss_mlp": 0.01016781, + "balance_loss_clip": 1.02967906, + "balance_loss_mlp": 1.01523161, + "epoch": 0.42044190590710956, + "flos": 58820781530880.0, + "grad_norm": 0.8077151468791776, + "language_loss": 0.65494478, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.67569482, + "num_input_tokens_seen": 150125220, + "step": 6993, + "time_per_iteration": 4.7132039070129395 + }, + { + "auxiliary_loss_clip": 0.011371, + "auxiliary_loss_mlp": 0.010386, + "balance_loss_clip": 1.04959965, + "balance_loss_mlp": 1.02267361, + "epoch": 0.4205020291597775, + "flos": 16436063260800.0, + "grad_norm": 1.948890763784571, + "language_loss": 0.83380342, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.85556042, + "num_input_tokens_seen": 150142300, + "step": 6994, + "time_per_iteration": 2.5883679389953613 + }, + { + "auxiliary_loss_clip": 0.01120964, + "auxiliary_loss_mlp": 0.00771063, + "balance_loss_clip": 1.04939461, + "balance_loss_mlp": 1.00095451, + "epoch": 0.4205621524124455, + "flos": 18405655672320.0, + "grad_norm": 1.5483229522184627, + "language_loss": 0.78529471, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.80421495, + "num_input_tokens_seen": 150161345, + "step": 6995, + "time_per_iteration": 2.716649055480957 + }, + { + "auxiliary_loss_clip": 0.0109323, + "auxiliary_loss_mlp": 0.01032227, + "balance_loss_clip": 1.04375339, + "balance_loss_mlp": 1.01855421, + "epoch": 0.4206222756651135, + "flos": 25520097841920.0, + "grad_norm": 1.4060831947988737, + "language_loss": 0.80397403, + "learning_rate": 2.60178818232786e-06, + "loss": 0.82522857, + "num_input_tokens_seen": 150182420, + "step": 6996, + "time_per_iteration": 2.773655891418457 + }, + { + "auxiliary_loss_clip": 0.01111456, + "auxiliary_loss_mlp": 0.00771084, + "balance_loss_clip": 1.0477984, + "balance_loss_mlp": 1.00100029, + "epoch": 0.4206823989177815, + "flos": 15304338472320.0, + "grad_norm": 1.9934224916744, + "language_loss": 0.7558648, + "learning_rate": 2.601416757842559e-06, + "loss": 0.77469015, + "num_input_tokens_seen": 150200175, + "step": 6997, + "time_per_iteration": 2.6486191749572754 + }, + { + "auxiliary_loss_clip": 0.01130573, + "auxiliary_loss_mlp": 0.01042531, + "balance_loss_clip": 1.04606771, + "balance_loss_mlp": 1.02835727, + "epoch": 0.42074252217044944, + "flos": 15554096714880.0, + "grad_norm": 3.451993658012451, + "language_loss": 0.75860173, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.78033274, + "num_input_tokens_seen": 150217100, + "step": 6998, + "time_per_iteration": 2.548783540725708 + }, + { + "auxiliary_loss_clip": 0.01136566, + "auxiliary_loss_mlp": 0.01042996, + "balance_loss_clip": 1.05027032, + "balance_loss_mlp": 1.02827394, + "epoch": 0.4208026454231174, + "flos": 26145877610880.0, + "grad_norm": 1.6802908884651202, + "language_loss": 0.76294345, + "learning_rate": 2.60067384046869e-06, + "loss": 0.78473908, + "num_input_tokens_seen": 150239830, + "step": 6999, + "time_per_iteration": 2.6605780124664307 + }, + { + "auxiliary_loss_clip": 0.01082307, + "auxiliary_loss_mlp": 0.01039523, + "balance_loss_clip": 1.04213417, + "balance_loss_mlp": 1.02420449, + "epoch": 0.42086276867578537, + "flos": 23550110380800.0, + "grad_norm": 2.828142255503796, + "language_loss": 0.64361006, + "learning_rate": 2.600302347608295e-06, + "loss": 0.66482836, + "num_input_tokens_seen": 150260690, + "step": 7000, + "time_per_iteration": 2.7295126914978027 + }, + { + "auxiliary_loss_clip": 0.01089826, + "auxiliary_loss_mlp": 0.01039051, + "balance_loss_clip": 1.04259682, + "balance_loss_mlp": 1.02433491, + "epoch": 0.42092289192845334, + "flos": 18113414618880.0, + "grad_norm": 2.276209987309232, + "language_loss": 0.76550955, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.78679836, + "num_input_tokens_seen": 150279885, + "step": 7001, + "time_per_iteration": 2.793407917022705 + }, + { + "auxiliary_loss_clip": 0.01091534, + "auxiliary_loss_mlp": 0.00771163, + "balance_loss_clip": 1.04483819, + "balance_loss_mlp": 1.00107491, + "epoch": 0.4209830151811213, + "flos": 20006588845440.0, + "grad_norm": 1.4928891465725471, + "language_loss": 0.86682802, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.88545501, + "num_input_tokens_seen": 150297390, + "step": 7002, + "time_per_iteration": 2.719127655029297 + }, + { + "auxiliary_loss_clip": 0.0109333, + "auxiliary_loss_mlp": 0.0103625, + "balance_loss_clip": 1.04801917, + "balance_loss_mlp": 1.02297568, + "epoch": 0.42104313843378927, + "flos": 21978946604160.0, + "grad_norm": 1.8843999139097827, + "language_loss": 0.67807466, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.6993705, + "num_input_tokens_seen": 150317390, + "step": 7003, + "time_per_iteration": 2.732848882675171 + }, + { + "auxiliary_loss_clip": 0.01132341, + "auxiliary_loss_mlp": 0.01035322, + "balance_loss_clip": 1.04725492, + "balance_loss_mlp": 1.02031398, + "epoch": 0.42110326168645723, + "flos": 25443966965760.0, + "grad_norm": 1.9778982096910334, + "language_loss": 0.77774739, + "learning_rate": 2.598816148672344e-06, + "loss": 0.79942405, + "num_input_tokens_seen": 150337455, + "step": 7004, + "time_per_iteration": 2.630838394165039 + }, + { + "auxiliary_loss_clip": 0.01129987, + "auxiliary_loss_mlp": 0.0103854, + "balance_loss_clip": 1.04988933, + "balance_loss_mlp": 1.02351916, + "epoch": 0.4211633849391252, + "flos": 17822574195840.0, + "grad_norm": 2.0674356984544557, + "language_loss": 0.67855948, + "learning_rate": 2.59844454213521e-06, + "loss": 0.70024478, + "num_input_tokens_seen": 150355385, + "step": 7005, + "time_per_iteration": 2.588533401489258 + }, + { + "auxiliary_loss_clip": 0.01121703, + "auxiliary_loss_mlp": 0.01033597, + "balance_loss_clip": 1.0483923, + "balance_loss_mlp": 1.01941752, + "epoch": 0.42122350819179316, + "flos": 16282436791680.0, + "grad_norm": 1.9633544911967673, + "language_loss": 0.72481513, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.74636805, + "num_input_tokens_seen": 150371750, + "step": 7006, + "time_per_iteration": 2.5879828929901123 + }, + { + "auxiliary_loss_clip": 0.01133912, + "auxiliary_loss_mlp": 0.01032205, + "balance_loss_clip": 1.04963207, + "balance_loss_mlp": 1.01787031, + "epoch": 0.4212836314444611, + "flos": 19645866512640.0, + "grad_norm": 1.722108681435548, + "language_loss": 0.70495522, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.72661638, + "num_input_tokens_seen": 150389955, + "step": 7007, + "time_per_iteration": 2.5199153423309326 + }, + { + "auxiliary_loss_clip": 0.0110564, + "auxiliary_loss_mlp": 0.00771949, + "balance_loss_clip": 1.04377306, + "balance_loss_mlp": 1.00098372, + "epoch": 0.4213437546971291, + "flos": 18369026778240.0, + "grad_norm": 1.772679877033185, + "language_loss": 0.82893503, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.84771085, + "num_input_tokens_seen": 150405780, + "step": 7008, + "time_per_iteration": 2.6636033058166504 + }, + { + "auxiliary_loss_clip": 0.01089865, + "auxiliary_loss_mlp": 0.01039598, + "balance_loss_clip": 1.04483509, + "balance_loss_mlp": 1.02535856, + "epoch": 0.42140387794979706, + "flos": 27704507541120.0, + "grad_norm": 1.895033591472922, + "language_loss": 0.72206765, + "learning_rate": 2.596957889196831e-06, + "loss": 0.74336231, + "num_input_tokens_seen": 150425615, + "step": 7009, + "time_per_iteration": 2.738678216934204 + }, + { + "auxiliary_loss_clip": 0.01132456, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.04812074, + "balance_loss_mlp": 1.01674712, + "epoch": 0.4214640012024651, + "flos": 28147071012480.0, + "grad_norm": 2.558025018080716, + "language_loss": 0.66191494, + "learning_rate": 2.596586169335243e-06, + "loss": 0.68354768, + "num_input_tokens_seen": 150445765, + "step": 7010, + "time_per_iteration": 2.6812071800231934 + }, + { + "auxiliary_loss_clip": 0.01092262, + "auxiliary_loss_mlp": 0.01032367, + "balance_loss_clip": 1.0424943, + "balance_loss_mlp": 1.01774001, + "epoch": 0.42152412445513304, + "flos": 22997265177600.0, + "grad_norm": 2.024875050938184, + "language_loss": 0.72456133, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.74580765, + "num_input_tokens_seen": 150464405, + "step": 7011, + "time_per_iteration": 2.741454601287842 + }, + { + "auxiliary_loss_clip": 0.01046137, + "auxiliary_loss_mlp": 0.01001201, + "balance_loss_clip": 1.02718639, + "balance_loss_mlp": 0.99971068, + "epoch": 0.421584247707801, + "flos": 63749592938880.0, + "grad_norm": 0.7906258228604641, + "language_loss": 0.54322207, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.56369549, + "num_input_tokens_seen": 150520430, + "step": 7012, + "time_per_iteration": 3.1284689903259277 + }, + { + "auxiliary_loss_clip": 0.01123004, + "auxiliary_loss_mlp": 0.01031037, + "balance_loss_clip": 1.05000162, + "balance_loss_mlp": 1.01663089, + "epoch": 0.421644370960469, + "flos": 24314612474880.0, + "grad_norm": 1.3828895368097467, + "language_loss": 0.78401852, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.80555892, + "num_input_tokens_seen": 150542610, + "step": 7013, + "time_per_iteration": 2.6729819774627686 + }, + { + "auxiliary_loss_clip": 0.01133162, + "auxiliary_loss_mlp": 0.01033094, + "balance_loss_clip": 1.04858398, + "balance_loss_mlp": 1.01826453, + "epoch": 0.42170449421313694, + "flos": 23440690575360.0, + "grad_norm": 1.8094728177732207, + "language_loss": 0.81603825, + "learning_rate": 2.595099063803787e-06, + "loss": 0.83770084, + "num_input_tokens_seen": 150560970, + "step": 7014, + "time_per_iteration": 2.662652015686035 + }, + { + "auxiliary_loss_clip": 0.01117627, + "auxiliary_loss_mlp": 0.0103256, + "balance_loss_clip": 1.04452634, + "balance_loss_mlp": 1.01831448, + "epoch": 0.4217646174658049, + "flos": 23695476721920.0, + "grad_norm": 1.7861369926261594, + "language_loss": 0.77908784, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.80058968, + "num_input_tokens_seen": 150582615, + "step": 7015, + "time_per_iteration": 2.763761043548584 + }, + { + "auxiliary_loss_clip": 0.01132815, + "auxiliary_loss_mlp": 0.01036697, + "balance_loss_clip": 1.04966104, + "balance_loss_mlp": 1.02183151, + "epoch": 0.42182474071847287, + "flos": 24971562270720.0, + "grad_norm": 1.3268186837565954, + "language_loss": 0.82412994, + "learning_rate": 2.594355375584368e-06, + "loss": 0.84582508, + "num_input_tokens_seen": 150603640, + "step": 7016, + "time_per_iteration": 2.771812677383423 + }, + { + "auxiliary_loss_clip": 0.01091213, + "auxiliary_loss_mlp": 0.0103466, + "balance_loss_clip": 1.04072332, + "balance_loss_mlp": 1.01999736, + "epoch": 0.42188486397114083, + "flos": 22856639431680.0, + "grad_norm": 1.813350419138722, + "language_loss": 0.68270308, + "learning_rate": 2.593983497660586e-06, + "loss": 0.70396179, + "num_input_tokens_seen": 150622490, + "step": 7017, + "time_per_iteration": 2.703078508377075 + }, + { + "auxiliary_loss_clip": 0.01045206, + "auxiliary_loss_mlp": 0.01012048, + "balance_loss_clip": 1.02663231, + "balance_loss_mlp": 1.01053989, + "epoch": 0.4219449872238088, + "flos": 66975700965120.0, + "grad_norm": 0.7659311952437052, + "language_loss": 0.59381223, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.61438477, + "num_input_tokens_seen": 150689545, + "step": 7018, + "time_per_iteration": 3.2514843940734863 + }, + { + "auxiliary_loss_clip": 0.01113322, + "auxiliary_loss_mlp": 0.01033039, + "balance_loss_clip": 1.04147184, + "balance_loss_mlp": 1.01840591, + "epoch": 0.42200511047647676, + "flos": 13115367745920.0, + "grad_norm": 2.3056993234384957, + "language_loss": 0.75083554, + "learning_rate": 2.593239674255382e-06, + "loss": 0.77229911, + "num_input_tokens_seen": 150707610, + "step": 7019, + "time_per_iteration": 2.6845014095306396 + }, + { + "auxiliary_loss_clip": 0.01106969, + "auxiliary_loss_mlp": 0.01035543, + "balance_loss_clip": 1.04650903, + "balance_loss_mlp": 1.02023685, + "epoch": 0.42206523372914473, + "flos": 13991193066240.0, + "grad_norm": 1.8835929197669175, + "language_loss": 0.69198954, + "learning_rate": 2.592867728802166e-06, + "loss": 0.71341467, + "num_input_tokens_seen": 150724530, + "step": 7020, + "time_per_iteration": 2.635646343231201 + }, + { + "auxiliary_loss_clip": 0.01107351, + "auxiliary_loss_mlp": 0.00771638, + "balance_loss_clip": 1.04847479, + "balance_loss_mlp": 1.00088549, + "epoch": 0.4221253569818127, + "flos": 21942317710080.0, + "grad_norm": 3.182010152232146, + "language_loss": 0.81085485, + "learning_rate": 2.592495760867347e-06, + "loss": 0.82964474, + "num_input_tokens_seen": 150742870, + "step": 7021, + "time_per_iteration": 2.712358236312866 + }, + { + "auxiliary_loss_clip": 0.0105744, + "auxiliary_loss_mlp": 0.01040763, + "balance_loss_clip": 1.03628528, + "balance_loss_mlp": 1.02439523, + "epoch": 0.42218548023448066, + "flos": 32192587071360.0, + "grad_norm": 1.7516152237568758, + "language_loss": 0.70298421, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.72396624, + "num_input_tokens_seen": 150765500, + "step": 7022, + "time_per_iteration": 2.9338343143463135 + }, + { + "auxiliary_loss_clip": 0.01114774, + "auxiliary_loss_mlp": 0.01028964, + "balance_loss_clip": 1.0467478, + "balance_loss_mlp": 1.01637506, + "epoch": 0.4222456034871487, + "flos": 30118961894400.0, + "grad_norm": 1.5162864908148717, + "language_loss": 0.67418218, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.69561946, + "num_input_tokens_seen": 150784945, + "step": 7023, + "time_per_iteration": 2.7014782428741455 + }, + { + "auxiliary_loss_clip": 0.01101297, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.0460372, + "balance_loss_mlp": 1.0259577, + "epoch": 0.42230572673981664, + "flos": 22127904305280.0, + "grad_norm": 1.6579428625462107, + "language_loss": 0.69768953, + "learning_rate": 2.591379722314322e-06, + "loss": 0.71913004, + "num_input_tokens_seen": 150803120, + "step": 7024, + "time_per_iteration": 2.8669025897979736 + }, + { + "auxiliary_loss_clip": 0.011321, + "auxiliary_loss_mlp": 0.01035188, + "balance_loss_clip": 1.04982734, + "balance_loss_mlp": 1.02107334, + "epoch": 0.4223658499924846, + "flos": 22055077480320.0, + "grad_norm": 1.7199232023790467, + "language_loss": 0.76781225, + "learning_rate": 2.591007664594147e-06, + "loss": 0.7894851, + "num_input_tokens_seen": 150823135, + "step": 7025, + "time_per_iteration": 2.696200132369995 + }, + { + "auxiliary_loss_clip": 0.01097355, + "auxiliary_loss_mlp": 0.01036622, + "balance_loss_clip": 1.04367328, + "balance_loss_mlp": 1.02268052, + "epoch": 0.4224259732451526, + "flos": 20410727742720.0, + "grad_norm": 1.6766870979897237, + "language_loss": 0.79664457, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.81798434, + "num_input_tokens_seen": 150842070, + "step": 7026, + "time_per_iteration": 2.7131056785583496 + }, + { + "auxiliary_loss_clip": 0.01053, + "auxiliary_loss_mlp": 0.00999983, + "balance_loss_clip": 1.02519512, + "balance_loss_mlp": 0.9985466, + "epoch": 0.42248609649782054, + "flos": 62846655828480.0, + "grad_norm": 0.7210787168966012, + "language_loss": 0.61874068, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.63927048, + "num_input_tokens_seen": 150907450, + "step": 7027, + "time_per_iteration": 3.2111167907714844 + }, + { + "auxiliary_loss_clip": 0.01131577, + "auxiliary_loss_mlp": 0.01038162, + "balance_loss_clip": 1.05022967, + "balance_loss_mlp": 1.02400613, + "epoch": 0.4225462197504885, + "flos": 26249946289920.0, + "grad_norm": 1.8872379728212205, + "language_loss": 0.71137869, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.7330761, + "num_input_tokens_seen": 150928040, + "step": 7028, + "time_per_iteration": 4.185323476791382 + }, + { + "auxiliary_loss_clip": 0.01109127, + "auxiliary_loss_mlp": 0.01041278, + "balance_loss_clip": 1.04935491, + "balance_loss_mlp": 1.02702022, + "epoch": 0.42260634300315647, + "flos": 20521943228160.0, + "grad_norm": 3.7456767842675136, + "language_loss": 0.82652044, + "learning_rate": 2.589519209743846e-06, + "loss": 0.84802449, + "num_input_tokens_seen": 150945760, + "step": 7029, + "time_per_iteration": 2.617464542388916 + }, + { + "auxiliary_loss_clip": 0.01086316, + "auxiliary_loss_mlp": 0.01043345, + "balance_loss_clip": 1.04393244, + "balance_loss_mlp": 1.02826512, + "epoch": 0.42266646625582444, + "flos": 24316731377280.0, + "grad_norm": 1.852504104659585, + "language_loss": 0.75125468, + "learning_rate": 2.589147040109424e-06, + "loss": 0.7725513, + "num_input_tokens_seen": 150965665, + "step": 7030, + "time_per_iteration": 5.787954807281494 + }, + { + "auxiliary_loss_clip": 0.01129772, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.04772067, + "balance_loss_mlp": 1.02368367, + "epoch": 0.4227265895084924, + "flos": 24204151175040.0, + "grad_norm": 1.9107182577124318, + "language_loss": 0.86337131, + "learning_rate": 2.588774848134486e-06, + "loss": 0.88506097, + "num_input_tokens_seen": 150982260, + "step": 7031, + "time_per_iteration": 2.622174024581909 + }, + { + "auxiliary_loss_clip": 0.01120469, + "auxiliary_loss_mlp": 0.01038756, + "balance_loss_clip": 1.04873753, + "balance_loss_mlp": 1.0234381, + "epoch": 0.42278671276116037, + "flos": 16909760845440.0, + "grad_norm": 1.9974648735142886, + "language_loss": 0.73489487, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.75648719, + "num_input_tokens_seen": 150999990, + "step": 7032, + "time_per_iteration": 2.681155204772949 + }, + { + "auxiliary_loss_clip": 0.01100841, + "auxiliary_loss_mlp": 0.01044575, + "balance_loss_clip": 1.04449272, + "balance_loss_mlp": 1.029531, + "epoch": 0.42284683601382833, + "flos": 25411073086080.0, + "grad_norm": 1.657781585480679, + "language_loss": 0.70232797, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.72378218, + "num_input_tokens_seen": 151021105, + "step": 7033, + "time_per_iteration": 4.264399290084839 + }, + { + "auxiliary_loss_clip": 0.01105188, + "auxiliary_loss_mlp": 0.00773118, + "balance_loss_clip": 1.04417682, + "balance_loss_mlp": 1.00101566, + "epoch": 0.4229069592664963, + "flos": 23040322606080.0, + "grad_norm": 2.084860036541982, + "language_loss": 0.90209413, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.92087722, + "num_input_tokens_seen": 151040665, + "step": 7034, + "time_per_iteration": 2.6903390884399414 + }, + { + "auxiliary_loss_clip": 0.01107447, + "auxiliary_loss_mlp": 0.01038024, + "balance_loss_clip": 1.04703283, + "balance_loss_mlp": 1.02456498, + "epoch": 0.42296708251916426, + "flos": 26067448264320.0, + "grad_norm": 1.854470548564886, + "language_loss": 0.77645576, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.79791045, + "num_input_tokens_seen": 151061240, + "step": 7035, + "time_per_iteration": 2.839463233947754 + }, + { + "auxiliary_loss_clip": 0.01118463, + "auxiliary_loss_mlp": 0.01043438, + "balance_loss_clip": 1.04904413, + "balance_loss_mlp": 1.02879918, + "epoch": 0.4230272057718323, + "flos": 19458376496640.0, + "grad_norm": 1.8047665428966375, + "language_loss": 0.82544887, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.84706789, + "num_input_tokens_seen": 151076870, + "step": 7036, + "time_per_iteration": 2.7344322204589844 + }, + { + "auxiliary_loss_clip": 0.01105244, + "auxiliary_loss_mlp": 0.01037982, + "balance_loss_clip": 1.04819334, + "balance_loss_mlp": 1.02430892, + "epoch": 0.42308732902450025, + "flos": 22383300983040.0, + "grad_norm": 1.7884357315749977, + "language_loss": 0.70379841, + "learning_rate": 2.58654122792447e-06, + "loss": 0.72523069, + "num_input_tokens_seen": 151095110, + "step": 7037, + "time_per_iteration": 2.7701706886291504 + }, + { + "auxiliary_loss_clip": 0.01088589, + "auxiliary_loss_mlp": 0.00773432, + "balance_loss_clip": 1.04192328, + "balance_loss_mlp": 1.00089622, + "epoch": 0.4231474522771682, + "flos": 20995425331200.0, + "grad_norm": 1.6174527275157642, + "language_loss": 0.78031301, + "learning_rate": 2.586168879961155e-06, + "loss": 0.79893327, + "num_input_tokens_seen": 151114355, + "step": 7038, + "time_per_iteration": 2.7142980098724365 + }, + { + "auxiliary_loss_clip": 0.01093843, + "auxiliary_loss_mlp": 0.01045553, + "balance_loss_clip": 1.04870033, + "balance_loss_mlp": 1.02938843, + "epoch": 0.4232075755298362, + "flos": 14975863574400.0, + "grad_norm": 2.472987059089125, + "language_loss": 0.67238259, + "learning_rate": 2.585796509770259e-06, + "loss": 0.69377655, + "num_input_tokens_seen": 151131505, + "step": 7039, + "time_per_iteration": 2.723700761795044 + }, + { + "auxiliary_loss_clip": 0.01126742, + "auxiliary_loss_mlp": 0.0103978, + "balance_loss_clip": 1.04828668, + "balance_loss_mlp": 1.02421153, + "epoch": 0.42326769878250414, + "flos": 24532661986560.0, + "grad_norm": 2.3861719735257627, + "language_loss": 0.75643921, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.77810442, + "num_input_tokens_seen": 151151555, + "step": 7040, + "time_per_iteration": 2.6909239292144775 + }, + { + "auxiliary_loss_clip": 0.01120351, + "auxiliary_loss_mlp": 0.01033687, + "balance_loss_clip": 1.04682565, + "balance_loss_mlp": 1.01907206, + "epoch": 0.4233278220351721, + "flos": 26870303105280.0, + "grad_norm": 1.612614450493485, + "language_loss": 0.6520682, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.67360854, + "num_input_tokens_seen": 151172385, + "step": 7041, + "time_per_iteration": 2.705819845199585 + }, + { + "auxiliary_loss_clip": 0.01105037, + "auxiliary_loss_mlp": 0.01044866, + "balance_loss_clip": 1.04526758, + "balance_loss_mlp": 1.02961886, + "epoch": 0.4233879452878401, + "flos": 42814927463040.0, + "grad_norm": 1.8077043446733942, + "language_loss": 0.74725586, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.76875484, + "num_input_tokens_seen": 151194930, + "step": 7042, + "time_per_iteration": 2.8701279163360596 + }, + { + "auxiliary_loss_clip": 0.01118432, + "auxiliary_loss_mlp": 0.01041709, + "balance_loss_clip": 1.04900146, + "balance_loss_mlp": 1.02783322, + "epoch": 0.42344806854050804, + "flos": 25229006023680.0, + "grad_norm": 1.5999390710673906, + "language_loss": 0.82543206, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.84703344, + "num_input_tokens_seen": 151217905, + "step": 7043, + "time_per_iteration": 2.7351741790771484 + }, + { + "auxiliary_loss_clip": 0.01110906, + "auxiliary_loss_mlp": 0.01054459, + "balance_loss_clip": 1.04981089, + "balance_loss_mlp": 1.0383476, + "epoch": 0.423508191793176, + "flos": 22778820616320.0, + "grad_norm": 4.941461848597107, + "language_loss": 0.64840907, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.67006272, + "num_input_tokens_seen": 151234580, + "step": 7044, + "time_per_iteration": 2.729717969894409 + }, + { + "auxiliary_loss_clip": 0.01118394, + "auxiliary_loss_mlp": 0.01056481, + "balance_loss_clip": 1.04780793, + "balance_loss_mlp": 1.04023242, + "epoch": 0.42356831504584397, + "flos": 34637493179520.0, + "grad_norm": 4.901784512002612, + "language_loss": 0.75249708, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.77424586, + "num_input_tokens_seen": 151254765, + "step": 7045, + "time_per_iteration": 2.768423557281494 + }, + { + "auxiliary_loss_clip": 0.0109684, + "auxiliary_loss_mlp": 0.0105935, + "balance_loss_clip": 1.04820228, + "balance_loss_mlp": 1.04277968, + "epoch": 0.42362843829851193, + "flos": 17596767346560.0, + "grad_norm": 2.3365752409002027, + "language_loss": 0.80862033, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.83018219, + "num_input_tokens_seen": 151269045, + "step": 7046, + "time_per_iteration": 2.778648614883423 + }, + { + "auxiliary_loss_clip": 0.01050075, + "auxiliary_loss_mlp": 0.01043729, + "balance_loss_clip": 1.04536414, + "balance_loss_mlp": 1.02847028, + "epoch": 0.4236885615511799, + "flos": 22565691267840.0, + "grad_norm": 1.629581050390514, + "language_loss": 0.76806176, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.78899974, + "num_input_tokens_seen": 151287530, + "step": 7047, + "time_per_iteration": 2.957385301589966 + }, + { + "auxiliary_loss_clip": 0.01132762, + "auxiliary_loss_mlp": 0.01044489, + "balance_loss_clip": 1.05149937, + "balance_loss_mlp": 1.03061271, + "epoch": 0.42374868480384786, + "flos": 26469216864000.0, + "grad_norm": 2.0123660706562294, + "language_loss": 0.68135488, + "learning_rate": 2.582444180141098e-06, + "loss": 0.70312738, + "num_input_tokens_seen": 151308905, + "step": 7048, + "time_per_iteration": 2.976609468460083 + }, + { + "auxiliary_loss_clip": 0.01119986, + "auxiliary_loss_mlp": 0.0104419, + "balance_loss_clip": 1.04684722, + "balance_loss_mlp": 1.02822733, + "epoch": 0.4238088080565159, + "flos": 20370220179840.0, + "grad_norm": 1.9442365727521234, + "language_loss": 0.78292572, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.80456746, + "num_input_tokens_seen": 151326525, + "step": 7049, + "time_per_iteration": 2.7592408657073975 + }, + { + "auxiliary_loss_clip": 0.01128638, + "auxiliary_loss_mlp": 0.0105084, + "balance_loss_clip": 1.05336547, + "balance_loss_mlp": 1.03632045, + "epoch": 0.42386893130918385, + "flos": 21172105353600.0, + "grad_norm": 1.9473547987347861, + "language_loss": 0.82839847, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.85019326, + "num_input_tokens_seen": 151344675, + "step": 7050, + "time_per_iteration": 2.70487117767334 + }, + { + "auxiliary_loss_clip": 0.01132896, + "auxiliary_loss_mlp": 0.01042146, + "balance_loss_clip": 1.04812455, + "balance_loss_mlp": 1.02791238, + "epoch": 0.4239290545618518, + "flos": 17675627656320.0, + "grad_norm": 2.6140682586064754, + "language_loss": 0.73742986, + "learning_rate": 2.581326338868687e-06, + "loss": 0.75918031, + "num_input_tokens_seen": 151360730, + "step": 7051, + "time_per_iteration": 2.6406943798065186 + }, + { + "auxiliary_loss_clip": 0.01103657, + "auxiliary_loss_mlp": 0.0104179, + "balance_loss_clip": 1.05070043, + "balance_loss_mlp": 1.02773547, + "epoch": 0.4239891778145198, + "flos": 24314504734080.0, + "grad_norm": 1.6610077810318091, + "language_loss": 0.86273873, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.88419318, + "num_input_tokens_seen": 151380445, + "step": 7052, + "time_per_iteration": 2.7759416103363037 + }, + { + "auxiliary_loss_clip": 0.01106373, + "auxiliary_loss_mlp": 0.01058935, + "balance_loss_clip": 1.04475808, + "balance_loss_mlp": 1.04325902, + "epoch": 0.42404930106718774, + "flos": 20558428467840.0, + "grad_norm": 2.094212061505075, + "language_loss": 0.72460884, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.74626195, + "num_input_tokens_seen": 151399325, + "step": 7053, + "time_per_iteration": 2.6969964504241943 + }, + { + "auxiliary_loss_clip": 0.0110264, + "auxiliary_loss_mlp": 0.00773448, + "balance_loss_clip": 1.05001807, + "balance_loss_mlp": 1.00098944, + "epoch": 0.4241094243198557, + "flos": 22308067946880.0, + "grad_norm": 7.333766574531878, + "language_loss": 0.82380986, + "learning_rate": 2.580208299200704e-06, + "loss": 0.84257072, + "num_input_tokens_seen": 151417240, + "step": 7054, + "time_per_iteration": 2.71956205368042 + }, + { + "auxiliary_loss_clip": 0.01052303, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.03336191, + "balance_loss_mlp": 1.03490484, + "epoch": 0.4241695475725237, + "flos": 70612445272320.0, + "grad_norm": 0.7897337987883358, + "language_loss": 0.60378659, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.62467366, + "num_input_tokens_seen": 151476015, + "step": 7055, + "time_per_iteration": 3.155177116394043 + }, + { + "auxiliary_loss_clip": 0.01136773, + "auxiliary_loss_mlp": 0.01045155, + "balance_loss_clip": 1.05100691, + "balance_loss_mlp": 1.0298965, + "epoch": 0.42422967082519164, + "flos": 14027462824320.0, + "grad_norm": 2.6219010938669998, + "language_loss": 0.7752226, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.79704189, + "num_input_tokens_seen": 151492035, + "step": 7056, + "time_per_iteration": 2.5975699424743652 + }, + { + "auxiliary_loss_clip": 0.01129986, + "auxiliary_loss_mlp": 0.01042696, + "balance_loss_clip": 1.05187988, + "balance_loss_mlp": 1.02583957, + "epoch": 0.4242897940778596, + "flos": 22345522853760.0, + "grad_norm": 2.481094371553488, + "language_loss": 0.8406778, + "learning_rate": 2.579090061518714e-06, + "loss": 0.86240464, + "num_input_tokens_seen": 151508970, + "step": 7057, + "time_per_iteration": 2.690188407897949 + }, + { + "auxiliary_loss_clip": 0.01095967, + "auxiliary_loss_mlp": 0.01043613, + "balance_loss_clip": 1.04596114, + "balance_loss_mlp": 1.02778184, + "epoch": 0.42434991733052757, + "flos": 22595855713920.0, + "grad_norm": 2.565187046091263, + "language_loss": 0.83179426, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.85319012, + "num_input_tokens_seen": 151525295, + "step": 7058, + "time_per_iteration": 2.9978904724121094 + }, + { + "auxiliary_loss_clip": 0.01107732, + "auxiliary_loss_mlp": 0.0077171, + "balance_loss_clip": 1.04935992, + "balance_loss_mlp": 1.000875, + "epoch": 0.42441004058319554, + "flos": 20011437181440.0, + "grad_norm": 1.910708490679684, + "language_loss": 0.80493343, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.82372791, + "num_input_tokens_seen": 151544435, + "step": 7059, + "time_per_iteration": 2.7227041721343994 + }, + { + "auxiliary_loss_clip": 0.01137284, + "auxiliary_loss_mlp": 0.01041284, + "balance_loss_clip": 1.05036783, + "balance_loss_mlp": 1.02469015, + "epoch": 0.4244701638358635, + "flos": 11144985235200.0, + "grad_norm": 2.371195034517477, + "language_loss": 0.70500332, + "learning_rate": 2.57797162620435e-06, + "loss": 0.726789, + "num_input_tokens_seen": 151559520, + "step": 7060, + "time_per_iteration": 2.6058552265167236 + }, + { + "auxiliary_loss_clip": 0.01128623, + "auxiliary_loss_mlp": 0.01038609, + "balance_loss_clip": 1.05295658, + "balance_loss_mlp": 1.02370787, + "epoch": 0.42453028708853147, + "flos": 23987753688960.0, + "grad_norm": 1.575928079295092, + "language_loss": 0.7634182, + "learning_rate": 2.577598770580562e-06, + "loss": 0.78509057, + "num_input_tokens_seen": 151579790, + "step": 7061, + "time_per_iteration": 2.6592459678649902 + }, + { + "auxiliary_loss_clip": 0.01127164, + "auxiliary_loss_mlp": 0.01039243, + "balance_loss_clip": 1.05133295, + "balance_loss_mlp": 1.02308464, + "epoch": 0.42459041034119943, + "flos": 18406338030720.0, + "grad_norm": 2.3470563522902195, + "language_loss": 0.73278493, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.75444901, + "num_input_tokens_seen": 151598285, + "step": 7062, + "time_per_iteration": 2.5925838947296143 + }, + { + "auxiliary_loss_clip": 0.01110528, + "auxiliary_loss_mlp": 0.01044189, + "balance_loss_clip": 1.05038309, + "balance_loss_mlp": 1.02934098, + "epoch": 0.42465053359386745, + "flos": 20958006337920.0, + "grad_norm": 1.735369540351847, + "language_loss": 0.66238403, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.68393123, + "num_input_tokens_seen": 151615430, + "step": 7063, + "time_per_iteration": 2.618459939956665 + }, + { + "auxiliary_loss_clip": 0.0109746, + "auxiliary_loss_mlp": 0.00773106, + "balance_loss_clip": 1.04320812, + "balance_loss_mlp": 1.0009284, + "epoch": 0.4247106568465354, + "flos": 33106190520960.0, + "grad_norm": 1.673900676033667, + "language_loss": 0.78570068, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.80440634, + "num_input_tokens_seen": 151637030, + "step": 7064, + "time_per_iteration": 2.7396399974823 + }, + { + "auxiliary_loss_clip": 0.0113726, + "auxiliary_loss_mlp": 0.01038466, + "balance_loss_clip": 1.05053115, + "balance_loss_mlp": 1.02317119, + "epoch": 0.4247707800992034, + "flos": 20046916840320.0, + "grad_norm": 1.9847642008914126, + "language_loss": 0.75471151, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.77646875, + "num_input_tokens_seen": 151655745, + "step": 7065, + "time_per_iteration": 2.532046318054199 + }, + { + "auxiliary_loss_clip": 0.01124888, + "auxiliary_loss_mlp": 0.01038463, + "balance_loss_clip": 1.05094182, + "balance_loss_mlp": 1.02257848, + "epoch": 0.42483090335187135, + "flos": 22385132576640.0, + "grad_norm": 1.3355357629490912, + "language_loss": 0.72402596, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.74565947, + "num_input_tokens_seen": 151678040, + "step": 7066, + "time_per_iteration": 2.5829319953918457 + }, + { + "auxiliary_loss_clip": 0.01101493, + "auxiliary_loss_mlp": 0.01036883, + "balance_loss_clip": 1.04836977, + "balance_loss_mlp": 1.02044368, + "epoch": 0.4248910266045393, + "flos": 21356830022400.0, + "grad_norm": 2.4907013500628166, + "language_loss": 0.80009657, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.82148039, + "num_input_tokens_seen": 151696410, + "step": 7067, + "time_per_iteration": 2.6051836013793945 + }, + { + "auxiliary_loss_clip": 0.01053553, + "auxiliary_loss_mlp": 0.01005501, + "balance_loss_clip": 1.02524805, + "balance_loss_mlp": 1.00387979, + "epoch": 0.4249511498572073, + "flos": 64008114099840.0, + "grad_norm": 0.9135939410418532, + "language_loss": 0.6341064, + "learning_rate": 2.574988168733022e-06, + "loss": 0.65469694, + "num_input_tokens_seen": 151756365, + "step": 7068, + "time_per_iteration": 4.699309825897217 + }, + { + "auxiliary_loss_clip": 0.0113454, + "auxiliary_loss_mlp": 0.01036767, + "balance_loss_clip": 1.04911804, + "balance_loss_mlp": 1.02070904, + "epoch": 0.42501127310987524, + "flos": 19607046888960.0, + "grad_norm": 1.9072894618048717, + "language_loss": 0.72502887, + "learning_rate": 2.574615138284361e-06, + "loss": 0.74674189, + "num_input_tokens_seen": 151775165, + "step": 7069, + "time_per_iteration": 5.814046382904053 + }, + { + "auxiliary_loss_clip": 0.01136556, + "auxiliary_loss_mlp": 0.01039486, + "balance_loss_clip": 1.05074239, + "balance_loss_mlp": 1.02286839, + "epoch": 0.4250713963625432, + "flos": 19462326992640.0, + "grad_norm": 2.348420544652142, + "language_loss": 0.79105788, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.81281829, + "num_input_tokens_seen": 151792620, + "step": 7070, + "time_per_iteration": 2.6242294311523438 + }, + { + "auxiliary_loss_clip": 0.0112233, + "auxiliary_loss_mlp": 0.01033288, + "balance_loss_clip": 1.04764843, + "balance_loss_mlp": 1.01816082, + "epoch": 0.4251315196152112, + "flos": 25337707557120.0, + "grad_norm": 1.7541837021075046, + "language_loss": 0.70184052, + "learning_rate": 2.573869012032795e-06, + "loss": 0.72339666, + "num_input_tokens_seen": 151812850, + "step": 7071, + "time_per_iteration": 2.6695022583007812 + }, + { + "auxiliary_loss_clip": 0.01134965, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.05002129, + "balance_loss_mlp": 1.0191201, + "epoch": 0.42519164286787914, + "flos": 26359186527360.0, + "grad_norm": 2.353956848857114, + "language_loss": 0.71210682, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.73379803, + "num_input_tokens_seen": 151831785, + "step": 7072, + "time_per_iteration": 2.654045581817627 + }, + { + "auxiliary_loss_clip": 0.01090703, + "auxiliary_loss_mlp": 0.01042672, + "balance_loss_clip": 1.04456139, + "balance_loss_mlp": 1.02779484, + "epoch": 0.4252517661205471, + "flos": 26031070765440.0, + "grad_norm": 1.5509538260814284, + "language_loss": 0.81704801, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.83838177, + "num_input_tokens_seen": 151853885, + "step": 7073, + "time_per_iteration": 4.4267754554748535 + }, + { + "auxiliary_loss_clip": 0.01117821, + "auxiliary_loss_mlp": 0.01035489, + "balance_loss_clip": 1.04660416, + "balance_loss_mlp": 1.02130294, + "epoch": 0.42531188937321507, + "flos": 12713635059840.0, + "grad_norm": 2.6569023186466914, + "language_loss": 0.91360795, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.93514109, + "num_input_tokens_seen": 151871780, + "step": 7074, + "time_per_iteration": 2.655850887298584 + }, + { + "auxiliary_loss_clip": 0.01128859, + "auxiliary_loss_mlp": 0.00774468, + "balance_loss_clip": 1.05061221, + "balance_loss_mlp": 1.0009917, + "epoch": 0.42537201262588303, + "flos": 22091670460800.0, + "grad_norm": 1.6066127617392931, + "language_loss": 0.64610291, + "learning_rate": 2.572376498508805e-06, + "loss": 0.66513622, + "num_input_tokens_seen": 151891600, + "step": 7075, + "time_per_iteration": 2.7072041034698486 + }, + { + "auxiliary_loss_clip": 0.01097292, + "auxiliary_loss_mlp": 0.01030165, + "balance_loss_clip": 1.04872322, + "balance_loss_mlp": 1.01664686, + "epoch": 0.42543213587855105, + "flos": 23003119094400.0, + "grad_norm": 1.6801281915446873, + "language_loss": 0.736256, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.75753057, + "num_input_tokens_seen": 151911330, + "step": 7076, + "time_per_iteration": 2.7376084327697754 + }, + { + "auxiliary_loss_clip": 0.01107519, + "auxiliary_loss_mlp": 0.01042827, + "balance_loss_clip": 1.0442965, + "balance_loss_mlp": 1.02684128, + "epoch": 0.425492259131219, + "flos": 25082454533760.0, + "grad_norm": 2.293658429237098, + "language_loss": 0.78658164, + "learning_rate": 2.571630111462766e-06, + "loss": 0.80808508, + "num_input_tokens_seen": 151930355, + "step": 7077, + "time_per_iteration": 2.9069621562957764 + }, + { + "auxiliary_loss_clip": 0.01105315, + "auxiliary_loss_mlp": 0.01032074, + "balance_loss_clip": 1.04497409, + "balance_loss_mlp": 1.01881242, + "epoch": 0.425552382383887, + "flos": 22816850140800.0, + "grad_norm": 1.6369769525688158, + "language_loss": 0.73094088, + "learning_rate": 2.571256885418265e-06, + "loss": 0.75231481, + "num_input_tokens_seen": 151949695, + "step": 7078, + "time_per_iteration": 2.728288173675537 + }, + { + "auxiliary_loss_clip": 0.01104463, + "auxiliary_loss_mlp": 0.01040077, + "balance_loss_clip": 1.04849982, + "balance_loss_mlp": 1.02651131, + "epoch": 0.42561250563655495, + "flos": 13553585671680.0, + "grad_norm": 1.8849915988224846, + "language_loss": 0.79555357, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.81699896, + "num_input_tokens_seen": 151967640, + "step": 7079, + "time_per_iteration": 2.6294121742248535 + }, + { + "auxiliary_loss_clip": 0.01125077, + "auxiliary_loss_mlp": 0.01035166, + "balance_loss_clip": 1.05348229, + "balance_loss_mlp": 1.02171898, + "epoch": 0.4256726288892229, + "flos": 46978303023360.0, + "grad_norm": 1.3719098160070018, + "language_loss": 0.71853465, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.7401371, + "num_input_tokens_seen": 151994020, + "step": 7080, + "time_per_iteration": 2.8506548404693604 + }, + { + "auxiliary_loss_clip": 0.01130776, + "auxiliary_loss_mlp": 0.01033872, + "balance_loss_clip": 1.04765022, + "balance_loss_mlp": 1.02025867, + "epoch": 0.4257327521418909, + "flos": 23586451966080.0, + "grad_norm": 2.0309872529354283, + "language_loss": 0.80102706, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.82267356, + "num_input_tokens_seen": 152013415, + "step": 7081, + "time_per_iteration": 2.698814868927002 + }, + { + "auxiliary_loss_clip": 0.01100197, + "auxiliary_loss_mlp": 0.01034532, + "balance_loss_clip": 1.04303122, + "balance_loss_mlp": 1.02065063, + "epoch": 0.42579287539455885, + "flos": 18989994124800.0, + "grad_norm": 1.6770375884870488, + "language_loss": 0.81524366, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.83659089, + "num_input_tokens_seen": 152030860, + "step": 7082, + "time_per_iteration": 2.6388967037200928 + }, + { + "auxiliary_loss_clip": 0.01122609, + "auxiliary_loss_mlp": 0.01038264, + "balance_loss_clip": 1.05003822, + "balance_loss_mlp": 1.02411938, + "epoch": 0.4258529986472268, + "flos": 25191910252800.0, + "grad_norm": 2.777460036178925, + "language_loss": 0.70476681, + "learning_rate": 2.569390430547065e-06, + "loss": 0.72637558, + "num_input_tokens_seen": 152050395, + "step": 7083, + "time_per_iteration": 2.666609048843384 + }, + { + "auxiliary_loss_clip": 0.01045638, + "auxiliary_loss_mlp": 0.0101356, + "balance_loss_clip": 1.02604496, + "balance_loss_mlp": 1.01191545, + "epoch": 0.4259131218998948, + "flos": 69968280718080.0, + "grad_norm": 0.8664420799088798, + "language_loss": 0.6701948, + "learning_rate": 2.569017074742173e-06, + "loss": 0.69078678, + "num_input_tokens_seen": 152113555, + "step": 7084, + "time_per_iteration": 3.25407075881958 + }, + { + "auxiliary_loss_clip": 0.01120239, + "auxiliary_loss_mlp": 0.01042728, + "balance_loss_clip": 1.04841447, + "balance_loss_mlp": 1.02757668, + "epoch": 0.42597324515256274, + "flos": 18004964480640.0, + "grad_norm": 2.05020327260517, + "language_loss": 0.78917986, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.81080949, + "num_input_tokens_seen": 152131575, + "step": 7085, + "time_per_iteration": 2.6294076442718506 + }, + { + "auxiliary_loss_clip": 0.01123765, + "auxiliary_loss_mlp": 0.01045859, + "balance_loss_clip": 1.05045295, + "balance_loss_mlp": 1.03036761, + "epoch": 0.4260333684052307, + "flos": 15158792563200.0, + "grad_norm": 2.015450242409387, + "language_loss": 0.76097858, + "learning_rate": 2.568270298414995e-06, + "loss": 0.78267479, + "num_input_tokens_seen": 152149435, + "step": 7086, + "time_per_iteration": 2.606201648712158 + }, + { + "auxiliary_loss_clip": 0.01107732, + "auxiliary_loss_mlp": 0.01040875, + "balance_loss_clip": 1.04528451, + "balance_loss_mlp": 1.02682662, + "epoch": 0.42609349165789867, + "flos": 14939342421120.0, + "grad_norm": 4.435400492712099, + "language_loss": 0.80159658, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.82308263, + "num_input_tokens_seen": 152166860, + "step": 7087, + "time_per_iteration": 2.6517395973205566 + }, + { + "auxiliary_loss_clip": 0.01113938, + "auxiliary_loss_mlp": 0.0103375, + "balance_loss_clip": 1.04980528, + "balance_loss_mlp": 1.01878285, + "epoch": 0.42615361491056664, + "flos": 23731961961600.0, + "grad_norm": 1.6700745034234148, + "language_loss": 0.65982199, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.68129885, + "num_input_tokens_seen": 152187475, + "step": 7088, + "time_per_iteration": 2.6658740043640137 + }, + { + "auxiliary_loss_clip": 0.01079891, + "auxiliary_loss_mlp": 0.01038449, + "balance_loss_clip": 1.04348373, + "balance_loss_mlp": 1.02308249, + "epoch": 0.42621373816323466, + "flos": 24936441747840.0, + "grad_norm": 2.4696048983575376, + "language_loss": 0.68491185, + "learning_rate": 2.56714997234313e-06, + "loss": 0.70609522, + "num_input_tokens_seen": 152207235, + "step": 7089, + "time_per_iteration": 2.816352128982544 + }, + { + "auxiliary_loss_clip": 0.01083453, + "auxiliary_loss_mlp": 0.01038038, + "balance_loss_clip": 1.04270887, + "balance_loss_mlp": 1.02359009, + "epoch": 0.4262738614159026, + "flos": 13552975140480.0, + "grad_norm": 2.0671888191777623, + "language_loss": 0.73030579, + "learning_rate": 2.566776487287525e-06, + "loss": 0.75152063, + "num_input_tokens_seen": 152224240, + "step": 7090, + "time_per_iteration": 2.801116704940796 + }, + { + "auxiliary_loss_clip": 0.01114766, + "auxiliary_loss_mlp": 0.0104358, + "balance_loss_clip": 1.0483079, + "balance_loss_mlp": 1.02875018, + "epoch": 0.4263339846685706, + "flos": 29748794284800.0, + "grad_norm": 1.7852421559677654, + "language_loss": 0.75632602, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.77790952, + "num_input_tokens_seen": 152242595, + "step": 7091, + "time_per_iteration": 2.779731273651123 + }, + { + "auxiliary_loss_clip": 0.01081578, + "auxiliary_loss_mlp": 0.01031582, + "balance_loss_clip": 1.04725623, + "balance_loss_mlp": 1.01879716, + "epoch": 0.42639410792123855, + "flos": 16834204586880.0, + "grad_norm": 2.1009795194853567, + "language_loss": 0.82635152, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.84748316, + "num_input_tokens_seen": 152260840, + "step": 7092, + "time_per_iteration": 2.7296979427337646 + }, + { + "auxiliary_loss_clip": 0.01113469, + "auxiliary_loss_mlp": 0.0104261, + "balance_loss_clip": 1.04653692, + "balance_loss_mlp": 1.02812648, + "epoch": 0.4264542311739065, + "flos": 28763118195840.0, + "grad_norm": 1.6936837646094385, + "language_loss": 0.73936713, + "learning_rate": 2.565655903224038e-06, + "loss": 0.76092792, + "num_input_tokens_seen": 152280580, + "step": 7093, + "time_per_iteration": 2.738494634628296 + }, + { + "auxiliary_loss_clip": 0.01124772, + "auxiliary_loss_mlp": 0.01037897, + "balance_loss_clip": 1.05013132, + "balance_loss_mlp": 1.02285314, + "epoch": 0.4265143544265745, + "flos": 24713615727360.0, + "grad_norm": 2.248863473367437, + "language_loss": 0.69831914, + "learning_rate": 2.565282332284532e-06, + "loss": 0.71994585, + "num_input_tokens_seen": 152298455, + "step": 7094, + "time_per_iteration": 2.696377754211426 + }, + { + "auxiliary_loss_clip": 0.01102522, + "auxiliary_loss_mlp": 0.01035266, + "balance_loss_clip": 1.05082488, + "balance_loss_mlp": 1.02069819, + "epoch": 0.42657447767924245, + "flos": 21865971352320.0, + "grad_norm": 1.593904094988334, + "language_loss": 0.8160966, + "learning_rate": 2.564908739909464e-06, + "loss": 0.83747452, + "num_input_tokens_seen": 152316995, + "step": 7095, + "time_per_iteration": 2.7906196117401123 + }, + { + "auxiliary_loss_clip": 0.01135526, + "auxiliary_loss_mlp": 0.01039866, + "balance_loss_clip": 1.05080557, + "balance_loss_mlp": 1.02575183, + "epoch": 0.4266346009319104, + "flos": 21470236237440.0, + "grad_norm": 1.8045956329002426, + "language_loss": 0.80642307, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.82817698, + "num_input_tokens_seen": 152334800, + "step": 7096, + "time_per_iteration": 2.7473361492156982 + }, + { + "auxiliary_loss_clip": 0.01130201, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.05325663, + "balance_loss_mlp": 1.0182128, + "epoch": 0.4266947241845784, + "flos": 25519379569920.0, + "grad_norm": 2.602963129491376, + "language_loss": 0.64982784, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.67145991, + "num_input_tokens_seen": 152355175, + "step": 7097, + "time_per_iteration": 2.683868408203125 + }, + { + "auxiliary_loss_clip": 0.01103674, + "auxiliary_loss_mlp": 0.01032153, + "balance_loss_clip": 1.04987097, + "balance_loss_mlp": 1.01799679, + "epoch": 0.42675484743724634, + "flos": 26541217676160.0, + "grad_norm": 1.7913732947115202, + "language_loss": 0.74682045, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.76817876, + "num_input_tokens_seen": 152377245, + "step": 7098, + "time_per_iteration": 2.7669501304626465 + }, + { + "auxiliary_loss_clip": 0.0112361, + "auxiliary_loss_mlp": 0.01029914, + "balance_loss_clip": 1.05006361, + "balance_loss_mlp": 1.0165925, + "epoch": 0.4268149706899143, + "flos": 23112718467840.0, + "grad_norm": 1.7242280164199693, + "language_loss": 0.75574845, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.77728367, + "num_input_tokens_seen": 152396985, + "step": 7099, + "time_per_iteration": 2.652024507522583 + }, + { + "auxiliary_loss_clip": 0.01113615, + "auxiliary_loss_mlp": 0.01044502, + "balance_loss_clip": 1.04767907, + "balance_loss_mlp": 1.02964246, + "epoch": 0.4268750939425823, + "flos": 22706532495360.0, + "grad_norm": 2.4499059435945956, + "language_loss": 0.82854998, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.85013109, + "num_input_tokens_seen": 152415590, + "step": 7100, + "time_per_iteration": 2.66955304145813 + }, + { + "auxiliary_loss_clip": 0.01114994, + "auxiliary_loss_mlp": 0.01038973, + "balance_loss_clip": 1.05028403, + "balance_loss_mlp": 1.0246973, + "epoch": 0.42693521719525024, + "flos": 25374875155200.0, + "grad_norm": 1.3265740257801202, + "language_loss": 0.81932402, + "learning_rate": 2.562666736305627e-06, + "loss": 0.8408637, + "num_input_tokens_seen": 152436735, + "step": 7101, + "time_per_iteration": 2.734703540802002 + }, + { + "auxiliary_loss_clip": 0.01139197, + "auxiliary_loss_mlp": 0.01033271, + "balance_loss_clip": 1.0521878, + "balance_loss_mlp": 1.01856041, + "epoch": 0.42699534044791826, + "flos": 18150689957760.0, + "grad_norm": 6.39201802797086, + "language_loss": 0.72548246, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.74720716, + "num_input_tokens_seen": 152455685, + "step": 7102, + "time_per_iteration": 2.6193687915802 + }, + { + "auxiliary_loss_clip": 0.01123058, + "auxiliary_loss_mlp": 0.01031478, + "balance_loss_clip": 1.05015755, + "balance_loss_mlp": 1.01770973, + "epoch": 0.4270554637005862, + "flos": 13698413308800.0, + "grad_norm": 2.0187490499372243, + "language_loss": 0.83425319, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.8557986, + "num_input_tokens_seen": 152473500, + "step": 7103, + "time_per_iteration": 2.6151843070983887 + }, + { + "auxiliary_loss_clip": 0.01108466, + "auxiliary_loss_mlp": 0.01042825, + "balance_loss_clip": 1.04559612, + "balance_loss_mlp": 1.02617157, + "epoch": 0.4271155869532542, + "flos": 17493596507520.0, + "grad_norm": 4.588723714988328, + "language_loss": 0.74312592, + "learning_rate": 2.561545446271294e-06, + "loss": 0.76463884, + "num_input_tokens_seen": 152491320, + "step": 7104, + "time_per_iteration": 2.686087131500244 + }, + { + "auxiliary_loss_clip": 0.01118632, + "auxiliary_loss_mlp": 0.01030826, + "balance_loss_clip": 1.04769945, + "balance_loss_mlp": 1.01652098, + "epoch": 0.42717571020592215, + "flos": 32452293381120.0, + "grad_norm": 3.9751824788265226, + "language_loss": 0.7515536, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.77304816, + "num_input_tokens_seen": 152511970, + "step": 7105, + "time_per_iteration": 2.69466495513916 + }, + { + "auxiliary_loss_clip": 0.01138696, + "auxiliary_loss_mlp": 0.01032922, + "balance_loss_clip": 1.05365109, + "balance_loss_mlp": 1.01949859, + "epoch": 0.4272358334585901, + "flos": 16253062444800.0, + "grad_norm": 1.828100931914864, + "language_loss": 0.77001148, + "learning_rate": 2.560797813088819e-06, + "loss": 0.79172766, + "num_input_tokens_seen": 152530515, + "step": 7106, + "time_per_iteration": 2.7470526695251465 + }, + { + "auxiliary_loss_clip": 0.01113386, + "auxiliary_loss_mlp": 0.01032071, + "balance_loss_clip": 1.05155849, + "balance_loss_mlp": 1.01898193, + "epoch": 0.4272959567112581, + "flos": 24200092938240.0, + "grad_norm": 2.105539726439896, + "language_loss": 0.79606462, + "learning_rate": 2.560423964592229e-06, + "loss": 0.81751919, + "num_input_tokens_seen": 152549295, + "step": 7107, + "time_per_iteration": 4.302187919616699 + }, + { + "auxiliary_loss_clip": 0.01084956, + "auxiliary_loss_mlp": 0.01035225, + "balance_loss_clip": 1.04738021, + "balance_loss_mlp": 1.02138472, + "epoch": 0.42735607996392605, + "flos": 27963495578880.0, + "grad_norm": 1.6344878343023064, + "language_loss": 0.67924458, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.70044637, + "num_input_tokens_seen": 152570725, + "step": 7108, + "time_per_iteration": 6.044403314590454 + }, + { + "auxiliary_loss_clip": 0.01110243, + "auxiliary_loss_mlp": 0.01038292, + "balance_loss_clip": 1.05136764, + "balance_loss_mlp": 1.02539325, + "epoch": 0.427416203216594, + "flos": 20295597674880.0, + "grad_norm": 1.7692691179194058, + "language_loss": 0.71223509, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.73372042, + "num_input_tokens_seen": 152588950, + "step": 7109, + "time_per_iteration": 2.6695122718811035 + }, + { + "auxiliary_loss_clip": 0.01120979, + "auxiliary_loss_mlp": 0.01033902, + "balance_loss_clip": 1.048154, + "balance_loss_mlp": 1.01738, + "epoch": 0.427476326469262, + "flos": 26943955943040.0, + "grad_norm": 2.0357298685431595, + "language_loss": 0.64665484, + "learning_rate": 2.559302291651174e-06, + "loss": 0.66820359, + "num_input_tokens_seen": 152608965, + "step": 7110, + "time_per_iteration": 2.6609907150268555 + }, + { + "auxiliary_loss_clip": 0.01132801, + "auxiliary_loss_mlp": 0.00771481, + "balance_loss_clip": 1.04796886, + "balance_loss_mlp": 1.00075054, + "epoch": 0.42753644972192995, + "flos": 25702847262720.0, + "grad_norm": 6.311104463147988, + "language_loss": 0.76556361, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.7846064, + "num_input_tokens_seen": 152630220, + "step": 7111, + "time_per_iteration": 2.704688310623169 + }, + { + "auxiliary_loss_clip": 0.01111143, + "auxiliary_loss_mlp": 0.01033331, + "balance_loss_clip": 1.05706656, + "balance_loss_mlp": 1.01936615, + "epoch": 0.4275965729745979, + "flos": 18767419499520.0, + "grad_norm": 2.0174435424847084, + "language_loss": 0.72800988, + "learning_rate": 2.558554403622845e-06, + "loss": 0.74945462, + "num_input_tokens_seen": 152648835, + "step": 7112, + "time_per_iteration": 4.39399790763855 + }, + { + "auxiliary_loss_clip": 0.01107213, + "auxiliary_loss_mlp": 0.01037359, + "balance_loss_clip": 1.04838848, + "balance_loss_mlp": 1.02366805, + "epoch": 0.4276566962272659, + "flos": 23764424878080.0, + "grad_norm": 1.714295461522007, + "language_loss": 0.71427524, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.73572093, + "num_input_tokens_seen": 152668375, + "step": 7113, + "time_per_iteration": 2.6834428310394287 + }, + { + "auxiliary_loss_clip": 0.01126637, + "auxiliary_loss_mlp": 0.01040655, + "balance_loss_clip": 1.05207372, + "balance_loss_mlp": 1.02700508, + "epoch": 0.42771681947993384, + "flos": 22492505306880.0, + "grad_norm": 1.6108261365545002, + "language_loss": 0.61758566, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.63925862, + "num_input_tokens_seen": 152689725, + "step": 7114, + "time_per_iteration": 2.7341814041137695 + }, + { + "auxiliary_loss_clip": 0.01131369, + "auxiliary_loss_mlp": 0.01042209, + "balance_loss_clip": 1.05489218, + "balance_loss_mlp": 1.02556777, + "epoch": 0.42777694273260186, + "flos": 25044712318080.0, + "grad_norm": 1.6215320240925026, + "language_loss": 0.649822, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.67155778, + "num_input_tokens_seen": 152709375, + "step": 7115, + "time_per_iteration": 2.6360361576080322 + }, + { + "auxiliary_loss_clip": 0.01110467, + "auxiliary_loss_mlp": 0.01037107, + "balance_loss_clip": 1.04954565, + "balance_loss_mlp": 1.02359438, + "epoch": 0.4278370659852698, + "flos": 18661519226880.0, + "grad_norm": 1.8869093124336491, + "language_loss": 0.74057275, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.76204848, + "num_input_tokens_seen": 152727510, + "step": 7116, + "time_per_iteration": 2.701413869857788 + }, + { + "auxiliary_loss_clip": 0.01105537, + "auxiliary_loss_mlp": 0.01041231, + "balance_loss_clip": 1.04539752, + "balance_loss_mlp": 1.02783155, + "epoch": 0.4278971892379378, + "flos": 27308269635840.0, + "grad_norm": 1.8577367375008744, + "language_loss": 0.69426787, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.71573555, + "num_input_tokens_seen": 152746670, + "step": 7117, + "time_per_iteration": 2.740729570388794 + }, + { + "auxiliary_loss_clip": 0.01110879, + "auxiliary_loss_mlp": 0.0103835, + "balance_loss_clip": 1.05176735, + "balance_loss_mlp": 1.02402163, + "epoch": 0.42795731249060576, + "flos": 12888698970240.0, + "grad_norm": 2.8863290375892148, + "language_loss": 0.69564569, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.71713799, + "num_input_tokens_seen": 152760545, + "step": 7118, + "time_per_iteration": 2.7086899280548096 + }, + { + "auxiliary_loss_clip": 0.01092131, + "auxiliary_loss_mlp": 0.0104544, + "balance_loss_clip": 1.04521, + "balance_loss_mlp": 1.03076482, + "epoch": 0.4280174357432737, + "flos": 33401448316800.0, + "grad_norm": 2.453050871280299, + "language_loss": 0.74826419, + "learning_rate": 2.55593612908444e-06, + "loss": 0.76963991, + "num_input_tokens_seen": 152780970, + "step": 7119, + "time_per_iteration": 2.805619239807129 + }, + { + "auxiliary_loss_clip": 0.01069167, + "auxiliary_loss_mlp": 0.01038035, + "balance_loss_clip": 1.0436008, + "balance_loss_mlp": 1.02377188, + "epoch": 0.4280775589959417, + "flos": 18259104182400.0, + "grad_norm": 1.842272720773601, + "language_loss": 0.75238574, + "learning_rate": 2.555562005426573e-06, + "loss": 0.77345783, + "num_input_tokens_seen": 152798475, + "step": 7120, + "time_per_iteration": 2.8678669929504395 + }, + { + "auxiliary_loss_clip": 0.01112615, + "auxiliary_loss_mlp": 0.00770364, + "balance_loss_clip": 1.05290043, + "balance_loss_mlp": 1.00063229, + "epoch": 0.42813768224860965, + "flos": 21471277731840.0, + "grad_norm": 1.7037705311845839, + "language_loss": 0.76884449, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.78767425, + "num_input_tokens_seen": 152817555, + "step": 7121, + "time_per_iteration": 2.776524305343628 + }, + { + "auxiliary_loss_clip": 0.01114442, + "auxiliary_loss_mlp": 0.01034456, + "balance_loss_clip": 1.05325198, + "balance_loss_mlp": 1.02162266, + "epoch": 0.4281978055012776, + "flos": 15669262696320.0, + "grad_norm": 1.9187062544957278, + "language_loss": 0.85698652, + "learning_rate": 2.554813694924126e-06, + "loss": 0.87847555, + "num_input_tokens_seen": 152836295, + "step": 7122, + "time_per_iteration": 2.7109732627868652 + }, + { + "auxiliary_loss_clip": 0.01083707, + "auxiliary_loss_mlp": 0.01035803, + "balance_loss_clip": 1.04868889, + "balance_loss_mlp": 1.02191544, + "epoch": 0.4282579287539456, + "flos": 17712005155200.0, + "grad_norm": 2.4146794334180632, + "language_loss": 0.81251013, + "learning_rate": 2.554439508107921e-06, + "loss": 0.83370531, + "num_input_tokens_seen": 152854950, + "step": 7123, + "time_per_iteration": 2.7866828441619873 + }, + { + "auxiliary_loss_clip": 0.01090954, + "auxiliary_loss_mlp": 0.01034043, + "balance_loss_clip": 1.04922438, + "balance_loss_mlp": 1.02011371, + "epoch": 0.42831805200661355, + "flos": 19281157770240.0, + "grad_norm": 1.7481094896376608, + "language_loss": 0.81089389, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.8321439, + "num_input_tokens_seen": 152873995, + "step": 7124, + "time_per_iteration": 2.733530044555664 + }, + { + "auxiliary_loss_clip": 0.01125145, + "auxiliary_loss_mlp": 0.01037816, + "balance_loss_clip": 1.05205929, + "balance_loss_mlp": 1.02334404, + "epoch": 0.4283781752592815, + "flos": 19792633484160.0, + "grad_norm": 1.8145132685178345, + "language_loss": 0.80230892, + "learning_rate": 2.553691071416498e-06, + "loss": 0.82393849, + "num_input_tokens_seen": 152892925, + "step": 7125, + "time_per_iteration": 2.635104179382324 + }, + { + "auxiliary_loss_clip": 0.01132021, + "auxiliary_loss_mlp": 0.0076966, + "balance_loss_clip": 1.05282855, + "balance_loss_mlp": 1.00061083, + "epoch": 0.4284382985119495, + "flos": 16508064072960.0, + "grad_norm": 1.8752935538071442, + "language_loss": 0.74911773, + "learning_rate": 2.553316821569659e-06, + "loss": 0.76813453, + "num_input_tokens_seen": 152910935, + "step": 7126, + "time_per_iteration": 2.605344772338867 + }, + { + "auxiliary_loss_clip": 0.01124108, + "auxiliary_loss_mlp": 0.01031402, + "balance_loss_clip": 1.05336213, + "balance_loss_mlp": 1.01742435, + "epoch": 0.42849842176461744, + "flos": 23330767979520.0, + "grad_norm": 4.135943969267594, + "language_loss": 0.80782413, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.82937926, + "num_input_tokens_seen": 152931030, + "step": 7127, + "time_per_iteration": 2.662910223007202 + }, + { + "auxiliary_loss_clip": 0.01088729, + "auxiliary_loss_mlp": 0.0104147, + "balance_loss_clip": 1.04972112, + "balance_loss_mlp": 1.02753484, + "epoch": 0.4285585450172854, + "flos": 17274433674240.0, + "grad_norm": 2.1393882563291773, + "language_loss": 0.76243544, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.78373742, + "num_input_tokens_seen": 152948085, + "step": 7128, + "time_per_iteration": 2.7230868339538574 + }, + { + "auxiliary_loss_clip": 0.01089264, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.04796708, + "balance_loss_mlp": 1.02163041, + "epoch": 0.42861866826995343, + "flos": 24279599692800.0, + "grad_norm": 1.945213992632333, + "language_loss": 0.74079603, + "learning_rate": 2.552193946194937e-06, + "loss": 0.76204789, + "num_input_tokens_seen": 152966265, + "step": 7129, + "time_per_iteration": 2.775891065597534 + }, + { + "auxiliary_loss_clip": 0.01127944, + "auxiliary_loss_mlp": 0.00770117, + "balance_loss_clip": 1.05684757, + "balance_loss_mlp": 1.0005461, + "epoch": 0.4286787915226214, + "flos": 24353108876160.0, + "grad_norm": 1.5710338967277158, + "language_loss": 0.77974319, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.79872382, + "num_input_tokens_seen": 152986775, + "step": 7130, + "time_per_iteration": 2.6977498531341553 + }, + { + "auxiliary_loss_clip": 0.01119463, + "auxiliary_loss_mlp": 0.01035523, + "balance_loss_clip": 1.05768883, + "balance_loss_mlp": 1.02184367, + "epoch": 0.42873891477528936, + "flos": 15449992122240.0, + "grad_norm": 2.320631391566952, + "language_loss": 0.73168224, + "learning_rate": 2.551445257891886e-06, + "loss": 0.75323212, + "num_input_tokens_seen": 153003595, + "step": 7131, + "time_per_iteration": 2.6973114013671875 + }, + { + "auxiliary_loss_clip": 0.01116554, + "auxiliary_loss_mlp": 0.0103667, + "balance_loss_clip": 1.05293584, + "balance_loss_mlp": 1.02260923, + "epoch": 0.4287990380279573, + "flos": 17639573379840.0, + "grad_norm": 5.223518802520722, + "language_loss": 0.77257997, + "learning_rate": 2.551070882366973e-06, + "loss": 0.79411221, + "num_input_tokens_seen": 153021960, + "step": 7132, + "time_per_iteration": 2.644556999206543 + }, + { + "auxiliary_loss_clip": 0.01097397, + "auxiliary_loss_mlp": 0.00771143, + "balance_loss_clip": 1.05195022, + "balance_loss_mlp": 1.00064743, + "epoch": 0.4288591612806253, + "flos": 27162328677120.0, + "grad_norm": 2.003525879431933, + "language_loss": 0.78719372, + "learning_rate": 2.550696485945397e-06, + "loss": 0.80587912, + "num_input_tokens_seen": 153042110, + "step": 7133, + "time_per_iteration": 2.7668325901031494 + }, + { + "auxiliary_loss_clip": 0.01111172, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_clip": 1.05302238, + "balance_loss_mlp": 1.02091813, + "epoch": 0.42891928453329325, + "flos": 17163182275200.0, + "grad_norm": 1.850568768068126, + "language_loss": 0.7449469, + "learning_rate": 2.550322068641355e-06, + "loss": 0.76639962, + "num_input_tokens_seen": 153058925, + "step": 7134, + "time_per_iteration": 2.714893341064453 + }, + { + "auxiliary_loss_clip": 0.01112422, + "auxiliary_loss_mlp": 0.01035997, + "balance_loss_clip": 1.04541016, + "balance_loss_mlp": 1.02241349, + "epoch": 0.4289794077859612, + "flos": 18187031543040.0, + "grad_norm": 1.9214467858451951, + "language_loss": 0.84098607, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.86247027, + "num_input_tokens_seen": 153078070, + "step": 7135, + "time_per_iteration": 2.646799325942993 + }, + { + "auxiliary_loss_clip": 0.01060089, + "auxiliary_loss_mlp": 0.01040969, + "balance_loss_clip": 1.04197621, + "balance_loss_mlp": 1.02555561, + "epoch": 0.4290395310386292, + "flos": 28256885867520.0, + "grad_norm": 2.1625216270915493, + "language_loss": 0.75274026, + "learning_rate": 2.549573171442666e-06, + "loss": 0.77375078, + "num_input_tokens_seen": 153096680, + "step": 7136, + "time_per_iteration": 2.809598207473755 + }, + { + "auxiliary_loss_clip": 0.0112086, + "auxiliary_loss_mlp": 0.0103731, + "balance_loss_clip": 1.04999709, + "balance_loss_mlp": 1.02323103, + "epoch": 0.42909965429129715, + "flos": 16216074414720.0, + "grad_norm": 2.3663507288699743, + "language_loss": 0.79031229, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.81189406, + "num_input_tokens_seen": 153113305, + "step": 7137, + "time_per_iteration": 2.5979957580566406 + }, + { + "auxiliary_loss_clip": 0.01139951, + "auxiliary_loss_mlp": 0.01034457, + "balance_loss_clip": 1.05516219, + "balance_loss_mlp": 1.02047372, + "epoch": 0.4291597775439651, + "flos": 23112862122240.0, + "grad_norm": 2.7024255480814166, + "language_loss": 0.76951313, + "learning_rate": 2.548824190884499e-06, + "loss": 0.7912572, + "num_input_tokens_seen": 153132735, + "step": 7138, + "time_per_iteration": 2.659080982208252 + }, + { + "auxiliary_loss_clip": 0.01053167, + "auxiliary_loss_mlp": 0.01001874, + "balance_loss_clip": 1.04265583, + "balance_loss_mlp": 1.000193, + "epoch": 0.4292199007966331, + "flos": 67546212681600.0, + "grad_norm": 0.770527259841848, + "language_loss": 0.56189907, + "learning_rate": 2.548449669381113e-06, + "loss": 0.58244956, + "num_input_tokens_seen": 153187925, + "step": 7139, + "time_per_iteration": 3.10082745552063 + }, + { + "auxiliary_loss_clip": 0.0113097, + "auxiliary_loss_mlp": 0.00769947, + "balance_loss_clip": 1.05131912, + "balance_loss_mlp": 1.00071657, + "epoch": 0.42928002404930105, + "flos": 22999850956800.0, + "grad_norm": 2.111862554587806, + "language_loss": 0.80871445, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.82772362, + "num_input_tokens_seen": 153206990, + "step": 7140, + "time_per_iteration": 2.795779228210449 + }, + { + "auxiliary_loss_clip": 0.01122496, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.05028069, + "balance_loss_mlp": 1.01853812, + "epoch": 0.429340147301969, + "flos": 11544922241280.0, + "grad_norm": 1.8811141343222446, + "language_loss": 0.82105601, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.84260583, + "num_input_tokens_seen": 153222345, + "step": 7141, + "time_per_iteration": 2.7634544372558594 + }, + { + "auxiliary_loss_clip": 0.0112355, + "auxiliary_loss_mlp": 0.0103971, + "balance_loss_clip": 1.05177212, + "balance_loss_mlp": 1.02569723, + "epoch": 0.42940027055463703, + "flos": 25264988472960.0, + "grad_norm": 3.1732751781519566, + "language_loss": 0.86466211, + "learning_rate": 2.547325980144166e-06, + "loss": 0.88629478, + "num_input_tokens_seen": 153240570, + "step": 7142, + "time_per_iteration": 2.73675537109375 + }, + { + "auxiliary_loss_clip": 0.01107323, + "auxiliary_loss_mlp": 0.0103324, + "balance_loss_clip": 1.05093384, + "balance_loss_mlp": 1.02018034, + "epoch": 0.429460393807305, + "flos": 23805004268160.0, + "grad_norm": 2.0666274749088704, + "language_loss": 0.78651458, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.80792016, + "num_input_tokens_seen": 153259575, + "step": 7143, + "time_per_iteration": 2.704951047897339 + }, + { + "auxiliary_loss_clip": 0.01085856, + "auxiliary_loss_mlp": 0.01042533, + "balance_loss_clip": 1.04870784, + "balance_loss_mlp": 1.02862692, + "epoch": 0.42952051705997296, + "flos": 13918294414080.0, + "grad_norm": 1.8720341937391007, + "language_loss": 0.77237451, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.79365838, + "num_input_tokens_seen": 153276650, + "step": 7144, + "time_per_iteration": 2.8080482482910156 + }, + { + "auxiliary_loss_clip": 0.01111048, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.05582607, + "balance_loss_mlp": 1.01565719, + "epoch": 0.4295806403126409, + "flos": 26760380509440.0, + "grad_norm": 2.7559580952375335, + "language_loss": 0.73788631, + "learning_rate": 2.54620210411532e-06, + "loss": 0.75928855, + "num_input_tokens_seen": 153298025, + "step": 7145, + "time_per_iteration": 2.876610040664673 + }, + { + "auxiliary_loss_clip": 0.01124065, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.05205083, + "balance_loss_mlp": 1.02291536, + "epoch": 0.4296407635653089, + "flos": 20952619297920.0, + "grad_norm": 2.2535739124191623, + "language_loss": 0.78997326, + "learning_rate": 2.545827437329352e-06, + "loss": 0.81157696, + "num_input_tokens_seen": 153315775, + "step": 7146, + "time_per_iteration": 4.237323999404907 + }, + { + "auxiliary_loss_clip": 0.01118325, + "auxiliary_loss_mlp": 0.01033233, + "balance_loss_clip": 1.04862475, + "balance_loss_mlp": 1.02041841, + "epoch": 0.42970088681797686, + "flos": 15852335339520.0, + "grad_norm": 2.134935554118882, + "language_loss": 0.83125973, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.85277522, + "num_input_tokens_seen": 153332765, + "step": 7147, + "time_per_iteration": 4.170353412628174 + }, + { + "auxiliary_loss_clip": 0.01120236, + "auxiliary_loss_mlp": 0.01036768, + "balance_loss_clip": 1.05321455, + "balance_loss_mlp": 1.02217066, + "epoch": 0.4297610100706448, + "flos": 22382618624640.0, + "grad_norm": 2.0255914888463837, + "language_loss": 0.87308717, + "learning_rate": 2.545078041678131e-06, + "loss": 0.89465714, + "num_input_tokens_seen": 153350760, + "step": 7148, + "time_per_iteration": 4.25404167175293 + }, + { + "auxiliary_loss_clip": 0.01106949, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.0480504, + "balance_loss_mlp": 1.02031255, + "epoch": 0.4298211333233128, + "flos": 27925681536000.0, + "grad_norm": 1.5866853205406048, + "language_loss": 0.77782673, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.79923236, + "num_input_tokens_seen": 153370765, + "step": 7149, + "time_per_iteration": 2.7506890296936035 + }, + { + "auxiliary_loss_clip": 0.01089941, + "auxiliary_loss_mlp": 0.01034354, + "balance_loss_clip": 1.04399276, + "balance_loss_mlp": 1.02023959, + "epoch": 0.42988125657598075, + "flos": 24425612478720.0, + "grad_norm": 1.8521512589115583, + "language_loss": 0.80214548, + "learning_rate": 2.544328563349256e-06, + "loss": 0.8233884, + "num_input_tokens_seen": 153390725, + "step": 7150, + "time_per_iteration": 2.7500832080841064 + }, + { + "auxiliary_loss_clip": 0.01129377, + "auxiliary_loss_mlp": 0.01039727, + "balance_loss_clip": 1.05486202, + "balance_loss_mlp": 1.02441442, + "epoch": 0.4299413798286487, + "flos": 15850180523520.0, + "grad_norm": 1.9985895227285218, + "language_loss": 0.75273871, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.7744298, + "num_input_tokens_seen": 153408010, + "step": 7151, + "time_per_iteration": 5.016021251678467 + }, + { + "auxiliary_loss_clip": 0.01085345, + "auxiliary_loss_mlp": 0.01034438, + "balance_loss_clip": 1.0429914, + "balance_loss_mlp": 1.02001333, + "epoch": 0.4300015030813167, + "flos": 22309504490880.0, + "grad_norm": 2.1817188720110954, + "language_loss": 0.70050609, + "learning_rate": 2.543579002456406e-06, + "loss": 0.72170389, + "num_input_tokens_seen": 153426865, + "step": 7152, + "time_per_iteration": 2.7800815105438232 + }, + { + "auxiliary_loss_clip": 0.01111211, + "auxiliary_loss_mlp": 0.01037662, + "balance_loss_clip": 1.04997575, + "balance_loss_mlp": 1.02443016, + "epoch": 0.43006162633398465, + "flos": 34897666366080.0, + "grad_norm": 1.6446083910432685, + "language_loss": 0.71179092, + "learning_rate": 2.54320419108402e-06, + "loss": 0.73327965, + "num_input_tokens_seen": 153449410, + "step": 7153, + "time_per_iteration": 2.829648017883301 + }, + { + "auxiliary_loss_clip": 0.01119902, + "auxiliary_loss_mlp": 0.01033204, + "balance_loss_clip": 1.0488553, + "balance_loss_mlp": 1.01928604, + "epoch": 0.4301217495866526, + "flos": 15961575576960.0, + "grad_norm": 1.892610527455045, + "language_loss": 0.78175116, + "learning_rate": 2.542829359113276e-06, + "loss": 0.80328226, + "num_input_tokens_seen": 153467910, + "step": 7154, + "time_per_iteration": 2.723484516143799 + }, + { + "auxiliary_loss_clip": 0.01099683, + "auxiliary_loss_mlp": 0.01040214, + "balance_loss_clip": 1.04681695, + "balance_loss_mlp": 1.02599812, + "epoch": 0.43018187283932063, + "flos": 18770364414720.0, + "grad_norm": 1.5463056134535458, + "language_loss": 0.78802991, + "learning_rate": 2.542454506558389e-06, + "loss": 0.80942887, + "num_input_tokens_seen": 153487100, + "step": 7155, + "time_per_iteration": 2.7014451026916504 + }, + { + "auxiliary_loss_clip": 0.01105109, + "auxiliary_loss_mlp": 0.01032701, + "balance_loss_clip": 1.04913473, + "balance_loss_mlp": 1.01963592, + "epoch": 0.4302419960919886, + "flos": 20151703791360.0, + "grad_norm": 1.7272401238355637, + "language_loss": 0.88303947, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.90441763, + "num_input_tokens_seen": 153505565, + "step": 7156, + "time_per_iteration": 2.696967363357544 + }, + { + "auxiliary_loss_clip": 0.01135167, + "auxiliary_loss_mlp": 0.01033618, + "balance_loss_clip": 1.05029023, + "balance_loss_mlp": 1.01970661, + "epoch": 0.43030211934465656, + "flos": 26432731624320.0, + "grad_norm": 1.8553568722970555, + "language_loss": 0.82653069, + "learning_rate": 2.541704739753042e-06, + "loss": 0.84821856, + "num_input_tokens_seen": 153526130, + "step": 7157, + "time_per_iteration": 2.706956148147583 + }, + { + "auxiliary_loss_clip": 0.01138655, + "auxiliary_loss_mlp": 0.01033446, + "balance_loss_clip": 1.05253196, + "balance_loss_mlp": 1.0191586, + "epoch": 0.43036224259732453, + "flos": 24389234979840.0, + "grad_norm": 1.8412394525159426, + "language_loss": 0.71535289, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.73707396, + "num_input_tokens_seen": 153546370, + "step": 7158, + "time_per_iteration": 2.717587471008301 + }, + { + "auxiliary_loss_clip": 0.01122952, + "auxiliary_loss_mlp": 0.01034381, + "balance_loss_clip": 1.05053186, + "balance_loss_mlp": 1.02094615, + "epoch": 0.4304223658499925, + "flos": 17201714590080.0, + "grad_norm": 2.4063235591116, + "language_loss": 0.82592964, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.84750295, + "num_input_tokens_seen": 153562800, + "step": 7159, + "time_per_iteration": 2.657625436782837 + }, + { + "auxiliary_loss_clip": 0.01105982, + "auxiliary_loss_mlp": 0.01034344, + "balance_loss_clip": 1.04629135, + "balance_loss_mlp": 1.02073002, + "epoch": 0.43048248910266046, + "flos": 14903000835840.0, + "grad_norm": 2.253245664419059, + "language_loss": 0.83222294, + "learning_rate": 2.54057993551933e-06, + "loss": 0.85362625, + "num_input_tokens_seen": 153578395, + "step": 7160, + "time_per_iteration": 2.6994106769561768 + }, + { + "auxiliary_loss_clip": 0.0112897, + "auxiliary_loss_mlp": 0.01040585, + "balance_loss_clip": 1.05215347, + "balance_loss_mlp": 1.02446771, + "epoch": 0.4305426123553284, + "flos": 21579835610880.0, + "grad_norm": 2.2814219127337236, + "language_loss": 0.77506208, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.79675758, + "num_input_tokens_seen": 153596880, + "step": 7161, + "time_per_iteration": 2.819274425506592 + }, + { + "auxiliary_loss_clip": 0.01120227, + "auxiliary_loss_mlp": 0.0103714, + "balance_loss_clip": 1.04739952, + "balance_loss_mlp": 1.02265632, + "epoch": 0.4306027356079964, + "flos": 22601278667520.0, + "grad_norm": 2.279224529598255, + "language_loss": 0.73028505, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.75185871, + "num_input_tokens_seen": 153616570, + "step": 7162, + "time_per_iteration": 2.62280011177063 + }, + { + "auxiliary_loss_clip": 0.01016488, + "auxiliary_loss_mlp": 0.00753107, + "balance_loss_clip": 1.02147388, + "balance_loss_mlp": 1.00100327, + "epoch": 0.43066285886066435, + "flos": 70672091806080.0, + "grad_norm": 0.7910606346239517, + "language_loss": 0.58986276, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.60755867, + "num_input_tokens_seen": 153671450, + "step": 7163, + "time_per_iteration": 3.1325736045837402 + }, + { + "auxiliary_loss_clip": 0.01104143, + "auxiliary_loss_mlp": 0.01044649, + "balance_loss_clip": 1.04593122, + "balance_loss_mlp": 1.02948582, + "epoch": 0.4307229821133323, + "flos": 26720591218560.0, + "grad_norm": 1.8311930089659938, + "language_loss": 0.79205155, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.81353945, + "num_input_tokens_seen": 153691405, + "step": 7164, + "time_per_iteration": 2.753256320953369 + }, + { + "auxiliary_loss_clip": 0.01138029, + "auxiliary_loss_mlp": 0.01040201, + "balance_loss_clip": 1.0510416, + "balance_loss_mlp": 1.02608645, + "epoch": 0.4307831053660003, + "flos": 26177119464960.0, + "grad_norm": 2.032413289263653, + "language_loss": 0.67551947, + "learning_rate": 2.538704852009177e-06, + "loss": 0.69730175, + "num_input_tokens_seen": 153711555, + "step": 7165, + "time_per_iteration": 2.719172477722168 + }, + { + "auxiliary_loss_clip": 0.01106688, + "auxiliary_loss_mlp": 0.00771886, + "balance_loss_clip": 1.05042744, + "balance_loss_mlp": 1.00068462, + "epoch": 0.43084322861866825, + "flos": 18910343715840.0, + "grad_norm": 2.1027726489364436, + "language_loss": 0.75451279, + "learning_rate": 2.538329773967034e-06, + "loss": 0.77329856, + "num_input_tokens_seen": 153730095, + "step": 7166, + "time_per_iteration": 2.710304021835327 + }, + { + "auxiliary_loss_clip": 0.01126475, + "auxiliary_loss_mlp": 0.01036095, + "balance_loss_clip": 1.05613852, + "balance_loss_mlp": 1.02310109, + "epoch": 0.4309033518713362, + "flos": 26432911192320.0, + "grad_norm": 1.6200122801673495, + "language_loss": 0.71809006, + "learning_rate": 2.537954675511372e-06, + "loss": 0.7397157, + "num_input_tokens_seen": 153749320, + "step": 7167, + "time_per_iteration": 2.676224946975708 + }, + { + "auxiliary_loss_clip": 0.01104337, + "auxiliary_loss_mlp": 0.00771035, + "balance_loss_clip": 1.04866242, + "balance_loss_mlp": 1.00059962, + "epoch": 0.43096347512400424, + "flos": 21213295274880.0, + "grad_norm": 1.6573858575043368, + "language_loss": 0.78183687, + "learning_rate": 2.537579556656414e-06, + "loss": 0.80059052, + "num_input_tokens_seen": 153767825, + "step": 7168, + "time_per_iteration": 2.8030035495758057 + }, + { + "auxiliary_loss_clip": 0.01111425, + "auxiliary_loss_mlp": 0.0104262, + "balance_loss_clip": 1.05006397, + "balance_loss_mlp": 1.02867889, + "epoch": 0.4310235983766722, + "flos": 16540131939840.0, + "grad_norm": 1.8701517899109106, + "language_loss": 0.82348084, + "learning_rate": 2.537204417416387e-06, + "loss": 0.84502125, + "num_input_tokens_seen": 153785350, + "step": 7169, + "time_per_iteration": 2.683119773864746 + }, + { + "auxiliary_loss_clip": 0.01047083, + "auxiliary_loss_mlp": 0.01001288, + "balance_loss_clip": 1.03727269, + "balance_loss_mlp": 0.99934483, + "epoch": 0.43108372162934017, + "flos": 64775704763520.0, + "grad_norm": 0.7280845280825856, + "language_loss": 0.60741472, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.6278984, + "num_input_tokens_seen": 153856400, + "step": 7170, + "time_per_iteration": 3.345574140548706 + }, + { + "auxiliary_loss_clip": 0.01135698, + "auxiliary_loss_mlp": 0.01037021, + "balance_loss_clip": 1.05163968, + "balance_loss_mlp": 1.02352667, + "epoch": 0.43114384488200813, + "flos": 13444094039040.0, + "grad_norm": 1.7903297890514136, + "language_loss": 0.75776696, + "learning_rate": 2.536454077838021e-06, + "loss": 0.77949417, + "num_input_tokens_seen": 153875230, + "step": 7171, + "time_per_iteration": 2.612459897994995 + }, + { + "auxiliary_loss_clip": 0.01120974, + "auxiliary_loss_mlp": 0.01034567, + "balance_loss_clip": 1.05036652, + "balance_loss_mlp": 1.02106678, + "epoch": 0.4312039681346761, + "flos": 26286682924800.0, + "grad_norm": 3.289099345654009, + "language_loss": 0.77644551, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.79800093, + "num_input_tokens_seen": 153894740, + "step": 7172, + "time_per_iteration": 2.69909930229187 + }, + { + "auxiliary_loss_clip": 0.01105721, + "auxiliary_loss_mlp": 0.010481, + "balance_loss_clip": 1.04574609, + "balance_loss_mlp": 1.03119648, + "epoch": 0.43126409138734406, + "flos": 20376684627840.0, + "grad_norm": 2.89880180493229, + "language_loss": 0.76759243, + "learning_rate": 2.535703656890086e-06, + "loss": 0.78913063, + "num_input_tokens_seen": 153913230, + "step": 7173, + "time_per_iteration": 2.6338369846343994 + }, + { + "auxiliary_loss_clip": 0.01130423, + "auxiliary_loss_mlp": 0.00772103, + "balance_loss_clip": 1.04817533, + "balance_loss_mlp": 1.00070202, + "epoch": 0.431324214640012, + "flos": 22123091882880.0, + "grad_norm": 1.4474212501027515, + "language_loss": 0.76933503, + "learning_rate": 2.5353284159381e-06, + "loss": 0.78836024, + "num_input_tokens_seen": 153933250, + "step": 7174, + "time_per_iteration": 2.809385061264038 + }, + { + "auxiliary_loss_clip": 0.01135393, + "auxiliary_loss_mlp": 0.01035645, + "balance_loss_clip": 1.0494926, + "balance_loss_mlp": 1.02004063, + "epoch": 0.43138433789268, + "flos": 15231008856960.0, + "grad_norm": 1.5868683627972313, + "language_loss": 0.8226738, + "learning_rate": 2.534953154686407e-06, + "loss": 0.84438419, + "num_input_tokens_seen": 153951325, + "step": 7175, + "time_per_iteration": 2.609368324279785 + }, + { + "auxiliary_loss_clip": 0.01092364, + "auxiliary_loss_mlp": 0.01052008, + "balance_loss_clip": 1.0459013, + "balance_loss_mlp": 1.03422189, + "epoch": 0.43144446114534796, + "flos": 18150294908160.0, + "grad_norm": 2.243705003900615, + "language_loss": 0.74261117, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.76405489, + "num_input_tokens_seen": 153966975, + "step": 7176, + "time_per_iteration": 2.680771827697754 + }, + { + "auxiliary_loss_clip": 0.01122908, + "auxiliary_loss_mlp": 0.01035728, + "balance_loss_clip": 1.04637945, + "balance_loss_mlp": 1.0215838, + "epoch": 0.4315045843980159, + "flos": 22929861306240.0, + "grad_norm": 1.6403527990581428, + "language_loss": 0.73309958, + "learning_rate": 2.534202571340819e-06, + "loss": 0.754686, + "num_input_tokens_seen": 153986695, + "step": 7177, + "time_per_iteration": 2.760601758956909 + }, + { + "auxiliary_loss_clip": 0.011222, + "auxiliary_loss_mlp": 0.01043971, + "balance_loss_clip": 1.05072641, + "balance_loss_mlp": 1.02720976, + "epoch": 0.4315647076506839, + "flos": 22126862810880.0, + "grad_norm": 1.7813773885441684, + "language_loss": 0.81519645, + "learning_rate": 2.533827249275387e-06, + "loss": 0.83685815, + "num_input_tokens_seen": 154004710, + "step": 7178, + "time_per_iteration": 2.6687469482421875 + }, + { + "auxiliary_loss_clip": 0.01109607, + "auxiliary_loss_mlp": 0.01033774, + "balance_loss_clip": 1.04922378, + "balance_loss_mlp": 1.02013087, + "epoch": 0.43162483090335185, + "flos": 26871129118080.0, + "grad_norm": 32.445562208198496, + "language_loss": 0.84143358, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.86286741, + "num_input_tokens_seen": 154024320, + "step": 7179, + "time_per_iteration": 2.696716547012329 + }, + { + "auxiliary_loss_clip": 0.01108857, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.04713559, + "balance_loss_mlp": 1.0200026, + "epoch": 0.4316849541560198, + "flos": 13913122855680.0, + "grad_norm": 1.7762155940538253, + "language_loss": 0.75679082, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.77822137, + "num_input_tokens_seen": 154041755, + "step": 7180, + "time_per_iteration": 2.6832194328308105 + }, + { + "auxiliary_loss_clip": 0.01104614, + "auxiliary_loss_mlp": 0.00776174, + "balance_loss_clip": 1.0417347, + "balance_loss_mlp": 1.00057638, + "epoch": 0.4317450774086878, + "flos": 16435165420800.0, + "grad_norm": 1.9971445999801452, + "language_loss": 0.81773126, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.83653915, + "num_input_tokens_seen": 154056775, + "step": 7181, + "time_per_iteration": 2.6499931812286377 + }, + { + "auxiliary_loss_clip": 0.01110303, + "auxiliary_loss_mlp": 0.01040472, + "balance_loss_clip": 1.04747176, + "balance_loss_mlp": 1.02473664, + "epoch": 0.4318052006613558, + "flos": 20554980762240.0, + "grad_norm": 1.7092925782952597, + "language_loss": 0.89020073, + "learning_rate": 2.532325758728165e-06, + "loss": 0.91170847, + "num_input_tokens_seen": 154075015, + "step": 7182, + "time_per_iteration": 2.6567654609680176 + }, + { + "auxiliary_loss_clip": 0.01121856, + "auxiliary_loss_mlp": 0.00772189, + "balance_loss_clip": 1.05025744, + "balance_loss_mlp": 1.00049865, + "epoch": 0.43186532391402377, + "flos": 22820046451200.0, + "grad_norm": 1.602704996145881, + "language_loss": 0.75739694, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.77633733, + "num_input_tokens_seen": 154095170, + "step": 7183, + "time_per_iteration": 2.6784613132476807 + }, + { + "auxiliary_loss_clip": 0.01123979, + "auxiliary_loss_mlp": 0.01032993, + "balance_loss_clip": 1.05125499, + "balance_loss_mlp": 1.01853919, + "epoch": 0.43192544716669173, + "flos": 25556583081600.0, + "grad_norm": 1.538308227417617, + "language_loss": 0.77589077, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.7974605, + "num_input_tokens_seen": 154116895, + "step": 7184, + "time_per_iteration": 2.6501550674438477 + }, + { + "auxiliary_loss_clip": 0.01103086, + "auxiliary_loss_mlp": 0.01037708, + "balance_loss_clip": 1.04594743, + "balance_loss_mlp": 1.02377832, + "epoch": 0.4319855704193597, + "flos": 30954674701440.0, + "grad_norm": 1.7848849500644928, + "language_loss": 0.73435313, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.75576103, + "num_input_tokens_seen": 154138395, + "step": 7185, + "time_per_iteration": 2.766298770904541 + }, + { + "auxiliary_loss_clip": 0.01122479, + "auxiliary_loss_mlp": 0.01042205, + "balance_loss_clip": 1.05223203, + "balance_loss_mlp": 1.02754247, + "epoch": 0.43204569367202766, + "flos": 24238732993920.0, + "grad_norm": 3.4842964823639515, + "language_loss": 0.75962853, + "learning_rate": 2.530823945207421e-06, + "loss": 0.78127533, + "num_input_tokens_seen": 154156775, + "step": 7186, + "time_per_iteration": 4.334157705307007 + }, + { + "auxiliary_loss_clip": 0.01099566, + "auxiliary_loss_mlp": 0.0103912, + "balance_loss_clip": 1.04762721, + "balance_loss_mlp": 1.02477932, + "epoch": 0.43210581692469563, + "flos": 18406948561920.0, + "grad_norm": 3.9729453010836218, + "language_loss": 0.76471615, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.78610301, + "num_input_tokens_seen": 154177500, + "step": 7187, + "time_per_iteration": 5.956019401550293 + }, + { + "auxiliary_loss_clip": 0.01025499, + "auxiliary_loss_mlp": 0.01034135, + "balance_loss_clip": 1.03011787, + "balance_loss_mlp": 1.03272867, + "epoch": 0.4321659401773636, + "flos": 49832378910720.0, + "grad_norm": 0.8609493763660439, + "language_loss": 0.68115592, + "learning_rate": 2.530072917616714e-06, + "loss": 0.70175231, + "num_input_tokens_seen": 154237110, + "step": 7188, + "time_per_iteration": 3.246208667755127 + }, + { + "auxiliary_loss_clip": 0.01100014, + "auxiliary_loss_mlp": 0.01038065, + "balance_loss_clip": 1.0437665, + "balance_loss_mlp": 1.02437973, + "epoch": 0.43222606343003156, + "flos": 17128564542720.0, + "grad_norm": 1.9766532511253156, + "language_loss": 0.77875316, + "learning_rate": 2.529697373663614e-06, + "loss": 0.80013394, + "num_input_tokens_seen": 154253910, + "step": 7189, + "time_per_iteration": 2.681076765060425 + }, + { + "auxiliary_loss_clip": 0.01083825, + "auxiliary_loss_mlp": 0.01046889, + "balance_loss_clip": 1.04553795, + "balance_loss_mlp": 1.0314517, + "epoch": 0.4322861866826995, + "flos": 22749949059840.0, + "grad_norm": 1.8062049350419371, + "language_loss": 0.71379328, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.73510039, + "num_input_tokens_seen": 154274770, + "step": 7190, + "time_per_iteration": 2.785278081893921 + }, + { + "auxiliary_loss_clip": 0.01109749, + "auxiliary_loss_mlp": 0.01039244, + "balance_loss_clip": 1.04681444, + "balance_loss_mlp": 1.02500999, + "epoch": 0.4323463099353675, + "flos": 27891925729920.0, + "grad_norm": 1.4390067860166444, + "language_loss": 0.79639554, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.81788546, + "num_input_tokens_seen": 154295035, + "step": 7191, + "time_per_iteration": 4.571990728378296 + }, + { + "auxiliary_loss_clip": 0.0108611, + "auxiliary_loss_mlp": 0.01033322, + "balance_loss_clip": 1.04733062, + "balance_loss_mlp": 1.01954126, + "epoch": 0.43240643318803546, + "flos": 21614740652160.0, + "grad_norm": 1.5570148329267672, + "language_loss": 0.74904197, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.77023631, + "num_input_tokens_seen": 154314905, + "step": 7192, + "time_per_iteration": 2.7427282333374023 + }, + { + "auxiliary_loss_clip": 0.01090847, + "auxiliary_loss_mlp": 0.01047049, + "balance_loss_clip": 1.04693365, + "balance_loss_mlp": 1.03140879, + "epoch": 0.4324665564407034, + "flos": 17558378686080.0, + "grad_norm": 2.028484656266998, + "language_loss": 0.7934891, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.81486803, + "num_input_tokens_seen": 154331740, + "step": 7193, + "time_per_iteration": 2.708481550216675 + }, + { + "auxiliary_loss_clip": 0.01114828, + "auxiliary_loss_mlp": 0.0104506, + "balance_loss_clip": 1.04726183, + "balance_loss_mlp": 1.02971745, + "epoch": 0.4325266796933714, + "flos": 18402423448320.0, + "grad_norm": 1.769737496980083, + "language_loss": 0.75720823, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.77880704, + "num_input_tokens_seen": 154348740, + "step": 7194, + "time_per_iteration": 2.685701608657837 + }, + { + "auxiliary_loss_clip": 0.01135356, + "auxiliary_loss_mlp": 0.01041388, + "balance_loss_clip": 1.05137146, + "balance_loss_mlp": 1.02693963, + "epoch": 0.4325868029460394, + "flos": 22564793427840.0, + "grad_norm": 3.855960133728433, + "language_loss": 0.59479225, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.61655968, + "num_input_tokens_seen": 154368835, + "step": 7195, + "time_per_iteration": 2.634310483932495 + }, + { + "auxiliary_loss_clip": 0.01112701, + "auxiliary_loss_mlp": 0.01040238, + "balance_loss_clip": 1.04618812, + "balance_loss_mlp": 1.02434754, + "epoch": 0.43264692619870737, + "flos": 14605516396800.0, + "grad_norm": 2.711649843090413, + "language_loss": 0.65653574, + "learning_rate": 2.527068004376515e-06, + "loss": 0.67806506, + "num_input_tokens_seen": 154384620, + "step": 7196, + "time_per_iteration": 2.608530044555664 + }, + { + "auxiliary_loss_clip": 0.01141945, + "auxiliary_loss_mlp": 0.01038972, + "balance_loss_clip": 1.05338526, + "balance_loss_mlp": 1.02316523, + "epoch": 0.43270704945137534, + "flos": 21501657659520.0, + "grad_norm": 1.8654403969935065, + "language_loss": 0.72525519, + "learning_rate": 2.526692300132797e-06, + "loss": 0.74706435, + "num_input_tokens_seen": 154402865, + "step": 7197, + "time_per_iteration": 2.644087791442871 + }, + { + "auxiliary_loss_clip": 0.01124491, + "auxiliary_loss_mlp": 0.01040937, + "balance_loss_clip": 1.05245936, + "balance_loss_mlp": 1.02619135, + "epoch": 0.4327671727040433, + "flos": 25155891889920.0, + "grad_norm": 1.511486884186769, + "language_loss": 0.73146015, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.75311446, + "num_input_tokens_seen": 154423625, + "step": 7198, + "time_per_iteration": 2.7317864894866943 + }, + { + "auxiliary_loss_clip": 0.0109556, + "auxiliary_loss_mlp": 0.01034886, + "balance_loss_clip": 1.04451466, + "balance_loss_mlp": 1.02034283, + "epoch": 0.43282729595671127, + "flos": 25447163276160.0, + "grad_norm": 1.539323937310933, + "language_loss": 0.80887341, + "learning_rate": 2.525940831742934e-06, + "loss": 0.8301779, + "num_input_tokens_seen": 154444775, + "step": 7199, + "time_per_iteration": 2.736016035079956 + }, + { + "auxiliary_loss_clip": 0.01121231, + "auxiliary_loss_mlp": 0.01034417, + "balance_loss_clip": 1.05255413, + "balance_loss_mlp": 1.0201118, + "epoch": 0.43288741920937923, + "flos": 24126116878080.0, + "grad_norm": 2.6908376787400186, + "language_loss": 0.68332666, + "learning_rate": 2.525565067625286e-06, + "loss": 0.70488322, + "num_input_tokens_seen": 154460815, + "step": 7200, + "time_per_iteration": 2.688460350036621 + }, + { + "auxiliary_loss_clip": 0.01114262, + "auxiliary_loss_mlp": 0.00772856, + "balance_loss_clip": 1.05025625, + "balance_loss_mlp": 1.00067294, + "epoch": 0.4329475424620472, + "flos": 19204955066880.0, + "grad_norm": 1.9560728888597885, + "language_loss": 0.87379515, + "learning_rate": 2.525189283578157e-06, + "loss": 0.89266634, + "num_input_tokens_seen": 154479145, + "step": 7201, + "time_per_iteration": 2.7547309398651123 + }, + { + "auxiliary_loss_clip": 0.01086041, + "auxiliary_loss_mlp": 0.01040787, + "balance_loss_clip": 1.04952443, + "balance_loss_mlp": 1.02395487, + "epoch": 0.43300766571471516, + "flos": 22638374438400.0, + "grad_norm": 2.3345355752276706, + "language_loss": 0.64547086, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.66673917, + "num_input_tokens_seen": 154498905, + "step": 7202, + "time_per_iteration": 2.878486156463623 + }, + { + "auxiliary_loss_clip": 0.01082437, + "auxiliary_loss_mlp": 0.01030304, + "balance_loss_clip": 1.04730773, + "balance_loss_mlp": 1.01676202, + "epoch": 0.4330677889673831, + "flos": 22121080721280.0, + "grad_norm": 2.291722240352509, + "language_loss": 0.81795621, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.83908355, + "num_input_tokens_seen": 154517270, + "step": 7203, + "time_per_iteration": 2.7338409423828125 + }, + { + "auxiliary_loss_clip": 0.01102737, + "auxiliary_loss_mlp": 0.01051208, + "balance_loss_clip": 1.04656279, + "balance_loss_mlp": 1.0357945, + "epoch": 0.4331279122200511, + "flos": 23221527742080.0, + "grad_norm": 1.8864588919547398, + "language_loss": 0.81453216, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.83607161, + "num_input_tokens_seen": 154535945, + "step": 7204, + "time_per_iteration": 2.7719802856445312 + }, + { + "auxiliary_loss_clip": 0.01111895, + "auxiliary_loss_mlp": 0.01038561, + "balance_loss_clip": 1.04900229, + "balance_loss_mlp": 1.02450609, + "epoch": 0.43318803547271906, + "flos": 18259750627200.0, + "grad_norm": 2.1348551022614077, + "language_loss": 0.73979616, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.76130074, + "num_input_tokens_seen": 154554935, + "step": 7205, + "time_per_iteration": 2.73463773727417 + }, + { + "auxiliary_loss_clip": 0.01139834, + "auxiliary_loss_mlp": 0.00772219, + "balance_loss_clip": 1.05782342, + "balance_loss_mlp": 1.00075722, + "epoch": 0.433248158725387, + "flos": 27418407713280.0, + "grad_norm": 1.7497294767683989, + "language_loss": 0.75183374, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.77095425, + "num_input_tokens_seen": 154576065, + "step": 7206, + "time_per_iteration": 2.712897300720215 + }, + { + "auxiliary_loss_clip": 0.01082016, + "auxiliary_loss_mlp": 0.01036402, + "balance_loss_clip": 1.04904056, + "balance_loss_mlp": 1.02218044, + "epoch": 0.433308281978055, + "flos": 23218008209280.0, + "grad_norm": 5.825458886470942, + "language_loss": 0.79041201, + "learning_rate": 2.522934161574342e-06, + "loss": 0.81159621, + "num_input_tokens_seen": 154595110, + "step": 7207, + "time_per_iteration": 2.7708940505981445 + }, + { + "auxiliary_loss_clip": 0.01104721, + "auxiliary_loss_mlp": 0.01039597, + "balance_loss_clip": 1.04836667, + "balance_loss_mlp": 1.02374804, + "epoch": 0.433368405230723, + "flos": 15852407166720.0, + "grad_norm": 1.8464623058117935, + "language_loss": 0.81316662, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.83460987, + "num_input_tokens_seen": 154612255, + "step": 7208, + "time_per_iteration": 2.869554281234741 + }, + { + "auxiliary_loss_clip": 0.01114033, + "auxiliary_loss_mlp": 0.01033629, + "balance_loss_clip": 1.04989004, + "balance_loss_mlp": 1.01924682, + "epoch": 0.433428528483391, + "flos": 19026084314880.0, + "grad_norm": 2.1101386955173154, + "language_loss": 0.70337081, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.72484744, + "num_input_tokens_seen": 154630440, + "step": 7209, + "time_per_iteration": 2.692166805267334 + }, + { + "auxiliary_loss_clip": 0.01122508, + "auxiliary_loss_mlp": 0.0103772, + "balance_loss_clip": 1.04924512, + "balance_loss_mlp": 1.02234209, + "epoch": 0.43348865173605894, + "flos": 24718248581760.0, + "grad_norm": 1.435580666015418, + "language_loss": 0.81432891, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.83593118, + "num_input_tokens_seen": 154652515, + "step": 7210, + "time_per_iteration": 2.7368991374969482 + }, + { + "auxiliary_loss_clip": 0.01111056, + "auxiliary_loss_mlp": 0.01040693, + "balance_loss_clip": 1.05043674, + "balance_loss_mlp": 1.02690065, + "epoch": 0.4335487749887269, + "flos": 22090664880000.0, + "grad_norm": 2.4268266327689005, + "language_loss": 0.82382917, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.84534657, + "num_input_tokens_seen": 154670965, + "step": 7211, + "time_per_iteration": 2.6840522289276123 + }, + { + "auxiliary_loss_clip": 0.01124683, + "auxiliary_loss_mlp": 0.01036766, + "balance_loss_clip": 1.04992187, + "balance_loss_mlp": 1.02354002, + "epoch": 0.43360889824139487, + "flos": 22382941847040.0, + "grad_norm": 1.7229238689988244, + "language_loss": 0.74880648, + "learning_rate": 2.521054347790029e-06, + "loss": 0.77042103, + "num_input_tokens_seen": 154689980, + "step": 7212, + "time_per_iteration": 2.6535651683807373 + }, + { + "auxiliary_loss_clip": 0.01111992, + "auxiliary_loss_mlp": 0.01035698, + "balance_loss_clip": 1.05274439, + "balance_loss_mlp": 1.0224421, + "epoch": 0.43366902149406283, + "flos": 17528286067200.0, + "grad_norm": 1.7659929391516203, + "language_loss": 0.76887298, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.7903499, + "num_input_tokens_seen": 154706570, + "step": 7213, + "time_per_iteration": 2.7639784812927246 + }, + { + "auxiliary_loss_clip": 0.01127555, + "auxiliary_loss_mlp": 0.01037213, + "balance_loss_clip": 1.05343771, + "balance_loss_mlp": 1.0235039, + "epoch": 0.4337291447467308, + "flos": 19022672522880.0, + "grad_norm": 2.352447655586991, + "language_loss": 0.64672804, + "learning_rate": 2.520302283867471e-06, + "loss": 0.66837579, + "num_input_tokens_seen": 154725210, + "step": 7214, + "time_per_iteration": 2.6546545028686523 + }, + { + "auxiliary_loss_clip": 0.01107197, + "auxiliary_loss_mlp": 0.01037553, + "balance_loss_clip": 1.04624152, + "balance_loss_mlp": 1.02401102, + "epoch": 0.43378926799939876, + "flos": 27234042180480.0, + "grad_norm": 1.8015946289097802, + "language_loss": 0.71728516, + "learning_rate": 2.519926222304191e-06, + "loss": 0.73873264, + "num_input_tokens_seen": 154745945, + "step": 7215, + "time_per_iteration": 2.7694337368011475 + }, + { + "auxiliary_loss_clip": 0.01105367, + "auxiliary_loss_mlp": 0.01038295, + "balance_loss_clip": 1.04855013, + "balance_loss_mlp": 1.02280354, + "epoch": 0.43384939125206673, + "flos": 15961108700160.0, + "grad_norm": 2.003102925000143, + "language_loss": 0.75037885, + "learning_rate": 2.519550141025255e-06, + "loss": 0.77181542, + "num_input_tokens_seen": 154763580, + "step": 7216, + "time_per_iteration": 2.725843667984009 + }, + { + "auxiliary_loss_clip": 0.01116821, + "auxiliary_loss_mlp": 0.01045067, + "balance_loss_clip": 1.05096495, + "balance_loss_mlp": 1.02885413, + "epoch": 0.4339095145047347, + "flos": 21793216354560.0, + "grad_norm": 2.430460894289381, + "language_loss": 0.75723612, + "learning_rate": 2.519174040044927e-06, + "loss": 0.77885503, + "num_input_tokens_seen": 154776825, + "step": 7217, + "time_per_iteration": 2.7089385986328125 + }, + { + "auxiliary_loss_clip": 0.01100856, + "auxiliary_loss_mlp": 0.01039272, + "balance_loss_clip": 1.04884839, + "balance_loss_mlp": 1.02414465, + "epoch": 0.43396963775740266, + "flos": 14209853109120.0, + "grad_norm": 1.9588734650761437, + "language_loss": 0.74091554, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.76231682, + "num_input_tokens_seen": 154794025, + "step": 7218, + "time_per_iteration": 2.6733574867248535 + }, + { + "auxiliary_loss_clip": 0.01108387, + "auxiliary_loss_mlp": 0.01033005, + "balance_loss_clip": 1.05125904, + "balance_loss_mlp": 1.01892698, + "epoch": 0.4340297610100706, + "flos": 19719052473600.0, + "grad_norm": 1.867471044964119, + "language_loss": 0.69258481, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.71399873, + "num_input_tokens_seen": 154813105, + "step": 7219, + "time_per_iteration": 2.6384527683258057 + }, + { + "auxiliary_loss_clip": 0.01103251, + "auxiliary_loss_mlp": 0.01039305, + "balance_loss_clip": 1.04848611, + "balance_loss_mlp": 1.02513123, + "epoch": 0.4340898842627386, + "flos": 18953508885120.0, + "grad_norm": 2.2592610231798274, + "language_loss": 0.77296734, + "learning_rate": 2.518045619038202e-06, + "loss": 0.79439294, + "num_input_tokens_seen": 154833525, + "step": 7220, + "time_per_iteration": 2.693434476852417 + }, + { + "auxiliary_loss_clip": 0.01068716, + "auxiliary_loss_mlp": 0.01037568, + "balance_loss_clip": 1.04492617, + "balance_loss_mlp": 1.02248216, + "epoch": 0.4341500075154066, + "flos": 22018304931840.0, + "grad_norm": 2.0152794755447183, + "language_loss": 0.6924417, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.71350455, + "num_input_tokens_seen": 154853090, + "step": 7221, + "time_per_iteration": 2.8318276405334473 + }, + { + "auxiliary_loss_clip": 0.01126059, + "auxiliary_loss_mlp": 0.01040304, + "balance_loss_clip": 1.04850173, + "balance_loss_mlp": 1.02628446, + "epoch": 0.4342101307680746, + "flos": 23582465556480.0, + "grad_norm": 2.7538415889200554, + "language_loss": 0.65288424, + "learning_rate": 2.51729324012157e-06, + "loss": 0.67454779, + "num_input_tokens_seen": 154872055, + "step": 7222, + "time_per_iteration": 2.6848082542419434 + }, + { + "auxiliary_loss_clip": 0.01095727, + "auxiliary_loss_mlp": 0.0103288, + "balance_loss_clip": 1.04434943, + "balance_loss_mlp": 1.01868868, + "epoch": 0.43427025402074254, + "flos": 17967976450560.0, + "grad_norm": 2.2547341093747884, + "language_loss": 0.72800291, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.74928898, + "num_input_tokens_seen": 154886645, + "step": 7223, + "time_per_iteration": 2.6691431999206543 + }, + { + "auxiliary_loss_clip": 0.0113251, + "auxiliary_loss_mlp": 0.01035818, + "balance_loss_clip": 1.04656434, + "balance_loss_mlp": 1.02130401, + "epoch": 0.4343303772734105, + "flos": 26286395616000.0, + "grad_norm": 1.8756720282844566, + "language_loss": 0.93602765, + "learning_rate": 2.516540782741694e-06, + "loss": 0.95771086, + "num_input_tokens_seen": 154906775, + "step": 7224, + "time_per_iteration": 2.667450189590454 + }, + { + "auxiliary_loss_clip": 0.01092783, + "auxiliary_loss_mlp": 0.01039248, + "balance_loss_clip": 1.04234195, + "balance_loss_mlp": 1.02426362, + "epoch": 0.43439050052607847, + "flos": 26833961520000.0, + "grad_norm": 1.4167248746748424, + "language_loss": 0.61521256, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.63653284, + "num_input_tokens_seen": 154926990, + "step": 7225, + "time_per_iteration": 4.334634304046631 + }, + { + "auxiliary_loss_clip": 0.01107023, + "auxiliary_loss_mlp": 0.00773069, + "balance_loss_clip": 1.04763186, + "balance_loss_mlp": 1.00081611, + "epoch": 0.43445062377874644, + "flos": 21397660807680.0, + "grad_norm": 1.859930915167877, + "language_loss": 0.77928364, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.79808456, + "num_input_tokens_seen": 154946210, + "step": 7226, + "time_per_iteration": 5.937607765197754 + }, + { + "auxiliary_loss_clip": 0.01118617, + "auxiliary_loss_mlp": 0.0103402, + "balance_loss_clip": 1.047508, + "balance_loss_mlp": 1.02045417, + "epoch": 0.4345107470314144, + "flos": 19901945548800.0, + "grad_norm": 1.6822192052663985, + "language_loss": 0.84638822, + "learning_rate": 2.515411949802964e-06, + "loss": 0.86791462, + "num_input_tokens_seen": 154964995, + "step": 7227, + "time_per_iteration": 2.6521942615509033 + }, + { + "auxiliary_loss_clip": 0.01117348, + "auxiliary_loss_mlp": 0.0103842, + "balance_loss_clip": 1.04574108, + "balance_loss_mlp": 1.02328634, + "epoch": 0.43457087028408237, + "flos": 26432623883520.0, + "grad_norm": 1.9493500401331498, + "language_loss": 0.76725572, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.78881335, + "num_input_tokens_seen": 154984775, + "step": 7228, + "time_per_iteration": 2.6870598793029785 + }, + { + "auxiliary_loss_clip": 0.01089608, + "auxiliary_loss_mlp": 0.01040618, + "balance_loss_clip": 1.04927957, + "balance_loss_mlp": 1.02599132, + "epoch": 0.43463099353675033, + "flos": 31868816855040.0, + "grad_norm": 1.513481048537933, + "language_loss": 0.80442667, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.82572889, + "num_input_tokens_seen": 155008125, + "step": 7229, + "time_per_iteration": 2.9437830448150635 + }, + { + "auxiliary_loss_clip": 0.01121336, + "auxiliary_loss_mlp": 0.01045576, + "balance_loss_clip": 1.04673219, + "balance_loss_mlp": 1.03047252, + "epoch": 0.4346911167894183, + "flos": 24571266128640.0, + "grad_norm": 2.5474712737755016, + "language_loss": 0.81467843, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.83634758, + "num_input_tokens_seen": 155027885, + "step": 7230, + "time_per_iteration": 4.6465747356414795 + }, + { + "auxiliary_loss_clip": 0.0111898, + "auxiliary_loss_mlp": 0.01049467, + "balance_loss_clip": 1.04806113, + "balance_loss_mlp": 1.03399396, + "epoch": 0.43475124004208626, + "flos": 17090678672640.0, + "grad_norm": 2.126712012780947, + "language_loss": 0.76608211, + "learning_rate": 2.513906565661973e-06, + "loss": 0.78776658, + "num_input_tokens_seen": 155043375, + "step": 7231, + "time_per_iteration": 2.668262243270874 + }, + { + "auxiliary_loss_clip": 0.01085236, + "auxiliary_loss_mlp": 0.010365, + "balance_loss_clip": 1.04462624, + "balance_loss_mlp": 1.02319062, + "epoch": 0.4348113632947542, + "flos": 26104615862400.0, + "grad_norm": 1.4622957052763208, + "language_loss": 0.6875934, + "learning_rate": 2.513530170872575e-06, + "loss": 0.70881081, + "num_input_tokens_seen": 155062930, + "step": 7232, + "time_per_iteration": 2.7392327785491943 + }, + { + "auxiliary_loss_clip": 0.01098662, + "auxiliary_loss_mlp": 0.01036589, + "balance_loss_clip": 1.04562938, + "balance_loss_mlp": 1.02119923, + "epoch": 0.4348714865474222, + "flos": 34200496316160.0, + "grad_norm": 1.6380302947056737, + "language_loss": 0.72123957, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.74259216, + "num_input_tokens_seen": 155084980, + "step": 7233, + "time_per_iteration": 2.8322300910949707 + }, + { + "auxiliary_loss_clip": 0.01073793, + "auxiliary_loss_mlp": 0.01045709, + "balance_loss_clip": 1.04429805, + "balance_loss_mlp": 1.02930558, + "epoch": 0.43493160980009016, + "flos": 31537468869120.0, + "grad_norm": 1.5095585359817736, + "language_loss": 0.74440682, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.76560181, + "num_input_tokens_seen": 155107260, + "step": 7234, + "time_per_iteration": 2.9071762561798096 + }, + { + "auxiliary_loss_clip": 0.011103, + "auxiliary_loss_mlp": 0.01043772, + "balance_loss_clip": 1.04619622, + "balance_loss_mlp": 1.02835774, + "epoch": 0.4349917330527582, + "flos": 24061334699520.0, + "grad_norm": 2.005736270415063, + "language_loss": 0.59333825, + "learning_rate": 2.512400869722782e-06, + "loss": 0.61487895, + "num_input_tokens_seen": 155126720, + "step": 7235, + "time_per_iteration": 2.6738569736480713 + }, + { + "auxiliary_loss_clip": 0.01064764, + "auxiliary_loss_mlp": 0.01055431, + "balance_loss_clip": 1.03919065, + "balance_loss_mlp": 1.03892064, + "epoch": 0.43505185630542614, + "flos": 30519329863680.0, + "grad_norm": 1.6349929491691664, + "language_loss": 0.77779961, + "learning_rate": 2.512024397126566e-06, + "loss": 0.79900157, + "num_input_tokens_seen": 155148640, + "step": 7236, + "time_per_iteration": 2.8045287132263184 + }, + { + "auxiliary_loss_clip": 0.01129354, + "auxiliary_loss_mlp": 0.01036773, + "balance_loss_clip": 1.04843307, + "balance_loss_mlp": 1.02221155, + "epoch": 0.4351119795580941, + "flos": 15735158196480.0, + "grad_norm": 1.6962419767837338, + "language_loss": 0.81330889, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.83497024, + "num_input_tokens_seen": 155165870, + "step": 7237, + "time_per_iteration": 2.648671865463257 + }, + { + "auxiliary_loss_clip": 0.01115513, + "auxiliary_loss_mlp": 0.0103661, + "balance_loss_clip": 1.04350662, + "balance_loss_mlp": 1.02228153, + "epoch": 0.4351721028107621, + "flos": 18731760272640.0, + "grad_norm": 3.1516026268664485, + "language_loss": 0.62781835, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.64933956, + "num_input_tokens_seen": 155185315, + "step": 7238, + "time_per_iteration": 2.708812713623047 + }, + { + "auxiliary_loss_clip": 0.01093861, + "auxiliary_loss_mlp": 0.00771839, + "balance_loss_clip": 1.04551601, + "balance_loss_mlp": 1.00081944, + "epoch": 0.43523222606343004, + "flos": 25226887121280.0, + "grad_norm": 1.9011673436513334, + "language_loss": 0.85935599, + "learning_rate": 2.510894862898928e-06, + "loss": 0.87801301, + "num_input_tokens_seen": 155205790, + "step": 7239, + "time_per_iteration": 2.7664706707000732 + }, + { + "auxiliary_loss_clip": 0.01108836, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.04520702, + "balance_loss_mlp": 1.01814556, + "epoch": 0.435292349316098, + "flos": 22709190101760.0, + "grad_norm": 1.536559176560054, + "language_loss": 0.7257551, + "learning_rate": 2.510518312724309e-06, + "loss": 0.747168, + "num_input_tokens_seen": 155226475, + "step": 7240, + "time_per_iteration": 2.7275354862213135 + }, + { + "auxiliary_loss_clip": 0.01096929, + "auxiliary_loss_mlp": 0.01033031, + "balance_loss_clip": 1.04623103, + "balance_loss_mlp": 1.01821971, + "epoch": 0.43535247256876597, + "flos": 25775889569280.0, + "grad_norm": 2.0741794573690613, + "language_loss": 0.8174212, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.83872074, + "num_input_tokens_seen": 155247110, + "step": 7241, + "time_per_iteration": 2.7412314414978027 + }, + { + "auxiliary_loss_clip": 0.01104486, + "auxiliary_loss_mlp": 0.00773075, + "balance_loss_clip": 1.04755354, + "balance_loss_mlp": 1.000664, + "epoch": 0.43541259582143393, + "flos": 17528142412800.0, + "grad_norm": 2.5029472103375627, + "language_loss": 0.7954601, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.81423575, + "num_input_tokens_seen": 155261335, + "step": 7242, + "time_per_iteration": 2.7832155227661133 + }, + { + "auxiliary_loss_clip": 0.01105652, + "auxiliary_loss_mlp": 0.01038715, + "balance_loss_clip": 1.04170573, + "balance_loss_mlp": 1.0224551, + "epoch": 0.4354727190741019, + "flos": 15195205975680.0, + "grad_norm": 5.632863009629144, + "language_loss": 0.68174016, + "learning_rate": 2.509388546104138e-06, + "loss": 0.70318383, + "num_input_tokens_seen": 155278510, + "step": 7243, + "time_per_iteration": 2.731621742248535 + }, + { + "auxiliary_loss_clip": 0.01070337, + "auxiliary_loss_mlp": 0.01035269, + "balance_loss_clip": 1.04518962, + "balance_loss_mlp": 1.02096963, + "epoch": 0.43553284232676986, + "flos": 16649264436480.0, + "grad_norm": 1.737599591064028, + "language_loss": 0.81023276, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.83128881, + "num_input_tokens_seen": 155296450, + "step": 7244, + "time_per_iteration": 2.869999885559082 + }, + { + "auxiliary_loss_clip": 0.0107405, + "auxiliary_loss_mlp": 0.01033039, + "balance_loss_clip": 1.04502463, + "balance_loss_mlp": 1.01974106, + "epoch": 0.43559296557943783, + "flos": 23400865370880.0, + "grad_norm": 1.7613354011100055, + "language_loss": 0.73543227, + "learning_rate": 2.508635271753234e-06, + "loss": 0.75650311, + "num_input_tokens_seen": 155316080, + "step": 7245, + "time_per_iteration": 2.8238213062286377 + }, + { + "auxiliary_loss_clip": 0.01073655, + "auxiliary_loss_mlp": 0.01040109, + "balance_loss_clip": 1.042413, + "balance_loss_mlp": 1.02626252, + "epoch": 0.4356530888321058, + "flos": 22419067950720.0, + "grad_norm": 1.8556670419976653, + "language_loss": 0.76651436, + "learning_rate": 2.508258605639389e-06, + "loss": 0.78765202, + "num_input_tokens_seen": 155336765, + "step": 7246, + "time_per_iteration": 2.74566912651062 + }, + { + "auxiliary_loss_clip": 0.01117733, + "auxiliary_loss_mlp": 0.01046964, + "balance_loss_clip": 1.04482377, + "balance_loss_mlp": 1.03185987, + "epoch": 0.43571321208477376, + "flos": 21616141282560.0, + "grad_norm": 1.8292531725431629, + "language_loss": 0.85409153, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.8757385, + "num_input_tokens_seen": 155356440, + "step": 7247, + "time_per_iteration": 2.6183457374572754 + }, + { + "auxiliary_loss_clip": 0.01130523, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.047526, + "balance_loss_mlp": 1.02565122, + "epoch": 0.4357733353374418, + "flos": 23987358639360.0, + "grad_norm": 1.611147300467871, + "language_loss": 0.72544634, + "learning_rate": 2.507505215606333e-06, + "loss": 0.74714351, + "num_input_tokens_seen": 155377070, + "step": 7248, + "time_per_iteration": 2.614370822906494 + }, + { + "auxiliary_loss_clip": 0.01120332, + "auxiliary_loss_mlp": 0.01038636, + "balance_loss_clip": 1.0502224, + "balance_loss_mlp": 1.0246768, + "epoch": 0.43583345859010975, + "flos": 25264737077760.0, + "grad_norm": 1.6765876969892934, + "language_loss": 0.87089729, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.89248699, + "num_input_tokens_seen": 155398415, + "step": 7249, + "time_per_iteration": 2.6826605796813965 + }, + { + "auxiliary_loss_clip": 0.01113045, + "auxiliary_loss_mlp": 0.01045156, + "balance_loss_clip": 1.04740214, + "balance_loss_mlp": 1.03150034, + "epoch": 0.4358935818427777, + "flos": 23696302734720.0, + "grad_norm": 2.0541786270405495, + "language_loss": 0.81998801, + "learning_rate": 2.506751748594683e-06, + "loss": 0.84157008, + "num_input_tokens_seen": 155415625, + "step": 7250, + "time_per_iteration": 2.6470022201538086 + }, + { + "auxiliary_loss_clip": 0.01124271, + "auxiliary_loss_mlp": 0.01035047, + "balance_loss_clip": 1.05197597, + "balance_loss_mlp": 1.02089727, + "epoch": 0.4359537050954457, + "flos": 29532827761920.0, + "grad_norm": 1.9267289360456135, + "language_loss": 0.84933323, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.87092638, + "num_input_tokens_seen": 155435505, + "step": 7251, + "time_per_iteration": 2.665776014328003 + }, + { + "auxiliary_loss_clip": 0.01108984, + "auxiliary_loss_mlp": 0.01043132, + "balance_loss_clip": 1.04255629, + "balance_loss_mlp": 1.02783751, + "epoch": 0.43601382834811364, + "flos": 22711273090560.0, + "grad_norm": 2.7582881981862335, + "language_loss": 0.69538188, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.71690303, + "num_input_tokens_seen": 155455425, + "step": 7252, + "time_per_iteration": 2.644498825073242 + }, + { + "auxiliary_loss_clip": 0.01102038, + "auxiliary_loss_mlp": 0.01039697, + "balance_loss_clip": 1.04452658, + "balance_loss_mlp": 1.02410412, + "epoch": 0.4360739516007816, + "flos": 19098731571840.0, + "grad_norm": 2.1859211403409717, + "language_loss": 0.83621645, + "learning_rate": 2.505621403992348e-06, + "loss": 0.85763383, + "num_input_tokens_seen": 155474250, + "step": 7253, + "time_per_iteration": 2.662623882293701 + }, + { + "auxiliary_loss_clip": 0.01119158, + "auxiliary_loss_mlp": 0.01041761, + "balance_loss_clip": 1.04809666, + "balance_loss_mlp": 1.0271399, + "epoch": 0.43613407485344957, + "flos": 23404420817280.0, + "grad_norm": 1.5459938146205512, + "language_loss": 0.70561367, + "learning_rate": 2.505244584092757e-06, + "loss": 0.7272228, + "num_input_tokens_seen": 155494685, + "step": 7254, + "time_per_iteration": 2.677427053451538 + }, + { + "auxiliary_loss_clip": 0.01106538, + "auxiliary_loss_mlp": 0.01041179, + "balance_loss_clip": 1.04567051, + "balance_loss_mlp": 1.02734506, + "epoch": 0.43619419810611754, + "flos": 22637799820800.0, + "grad_norm": 1.8056505398017555, + "language_loss": 0.812729, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.83420616, + "num_input_tokens_seen": 155513040, + "step": 7255, + "time_per_iteration": 2.7150163650512695 + }, + { + "auxiliary_loss_clip": 0.01132135, + "auxiliary_loss_mlp": 0.01040806, + "balance_loss_clip": 1.04807031, + "balance_loss_mlp": 1.02626252, + "epoch": 0.4362543213587855, + "flos": 20047958334720.0, + "grad_norm": 1.9676871720710198, + "language_loss": 0.7780782, + "learning_rate": 2.504490886831089e-06, + "loss": 0.79980761, + "num_input_tokens_seen": 155530100, + "step": 7256, + "time_per_iteration": 2.551403522491455 + }, + { + "auxiliary_loss_clip": 0.0112974, + "auxiliary_loss_mlp": 0.01041375, + "balance_loss_clip": 1.04864502, + "balance_loss_mlp": 1.02721334, + "epoch": 0.43631444461145347, + "flos": 21361319222400.0, + "grad_norm": 1.9475980639851616, + "language_loss": 0.76180404, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.78351521, + "num_input_tokens_seen": 155549375, + "step": 7257, + "time_per_iteration": 2.6217384338378906 + }, + { + "auxiliary_loss_clip": 0.01120044, + "auxiliary_loss_mlp": 0.01042182, + "balance_loss_clip": 1.04656029, + "balance_loss_mlp": 1.02711391, + "epoch": 0.43637456786412143, + "flos": 22418529246720.0, + "grad_norm": 1.6554456872207661, + "language_loss": 0.73254454, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.75416678, + "num_input_tokens_seen": 155569395, + "step": 7258, + "time_per_iteration": 2.7399442195892334 + }, + { + "auxiliary_loss_clip": 0.01107425, + "auxiliary_loss_mlp": 0.01034778, + "balance_loss_clip": 1.0456903, + "balance_loss_mlp": 1.02084827, + "epoch": 0.4364346911167894, + "flos": 28548839612160.0, + "grad_norm": 2.059273423297749, + "language_loss": 0.76950562, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.79092765, + "num_input_tokens_seen": 155589090, + "step": 7259, + "time_per_iteration": 2.814030647277832 + }, + { + "auxiliary_loss_clip": 0.01025258, + "auxiliary_loss_mlp": 0.01002872, + "balance_loss_clip": 1.02231717, + "balance_loss_mlp": 1.0011797, + "epoch": 0.43649481436945736, + "flos": 62659345380480.0, + "grad_norm": 0.7406116287283647, + "language_loss": 0.56990582, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.59018713, + "num_input_tokens_seen": 155648660, + "step": 7260, + "time_per_iteration": 3.184105396270752 + }, + { + "auxiliary_loss_clip": 0.01114574, + "auxiliary_loss_mlp": 0.01046133, + "balance_loss_clip": 1.04780877, + "balance_loss_mlp": 1.03077888, + "epoch": 0.4365549376221254, + "flos": 30592120775040.0, + "grad_norm": 2.4789338774629024, + "language_loss": 0.71279275, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.73439986, + "num_input_tokens_seen": 155669945, + "step": 7261, + "time_per_iteration": 2.781569242477417 + }, + { + "auxiliary_loss_clip": 0.01084597, + "auxiliary_loss_mlp": 0.01054365, + "balance_loss_clip": 1.04558206, + "balance_loss_mlp": 1.0377475, + "epoch": 0.43661506087479335, + "flos": 17165875795200.0, + "grad_norm": 1.8767730803011844, + "language_loss": 0.69520628, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.71659589, + "num_input_tokens_seen": 155688555, + "step": 7262, + "time_per_iteration": 2.73209810256958 + }, + { + "auxiliary_loss_clip": 0.0106364, + "auxiliary_loss_mlp": 0.01034084, + "balance_loss_clip": 1.04300487, + "balance_loss_mlp": 1.02154875, + "epoch": 0.4366751841274613, + "flos": 22047499710720.0, + "grad_norm": 1.5954483681391127, + "language_loss": 0.79909682, + "learning_rate": 2.501852344559726e-06, + "loss": 0.82007402, + "num_input_tokens_seen": 155705370, + "step": 7263, + "time_per_iteration": 2.7780513763427734 + }, + { + "auxiliary_loss_clip": 0.01093795, + "auxiliary_loss_mlp": 0.01046831, + "balance_loss_clip": 1.0481534, + "balance_loss_mlp": 1.03220403, + "epoch": 0.4367353073801293, + "flos": 15997306631040.0, + "grad_norm": 1.6219151151282696, + "language_loss": 0.7545082, + "learning_rate": 2.50147533371401e-06, + "loss": 0.77591443, + "num_input_tokens_seen": 155721890, + "step": 7264, + "time_per_iteration": 4.158029079437256 + }, + { + "auxiliary_loss_clip": 0.01079604, + "auxiliary_loss_mlp": 0.01037561, + "balance_loss_clip": 1.04681587, + "balance_loss_mlp": 1.02243328, + "epoch": 0.43679543063279724, + "flos": 38217535868160.0, + "grad_norm": 2.5655359697781854, + "language_loss": 0.61799812, + "learning_rate": 2.501098303852298e-06, + "loss": 0.63916975, + "num_input_tokens_seen": 155743970, + "step": 7265, + "time_per_iteration": 4.454209804534912 + }, + { + "auxiliary_loss_clip": 0.01105521, + "auxiliary_loss_mlp": 0.01031338, + "balance_loss_clip": 1.04521823, + "balance_loss_mlp": 1.01762891, + "epoch": 0.4368555538854652, + "flos": 15193230727680.0, + "grad_norm": 2.0447032285328004, + "language_loss": 0.72610664, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.74747527, + "num_input_tokens_seen": 155761830, + "step": 7266, + "time_per_iteration": 4.213090181350708 + }, + { + "auxiliary_loss_clip": 0.0110385, + "auxiliary_loss_mlp": 0.01037912, + "balance_loss_clip": 1.04488015, + "balance_loss_mlp": 1.02356541, + "epoch": 0.4369156771381332, + "flos": 23069086421760.0, + "grad_norm": 1.8602157597317315, + "language_loss": 0.82307518, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.84449285, + "num_input_tokens_seen": 155779610, + "step": 7267, + "time_per_iteration": 2.6675074100494385 + }, + { + "auxiliary_loss_clip": 0.01126927, + "auxiliary_loss_mlp": 0.01028504, + "balance_loss_clip": 1.04546976, + "balance_loss_mlp": 1.01499796, + "epoch": 0.43697580039080114, + "flos": 23441085624960.0, + "grad_norm": 2.021840044875845, + "language_loss": 0.74740797, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.76896226, + "num_input_tokens_seen": 155798765, + "step": 7268, + "time_per_iteration": 2.6228766441345215 + }, + { + "auxiliary_loss_clip": 0.01135364, + "auxiliary_loss_mlp": 0.01041324, + "balance_loss_clip": 1.04851401, + "balance_loss_mlp": 1.02567148, + "epoch": 0.4370359236434691, + "flos": 18514680428160.0, + "grad_norm": 2.5093195722714365, + "language_loss": 0.80133688, + "learning_rate": 2.499589994531454e-06, + "loss": 0.82310379, + "num_input_tokens_seen": 155817750, + "step": 7269, + "time_per_iteration": 4.289510726928711 + }, + { + "auxiliary_loss_clip": 0.01110775, + "auxiliary_loss_mlp": 0.01036898, + "balance_loss_clip": 1.04814553, + "balance_loss_mlp": 1.02253354, + "epoch": 0.43709604689613707, + "flos": 23222497409280.0, + "grad_norm": 1.7772509501505356, + "language_loss": 0.74977714, + "learning_rate": 2.499212869804237e-06, + "loss": 0.77125382, + "num_input_tokens_seen": 155836490, + "step": 7270, + "time_per_iteration": 2.7519397735595703 + }, + { + "auxiliary_loss_clip": 0.01068873, + "auxiliary_loss_mlp": 0.01045139, + "balance_loss_clip": 1.04005837, + "balance_loss_mlp": 1.02886677, + "epoch": 0.43715617014880503, + "flos": 23803711378560.0, + "grad_norm": 1.9652522706029574, + "language_loss": 0.79716229, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.81830239, + "num_input_tokens_seen": 155856225, + "step": 7271, + "time_per_iteration": 2.8002872467041016 + }, + { + "auxiliary_loss_clip": 0.01036128, + "auxiliary_loss_mlp": 0.01021454, + "balance_loss_clip": 1.01824927, + "balance_loss_mlp": 1.01974964, + "epoch": 0.437216293401473, + "flos": 61941204766080.0, + "grad_norm": 0.7022630698763936, + "language_loss": 0.54855651, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.56913233, + "num_input_tokens_seen": 155916770, + "step": 7272, + "time_per_iteration": 3.1893959045410156 + }, + { + "auxiliary_loss_clip": 0.0113475, + "auxiliary_loss_mlp": 0.01041916, + "balance_loss_clip": 1.0497241, + "balance_loss_mlp": 1.02704489, + "epoch": 0.43727641665414096, + "flos": 21982250655360.0, + "grad_norm": 1.6582852351426143, + "language_loss": 0.69981074, + "learning_rate": 2.498081382098581e-06, + "loss": 0.72157741, + "num_input_tokens_seen": 155936490, + "step": 7273, + "time_per_iteration": 2.622006893157959 + }, + { + "auxiliary_loss_clip": 0.01109468, + "auxiliary_loss_mlp": 0.01050566, + "balance_loss_clip": 1.04725552, + "balance_loss_mlp": 1.03434145, + "epoch": 0.437336539906809, + "flos": 39530860842240.0, + "grad_norm": 7.356047522605187, + "language_loss": 0.75699592, + "learning_rate": 2.497704181736367e-06, + "loss": 0.77859622, + "num_input_tokens_seen": 155957595, + "step": 7274, + "time_per_iteration": 2.850834846496582 + }, + { + "auxiliary_loss_clip": 0.0111429, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.04778564, + "balance_loss_mlp": 1.01473844, + "epoch": 0.43739666315947695, + "flos": 17457147181440.0, + "grad_norm": 1.6567651402589496, + "language_loss": 0.80280751, + "learning_rate": 2.49732696250116e-06, + "loss": 0.82422453, + "num_input_tokens_seen": 155975710, + "step": 7275, + "time_per_iteration": 2.638493776321411 + }, + { + "auxiliary_loss_clip": 0.01107442, + "auxiliary_loss_mlp": 0.01039938, + "balance_loss_clip": 1.04763556, + "balance_loss_mlp": 1.02628231, + "epoch": 0.4374567864121449, + "flos": 16358747235840.0, + "grad_norm": 1.960961081760492, + "language_loss": 0.81285107, + "learning_rate": 2.496949724407266e-06, + "loss": 0.83432496, + "num_input_tokens_seen": 155993090, + "step": 7276, + "time_per_iteration": 2.665069341659546 + }, + { + "auxiliary_loss_clip": 0.01119385, + "auxiliary_loss_mlp": 0.01033929, + "balance_loss_clip": 1.05310118, + "balance_loss_mlp": 1.01923609, + "epoch": 0.4375169096648129, + "flos": 30587523834240.0, + "grad_norm": 1.9041346547019917, + "language_loss": 0.7327143, + "learning_rate": 2.496572467468988e-06, + "loss": 0.75424743, + "num_input_tokens_seen": 156013685, + "step": 7277, + "time_per_iteration": 2.7329320907592773 + }, + { + "auxiliary_loss_clip": 0.01109724, + "auxiliary_loss_mlp": 0.0077177, + "balance_loss_clip": 1.04805493, + "balance_loss_mlp": 1.00070667, + "epoch": 0.43757703291748085, + "flos": 30555599621760.0, + "grad_norm": 1.7992627956176412, + "language_loss": 0.73366892, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.7524839, + "num_input_tokens_seen": 156034300, + "step": 7278, + "time_per_iteration": 2.7531094551086426 + }, + { + "auxiliary_loss_clip": 0.01094743, + "auxiliary_loss_mlp": 0.0103965, + "balance_loss_clip": 1.0471983, + "balance_loss_mlp": 1.02677512, + "epoch": 0.4376371561701488, + "flos": 21397373498880.0, + "grad_norm": 1.4932293615412522, + "language_loss": 0.66024888, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.68159282, + "num_input_tokens_seen": 156053805, + "step": 7279, + "time_per_iteration": 2.671842336654663 + }, + { + "auxiliary_loss_clip": 0.01139939, + "auxiliary_loss_mlp": 0.01037875, + "balance_loss_clip": 1.05298817, + "balance_loss_mlp": 1.02337885, + "epoch": 0.4376972794228168, + "flos": 23404384903680.0, + "grad_norm": 1.7693107777348598, + "language_loss": 0.81793606, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.83971423, + "num_input_tokens_seen": 156073295, + "step": 7280, + "time_per_iteration": 2.588303565979004 + }, + { + "auxiliary_loss_clip": 0.01106326, + "auxiliary_loss_mlp": 0.01031835, + "balance_loss_clip": 1.04587424, + "balance_loss_mlp": 1.01867414, + "epoch": 0.43775740267548474, + "flos": 22892945103360.0, + "grad_norm": 1.5627499875085749, + "language_loss": 0.77005875, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.79144037, + "num_input_tokens_seen": 156094540, + "step": 7281, + "time_per_iteration": 2.6939706802368164 + }, + { + "auxiliary_loss_clip": 0.011079, + "auxiliary_loss_mlp": 0.01037268, + "balance_loss_clip": 1.04824066, + "balance_loss_mlp": 1.02360058, + "epoch": 0.4378175259281527, + "flos": 23294390480640.0, + "grad_norm": 1.8010941727109018, + "language_loss": 0.75983417, + "learning_rate": 2.494685900612569e-06, + "loss": 0.78128588, + "num_input_tokens_seen": 156114070, + "step": 7282, + "time_per_iteration": 2.6834237575531006 + }, + { + "auxiliary_loss_clip": 0.01092611, + "auxiliary_loss_mlp": 0.01040673, + "balance_loss_clip": 1.04627228, + "balance_loss_mlp": 1.02654076, + "epoch": 0.43787764918082067, + "flos": 23876897339520.0, + "grad_norm": 2.1500126437968925, + "language_loss": 0.85044593, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.87177879, + "num_input_tokens_seen": 156132130, + "step": 7283, + "time_per_iteration": 2.7042722702026367 + }, + { + "auxiliary_loss_clip": 0.01111303, + "auxiliary_loss_mlp": 0.01037633, + "balance_loss_clip": 1.04814124, + "balance_loss_mlp": 1.02266598, + "epoch": 0.43793777243348864, + "flos": 23988148738560.0, + "grad_norm": 14.144168664775597, + "language_loss": 0.80311596, + "learning_rate": 2.49393114246007e-06, + "loss": 0.82460535, + "num_input_tokens_seen": 156150820, + "step": 7284, + "time_per_iteration": 2.676689863204956 + }, + { + "auxiliary_loss_clip": 0.01123026, + "auxiliary_loss_mlp": 0.01038411, + "balance_loss_clip": 1.04910016, + "balance_loss_mlp": 1.02514315, + "epoch": 0.4379978956861566, + "flos": 18624064320000.0, + "grad_norm": 2.0075840095153925, + "language_loss": 0.80086255, + "learning_rate": 2.493553735281787e-06, + "loss": 0.82247692, + "num_input_tokens_seen": 156170125, + "step": 7285, + "time_per_iteration": 2.6446423530578613 + }, + { + "auxiliary_loss_clip": 0.01121831, + "auxiliary_loss_mlp": 0.01030999, + "balance_loss_clip": 1.04847312, + "balance_loss_mlp": 1.0175761, + "epoch": 0.43805801893882457, + "flos": 21981388728960.0, + "grad_norm": 2.1352627983894545, + "language_loss": 0.7498579, + "learning_rate": 2.493176309387897e-06, + "loss": 0.77138615, + "num_input_tokens_seen": 156187320, + "step": 7286, + "time_per_iteration": 2.6779184341430664 + }, + { + "auxiliary_loss_clip": 0.01095439, + "auxiliary_loss_mlp": 0.01032312, + "balance_loss_clip": 1.04372525, + "balance_loss_mlp": 1.0179832, + "epoch": 0.43811814219149253, + "flos": 26393337383040.0, + "grad_norm": 1.5473009908217328, + "language_loss": 0.73641115, + "learning_rate": 2.492798864792712e-06, + "loss": 0.75768864, + "num_input_tokens_seen": 156207455, + "step": 7287, + "time_per_iteration": 2.867501735687256 + }, + { + "auxiliary_loss_clip": 0.0111224, + "auxiliary_loss_mlp": 0.01045008, + "balance_loss_clip": 1.05047917, + "balance_loss_mlp": 1.03040457, + "epoch": 0.43817826544416055, + "flos": 17493309198720.0, + "grad_norm": 1.6804566494971647, + "language_loss": 0.8243767, + "learning_rate": 2.492421401510545e-06, + "loss": 0.84594917, + "num_input_tokens_seen": 156226560, + "step": 7288, + "time_per_iteration": 2.677922010421753 + }, + { + "auxiliary_loss_clip": 0.01094679, + "auxiliary_loss_mlp": 0.01031914, + "balance_loss_clip": 1.04326773, + "balance_loss_mlp": 1.01793718, + "epoch": 0.4382383886968285, + "flos": 21581020759680.0, + "grad_norm": 1.441403002582157, + "language_loss": 0.84301102, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.86427689, + "num_input_tokens_seen": 156246740, + "step": 7289, + "time_per_iteration": 2.8586435317993164 + }, + { + "auxiliary_loss_clip": 0.0109844, + "auxiliary_loss_mlp": 0.01052991, + "balance_loss_clip": 1.04162121, + "balance_loss_mlp": 1.03685021, + "epoch": 0.4382985119494965, + "flos": 27923742201600.0, + "grad_norm": 1.6202567248687665, + "language_loss": 0.78218126, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.80369556, + "num_input_tokens_seen": 156266440, + "step": 7290, + "time_per_iteration": 2.7211575508117676 + }, + { + "auxiliary_loss_clip": 0.01132305, + "auxiliary_loss_mlp": 0.01039679, + "balance_loss_clip": 1.05053866, + "balance_loss_mlp": 1.02617884, + "epoch": 0.43835863520216445, + "flos": 24936836797440.0, + "grad_norm": 1.8734686520238957, + "language_loss": 0.78314757, + "learning_rate": 2.491288899685288e-06, + "loss": 0.80486739, + "num_input_tokens_seen": 156286900, + "step": 7291, + "time_per_iteration": 2.629904270172119 + }, + { + "auxiliary_loss_clip": 0.0109159, + "auxiliary_loss_mlp": 0.01033172, + "balance_loss_clip": 1.04265332, + "balance_loss_mlp": 1.0194335, + "epoch": 0.4384187584548324, + "flos": 33510293504640.0, + "grad_norm": 1.5839432646062752, + "language_loss": 0.6487931, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.67004073, + "num_input_tokens_seen": 156307690, + "step": 7292, + "time_per_iteration": 2.7952499389648438 + }, + { + "auxiliary_loss_clip": 0.01112801, + "auxiliary_loss_mlp": 0.01036982, + "balance_loss_clip": 1.04319155, + "balance_loss_mlp": 1.0226171, + "epoch": 0.4384788817075004, + "flos": 23951052967680.0, + "grad_norm": 1.6336411060838572, + "language_loss": 0.74232095, + "learning_rate": 2.49053380529597e-06, + "loss": 0.7638188, + "num_input_tokens_seen": 156326620, + "step": 7293, + "time_per_iteration": 2.636462688446045 + }, + { + "auxiliary_loss_clip": 0.01098755, + "auxiliary_loss_mlp": 0.01037795, + "balance_loss_clip": 1.0494585, + "balance_loss_mlp": 1.02318609, + "epoch": 0.43853900496016834, + "flos": 19098516090240.0, + "grad_norm": 4.136423906080754, + "language_loss": 0.78758669, + "learning_rate": 2.490156230192516e-06, + "loss": 0.80895221, + "num_input_tokens_seen": 156345495, + "step": 7294, + "time_per_iteration": 2.670069456100464 + }, + { + "auxiliary_loss_clip": 0.01089917, + "auxiliary_loss_mlp": 0.01038854, + "balance_loss_clip": 1.04422832, + "balance_loss_mlp": 1.02485299, + "epoch": 0.4385991282128363, + "flos": 13225362168960.0, + "grad_norm": 1.7954692393859477, + "language_loss": 0.7296086, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.75089628, + "num_input_tokens_seen": 156363155, + "step": 7295, + "time_per_iteration": 2.7159199714660645 + }, + { + "auxiliary_loss_clip": 0.01090098, + "auxiliary_loss_mlp": 0.01044926, + "balance_loss_clip": 1.04397202, + "balance_loss_mlp": 1.02860653, + "epoch": 0.4386592514655043, + "flos": 14319883445760.0, + "grad_norm": 1.6136170201094728, + "language_loss": 0.75463378, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.77598405, + "num_input_tokens_seen": 156380940, + "step": 7296, + "time_per_iteration": 2.7475438117980957 + }, + { + "auxiliary_loss_clip": 0.01118725, + "auxiliary_loss_mlp": 0.01032483, + "balance_loss_clip": 1.04859519, + "balance_loss_mlp": 1.0183568, + "epoch": 0.43871937471817224, + "flos": 22784423137920.0, + "grad_norm": 1.7142829102326689, + "language_loss": 0.69474953, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.71626163, + "num_input_tokens_seen": 156400415, + "step": 7297, + "time_per_iteration": 2.6689095497131348 + }, + { + "auxiliary_loss_clip": 0.01111936, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.04589987, + "balance_loss_mlp": 1.02004242, + "epoch": 0.4387794979708402, + "flos": 28072304853120.0, + "grad_norm": 2.137486700340973, + "language_loss": 0.70327055, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.72472441, + "num_input_tokens_seen": 156421120, + "step": 7298, + "time_per_iteration": 2.7896294593811035 + }, + { + "auxiliary_loss_clip": 0.01117974, + "auxiliary_loss_mlp": 0.0102859, + "balance_loss_clip": 1.0481534, + "balance_loss_mlp": 1.01508379, + "epoch": 0.43883962122350817, + "flos": 26249551240320.0, + "grad_norm": 1.5518132007083414, + "language_loss": 0.72407347, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.74553907, + "num_input_tokens_seen": 156441535, + "step": 7299, + "time_per_iteration": 2.724134922027588 + }, + { + "auxiliary_loss_clip": 0.01100992, + "auxiliary_loss_mlp": 0.00773554, + "balance_loss_clip": 1.04556322, + "balance_loss_mlp": 1.00063753, + "epoch": 0.43889974447617613, + "flos": 25883765089920.0, + "grad_norm": 1.9116194577137513, + "language_loss": 0.7702527, + "learning_rate": 2.487890389750719e-06, + "loss": 0.78899813, + "num_input_tokens_seen": 156462015, + "step": 7300, + "time_per_iteration": 2.754582166671753 + }, + { + "auxiliary_loss_clip": 0.01105938, + "auxiliary_loss_mlp": 0.01033126, + "balance_loss_clip": 1.04505253, + "balance_loss_mlp": 1.01922047, + "epoch": 0.43895986772884416, + "flos": 25046615738880.0, + "grad_norm": 1.6899733258560021, + "language_loss": 0.70417237, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.72556305, + "num_input_tokens_seen": 156482165, + "step": 7301, + "time_per_iteration": 2.8213343620300293 + }, + { + "auxiliary_loss_clip": 0.01082543, + "auxiliary_loss_mlp": 0.01042943, + "balance_loss_clip": 1.04282618, + "balance_loss_mlp": 1.0270884, + "epoch": 0.4390199909815121, + "flos": 25994585525760.0, + "grad_norm": 1.824867215084726, + "language_loss": 0.70808041, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.72933531, + "num_input_tokens_seen": 156503170, + "step": 7302, + "time_per_iteration": 2.7875969409942627 + }, + { + "auxiliary_loss_clip": 0.01107602, + "auxiliary_loss_mlp": 0.01039104, + "balance_loss_clip": 1.04878247, + "balance_loss_mlp": 1.02599669, + "epoch": 0.4390801142341801, + "flos": 29022249888000.0, + "grad_norm": 1.5936626078522842, + "language_loss": 0.82381457, + "learning_rate": 2.486757219574983e-06, + "loss": 0.8452816, + "num_input_tokens_seen": 156523005, + "step": 7303, + "time_per_iteration": 2.838871717453003 + }, + { + "auxiliary_loss_clip": 0.01116821, + "auxiliary_loss_mlp": 0.01046972, + "balance_loss_clip": 1.04648411, + "balance_loss_mlp": 1.03164792, + "epoch": 0.43914023748684805, + "flos": 33438544087680.0, + "grad_norm": 10.027739157490931, + "language_loss": 0.69036293, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.71200085, + "num_input_tokens_seen": 156544440, + "step": 7304, + "time_per_iteration": 5.9847636222839355 + }, + { + "auxiliary_loss_clip": 0.01105223, + "auxiliary_loss_mlp": 0.00770446, + "balance_loss_clip": 1.04475939, + "balance_loss_mlp": 1.0005337, + "epoch": 0.439200360739516, + "flos": 34531844302080.0, + "grad_norm": 1.5264108649470638, + "language_loss": 0.78100759, + "learning_rate": 2.486001680477873e-06, + "loss": 0.79976428, + "num_input_tokens_seen": 156565410, + "step": 7305, + "time_per_iteration": 4.283693313598633 + }, + { + "auxiliary_loss_clip": 0.01102752, + "auxiliary_loss_mlp": 0.01034686, + "balance_loss_clip": 1.04440284, + "balance_loss_mlp": 1.02097106, + "epoch": 0.439260483992184, + "flos": 21907843632000.0, + "grad_norm": 1.7445713343884877, + "language_loss": 0.68756545, + "learning_rate": 2.485623883278308e-06, + "loss": 0.70893979, + "num_input_tokens_seen": 156584210, + "step": 7306, + "time_per_iteration": 2.7069246768951416 + }, + { + "auxiliary_loss_clip": 0.01089881, + "auxiliary_loss_mlp": 0.01031704, + "balance_loss_clip": 1.0450325, + "balance_loss_mlp": 1.01757789, + "epoch": 0.43932060724485195, + "flos": 20996430912000.0, + "grad_norm": 2.2471251539247428, + "language_loss": 0.62507868, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.64629447, + "num_input_tokens_seen": 156602730, + "step": 7307, + "time_per_iteration": 2.719836950302124 + }, + { + "auxiliary_loss_clip": 0.01130769, + "auxiliary_loss_mlp": 0.01032376, + "balance_loss_clip": 1.04645061, + "balance_loss_mlp": 1.0188818, + "epoch": 0.4393807304975199, + "flos": 17747053850880.0, + "grad_norm": 1.9621539490577573, + "language_loss": 0.71752089, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.73915237, + "num_input_tokens_seen": 156619405, + "step": 7308, + "time_per_iteration": 4.218705892562866 + }, + { + "auxiliary_loss_clip": 0.0110959, + "auxiliary_loss_mlp": 0.01034032, + "balance_loss_clip": 1.0438807, + "balance_loss_mlp": 1.020859, + "epoch": 0.4394408537501879, + "flos": 22528523669760.0, + "grad_norm": 1.855171270613647, + "language_loss": 0.76671213, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.78814828, + "num_input_tokens_seen": 156638165, + "step": 7309, + "time_per_iteration": 2.726790428161621 + }, + { + "auxiliary_loss_clip": 0.01111334, + "auxiliary_loss_mlp": 0.01031706, + "balance_loss_clip": 1.04383993, + "balance_loss_mlp": 1.01888466, + "epoch": 0.43950097700285584, + "flos": 23440654661760.0, + "grad_norm": 1.9900388133775502, + "language_loss": 0.7067014, + "learning_rate": 2.484112510474251e-06, + "loss": 0.72813171, + "num_input_tokens_seen": 156658845, + "step": 7310, + "time_per_iteration": 2.644737958908081 + }, + { + "auxiliary_loss_clip": 0.01099363, + "auxiliary_loss_mlp": 0.00771301, + "balance_loss_clip": 1.04282653, + "balance_loss_mlp": 1.00065351, + "epoch": 0.4395611002555238, + "flos": 23180696956800.0, + "grad_norm": 2.0308560550957813, + "language_loss": 0.76245713, + "learning_rate": 2.483734621343429e-06, + "loss": 0.78116381, + "num_input_tokens_seen": 156677275, + "step": 7311, + "time_per_iteration": 2.676393985748291 + }, + { + "auxiliary_loss_clip": 0.01118807, + "auxiliary_loss_mlp": 0.01036972, + "balance_loss_clip": 1.04605961, + "balance_loss_mlp": 1.02365649, + "epoch": 0.43962122350819177, + "flos": 22127365601280.0, + "grad_norm": 1.941188934607737, + "language_loss": 0.81554043, + "learning_rate": 2.483356713869341e-06, + "loss": 0.83709824, + "num_input_tokens_seen": 156695815, + "step": 7312, + "time_per_iteration": 2.734691858291626 + }, + { + "auxiliary_loss_clip": 0.01099053, + "auxiliary_loss_mlp": 0.01030855, + "balance_loss_clip": 1.04661798, + "balance_loss_mlp": 1.01802182, + "epoch": 0.43968134676085974, + "flos": 17420554200960.0, + "grad_norm": 4.309677618927981, + "language_loss": 0.85387003, + "learning_rate": 2.482978788066318e-06, + "loss": 0.8751691, + "num_input_tokens_seen": 156714385, + "step": 7313, + "time_per_iteration": 2.7130918502807617 + }, + { + "auxiliary_loss_clip": 0.01101603, + "auxiliary_loss_mlp": 0.01034502, + "balance_loss_clip": 1.04015613, + "balance_loss_mlp": 1.02131104, + "epoch": 0.43974147001352776, + "flos": 18952646958720.0, + "grad_norm": 1.7624997398560822, + "language_loss": 0.67982185, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.70118284, + "num_input_tokens_seen": 156732615, + "step": 7314, + "time_per_iteration": 2.660019636154175 + }, + { + "auxiliary_loss_clip": 0.01107647, + "auxiliary_loss_mlp": 0.01029957, + "balance_loss_clip": 1.04436517, + "balance_loss_mlp": 1.01645088, + "epoch": 0.4398015932661957, + "flos": 18953508885120.0, + "grad_norm": 1.864599678602129, + "language_loss": 0.76799178, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.78936785, + "num_input_tokens_seen": 156750920, + "step": 7315, + "time_per_iteration": 2.6958022117614746 + }, + { + "auxiliary_loss_clip": 0.01103713, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.04664755, + "balance_loss_mlp": 1.02002192, + "epoch": 0.4398617165188637, + "flos": 24199913370240.0, + "grad_norm": 2.581770130909348, + "language_loss": 0.74439812, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.76576865, + "num_input_tokens_seen": 156768520, + "step": 7316, + "time_per_iteration": 2.7142746448516846 + }, + { + "auxiliary_loss_clip": 0.01091829, + "auxiliary_loss_mlp": 0.01038409, + "balance_loss_clip": 1.04720306, + "balance_loss_mlp": 1.02546883, + "epoch": 0.43992183977153165, + "flos": 22236677665920.0, + "grad_norm": 2.381148700310756, + "language_loss": 0.64676511, + "learning_rate": 2.481466901851506e-06, + "loss": 0.66806751, + "num_input_tokens_seen": 156788700, + "step": 7317, + "time_per_iteration": 2.6647984981536865 + }, + { + "auxiliary_loss_clip": 0.01100358, + "auxiliary_loss_mlp": 0.01036318, + "balance_loss_clip": 1.04315925, + "balance_loss_mlp": 1.02252579, + "epoch": 0.4399819630241996, + "flos": 18697465762560.0, + "grad_norm": 2.00656387252293, + "language_loss": 0.79769003, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.81905675, + "num_input_tokens_seen": 156806470, + "step": 7318, + "time_per_iteration": 2.6569128036499023 + }, + { + "auxiliary_loss_clip": 0.01085209, + "auxiliary_loss_mlp": 0.0104302, + "balance_loss_clip": 1.03973842, + "balance_loss_mlp": 1.02808332, + "epoch": 0.4400420862768676, + "flos": 23879375377920.0, + "grad_norm": 1.4911827600564649, + "language_loss": 0.79173744, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.81301975, + "num_input_tokens_seen": 156825895, + "step": 7319, + "time_per_iteration": 2.7476212978363037 + }, + { + "auxiliary_loss_clip": 0.01110516, + "auxiliary_loss_mlp": 0.01041368, + "balance_loss_clip": 1.0416882, + "balance_loss_mlp": 1.02647328, + "epoch": 0.44010220952953555, + "flos": 28037615293440.0, + "grad_norm": 1.9147413156076512, + "language_loss": 0.80129063, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.82280946, + "num_input_tokens_seen": 156845990, + "step": 7320, + "time_per_iteration": 2.716813802719116 + }, + { + "auxiliary_loss_clip": 0.01088202, + "auxiliary_loss_mlp": 0.01041527, + "balance_loss_clip": 1.04271483, + "balance_loss_mlp": 1.02871788, + "epoch": 0.4401623327822035, + "flos": 23768985905280.0, + "grad_norm": 3.0980421986856777, + "language_loss": 0.69580001, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.71709728, + "num_input_tokens_seen": 156866685, + "step": 7321, + "time_per_iteration": 2.753053903579712 + }, + { + "auxiliary_loss_clip": 0.01016924, + "auxiliary_loss_mlp": 0.01013574, + "balance_loss_clip": 1.02610326, + "balance_loss_mlp": 1.01153517, + "epoch": 0.4402224560348715, + "flos": 70774583264640.0, + "grad_norm": 0.8888992176827548, + "language_loss": 0.56922823, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.58953327, + "num_input_tokens_seen": 156923450, + "step": 7322, + "time_per_iteration": 3.3513524532318115 + }, + { + "auxiliary_loss_clip": 0.01073209, + "auxiliary_loss_mlp": 0.01039777, + "balance_loss_clip": 1.03671217, + "balance_loss_mlp": 1.02677715, + "epoch": 0.44028257928753944, + "flos": 22891795868160.0, + "grad_norm": 1.5589182914821764, + "language_loss": 0.76272774, + "learning_rate": 2.479198525097822e-06, + "loss": 0.78385758, + "num_input_tokens_seen": 156944795, + "step": 7323, + "time_per_iteration": 2.7524306774139404 + }, + { + "auxiliary_loss_clip": 0.01119465, + "auxiliary_loss_mlp": 0.01043388, + "balance_loss_clip": 1.04591155, + "balance_loss_mlp": 1.0296607, + "epoch": 0.4403427025402074, + "flos": 17895760156800.0, + "grad_norm": 1.5124862196762965, + "language_loss": 0.80590653, + "learning_rate": 2.478820398622511e-06, + "loss": 0.82753503, + "num_input_tokens_seen": 156962755, + "step": 7324, + "time_per_iteration": 2.6558468341827393 + }, + { + "auxiliary_loss_clip": 0.01025531, + "auxiliary_loss_mlp": 0.0100492, + "balance_loss_clip": 1.02356136, + "balance_loss_mlp": 1.00322747, + "epoch": 0.4404028257928754, + "flos": 69562525708800.0, + "grad_norm": 0.6843753140185513, + "language_loss": 0.54592586, + "learning_rate": 2.478442253990283e-06, + "loss": 0.5662303, + "num_input_tokens_seen": 157028095, + "step": 7325, + "time_per_iteration": 3.228588819503784 + }, + { + "auxiliary_loss_clip": 0.01128033, + "auxiliary_loss_mlp": 0.01028317, + "balance_loss_clip": 1.04957604, + "balance_loss_mlp": 1.0163784, + "epoch": 0.44046294904554334, + "flos": 20923675914240.0, + "grad_norm": 1.4618535572581854, + "language_loss": 0.70052326, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.72208667, + "num_input_tokens_seen": 157048365, + "step": 7326, + "time_per_iteration": 2.643843650817871 + }, + { + "auxiliary_loss_clip": 0.01081906, + "auxiliary_loss_mlp": 0.010325, + "balance_loss_clip": 1.03812075, + "balance_loss_mlp": 1.01949978, + "epoch": 0.44052307229821136, + "flos": 23623475909760.0, + "grad_norm": 1.533904509031544, + "language_loss": 0.76754719, + "learning_rate": 2.477685910312432e-06, + "loss": 0.78869128, + "num_input_tokens_seen": 157069130, + "step": 7327, + "time_per_iteration": 2.7409613132476807 + }, + { + "auxiliary_loss_clip": 0.01097799, + "auxiliary_loss_mlp": 0.01038346, + "balance_loss_clip": 1.04025364, + "balance_loss_mlp": 1.0256505, + "epoch": 0.4405831955508793, + "flos": 17597665186560.0, + "grad_norm": 1.9457575580966853, + "language_loss": 0.8413341, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.86269557, + "num_input_tokens_seen": 157084940, + "step": 7328, + "time_per_iteration": 2.6578822135925293 + }, + { + "auxiliary_loss_clip": 0.01102477, + "auxiliary_loss_mlp": 0.01028668, + "balance_loss_clip": 1.04432774, + "balance_loss_mlp": 1.01576972, + "epoch": 0.4406433188035473, + "flos": 21463376739840.0, + "grad_norm": 2.377465022226765, + "language_loss": 0.77753079, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.79884225, + "num_input_tokens_seen": 157102770, + "step": 7329, + "time_per_iteration": 2.6732001304626465 + }, + { + "auxiliary_loss_clip": 0.01114069, + "auxiliary_loss_mlp": 0.01039308, + "balance_loss_clip": 1.04399741, + "balance_loss_mlp": 1.02568269, + "epoch": 0.44070344205621526, + "flos": 22673566788480.0, + "grad_norm": 1.63533295854216, + "language_loss": 0.73525596, + "learning_rate": 2.476551258977278e-06, + "loss": 0.75678968, + "num_input_tokens_seen": 157122035, + "step": 7330, + "time_per_iteration": 2.6258528232574463 + }, + { + "auxiliary_loss_clip": 0.01104463, + "auxiliary_loss_mlp": 0.01039279, + "balance_loss_clip": 1.04494476, + "balance_loss_mlp": 1.02678585, + "epoch": 0.4407635653088832, + "flos": 23441193365760.0, + "grad_norm": 1.852759340776506, + "language_loss": 0.74862218, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.77005959, + "num_input_tokens_seen": 157142800, + "step": 7331, + "time_per_iteration": 2.767972469329834 + }, + { + "auxiliary_loss_clip": 0.01075234, + "auxiliary_loss_mlp": 0.01034744, + "balance_loss_clip": 1.04043937, + "balance_loss_mlp": 1.02114189, + "epoch": 0.4408236885615512, + "flos": 24021294013440.0, + "grad_norm": 1.4106194210898035, + "language_loss": 0.76326358, + "learning_rate": 2.475794734375581e-06, + "loss": 0.78436339, + "num_input_tokens_seen": 157163295, + "step": 7332, + "time_per_iteration": 2.7810683250427246 + }, + { + "auxiliary_loss_clip": 0.01099425, + "auxiliary_loss_mlp": 0.01041411, + "balance_loss_clip": 1.04447377, + "balance_loss_mlp": 1.02958584, + "epoch": 0.44088381181421915, + "flos": 12676826597760.0, + "grad_norm": 1.919719554260373, + "language_loss": 0.73795688, + "learning_rate": 2.475416445004285e-06, + "loss": 0.75936526, + "num_input_tokens_seen": 157180890, + "step": 7333, + "time_per_iteration": 2.661736488342285 + }, + { + "auxiliary_loss_clip": 0.01086658, + "auxiliary_loss_mlp": 0.01034222, + "balance_loss_clip": 1.04458117, + "balance_loss_mlp": 1.02134728, + "epoch": 0.4409439350668871, + "flos": 24569865498240.0, + "grad_norm": 1.5776913121160454, + "language_loss": 0.79113179, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.81234062, + "num_input_tokens_seen": 157200580, + "step": 7334, + "time_per_iteration": 2.8023018836975098 + }, + { + "auxiliary_loss_clip": 0.01102091, + "auxiliary_loss_mlp": 0.01039411, + "balance_loss_clip": 1.04475522, + "balance_loss_mlp": 1.02343714, + "epoch": 0.4410040583195551, + "flos": 22668574798080.0, + "grad_norm": 2.426268589885391, + "language_loss": 0.75184131, + "learning_rate": 2.47465981219252e-06, + "loss": 0.77325642, + "num_input_tokens_seen": 157218345, + "step": 7335, + "time_per_iteration": 2.7240371704101562 + }, + { + "auxiliary_loss_clip": 0.01101432, + "auxiliary_loss_mlp": 0.0103515, + "balance_loss_clip": 1.04350579, + "balance_loss_mlp": 1.02189362, + "epoch": 0.44106418157222305, + "flos": 10852528700160.0, + "grad_norm": 1.9825426915131346, + "language_loss": 0.72498572, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.74635154, + "num_input_tokens_seen": 157234395, + "step": 7336, + "time_per_iteration": 2.6489880084991455 + }, + { + "auxiliary_loss_clip": 0.01118861, + "auxiliary_loss_mlp": 0.01040583, + "balance_loss_clip": 1.04398608, + "balance_loss_mlp": 1.02684367, + "epoch": 0.441124304824891, + "flos": 21726710323200.0, + "grad_norm": 2.2630715311051617, + "language_loss": 0.62847346, + "learning_rate": 2.473903107384165e-06, + "loss": 0.65006793, + "num_input_tokens_seen": 157254805, + "step": 7337, + "time_per_iteration": 2.632335901260376 + }, + { + "auxiliary_loss_clip": 0.01029242, + "auxiliary_loss_mlp": 0.00753616, + "balance_loss_clip": 1.0181427, + "balance_loss_mlp": 1.00070596, + "epoch": 0.441184428077559, + "flos": 63220486625280.0, + "grad_norm": 0.7364595311582042, + "language_loss": 0.52639711, + "learning_rate": 2.473524728017134e-06, + "loss": 0.54422569, + "num_input_tokens_seen": 157317870, + "step": 7338, + "time_per_iteration": 3.253746509552002 + }, + { + "auxiliary_loss_clip": 0.01106453, + "auxiliary_loss_mlp": 0.01046288, + "balance_loss_clip": 1.04105973, + "balance_loss_mlp": 1.03120804, + "epoch": 0.44124455133022694, + "flos": 21177959270400.0, + "grad_norm": 2.22639682548465, + "language_loss": 0.70776093, + "learning_rate": 2.473146330693997e-06, + "loss": 0.7292884, + "num_input_tokens_seen": 157336505, + "step": 7339, + "time_per_iteration": 2.655733823776245 + }, + { + "auxiliary_loss_clip": 0.01053755, + "auxiliary_loss_mlp": 0.01042988, + "balance_loss_clip": 1.03682137, + "balance_loss_mlp": 1.02918971, + "epoch": 0.4413046745828949, + "flos": 17457865453440.0, + "grad_norm": 1.5022359473102205, + "language_loss": 0.70075929, + "learning_rate": 2.472767915429105e-06, + "loss": 0.72172678, + "num_input_tokens_seen": 157354995, + "step": 7340, + "time_per_iteration": 2.767920970916748 + }, + { + "auxiliary_loss_clip": 0.01030747, + "auxiliary_loss_mlp": 0.01003789, + "balance_loss_clip": 1.02245617, + "balance_loss_mlp": 1.00190568, + "epoch": 0.4413647978355629, + "flos": 61586153804160.0, + "grad_norm": 0.8827965218567749, + "language_loss": 0.63983381, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.66017926, + "num_input_tokens_seen": 157404260, + "step": 7341, + "time_per_iteration": 3.049508810043335 + }, + { + "auxiliary_loss_clip": 0.01091178, + "auxiliary_loss_mlp": 0.01040152, + "balance_loss_clip": 1.0418849, + "balance_loss_mlp": 1.02682424, + "epoch": 0.4414249210882309, + "flos": 27527001505920.0, + "grad_norm": 2.055823294856648, + "language_loss": 0.73636287, + "learning_rate": 2.47201103113145e-06, + "loss": 0.75767612, + "num_input_tokens_seen": 157423045, + "step": 7342, + "time_per_iteration": 2.795201063156128 + }, + { + "auxiliary_loss_clip": 0.01125069, + "auxiliary_loss_mlp": 0.01041127, + "balance_loss_clip": 1.04345822, + "balance_loss_mlp": 1.02709007, + "epoch": 0.44148504434089886, + "flos": 23513984277120.0, + "grad_norm": 2.2044048255358515, + "language_loss": 0.79979384, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.82145584, + "num_input_tokens_seen": 157441815, + "step": 7343, + "time_per_iteration": 5.804108142852783 + }, + { + "auxiliary_loss_clip": 0.010937, + "auxiliary_loss_mlp": 0.01034348, + "balance_loss_clip": 1.04503846, + "balance_loss_mlp": 1.02072287, + "epoch": 0.4415451675935668, + "flos": 21580589796480.0, + "grad_norm": 2.707350721832692, + "language_loss": 0.76721787, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.78849834, + "num_input_tokens_seen": 157460470, + "step": 7344, + "time_per_iteration": 2.7370471954345703 + }, + { + "auxiliary_loss_clip": 0.01038191, + "auxiliary_loss_mlp": 0.01020913, + "balance_loss_clip": 1.0274384, + "balance_loss_mlp": 1.01902914, + "epoch": 0.4416052908462348, + "flos": 59006368126080.0, + "grad_norm": 0.7980536604903562, + "language_loss": 0.63813043, + "learning_rate": 2.470875570480556e-06, + "loss": 0.65872145, + "num_input_tokens_seen": 157512655, + "step": 7345, + "time_per_iteration": 4.502060890197754 + }, + { + "auxiliary_loss_clip": 0.01130065, + "auxiliary_loss_mlp": 0.01040621, + "balance_loss_clip": 1.04656529, + "balance_loss_mlp": 1.02670372, + "epoch": 0.44166541409890275, + "flos": 26357642242560.0, + "grad_norm": 1.8234046338758734, + "language_loss": 0.86094856, + "learning_rate": 2.470497047866489e-06, + "loss": 0.88265538, + "num_input_tokens_seen": 157533700, + "step": 7346, + "time_per_iteration": 2.697648763656616 + }, + { + "auxiliary_loss_clip": 0.01119294, + "auxiliary_loss_mlp": 0.0104301, + "balance_loss_clip": 1.04583025, + "balance_loss_mlp": 1.02862179, + "epoch": 0.4417255373515707, + "flos": 20192678231040.0, + "grad_norm": 1.7966519054380148, + "language_loss": 0.80474353, + "learning_rate": 2.470118507411128e-06, + "loss": 0.8263666, + "num_input_tokens_seen": 157551105, + "step": 7347, + "time_per_iteration": 4.3498101234436035 + }, + { + "auxiliary_loss_clip": 0.01107859, + "auxiliary_loss_mlp": 0.01035246, + "balance_loss_clip": 1.04878783, + "balance_loss_mlp": 1.02088118, + "epoch": 0.4417856606042387, + "flos": 17887895078400.0, + "grad_norm": 1.7585337264872751, + "language_loss": 0.83156574, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.85299683, + "num_input_tokens_seen": 157568285, + "step": 7348, + "time_per_iteration": 2.6866180896759033 + }, + { + "auxiliary_loss_clip": 0.01119234, + "auxiliary_loss_mlp": 0.01035311, + "balance_loss_clip": 1.04732084, + "balance_loss_mlp": 1.02139926, + "epoch": 0.44184578385690665, + "flos": 27964034282880.0, + "grad_norm": 2.0657656881846505, + "language_loss": 0.70507312, + "learning_rate": 2.469361373033938e-06, + "loss": 0.72661853, + "num_input_tokens_seen": 157590405, + "step": 7349, + "time_per_iteration": 2.7241854667663574 + }, + { + "auxiliary_loss_clip": 0.0109864, + "auxiliary_loss_mlp": 0.01033665, + "balance_loss_clip": 1.04184258, + "balance_loss_mlp": 1.01935983, + "epoch": 0.4419059071095746, + "flos": 23367899664000.0, + "grad_norm": 1.9069897602324009, + "language_loss": 0.74060279, + "learning_rate": 2.468982779140819e-06, + "loss": 0.76192582, + "num_input_tokens_seen": 157607420, + "step": 7350, + "time_per_iteration": 2.724295139312744 + }, + { + "auxiliary_loss_clip": 0.01129716, + "auxiliary_loss_mlp": 0.01036435, + "balance_loss_clip": 1.04692149, + "balance_loss_mlp": 1.02279782, + "epoch": 0.4419660303622426, + "flos": 15012169246080.0, + "grad_norm": 4.28906993354027, + "language_loss": 0.81133771, + "learning_rate": 2.468604167463827e-06, + "loss": 0.83299923, + "num_input_tokens_seen": 157624990, + "step": 7351, + "time_per_iteration": 2.6151175498962402 + }, + { + "auxiliary_loss_clip": 0.01077442, + "auxiliary_loss_mlp": 0.00770493, + "balance_loss_clip": 1.03664398, + "balance_loss_mlp": 1.00027013, + "epoch": 0.44202615361491054, + "flos": 25371750672000.0, + "grad_norm": 1.4842739809833707, + "language_loss": 0.72872806, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.7472074, + "num_input_tokens_seen": 157645300, + "step": 7352, + "time_per_iteration": 2.822618007659912 + }, + { + "auxiliary_loss_clip": 0.01105652, + "auxiliary_loss_mlp": 0.01030051, + "balance_loss_clip": 1.05031562, + "balance_loss_mlp": 1.01625896, + "epoch": 0.4420862768675785, + "flos": 24681116897280.0, + "grad_norm": 2.2734813659209316, + "language_loss": 0.87014645, + "learning_rate": 2.467846890815649e-06, + "loss": 0.89150345, + "num_input_tokens_seen": 157664060, + "step": 7353, + "time_per_iteration": 2.8141496181488037 + }, + { + "auxiliary_loss_clip": 0.01131466, + "auxiliary_loss_mlp": 0.01036857, + "balance_loss_clip": 1.04851007, + "balance_loss_mlp": 1.02385104, + "epoch": 0.44214640012024653, + "flos": 19528437974400.0, + "grad_norm": 2.0005767830632464, + "language_loss": 0.75907683, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.78076005, + "num_input_tokens_seen": 157680905, + "step": 7354, + "time_per_iteration": 2.6416475772857666 + }, + { + "auxiliary_loss_clip": 0.01087376, + "auxiliary_loss_mlp": 0.01035112, + "balance_loss_clip": 1.04345286, + "balance_loss_mlp": 1.02218962, + "epoch": 0.4422065233729145, + "flos": 47557434003840.0, + "grad_norm": 1.702490286843937, + "language_loss": 0.64954734, + "learning_rate": 2.467089543204268e-06, + "loss": 0.67077219, + "num_input_tokens_seen": 157701980, + "step": 7355, + "time_per_iteration": 2.9349570274353027 + }, + { + "auxiliary_loss_clip": 0.01133882, + "auxiliary_loss_mlp": 0.01035511, + "balance_loss_clip": 1.04775596, + "balance_loss_mlp": 1.02121234, + "epoch": 0.44226664662558246, + "flos": 19281050029440.0, + "grad_norm": 1.8300716428477437, + "language_loss": 0.78527248, + "learning_rate": 2.466710842823274e-06, + "loss": 0.80696642, + "num_input_tokens_seen": 157720555, + "step": 7356, + "time_per_iteration": 2.5932910442352295 + }, + { + "auxiliary_loss_clip": 0.01109756, + "auxiliary_loss_mlp": 0.00771729, + "balance_loss_clip": 1.04629183, + "balance_loss_mlp": 1.0004859, + "epoch": 0.4423267698782504, + "flos": 17821820010240.0, + "grad_norm": 1.6708598029973696, + "language_loss": 0.77472621, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.79354107, + "num_input_tokens_seen": 157739160, + "step": 7357, + "time_per_iteration": 2.7050111293792725 + }, + { + "auxiliary_loss_clip": 0.01102733, + "auxiliary_loss_mlp": 0.01037231, + "balance_loss_clip": 1.04357672, + "balance_loss_mlp": 1.02280128, + "epoch": 0.4423868931309184, + "flos": 29204424691200.0, + "grad_norm": 1.492131344457668, + "language_loss": 0.73277801, + "learning_rate": 2.465953388982481e-06, + "loss": 0.75417769, + "num_input_tokens_seen": 157760020, + "step": 7358, + "time_per_iteration": 2.7339792251586914 + }, + { + "auxiliary_loss_clip": 0.01108517, + "auxiliary_loss_mlp": 0.01035507, + "balance_loss_clip": 1.04953265, + "balance_loss_mlp": 1.02198911, + "epoch": 0.44244701638358636, + "flos": 29713135057920.0, + "grad_norm": 1.890703165597896, + "language_loss": 0.75731266, + "learning_rate": 2.465574635551405e-06, + "loss": 0.77875292, + "num_input_tokens_seen": 157780435, + "step": 7359, + "time_per_iteration": 2.7597005367279053 + }, + { + "auxiliary_loss_clip": 0.01106411, + "auxiliary_loss_mlp": 0.01037766, + "balance_loss_clip": 1.04658461, + "balance_loss_mlp": 1.02315068, + "epoch": 0.4425071396362543, + "flos": 22930040874240.0, + "grad_norm": 1.6679218305876244, + "language_loss": 0.69988406, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.72132587, + "num_input_tokens_seen": 157799420, + "step": 7360, + "time_per_iteration": 2.7118403911590576 + }, + { + "auxiliary_loss_clip": 0.01104133, + "auxiliary_loss_mlp": 0.01032941, + "balance_loss_clip": 1.04686546, + "balance_loss_mlp": 1.01859379, + "epoch": 0.4425672628889223, + "flos": 19792346175360.0, + "grad_norm": 3.404305353939149, + "language_loss": 0.69860107, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.71997184, + "num_input_tokens_seen": 157817025, + "step": 7361, + "time_per_iteration": 2.672388792037964 + }, + { + "auxiliary_loss_clip": 0.01105237, + "auxiliary_loss_mlp": 0.01040581, + "balance_loss_clip": 1.04377937, + "balance_loss_mlp": 1.02539372, + "epoch": 0.44262738614159025, + "flos": 13662215377920.0, + "grad_norm": 2.0698565080434888, + "language_loss": 0.82494795, + "learning_rate": 2.464438269387809e-06, + "loss": 0.84640616, + "num_input_tokens_seen": 157834345, + "step": 7362, + "time_per_iteration": 2.6258609294891357 + }, + { + "auxiliary_loss_clip": 0.01102915, + "auxiliary_loss_mlp": 0.01040381, + "balance_loss_clip": 1.04801464, + "balance_loss_mlp": 1.02494311, + "epoch": 0.4426875093942582, + "flos": 14210212245120.0, + "grad_norm": 1.7089384580193987, + "language_loss": 0.74628377, + "learning_rate": 2.464059445424366e-06, + "loss": 0.76771677, + "num_input_tokens_seen": 157852290, + "step": 7363, + "time_per_iteration": 2.7868857383728027 + }, + { + "auxiliary_loss_clip": 0.01008645, + "auxiliary_loss_mlp": 0.01003596, + "balance_loss_clip": 1.02228582, + "balance_loss_mlp": 1.0016526, + "epoch": 0.4427476326469262, + "flos": 70117525728000.0, + "grad_norm": 0.6804595751696751, + "language_loss": 0.55677116, + "learning_rate": 2.463680603863743e-06, + "loss": 0.57689351, + "num_input_tokens_seen": 157923060, + "step": 7364, + "time_per_iteration": 3.3737823963165283 + }, + { + "auxiliary_loss_clip": 0.01109131, + "auxiliary_loss_mlp": 0.01040851, + "balance_loss_clip": 1.04670477, + "balance_loss_mlp": 1.02778566, + "epoch": 0.44280775589959415, + "flos": 25445080287360.0, + "grad_norm": 1.640155581598939, + "language_loss": 0.74618137, + "learning_rate": 2.463301744720305e-06, + "loss": 0.76768118, + "num_input_tokens_seen": 157944110, + "step": 7365, + "time_per_iteration": 2.789905071258545 + }, + { + "auxiliary_loss_clip": 0.01099825, + "auxiliary_loss_mlp": 0.01043396, + "balance_loss_clip": 1.04348397, + "balance_loss_mlp": 1.0287931, + "epoch": 0.4428678791522621, + "flos": 22857214049280.0, + "grad_norm": 1.5674103047703387, + "language_loss": 0.74297303, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.76440525, + "num_input_tokens_seen": 157964295, + "step": 7366, + "time_per_iteration": 2.700286626815796 + }, + { + "auxiliary_loss_clip": 0.01108412, + "auxiliary_loss_mlp": 0.0103649, + "balance_loss_clip": 1.04708481, + "balance_loss_mlp": 1.02240598, + "epoch": 0.44292800240493013, + "flos": 25812446636160.0, + "grad_norm": 3.271133633367276, + "language_loss": 0.73245466, + "learning_rate": 2.46254397374245e-06, + "loss": 0.75390375, + "num_input_tokens_seen": 157983970, + "step": 7367, + "time_per_iteration": 2.6946957111358643 + }, + { + "auxiliary_loss_clip": 0.01130142, + "auxiliary_loss_mlp": 0.01040167, + "balance_loss_clip": 1.04803169, + "balance_loss_mlp": 1.02645779, + "epoch": 0.4429881256575981, + "flos": 32416885549440.0, + "grad_norm": 1.566124307945558, + "language_loss": 0.73996794, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.76167101, + "num_input_tokens_seen": 158006515, + "step": 7368, + "time_per_iteration": 2.7544407844543457 + }, + { + "auxiliary_loss_clip": 0.01100906, + "auxiliary_loss_mlp": 0.01031006, + "balance_loss_clip": 1.04302347, + "balance_loss_mlp": 1.01735687, + "epoch": 0.44304824891026606, + "flos": 22163707186560.0, + "grad_norm": 2.0120848529023334, + "language_loss": 0.7961669, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.81748605, + "num_input_tokens_seen": 158025565, + "step": 7369, + "time_per_iteration": 2.697190046310425 + }, + { + "auxiliary_loss_clip": 0.010901, + "auxiliary_loss_mlp": 0.01035588, + "balance_loss_clip": 1.04244113, + "balance_loss_mlp": 1.02251637, + "epoch": 0.443108372162934, + "flos": 25338569483520.0, + "grad_norm": 1.9393131166495303, + "language_loss": 0.72057104, + "learning_rate": 2.461407185763737e-06, + "loss": 0.74182796, + "num_input_tokens_seen": 158045620, + "step": 7370, + "time_per_iteration": 2.7959940433502197 + }, + { + "auxiliary_loss_clip": 0.01129082, + "auxiliary_loss_mlp": 0.0103749, + "balance_loss_clip": 1.04668999, + "balance_loss_mlp": 1.02349448, + "epoch": 0.443168495415602, + "flos": 23330947547520.0, + "grad_norm": 1.8535232870502223, + "language_loss": 0.70380038, + "learning_rate": 2.461028221425126e-06, + "loss": 0.72546607, + "num_input_tokens_seen": 158063505, + "step": 7371, + "time_per_iteration": 2.677718162536621 + }, + { + "auxiliary_loss_clip": 0.01119855, + "auxiliary_loss_mlp": 0.01031238, + "balance_loss_clip": 1.0492835, + "balance_loss_mlp": 1.01867962, + "epoch": 0.44322861866826996, + "flos": 21871502046720.0, + "grad_norm": 2.0883513439310577, + "language_loss": 0.68410224, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.70561314, + "num_input_tokens_seen": 158080335, + "step": 7372, + "time_per_iteration": 2.6676101684570312 + }, + { + "auxiliary_loss_clip": 0.01096245, + "auxiliary_loss_mlp": 0.0103489, + "balance_loss_clip": 1.04236257, + "balance_loss_mlp": 1.0203104, + "epoch": 0.4432887419209379, + "flos": 20084407660800.0, + "grad_norm": 1.830573306058503, + "language_loss": 0.83560812, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.85691947, + "num_input_tokens_seen": 158098955, + "step": 7373, + "time_per_iteration": 2.706554651260376 + }, + { + "auxiliary_loss_clip": 0.0103821, + "auxiliary_loss_mlp": 0.0100315, + "balance_loss_clip": 1.01858282, + "balance_loss_mlp": 1.00125432, + "epoch": 0.4433488651736059, + "flos": 70035540935040.0, + "grad_norm": 0.769882260063621, + "language_loss": 0.55201387, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.57242751, + "num_input_tokens_seen": 158164110, + "step": 7374, + "time_per_iteration": 3.2373340129852295 + }, + { + "auxiliary_loss_clip": 0.01078736, + "auxiliary_loss_mlp": 0.01042384, + "balance_loss_clip": 1.04519641, + "balance_loss_mlp": 1.02773309, + "epoch": 0.44340898842627385, + "flos": 16282472705280.0, + "grad_norm": 2.3490774090653592, + "language_loss": 0.8289665, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.85017765, + "num_input_tokens_seen": 158179850, + "step": 7375, + "time_per_iteration": 2.7468464374542236 + }, + { + "auxiliary_loss_clip": 0.01129641, + "auxiliary_loss_mlp": 0.01034202, + "balance_loss_clip": 1.04680073, + "balance_loss_mlp": 1.02032566, + "epoch": 0.4434691116789418, + "flos": 16611989097600.0, + "grad_norm": 1.9296092769688273, + "language_loss": 0.84076023, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.86239868, + "num_input_tokens_seen": 158196590, + "step": 7376, + "time_per_iteration": 2.5597686767578125 + }, + { + "auxiliary_loss_clip": 0.01105366, + "auxiliary_loss_mlp": 0.01036479, + "balance_loss_clip": 1.04541779, + "balance_loss_mlp": 1.02299011, + "epoch": 0.4435292349316098, + "flos": 19063251912960.0, + "grad_norm": 1.7983383352892115, + "language_loss": 0.77172405, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.79314244, + "num_input_tokens_seen": 158216355, + "step": 7377, + "time_per_iteration": 2.7065727710723877 + }, + { + "auxiliary_loss_clip": 0.01111732, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.04586828, + "balance_loss_mlp": 1.01916027, + "epoch": 0.44358935818427775, + "flos": 21251324799360.0, + "grad_norm": 2.2025516465061568, + "language_loss": 0.76422131, + "learning_rate": 2.458374982357057e-06, + "loss": 0.78566432, + "num_input_tokens_seen": 158235825, + "step": 7378, + "time_per_iteration": 2.6680550575256348 + }, + { + "auxiliary_loss_clip": 0.01104625, + "auxiliary_loss_mlp": 0.01055785, + "balance_loss_clip": 1.04471672, + "balance_loss_mlp": 1.0404191, + "epoch": 0.4436494814369457, + "flos": 12495298239360.0, + "grad_norm": 1.9484405267541265, + "language_loss": 0.69165838, + "learning_rate": 2.457995878562982e-06, + "loss": 0.7132625, + "num_input_tokens_seen": 158254230, + "step": 7379, + "time_per_iteration": 2.6700775623321533 + }, + { + "auxiliary_loss_clip": 0.01063579, + "auxiliary_loss_mlp": 0.01045674, + "balance_loss_clip": 1.03913927, + "balance_loss_mlp": 1.0297358, + "epoch": 0.44370960468961373, + "flos": 23659853408640.0, + "grad_norm": 2.073474855716146, + "language_loss": 0.7288872, + "learning_rate": 2.457616757401656e-06, + "loss": 0.74997967, + "num_input_tokens_seen": 158273400, + "step": 7380, + "time_per_iteration": 2.8017635345458984 + }, + { + "auxiliary_loss_clip": 0.01110205, + "auxiliary_loss_mlp": 0.01035159, + "balance_loss_clip": 1.04831696, + "balance_loss_mlp": 1.02124155, + "epoch": 0.4437697279422817, + "flos": 32416849635840.0, + "grad_norm": 1.6338701103198854, + "language_loss": 0.64961064, + "learning_rate": 2.457237618887458e-06, + "loss": 0.67106432, + "num_input_tokens_seen": 158296840, + "step": 7381, + "time_per_iteration": 2.791595458984375 + }, + { + "auxiliary_loss_clip": 0.01120176, + "auxiliary_loss_mlp": 0.0104083, + "balance_loss_clip": 1.04781485, + "balance_loss_mlp": 1.02696049, + "epoch": 0.44382985119494966, + "flos": 18112875914880.0, + "grad_norm": 5.151492667638541, + "language_loss": 0.80450714, + "learning_rate": 2.456858463034763e-06, + "loss": 0.82611728, + "num_input_tokens_seen": 158314935, + "step": 7382, + "time_per_iteration": 4.177164316177368 + }, + { + "auxiliary_loss_clip": 0.0112542, + "auxiliary_loss_mlp": 0.01039884, + "balance_loss_clip": 1.05130458, + "balance_loss_mlp": 1.02599657, + "epoch": 0.44388997444761763, + "flos": 30774151923840.0, + "grad_norm": 1.842434773727105, + "language_loss": 0.65955621, + "learning_rate": 2.456479289857949e-06, + "loss": 0.68120921, + "num_input_tokens_seen": 158334620, + "step": 7383, + "time_per_iteration": 4.142000436782837 + }, + { + "auxiliary_loss_clip": 0.01104406, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.04357898, + "balance_loss_mlp": 1.02228832, + "epoch": 0.4439500977002856, + "flos": 20339157893760.0, + "grad_norm": 2.431816949897044, + "language_loss": 0.76046586, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.78187954, + "num_input_tokens_seen": 158350550, + "step": 7384, + "time_per_iteration": 4.309042453765869 + }, + { + "auxiliary_loss_clip": 0.01132692, + "auxiliary_loss_mlp": 0.01040021, + "balance_loss_clip": 1.04878867, + "balance_loss_mlp": 1.02595425, + "epoch": 0.44401022095295356, + "flos": 20371225760640.0, + "grad_norm": 1.6001418974541146, + "language_loss": 0.81145859, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.83318579, + "num_input_tokens_seen": 158369555, + "step": 7385, + "time_per_iteration": 2.6569409370422363 + }, + { + "auxiliary_loss_clip": 0.01085589, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.04551208, + "balance_loss_mlp": 1.02062619, + "epoch": 0.4440703442056215, + "flos": 20230635928320.0, + "grad_norm": 1.8953258070837995, + "language_loss": 0.81531972, + "learning_rate": 2.455341666526582e-06, + "loss": 0.8365339, + "num_input_tokens_seen": 158388045, + "step": 7386, + "time_per_iteration": 2.757857084274292 + }, + { + "auxiliary_loss_clip": 0.01092623, + "auxiliary_loss_mlp": 0.01033697, + "balance_loss_clip": 1.04583073, + "balance_loss_mlp": 1.01829553, + "epoch": 0.4441304674582895, + "flos": 39494698824960.0, + "grad_norm": 2.1898431457791827, + "language_loss": 0.70026255, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.72152579, + "num_input_tokens_seen": 158410115, + "step": 7387, + "time_per_iteration": 4.4056620597839355 + }, + { + "auxiliary_loss_clip": 0.01064296, + "auxiliary_loss_mlp": 0.01040123, + "balance_loss_clip": 1.04571772, + "balance_loss_mlp": 1.02586579, + "epoch": 0.44419059071095746, + "flos": 14829671220480.0, + "grad_norm": 1.9497255625781733, + "language_loss": 0.71838999, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.73943412, + "num_input_tokens_seen": 158427765, + "step": 7388, + "time_per_iteration": 2.7504312992095947 + }, + { + "auxiliary_loss_clip": 0.01120562, + "auxiliary_loss_mlp": 0.01036757, + "balance_loss_clip": 1.04769969, + "balance_loss_mlp": 1.02277958, + "epoch": 0.4442507139636254, + "flos": 22637835734400.0, + "grad_norm": 1.8353800507100826, + "language_loss": 0.6930418, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.71461499, + "num_input_tokens_seen": 158446375, + "step": 7389, + "time_per_iteration": 2.620847702026367 + }, + { + "auxiliary_loss_clip": 0.01119935, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.04713047, + "balance_loss_mlp": 1.02149689, + "epoch": 0.4443108372162934, + "flos": 38290721829120.0, + "grad_norm": 1.8033342781314554, + "language_loss": 0.75145507, + "learning_rate": 2.453824593752788e-06, + "loss": 0.77301002, + "num_input_tokens_seen": 158467260, + "step": 7390, + "time_per_iteration": 2.794739246368408 + }, + { + "auxiliary_loss_clip": 0.01112569, + "auxiliary_loss_mlp": 0.0104339, + "balance_loss_clip": 1.04474115, + "balance_loss_mlp": 1.0285244, + "epoch": 0.44437096046896135, + "flos": 17748993185280.0, + "grad_norm": 2.757944013002859, + "language_loss": 0.8139115, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.83547109, + "num_input_tokens_seen": 158486720, + "step": 7391, + "time_per_iteration": 2.62081241607666 + }, + { + "auxiliary_loss_clip": 0.01100157, + "auxiliary_loss_mlp": 0.01039531, + "balance_loss_clip": 1.04489446, + "balance_loss_mlp": 1.02436733, + "epoch": 0.4444310837216293, + "flos": 13732348682880.0, + "grad_norm": 1.7057692393428199, + "language_loss": 0.73885345, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.76025033, + "num_input_tokens_seen": 158502530, + "step": 7392, + "time_per_iteration": 2.619123935699463 + }, + { + "auxiliary_loss_clip": 0.01116796, + "auxiliary_loss_mlp": 0.01032995, + "balance_loss_clip": 1.04451931, + "balance_loss_mlp": 1.01976895, + "epoch": 0.44449120697429734, + "flos": 25010238240000.0, + "grad_norm": 1.6244243517648933, + "language_loss": 0.79316819, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.81466603, + "num_input_tokens_seen": 158522715, + "step": 7393, + "time_per_iteration": 2.761636257171631 + }, + { + "auxiliary_loss_clip": 0.01123845, + "auxiliary_loss_mlp": 0.01034263, + "balance_loss_clip": 1.04784608, + "balance_loss_mlp": 1.02036357, + "epoch": 0.4445513302269653, + "flos": 32671707609600.0, + "grad_norm": 1.7936817608261026, + "language_loss": 0.80767369, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.82925481, + "num_input_tokens_seen": 158543615, + "step": 7394, + "time_per_iteration": 2.731896162033081 + }, + { + "auxiliary_loss_clip": 0.01101431, + "auxiliary_loss_mlp": 0.010406, + "balance_loss_clip": 1.04235363, + "balance_loss_mlp": 1.02805972, + "epoch": 0.44461145347963327, + "flos": 11655814504320.0, + "grad_norm": 2.5483522979722886, + "language_loss": 0.79701138, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.81843174, + "num_input_tokens_seen": 158560330, + "step": 7395, + "time_per_iteration": 2.6799733638763428 + }, + { + "auxiliary_loss_clip": 0.0110231, + "auxiliary_loss_mlp": 0.01040027, + "balance_loss_clip": 1.04210639, + "balance_loss_mlp": 1.02630031, + "epoch": 0.44467157673230123, + "flos": 20886759711360.0, + "grad_norm": 1.725775342310971, + "language_loss": 0.68280721, + "learning_rate": 2.451548468607584e-06, + "loss": 0.70423067, + "num_input_tokens_seen": 158579735, + "step": 7396, + "time_per_iteration": 2.7539262771606445 + }, + { + "auxiliary_loss_clip": 0.01115853, + "auxiliary_loss_mlp": 0.00771942, + "balance_loss_clip": 1.04396296, + "balance_loss_mlp": 1.00035286, + "epoch": 0.4447316999849692, + "flos": 18546137763840.0, + "grad_norm": 1.749232481773879, + "language_loss": 0.80780083, + "learning_rate": 2.451169054403126e-06, + "loss": 0.82667875, + "num_input_tokens_seen": 158597075, + "step": 7397, + "time_per_iteration": 2.6620333194732666 + }, + { + "auxiliary_loss_clip": 0.01119828, + "auxiliary_loss_mlp": 0.01038203, + "balance_loss_clip": 1.04740441, + "balance_loss_mlp": 1.02525663, + "epoch": 0.44479182323763716, + "flos": 23769057732480.0, + "grad_norm": 1.6626939297991263, + "language_loss": 0.67383635, + "learning_rate": 2.450789623090293e-06, + "loss": 0.69541669, + "num_input_tokens_seen": 158616650, + "step": 7398, + "time_per_iteration": 2.671193838119507 + }, + { + "auxiliary_loss_clip": 0.01097104, + "auxiliary_loss_mlp": 0.01040281, + "balance_loss_clip": 1.04477727, + "balance_loss_mlp": 1.0271976, + "epoch": 0.44485194649030513, + "flos": 16543831040640.0, + "grad_norm": 1.7055478439146432, + "language_loss": 0.69250667, + "learning_rate": 2.450410174683472e-06, + "loss": 0.71388054, + "num_input_tokens_seen": 158634515, + "step": 7399, + "time_per_iteration": 2.6823384761810303 + }, + { + "auxiliary_loss_clip": 0.01097596, + "auxiliary_loss_mlp": 0.01035766, + "balance_loss_clip": 1.04475021, + "balance_loss_mlp": 1.0225575, + "epoch": 0.4449120697429731, + "flos": 22600955445120.0, + "grad_norm": 1.8287170900617375, + "language_loss": 0.72332168, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.74465525, + "num_input_tokens_seen": 158653760, + "step": 7400, + "time_per_iteration": 2.7227253913879395 + }, + { + "auxiliary_loss_clip": 0.01076093, + "auxiliary_loss_mlp": 0.00770024, + "balance_loss_clip": 1.04184151, + "balance_loss_mlp": 1.00039887, + "epoch": 0.44497219299564106, + "flos": 20004864992640.0, + "grad_norm": 1.6814996958378423, + "language_loss": 0.85252142, + "learning_rate": 2.449651226645422e-06, + "loss": 0.87098259, + "num_input_tokens_seen": 158672190, + "step": 7401, + "time_per_iteration": 2.757293701171875 + }, + { + "auxiliary_loss_clip": 0.01102171, + "auxiliary_loss_mlp": 0.0103733, + "balance_loss_clip": 1.04564703, + "balance_loss_mlp": 1.02497375, + "epoch": 0.445032316248309, + "flos": 25594253470080.0, + "grad_norm": 1.6805452055908299, + "language_loss": 0.83201802, + "learning_rate": 2.449271727042973e-06, + "loss": 0.85341299, + "num_input_tokens_seen": 158694115, + "step": 7402, + "time_per_iteration": 2.7132928371429443 + }, + { + "auxiliary_loss_clip": 0.01107267, + "auxiliary_loss_mlp": 0.01032822, + "balance_loss_clip": 1.0461576, + "balance_loss_mlp": 1.0188688, + "epoch": 0.445092439500977, + "flos": 21250426959360.0, + "grad_norm": 1.9019306445781163, + "language_loss": 0.7714172, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.79281807, + "num_input_tokens_seen": 158711000, + "step": 7403, + "time_per_iteration": 2.6282217502593994 + }, + { + "auxiliary_loss_clip": 0.01023728, + "auxiliary_loss_mlp": 0.01005808, + "balance_loss_clip": 1.0202831, + "balance_loss_mlp": 1.00413918, + "epoch": 0.44515256275364495, + "flos": 57764900309760.0, + "grad_norm": 0.7456605721636542, + "language_loss": 0.59988129, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.62017667, + "num_input_tokens_seen": 158769675, + "step": 7404, + "time_per_iteration": 3.173560619354248 + }, + { + "auxiliary_loss_clip": 0.01105136, + "auxiliary_loss_mlp": 0.01044638, + "balance_loss_clip": 1.04419279, + "balance_loss_mlp": 1.02934957, + "epoch": 0.4452126860063129, + "flos": 15596004908160.0, + "grad_norm": 1.6768296122026118, + "language_loss": 0.82246673, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.8439644, + "num_input_tokens_seen": 158788215, + "step": 7405, + "time_per_iteration": 2.6669278144836426 + }, + { + "auxiliary_loss_clip": 0.01104648, + "auxiliary_loss_mlp": 0.01029929, + "balance_loss_clip": 1.04628932, + "balance_loss_mlp": 1.01669657, + "epoch": 0.4452728092589809, + "flos": 21617398258560.0, + "grad_norm": 4.56209401129754, + "language_loss": 0.75126898, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.77261472, + "num_input_tokens_seen": 158809090, + "step": 7406, + "time_per_iteration": 2.6722404956817627 + }, + { + "auxiliary_loss_clip": 0.01091029, + "auxiliary_loss_mlp": 0.01030298, + "balance_loss_clip": 1.0434047, + "balance_loss_mlp": 1.01746488, + "epoch": 0.4453329325116489, + "flos": 29497491757440.0, + "grad_norm": 1.6633570284980403, + "language_loss": 0.6572476, + "learning_rate": 2.447373973772129e-06, + "loss": 0.67846084, + "num_input_tokens_seen": 158828320, + "step": 7407, + "time_per_iteration": 2.819289207458496 + }, + { + "auxiliary_loss_clip": 0.01102137, + "auxiliary_loss_mlp": 0.01031486, + "balance_loss_clip": 1.04499328, + "balance_loss_mlp": 1.0179081, + "epoch": 0.44539305576431687, + "flos": 21361139654400.0, + "grad_norm": 1.6186505097592758, + "language_loss": 0.67861688, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.69995308, + "num_input_tokens_seen": 158847040, + "step": 7408, + "time_per_iteration": 2.6846649646759033 + }, + { + "auxiliary_loss_clip": 0.01128678, + "auxiliary_loss_mlp": 0.01035504, + "balance_loss_clip": 1.04559541, + "balance_loss_mlp": 1.02121043, + "epoch": 0.44545317901698483, + "flos": 41427626428800.0, + "grad_norm": 1.4740715510068387, + "language_loss": 0.72127414, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.74291599, + "num_input_tokens_seen": 158870490, + "step": 7409, + "time_per_iteration": 2.7701869010925293 + }, + { + "auxiliary_loss_clip": 0.01107577, + "auxiliary_loss_mlp": 0.010375, + "balance_loss_clip": 1.04669523, + "balance_loss_mlp": 1.02308798, + "epoch": 0.4455133022696528, + "flos": 22055005653120.0, + "grad_norm": 1.9118661854704846, + "language_loss": 0.65146017, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.67291093, + "num_input_tokens_seen": 158889920, + "step": 7410, + "time_per_iteration": 2.780905246734619 + }, + { + "auxiliary_loss_clip": 0.01104956, + "auxiliary_loss_mlp": 0.01038448, + "balance_loss_clip": 1.04414868, + "balance_loss_mlp": 1.02369618, + "epoch": 0.44557342552232077, + "flos": 23476960333440.0, + "grad_norm": 2.076728084707015, + "language_loss": 0.73772335, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.75915742, + "num_input_tokens_seen": 158909580, + "step": 7411, + "time_per_iteration": 2.745547294616699 + }, + { + "auxiliary_loss_clip": 0.01061885, + "auxiliary_loss_mlp": 0.01033363, + "balance_loss_clip": 1.04457641, + "balance_loss_mlp": 1.01967764, + "epoch": 0.44563354877498873, + "flos": 19134678107520.0, + "grad_norm": 1.7330985507109689, + "language_loss": 0.79373199, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.81468445, + "num_input_tokens_seen": 158924600, + "step": 7412, + "time_per_iteration": 2.76361346244812 + }, + { + "auxiliary_loss_clip": 0.01108589, + "auxiliary_loss_mlp": 0.01037974, + "balance_loss_clip": 1.04357016, + "balance_loss_mlp": 1.02453899, + "epoch": 0.4456936720276567, + "flos": 13621420506240.0, + "grad_norm": 1.9356381581130233, + "language_loss": 0.80161285, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.82307845, + "num_input_tokens_seen": 158939345, + "step": 7413, + "time_per_iteration": 2.619915008544922 + }, + { + "auxiliary_loss_clip": 0.01113419, + "auxiliary_loss_mlp": 0.0103316, + "balance_loss_clip": 1.0433104, + "balance_loss_mlp": 1.01962328, + "epoch": 0.44575379528032466, + "flos": 14713715139840.0, + "grad_norm": 1.9889124982728665, + "language_loss": 0.76648301, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.78794879, + "num_input_tokens_seen": 158955855, + "step": 7414, + "time_per_iteration": 2.5959794521331787 + }, + { + "auxiliary_loss_clip": 0.01096052, + "auxiliary_loss_mlp": 0.01040946, + "balance_loss_clip": 1.0415467, + "balance_loss_mlp": 1.02701616, + "epoch": 0.4458139185329926, + "flos": 24170682677760.0, + "grad_norm": 1.6599120729875612, + "language_loss": 0.83765483, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.85902476, + "num_input_tokens_seen": 158976315, + "step": 7415, + "time_per_iteration": 2.785512685775757 + }, + { + "auxiliary_loss_clip": 0.01124247, + "auxiliary_loss_mlp": 0.01043831, + "balance_loss_clip": 1.04321933, + "balance_loss_mlp": 1.03093266, + "epoch": 0.4458740417856606, + "flos": 21762225895680.0, + "grad_norm": 2.1888037109264933, + "language_loss": 0.84245199, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.86413276, + "num_input_tokens_seen": 158996725, + "step": 7416, + "time_per_iteration": 2.60307240486145 + }, + { + "auxiliary_loss_clip": 0.01095417, + "auxiliary_loss_mlp": 0.01034003, + "balance_loss_clip": 1.04398692, + "balance_loss_mlp": 1.01991272, + "epoch": 0.44593416503832856, + "flos": 21068790860160.0, + "grad_norm": 1.494230693182331, + "language_loss": 0.81091261, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.83220685, + "num_input_tokens_seen": 159017255, + "step": 7417, + "time_per_iteration": 2.7423362731933594 + }, + { + "auxiliary_loss_clip": 0.01105133, + "auxiliary_loss_mlp": 0.01040379, + "balance_loss_clip": 1.04227042, + "balance_loss_mlp": 1.02650881, + "epoch": 0.4459942882909965, + "flos": 22600488568320.0, + "grad_norm": 2.47121292521638, + "language_loss": 0.81035185, + "learning_rate": 2.443197426237077e-06, + "loss": 0.8318069, + "num_input_tokens_seen": 159035010, + "step": 7418, + "time_per_iteration": 2.67476487159729 + }, + { + "auxiliary_loss_clip": 0.01120234, + "auxiliary_loss_mlp": 0.007712, + "balance_loss_clip": 1.04618478, + "balance_loss_mlp": 1.00049162, + "epoch": 0.4460544115436645, + "flos": 26505486622080.0, + "grad_norm": 2.084312717643635, + "language_loss": 0.77342117, + "learning_rate": 2.442817638972991e-06, + "loss": 0.79233551, + "num_input_tokens_seen": 159055345, + "step": 7419, + "time_per_iteration": 2.760847806930542 + }, + { + "auxiliary_loss_clip": 0.0108993, + "auxiliary_loss_mlp": 0.0103388, + "balance_loss_clip": 1.03954124, + "balance_loss_mlp": 1.02063632, + "epoch": 0.4461145347963325, + "flos": 17604021893760.0, + "grad_norm": 1.824664612180611, + "language_loss": 0.72570968, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.74694777, + "num_input_tokens_seen": 159074225, + "step": 7420, + "time_per_iteration": 2.6990244388580322 + }, + { + "auxiliary_loss_clip": 0.01104512, + "auxiliary_loss_mlp": 0.01032052, + "balance_loss_clip": 1.0432508, + "balance_loss_mlp": 1.01793802, + "epoch": 0.44617465804900047, + "flos": 27268193036160.0, + "grad_norm": 1.5590654083825235, + "language_loss": 0.75280499, + "learning_rate": 2.442058014084156e-06, + "loss": 0.77417064, + "num_input_tokens_seen": 159095415, + "step": 7421, + "time_per_iteration": 2.751757860183716 + }, + { + "auxiliary_loss_clip": 0.01059239, + "auxiliary_loss_mlp": 0.01037453, + "balance_loss_clip": 1.03808808, + "balance_loss_mlp": 1.02374959, + "epoch": 0.44623478130166844, + "flos": 17786412178560.0, + "grad_norm": 1.7359325284030627, + "language_loss": 0.75753498, + "learning_rate": 2.44167817648821e-06, + "loss": 0.77850193, + "num_input_tokens_seen": 159114615, + "step": 7422, + "time_per_iteration": 4.3189520835876465 + }, + { + "auxiliary_loss_clip": 0.01125756, + "auxiliary_loss_mlp": 0.01033879, + "balance_loss_clip": 1.04443765, + "balance_loss_mlp": 1.02083755, + "epoch": 0.4462949045543364, + "flos": 23003011353600.0, + "grad_norm": 1.436007196155178, + "language_loss": 0.65393054, + "learning_rate": 2.441298322143784e-06, + "loss": 0.67552686, + "num_input_tokens_seen": 159134370, + "step": 7423, + "time_per_iteration": 4.272382020950317 + }, + { + "auxiliary_loss_clip": 0.01096555, + "auxiliary_loss_mlp": 0.01034422, + "balance_loss_clip": 1.04093194, + "balance_loss_mlp": 1.02195287, + "epoch": 0.44635502780700437, + "flos": 17820096157440.0, + "grad_norm": 1.6490570846190094, + "language_loss": 0.79002917, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.8113389, + "num_input_tokens_seen": 159152540, + "step": 7424, + "time_per_iteration": 2.6641786098480225 + }, + { + "auxiliary_loss_clip": 0.01109138, + "auxiliary_loss_mlp": 0.01031872, + "balance_loss_clip": 1.04272473, + "balance_loss_mlp": 1.01960564, + "epoch": 0.44641515105967233, + "flos": 26688020561280.0, + "grad_norm": 1.5476168372337398, + "language_loss": 0.80515361, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.82656378, + "num_input_tokens_seen": 159173425, + "step": 7425, + "time_per_iteration": 2.677921772003174 + }, + { + "auxiliary_loss_clip": 0.01111593, + "auxiliary_loss_mlp": 0.01030626, + "balance_loss_clip": 1.04249597, + "balance_loss_mlp": 1.01805556, + "epoch": 0.4464752743123403, + "flos": 18913324544640.0, + "grad_norm": 1.7505920916906397, + "language_loss": 0.77314126, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.79456341, + "num_input_tokens_seen": 159191210, + "step": 7426, + "time_per_iteration": 4.264745712280273 + }, + { + "auxiliary_loss_clip": 0.01098153, + "auxiliary_loss_mlp": 0.00770786, + "balance_loss_clip": 1.04180968, + "balance_loss_mlp": 1.00042045, + "epoch": 0.44653539756500826, + "flos": 29570318582400.0, + "grad_norm": 2.512425150903693, + "language_loss": 0.64678168, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.66547108, + "num_input_tokens_seen": 159211755, + "step": 7427, + "time_per_iteration": 2.746807336807251 + }, + { + "auxiliary_loss_clip": 0.01114285, + "auxiliary_loss_mlp": 0.01032662, + "balance_loss_clip": 1.04756093, + "balance_loss_mlp": 1.01968026, + "epoch": 0.44659552081767623, + "flos": 21468979261440.0, + "grad_norm": 1.6794687580888963, + "language_loss": 0.7564522, + "learning_rate": 2.439398799698608e-06, + "loss": 0.77792168, + "num_input_tokens_seen": 159230315, + "step": 7428, + "time_per_iteration": 2.675830364227295 + }, + { + "auxiliary_loss_clip": 0.01089417, + "auxiliary_loss_mlp": 0.0103803, + "balance_loss_clip": 1.03992331, + "balance_loss_mlp": 1.0244813, + "epoch": 0.4466556440703442, + "flos": 17931886260480.0, + "grad_norm": 2.160723316992149, + "language_loss": 0.77906388, + "learning_rate": 2.439018845165806e-06, + "loss": 0.80033839, + "num_input_tokens_seen": 159249810, + "step": 7429, + "time_per_iteration": 2.6864819526672363 + }, + { + "auxiliary_loss_clip": 0.01117759, + "auxiliary_loss_mlp": 0.01036133, + "balance_loss_clip": 1.04573584, + "balance_loss_mlp": 1.02222157, + "epoch": 0.44671576732301216, + "flos": 21107430915840.0, + "grad_norm": 1.6783165407459442, + "language_loss": 0.91421354, + "learning_rate": 2.438638873985366e-06, + "loss": 0.93575251, + "num_input_tokens_seen": 159271715, + "step": 7430, + "time_per_iteration": 2.6472880840301514 + }, + { + "auxiliary_loss_clip": 0.01105427, + "auxiliary_loss_mlp": 0.00772764, + "balance_loss_clip": 1.04418826, + "balance_loss_mlp": 1.000386, + "epoch": 0.4467758905756801, + "flos": 23508920459520.0, + "grad_norm": 1.918378394995702, + "language_loss": 0.79452366, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.8133055, + "num_input_tokens_seen": 159290690, + "step": 7431, + "time_per_iteration": 2.7096598148345947 + }, + { + "auxiliary_loss_clip": 0.01108777, + "auxiliary_loss_mlp": 0.01036954, + "balance_loss_clip": 1.04568875, + "balance_loss_mlp": 1.02245188, + "epoch": 0.4468360138283481, + "flos": 18734022829440.0, + "grad_norm": 1.6794320575098944, + "language_loss": 0.79817986, + "learning_rate": 2.437878881739204e-06, + "loss": 0.81963724, + "num_input_tokens_seen": 159309400, + "step": 7432, + "time_per_iteration": 2.676522970199585 + }, + { + "auxiliary_loss_clip": 0.01094927, + "auxiliary_loss_mlp": 0.01040483, + "balance_loss_clip": 1.04654121, + "balance_loss_mlp": 1.02803755, + "epoch": 0.4468961370810161, + "flos": 23477139901440.0, + "grad_norm": 1.8261946877850768, + "language_loss": 0.76878047, + "learning_rate": 2.437498860702301e-06, + "loss": 0.79013455, + "num_input_tokens_seen": 159327425, + "step": 7433, + "time_per_iteration": 2.6820082664489746 + }, + { + "auxiliary_loss_clip": 0.01106089, + "auxiliary_loss_mlp": 0.01034932, + "balance_loss_clip": 1.04236984, + "balance_loss_mlp": 1.02372587, + "epoch": 0.4469562603336841, + "flos": 30075042539520.0, + "grad_norm": 1.6244691365264956, + "language_loss": 0.77377415, + "learning_rate": 2.437118823075398e-06, + "loss": 0.79518431, + "num_input_tokens_seen": 159345805, + "step": 7434, + "time_per_iteration": 2.7471024990081787 + }, + { + "auxiliary_loss_clip": 0.01118898, + "auxiliary_loss_mlp": 0.01031979, + "balance_loss_clip": 1.04707336, + "balance_loss_mlp": 1.01909828, + "epoch": 0.44701638358635204, + "flos": 22456415116800.0, + "grad_norm": 1.6740796261727897, + "language_loss": 0.64705265, + "learning_rate": 2.436738768872905e-06, + "loss": 0.6685614, + "num_input_tokens_seen": 159364595, + "step": 7435, + "time_per_iteration": 2.649425983428955 + }, + { + "auxiliary_loss_clip": 0.01112389, + "auxiliary_loss_mlp": 0.01029389, + "balance_loss_clip": 1.04875195, + "balance_loss_mlp": 1.01587653, + "epoch": 0.44707650683902, + "flos": 24057851080320.0, + "grad_norm": 1.6005542791240868, + "language_loss": 0.83477545, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.85619318, + "num_input_tokens_seen": 159385265, + "step": 7436, + "time_per_iteration": 2.6727020740509033 + }, + { + "auxiliary_loss_clip": 0.01073439, + "auxiliary_loss_mlp": 0.01045352, + "balance_loss_clip": 1.0402267, + "balance_loss_mlp": 1.02884197, + "epoch": 0.44713663009168797, + "flos": 23766938830080.0, + "grad_norm": 2.1717582772549995, + "language_loss": 0.79815632, + "learning_rate": 2.435978610798798e-06, + "loss": 0.81934428, + "num_input_tokens_seen": 159405080, + "step": 7437, + "time_per_iteration": 2.7589898109436035 + }, + { + "auxiliary_loss_clip": 0.01079969, + "auxiliary_loss_mlp": 0.01037183, + "balance_loss_clip": 1.0433023, + "balance_loss_mlp": 1.02375364, + "epoch": 0.44719675334435594, + "flos": 24499265316480.0, + "grad_norm": 1.7231807337022225, + "language_loss": 0.71860999, + "learning_rate": 2.435598506956009e-06, + "loss": 0.7397815, + "num_input_tokens_seen": 159424595, + "step": 7438, + "time_per_iteration": 2.794978380203247 + }, + { + "auxiliary_loss_clip": 0.01084835, + "auxiliary_loss_mlp": 0.01035733, + "balance_loss_clip": 1.04564655, + "balance_loss_mlp": 1.02180314, + "epoch": 0.4472568765970239, + "flos": 29781759991680.0, + "grad_norm": 1.556366888574876, + "language_loss": 0.67619812, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.69740379, + "num_input_tokens_seen": 159443865, + "step": 7439, + "time_per_iteration": 2.9251644611358643 + }, + { + "auxiliary_loss_clip": 0.01102346, + "auxiliary_loss_mlp": 0.01039634, + "balance_loss_clip": 1.0403614, + "balance_loss_mlp": 1.02436376, + "epoch": 0.44731699984969187, + "flos": 24643123286400.0, + "grad_norm": 1.714649831944237, + "language_loss": 0.73915118, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.760571, + "num_input_tokens_seen": 159464525, + "step": 7440, + "time_per_iteration": 2.773106813430786 + }, + { + "auxiliary_loss_clip": 0.01072825, + "auxiliary_loss_mlp": 0.01042282, + "balance_loss_clip": 1.03706956, + "balance_loss_mlp": 1.02789354, + "epoch": 0.44737712310235983, + "flos": 29455691304960.0, + "grad_norm": 1.740924989183362, + "language_loss": 0.74161476, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.76276582, + "num_input_tokens_seen": 159486385, + "step": 7441, + "time_per_iteration": 2.9042701721191406 + }, + { + "auxiliary_loss_clip": 0.01096694, + "auxiliary_loss_mlp": 0.01036467, + "balance_loss_clip": 1.04596698, + "balance_loss_mlp": 1.0220542, + "epoch": 0.4474372463550278, + "flos": 24896832024960.0, + "grad_norm": 1.9641422641471569, + "language_loss": 0.75060695, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.77193856, + "num_input_tokens_seen": 159503880, + "step": 7442, + "time_per_iteration": 2.776219129562378 + }, + { + "auxiliary_loss_clip": 0.01131095, + "auxiliary_loss_mlp": 0.01033925, + "balance_loss_clip": 1.04641354, + "balance_loss_mlp": 1.01900017, + "epoch": 0.44749736960769576, + "flos": 33181603125120.0, + "grad_norm": 1.741320347682455, + "language_loss": 0.74572098, + "learning_rate": 2.433697740261273e-06, + "loss": 0.76737112, + "num_input_tokens_seen": 159522980, + "step": 7443, + "time_per_iteration": 2.783189058303833 + }, + { + "auxiliary_loss_clip": 0.01099877, + "auxiliary_loss_mlp": 0.0103204, + "balance_loss_clip": 1.03843653, + "balance_loss_mlp": 1.01699591, + "epoch": 0.4475574928603637, + "flos": 21071807602560.0, + "grad_norm": 1.581803518054495, + "language_loss": 0.77928406, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.80060327, + "num_input_tokens_seen": 159543340, + "step": 7444, + "time_per_iteration": 2.750493049621582 + }, + { + "auxiliary_loss_clip": 0.01108777, + "auxiliary_loss_mlp": 0.01033259, + "balance_loss_clip": 1.04501557, + "balance_loss_mlp": 1.01988959, + "epoch": 0.4476176161130317, + "flos": 21862523646720.0, + "grad_norm": 2.5006881318170917, + "language_loss": 0.85238421, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.87380457, + "num_input_tokens_seen": 159558210, + "step": 7445, + "time_per_iteration": 2.6802477836608887 + }, + { + "auxiliary_loss_clip": 0.01087309, + "auxiliary_loss_mlp": 0.0104165, + "balance_loss_clip": 1.04073787, + "balance_loss_mlp": 1.02471042, + "epoch": 0.4476777393656997, + "flos": 22528667324160.0, + "grad_norm": 3.110631371373827, + "language_loss": 0.63355798, + "learning_rate": 2.432557082778765e-06, + "loss": 0.65484762, + "num_input_tokens_seen": 159577920, + "step": 7446, + "time_per_iteration": 2.746697187423706 + }, + { + "auxiliary_loss_clip": 0.01039011, + "auxiliary_loss_mlp": 0.01002627, + "balance_loss_clip": 1.02036047, + "balance_loss_mlp": 1.00081527, + "epoch": 0.4477378626183677, + "flos": 49017133877760.0, + "grad_norm": 0.738380684617154, + "language_loss": 0.50261772, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.5230341, + "num_input_tokens_seen": 159632295, + "step": 7447, + "time_per_iteration": 3.0176138877868652 + }, + { + "auxiliary_loss_clip": 0.01047805, + "auxiliary_loss_mlp": 0.0099926, + "balance_loss_clip": 1.0195471, + "balance_loss_mlp": 0.9976145, + "epoch": 0.44779798587103564, + "flos": 56542179392640.0, + "grad_norm": 0.7822716011451579, + "language_loss": 0.59427667, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.61474735, + "num_input_tokens_seen": 159698435, + "step": 7448, + "time_per_iteration": 3.1922085285186768 + }, + { + "auxiliary_loss_clip": 0.01093955, + "auxiliary_loss_mlp": 0.01032649, + "balance_loss_clip": 1.04417181, + "balance_loss_mlp": 1.01983976, + "epoch": 0.4478581091237036, + "flos": 46498536040320.0, + "grad_norm": 1.6983811072489297, + "language_loss": 0.58952618, + "learning_rate": 2.431416277672789e-06, + "loss": 0.61079222, + "num_input_tokens_seen": 159722150, + "step": 7449, + "time_per_iteration": 2.9170258045196533 + }, + { + "auxiliary_loss_clip": 0.01096033, + "auxiliary_loss_mlp": 0.01031648, + "balance_loss_clip": 1.04244077, + "balance_loss_mlp": 1.01851141, + "epoch": 0.4479182323763716, + "flos": 20814363849600.0, + "grad_norm": 2.0305308033418497, + "language_loss": 0.8022064, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.82348317, + "num_input_tokens_seen": 159740550, + "step": 7450, + "time_per_iteration": 2.640101671218872 + }, + { + "auxiliary_loss_clip": 0.01128944, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.04747844, + "balance_loss_mlp": 1.02609515, + "epoch": 0.44797835562903954, + "flos": 14245979212800.0, + "grad_norm": 2.0706353062233878, + "language_loss": 0.79404807, + "learning_rate": 2.430655659114697e-06, + "loss": 0.81572944, + "num_input_tokens_seen": 159758245, + "step": 7451, + "time_per_iteration": 2.6094324588775635 + }, + { + "auxiliary_loss_clip": 0.01008441, + "auxiliary_loss_mlp": 0.01004662, + "balance_loss_clip": 1.02162147, + "balance_loss_mlp": 1.00313568, + "epoch": 0.4480384788817075, + "flos": 63534560169600.0, + "grad_norm": 0.8263901394620045, + "language_loss": 0.62780499, + "learning_rate": 2.430275325332681e-06, + "loss": 0.64793605, + "num_input_tokens_seen": 159826790, + "step": 7452, + "time_per_iteration": 3.3816721439361572 + }, + { + "auxiliary_loss_clip": 0.01128154, + "auxiliary_loss_mlp": 0.01034079, + "balance_loss_clip": 1.04587567, + "balance_loss_mlp": 1.01958907, + "epoch": 0.44809860213437547, + "flos": 21652626522240.0, + "grad_norm": 1.717773614702603, + "language_loss": 0.62656605, + "learning_rate": 2.429894975234582e-06, + "loss": 0.64818835, + "num_input_tokens_seen": 159845805, + "step": 7453, + "time_per_iteration": 2.6495423316955566 + }, + { + "auxiliary_loss_clip": 0.0102644, + "auxiliary_loss_mlp": 0.01007957, + "balance_loss_clip": 1.01617622, + "balance_loss_mlp": 1.00627661, + "epoch": 0.44815872538704343, + "flos": 69190634246400.0, + "grad_norm": 0.7452851567935764, + "language_loss": 0.57032764, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.59067166, + "num_input_tokens_seen": 159898860, + "step": 7454, + "time_per_iteration": 3.0483179092407227 + }, + { + "auxiliary_loss_clip": 0.0110232, + "auxiliary_loss_mlp": 0.0104097, + "balance_loss_clip": 1.04301405, + "balance_loss_mlp": 1.02651, + "epoch": 0.4482188486397114, + "flos": 12598289510400.0, + "grad_norm": 2.1814246614415795, + "language_loss": 0.75516129, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.77659416, + "num_input_tokens_seen": 159911555, + "step": 7455, + "time_per_iteration": 2.639425039291382 + }, + { + "auxiliary_loss_clip": 0.01103634, + "auxiliary_loss_mlp": 0.0103636, + "balance_loss_clip": 1.0440948, + "balance_loss_mlp": 1.02343822, + "epoch": 0.44827897189237936, + "flos": 34058182631040.0, + "grad_norm": 1.8295063999245702, + "language_loss": 0.75630772, + "learning_rate": 2.428753827188016e-06, + "loss": 0.7777077, + "num_input_tokens_seen": 159931470, + "step": 7456, + "time_per_iteration": 2.809356451034546 + }, + { + "auxiliary_loss_clip": 0.01130195, + "auxiliary_loss_mlp": 0.01036439, + "balance_loss_clip": 1.05033028, + "balance_loss_mlp": 1.02355289, + "epoch": 0.44833909514504733, + "flos": 25147416280320.0, + "grad_norm": 60.5899352460765, + "language_loss": 0.76306677, + "learning_rate": 2.428373411969818e-06, + "loss": 0.78473306, + "num_input_tokens_seen": 159946115, + "step": 7457, + "time_per_iteration": 2.632532835006714 + }, + { + "auxiliary_loss_clip": 0.01111792, + "auxiliary_loss_mlp": 0.01031449, + "balance_loss_clip": 1.04215193, + "balance_loss_mlp": 1.01695263, + "epoch": 0.4483992183977153, + "flos": 16179984224640.0, + "grad_norm": 2.8627685619088203, + "language_loss": 0.68479908, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.70623147, + "num_input_tokens_seen": 159963915, + "step": 7458, + "time_per_iteration": 2.6376359462738037 + }, + { + "auxiliary_loss_clip": 0.01091284, + "auxiliary_loss_mlp": 0.01033162, + "balance_loss_clip": 1.04267764, + "balance_loss_mlp": 1.018332, + "epoch": 0.44845934165038326, + "flos": 17746048270080.0, + "grad_norm": 1.5800915665139277, + "language_loss": 0.71851492, + "learning_rate": 2.427612532815961e-06, + "loss": 0.73975933, + "num_input_tokens_seen": 159982140, + "step": 7459, + "time_per_iteration": 2.713164806365967 + }, + { + "auxiliary_loss_clip": 0.01108578, + "auxiliary_loss_mlp": 0.01036526, + "balance_loss_clip": 1.04210949, + "balance_loss_mlp": 1.02282834, + "epoch": 0.4485194649030513, + "flos": 21835914647040.0, + "grad_norm": 1.672173614468041, + "language_loss": 0.70216429, + "learning_rate": 2.427232068909154e-06, + "loss": 0.72361535, + "num_input_tokens_seen": 160002280, + "step": 7460, + "time_per_iteration": 2.6243271827697754 + }, + { + "auxiliary_loss_clip": 0.01129261, + "auxiliary_loss_mlp": 0.01038736, + "balance_loss_clip": 1.04698896, + "balance_loss_mlp": 1.02463329, + "epoch": 0.44857958815571924, + "flos": 20084515401600.0, + "grad_norm": 1.9532472719910148, + "language_loss": 0.77566743, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.79734743, + "num_input_tokens_seen": 160020260, + "step": 7461, + "time_per_iteration": 4.114460468292236 + }, + { + "auxiliary_loss_clip": 0.01128704, + "auxiliary_loss_mlp": 0.01034261, + "balance_loss_clip": 1.0455538, + "balance_loss_mlp": 1.02091575, + "epoch": 0.4486397114083872, + "flos": 27053519402880.0, + "grad_norm": 1.943200777150693, + "language_loss": 0.67738903, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.69901872, + "num_input_tokens_seen": 160040240, + "step": 7462, + "time_per_iteration": 5.671550035476685 + }, + { + "auxiliary_loss_clip": 0.01046056, + "auxiliary_loss_mlp": 0.01002183, + "balance_loss_clip": 1.0179913, + "balance_loss_mlp": 1.0006094, + "epoch": 0.4486998346610552, + "flos": 67321195931520.0, + "grad_norm": 0.7528637907126196, + "language_loss": 0.5449208, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.5654031, + "num_input_tokens_seen": 160093865, + "step": 7463, + "time_per_iteration": 3.132819890975952 + }, + { + "auxiliary_loss_clip": 0.01117188, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.04449058, + "balance_loss_mlp": 1.01758814, + "epoch": 0.44875995791372314, + "flos": 27636816360960.0, + "grad_norm": 2.3886431821168954, + "language_loss": 0.7580359, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.77951968, + "num_input_tokens_seen": 160113590, + "step": 7464, + "time_per_iteration": 2.7005674839019775 + }, + { + "auxiliary_loss_clip": 0.01116572, + "auxiliary_loss_mlp": 0.01037604, + "balance_loss_clip": 1.04709184, + "balance_loss_mlp": 1.02484834, + "epoch": 0.4488200811663911, + "flos": 13005947940480.0, + "grad_norm": 1.7787597626645963, + "language_loss": 0.74147099, + "learning_rate": 2.425329506653441e-06, + "loss": 0.76301277, + "num_input_tokens_seen": 160131795, + "step": 7465, + "time_per_iteration": 4.423643112182617 + }, + { + "auxiliary_loss_clip": 0.01110783, + "auxiliary_loss_mlp": 0.01040781, + "balance_loss_clip": 1.04708648, + "balance_loss_mlp": 1.02503395, + "epoch": 0.44888020441905907, + "flos": 27489977562240.0, + "grad_norm": 2.0439366025173347, + "language_loss": 0.7991035, + "learning_rate": 2.424948945758966e-06, + "loss": 0.82061917, + "num_input_tokens_seen": 160150635, + "step": 7466, + "time_per_iteration": 2.7003092765808105 + }, + { + "auxiliary_loss_clip": 0.01110719, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.04898739, + "balance_loss_mlp": 1.02141774, + "epoch": 0.44894032767172704, + "flos": 18259678800000.0, + "grad_norm": 2.4307522297147357, + "language_loss": 0.81000906, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.83146888, + "num_input_tokens_seen": 160168615, + "step": 7467, + "time_per_iteration": 2.6656453609466553 + }, + { + "auxiliary_loss_clip": 0.01074952, + "auxiliary_loss_mlp": 0.01032302, + "balance_loss_clip": 1.04580259, + "balance_loss_mlp": 1.01924217, + "epoch": 0.449000450924395, + "flos": 21579835610880.0, + "grad_norm": 2.1126461235100726, + "language_loss": 0.74707794, + "learning_rate": 2.424187775642129e-06, + "loss": 0.76815045, + "num_input_tokens_seen": 160187295, + "step": 7468, + "time_per_iteration": 2.7112534046173096 + }, + { + "auxiliary_loss_clip": 0.01097239, + "auxiliary_loss_mlp": 0.01031291, + "balance_loss_clip": 1.04224133, + "balance_loss_mlp": 1.01881611, + "epoch": 0.44906057417706297, + "flos": 17967904623360.0, + "grad_norm": 1.845085412210932, + "language_loss": 0.71481991, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.7361052, + "num_input_tokens_seen": 160205115, + "step": 7469, + "time_per_iteration": 2.680678606033325 + }, + { + "auxiliary_loss_clip": 0.01115577, + "auxiliary_loss_mlp": 0.01040939, + "balance_loss_clip": 1.04739857, + "balance_loss_mlp": 1.02700388, + "epoch": 0.44912069742973093, + "flos": 20047347803520.0, + "grad_norm": 1.9353970520381958, + "language_loss": 0.71990728, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.74147248, + "num_input_tokens_seen": 160222580, + "step": 7470, + "time_per_iteration": 2.6266865730285645 + }, + { + "auxiliary_loss_clip": 0.01085169, + "auxiliary_loss_mlp": 0.01037894, + "balance_loss_clip": 1.04166925, + "balance_loss_mlp": 1.02263546, + "epoch": 0.4491808206823989, + "flos": 21033526682880.0, + "grad_norm": 1.7352200929350259, + "language_loss": 0.76839507, + "learning_rate": 2.423045899863634e-06, + "loss": 0.78962576, + "num_input_tokens_seen": 160241520, + "step": 7471, + "time_per_iteration": 2.692333698272705 + }, + { + "auxiliary_loss_clip": 0.0112922, + "auxiliary_loss_mlp": 0.010358, + "balance_loss_clip": 1.04736388, + "balance_loss_mlp": 1.02259803, + "epoch": 0.44924094393506686, + "flos": 22967136645120.0, + "grad_norm": 1.6949435247941296, + "language_loss": 0.70284784, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.72449803, + "num_input_tokens_seen": 160261815, + "step": 7472, + "time_per_iteration": 2.714059829711914 + }, + { + "auxiliary_loss_clip": 0.01033495, + "auxiliary_loss_mlp": 0.01004013, + "balance_loss_clip": 1.01477528, + "balance_loss_mlp": 1.00226104, + "epoch": 0.4493010671877349, + "flos": 59233467864960.0, + "grad_norm": 0.7390973196636706, + "language_loss": 0.6168009, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.63717604, + "num_input_tokens_seen": 160317070, + "step": 7473, + "time_per_iteration": 3.1489851474761963 + }, + { + "auxiliary_loss_clip": 0.01131224, + "auxiliary_loss_mlp": 0.00771593, + "balance_loss_clip": 1.04812014, + "balance_loss_mlp": 1.0004611, + "epoch": 0.44936119044040285, + "flos": 18004892653440.0, + "grad_norm": 2.3114379148666817, + "language_loss": 0.78279471, + "learning_rate": 2.421903879707657e-06, + "loss": 0.80182284, + "num_input_tokens_seen": 160334980, + "step": 7474, + "time_per_iteration": 2.5561118125915527 + }, + { + "auxiliary_loss_clip": 0.01074804, + "auxiliary_loss_mlp": 0.01040047, + "balance_loss_clip": 1.03983307, + "balance_loss_mlp": 1.0254494, + "epoch": 0.4494213136930708, + "flos": 21251827589760.0, + "grad_norm": 1.6204554836894525, + "language_loss": 0.72024751, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.74139607, + "num_input_tokens_seen": 160354500, + "step": 7475, + "time_per_iteration": 2.7745461463928223 + }, + { + "auxiliary_loss_clip": 0.01080301, + "auxiliary_loss_mlp": 0.01041054, + "balance_loss_clip": 1.04167461, + "balance_loss_mlp": 1.02563405, + "epoch": 0.4494814369457388, + "flos": 27418695022080.0, + "grad_norm": 2.241823557245511, + "language_loss": 0.76592773, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.78714132, + "num_input_tokens_seen": 160373650, + "step": 7476, + "time_per_iteration": 2.7856860160827637 + }, + { + "auxiliary_loss_clip": 0.01122132, + "auxiliary_loss_mlp": 0.00773102, + "balance_loss_clip": 1.04493368, + "balance_loss_mlp": 1.00047529, + "epoch": 0.44954156019840674, + "flos": 22854053652480.0, + "grad_norm": 4.385259299883037, + "language_loss": 0.72134888, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.74030131, + "num_input_tokens_seen": 160393430, + "step": 7477, + "time_per_iteration": 2.641645669937134 + }, + { + "auxiliary_loss_clip": 0.01103781, + "auxiliary_loss_mlp": 0.01047956, + "balance_loss_clip": 1.04083133, + "balance_loss_mlp": 1.03148091, + "epoch": 0.4496016834510747, + "flos": 17201570935680.0, + "grad_norm": 2.795464855062127, + "language_loss": 0.67799896, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.69951636, + "num_input_tokens_seen": 160410545, + "step": 7478, + "time_per_iteration": 2.6307947635650635 + }, + { + "auxiliary_loss_clip": 0.01102543, + "auxiliary_loss_mlp": 0.01038923, + "balance_loss_clip": 1.04405093, + "balance_loss_mlp": 1.02537441, + "epoch": 0.4496618067037427, + "flos": 18916628595840.0, + "grad_norm": 1.8532543047361745, + "language_loss": 0.89243561, + "learning_rate": 2.420000193000779e-06, + "loss": 0.91385025, + "num_input_tokens_seen": 160428105, + "step": 7479, + "time_per_iteration": 2.733828544616699 + }, + { + "auxiliary_loss_clip": 0.01068922, + "auxiliary_loss_mlp": 0.01043272, + "balance_loss_clip": 1.04273605, + "balance_loss_mlp": 1.02804279, + "epoch": 0.44972192995641064, + "flos": 21031659175680.0, + "grad_norm": 2.916606412127397, + "language_loss": 0.75539804, + "learning_rate": 2.419619407822302e-06, + "loss": 0.77652001, + "num_input_tokens_seen": 160448815, + "step": 7480, + "time_per_iteration": 2.8518130779266357 + }, + { + "auxiliary_loss_clip": 0.01095249, + "auxiliary_loss_mlp": 0.01035055, + "balance_loss_clip": 1.04253781, + "balance_loss_mlp": 1.02012968, + "epoch": 0.4497820532090786, + "flos": 20777088510720.0, + "grad_norm": 1.9829776726262367, + "language_loss": 0.79885375, + "learning_rate": 2.419238606731815e-06, + "loss": 0.82015675, + "num_input_tokens_seen": 160465940, + "step": 7481, + "time_per_iteration": 2.7299835681915283 + }, + { + "auxiliary_loss_clip": 0.01102494, + "auxiliary_loss_mlp": 0.01039566, + "balance_loss_clip": 1.04328001, + "balance_loss_mlp": 1.02454567, + "epoch": 0.44984217646174657, + "flos": 33802606385280.0, + "grad_norm": 1.6381608125682177, + "language_loss": 0.68340528, + "learning_rate": 2.418857789743758e-06, + "loss": 0.70482588, + "num_input_tokens_seen": 160486710, + "step": 7482, + "time_per_iteration": 2.8123154640197754 + }, + { + "auxiliary_loss_clip": 0.01122196, + "auxiliary_loss_mlp": 0.01040775, + "balance_loss_clip": 1.04835725, + "balance_loss_mlp": 1.02638626, + "epoch": 0.44990229971441453, + "flos": 15518365660800.0, + "grad_norm": 2.0379383366397232, + "language_loss": 0.84707004, + "learning_rate": 2.418476956872571e-06, + "loss": 0.86869979, + "num_input_tokens_seen": 160503405, + "step": 7483, + "time_per_iteration": 2.718548536300659 + }, + { + "auxiliary_loss_clip": 0.01099077, + "auxiliary_loss_mlp": 0.01046214, + "balance_loss_clip": 1.04296637, + "balance_loss_mlp": 1.03027594, + "epoch": 0.4499624229670825, + "flos": 29861913191040.0, + "grad_norm": 1.8017494037756971, + "language_loss": 0.80644262, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.82789552, + "num_input_tokens_seen": 160525080, + "step": 7484, + "time_per_iteration": 2.8435990810394287 + }, + { + "auxiliary_loss_clip": 0.01075163, + "auxiliary_loss_mlp": 0.01037509, + "balance_loss_clip": 1.03809166, + "balance_loss_mlp": 1.02145171, + "epoch": 0.45002254621975046, + "flos": 18513674847360.0, + "grad_norm": 2.526248303429359, + "language_loss": 0.75311351, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.77424026, + "num_input_tokens_seen": 160540895, + "step": 7485, + "time_per_iteration": 2.7453646659851074 + }, + { + "auxiliary_loss_clip": 0.01027401, + "auxiliary_loss_mlp": 0.0100295, + "balance_loss_clip": 1.01817155, + "balance_loss_mlp": 1.00125754, + "epoch": 0.4500826694724185, + "flos": 70420394229120.0, + "grad_norm": 0.7859680562883086, + "language_loss": 0.58644986, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.60675335, + "num_input_tokens_seen": 160598270, + "step": 7486, + "time_per_iteration": 3.2535924911499023 + }, + { + "auxiliary_loss_clip": 0.0111614, + "auxiliary_loss_mlp": 0.01045183, + "balance_loss_clip": 1.04657292, + "balance_loss_mlp": 1.02917325, + "epoch": 0.45014279272508645, + "flos": 15778897983360.0, + "grad_norm": 2.484631064514228, + "language_loss": 0.83677804, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.85839128, + "num_input_tokens_seen": 160614720, + "step": 7487, + "time_per_iteration": 2.7236413955688477 + }, + { + "auxiliary_loss_clip": 0.01128709, + "auxiliary_loss_mlp": 0.01039106, + "balance_loss_clip": 1.04632056, + "balance_loss_mlp": 1.02443182, + "epoch": 0.4502029159777544, + "flos": 21799573061760.0, + "grad_norm": 1.5508029399024128, + "language_loss": 0.77568138, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.79735959, + "num_input_tokens_seen": 160635170, + "step": 7488, + "time_per_iteration": 2.6660585403442383 + }, + { + "auxiliary_loss_clip": 0.0112874, + "auxiliary_loss_mlp": 0.01045145, + "balance_loss_clip": 1.04882014, + "balance_loss_mlp": 1.02954042, + "epoch": 0.4502630392304224, + "flos": 28767966531840.0, + "grad_norm": 1.97851616048007, + "language_loss": 0.72073781, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.74247664, + "num_input_tokens_seen": 160654490, + "step": 7489, + "time_per_iteration": 2.7274820804595947 + }, + { + "auxiliary_loss_clip": 0.01109274, + "auxiliary_loss_mlp": 0.01039798, + "balance_loss_clip": 1.04584038, + "balance_loss_mlp": 1.02314413, + "epoch": 0.45032316248309034, + "flos": 15844182952320.0, + "grad_norm": 2.9737823054207926, + "language_loss": 0.6968661, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.71835679, + "num_input_tokens_seen": 160669400, + "step": 7490, + "time_per_iteration": 2.650700569152832 + }, + { + "auxiliary_loss_clip": 0.01026171, + "auxiliary_loss_mlp": 0.01004705, + "balance_loss_clip": 1.0231657, + "balance_loss_mlp": 1.00323248, + "epoch": 0.4503832857357583, + "flos": 57853600945920.0, + "grad_norm": 0.7292674820176653, + "language_loss": 0.56675166, + "learning_rate": 2.415429723843495e-06, + "loss": 0.58706039, + "num_input_tokens_seen": 160733820, + "step": 7491, + "time_per_iteration": 3.1893656253814697 + }, + { + "auxiliary_loss_clip": 0.01116518, + "auxiliary_loss_mlp": 0.01037403, + "balance_loss_clip": 1.04746497, + "balance_loss_mlp": 1.02327061, + "epoch": 0.4504434089884263, + "flos": 23878082488320.0, + "grad_norm": 1.6154687272881363, + "language_loss": 0.7939685, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.81550771, + "num_input_tokens_seen": 160753175, + "step": 7492, + "time_per_iteration": 2.7314138412475586 + }, + { + "auxiliary_loss_clip": 0.010986, + "auxiliary_loss_mlp": 0.00775969, + "balance_loss_clip": 1.04425228, + "balance_loss_mlp": 1.000494, + "epoch": 0.45050353224109424, + "flos": 17785083375360.0, + "grad_norm": 2.875303360797025, + "language_loss": 0.92825645, + "learning_rate": 2.4146677577659573e-06, + "loss": 0.94700211, + "num_input_tokens_seen": 160768310, + "step": 7493, + "time_per_iteration": 2.7123935222625732 + }, + { + "auxiliary_loss_clip": 0.01039208, + "auxiliary_loss_mlp": 0.01001589, + "balance_loss_clip": 1.02041435, + "balance_loss_mlp": 0.99994355, + "epoch": 0.4505636554937622, + "flos": 65063420703360.0, + "grad_norm": 0.8110713299155351, + "language_loss": 0.62929082, + "learning_rate": 2.4142867511336e-06, + "loss": 0.64969873, + "num_input_tokens_seen": 160827370, + "step": 7494, + "time_per_iteration": 3.289635181427002 + }, + { + "auxiliary_loss_clip": 0.01129658, + "auxiliary_loss_mlp": 0.01035034, + "balance_loss_clip": 1.04754305, + "balance_loss_mlp": 1.02150989, + "epoch": 0.45062377874643017, + "flos": 22200084685440.0, + "grad_norm": 1.7474777674384385, + "language_loss": 0.82263976, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.84428668, + "num_input_tokens_seen": 160849140, + "step": 7495, + "time_per_iteration": 2.659642219543457 + }, + { + "auxiliary_loss_clip": 0.01115544, + "auxiliary_loss_mlp": 0.01041634, + "balance_loss_clip": 1.04483461, + "balance_loss_mlp": 1.02449155, + "epoch": 0.45068390199909814, + "flos": 37670293186560.0, + "grad_norm": 1.8332713503860085, + "language_loss": 0.86039978, + "learning_rate": 2.41352469075395e-06, + "loss": 0.8819716, + "num_input_tokens_seen": 160871280, + "step": 7496, + "time_per_iteration": 2.798741579055786 + }, + { + "auxiliary_loss_clip": 0.01134499, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.04969478, + "balance_loss_mlp": 1.02054274, + "epoch": 0.4507440252517661, + "flos": 22302501338880.0, + "grad_norm": 2.0558646291387066, + "language_loss": 0.76101983, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.78272235, + "num_input_tokens_seen": 160888625, + "step": 7497, + "time_per_iteration": 2.6553680896759033 + }, + { + "auxiliary_loss_clip": 0.01098074, + "auxiliary_loss_mlp": 0.01037956, + "balance_loss_clip": 1.04377723, + "balance_loss_mlp": 1.02352023, + "epoch": 0.45080414850443407, + "flos": 13188374138880.0, + "grad_norm": 2.277785969464064, + "language_loss": 0.75305939, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.77441967, + "num_input_tokens_seen": 160907040, + "step": 7498, + "time_per_iteration": 2.6950063705444336 + }, + { + "auxiliary_loss_clip": 0.01133264, + "auxiliary_loss_mlp": 0.01044893, + "balance_loss_clip": 1.04848719, + "balance_loss_mlp": 1.02897298, + "epoch": 0.4508642717571021, + "flos": 21944939402880.0, + "grad_norm": 3.3346599205762826, + "language_loss": 0.70080638, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.72258794, + "num_input_tokens_seen": 160927115, + "step": 7499, + "time_per_iteration": 2.6134774684906006 + }, + { + "auxiliary_loss_clip": 0.01084574, + "auxiliary_loss_mlp": 0.0103806, + "balance_loss_clip": 1.04212165, + "balance_loss_mlp": 1.02309906, + "epoch": 0.45092439500977005, + "flos": 23367468700800.0, + "grad_norm": 1.9346658302408082, + "language_loss": 0.77361268, + "learning_rate": 2.412000381939477e-06, + "loss": 0.79483902, + "num_input_tokens_seen": 160944405, + "step": 7500, + "time_per_iteration": 4.306777000427246 + }, + { + "auxiliary_loss_clip": 0.01084228, + "auxiliary_loss_mlp": 0.01034656, + "balance_loss_clip": 1.04249573, + "balance_loss_mlp": 1.02007651, + "epoch": 0.450984518262438, + "flos": 20772958446720.0, + "grad_norm": 1.9176241989159464, + "language_loss": 0.63056326, + "learning_rate": 2.411619265641992e-06, + "loss": 0.65175211, + "num_input_tokens_seen": 160961345, + "step": 7501, + "time_per_iteration": 5.803133487701416 + }, + { + "auxiliary_loss_clip": 0.01135547, + "auxiliary_loss_mlp": 0.01040046, + "balance_loss_clip": 1.04915273, + "balance_loss_mlp": 1.02445376, + "epoch": 0.451044641515106, + "flos": 17707372300800.0, + "grad_norm": 1.9532762899000093, + "language_loss": 0.84446234, + "learning_rate": 2.411238133735863e-06, + "loss": 0.86621827, + "num_input_tokens_seen": 160977330, + "step": 7502, + "time_per_iteration": 2.604753017425537 + }, + { + "auxiliary_loss_clip": 0.01105383, + "auxiliary_loss_mlp": 0.01036548, + "balance_loss_clip": 1.04670203, + "balance_loss_mlp": 1.02238584, + "epoch": 0.45110476476777395, + "flos": 20594698225920.0, + "grad_norm": 1.3813112457968315, + "language_loss": 0.79642487, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.81784415, + "num_input_tokens_seen": 160997280, + "step": 7503, + "time_per_iteration": 2.666677236557007 + }, + { + "auxiliary_loss_clip": 0.01104325, + "auxiliary_loss_mlp": 0.01036781, + "balance_loss_clip": 1.04764807, + "balance_loss_mlp": 1.02240419, + "epoch": 0.4511648880204419, + "flos": 16034043265920.0, + "grad_norm": 2.051596804130354, + "language_loss": 0.81191939, + "learning_rate": 2.410475823155484e-06, + "loss": 0.83333045, + "num_input_tokens_seen": 161014235, + "step": 7504, + "time_per_iteration": 4.276456117630005 + }, + { + "auxiliary_loss_clip": 0.01087433, + "auxiliary_loss_mlp": 0.01038305, + "balance_loss_clip": 1.04069161, + "balance_loss_mlp": 1.02469158, + "epoch": 0.4512250112731099, + "flos": 23978811202560.0, + "grad_norm": 1.5834485358881918, + "language_loss": 0.63315797, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.65441537, + "num_input_tokens_seen": 161032360, + "step": 7505, + "time_per_iteration": 2.947556734085083 + }, + { + "auxiliary_loss_clip": 0.01014942, + "auxiliary_loss_mlp": 0.01003244, + "balance_loss_clip": 1.02198029, + "balance_loss_mlp": 1.00188541, + "epoch": 0.45128513452577784, + "flos": 71462308037760.0, + "grad_norm": 0.8317919198459461, + "language_loss": 0.58857071, + "learning_rate": 2.409713450313968e-06, + "loss": 0.60875255, + "num_input_tokens_seen": 161091360, + "step": 7506, + "time_per_iteration": 3.395158052444458 + }, + { + "auxiliary_loss_clip": 0.01075605, + "auxiliary_loss_mlp": 0.01037451, + "balance_loss_clip": 1.04096067, + "balance_loss_mlp": 1.02287173, + "epoch": 0.4513452577784458, + "flos": 22090844448000.0, + "grad_norm": 1.7149339287343461, + "language_loss": 0.79334831, + "learning_rate": 2.40933224058142e-06, + "loss": 0.81447887, + "num_input_tokens_seen": 161110825, + "step": 7507, + "time_per_iteration": 2.8281381130218506 + }, + { + "auxiliary_loss_clip": 0.01091142, + "auxiliary_loss_mlp": 0.01036706, + "balance_loss_clip": 1.0425905, + "balance_loss_mlp": 1.02066064, + "epoch": 0.4514053810311138, + "flos": 24276403382400.0, + "grad_norm": 1.5823194059388275, + "language_loss": 0.73703611, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.75831455, + "num_input_tokens_seen": 161130685, + "step": 7508, + "time_per_iteration": 2.75742506980896 + }, + { + "auxiliary_loss_clip": 0.01118642, + "auxiliary_loss_mlp": 0.0103619, + "balance_loss_clip": 1.04927611, + "balance_loss_mlp": 1.02279091, + "epoch": 0.45146550428378174, + "flos": 17886781756800.0, + "grad_norm": 2.075832981658432, + "language_loss": 0.79118419, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.81273252, + "num_input_tokens_seen": 161147555, + "step": 7509, + "time_per_iteration": 2.6641790866851807 + }, + { + "auxiliary_loss_clip": 0.01130929, + "auxiliary_loss_mlp": 0.01034567, + "balance_loss_clip": 1.05022097, + "balance_loss_mlp": 1.02104306, + "epoch": 0.4515256275364497, + "flos": 24243437675520.0, + "grad_norm": 1.9616298828862797, + "language_loss": 0.73389792, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.75555289, + "num_input_tokens_seen": 161166255, + "step": 7510, + "time_per_iteration": 2.754516839981079 + }, + { + "auxiliary_loss_clip": 0.01129503, + "auxiliary_loss_mlp": 0.01032701, + "balance_loss_clip": 1.04575419, + "balance_loss_mlp": 1.01789534, + "epoch": 0.45158575078911767, + "flos": 20631039811200.0, + "grad_norm": 1.8899584921112549, + "language_loss": 0.77046561, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.79208767, + "num_input_tokens_seen": 161184720, + "step": 7511, + "time_per_iteration": 2.633896589279175 + }, + { + "auxiliary_loss_clip": 0.01119455, + "auxiliary_loss_mlp": 0.01033368, + "balance_loss_clip": 1.04665303, + "balance_loss_mlp": 1.01832986, + "epoch": 0.45164587404178563, + "flos": 23327751237120.0, + "grad_norm": 1.8239087865443961, + "language_loss": 0.78791374, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.80944192, + "num_input_tokens_seen": 161204360, + "step": 7512, + "time_per_iteration": 2.701643466949463 + }, + { + "auxiliary_loss_clip": 0.01094327, + "auxiliary_loss_mlp": 0.01039327, + "balance_loss_clip": 1.04103267, + "balance_loss_mlp": 1.02404392, + "epoch": 0.45170599729445365, + "flos": 23805973935360.0, + "grad_norm": 2.0955290596831713, + "language_loss": 0.87512183, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.89645839, + "num_input_tokens_seen": 161223575, + "step": 7513, + "time_per_iteration": 2.716236114501953 + }, + { + "auxiliary_loss_clip": 0.01110578, + "auxiliary_loss_mlp": 0.0103311, + "balance_loss_clip": 1.0445292, + "balance_loss_mlp": 1.02031827, + "epoch": 0.4517661205471216, + "flos": 23512942782720.0, + "grad_norm": 2.109318524386585, + "language_loss": 0.6707387, + "learning_rate": 2.406663338649419e-06, + "loss": 0.69217563, + "num_input_tokens_seen": 161243805, + "step": 7514, + "time_per_iteration": 2.665377140045166 + }, + { + "auxiliary_loss_clip": 0.01113013, + "auxiliary_loss_mlp": 0.0103579, + "balance_loss_clip": 1.04554498, + "balance_loss_mlp": 1.01995873, + "epoch": 0.4518262437997896, + "flos": 23513948363520.0, + "grad_norm": 2.2260653694398242, + "language_loss": 0.69152886, + "learning_rate": 2.406282005146318e-06, + "loss": 0.71301687, + "num_input_tokens_seen": 161261450, + "step": 7515, + "time_per_iteration": 2.6233787536621094 + }, + { + "auxiliary_loss_clip": 0.01114597, + "auxiliary_loss_mlp": 0.01038013, + "balance_loss_clip": 1.04228842, + "balance_loss_mlp": 1.02269435, + "epoch": 0.45188636705245755, + "flos": 14568061489920.0, + "grad_norm": 6.104635540487547, + "language_loss": 0.82568568, + "learning_rate": 2.405900656236963e-06, + "loss": 0.84721178, + "num_input_tokens_seen": 161276965, + "step": 7516, + "time_per_iteration": 2.7125158309936523 + }, + { + "auxiliary_loss_clip": 0.0112394, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.04487455, + "balance_loss_mlp": 1.02003694, + "epoch": 0.4519464903051255, + "flos": 19901550499200.0, + "grad_norm": 1.657947130481532, + "language_loss": 0.65597039, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.67754936, + "num_input_tokens_seen": 161295375, + "step": 7517, + "time_per_iteration": 2.6732585430145264 + }, + { + "auxiliary_loss_clip": 0.01091101, + "auxiliary_loss_mlp": 0.01032878, + "balance_loss_clip": 1.04268789, + "balance_loss_mlp": 1.02015853, + "epoch": 0.4520066135577935, + "flos": 18844376388480.0, + "grad_norm": 2.0502430920821904, + "language_loss": 0.63127112, + "learning_rate": 2.405137912257333e-06, + "loss": 0.65251088, + "num_input_tokens_seen": 161313010, + "step": 7518, + "time_per_iteration": 2.6873538494110107 + }, + { + "auxiliary_loss_clip": 0.01116444, + "auxiliary_loss_mlp": 0.01033811, + "balance_loss_clip": 1.0465678, + "balance_loss_mlp": 1.02015519, + "epoch": 0.45206673681046144, + "flos": 48214419713280.0, + "grad_norm": 1.68859992173611, + "language_loss": 0.59658802, + "learning_rate": 2.404756517215982e-06, + "loss": 0.61809057, + "num_input_tokens_seen": 161336690, + "step": 7519, + "time_per_iteration": 2.8561198711395264 + }, + { + "auxiliary_loss_clip": 0.01116298, + "auxiliary_loss_mlp": 0.01038351, + "balance_loss_clip": 1.0457139, + "balance_loss_mlp": 1.02468395, + "epoch": 0.4521268600631294, + "flos": 23842171866240.0, + "grad_norm": 1.5141513880128057, + "language_loss": 0.72439361, + "learning_rate": 2.404375106826223e-06, + "loss": 0.74594009, + "num_input_tokens_seen": 161357845, + "step": 7520, + "time_per_iteration": 2.709179162979126 + }, + { + "auxiliary_loss_clip": 0.0110396, + "auxiliary_loss_mlp": 0.01036085, + "balance_loss_clip": 1.04404962, + "balance_loss_mlp": 1.02297747, + "epoch": 0.4521869833157974, + "flos": 18843622202880.0, + "grad_norm": 2.131399149186965, + "language_loss": 0.75379634, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.77519679, + "num_input_tokens_seen": 161375160, + "step": 7521, + "time_per_iteration": 2.78236722946167 + }, + { + "auxiliary_loss_clip": 0.01109339, + "auxiliary_loss_mlp": 0.01039668, + "balance_loss_clip": 1.04502964, + "balance_loss_mlp": 1.02507663, + "epoch": 0.45224710656846534, + "flos": 19788072456960.0, + "grad_norm": 2.2802922264962247, + "language_loss": 0.68217206, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.70366216, + "num_input_tokens_seen": 161393690, + "step": 7522, + "time_per_iteration": 2.698141574859619 + }, + { + "auxiliary_loss_clip": 0.01111702, + "auxiliary_loss_mlp": 0.01036701, + "balance_loss_clip": 1.04239058, + "balance_loss_mlp": 1.02306962, + "epoch": 0.4523072298211333, + "flos": 28256131681920.0, + "grad_norm": 1.6149288487041198, + "language_loss": 0.6114409, + "learning_rate": 2.403230783711134e-06, + "loss": 0.63292497, + "num_input_tokens_seen": 161415015, + "step": 7523, + "time_per_iteration": 2.765838623046875 + }, + { + "auxiliary_loss_clip": 0.01122412, + "auxiliary_loss_mlp": 0.01039402, + "balance_loss_clip": 1.04672575, + "balance_loss_mlp": 1.02425027, + "epoch": 0.45236735307380127, + "flos": 11181039511680.0, + "grad_norm": 2.0249866031396837, + "language_loss": 0.78044772, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.80206585, + "num_input_tokens_seen": 161432940, + "step": 7524, + "time_per_iteration": 2.6178715229034424 + }, + { + "auxiliary_loss_clip": 0.01083067, + "auxiliary_loss_mlp": 0.0103962, + "balance_loss_clip": 1.04386139, + "balance_loss_mlp": 1.02560115, + "epoch": 0.45242747632646924, + "flos": 22601386408320.0, + "grad_norm": 2.4629173570449447, + "language_loss": 0.63756073, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.65878761, + "num_input_tokens_seen": 161452215, + "step": 7525, + "time_per_iteration": 2.767791509628296 + }, + { + "auxiliary_loss_clip": 0.01116902, + "auxiliary_loss_mlp": 0.01037108, + "balance_loss_clip": 1.04607654, + "balance_loss_mlp": 1.02390599, + "epoch": 0.45248759957913726, + "flos": 18256267008000.0, + "grad_norm": 1.8561008840058875, + "language_loss": 0.78973663, + "learning_rate": 2.402086322981083e-06, + "loss": 0.81127673, + "num_input_tokens_seen": 161469520, + "step": 7526, + "time_per_iteration": 2.6315999031066895 + }, + { + "auxiliary_loss_clip": 0.01098614, + "auxiliary_loss_mlp": 0.01030271, + "balance_loss_clip": 1.04242575, + "balance_loss_mlp": 1.01696694, + "epoch": 0.4525477228318052, + "flos": 22450094323200.0, + "grad_norm": 1.8159616365895555, + "language_loss": 0.80961096, + "learning_rate": 2.40170480555747e-06, + "loss": 0.83089983, + "num_input_tokens_seen": 161487335, + "step": 7527, + "time_per_iteration": 2.6868715286254883 + }, + { + "auxiliary_loss_clip": 0.01092415, + "auxiliary_loss_mlp": 0.01031467, + "balance_loss_clip": 1.04517341, + "balance_loss_mlp": 1.01763892, + "epoch": 0.4526078460844732, + "flos": 29644869260160.0, + "grad_norm": 11.448753069744305, + "language_loss": 0.6562798, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.67751861, + "num_input_tokens_seen": 161510095, + "step": 7528, + "time_per_iteration": 2.816391944885254 + }, + { + "auxiliary_loss_clip": 0.01100127, + "auxiliary_loss_mlp": 0.01033759, + "balance_loss_clip": 1.04077947, + "balance_loss_mlp": 1.02030635, + "epoch": 0.45266796933714115, + "flos": 23039747988480.0, + "grad_norm": 1.584867366654962, + "language_loss": 0.75341809, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.77475703, + "num_input_tokens_seen": 161528725, + "step": 7529, + "time_per_iteration": 2.688854694366455 + }, + { + "auxiliary_loss_clip": 0.01127981, + "auxiliary_loss_mlp": 0.0103405, + "balance_loss_clip": 1.04677176, + "balance_loss_mlp": 1.02092457, + "epoch": 0.4527280925898091, + "flos": 14428405411200.0, + "grad_norm": 2.148118662824089, + "language_loss": 0.73154545, + "learning_rate": 2.400560161948384e-06, + "loss": 0.75316578, + "num_input_tokens_seen": 161547195, + "step": 7530, + "time_per_iteration": 2.626149892807007 + }, + { + "auxiliary_loss_clip": 0.01097205, + "auxiliary_loss_mlp": 0.01036532, + "balance_loss_clip": 1.04691768, + "balance_loss_mlp": 1.0233357, + "epoch": 0.4527882158424771, + "flos": 22925515760640.0, + "grad_norm": 1.600682021317837, + "language_loss": 0.75962186, + "learning_rate": 2.400178583680834e-06, + "loss": 0.78095925, + "num_input_tokens_seen": 161565565, + "step": 7531, + "time_per_iteration": 2.7901298999786377 + }, + { + "auxiliary_loss_clip": 0.01122835, + "auxiliary_loss_mlp": 0.01036019, + "balance_loss_clip": 1.04418015, + "balance_loss_mlp": 1.02203524, + "epoch": 0.45284833909514505, + "flos": 25555326105600.0, + "grad_norm": 1.5467116056600763, + "language_loss": 0.66987002, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.69145852, + "num_input_tokens_seen": 161586630, + "step": 7532, + "time_per_iteration": 2.693523645401001 + }, + { + "auxiliary_loss_clip": 0.01115241, + "auxiliary_loss_mlp": 0.0104024, + "balance_loss_clip": 1.04580188, + "balance_loss_mlp": 1.02715659, + "epoch": 0.452908462347813, + "flos": 18150007599360.0, + "grad_norm": 3.168484665922808, + "language_loss": 0.78721988, + "learning_rate": 2.399415381635768e-06, + "loss": 0.80877471, + "num_input_tokens_seen": 161603815, + "step": 7533, + "time_per_iteration": 2.6418774127960205 + }, + { + "auxiliary_loss_clip": 0.01101942, + "auxiliary_loss_mlp": 0.01039812, + "balance_loss_clip": 1.04315686, + "balance_loss_mlp": 1.0244813, + "epoch": 0.452968585600481, + "flos": 19062749122560.0, + "grad_norm": 2.220433880382594, + "language_loss": 0.83064616, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.85206366, + "num_input_tokens_seen": 161622900, + "step": 7534, + "time_per_iteration": 2.751016855239868 + }, + { + "auxiliary_loss_clip": 0.01102917, + "auxiliary_loss_mlp": 0.0103851, + "balance_loss_clip": 1.04744101, + "balance_loss_mlp": 1.02389479, + "epoch": 0.45302870885314894, + "flos": 22051737515520.0, + "grad_norm": 1.8531826529396993, + "language_loss": 0.76665461, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.78806889, + "num_input_tokens_seen": 161641700, + "step": 7535, + "time_per_iteration": 2.6611855030059814 + }, + { + "auxiliary_loss_clip": 0.01083875, + "auxiliary_loss_mlp": 0.01036335, + "balance_loss_clip": 1.04374576, + "balance_loss_mlp": 1.02368724, + "epoch": 0.4530888321058169, + "flos": 20376217751040.0, + "grad_norm": 1.5302063461742579, + "language_loss": 0.80437911, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.82558113, + "num_input_tokens_seen": 161661955, + "step": 7536, + "time_per_iteration": 2.7666051387786865 + }, + { + "auxiliary_loss_clip": 0.01097222, + "auxiliary_loss_mlp": 0.01036263, + "balance_loss_clip": 1.04180908, + "balance_loss_mlp": 1.02248287, + "epoch": 0.4531489553584849, + "flos": 14830425406080.0, + "grad_norm": 2.016168707097938, + "language_loss": 0.76173598, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.78307086, + "num_input_tokens_seen": 161679245, + "step": 7537, + "time_per_iteration": 2.690034866333008 + }, + { + "auxiliary_loss_clip": 0.01118629, + "auxiliary_loss_mlp": 0.01035481, + "balance_loss_clip": 1.04544806, + "balance_loss_mlp": 1.0222249, + "epoch": 0.45320907861115284, + "flos": 21944975316480.0, + "grad_norm": 1.9502516921913984, + "language_loss": 0.75985712, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.78139818, + "num_input_tokens_seen": 161698795, + "step": 7538, + "time_per_iteration": 2.692582130432129 + }, + { + "auxiliary_loss_clip": 0.01037446, + "auxiliary_loss_mlp": 0.01009452, + "balance_loss_clip": 1.01847482, + "balance_loss_mlp": 1.00774765, + "epoch": 0.45326920186382086, + "flos": 66251455038720.0, + "grad_norm": 0.7823640203744525, + "language_loss": 0.62291718, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.64338624, + "num_input_tokens_seen": 161761980, + "step": 7539, + "time_per_iteration": 6.417045593261719 + }, + { + "auxiliary_loss_clip": 0.01129753, + "auxiliary_loss_mlp": 0.01046019, + "balance_loss_clip": 1.04852843, + "balance_loss_mlp": 1.03270316, + "epoch": 0.4533293251164888, + "flos": 14684233052160.0, + "grad_norm": 1.7334435648675772, + "language_loss": 0.65637821, + "learning_rate": 2.396743698142872e-06, + "loss": 0.67813587, + "num_input_tokens_seen": 161779455, + "step": 7540, + "time_per_iteration": 2.7546002864837646 + }, + { + "auxiliary_loss_clip": 0.01106819, + "auxiliary_loss_mlp": 0.01043222, + "balance_loss_clip": 1.0439229, + "balance_loss_mlp": 1.02768898, + "epoch": 0.4533894483691568, + "flos": 22601206840320.0, + "grad_norm": 2.0843332238803587, + "language_loss": 0.84594655, + "learning_rate": 2.396361968778424e-06, + "loss": 0.86744702, + "num_input_tokens_seen": 161798980, + "step": 7541, + "time_per_iteration": 4.3779473304748535 + }, + { + "auxiliary_loss_clip": 0.01103981, + "auxiliary_loss_mlp": 0.01038274, + "balance_loss_clip": 1.04346132, + "balance_loss_mlp": 1.02451134, + "epoch": 0.45344957162182475, + "flos": 34751617666560.0, + "grad_norm": 1.786741767322354, + "language_loss": 0.76398253, + "learning_rate": 2.395980224383889e-06, + "loss": 0.78540504, + "num_input_tokens_seen": 161819745, + "step": 7542, + "time_per_iteration": 2.8061442375183105 + }, + { + "auxiliary_loss_clip": 0.01100521, + "auxiliary_loss_mlp": 0.01030908, + "balance_loss_clip": 1.04320002, + "balance_loss_mlp": 1.01665092, + "epoch": 0.4535096948744927, + "flos": 23550218121600.0, + "grad_norm": 4.384838077420028, + "language_loss": 0.80294377, + "learning_rate": 2.395598464973746e-06, + "loss": 0.82425809, + "num_input_tokens_seen": 161838575, + "step": 7543, + "time_per_iteration": 4.4142186641693115 + }, + { + "auxiliary_loss_clip": 0.01116855, + "auxiliary_loss_mlp": 0.00771625, + "balance_loss_clip": 1.04452896, + "balance_loss_mlp": 1.00043499, + "epoch": 0.4535698181271607, + "flos": 25557552748800.0, + "grad_norm": 1.7946145717938884, + "language_loss": 0.75708425, + "learning_rate": 2.395216690562469e-06, + "loss": 0.77596909, + "num_input_tokens_seen": 161858590, + "step": 7544, + "time_per_iteration": 2.706681966781616 + }, + { + "auxiliary_loss_clip": 0.01097765, + "auxiliary_loss_mlp": 0.01037632, + "balance_loss_clip": 1.04519629, + "balance_loss_mlp": 1.02378595, + "epoch": 0.45362994137982865, + "flos": 24864117713280.0, + "grad_norm": 1.7108154873098056, + "language_loss": 0.75483274, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.7761867, + "num_input_tokens_seen": 161878390, + "step": 7545, + "time_per_iteration": 2.741312026977539 + }, + { + "auxiliary_loss_clip": 0.01106771, + "auxiliary_loss_mlp": 0.0103517, + "balance_loss_clip": 1.04418731, + "balance_loss_mlp": 1.02098417, + "epoch": 0.4536900646324966, + "flos": 30806794408320.0, + "grad_norm": 2.2011621045210057, + "language_loss": 0.72520149, + "learning_rate": 2.394453096794423e-06, + "loss": 0.74662089, + "num_input_tokens_seen": 161898610, + "step": 7546, + "time_per_iteration": 2.7891902923583984 + }, + { + "auxiliary_loss_clip": 0.01108307, + "auxiliary_loss_mlp": 0.01035115, + "balance_loss_clip": 1.04388261, + "balance_loss_mlp": 1.02008224, + "epoch": 0.4537501878851646, + "flos": 23404313076480.0, + "grad_norm": 1.593135285125141, + "language_loss": 0.75609434, + "learning_rate": 2.394071277466609e-06, + "loss": 0.77752858, + "num_input_tokens_seen": 161918210, + "step": 7547, + "time_per_iteration": 2.7260210514068604 + }, + { + "auxiliary_loss_clip": 0.01120791, + "auxiliary_loss_mlp": 0.01033715, + "balance_loss_clip": 1.04588616, + "balance_loss_mlp": 1.01945722, + "epoch": 0.45381031113783254, + "flos": 18149289327360.0, + "grad_norm": 2.150959748604014, + "language_loss": 0.70081824, + "learning_rate": 2.393689443195573e-06, + "loss": 0.72236335, + "num_input_tokens_seen": 161936950, + "step": 7548, + "time_per_iteration": 2.652388095855713 + }, + { + "auxiliary_loss_clip": 0.01129285, + "auxiliary_loss_mlp": 0.01039378, + "balance_loss_clip": 1.04662538, + "balance_loss_mlp": 1.0256331, + "epoch": 0.4538704343905005, + "flos": 25336666062720.0, + "grad_norm": 2.8840782688813293, + "language_loss": 0.73135072, + "learning_rate": 2.393307593995794e-06, + "loss": 0.75303733, + "num_input_tokens_seen": 161955550, + "step": 7549, + "time_per_iteration": 2.8452274799346924 + }, + { + "auxiliary_loss_clip": 0.01091023, + "auxiliary_loss_mlp": 0.01028579, + "balance_loss_clip": 1.040573, + "balance_loss_mlp": 1.01576996, + "epoch": 0.4539305576431685, + "flos": 28731445378560.0, + "grad_norm": 1.9190169905093657, + "language_loss": 0.65320408, + "learning_rate": 2.392925729881751e-06, + "loss": 0.67440009, + "num_input_tokens_seen": 161976760, + "step": 7550, + "time_per_iteration": 2.783653497695923 + }, + { + "auxiliary_loss_clip": 0.01113741, + "auxiliary_loss_mlp": 0.01035092, + "balance_loss_clip": 1.05046797, + "balance_loss_mlp": 1.02172232, + "epoch": 0.45399068089583644, + "flos": 22492397566080.0, + "grad_norm": 1.6128261499338563, + "language_loss": 0.69028163, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.71176994, + "num_input_tokens_seen": 161996120, + "step": 7551, + "time_per_iteration": 2.6571664810180664 + }, + { + "auxiliary_loss_clip": 0.01115638, + "auxiliary_loss_mlp": 0.010339, + "balance_loss_clip": 1.04326105, + "balance_loss_mlp": 1.01979804, + "epoch": 0.45405080414850446, + "flos": 12893403651840.0, + "grad_norm": 1.789312830614556, + "language_loss": 0.79496789, + "learning_rate": 2.392161956968798e-06, + "loss": 0.81646329, + "num_input_tokens_seen": 162011125, + "step": 7552, + "time_per_iteration": 2.6482155323028564 + }, + { + "auxiliary_loss_clip": 0.01042694, + "auxiliary_loss_mlp": 0.0100358, + "balance_loss_clip": 1.02483499, + "balance_loss_mlp": 1.00200677, + "epoch": 0.4541109274011724, + "flos": 59766919724160.0, + "grad_norm": 0.8270469682211425, + "language_loss": 0.57826698, + "learning_rate": 2.39178004819885e-06, + "loss": 0.59872973, + "num_input_tokens_seen": 162068705, + "step": 7553, + "time_per_iteration": 3.1456856727600098 + }, + { + "auxiliary_loss_clip": 0.01064062, + "auxiliary_loss_mlp": 0.01034097, + "balance_loss_clip": 1.04350471, + "balance_loss_mlp": 1.02177691, + "epoch": 0.4541710506538404, + "flos": 28511743841280.0, + "grad_norm": 1.3658485385977341, + "language_loss": 0.76709622, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.78807783, + "num_input_tokens_seen": 162089655, + "step": 7554, + "time_per_iteration": 2.8080356121063232 + }, + { + "auxiliary_loss_clip": 0.01108851, + "auxiliary_loss_mlp": 0.01035523, + "balance_loss_clip": 1.0467329, + "balance_loss_mlp": 1.02056265, + "epoch": 0.45423117390650836, + "flos": 17675591742720.0, + "grad_norm": 3.0408177613289014, + "language_loss": 0.7764836, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.79792738, + "num_input_tokens_seen": 162108465, + "step": 7555, + "time_per_iteration": 2.6776504516601562 + }, + { + "auxiliary_loss_clip": 0.01059757, + "auxiliary_loss_mlp": 0.01032208, + "balance_loss_clip": 1.04157853, + "balance_loss_mlp": 1.01914918, + "epoch": 0.4542912971591763, + "flos": 28072556248320.0, + "grad_norm": 1.7035673731774164, + "language_loss": 0.72646725, + "learning_rate": 2.390634232808903e-06, + "loss": 0.74738687, + "num_input_tokens_seen": 162129910, + "step": 7556, + "time_per_iteration": 2.851022720336914 + }, + { + "auxiliary_loss_clip": 0.01133495, + "auxiliary_loss_mlp": 0.01038462, + "balance_loss_clip": 1.04808855, + "balance_loss_mlp": 1.02491426, + "epoch": 0.4543514204118443, + "flos": 22671771108480.0, + "grad_norm": 2.040538066845486, + "language_loss": 0.6298486, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.65156817, + "num_input_tokens_seen": 162148840, + "step": 7557, + "time_per_iteration": 2.7630646228790283 + }, + { + "auxiliary_loss_clip": 0.01029784, + "auxiliary_loss_mlp": 0.0100461, + "balance_loss_clip": 1.02091062, + "balance_loss_mlp": 1.00302434, + "epoch": 0.45441154366451225, + "flos": 58216549921920.0, + "grad_norm": 0.683633086089208, + "language_loss": 0.57569897, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.59604287, + "num_input_tokens_seen": 162208500, + "step": 7558, + "time_per_iteration": 3.1137866973876953 + }, + { + "auxiliary_loss_clip": 0.01120146, + "auxiliary_loss_mlp": 0.0104176, + "balance_loss_clip": 1.04774594, + "balance_loss_mlp": 1.02645946, + "epoch": 0.4544716669171802, + "flos": 16764286763520.0, + "grad_norm": 4.36821938683546, + "language_loss": 0.56214309, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.58376217, + "num_input_tokens_seen": 162224650, + "step": 7559, + "time_per_iteration": 2.6453661918640137 + }, + { + "auxiliary_loss_clip": 0.01114034, + "auxiliary_loss_mlp": 0.00771404, + "balance_loss_clip": 1.04701853, + "balance_loss_mlp": 1.00053644, + "epoch": 0.4545317901698482, + "flos": 15925233991680.0, + "grad_norm": 3.62707185125481, + "language_loss": 0.72154331, + "learning_rate": 2.389106271642792e-06, + "loss": 0.74039769, + "num_input_tokens_seen": 162242930, + "step": 7560, + "time_per_iteration": 2.734957456588745 + }, + { + "auxiliary_loss_clip": 0.01047807, + "auxiliary_loss_mlp": 0.01042508, + "balance_loss_clip": 1.03757131, + "balance_loss_mlp": 1.02745199, + "epoch": 0.45459191342251615, + "flos": 17639752947840.0, + "grad_norm": 2.1379103724447517, + "language_loss": 0.69509232, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.71599543, + "num_input_tokens_seen": 162261455, + "step": 7561, + "time_per_iteration": 2.8633503913879395 + }, + { + "auxiliary_loss_clip": 0.01103836, + "auxiliary_loss_mlp": 0.01038069, + "balance_loss_clip": 1.04502749, + "balance_loss_mlp": 1.02508128, + "epoch": 0.4546520366751841, + "flos": 16176608346240.0, + "grad_norm": 1.7850356135584633, + "language_loss": 0.85308814, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.87450719, + "num_input_tokens_seen": 162279725, + "step": 7562, + "time_per_iteration": 2.6936264038085938 + }, + { + "auxiliary_loss_clip": 0.01113259, + "auxiliary_loss_mlp": 0.01038297, + "balance_loss_clip": 1.04309893, + "balance_loss_mlp": 1.0244745, + "epoch": 0.4547121599278521, + "flos": 19751443562880.0, + "grad_norm": 1.7930294917475702, + "language_loss": 0.89894032, + "learning_rate": 2.38796014579055e-06, + "loss": 0.92045587, + "num_input_tokens_seen": 162297865, + "step": 7563, + "time_per_iteration": 2.6632707118988037 + }, + { + "auxiliary_loss_clip": 0.01128772, + "auxiliary_loss_mlp": 0.00772113, + "balance_loss_clip": 1.04633093, + "balance_loss_mlp": 1.00060475, + "epoch": 0.45477228318052004, + "flos": 19937461121280.0, + "grad_norm": 1.7120070486519374, + "language_loss": 0.71349525, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.73250407, + "num_input_tokens_seen": 162316010, + "step": 7564, + "time_per_iteration": 2.6610071659088135 + }, + { + "auxiliary_loss_clip": 0.01118776, + "auxiliary_loss_mlp": 0.01037586, + "balance_loss_clip": 1.04351079, + "balance_loss_mlp": 1.02376413, + "epoch": 0.454832406433188, + "flos": 21288312829440.0, + "grad_norm": 2.3273072225052998, + "language_loss": 0.67977536, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.70133895, + "num_input_tokens_seen": 162336115, + "step": 7565, + "time_per_iteration": 2.645447015762329 + }, + { + "auxiliary_loss_clip": 0.01084701, + "auxiliary_loss_mlp": 0.01033288, + "balance_loss_clip": 1.04171932, + "balance_loss_mlp": 1.02002633, + "epoch": 0.45489252968585603, + "flos": 24498726612480.0, + "grad_norm": 1.877770036567151, + "language_loss": 0.80176723, + "learning_rate": 2.386813887534922e-06, + "loss": 0.82294714, + "num_input_tokens_seen": 162355705, + "step": 7566, + "time_per_iteration": 2.7949163913726807 + }, + { + "auxiliary_loss_clip": 0.01090452, + "auxiliary_loss_mlp": 0.01035417, + "balance_loss_clip": 1.04210711, + "balance_loss_mlp": 1.01981235, + "epoch": 0.454952652938524, + "flos": 17092474352640.0, + "grad_norm": 1.6100724605132029, + "language_loss": 0.73702621, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.75828493, + "num_input_tokens_seen": 162374055, + "step": 7567, + "time_per_iteration": 2.8082687854766846 + }, + { + "auxiliary_loss_clip": 0.01093893, + "auxiliary_loss_mlp": 0.01039284, + "balance_loss_clip": 1.04401243, + "balance_loss_mlp": 1.02519345, + "epoch": 0.45501277619119196, + "flos": 27630387826560.0, + "grad_norm": 1.3909583171669249, + "language_loss": 0.81125635, + "learning_rate": 2.386049642000249e-06, + "loss": 0.83258814, + "num_input_tokens_seen": 162393560, + "step": 7568, + "time_per_iteration": 2.7837767601013184 + }, + { + "auxiliary_loss_clip": 0.01126615, + "auxiliary_loss_mlp": 0.01047153, + "balance_loss_clip": 1.04950857, + "balance_loss_mlp": 1.03145313, + "epoch": 0.4550728994438599, + "flos": 19974664632960.0, + "grad_norm": 2.2201304610210175, + "language_loss": 0.79881442, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.82055211, + "num_input_tokens_seen": 162413170, + "step": 7569, + "time_per_iteration": 2.6318490505218506 + }, + { + "auxiliary_loss_clip": 0.01121847, + "auxiliary_loss_mlp": 0.01038069, + "balance_loss_clip": 1.04655576, + "balance_loss_mlp": 1.02286983, + "epoch": 0.4551330226965279, + "flos": 26066873646720.0, + "grad_norm": 1.3612588382742794, + "language_loss": 0.75316679, + "learning_rate": 2.385285337909412e-06, + "loss": 0.77476597, + "num_input_tokens_seen": 162434080, + "step": 7570, + "time_per_iteration": 2.6693389415740967 + }, + { + "auxiliary_loss_clip": 0.0110874, + "auxiliary_loss_mlp": 0.01042662, + "balance_loss_clip": 1.0496285, + "balance_loss_mlp": 1.02787971, + "epoch": 0.45519314594919585, + "flos": 32781091501440.0, + "grad_norm": 1.7331933441120846, + "language_loss": 0.74851429, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.77002835, + "num_input_tokens_seen": 162455445, + "step": 7571, + "time_per_iteration": 2.8367550373077393 + }, + { + "auxiliary_loss_clip": 0.01118243, + "auxiliary_loss_mlp": 0.01037051, + "balance_loss_clip": 1.04903221, + "balance_loss_mlp": 1.02305007, + "epoch": 0.4552532692018638, + "flos": 19172671718400.0, + "grad_norm": 1.8103885190184377, + "language_loss": 0.81033444, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.83188736, + "num_input_tokens_seen": 162474940, + "step": 7572, + "time_per_iteration": 2.654205322265625 + }, + { + "auxiliary_loss_clip": 0.01114723, + "auxiliary_loss_mlp": 0.01041135, + "balance_loss_clip": 1.04709005, + "balance_loss_mlp": 1.02511287, + "epoch": 0.4553133924545318, + "flos": 26027156183040.0, + "grad_norm": 1.7361541689984175, + "language_loss": 0.7262516, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.74781018, + "num_input_tokens_seen": 162493340, + "step": 7573, + "time_per_iteration": 2.7468600273132324 + }, + { + "auxiliary_loss_clip": 0.01124507, + "auxiliary_loss_mlp": 0.01039816, + "balance_loss_clip": 1.04916418, + "balance_loss_mlp": 1.02327013, + "epoch": 0.45537351570719975, + "flos": 30661535808000.0, + "grad_norm": 1.869301925708578, + "language_loss": 0.74335551, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.76499879, + "num_input_tokens_seen": 162514360, + "step": 7574, + "time_per_iteration": 2.7575597763061523 + }, + { + "auxiliary_loss_clip": 0.01121884, + "auxiliary_loss_mlp": 0.01036714, + "balance_loss_clip": 1.04758859, + "balance_loss_mlp": 1.02184868, + "epoch": 0.4554336389598677, + "flos": 24353396184960.0, + "grad_norm": 1.5603127476263212, + "language_loss": 0.7161333, + "learning_rate": 2.383374322259915e-06, + "loss": 0.7377193, + "num_input_tokens_seen": 162535240, + "step": 7575, + "time_per_iteration": 2.6638269424438477 + }, + { + "auxiliary_loss_clip": 0.01106959, + "auxiliary_loss_mlp": 0.01035456, + "balance_loss_clip": 1.04536855, + "balance_loss_mlp": 1.02120471, + "epoch": 0.4554937622125357, + "flos": 20557925677440.0, + "grad_norm": 1.872589408642276, + "language_loss": 0.73370463, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.7551288, + "num_input_tokens_seen": 162553880, + "step": 7576, + "time_per_iteration": 2.686311721801758 + }, + { + "auxiliary_loss_clip": 0.01129005, + "auxiliary_loss_mlp": 0.01036522, + "balance_loss_clip": 1.04784572, + "balance_loss_mlp": 1.02179956, + "epoch": 0.45555388546520365, + "flos": 22820764723200.0, + "grad_norm": 1.7873556557153987, + "language_loss": 0.66664052, + "learning_rate": 2.382609814135511e-06, + "loss": 0.68829584, + "num_input_tokens_seen": 162574485, + "step": 7577, + "time_per_iteration": 2.6766581535339355 + }, + { + "auxiliary_loss_clip": 0.01103092, + "auxiliary_loss_mlp": 0.01046596, + "balance_loss_clip": 1.04435253, + "balance_loss_mlp": 1.0300076, + "epoch": 0.4556140087178716, + "flos": 21725992051200.0, + "grad_norm": 1.9298557564452474, + "language_loss": 0.74309111, + "learning_rate": 2.382227538303157e-06, + "loss": 0.76458794, + "num_input_tokens_seen": 162595130, + "step": 7578, + "time_per_iteration": 4.310480356216431 + }, + { + "auxiliary_loss_clip": 0.01079377, + "auxiliary_loss_mlp": 0.00774819, + "balance_loss_clip": 1.04437256, + "balance_loss_mlp": 1.00061071, + "epoch": 0.45567413197053963, + "flos": 25994513698560.0, + "grad_norm": 1.7583976894464832, + "language_loss": 0.69843179, + "learning_rate": 2.381845247976697e-06, + "loss": 0.71697378, + "num_input_tokens_seen": 162615720, + "step": 7579, + "time_per_iteration": 4.325899362564087 + }, + { + "auxiliary_loss_clip": 0.01116252, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.0446142, + "balance_loss_mlp": 1.0195992, + "epoch": 0.4557342552232076, + "flos": 21537604195200.0, + "grad_norm": 1.7639178263730233, + "language_loss": 0.78628397, + "learning_rate": 2.381462943170627e-06, + "loss": 0.80777884, + "num_input_tokens_seen": 162635825, + "step": 7580, + "time_per_iteration": 2.6391446590423584 + }, + { + "auxiliary_loss_clip": 0.0113405, + "auxiliary_loss_mlp": 0.01031474, + "balance_loss_clip": 1.05214024, + "balance_loss_mlp": 1.01697779, + "epoch": 0.45579437847587556, + "flos": 40001972647680.0, + "grad_norm": 1.99718885063772, + "language_loss": 0.68943548, + "learning_rate": 2.381080623899444e-06, + "loss": 0.71109068, + "num_input_tokens_seen": 162659130, + "step": 7581, + "time_per_iteration": 4.234206914901733 + }, + { + "auxiliary_loss_clip": 0.01111938, + "auxiliary_loss_mlp": 0.01032821, + "balance_loss_clip": 1.04282808, + "balance_loss_mlp": 1.01836669, + "epoch": 0.4558545017285435, + "flos": 31138501530240.0, + "grad_norm": 1.6647606381596314, + "language_loss": 0.73356318, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.75501084, + "num_input_tokens_seen": 162681665, + "step": 7582, + "time_per_iteration": 4.333024978637695 + }, + { + "auxiliary_loss_clip": 0.0113626, + "auxiliary_loss_mlp": 0.01043946, + "balance_loss_clip": 1.05043411, + "balance_loss_mlp": 1.02829337, + "epoch": 0.4559146249812115, + "flos": 21725776569600.0, + "grad_norm": 1.9011112097623832, + "language_loss": 0.72327513, + "learning_rate": 2.380315942019729e-06, + "loss": 0.74507719, + "num_input_tokens_seen": 162702040, + "step": 7583, + "time_per_iteration": 2.633423089981079 + }, + { + "auxiliary_loss_clip": 0.01122524, + "auxiliary_loss_mlp": 0.01037395, + "balance_loss_clip": 1.05119634, + "balance_loss_mlp": 1.02291131, + "epoch": 0.45597474823387946, + "flos": 23805973935360.0, + "grad_norm": 1.6028864846132196, + "language_loss": 0.72692537, + "learning_rate": 2.379933579440195e-06, + "loss": 0.74852461, + "num_input_tokens_seen": 162722375, + "step": 7584, + "time_per_iteration": 2.6895499229431152 + }, + { + "auxiliary_loss_clip": 0.01089384, + "auxiliary_loss_mlp": 0.01040718, + "balance_loss_clip": 1.04311633, + "balance_loss_mlp": 1.02606773, + "epoch": 0.4560348714865474, + "flos": 31905661230720.0, + "grad_norm": 1.833639423310481, + "language_loss": 0.68204761, + "learning_rate": 2.379551202453541e-06, + "loss": 0.70334864, + "num_input_tokens_seen": 162746095, + "step": 7585, + "time_per_iteration": 2.7882261276245117 + }, + { + "auxiliary_loss_clip": 0.01132515, + "auxiliary_loss_mlp": 0.01030518, + "balance_loss_clip": 1.05002046, + "balance_loss_mlp": 1.01725006, + "epoch": 0.4560949947392154, + "flos": 22048828513920.0, + "grad_norm": 1.65915998971852, + "language_loss": 0.7634117, + "learning_rate": 2.379168811074267e-06, + "loss": 0.78504205, + "num_input_tokens_seen": 162766330, + "step": 7586, + "time_per_iteration": 2.636626720428467 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.04642403, + "balance_loss_mlp": 1.01651323, + "epoch": 0.45615511799188335, + "flos": 24571804832640.0, + "grad_norm": 44.63874812648689, + "language_loss": 0.78151405, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.80290735, + "num_input_tokens_seen": 162784755, + "step": 7587, + "time_per_iteration": 2.7801096439361572 + }, + { + "auxiliary_loss_clip": 0.01105539, + "auxiliary_loss_mlp": 0.01044536, + "balance_loss_clip": 1.04288149, + "balance_loss_mlp": 1.02933669, + "epoch": 0.4562152412445513, + "flos": 18330709944960.0, + "grad_norm": 2.252015566278715, + "language_loss": 0.6950196, + "learning_rate": 2.378403985195863e-06, + "loss": 0.71652043, + "num_input_tokens_seen": 162803850, + "step": 7588, + "time_per_iteration": 2.7108840942382812 + }, + { + "auxiliary_loss_clip": 0.01118383, + "auxiliary_loss_mlp": 0.01036327, + "balance_loss_clip": 1.05038464, + "balance_loss_mlp": 1.02234375, + "epoch": 0.4562753644972193, + "flos": 13516525814400.0, + "grad_norm": 1.6983482750091652, + "language_loss": 0.79372728, + "learning_rate": 2.378021550725735e-06, + "loss": 0.81527448, + "num_input_tokens_seen": 162820775, + "step": 7589, + "time_per_iteration": 2.6967854499816895 + }, + { + "auxiliary_loss_clip": 0.01121003, + "auxiliary_loss_mlp": 0.01035976, + "balance_loss_clip": 1.04755974, + "balance_loss_mlp": 1.02120006, + "epoch": 0.45633548774988725, + "flos": 29639697701760.0, + "grad_norm": 2.457585749278853, + "language_loss": 0.62875861, + "learning_rate": 2.377639101920992e-06, + "loss": 0.6503284, + "num_input_tokens_seen": 162839695, + "step": 7590, + "time_per_iteration": 2.6659393310546875 + }, + { + "auxiliary_loss_clip": 0.01101858, + "auxiliary_loss_mlp": 0.01045493, + "balance_loss_clip": 1.04248881, + "balance_loss_mlp": 1.03150392, + "epoch": 0.4563956110025552, + "flos": 22233409528320.0, + "grad_norm": 1.8064400322650376, + "language_loss": 0.73125023, + "learning_rate": 2.377256638796135e-06, + "loss": 0.75272369, + "num_input_tokens_seen": 162856095, + "step": 7591, + "time_per_iteration": 2.7296926975250244 + }, + { + "auxiliary_loss_clip": 0.01113505, + "auxiliary_loss_mlp": 0.01043243, + "balance_loss_clip": 1.04979515, + "balance_loss_mlp": 1.02757883, + "epoch": 0.45645573425522323, + "flos": 17092043389440.0, + "grad_norm": 2.6622201495184923, + "language_loss": 0.76661623, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.78818369, + "num_input_tokens_seen": 162874070, + "step": 7592, + "time_per_iteration": 2.855787992477417 + }, + { + "auxiliary_loss_clip": 0.01104851, + "auxiliary_loss_mlp": 0.01042123, + "balance_loss_clip": 1.04489005, + "balance_loss_mlp": 1.026191, + "epoch": 0.4565158575078912, + "flos": 20332334309760.0, + "grad_norm": 2.112667667080726, + "language_loss": 0.6938538, + "learning_rate": 2.376491669644098e-06, + "loss": 0.71532357, + "num_input_tokens_seen": 162891000, + "step": 7593, + "time_per_iteration": 2.7688679695129395 + }, + { + "auxiliary_loss_clip": 0.01110049, + "auxiliary_loss_mlp": 0.01034633, + "balance_loss_clip": 1.04238796, + "balance_loss_mlp": 1.02174079, + "epoch": 0.45657598076055916, + "flos": 23983013093760.0, + "grad_norm": 2.174557271524546, + "language_loss": 0.83913857, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.86058539, + "num_input_tokens_seen": 162910120, + "step": 7594, + "time_per_iteration": 2.807098865509033 + }, + { + "auxiliary_loss_clip": 0.01036589, + "auxiliary_loss_mlp": 0.00753626, + "balance_loss_clip": 1.01769352, + "balance_loss_mlp": 1.00077426, + "epoch": 0.45663610401322713, + "flos": 69364297526400.0, + "grad_norm": 0.7884707903863047, + "language_loss": 0.52737939, + "learning_rate": 2.375726643385654e-06, + "loss": 0.54528153, + "num_input_tokens_seen": 162963720, + "step": 7595, + "time_per_iteration": 3.2812860012054443 + }, + { + "auxiliary_loss_clip": 0.01096992, + "auxiliary_loss_mlp": 0.01034204, + "balance_loss_clip": 1.04297972, + "balance_loss_mlp": 1.01864684, + "epoch": 0.4566962272658951, + "flos": 15149095891200.0, + "grad_norm": 2.562717754165903, + "language_loss": 0.87188721, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.89319921, + "num_input_tokens_seen": 162975760, + "step": 7596, + "time_per_iteration": 2.683833122253418 + }, + { + "auxiliary_loss_clip": 0.01126007, + "auxiliary_loss_mlp": 0.01046188, + "balance_loss_clip": 1.05094647, + "balance_loss_mlp": 1.03226399, + "epoch": 0.45675635051856306, + "flos": 18697465762560.0, + "grad_norm": 8.947162495751469, + "language_loss": 0.77418292, + "learning_rate": 2.374961560136843e-06, + "loss": 0.79590482, + "num_input_tokens_seen": 162994865, + "step": 7597, + "time_per_iteration": 2.686328887939453 + }, + { + "auxiliary_loss_clip": 0.01117589, + "auxiliary_loss_mlp": 0.01038291, + "balance_loss_clip": 1.04493558, + "balance_loss_mlp": 1.02389073, + "epoch": 0.456816473771231, + "flos": 19098300608640.0, + "grad_norm": 1.6036220935275767, + "language_loss": 0.78581583, + "learning_rate": 2.374578997177314e-06, + "loss": 0.80737466, + "num_input_tokens_seen": 163014730, + "step": 7598, + "time_per_iteration": 2.6856606006622314 + }, + { + "auxiliary_loss_clip": 0.01128723, + "auxiliary_loss_mlp": 0.01034286, + "balance_loss_clip": 1.04699326, + "balance_loss_mlp": 1.02080941, + "epoch": 0.456876597023899, + "flos": 28950069507840.0, + "grad_norm": 3.021485745265107, + "language_loss": 0.71589166, + "learning_rate": 2.374196420013712e-06, + "loss": 0.73752177, + "num_input_tokens_seen": 163033405, + "step": 7599, + "time_per_iteration": 2.672055244445801 + }, + { + "auxiliary_loss_clip": 0.0109465, + "auxiliary_loss_mlp": 0.01038748, + "balance_loss_clip": 1.04185176, + "balance_loss_mlp": 1.02445507, + "epoch": 0.45693672027656695, + "flos": 23289470317440.0, + "grad_norm": 2.0431074720876046, + "language_loss": 0.70262265, + "learning_rate": 2.373813828660544e-06, + "loss": 0.72395658, + "num_input_tokens_seen": 163051400, + "step": 7600, + "time_per_iteration": 2.8163371086120605 + }, + { + "auxiliary_loss_clip": 0.01066248, + "auxiliary_loss_mlp": 0.01041467, + "balance_loss_clip": 1.04143667, + "balance_loss_mlp": 1.02802658, + "epoch": 0.4569968435292349, + "flos": 20558212986240.0, + "grad_norm": 6.700465706217943, + "language_loss": 0.79066253, + "learning_rate": 2.373431223132319e-06, + "loss": 0.81173962, + "num_input_tokens_seen": 163069250, + "step": 7601, + "time_per_iteration": 2.8098480701446533 + }, + { + "auxiliary_loss_clip": 0.01100447, + "auxiliary_loss_mlp": 0.01041284, + "balance_loss_clip": 1.04293573, + "balance_loss_mlp": 1.02730095, + "epoch": 0.4570569667819029, + "flos": 41282619223680.0, + "grad_norm": 6.824528646616988, + "language_loss": 0.71565419, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.73707151, + "num_input_tokens_seen": 163091755, + "step": 7602, + "time_per_iteration": 2.8971548080444336 + }, + { + "auxiliary_loss_clip": 0.01115269, + "auxiliary_loss_mlp": 0.01034582, + "balance_loss_clip": 1.04276979, + "balance_loss_mlp": 1.01859641, + "epoch": 0.45711709003457085, + "flos": 26031573555840.0, + "grad_norm": 1.8661067599139867, + "language_loss": 0.73023772, + "learning_rate": 2.372665969608729e-06, + "loss": 0.75173628, + "num_input_tokens_seen": 163111600, + "step": 7603, + "time_per_iteration": 2.709261417388916 + }, + { + "auxiliary_loss_clip": 0.01120961, + "auxiliary_loss_mlp": 0.01043179, + "balance_loss_clip": 1.04799032, + "balance_loss_mlp": 1.02714539, + "epoch": 0.4571772132872388, + "flos": 22158068751360.0, + "grad_norm": 1.901129043888336, + "language_loss": 0.83068597, + "learning_rate": 2.372283321642383e-06, + "loss": 0.85232735, + "num_input_tokens_seen": 163127350, + "step": 7604, + "time_per_iteration": 2.713744640350342 + }, + { + "auxiliary_loss_clip": 0.01113838, + "auxiliary_loss_mlp": 0.01045941, + "balance_loss_clip": 1.05216503, + "balance_loss_mlp": 1.02981162, + "epoch": 0.45723733653990684, + "flos": 23878872587520.0, + "grad_norm": 2.0592585158299133, + "language_loss": 0.85998154, + "learning_rate": 2.371900659559016e-06, + "loss": 0.88157928, + "num_input_tokens_seen": 163145855, + "step": 7605, + "time_per_iteration": 2.6666319370269775 + }, + { + "auxiliary_loss_clip": 0.010831, + "auxiliary_loss_mlp": 0.01041844, + "balance_loss_clip": 1.04206753, + "balance_loss_mlp": 1.02670407, + "epoch": 0.4572974597925748, + "flos": 16871803148160.0, + "grad_norm": 1.8551011968860212, + "language_loss": 0.73551464, + "learning_rate": 2.371517983373138e-06, + "loss": 0.75676405, + "num_input_tokens_seen": 163163830, + "step": 7606, + "time_per_iteration": 2.8618602752685547 + }, + { + "auxiliary_loss_clip": 0.01100268, + "auxiliary_loss_mlp": 0.01043762, + "balance_loss_clip": 1.0450927, + "balance_loss_mlp": 1.02790761, + "epoch": 0.45735758304524277, + "flos": 13771491528960.0, + "grad_norm": 1.9296458941386103, + "language_loss": 0.80260599, + "learning_rate": 2.371135293099262e-06, + "loss": 0.82404631, + "num_input_tokens_seen": 163180700, + "step": 7607, + "time_per_iteration": 2.717987537384033 + }, + { + "auxiliary_loss_clip": 0.01097097, + "auxiliary_loss_mlp": 0.01046228, + "balance_loss_clip": 1.05015063, + "balance_loss_mlp": 1.03169668, + "epoch": 0.45741770629791073, + "flos": 21100750986240.0, + "grad_norm": 1.7686881404445909, + "language_loss": 0.81263912, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.83407241, + "num_input_tokens_seen": 163199450, + "step": 7608, + "time_per_iteration": 2.7047500610351562 + }, + { + "auxiliary_loss_clip": 0.01110681, + "auxiliary_loss_mlp": 0.01043615, + "balance_loss_clip": 1.04563498, + "balance_loss_mlp": 1.02828515, + "epoch": 0.4574778295505787, + "flos": 23112898035840.0, + "grad_norm": 3.284613619336592, + "language_loss": 0.68429869, + "learning_rate": 2.370369870345559e-06, + "loss": 0.70584166, + "num_input_tokens_seen": 163217875, + "step": 7609, + "time_per_iteration": 2.7123308181762695 + }, + { + "auxiliary_loss_clip": 0.01105383, + "auxiliary_loss_mlp": 0.01045291, + "balance_loss_clip": 1.04979467, + "balance_loss_mlp": 1.03011012, + "epoch": 0.45753795280324666, + "flos": 24352929308160.0, + "grad_norm": 1.7858891409698046, + "language_loss": 0.80873275, + "learning_rate": 2.369987137894757e-06, + "loss": 0.83023953, + "num_input_tokens_seen": 163237430, + "step": 7610, + "time_per_iteration": 2.707108497619629 + }, + { + "auxiliary_loss_clip": 0.01122367, + "auxiliary_loss_mlp": 0.01042029, + "balance_loss_clip": 1.04675138, + "balance_loss_mlp": 1.02698421, + "epoch": 0.4575980760559146, + "flos": 16653789550080.0, + "grad_norm": 2.2133206913732746, + "language_loss": 0.82100248, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.84264642, + "num_input_tokens_seen": 163253905, + "step": 7611, + "time_per_iteration": 2.6911368370056152 + }, + { + "auxiliary_loss_clip": 0.01127544, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.05061793, + "balance_loss_mlp": 1.01889205, + "epoch": 0.4576581993085826, + "flos": 35911423912320.0, + "grad_norm": 2.6253593942917677, + "language_loss": 0.73971558, + "learning_rate": 2.369221630917819e-06, + "loss": 0.76133871, + "num_input_tokens_seen": 163274285, + "step": 7612, + "time_per_iteration": 2.8162691593170166 + }, + { + "auxiliary_loss_clip": 0.01103651, + "auxiliary_loss_mlp": 0.01042157, + "balance_loss_clip": 1.04241323, + "balance_loss_mlp": 1.02680302, + "epoch": 0.45771832256125056, + "flos": 20080421251200.0, + "grad_norm": 1.6042487302929564, + "language_loss": 0.84652913, + "learning_rate": 2.368838856420711e-06, + "loss": 0.86798728, + "num_input_tokens_seen": 163293150, + "step": 7613, + "time_per_iteration": 2.66471266746521 + }, + { + "auxiliary_loss_clip": 0.01096161, + "auxiliary_loss_mlp": 0.01038746, + "balance_loss_clip": 1.04437852, + "balance_loss_mlp": 1.02373135, + "epoch": 0.4577784458139185, + "flos": 10744329957120.0, + "grad_norm": 2.314421678604919, + "language_loss": 0.75271547, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.77406454, + "num_input_tokens_seen": 163310065, + "step": 7614, + "time_per_iteration": 2.740011215209961 + }, + { + "auxiliary_loss_clip": 0.01132592, + "auxiliary_loss_mlp": 0.01037968, + "balance_loss_clip": 1.05067575, + "balance_loss_mlp": 1.02378809, + "epoch": 0.4578385690665865, + "flos": 21907269014400.0, + "grad_norm": 1.5980870069512307, + "language_loss": 0.75026065, + "learning_rate": 2.368073265481791e-06, + "loss": 0.77196622, + "num_input_tokens_seen": 163329415, + "step": 7615, + "time_per_iteration": 2.694354772567749 + }, + { + "auxiliary_loss_clip": 0.01037366, + "auxiliary_loss_mlp": 0.01005104, + "balance_loss_clip": 1.02879357, + "balance_loss_mlp": 1.00286281, + "epoch": 0.45789869231925445, + "flos": 64758286667520.0, + "grad_norm": 0.785268606967784, + "language_loss": 0.57671446, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.59713912, + "num_input_tokens_seen": 163385875, + "step": 7616, + "time_per_iteration": 3.2036197185516357 + }, + { + "auxiliary_loss_clip": 0.010986, + "auxiliary_loss_mlp": 0.00772301, + "balance_loss_clip": 1.04307699, + "balance_loss_mlp": 1.00081253, + "epoch": 0.4579588155719224, + "flos": 16144001775360.0, + "grad_norm": 1.6020549029918738, + "language_loss": 0.70836008, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.72706908, + "num_input_tokens_seen": 163405170, + "step": 7617, + "time_per_iteration": 2.7075886726379395 + }, + { + "auxiliary_loss_clip": 0.01137127, + "auxiliary_loss_mlp": 0.01037359, + "balance_loss_clip": 1.05343175, + "balance_loss_mlp": 1.02264261, + "epoch": 0.45801893882459044, + "flos": 21395541905280.0, + "grad_norm": 1.8894449061399028, + "language_loss": 0.76292491, + "learning_rate": 2.36692477442939e-06, + "loss": 0.78466976, + "num_input_tokens_seen": 163423155, + "step": 7618, + "time_per_iteration": 5.8249146938323975 + }, + { + "auxiliary_loss_clip": 0.01101544, + "auxiliary_loss_mlp": 0.01045871, + "balance_loss_clip": 1.05301738, + "balance_loss_mlp": 1.03189957, + "epoch": 0.4580790620772584, + "flos": 19536554448000.0, + "grad_norm": 1.7481433677396025, + "language_loss": 0.77097881, + "learning_rate": 2.366541916231585e-06, + "loss": 0.79245299, + "num_input_tokens_seen": 163442450, + "step": 7619, + "time_per_iteration": 2.766615629196167 + }, + { + "auxiliary_loss_clip": 0.01134342, + "auxiliary_loss_mlp": 0.01040375, + "balance_loss_clip": 1.05348432, + "balance_loss_mlp": 1.02757239, + "epoch": 0.45813918532992637, + "flos": 16581070465920.0, + "grad_norm": 1.8920903156272437, + "language_loss": 0.72002041, + "learning_rate": 2.366159044134473e-06, + "loss": 0.74176759, + "num_input_tokens_seen": 163459810, + "step": 7620, + "time_per_iteration": 4.087975025177002 + }, + { + "auxiliary_loss_clip": 0.01109227, + "auxiliary_loss_mlp": 0.01032686, + "balance_loss_clip": 1.04942107, + "balance_loss_mlp": 1.01892948, + "epoch": 0.45819930858259433, + "flos": 42230301701760.0, + "grad_norm": 1.5465249381842834, + "language_loss": 0.77770388, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.79912305, + "num_input_tokens_seen": 163482970, + "step": 7621, + "time_per_iteration": 2.9124109745025635 + }, + { + "auxiliary_loss_clip": 0.01044673, + "auxiliary_loss_mlp": 0.01001257, + "balance_loss_clip": 1.02584982, + "balance_loss_mlp": 0.99903959, + "epoch": 0.4582594318352623, + "flos": 63714795638400.0, + "grad_norm": 0.7823065471017115, + "language_loss": 0.64958, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.6700393, + "num_input_tokens_seen": 163545330, + "step": 7622, + "time_per_iteration": 4.778898477554321 + }, + { + "auxiliary_loss_clip": 0.01120212, + "auxiliary_loss_mlp": 0.01034924, + "balance_loss_clip": 1.05105555, + "balance_loss_mlp": 1.02016604, + "epoch": 0.45831955508793026, + "flos": 26869979882880.0, + "grad_norm": 3.654827974152138, + "language_loss": 0.79468191, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.81623328, + "num_input_tokens_seen": 163564620, + "step": 7623, + "time_per_iteration": 2.7033259868621826 + }, + { + "auxiliary_loss_clip": 0.01078844, + "auxiliary_loss_mlp": 0.01041828, + "balance_loss_clip": 1.04181957, + "balance_loss_mlp": 1.02728403, + "epoch": 0.45837967834059823, + "flos": 18733951002240.0, + "grad_norm": 1.8933831090876323, + "language_loss": 0.70283759, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.72404432, + "num_input_tokens_seen": 163581010, + "step": 7624, + "time_per_iteration": 2.8526861667633057 + }, + { + "auxiliary_loss_clip": 0.01100025, + "auxiliary_loss_mlp": 0.01040188, + "balance_loss_clip": 1.04250479, + "balance_loss_mlp": 1.02558446, + "epoch": 0.4584398015932662, + "flos": 21178102924800.0, + "grad_norm": 2.2295023596293273, + "language_loss": 0.73171687, + "learning_rate": 2.364244475667491e-06, + "loss": 0.75311905, + "num_input_tokens_seen": 163599955, + "step": 7625, + "time_per_iteration": 2.77284574508667 + }, + { + "auxiliary_loss_clip": 0.01120178, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.05209434, + "balance_loss_mlp": 1.02369022, + "epoch": 0.45849992484593416, + "flos": 19790047704960.0, + "grad_norm": 2.499945379712242, + "language_loss": 0.77924562, + "learning_rate": 2.363861520479451e-06, + "loss": 0.80081707, + "num_input_tokens_seen": 163618545, + "step": 7626, + "time_per_iteration": 2.813945770263672 + }, + { + "auxiliary_loss_clip": 0.01137615, + "auxiliary_loss_mlp": 0.01040207, + "balance_loss_clip": 1.05263078, + "balance_loss_mlp": 1.02645612, + "epoch": 0.4585600480986021, + "flos": 18223265387520.0, + "grad_norm": 1.5689934094814115, + "language_loss": 0.84652817, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.8683064, + "num_input_tokens_seen": 163636055, + "step": 7627, + "time_per_iteration": 2.659053087234497 + }, + { + "auxiliary_loss_clip": 0.01138145, + "auxiliary_loss_mlp": 0.01040233, + "balance_loss_clip": 1.05155802, + "balance_loss_mlp": 1.02531946, + "epoch": 0.4586201713512701, + "flos": 29022213974400.0, + "grad_norm": 1.5125222475387885, + "language_loss": 0.6911087, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.71289253, + "num_input_tokens_seen": 163657485, + "step": 7628, + "time_per_iteration": 2.693678617477417 + }, + { + "auxiliary_loss_clip": 0.01118783, + "auxiliary_loss_mlp": 0.01034859, + "balance_loss_clip": 1.04731619, + "balance_loss_mlp": 1.02110827, + "epoch": 0.45868029460393805, + "flos": 23404600385280.0, + "grad_norm": 1.4972122231294245, + "language_loss": 0.78672099, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.80825746, + "num_input_tokens_seen": 163676030, + "step": 7629, + "time_per_iteration": 2.6437535285949707 + }, + { + "auxiliary_loss_clip": 0.01113389, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_clip": 1.04590559, + "balance_loss_mlp": 1.03034973, + "epoch": 0.458740417856606, + "flos": 18221972497920.0, + "grad_norm": 2.2059444062956985, + "language_loss": 0.79377991, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.81536937, + "num_input_tokens_seen": 163694490, + "step": 7630, + "time_per_iteration": 2.7565791606903076 + }, + { + "auxiliary_loss_clip": 0.01111942, + "auxiliary_loss_mlp": 0.01039415, + "balance_loss_clip": 1.04838312, + "balance_loss_mlp": 1.02481222, + "epoch": 0.458800541109274, + "flos": 34568760504960.0, + "grad_norm": 2.1212994157581293, + "language_loss": 0.72087741, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.74239099, + "num_input_tokens_seen": 163717035, + "step": 7631, + "time_per_iteration": 2.7880306243896484 + }, + { + "auxiliary_loss_clip": 0.01094955, + "auxiliary_loss_mlp": 0.01048432, + "balance_loss_clip": 1.04605651, + "balance_loss_mlp": 1.03280342, + "epoch": 0.458860664361942, + "flos": 17712112896000.0, + "grad_norm": 2.4606182879569145, + "language_loss": 0.71433818, + "learning_rate": 2.361563500108531e-06, + "loss": 0.73577201, + "num_input_tokens_seen": 163734525, + "step": 7632, + "time_per_iteration": 2.7352800369262695 + }, + { + "auxiliary_loss_clip": 0.01081835, + "auxiliary_loss_mlp": 0.00774034, + "balance_loss_clip": 1.04268694, + "balance_loss_mlp": 1.00058782, + "epoch": 0.45892078761460997, + "flos": 18441889516800.0, + "grad_norm": 2.5758659525876824, + "language_loss": 0.68867576, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.7072345, + "num_input_tokens_seen": 163752860, + "step": 7633, + "time_per_iteration": 2.848534107208252 + }, + { + "auxiliary_loss_clip": 0.01122955, + "auxiliary_loss_mlp": 0.01043952, + "balance_loss_clip": 1.05012798, + "balance_loss_mlp": 1.02942061, + "epoch": 0.45898091086727794, + "flos": 22672956257280.0, + "grad_norm": 1.690968390723207, + "language_loss": 0.80858737, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.83025646, + "num_input_tokens_seen": 163772495, + "step": 7634, + "time_per_iteration": 2.6536448001861572 + }, + { + "auxiliary_loss_clip": 0.01122911, + "auxiliary_loss_mlp": 0.00773021, + "balance_loss_clip": 1.04987049, + "balance_loss_mlp": 1.00053596, + "epoch": 0.4590410341199459, + "flos": 21652949744640.0, + "grad_norm": 1.6933583063541449, + "language_loss": 0.81255853, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.83151788, + "num_input_tokens_seen": 163791475, + "step": 7635, + "time_per_iteration": 2.6140496730804443 + }, + { + "auxiliary_loss_clip": 0.01110725, + "auxiliary_loss_mlp": 0.01043522, + "balance_loss_clip": 1.04990745, + "balance_loss_mlp": 1.02941322, + "epoch": 0.45910115737261387, + "flos": 36535372087680.0, + "grad_norm": 1.4938285014309638, + "language_loss": 0.64786839, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.66941082, + "num_input_tokens_seen": 163812995, + "step": 7636, + "time_per_iteration": 2.9211695194244385 + }, + { + "auxiliary_loss_clip": 0.01117391, + "auxiliary_loss_mlp": 0.01034493, + "balance_loss_clip": 1.05096126, + "balance_loss_mlp": 1.0207361, + "epoch": 0.45916128062528183, + "flos": 24419866302720.0, + "grad_norm": 1.5704675488980822, + "language_loss": 0.8052876, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.82680643, + "num_input_tokens_seen": 163833945, + "step": 7637, + "time_per_iteration": 2.703902244567871 + }, + { + "auxiliary_loss_clip": 0.0110221, + "auxiliary_loss_mlp": 0.0104296, + "balance_loss_clip": 1.04369295, + "balance_loss_mlp": 1.02650893, + "epoch": 0.4592214038779498, + "flos": 23221958705280.0, + "grad_norm": 1.340585421251073, + "language_loss": 0.75339955, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.7748512, + "num_input_tokens_seen": 163853885, + "step": 7638, + "time_per_iteration": 2.8683316707611084 + }, + { + "auxiliary_loss_clip": 0.01118666, + "auxiliary_loss_mlp": 0.01037335, + "balance_loss_clip": 1.04785442, + "balance_loss_mlp": 1.02312553, + "epoch": 0.45928152713061776, + "flos": 19172133014400.0, + "grad_norm": 1.8020175509044534, + "language_loss": 0.74017608, + "learning_rate": 2.358881852733989e-06, + "loss": 0.76173615, + "num_input_tokens_seen": 163871855, + "step": 7639, + "time_per_iteration": 2.6385724544525146 + }, + { + "auxiliary_loss_clip": 0.01134704, + "auxiliary_loss_mlp": 0.01038079, + "balance_loss_clip": 1.05116391, + "balance_loss_mlp": 1.02403021, + "epoch": 0.4593416503832857, + "flos": 22414686491520.0, + "grad_norm": 1.704541952239469, + "language_loss": 0.68183744, + "learning_rate": 2.358498705700346e-06, + "loss": 0.7035653, + "num_input_tokens_seen": 163891450, + "step": 7640, + "time_per_iteration": 2.6786441802978516 + }, + { + "auxiliary_loss_clip": 0.01104644, + "auxiliary_loss_mlp": 0.01040873, + "balance_loss_clip": 1.04305553, + "balance_loss_mlp": 1.02640736, + "epoch": 0.4594017736359537, + "flos": 18880215183360.0, + "grad_norm": 1.6440653073556697, + "language_loss": 0.75610799, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.77756315, + "num_input_tokens_seen": 163909345, + "step": 7641, + "time_per_iteration": 2.6967337131500244 + }, + { + "auxiliary_loss_clip": 0.01107468, + "auxiliary_loss_mlp": 0.0103519, + "balance_loss_clip": 1.04473758, + "balance_loss_mlp": 1.01987791, + "epoch": 0.45946189688862166, + "flos": 20518567349760.0, + "grad_norm": 1.7366807351650166, + "language_loss": 0.7477932, + "learning_rate": 2.357732370864668e-06, + "loss": 0.76921976, + "num_input_tokens_seen": 163926940, + "step": 7642, + "time_per_iteration": 2.7593836784362793 + }, + { + "auxiliary_loss_clip": 0.01033439, + "auxiliary_loss_mlp": 0.01015123, + "balance_loss_clip": 1.02063584, + "balance_loss_mlp": 1.01360917, + "epoch": 0.4595220201412896, + "flos": 61405990162560.0, + "grad_norm": 0.8870453562304583, + "language_loss": 0.58169055, + "learning_rate": 2.357349183091694e-06, + "loss": 0.60217613, + "num_input_tokens_seen": 163977785, + "step": 7643, + "time_per_iteration": 3.008721351623535 + }, + { + "auxiliary_loss_clip": 0.01126407, + "auxiliary_loss_mlp": 0.01039184, + "balance_loss_clip": 1.04902744, + "balance_loss_mlp": 1.02468801, + "epoch": 0.4595821433939576, + "flos": 23330947547520.0, + "grad_norm": 1.6727361984558426, + "language_loss": 0.92977291, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.95142883, + "num_input_tokens_seen": 163996630, + "step": 7644, + "time_per_iteration": 2.6844348907470703 + }, + { + "auxiliary_loss_clip": 0.01118806, + "auxiliary_loss_mlp": 0.0103695, + "balance_loss_clip": 1.04879534, + "balance_loss_mlp": 1.02278805, + "epoch": 0.4596422666466256, + "flos": 14282356711680.0, + "grad_norm": 2.49930104784668, + "language_loss": 0.82485175, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.84640932, + "num_input_tokens_seen": 164013190, + "step": 7645, + "time_per_iteration": 2.649367332458496 + }, + { + "auxiliary_loss_clip": 0.01010103, + "auxiliary_loss_mlp": 0.00999811, + "balance_loss_clip": 1.01816797, + "balance_loss_mlp": 0.99795145, + "epoch": 0.4597023898992936, + "flos": 65727337737600.0, + "grad_norm": 0.7581805782249401, + "language_loss": 0.59857589, + "learning_rate": 2.356199538526593e-06, + "loss": 0.61867499, + "num_input_tokens_seen": 164074030, + "step": 7646, + "time_per_iteration": 3.211512327194214 + }, + { + "auxiliary_loss_clip": 0.01116258, + "auxiliary_loss_mlp": 0.01035245, + "balance_loss_clip": 1.04631102, + "balance_loss_mlp": 1.02006984, + "epoch": 0.45976251315196154, + "flos": 26907075653760.0, + "grad_norm": 1.794903772385352, + "language_loss": 0.72503293, + "learning_rate": 2.355816296637939e-06, + "loss": 0.74654794, + "num_input_tokens_seen": 164095515, + "step": 7647, + "time_per_iteration": 2.792795419692993 + }, + { + "auxiliary_loss_clip": 0.01096575, + "auxiliary_loss_mlp": 0.01041791, + "balance_loss_clip": 1.04206514, + "balance_loss_mlp": 1.02684855, + "epoch": 0.4598226364046295, + "flos": 26618066824320.0, + "grad_norm": 1.7350588372730733, + "language_loss": 0.66805142, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.68943512, + "num_input_tokens_seen": 164117270, + "step": 7648, + "time_per_iteration": 2.798882484436035 + }, + { + "auxiliary_loss_clip": 0.01120443, + "auxiliary_loss_mlp": 0.01037713, + "balance_loss_clip": 1.04601169, + "balance_loss_mlp": 1.0234313, + "epoch": 0.45988275965729747, + "flos": 24387762522240.0, + "grad_norm": 1.4487791655991338, + "language_loss": 0.78854847, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.81013, + "num_input_tokens_seen": 164137850, + "step": 7649, + "time_per_iteration": 2.710026979446411 + }, + { + "auxiliary_loss_clip": 0.01071387, + "auxiliary_loss_mlp": 0.01039161, + "balance_loss_clip": 1.0469979, + "balance_loss_mlp": 1.02459955, + "epoch": 0.45994288290996543, + "flos": 24535822383360.0, + "grad_norm": 1.68877556398497, + "language_loss": 0.69140404, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.71250951, + "num_input_tokens_seen": 164157960, + "step": 7650, + "time_per_iteration": 2.862882375717163 + }, + { + "auxiliary_loss_clip": 0.01128714, + "auxiliary_loss_mlp": 0.01042169, + "balance_loss_clip": 1.05184257, + "balance_loss_mlp": 1.02592099, + "epoch": 0.4600030061626334, + "flos": 14830245838080.0, + "grad_norm": 2.8986833449878686, + "language_loss": 0.844868, + "learning_rate": 2.354283194302761e-06, + "loss": 0.86657685, + "num_input_tokens_seen": 164174590, + "step": 7651, + "time_per_iteration": 2.624094247817993 + }, + { + "auxiliary_loss_clip": 0.01108337, + "auxiliary_loss_mlp": 0.00771732, + "balance_loss_clip": 1.04726708, + "balance_loss_mlp": 1.00045896, + "epoch": 0.46006312941530136, + "flos": 18113845582080.0, + "grad_norm": 1.8740934460638858, + "language_loss": 0.75375748, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.77255821, + "num_input_tokens_seen": 164192935, + "step": 7652, + "time_per_iteration": 2.7064099311828613 + }, + { + "auxiliary_loss_clip": 0.01083449, + "auxiliary_loss_mlp": 0.01033562, + "balance_loss_clip": 1.04353166, + "balance_loss_mlp": 1.019364, + "epoch": 0.46012325266796933, + "flos": 21976468565760.0, + "grad_norm": 1.6780448716001595, + "language_loss": 0.75990206, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.78107214, + "num_input_tokens_seen": 164213160, + "step": 7653, + "time_per_iteration": 2.840228319168091 + }, + { + "auxiliary_loss_clip": 0.01090017, + "auxiliary_loss_mlp": 0.01037352, + "balance_loss_clip": 1.04773235, + "balance_loss_mlp": 1.02063906, + "epoch": 0.4601833759206373, + "flos": 15268068714240.0, + "grad_norm": 4.060223218919271, + "language_loss": 0.65658432, + "learning_rate": 2.353133226438741e-06, + "loss": 0.67785805, + "num_input_tokens_seen": 164229330, + "step": 7654, + "time_per_iteration": 2.8097331523895264 + }, + { + "auxiliary_loss_clip": 0.0110323, + "auxiliary_loss_mlp": 0.01038674, + "balance_loss_clip": 1.04187179, + "balance_loss_mlp": 1.02436912, + "epoch": 0.46024349917330526, + "flos": 27088999061760.0, + "grad_norm": 1.8761760458574834, + "language_loss": 0.79274917, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.81416821, + "num_input_tokens_seen": 164248240, + "step": 7655, + "time_per_iteration": 2.758086681365967 + }, + { + "auxiliary_loss_clip": 0.01090903, + "auxiliary_loss_mlp": 0.01032546, + "balance_loss_clip": 1.0439781, + "balance_loss_mlp": 1.01801491, + "epoch": 0.4603036224259732, + "flos": 24462923731200.0, + "grad_norm": 1.6240518023721515, + "language_loss": 0.68172526, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.70295978, + "num_input_tokens_seen": 164268020, + "step": 7656, + "time_per_iteration": 2.740079402923584 + }, + { + "auxiliary_loss_clip": 0.01107571, + "auxiliary_loss_mlp": 0.01034222, + "balance_loss_clip": 1.04353023, + "balance_loss_mlp": 1.02023935, + "epoch": 0.4603637456786412, + "flos": 28109292883200.0, + "grad_norm": 2.01428243239582, + "language_loss": 0.80944681, + "learning_rate": 2.351983138057098e-06, + "loss": 0.83086479, + "num_input_tokens_seen": 164287305, + "step": 7657, + "time_per_iteration": 5.946510314941406 + }, + { + "auxiliary_loss_clip": 0.01130018, + "auxiliary_loss_mlp": 0.00771647, + "balance_loss_clip": 1.04671657, + "balance_loss_mlp": 1.00056028, + "epoch": 0.4604238689313092, + "flos": 24348942898560.0, + "grad_norm": 2.997035997447325, + "language_loss": 0.70678955, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.72580624, + "num_input_tokens_seen": 164306835, + "step": 7658, + "time_per_iteration": 2.710728883743286 + }, + { + "auxiliary_loss_clip": 0.01037878, + "auxiliary_loss_mlp": 0.01003053, + "balance_loss_clip": 1.01928806, + "balance_loss_mlp": 1.00126505, + "epoch": 0.4604839921839772, + "flos": 53606229431040.0, + "grad_norm": 0.9879963677197028, + "language_loss": 0.62104321, + "learning_rate": 2.351216345708928e-06, + "loss": 0.64145255, + "num_input_tokens_seen": 164367095, + "step": 7659, + "time_per_iteration": 4.733903646469116 + }, + { + "auxiliary_loss_clip": 0.01079557, + "auxiliary_loss_mlp": 0.01042331, + "balance_loss_clip": 1.04242504, + "balance_loss_mlp": 1.02548122, + "epoch": 0.46054411543664514, + "flos": 31248424126080.0, + "grad_norm": 1.6833434349921483, + "language_loss": 0.68750244, + "learning_rate": 2.350832929550336e-06, + "loss": 0.70872128, + "num_input_tokens_seen": 164388895, + "step": 7660, + "time_per_iteration": 2.8501877784729004 + }, + { + "auxiliary_loss_clip": 0.01115644, + "auxiliary_loss_mlp": 0.01039595, + "balance_loss_clip": 1.04312992, + "balance_loss_mlp": 1.02450275, + "epoch": 0.4606042386893131, + "flos": 24092863862400.0, + "grad_norm": 4.508470627980692, + "language_loss": 0.77059424, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.79214668, + "num_input_tokens_seen": 164409080, + "step": 7661, + "time_per_iteration": 4.375652313232422 + }, + { + "auxiliary_loss_clip": 0.01111668, + "auxiliary_loss_mlp": 0.01045702, + "balance_loss_clip": 1.04530478, + "balance_loss_mlp": 1.02989531, + "epoch": 0.46066436194198107, + "flos": 26578457101440.0, + "grad_norm": 1.8557827945777399, + "language_loss": 0.75165689, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.77323061, + "num_input_tokens_seen": 164427585, + "step": 7662, + "time_per_iteration": 2.654381513595581 + }, + { + "auxiliary_loss_clip": 0.01104085, + "auxiliary_loss_mlp": 0.01041771, + "balance_loss_clip": 1.0422461, + "balance_loss_mlp": 1.02585697, + "epoch": 0.46072448519464904, + "flos": 17775602184960.0, + "grad_norm": 3.5055114571256922, + "language_loss": 0.79886508, + "learning_rate": 2.349682601310998e-06, + "loss": 0.82032371, + "num_input_tokens_seen": 164438455, + "step": 7663, + "time_per_iteration": 2.6240744590759277 + }, + { + "auxiliary_loss_clip": 0.0111588, + "auxiliary_loss_mlp": 0.01034844, + "balance_loss_clip": 1.04562616, + "balance_loss_mlp": 1.02098536, + "epoch": 0.460784608447317, + "flos": 15086109392640.0, + "grad_norm": 2.0015713101361565, + "language_loss": 0.73791528, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.75942254, + "num_input_tokens_seen": 164456830, + "step": 7664, + "time_per_iteration": 2.673335075378418 + }, + { + "auxiliary_loss_clip": 0.01096445, + "auxiliary_loss_mlp": 0.01036863, + "balance_loss_clip": 1.04571927, + "balance_loss_mlp": 1.02313614, + "epoch": 0.46084473169998497, + "flos": 18588261438720.0, + "grad_norm": 1.5274295482700302, + "language_loss": 0.7257731, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.74710619, + "num_input_tokens_seen": 164475375, + "step": 7665, + "time_per_iteration": 2.7057924270629883 + }, + { + "auxiliary_loss_clip": 0.01104187, + "auxiliary_loss_mlp": 0.01034968, + "balance_loss_clip": 1.04968786, + "balance_loss_mlp": 1.02148521, + "epoch": 0.46090485495265293, + "flos": 19494789909120.0, + "grad_norm": 1.7665019302136358, + "language_loss": 0.78369665, + "learning_rate": 2.348532153731669e-06, + "loss": 0.80508822, + "num_input_tokens_seen": 164492040, + "step": 7666, + "time_per_iteration": 2.6954169273376465 + }, + { + "auxiliary_loss_clip": 0.0108371, + "auxiliary_loss_mlp": 0.01035058, + "balance_loss_clip": 1.04061627, + "balance_loss_mlp": 1.01935792, + "epoch": 0.4609649782053209, + "flos": 33364927163520.0, + "grad_norm": 1.7291426769142197, + "language_loss": 0.74374932, + "learning_rate": 2.348148644753088e-06, + "loss": 0.76493704, + "num_input_tokens_seen": 164513665, + "step": 7667, + "time_per_iteration": 2.781087636947632 + }, + { + "auxiliary_loss_clip": 0.01083108, + "auxiliary_loss_mlp": 0.01038011, + "balance_loss_clip": 1.04470205, + "balance_loss_mlp": 1.02440965, + "epoch": 0.46102510145798886, + "flos": 23769165473280.0, + "grad_norm": 1.4213815945133983, + "language_loss": 0.75993818, + "learning_rate": 2.347765122572676e-06, + "loss": 0.78114939, + "num_input_tokens_seen": 164533890, + "step": 7668, + "time_per_iteration": 2.8653104305267334 + }, + { + "auxiliary_loss_clip": 0.010726, + "auxiliary_loss_mlp": 0.01033857, + "balance_loss_clip": 1.04025698, + "balance_loss_mlp": 1.02047563, + "epoch": 0.4610852247106568, + "flos": 23294821443840.0, + "grad_norm": 1.7696248586775516, + "language_loss": 0.78228277, + "learning_rate": 2.347381587204975e-06, + "loss": 0.80334735, + "num_input_tokens_seen": 164553815, + "step": 7669, + "time_per_iteration": 2.783662796020508 + }, + { + "auxiliary_loss_clip": 0.01110483, + "auxiliary_loss_mlp": 0.01038047, + "balance_loss_clip": 1.04095972, + "balance_loss_mlp": 1.02259183, + "epoch": 0.4611453479633248, + "flos": 25447450584960.0, + "grad_norm": 1.7322551840105593, + "language_loss": 0.82352221, + "learning_rate": 2.34699803866453e-06, + "loss": 0.84500754, + "num_input_tokens_seen": 164573125, + "step": 7670, + "time_per_iteration": 2.6722826957702637 + }, + { + "auxiliary_loss_clip": 0.01118191, + "auxiliary_loss_mlp": 0.01034929, + "balance_loss_clip": 1.04624724, + "balance_loss_mlp": 1.02086234, + "epoch": 0.4612054712159928, + "flos": 21139606523520.0, + "grad_norm": 1.6399167633004121, + "language_loss": 0.63361788, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.6551491, + "num_input_tokens_seen": 164592575, + "step": 7671, + "time_per_iteration": 2.6507785320281982 + }, + { + "auxiliary_loss_clip": 0.01038838, + "auxiliary_loss_mlp": 0.01005964, + "balance_loss_clip": 1.02976012, + "balance_loss_mlp": 1.0044564, + "epoch": 0.4612655944686608, + "flos": 69959266404480.0, + "grad_norm": 0.6926647500019024, + "language_loss": 0.55842638, + "learning_rate": 2.346230902123583e-06, + "loss": 0.57887447, + "num_input_tokens_seen": 164659795, + "step": 7672, + "time_per_iteration": 3.330268144607544 + }, + { + "auxiliary_loss_clip": 0.01119098, + "auxiliary_loss_mlp": 0.01040288, + "balance_loss_clip": 1.04617, + "balance_loss_mlp": 1.02645397, + "epoch": 0.46132571772132874, + "flos": 16837149502080.0, + "grad_norm": 1.8809200572873195, + "language_loss": 0.70954943, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.7311433, + "num_input_tokens_seen": 164678735, + "step": 7673, + "time_per_iteration": 2.65659499168396 + }, + { + "auxiliary_loss_clip": 0.01103001, + "auxiliary_loss_mlp": 0.01033294, + "balance_loss_clip": 1.04363799, + "balance_loss_mlp": 1.01938248, + "epoch": 0.4613858409739967, + "flos": 35808935431680.0, + "grad_norm": 1.9110713796675685, + "language_loss": 0.70837104, + "learning_rate": 2.345463713066195e-06, + "loss": 0.72973394, + "num_input_tokens_seen": 164700885, + "step": 7674, + "time_per_iteration": 2.8332366943359375 + }, + { + "auxiliary_loss_clip": 0.01103023, + "auxiliary_loss_mlp": 0.0104104, + "balance_loss_clip": 1.04143381, + "balance_loss_mlp": 1.02709818, + "epoch": 0.4614459642266647, + "flos": 35266756567680.0, + "grad_norm": 1.6933433527162, + "language_loss": 0.65489, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.67633063, + "num_input_tokens_seen": 164726960, + "step": 7675, + "time_per_iteration": 2.8454952239990234 + }, + { + "auxiliary_loss_clip": 0.01047065, + "auxiliary_loss_mlp": 0.01003099, + "balance_loss_clip": 1.02009785, + "balance_loss_mlp": 1.00131118, + "epoch": 0.46150608747933264, + "flos": 66704610044160.0, + "grad_norm": 0.8598142136337862, + "language_loss": 0.58659744, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.60709906, + "num_input_tokens_seen": 164788525, + "step": 7676, + "time_per_iteration": 3.1523091793060303 + }, + { + "auxiliary_loss_clip": 0.0101473, + "auxiliary_loss_mlp": 0.01002448, + "balance_loss_clip": 1.01614749, + "balance_loss_mlp": 1.00077868, + "epoch": 0.4615662107320006, + "flos": 55830177025920.0, + "grad_norm": 0.7931279707742926, + "language_loss": 0.62803817, + "learning_rate": 2.344312831266341e-06, + "loss": 0.64820993, + "num_input_tokens_seen": 164843525, + "step": 7677, + "time_per_iteration": 3.1055288314819336 + }, + { + "auxiliary_loss_clip": 0.01103004, + "auxiliary_loss_mlp": 0.01036602, + "balance_loss_clip": 1.04363084, + "balance_loss_mlp": 1.02309012, + "epoch": 0.46162633398466857, + "flos": 15483245137920.0, + "grad_norm": 2.4819209870900636, + "language_loss": 0.76371491, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.78511101, + "num_input_tokens_seen": 164859895, + "step": 7678, + "time_per_iteration": 2.6796817779541016 + }, + { + "auxiliary_loss_clip": 0.01131922, + "auxiliary_loss_mlp": 0.01035943, + "balance_loss_clip": 1.04888463, + "balance_loss_mlp": 1.02157795, + "epoch": 0.46168645723733653, + "flos": 20011437181440.0, + "grad_norm": 2.4568506909255974, + "language_loss": 0.66881382, + "learning_rate": 2.343545511426974e-06, + "loss": 0.69049251, + "num_input_tokens_seen": 164878030, + "step": 7679, + "time_per_iteration": 2.669527053833008 + }, + { + "auxiliary_loss_clip": 0.01095986, + "auxiliary_loss_mlp": 0.01037988, + "balance_loss_clip": 1.04533219, + "balance_loss_mlp": 1.02469063, + "epoch": 0.4617465804900045, + "flos": 20298542590080.0, + "grad_norm": 2.335341416202827, + "language_loss": 0.70432782, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.7256676, + "num_input_tokens_seen": 164895710, + "step": 7680, + "time_per_iteration": 2.7286808490753174 + }, + { + "auxiliary_loss_clip": 0.01137583, + "auxiliary_loss_mlp": 0.01043671, + "balance_loss_clip": 1.05160725, + "balance_loss_mlp": 1.02904963, + "epoch": 0.46180670374267246, + "flos": 22346312952960.0, + "grad_norm": 1.9037139750308347, + "language_loss": 0.63464803, + "learning_rate": 2.342778139478487e-06, + "loss": 0.65646052, + "num_input_tokens_seen": 164913365, + "step": 7681, + "time_per_iteration": 2.6214568614959717 + }, + { + "auxiliary_loss_clip": 0.01116453, + "auxiliary_loss_mlp": 0.01029466, + "balance_loss_clip": 1.04633749, + "balance_loss_mlp": 1.01636481, + "epoch": 0.46186682699534043, + "flos": 19895696582400.0, + "grad_norm": 1.5164971745129476, + "language_loss": 0.67357612, + "learning_rate": 2.342394433999697e-06, + "loss": 0.69503522, + "num_input_tokens_seen": 164931620, + "step": 7682, + "time_per_iteration": 2.647353410720825 + }, + { + "auxiliary_loss_clip": 0.01088835, + "auxiliary_loss_mlp": 0.01041013, + "balance_loss_clip": 1.04340196, + "balance_loss_mlp": 1.02619505, + "epoch": 0.4619269502480084, + "flos": 31503569408640.0, + "grad_norm": 2.227871519060849, + "language_loss": 0.73820949, + "learning_rate": 2.342010715537275e-06, + "loss": 0.75950789, + "num_input_tokens_seen": 164950905, + "step": 7683, + "time_per_iteration": 2.7580692768096924 + }, + { + "auxiliary_loss_clip": 0.01128951, + "auxiliary_loss_mlp": 0.01039533, + "balance_loss_clip": 1.04759753, + "balance_loss_mlp": 1.02627087, + "epoch": 0.46198707350067636, + "flos": 25009484054400.0, + "grad_norm": 1.7711337337418462, + "language_loss": 0.76479292, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.7864778, + "num_input_tokens_seen": 164970950, + "step": 7684, + "time_per_iteration": 2.6827478408813477 + }, + { + "auxiliary_loss_clip": 0.01136661, + "auxiliary_loss_mlp": 0.01044253, + "balance_loss_clip": 1.0495609, + "balance_loss_mlp": 1.02969098, + "epoch": 0.4620471967533444, + "flos": 18292357198080.0, + "grad_norm": 1.8114594945271643, + "language_loss": 0.79657519, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.81838435, + "num_input_tokens_seen": 164989855, + "step": 7685, + "time_per_iteration": 2.6539084911346436 + }, + { + "auxiliary_loss_clip": 0.01085193, + "auxiliary_loss_mlp": 0.01046975, + "balance_loss_clip": 1.04328656, + "balance_loss_mlp": 1.03158486, + "epoch": 0.46210732000601235, + "flos": 33985104410880.0, + "grad_norm": 2.276305365525513, + "language_loss": 0.66791403, + "learning_rate": 2.340859482393731e-06, + "loss": 0.68923569, + "num_input_tokens_seen": 165012290, + "step": 7686, + "time_per_iteration": 2.8229949474334717 + }, + { + "auxiliary_loss_clip": 0.01106797, + "auxiliary_loss_mlp": 0.00772257, + "balance_loss_clip": 1.04507184, + "balance_loss_mlp": 1.00066257, + "epoch": 0.4621674432586803, + "flos": 25009412227200.0, + "grad_norm": 2.1846142929829693, + "language_loss": 0.73938292, + "learning_rate": 2.340475712142296e-06, + "loss": 0.75817347, + "num_input_tokens_seen": 165030810, + "step": 7687, + "time_per_iteration": 2.8577284812927246 + }, + { + "auxiliary_loss_clip": 0.01066455, + "auxiliary_loss_mlp": 0.01038717, + "balance_loss_clip": 1.0470593, + "balance_loss_mlp": 1.02399492, + "epoch": 0.4622275665113483, + "flos": 22014031213440.0, + "grad_norm": 2.1409043019128253, + "language_loss": 0.74955392, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.77060568, + "num_input_tokens_seen": 165050205, + "step": 7688, + "time_per_iteration": 2.8981478214263916 + }, + { + "auxiliary_loss_clip": 0.01076735, + "auxiliary_loss_mlp": 0.00771909, + "balance_loss_clip": 1.03838563, + "balance_loss_mlp": 1.0005393, + "epoch": 0.46228768976401624, + "flos": 24058820747520.0, + "grad_norm": 1.6416992765701228, + "language_loss": 0.78753114, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.80601752, + "num_input_tokens_seen": 165069370, + "step": 7689, + "time_per_iteration": 2.8450090885162354 + }, + { + "auxiliary_loss_clip": 0.01117226, + "auxiliary_loss_mlp": 0.01039789, + "balance_loss_clip": 1.04319644, + "balance_loss_mlp": 1.02512646, + "epoch": 0.4623478130166842, + "flos": 26651391667200.0, + "grad_norm": 2.047300589730092, + "language_loss": 0.56996405, + "learning_rate": 2.339324323980964e-06, + "loss": 0.5915342, + "num_input_tokens_seen": 165089610, + "step": 7690, + "time_per_iteration": 2.6919097900390625 + }, + { + "auxiliary_loss_clip": 0.0111777, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.04474783, + "balance_loss_mlp": 1.02853799, + "epoch": 0.46240793626935217, + "flos": 20558428467840.0, + "grad_norm": 2.950419828824325, + "language_loss": 0.82586032, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.84746557, + "num_input_tokens_seen": 165109050, + "step": 7691, + "time_per_iteration": 2.695331573486328 + }, + { + "auxiliary_loss_clip": 0.01108828, + "auxiliary_loss_mlp": 0.01034489, + "balance_loss_clip": 1.04660177, + "balance_loss_mlp": 1.02088761, + "epoch": 0.46246805952202014, + "flos": 22456055980800.0, + "grad_norm": 1.4872733065963748, + "language_loss": 0.75199407, + "learning_rate": 2.338556667513091e-06, + "loss": 0.77342725, + "num_input_tokens_seen": 165130130, + "step": 7692, + "time_per_iteration": 2.6822991371154785 + }, + { + "auxiliary_loss_clip": 0.01097579, + "auxiliary_loss_mlp": 0.01044516, + "balance_loss_clip": 1.04742086, + "balance_loss_mlp": 1.0297097, + "epoch": 0.4625281827746881, + "flos": 35041308854400.0, + "grad_norm": 1.6276482481397991, + "language_loss": 0.74345845, + "learning_rate": 2.338172820014723e-06, + "loss": 0.76487935, + "num_input_tokens_seen": 165152685, + "step": 7693, + "time_per_iteration": 2.8581414222717285 + }, + { + "auxiliary_loss_clip": 0.01087933, + "auxiliary_loss_mlp": 0.01056162, + "balance_loss_clip": 1.04530871, + "balance_loss_mlp": 1.04086781, + "epoch": 0.46258830602735607, + "flos": 21068647205760.0, + "grad_norm": 2.088066659615079, + "language_loss": 0.85329688, + "learning_rate": 2.337788959692808e-06, + "loss": 0.8747378, + "num_input_tokens_seen": 165173315, + "step": 7694, + "time_per_iteration": 2.730196237564087 + }, + { + "auxiliary_loss_clip": 0.01111115, + "auxiliary_loss_mlp": 0.01042848, + "balance_loss_clip": 1.04707479, + "balance_loss_mlp": 1.02936506, + "epoch": 0.46264842928002403, + "flos": 26177227205760.0, + "grad_norm": 2.853578946778756, + "language_loss": 0.79611814, + "learning_rate": 2.337405086561902e-06, + "loss": 0.81765783, + "num_input_tokens_seen": 165192395, + "step": 7695, + "time_per_iteration": 2.7454562187194824 + }, + { + "auxiliary_loss_clip": 0.01114811, + "auxiliary_loss_mlp": 0.01037414, + "balance_loss_clip": 1.04553604, + "balance_loss_mlp": 1.02390218, + "epoch": 0.462708552532692, + "flos": 16764214936320.0, + "grad_norm": 1.803891217274167, + "language_loss": 0.72445035, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.74597263, + "num_input_tokens_seen": 165211355, + "step": 7696, + "time_per_iteration": 4.214217901229858 + }, + { + "auxiliary_loss_clip": 0.01110882, + "auxiliary_loss_mlp": 0.01046867, + "balance_loss_clip": 1.04748213, + "balance_loss_mlp": 1.03221607, + "epoch": 0.46276867578535996, + "flos": 15560453422080.0, + "grad_norm": 1.5710514609338178, + "language_loss": 0.69939005, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.72096753, + "num_input_tokens_seen": 165229380, + "step": 7697, + "time_per_iteration": 4.213683843612671 + }, + { + "auxiliary_loss_clip": 0.01133171, + "auxiliary_loss_mlp": 0.01036334, + "balance_loss_clip": 1.05145979, + "balance_loss_mlp": 1.02264249, + "epoch": 0.462828799038028, + "flos": 22415404763520.0, + "grad_norm": 1.9243080556164578, + "language_loss": 0.84559363, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.86728865, + "num_input_tokens_seen": 165247200, + "step": 7698, + "time_per_iteration": 2.6434006690979004 + }, + { + "auxiliary_loss_clip": 0.01130166, + "auxiliary_loss_mlp": 0.01037324, + "balance_loss_clip": 1.04838073, + "balance_loss_mlp": 1.02357352, + "epoch": 0.46288892229069595, + "flos": 21069580959360.0, + "grad_norm": 8.31912219741259, + "language_loss": 0.71345413, + "learning_rate": 2.335869466239502e-06, + "loss": 0.73512906, + "num_input_tokens_seen": 165265825, + "step": 7699, + "time_per_iteration": 4.157729387283325 + }, + { + "auxiliary_loss_clip": 0.01073609, + "auxiliary_loss_mlp": 0.01040377, + "balance_loss_clip": 1.04345739, + "balance_loss_mlp": 1.02550519, + "epoch": 0.4629490455433639, + "flos": 23185688947200.0, + "grad_norm": 1.732328117704307, + "language_loss": 0.71911675, + "learning_rate": 2.335485529281996e-06, + "loss": 0.74025667, + "num_input_tokens_seen": 165284380, + "step": 7700, + "time_per_iteration": 2.8432295322418213 + }, + { + "auxiliary_loss_clip": 0.01128125, + "auxiliary_loss_mlp": 0.00771852, + "balance_loss_clip": 1.04640698, + "balance_loss_mlp": 1.00047588, + "epoch": 0.4630091687960319, + "flos": 18835541642880.0, + "grad_norm": 2.4184025660528863, + "language_loss": 0.73149109, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.7504909, + "num_input_tokens_seen": 165300320, + "step": 7701, + "time_per_iteration": 4.2371203899383545 + }, + { + "auxiliary_loss_clip": 0.01087014, + "auxiliary_loss_mlp": 0.01044166, + "balance_loss_clip": 1.04401398, + "balance_loss_mlp": 1.02921128, + "epoch": 0.46306929204869984, + "flos": 38907020407680.0, + "grad_norm": 2.4372676297457216, + "language_loss": 0.65005761, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.67136943, + "num_input_tokens_seen": 165318130, + "step": 7702, + "time_per_iteration": 2.875633716583252 + }, + { + "auxiliary_loss_clip": 0.01103467, + "auxiliary_loss_mlp": 0.01032726, + "balance_loss_clip": 1.04441071, + "balance_loss_mlp": 1.01875424, + "epoch": 0.4631294153013678, + "flos": 19644178573440.0, + "grad_norm": 1.9024039666922008, + "language_loss": 0.73310453, + "learning_rate": 2.33433364213785e-06, + "loss": 0.75446641, + "num_input_tokens_seen": 165336225, + "step": 7703, + "time_per_iteration": 2.7307324409484863 + }, + { + "auxiliary_loss_clip": 0.01109216, + "auxiliary_loss_mlp": 0.0103683, + "balance_loss_clip": 1.04673266, + "balance_loss_mlp": 1.02145839, + "epoch": 0.4631895385540358, + "flos": 24608254158720.0, + "grad_norm": 1.9428423147374236, + "language_loss": 0.68751299, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.70897353, + "num_input_tokens_seen": 165355005, + "step": 7704, + "time_per_iteration": 2.7113852500915527 + }, + { + "auxiliary_loss_clip": 0.01120314, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.04720986, + "balance_loss_mlp": 1.01935196, + "epoch": 0.46324966180670374, + "flos": 26320115508480.0, + "grad_norm": 2.3420396256779443, + "language_loss": 0.81331742, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.83486044, + "num_input_tokens_seen": 165374910, + "step": 7705, + "time_per_iteration": 2.804708480834961 + }, + { + "auxiliary_loss_clip": 0.01119161, + "auxiliary_loss_mlp": 0.01035806, + "balance_loss_clip": 1.04762256, + "balance_loss_mlp": 1.02172124, + "epoch": 0.4633097850593717, + "flos": 19240506552960.0, + "grad_norm": 1.6909152504462979, + "language_loss": 0.77714217, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.79869187, + "num_input_tokens_seen": 165392590, + "step": 7706, + "time_per_iteration": 2.67990779876709 + }, + { + "auxiliary_loss_clip": 0.01102016, + "auxiliary_loss_mlp": 0.01033802, + "balance_loss_clip": 1.04767776, + "balance_loss_mlp": 1.02023578, + "epoch": 0.46336990831203967, + "flos": 22783166161920.0, + "grad_norm": 2.039386256395222, + "language_loss": 0.699494, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.7208522, + "num_input_tokens_seen": 165411195, + "step": 7707, + "time_per_iteration": 2.7109720706939697 + }, + { + "auxiliary_loss_clip": 0.01111011, + "auxiliary_loss_mlp": 0.01038647, + "balance_loss_clip": 1.04469609, + "balance_loss_mlp": 1.02306628, + "epoch": 0.46343003156470763, + "flos": 38210604543360.0, + "grad_norm": 1.931472234163978, + "language_loss": 0.61287057, + "learning_rate": 2.332413576865791e-06, + "loss": 0.63436711, + "num_input_tokens_seen": 165430150, + "step": 7708, + "time_per_iteration": 2.8489346504211426 + }, + { + "auxiliary_loss_clip": 0.01089075, + "auxiliary_loss_mlp": 0.01033464, + "balance_loss_clip": 1.04273093, + "balance_loss_mlp": 1.01930773, + "epoch": 0.4634901548173756, + "flos": 31938555110400.0, + "grad_norm": 2.4081522593734332, + "language_loss": 0.77443427, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.79565972, + "num_input_tokens_seen": 165450595, + "step": 7709, + "time_per_iteration": 2.720604419708252 + }, + { + "auxiliary_loss_clip": 0.01134634, + "auxiliary_loss_mlp": 0.0103959, + "balance_loss_clip": 1.04938257, + "balance_loss_mlp": 1.02433753, + "epoch": 0.46355027807004356, + "flos": 20082540153600.0, + "grad_norm": 1.78810829524809, + "language_loss": 0.77216917, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.79391134, + "num_input_tokens_seen": 165469515, + "step": 7710, + "time_per_iteration": 2.5303022861480713 + }, + { + "auxiliary_loss_clip": 0.01122514, + "auxiliary_loss_mlp": 0.01037619, + "balance_loss_clip": 1.04637122, + "balance_loss_mlp": 1.02172804, + "epoch": 0.4636104013227116, + "flos": 24061370613120.0, + "grad_norm": 2.2400017320201187, + "language_loss": 0.73509276, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.75669408, + "num_input_tokens_seen": 165488125, + "step": 7711, + "time_per_iteration": 2.5654797554016113 + }, + { + "auxiliary_loss_clip": 0.0110546, + "auxiliary_loss_mlp": 0.01046309, + "balance_loss_clip": 1.04776788, + "balance_loss_mlp": 1.03109789, + "epoch": 0.46367052457537955, + "flos": 23914639555200.0, + "grad_norm": 1.4625168937424313, + "language_loss": 0.71734262, + "learning_rate": 2.33087729766797e-06, + "loss": 0.73886031, + "num_input_tokens_seen": 165509225, + "step": 7712, + "time_per_iteration": 2.6021108627319336 + }, + { + "auxiliary_loss_clip": 0.01109448, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.04681897, + "balance_loss_mlp": 1.02359128, + "epoch": 0.4637306478280475, + "flos": 26396533693440.0, + "grad_norm": 10.680731903132253, + "language_loss": 0.73100054, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.75249463, + "num_input_tokens_seen": 165529945, + "step": 7713, + "time_per_iteration": 2.7074029445648193 + }, + { + "auxiliary_loss_clip": 0.01098034, + "auxiliary_loss_mlp": 0.01037925, + "balance_loss_clip": 1.0441041, + "balance_loss_mlp": 1.02191556, + "epoch": 0.4637907710807155, + "flos": 21980706370560.0, + "grad_norm": 1.6982870192648571, + "language_loss": 0.5889293, + "learning_rate": 2.3301090827294e-06, + "loss": 0.61028892, + "num_input_tokens_seen": 165550690, + "step": 7714, + "time_per_iteration": 2.710048198699951 + }, + { + "auxiliary_loss_clip": 0.01120282, + "auxiliary_loss_mlp": 0.01034073, + "balance_loss_clip": 1.04763293, + "balance_loss_mlp": 1.01950562, + "epoch": 0.46385089433338345, + "flos": 12422291846400.0, + "grad_norm": 1.91274815186046, + "language_loss": 0.70204347, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.72358704, + "num_input_tokens_seen": 165567775, + "step": 7715, + "time_per_iteration": 2.6403465270996094 + }, + { + "auxiliary_loss_clip": 0.01138235, + "auxiliary_loss_mlp": 0.01041941, + "balance_loss_clip": 1.04938495, + "balance_loss_mlp": 1.02725387, + "epoch": 0.4639110175860514, + "flos": 23915752876800.0, + "grad_norm": 2.6000471859571777, + "language_loss": 0.68646967, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.7082715, + "num_input_tokens_seen": 165587010, + "step": 7716, + "time_per_iteration": 2.6233439445495605 + }, + { + "auxiliary_loss_clip": 0.01132713, + "auxiliary_loss_mlp": 0.01031179, + "balance_loss_clip": 1.0472188, + "balance_loss_mlp": 1.01599193, + "epoch": 0.4639711408387194, + "flos": 25300396304640.0, + "grad_norm": 1.7614766285874086, + "language_loss": 0.809901, + "learning_rate": 2.328956666474691e-06, + "loss": 0.83153987, + "num_input_tokens_seen": 165607850, + "step": 7717, + "time_per_iteration": 2.6267318725585938 + }, + { + "auxiliary_loss_clip": 0.01131786, + "auxiliary_loss_mlp": 0.01036738, + "balance_loss_clip": 1.0477078, + "balance_loss_mlp": 1.02206373, + "epoch": 0.46403126409138734, + "flos": 21211822817280.0, + "grad_norm": 1.7513215449973674, + "language_loss": 0.73192513, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.75361037, + "num_input_tokens_seen": 165627175, + "step": 7718, + "time_per_iteration": 2.5936009883880615 + }, + { + "auxiliary_loss_clip": 0.01129362, + "auxiliary_loss_mlp": 0.00772229, + "balance_loss_clip": 1.04671347, + "balance_loss_mlp": 1.00063276, + "epoch": 0.4640913873440553, + "flos": 35845564325760.0, + "grad_norm": 1.6991265809872926, + "language_loss": 0.70156294, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.72057891, + "num_input_tokens_seen": 165648340, + "step": 7719, + "time_per_iteration": 2.7047362327575684 + }, + { + "auxiliary_loss_clip": 0.01112084, + "auxiliary_loss_mlp": 0.01036441, + "balance_loss_clip": 1.05082273, + "balance_loss_mlp": 1.02101541, + "epoch": 0.46415151059672327, + "flos": 19166207270400.0, + "grad_norm": 2.142564905802957, + "language_loss": 0.86823177, + "learning_rate": 2.327804137953357e-06, + "loss": 0.88971704, + "num_input_tokens_seen": 165667195, + "step": 7720, + "time_per_iteration": 2.7309963703155518 + }, + { + "auxiliary_loss_clip": 0.01032352, + "auxiliary_loss_mlp": 0.01008212, + "balance_loss_clip": 1.02414155, + "balance_loss_mlp": 1.00647151, + "epoch": 0.46421163384939124, + "flos": 58912750304640.0, + "grad_norm": 0.7188509278747012, + "language_loss": 0.55039424, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.57079989, + "num_input_tokens_seen": 165726760, + "step": 7721, + "time_per_iteration": 3.236877679824829 + }, + { + "auxiliary_loss_clip": 0.01107525, + "auxiliary_loss_mlp": 0.01036882, + "balance_loss_clip": 1.04643178, + "balance_loss_mlp": 1.02240443, + "epoch": 0.4642717571020592, + "flos": 20157342226560.0, + "grad_norm": 2.140310045449241, + "language_loss": 0.79792923, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.81937331, + "num_input_tokens_seen": 165745005, + "step": 7722, + "time_per_iteration": 2.660754919052124 + }, + { + "auxiliary_loss_clip": 0.01135285, + "auxiliary_loss_mlp": 0.01039973, + "balance_loss_clip": 1.04771972, + "balance_loss_mlp": 1.02478552, + "epoch": 0.46433188035472717, + "flos": 25046184775680.0, + "grad_norm": 1.8420199747356898, + "language_loss": 0.77947485, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.80122739, + "num_input_tokens_seen": 165765750, + "step": 7723, + "time_per_iteration": 2.650667667388916 + }, + { + "auxiliary_loss_clip": 0.010296, + "auxiliary_loss_mlp": 0.01034411, + "balance_loss_clip": 1.03560913, + "balance_loss_mlp": 1.01945066, + "epoch": 0.4643920036073952, + "flos": 28075644817920.0, + "grad_norm": 1.6775959652720056, + "language_loss": 0.68506896, + "learning_rate": 2.326267259301118e-06, + "loss": 0.7057091, + "num_input_tokens_seen": 165787515, + "step": 7724, + "time_per_iteration": 3.0586209297180176 + }, + { + "auxiliary_loss_clip": 0.01115779, + "auxiliary_loss_mlp": 0.01034262, + "balance_loss_clip": 1.04832113, + "balance_loss_mlp": 1.0193367, + "epoch": 0.46445212686006315, + "flos": 18369350000640.0, + "grad_norm": 3.606583728635542, + "language_loss": 0.67163348, + "learning_rate": 2.325883008671415e-06, + "loss": 0.69313383, + "num_input_tokens_seen": 165806675, + "step": 7725, + "time_per_iteration": 2.9137332439422607 + }, + { + "auxiliary_loss_clip": 0.01113984, + "auxiliary_loss_mlp": 0.01038381, + "balance_loss_clip": 1.04604602, + "balance_loss_mlp": 1.02554178, + "epoch": 0.4645122501127311, + "flos": 31721618920320.0, + "grad_norm": 1.751091551827286, + "language_loss": 0.65037453, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.67189825, + "num_input_tokens_seen": 165829835, + "step": 7726, + "time_per_iteration": 2.7184534072875977 + }, + { + "auxiliary_loss_clip": 0.0110497, + "auxiliary_loss_mlp": 0.00772968, + "balance_loss_clip": 1.04436016, + "balance_loss_mlp": 1.00061822, + "epoch": 0.4645723733653991, + "flos": 23768806337280.0, + "grad_norm": 1.6559858063545494, + "language_loss": 0.74796247, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.76674187, + "num_input_tokens_seen": 165849380, + "step": 7727, + "time_per_iteration": 2.7193634510040283 + }, + { + "auxiliary_loss_clip": 0.01107461, + "auxiliary_loss_mlp": 0.0104049, + "balance_loss_clip": 1.0458529, + "balance_loss_mlp": 1.02506471, + "epoch": 0.46463249661806705, + "flos": 33145512935040.0, + "grad_norm": 2.1928121253358293, + "language_loss": 0.78549933, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.80697882, + "num_input_tokens_seen": 165868620, + "step": 7728, + "time_per_iteration": 2.744900703430176 + }, + { + "auxiliary_loss_clip": 0.01092904, + "auxiliary_loss_mlp": 0.01038861, + "balance_loss_clip": 1.0414784, + "balance_loss_mlp": 1.02373958, + "epoch": 0.464692619870735, + "flos": 18296020385280.0, + "grad_norm": 2.0549050897499135, + "language_loss": 0.75892472, + "learning_rate": 2.324345882723155e-06, + "loss": 0.78024244, + "num_input_tokens_seen": 165885915, + "step": 7729, + "time_per_iteration": 2.7145724296569824 + }, + { + "auxiliary_loss_clip": 0.01108829, + "auxiliary_loss_mlp": 0.01047351, + "balance_loss_clip": 1.0485568, + "balance_loss_mlp": 1.03153229, + "epoch": 0.464752743123403, + "flos": 22638051216000.0, + "grad_norm": 1.8824527818993837, + "language_loss": 0.79760742, + "learning_rate": 2.323961570451588e-06, + "loss": 0.81916922, + "num_input_tokens_seen": 165905465, + "step": 7730, + "time_per_iteration": 2.7782390117645264 + }, + { + "auxiliary_loss_clip": 0.01130146, + "auxiliary_loss_mlp": 0.01037223, + "balance_loss_clip": 1.04756629, + "balance_loss_mlp": 1.02265573, + "epoch": 0.46481286637607094, + "flos": 20412128373120.0, + "grad_norm": 1.6262082138117517, + "language_loss": 0.77182668, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.79350036, + "num_input_tokens_seen": 165924640, + "step": 7731, + "time_per_iteration": 2.617314577102661 + }, + { + "auxiliary_loss_clip": 0.01090917, + "auxiliary_loss_mlp": 0.01035098, + "balance_loss_clip": 1.04506755, + "balance_loss_mlp": 1.02119207, + "epoch": 0.4648729896287389, + "flos": 34275406129920.0, + "grad_norm": 1.6446435516271722, + "language_loss": 0.65999961, + "learning_rate": 2.323192909069061e-06, + "loss": 0.68125969, + "num_input_tokens_seen": 165945765, + "step": 7732, + "time_per_iteration": 2.806825876235962 + }, + { + "auxiliary_loss_clip": 0.01109545, + "auxiliary_loss_mlp": 0.0104247, + "balance_loss_clip": 1.04427695, + "balance_loss_mlp": 1.02551866, + "epoch": 0.4649331128814069, + "flos": 21321781326720.0, + "grad_norm": 2.341941786180864, + "language_loss": 0.72770941, + "learning_rate": 2.32280855998725e-06, + "loss": 0.74922955, + "num_input_tokens_seen": 165964025, + "step": 7733, + "time_per_iteration": 2.6884191036224365 + }, + { + "auxiliary_loss_clip": 0.01046209, + "auxiliary_loss_mlp": 0.01002418, + "balance_loss_clip": 1.01885557, + "balance_loss_mlp": 1.00089204, + "epoch": 0.46499323613407484, + "flos": 58308515717760.0, + "grad_norm": 1.2786299900123337, + "language_loss": 0.51944834, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.53993464, + "num_input_tokens_seen": 166021950, + "step": 7734, + "time_per_iteration": 3.0932440757751465 + }, + { + "auxiliary_loss_clip": 0.01111419, + "auxiliary_loss_mlp": 0.01034362, + "balance_loss_clip": 1.05044913, + "balance_loss_mlp": 1.01990235, + "epoch": 0.4650533593867428, + "flos": 10889660384640.0, + "grad_norm": 2.1631100357564788, + "language_loss": 0.75439203, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.77584982, + "num_input_tokens_seen": 166039675, + "step": 7735, + "time_per_iteration": 4.546087265014648 + }, + { + "auxiliary_loss_clip": 0.01087553, + "auxiliary_loss_mlp": 0.01045865, + "balance_loss_clip": 1.04543328, + "balance_loss_mlp": 1.0305233, + "epoch": 0.46511348263941077, + "flos": 19974592805760.0, + "grad_norm": 2.3653554564968435, + "language_loss": 0.69901764, + "learning_rate": 2.321655439354519e-06, + "loss": 0.72035182, + "num_input_tokens_seen": 166057745, + "step": 7736, + "time_per_iteration": 4.302860498428345 + }, + { + "auxiliary_loss_clip": 0.01128458, + "auxiliary_loss_mlp": 0.01036991, + "balance_loss_clip": 1.0473057, + "balance_loss_mlp": 1.0228653, + "epoch": 0.46517360589207873, + "flos": 19678401256320.0, + "grad_norm": 1.6411657567334208, + "language_loss": 0.71995008, + "learning_rate": 2.321271041396427e-06, + "loss": 0.74160457, + "num_input_tokens_seen": 166076440, + "step": 7737, + "time_per_iteration": 2.566603183746338 + }, + { + "auxiliary_loss_clip": 0.01111802, + "auxiliary_loss_mlp": 0.01040407, + "balance_loss_clip": 1.05224276, + "balance_loss_mlp": 1.02456391, + "epoch": 0.46523372914474675, + "flos": 16872665074560.0, + "grad_norm": 2.50928704064022, + "language_loss": 0.83606738, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.85758948, + "num_input_tokens_seen": 166092520, + "step": 7738, + "time_per_iteration": 2.602149486541748 + }, + { + "auxiliary_loss_clip": 0.0103645, + "auxiliary_loss_mlp": 0.01000487, + "balance_loss_clip": 1.01920033, + "balance_loss_mlp": 0.99899715, + "epoch": 0.4652938523974147, + "flos": 53439138339840.0, + "grad_norm": 0.7761784242108043, + "language_loss": 0.57855058, + "learning_rate": 2.320502208946932e-06, + "loss": 0.59891999, + "num_input_tokens_seen": 166156285, + "step": 7739, + "time_per_iteration": 4.744653940200806 + }, + { + "auxiliary_loss_clip": 0.01111735, + "auxiliary_loss_mlp": 0.0104196, + "balance_loss_clip": 1.04867125, + "balance_loss_mlp": 1.02728581, + "epoch": 0.4653539756500827, + "flos": 15231296165760.0, + "grad_norm": 1.7825482177936647, + "language_loss": 0.85391408, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.87545103, + "num_input_tokens_seen": 166173455, + "step": 7740, + "time_per_iteration": 4.26358962059021 + }, + { + "auxiliary_loss_clip": 0.01103788, + "auxiliary_loss_mlp": 0.01043392, + "balance_loss_clip": 1.04354095, + "balance_loss_mlp": 1.02769184, + "epoch": 0.46541409890275065, + "flos": 23732249270400.0, + "grad_norm": 1.728452967927443, + "language_loss": 0.75540549, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.77687728, + "num_input_tokens_seen": 166194370, + "step": 7741, + "time_per_iteration": 2.7189860343933105 + }, + { + "auxiliary_loss_clip": 0.01102378, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.04642224, + "balance_loss_mlp": 1.02583992, + "epoch": 0.4654742221554186, + "flos": 20847329556480.0, + "grad_norm": 1.6912495786690362, + "language_loss": 0.80807334, + "learning_rate": 2.319348869158064e-06, + "loss": 0.82949644, + "num_input_tokens_seen": 166213195, + "step": 7742, + "time_per_iteration": 2.7285542488098145 + }, + { + "auxiliary_loss_clip": 0.01109172, + "auxiliary_loss_mlp": 0.01044204, + "balance_loss_clip": 1.04378545, + "balance_loss_mlp": 1.02846837, + "epoch": 0.4655343454080866, + "flos": 20704836303360.0, + "grad_norm": 2.554211916953899, + "language_loss": 0.7287879, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.75032163, + "num_input_tokens_seen": 166231350, + "step": 7743, + "time_per_iteration": 2.8064794540405273 + }, + { + "auxiliary_loss_clip": 0.01097309, + "auxiliary_loss_mlp": 0.01035628, + "balance_loss_clip": 1.044186, + "balance_loss_mlp": 1.01989281, + "epoch": 0.46559446866075455, + "flos": 18989850470400.0, + "grad_norm": 1.9272268848768948, + "language_loss": 0.71113133, + "learning_rate": 2.318579915392483e-06, + "loss": 0.73246074, + "num_input_tokens_seen": 166250530, + "step": 7744, + "time_per_iteration": 2.7021846771240234 + }, + { + "auxiliary_loss_clip": 0.01081647, + "auxiliary_loss_mlp": 0.01033676, + "balance_loss_clip": 1.04821372, + "balance_loss_mlp": 1.01952028, + "epoch": 0.4656545919134225, + "flos": 34496364643200.0, + "grad_norm": 1.5788774332625253, + "language_loss": 0.84865856, + "learning_rate": 2.31819542038153e-06, + "loss": 0.86981177, + "num_input_tokens_seen": 166272545, + "step": 7745, + "time_per_iteration": 2.8962950706481934 + }, + { + "auxiliary_loss_clip": 0.01118243, + "auxiliary_loss_mlp": 0.01044667, + "balance_loss_clip": 1.04609525, + "balance_loss_mlp": 1.02958083, + "epoch": 0.4657147151660905, + "flos": 24310554238080.0, + "grad_norm": 1.3325532903447972, + "language_loss": 0.72868127, + "learning_rate": 2.317810913304574e-06, + "loss": 0.75031042, + "num_input_tokens_seen": 166292135, + "step": 7746, + "time_per_iteration": 2.654744863510132 + }, + { + "auxiliary_loss_clip": 0.01115957, + "auxiliary_loss_mlp": 0.01039896, + "balance_loss_clip": 1.04620576, + "balance_loss_mlp": 1.02557254, + "epoch": 0.46577483841875844, + "flos": 58795139220480.0, + "grad_norm": 2.5149225133479667, + "language_loss": 0.69942105, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.72097951, + "num_input_tokens_seen": 166316710, + "step": 7747, + "time_per_iteration": 2.946551561355591 + }, + { + "auxiliary_loss_clip": 0.01087715, + "auxiliary_loss_mlp": 0.01043482, + "balance_loss_clip": 1.04082656, + "balance_loss_mlp": 1.0269475, + "epoch": 0.4658349616714264, + "flos": 31321969223040.0, + "grad_norm": 1.543824419854341, + "language_loss": 0.67369974, + "learning_rate": 2.317041863010978e-06, + "loss": 0.69501168, + "num_input_tokens_seen": 166338535, + "step": 7748, + "time_per_iteration": 2.7577450275421143 + }, + { + "auxiliary_loss_clip": 0.01095867, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.04655099, + "balance_loss_mlp": 1.0242455, + "epoch": 0.46589508492409437, + "flos": 14860338456960.0, + "grad_norm": 2.2493825617355805, + "language_loss": 0.6400212, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.66138601, + "num_input_tokens_seen": 166355540, + "step": 7749, + "time_per_iteration": 2.6768271923065186 + }, + { + "auxiliary_loss_clip": 0.01124878, + "auxiliary_loss_mlp": 0.01035356, + "balance_loss_clip": 1.04833543, + "balance_loss_mlp": 1.01912558, + "epoch": 0.46595520817676234, + "flos": 12895989431040.0, + "grad_norm": 2.0851109379556414, + "language_loss": 0.74756414, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.76916647, + "num_input_tokens_seen": 166372635, + "step": 7750, + "time_per_iteration": 2.6180553436279297 + }, + { + "auxiliary_loss_clip": 0.01112353, + "auxiliary_loss_mlp": 0.01032354, + "balance_loss_clip": 1.04888475, + "balance_loss_mlp": 1.01699984, + "epoch": 0.46601533142943036, + "flos": 32854169721600.0, + "grad_norm": 2.1197385056246, + "language_loss": 0.74433059, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.76577765, + "num_input_tokens_seen": 166393175, + "step": 7751, + "time_per_iteration": 2.7448816299438477 + }, + { + "auxiliary_loss_clip": 0.01105983, + "auxiliary_loss_mlp": 0.01039216, + "balance_loss_clip": 1.049245, + "balance_loss_mlp": 1.02301598, + "epoch": 0.4660754546820983, + "flos": 19967517826560.0, + "grad_norm": 2.5234072122891176, + "language_loss": 0.73595881, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.75741076, + "num_input_tokens_seen": 166408630, + "step": 7752, + "time_per_iteration": 2.6944475173950195 + }, + { + "auxiliary_loss_clip": 0.01108633, + "auxiliary_loss_mlp": 0.01040109, + "balance_loss_clip": 1.04941273, + "balance_loss_mlp": 1.02493417, + "epoch": 0.4661355779347663, + "flos": 26688164215680.0, + "grad_norm": 2.044776600528041, + "language_loss": 0.69086194, + "learning_rate": 2.315119027142644e-06, + "loss": 0.7123493, + "num_input_tokens_seen": 166428170, + "step": 7753, + "time_per_iteration": 2.736854076385498 + }, + { + "auxiliary_loss_clip": 0.01099142, + "auxiliary_loss_mlp": 0.01040064, + "balance_loss_clip": 1.04148221, + "balance_loss_mlp": 1.02494824, + "epoch": 0.46619570118743425, + "flos": 20959442881920.0, + "grad_norm": 2.155464287948458, + "language_loss": 0.72724748, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.74863952, + "num_input_tokens_seen": 166446705, + "step": 7754, + "time_per_iteration": 2.6782143115997314 + }, + { + "auxiliary_loss_clip": 0.01113403, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.04633951, + "balance_loss_mlp": 1.01993394, + "epoch": 0.4662558244401022, + "flos": 24426079355520.0, + "grad_norm": 1.424199388432646, + "language_loss": 0.78797996, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.80947065, + "num_input_tokens_seen": 166466750, + "step": 7755, + "time_per_iteration": 2.8091399669647217 + }, + { + "auxiliary_loss_clip": 0.01115387, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.04450297, + "balance_loss_mlp": 1.01545656, + "epoch": 0.4663159476927702, + "flos": 20595452411520.0, + "grad_norm": 1.631642654170447, + "language_loss": 0.72453964, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.74599707, + "num_input_tokens_seen": 166485400, + "step": 7756, + "time_per_iteration": 2.7136480808258057 + }, + { + "auxiliary_loss_clip": 0.01117973, + "auxiliary_loss_mlp": 0.01036177, + "balance_loss_clip": 1.04585207, + "balance_loss_mlp": 1.02137136, + "epoch": 0.46637607094543815, + "flos": 25661872823040.0, + "grad_norm": 2.024488409117557, + "language_loss": 0.78578007, + "learning_rate": 2.313580543272274e-06, + "loss": 0.80732161, + "num_input_tokens_seen": 166505730, + "step": 7757, + "time_per_iteration": 2.6828832626342773 + }, + { + "auxiliary_loss_clip": 0.01090573, + "auxiliary_loss_mlp": 0.01031697, + "balance_loss_clip": 1.04173446, + "balance_loss_mlp": 1.01717782, + "epoch": 0.4664361941981061, + "flos": 24273853516800.0, + "grad_norm": 2.116616009232987, + "language_loss": 0.6656999, + "learning_rate": 2.313195892540705e-06, + "loss": 0.68692255, + "num_input_tokens_seen": 166523770, + "step": 7758, + "time_per_iteration": 2.7238266468048096 + }, + { + "auxiliary_loss_clip": 0.01098442, + "auxiliary_loss_mlp": 0.01044236, + "balance_loss_clip": 1.04272914, + "balance_loss_mlp": 1.02916837, + "epoch": 0.4664963174507741, + "flos": 18405871153920.0, + "grad_norm": 1.6471741103867168, + "language_loss": 0.74542332, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.76685011, + "num_input_tokens_seen": 166542935, + "step": 7759, + "time_per_iteration": 2.648406744003296 + }, + { + "auxiliary_loss_clip": 0.01110559, + "auxiliary_loss_mlp": 0.01047546, + "balance_loss_clip": 1.04692769, + "balance_loss_mlp": 1.0325253, + "epoch": 0.46655644070344204, + "flos": 22455122227200.0, + "grad_norm": 1.575011375316493, + "language_loss": 0.77734709, + "learning_rate": 2.312426555462893e-06, + "loss": 0.79892808, + "num_input_tokens_seen": 166563935, + "step": 7760, + "time_per_iteration": 2.715393543243408 + }, + { + "auxiliary_loss_clip": 0.01104604, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.04476929, + "balance_loss_mlp": 1.01968408, + "epoch": 0.46661656395611, + "flos": 13808407731840.0, + "grad_norm": 1.8509707336449404, + "language_loss": 0.74408627, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.76547837, + "num_input_tokens_seen": 166582175, + "step": 7761, + "time_per_iteration": 2.679760217666626 + }, + { + "auxiliary_loss_clip": 0.01118037, + "auxiliary_loss_mlp": 0.01038779, + "balance_loss_clip": 1.04605913, + "balance_loss_mlp": 1.02199411, + "epoch": 0.466676687208778, + "flos": 21652159645440.0, + "grad_norm": 1.9428650174374826, + "language_loss": 0.78880894, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.81037712, + "num_input_tokens_seen": 166601870, + "step": 7762, + "time_per_iteration": 2.6236844062805176 + }, + { + "auxiliary_loss_clip": 0.01032755, + "auxiliary_loss_mlp": 0.01004567, + "balance_loss_clip": 1.01497078, + "balance_loss_mlp": 1.00300527, + "epoch": 0.46673681046144594, + "flos": 68534259068160.0, + "grad_norm": 0.7915263755311791, + "language_loss": 0.59707403, + "learning_rate": 2.311272461028297e-06, + "loss": 0.61744726, + "num_input_tokens_seen": 166668960, + "step": 7763, + "time_per_iteration": 3.2309603691101074 + }, + { + "auxiliary_loss_clip": 0.01092007, + "auxiliary_loss_mlp": 0.01038011, + "balance_loss_clip": 1.04239237, + "balance_loss_mlp": 1.02181077, + "epoch": 0.46679693371411396, + "flos": 15814449469440.0, + "grad_norm": 2.1149132662524766, + "language_loss": 0.78707278, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.80837297, + "num_input_tokens_seen": 166686110, + "step": 7764, + "time_per_iteration": 2.667523145675659 + }, + { + "auxiliary_loss_clip": 0.01102497, + "auxiliary_loss_mlp": 0.01038126, + "balance_loss_clip": 1.05066562, + "balance_loss_mlp": 1.02470863, + "epoch": 0.4668570569667819, + "flos": 18514572687360.0, + "grad_norm": 1.9076684434806583, + "language_loss": 0.72103167, + "learning_rate": 2.310503005696839e-06, + "loss": 0.74243796, + "num_input_tokens_seen": 166703930, + "step": 7765, + "time_per_iteration": 2.695037364959717 + }, + { + "auxiliary_loss_clip": 0.0108654, + "auxiliary_loss_mlp": 0.01041419, + "balance_loss_clip": 1.04354358, + "balance_loss_mlp": 1.02578509, + "epoch": 0.4669171802194499, + "flos": 19206643006080.0, + "grad_norm": 3.5524770939500763, + "language_loss": 0.77958077, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.80086035, + "num_input_tokens_seen": 166719940, + "step": 7766, + "time_per_iteration": 2.7083003520965576 + }, + { + "auxiliary_loss_clip": 0.01111478, + "auxiliary_loss_mlp": 0.01041119, + "balance_loss_clip": 1.0413723, + "balance_loss_mlp": 1.02596176, + "epoch": 0.46697730347211786, + "flos": 12276135406080.0, + "grad_norm": 2.008926604773062, + "language_loss": 0.64852947, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.67005551, + "num_input_tokens_seen": 166738285, + "step": 7767, + "time_per_iteration": 2.6344571113586426 + }, + { + "auxiliary_loss_clip": 0.01120029, + "auxiliary_loss_mlp": 0.0104422, + "balance_loss_clip": 1.04623926, + "balance_loss_mlp": 1.02955675, + "epoch": 0.4670374267247858, + "flos": 23586739274880.0, + "grad_norm": 1.9514245068590486, + "language_loss": 0.74225283, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.76389533, + "num_input_tokens_seen": 166758170, + "step": 7768, + "time_per_iteration": 2.7037155628204346 + }, + { + "auxiliary_loss_clip": 0.01101883, + "auxiliary_loss_mlp": 0.01035933, + "balance_loss_clip": 1.04606605, + "balance_loss_mlp": 1.02081776, + "epoch": 0.4670975499774538, + "flos": 15991093578240.0, + "grad_norm": 1.8795722363955685, + "language_loss": 0.70699239, + "learning_rate": 2.308963953858982e-06, + "loss": 0.72837055, + "num_input_tokens_seen": 166775750, + "step": 7769, + "time_per_iteration": 2.6716794967651367 + }, + { + "auxiliary_loss_clip": 0.0112823, + "auxiliary_loss_mlp": 0.01035755, + "balance_loss_clip": 1.04401624, + "balance_loss_mlp": 1.02156949, + "epoch": 0.46715767323012175, + "flos": 15377596260480.0, + "grad_norm": 2.0624542877059158, + "language_loss": 0.81268704, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.83432686, + "num_input_tokens_seen": 166791720, + "step": 7770, + "time_per_iteration": 2.5958662033081055 + }, + { + "auxiliary_loss_clip": 0.01043437, + "auxiliary_loss_mlp": 0.01001838, + "balance_loss_clip": 1.01635242, + "balance_loss_mlp": 1.00027645, + "epoch": 0.4672177964827897, + "flos": 60252217401600.0, + "grad_norm": 0.7961749107066677, + "language_loss": 0.5562135, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.57666636, + "num_input_tokens_seen": 166856360, + "step": 7771, + "time_per_iteration": 3.1569736003875732 + }, + { + "auxiliary_loss_clip": 0.01114939, + "auxiliary_loss_mlp": 0.00771824, + "balance_loss_clip": 1.04351723, + "balance_loss_mlp": 1.00060511, + "epoch": 0.4672779197354577, + "flos": 27636134002560.0, + "grad_norm": 1.896331384644372, + "language_loss": 0.65528286, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.67415047, + "num_input_tokens_seen": 166875925, + "step": 7772, + "time_per_iteration": 2.7263035774230957 + }, + { + "auxiliary_loss_clip": 0.01113556, + "auxiliary_loss_mlp": 0.01034989, + "balance_loss_clip": 1.04692101, + "balance_loss_mlp": 1.02061212, + "epoch": 0.46733804298812565, + "flos": 31394257344000.0, + "grad_norm": 2.0574903106475513, + "language_loss": 0.63557553, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.65706098, + "num_input_tokens_seen": 166896520, + "step": 7773, + "time_per_iteration": 2.691378593444824 + }, + { + "auxiliary_loss_clip": 0.01112174, + "auxiliary_loss_mlp": 0.01040289, + "balance_loss_clip": 1.04673469, + "balance_loss_mlp": 1.02454185, + "epoch": 0.4673981662407936, + "flos": 19500607912320.0, + "grad_norm": 1.9630472969764714, + "language_loss": 0.80073929, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.8222639, + "num_input_tokens_seen": 166915370, + "step": 7774, + "time_per_iteration": 2.661416530609131 + }, + { + "auxiliary_loss_clip": 0.01096265, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.04382384, + "balance_loss_mlp": 1.01813269, + "epoch": 0.4674582894934616, + "flos": 20521835487360.0, + "grad_norm": 1.5987951306887498, + "language_loss": 0.77369159, + "learning_rate": 2.306655024915726e-06, + "loss": 0.79499024, + "num_input_tokens_seen": 166934875, + "step": 7775, + "time_per_iteration": 4.281586647033691 + }, + { + "auxiliary_loss_clip": 0.01096609, + "auxiliary_loss_mlp": 0.01036176, + "balance_loss_clip": 1.04498506, + "balance_loss_mlp": 1.02137041, + "epoch": 0.46751841274612954, + "flos": 22090952188800.0, + "grad_norm": 1.8524613051021832, + "language_loss": 0.69526893, + "learning_rate": 2.306270162640694e-06, + "loss": 0.71659672, + "num_input_tokens_seen": 166954285, + "step": 7776, + "time_per_iteration": 4.289973497390747 + }, + { + "auxiliary_loss_clip": 0.0112105, + "auxiliary_loss_mlp": 0.0103614, + "balance_loss_clip": 1.04810274, + "balance_loss_mlp": 1.02246058, + "epoch": 0.46757853599879756, + "flos": 26980082046720.0, + "grad_norm": 1.5322212077638444, + "language_loss": 0.73980904, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.76138097, + "num_input_tokens_seen": 166975975, + "step": 7777, + "time_per_iteration": 2.7370285987854004 + }, + { + "auxiliary_loss_clip": 0.01118243, + "auxiliary_loss_mlp": 0.01036883, + "balance_loss_clip": 1.045416, + "balance_loss_mlp": 1.02208281, + "epoch": 0.4676386592514655, + "flos": 24134053783680.0, + "grad_norm": 2.891298768731385, + "language_loss": 0.69314432, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.71469557, + "num_input_tokens_seen": 166996140, + "step": 7778, + "time_per_iteration": 4.159350633621216 + }, + { + "auxiliary_loss_clip": 0.01119786, + "auxiliary_loss_mlp": 0.01041292, + "balance_loss_clip": 1.04801941, + "balance_loss_mlp": 1.02624786, + "epoch": 0.4676987825041335, + "flos": 25483720343040.0, + "grad_norm": 2.158752703527913, + "language_loss": 0.73216277, + "learning_rate": 2.305115506191206e-06, + "loss": 0.75377357, + "num_input_tokens_seen": 167016105, + "step": 7779, + "time_per_iteration": 2.6880576610565186 + }, + { + "auxiliary_loss_clip": 0.0108513, + "auxiliary_loss_mlp": 0.01043402, + "balance_loss_clip": 1.04270327, + "balance_loss_mlp": 1.02963924, + "epoch": 0.46775890575680146, + "flos": 21945298538880.0, + "grad_norm": 1.532169986090066, + "language_loss": 0.72447348, + "learning_rate": 2.304730597548562e-06, + "loss": 0.74575877, + "num_input_tokens_seen": 167036185, + "step": 7780, + "time_per_iteration": 4.378252267837524 + }, + { + "auxiliary_loss_clip": 0.01098995, + "auxiliary_loss_mlp": 0.01052099, + "balance_loss_clip": 1.03960943, + "balance_loss_mlp": 1.03428912, + "epoch": 0.4678190290094694, + "flos": 25228395492480.0, + "grad_norm": 1.8072634784489867, + "language_loss": 0.74489224, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.7664032, + "num_input_tokens_seen": 167054515, + "step": 7781, + "time_per_iteration": 2.684298038482666 + }, + { + "auxiliary_loss_clip": 0.01121556, + "auxiliary_loss_mlp": 0.01040282, + "balance_loss_clip": 1.04655743, + "balance_loss_mlp": 1.02464151, + "epoch": 0.4678791522621374, + "flos": 32268358811520.0, + "grad_norm": 3.3303395339611486, + "language_loss": 0.62934184, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.65096015, + "num_input_tokens_seen": 167077245, + "step": 7782, + "time_per_iteration": 2.801643133163452 + }, + { + "auxiliary_loss_clip": 0.01112208, + "auxiliary_loss_mlp": 0.01044015, + "balance_loss_clip": 1.04610753, + "balance_loss_mlp": 1.02925098, + "epoch": 0.46793927551480535, + "flos": 27046480337280.0, + "grad_norm": 2.527604831052906, + "language_loss": 0.63679516, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.65835738, + "num_input_tokens_seen": 167097235, + "step": 7783, + "time_per_iteration": 2.779493570327759 + }, + { + "auxiliary_loss_clip": 0.01126101, + "auxiliary_loss_mlp": 0.01040434, + "balance_loss_clip": 1.04948771, + "balance_loss_mlp": 1.02393532, + "epoch": 0.4679993987674733, + "flos": 17457398576640.0, + "grad_norm": 2.4796959185267884, + "language_loss": 0.67925286, + "learning_rate": 2.303190847569801e-06, + "loss": 0.70091814, + "num_input_tokens_seen": 167113155, + "step": 7784, + "time_per_iteration": 2.640165090560913 + }, + { + "auxiliary_loss_clip": 0.01100267, + "auxiliary_loss_mlp": 0.01033313, + "balance_loss_clip": 1.04564571, + "balance_loss_mlp": 1.0193001, + "epoch": 0.4680595220201413, + "flos": 17165121609600.0, + "grad_norm": 2.0879148282250304, + "language_loss": 0.84605902, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.8673948, + "num_input_tokens_seen": 167131765, + "step": 7785, + "time_per_iteration": 2.6447336673736572 + }, + { + "auxiliary_loss_clip": 0.01095846, + "auxiliary_loss_mlp": 0.01038359, + "balance_loss_clip": 1.0473485, + "balance_loss_mlp": 1.02278996, + "epoch": 0.46811964527280925, + "flos": 11327591001600.0, + "grad_norm": 1.936392485305852, + "language_loss": 0.77363992, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.79498196, + "num_input_tokens_seen": 167149030, + "step": 7786, + "time_per_iteration": 2.7023332118988037 + }, + { + "auxiliary_loss_clip": 0.01116619, + "auxiliary_loss_mlp": 0.01034917, + "balance_loss_clip": 1.04685593, + "balance_loss_mlp": 1.02089214, + "epoch": 0.4681797685254772, + "flos": 24278809593600.0, + "grad_norm": 2.0886119764466686, + "language_loss": 0.74195051, + "learning_rate": 2.302035914315856e-06, + "loss": 0.76346588, + "num_input_tokens_seen": 167167375, + "step": 7787, + "time_per_iteration": 2.704002618789673 + }, + { + "auxiliary_loss_clip": 0.0110227, + "auxiliary_loss_mlp": 0.01041247, + "balance_loss_clip": 1.04562151, + "balance_loss_mlp": 1.02654815, + "epoch": 0.4682398917781452, + "flos": 31650372293760.0, + "grad_norm": 1.9198703232455803, + "language_loss": 0.65471619, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.67615134, + "num_input_tokens_seen": 167188065, + "step": 7788, + "time_per_iteration": 2.767409324645996 + }, + { + "auxiliary_loss_clip": 0.01117478, + "auxiliary_loss_mlp": 0.01034939, + "balance_loss_clip": 1.0463376, + "balance_loss_mlp": 1.02198708, + "epoch": 0.46830001503081314, + "flos": 28110765340800.0, + "grad_norm": 1.576175997941932, + "language_loss": 0.63680893, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.65833306, + "num_input_tokens_seen": 167209675, + "step": 7789, + "time_per_iteration": 2.686382532119751 + }, + { + "auxiliary_loss_clip": 0.01034678, + "auxiliary_loss_mlp": 0.01000229, + "balance_loss_clip": 1.01769471, + "balance_loss_mlp": 0.99867934, + "epoch": 0.4683601382834811, + "flos": 57881718316800.0, + "grad_norm": 0.6946835696901172, + "language_loss": 0.61856973, + "learning_rate": 2.300880877982825e-06, + "loss": 0.63891876, + "num_input_tokens_seen": 167273940, + "step": 7790, + "time_per_iteration": 3.2082865238189697 + }, + { + "auxiliary_loss_clip": 0.01088531, + "auxiliary_loss_mlp": 0.01040894, + "balance_loss_clip": 1.04553008, + "balance_loss_mlp": 1.02514648, + "epoch": 0.46842026153614913, + "flos": 21871933009920.0, + "grad_norm": 1.7348641955250894, + "language_loss": 0.79120016, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.81249446, + "num_input_tokens_seen": 167292730, + "step": 7791, + "time_per_iteration": 2.7868592739105225 + }, + { + "auxiliary_loss_clip": 0.0112267, + "auxiliary_loss_mlp": 0.01038559, + "balance_loss_clip": 1.05027902, + "balance_loss_mlp": 1.0236336, + "epoch": 0.4684803847888171, + "flos": 24900818434560.0, + "grad_norm": 1.5319083860586857, + "language_loss": 0.7509321, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.77254432, + "num_input_tokens_seen": 167313460, + "step": 7792, + "time_per_iteration": 2.6591553688049316 + }, + { + "auxiliary_loss_clip": 0.01093652, + "auxiliary_loss_mlp": 0.01040808, + "balance_loss_clip": 1.03941143, + "balance_loss_mlp": 1.02582359, + "epoch": 0.46854050804148506, + "flos": 26251670142720.0, + "grad_norm": 1.6679874379457267, + "language_loss": 0.68283308, + "learning_rate": 2.299725738964898e-06, + "loss": 0.70417762, + "num_input_tokens_seen": 167335385, + "step": 7793, + "time_per_iteration": 2.714614152908325 + }, + { + "auxiliary_loss_clip": 0.01120793, + "auxiliary_loss_mlp": 0.00770869, + "balance_loss_clip": 1.05047464, + "balance_loss_mlp": 1.00063658, + "epoch": 0.468600631294153, + "flos": 21579799697280.0, + "grad_norm": 1.5900503410544595, + "language_loss": 0.74045742, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.75937402, + "num_input_tokens_seen": 167353625, + "step": 7794, + "time_per_iteration": 2.631113052368164 + }, + { + "auxiliary_loss_clip": 0.01101487, + "auxiliary_loss_mlp": 0.01040191, + "balance_loss_clip": 1.04786825, + "balance_loss_mlp": 1.02505112, + "epoch": 0.468660754546821, + "flos": 25885632597120.0, + "grad_norm": 1.7758607044197945, + "language_loss": 0.63441491, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.65583163, + "num_input_tokens_seen": 167374565, + "step": 7795, + "time_per_iteration": 2.755208969116211 + }, + { + "auxiliary_loss_clip": 0.01090992, + "auxiliary_loss_mlp": 0.01033775, + "balance_loss_clip": 1.04455793, + "balance_loss_mlp": 1.01939869, + "epoch": 0.46872087779948896, + "flos": 35475001666560.0, + "grad_norm": 1.5780628651808217, + "language_loss": 0.6815629, + "learning_rate": 2.298570497656304e-06, + "loss": 0.70281053, + "num_input_tokens_seen": 167395010, + "step": 7796, + "time_per_iteration": 2.8338258266448975 + }, + { + "auxiliary_loss_clip": 0.01132709, + "auxiliary_loss_mlp": 0.00772271, + "balance_loss_clip": 1.05046582, + "balance_loss_mlp": 1.00074291, + "epoch": 0.4687810010521569, + "flos": 26396425952640.0, + "grad_norm": 3.1208322005509705, + "language_loss": 0.7061345, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.72518432, + "num_input_tokens_seen": 167415285, + "step": 7797, + "time_per_iteration": 2.7184929847717285 + }, + { + "auxiliary_loss_clip": 0.01108205, + "auxiliary_loss_mlp": 0.01035831, + "balance_loss_clip": 1.04716921, + "balance_loss_mlp": 1.01992226, + "epoch": 0.4688411243048249, + "flos": 19972761212160.0, + "grad_norm": 2.050220537358762, + "language_loss": 0.67158788, + "learning_rate": 2.297800280150454e-06, + "loss": 0.69302827, + "num_input_tokens_seen": 167432405, + "step": 7798, + "time_per_iteration": 2.707491159439087 + }, + { + "auxiliary_loss_clip": 0.01033434, + "auxiliary_loss_mlp": 0.00999628, + "balance_loss_clip": 1.01507461, + "balance_loss_mlp": 0.99782771, + "epoch": 0.46890124755749285, + "flos": 63977015900160.0, + "grad_norm": 0.9512995219109956, + "language_loss": 0.64611268, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.66644335, + "num_input_tokens_seen": 167499365, + "step": 7799, + "time_per_iteration": 3.3521087169647217 + }, + { + "auxiliary_loss_clip": 0.01103151, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.0488441, + "balance_loss_mlp": 1.01467967, + "epoch": 0.4689613708101608, + "flos": 23768985905280.0, + "grad_norm": 1.342329921678728, + "language_loss": 0.72313237, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.74445534, + "num_input_tokens_seen": 167520390, + "step": 7800, + "time_per_iteration": 2.7983593940734863 + }, + { + "auxiliary_loss_clip": 0.01128952, + "auxiliary_loss_mlp": 0.01035275, + "balance_loss_clip": 1.04984462, + "balance_loss_mlp": 1.0224781, + "epoch": 0.4690214940628288, + "flos": 24788705109120.0, + "grad_norm": 1.7150056694833848, + "language_loss": 0.7285912, + "learning_rate": 2.296644869233568e-06, + "loss": 0.75023353, + "num_input_tokens_seen": 167539865, + "step": 7801, + "time_per_iteration": 2.635540008544922 + }, + { + "auxiliary_loss_clip": 0.01097741, + "auxiliary_loss_mlp": 0.010419, + "balance_loss_clip": 1.04270506, + "balance_loss_mlp": 1.02579427, + "epoch": 0.46908161731549675, + "flos": 18077324428800.0, + "grad_norm": 1.930712957606368, + "language_loss": 0.62748474, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.64888108, + "num_input_tokens_seen": 167558190, + "step": 7802, + "time_per_iteration": 2.8309857845306396 + }, + { + "auxiliary_loss_clip": 0.01131707, + "auxiliary_loss_mlp": 0.01041126, + "balance_loss_clip": 1.04824543, + "balance_loss_mlp": 1.02705908, + "epoch": 0.4691417405681647, + "flos": 25703350053120.0, + "grad_norm": 2.0983906256852647, + "language_loss": 0.73465741, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.75638568, + "num_input_tokens_seen": 167577685, + "step": 7803, + "time_per_iteration": 2.639453172683716 + }, + { + "auxiliary_loss_clip": 0.01105851, + "auxiliary_loss_mlp": 0.00771349, + "balance_loss_clip": 1.04883635, + "balance_loss_mlp": 1.00065053, + "epoch": 0.46920186382083273, + "flos": 17457039440640.0, + "grad_norm": 2.3177200047102486, + "language_loss": 0.77396876, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.7927407, + "num_input_tokens_seen": 167596390, + "step": 7804, + "time_per_iteration": 2.6661806106567383 + }, + { + "auxiliary_loss_clip": 0.01105528, + "auxiliary_loss_mlp": 0.01031688, + "balance_loss_clip": 1.04877174, + "balance_loss_mlp": 1.01763344, + "epoch": 0.4692619870735007, + "flos": 20339445202560.0, + "grad_norm": 2.089417814933236, + "language_loss": 0.77330643, + "learning_rate": 2.295104163929305e-06, + "loss": 0.79467863, + "num_input_tokens_seen": 167614980, + "step": 7805, + "time_per_iteration": 2.6670541763305664 + }, + { + "auxiliary_loss_clip": 0.01140382, + "auxiliary_loss_mlp": 0.01050591, + "balance_loss_clip": 1.05195141, + "balance_loss_mlp": 1.03487957, + "epoch": 0.46932211032616866, + "flos": 29496558003840.0, + "grad_norm": 1.6834011453476339, + "language_loss": 0.82446682, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.84637654, + "num_input_tokens_seen": 167635895, + "step": 7806, + "time_per_iteration": 2.641126871109009 + }, + { + "auxiliary_loss_clip": 0.01109262, + "auxiliary_loss_mlp": 0.01041295, + "balance_loss_clip": 1.04739761, + "balance_loss_mlp": 1.02634656, + "epoch": 0.4693822335788366, + "flos": 36211242735360.0, + "grad_norm": 1.815437092056069, + "language_loss": 0.77320337, + "learning_rate": 2.294333744076472e-06, + "loss": 0.79470897, + "num_input_tokens_seen": 167657440, + "step": 7807, + "time_per_iteration": 2.768772840499878 + }, + { + "auxiliary_loss_clip": 0.0110914, + "auxiliary_loss_mlp": 0.01038695, + "balance_loss_clip": 1.05083752, + "balance_loss_mlp": 1.02354348, + "epoch": 0.4694423568315046, + "flos": 20338978325760.0, + "grad_norm": 2.201580678066969, + "language_loss": 0.51815701, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.53963536, + "num_input_tokens_seen": 167675025, + "step": 7808, + "time_per_iteration": 2.6565470695495605 + }, + { + "auxiliary_loss_clip": 0.01003405, + "auxiliary_loss_mlp": 0.01005455, + "balance_loss_clip": 1.0168457, + "balance_loss_mlp": 1.00391757, + "epoch": 0.46950248008417256, + "flos": 64326353621760.0, + "grad_norm": 0.78732179125356, + "language_loss": 0.57700193, + "learning_rate": 2.293563279578978e-06, + "loss": 0.59709048, + "num_input_tokens_seen": 167729635, + "step": 7809, + "time_per_iteration": 3.1529645919799805 + }, + { + "auxiliary_loss_clip": 0.01087624, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.04826307, + "balance_loss_mlp": 1.02535129, + "epoch": 0.4695626033368405, + "flos": 19200106730880.0, + "grad_norm": 2.4452536224375403, + "language_loss": 0.7153672, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.73663932, + "num_input_tokens_seen": 167745135, + "step": 7810, + "time_per_iteration": 2.730975389480591 + }, + { + "auxiliary_loss_clip": 0.01122205, + "auxiliary_loss_mlp": 0.01041582, + "balance_loss_clip": 1.04927683, + "balance_loss_mlp": 1.02719331, + "epoch": 0.4696227265895085, + "flos": 23002436736000.0, + "grad_norm": 3.7864250348919284, + "language_loss": 0.81469715, + "learning_rate": 2.29279277055369e-06, + "loss": 0.83633506, + "num_input_tokens_seen": 167763875, + "step": 7811, + "time_per_iteration": 2.689089059829712 + }, + { + "auxiliary_loss_clip": 0.01117579, + "auxiliary_loss_mlp": 0.01038248, + "balance_loss_clip": 1.04989529, + "balance_loss_mlp": 1.02302504, + "epoch": 0.46968284984217645, + "flos": 21870855601920.0, + "grad_norm": 1.6520361935296233, + "language_loss": 0.8041414, + "learning_rate": 2.292407499379644e-06, + "loss": 0.82569969, + "num_input_tokens_seen": 167784895, + "step": 7812, + "time_per_iteration": 2.6615161895751953 + }, + { + "auxiliary_loss_clip": 0.01075193, + "auxiliary_loss_mlp": 0.01036276, + "balance_loss_clip": 1.04313707, + "balance_loss_mlp": 1.02170289, + "epoch": 0.4697429730948444, + "flos": 19974987855360.0, + "grad_norm": 1.6393784799199496, + "language_loss": 0.74155343, + "learning_rate": 2.292022217117477e-06, + "loss": 0.76266813, + "num_input_tokens_seen": 167803185, + "step": 7813, + "time_per_iteration": 2.7426726818084717 + }, + { + "auxiliary_loss_clip": 0.01102658, + "auxiliary_loss_mlp": 0.01036665, + "balance_loss_clip": 1.04594994, + "balance_loss_mlp": 1.02108407, + "epoch": 0.4698030963475124, + "flos": 15156206784000.0, + "grad_norm": 2.3178266219619994, + "language_loss": 0.84324849, + "learning_rate": 2.291636923781798e-06, + "loss": 0.86464167, + "num_input_tokens_seen": 167816550, + "step": 7814, + "time_per_iteration": 2.6519999504089355 + }, + { + "auxiliary_loss_clip": 0.01105673, + "auxiliary_loss_mlp": 0.01036813, + "balance_loss_clip": 1.04427862, + "balance_loss_mlp": 1.02291358, + "epoch": 0.46986321960018035, + "flos": 15151178880000.0, + "grad_norm": 1.8698068393605216, + "language_loss": 0.81723464, + "learning_rate": 2.291251619387217e-06, + "loss": 0.83865952, + "num_input_tokens_seen": 167831845, + "step": 7815, + "time_per_iteration": 5.720506906509399 + }, + { + "auxiliary_loss_clip": 0.01088353, + "auxiliary_loss_mlp": 0.01038971, + "balance_loss_clip": 1.04897821, + "balance_loss_mlp": 1.023808, + "epoch": 0.4699233428528483, + "flos": 23108911626240.0, + "grad_norm": 2.071255754681328, + "language_loss": 0.77463031, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.79590356, + "num_input_tokens_seen": 167850360, + "step": 7816, + "time_per_iteration": 2.738074541091919 + }, + { + "auxiliary_loss_clip": 0.01044982, + "auxiliary_loss_mlp": 0.01001103, + "balance_loss_clip": 1.01830792, + "balance_loss_mlp": 0.99944633, + "epoch": 0.46998346610551633, + "flos": 68105558246400.0, + "grad_norm": 0.838650178196428, + "language_loss": 0.58987319, + "learning_rate": 2.290480977479796e-06, + "loss": 0.6103341, + "num_input_tokens_seen": 167908660, + "step": 7817, + "time_per_iteration": 3.1292662620544434 + }, + { + "auxiliary_loss_clip": 0.01107632, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.04874861, + "balance_loss_mlp": 1.02005172, + "epoch": 0.4700435893581843, + "flos": 24129456842880.0, + "grad_norm": 1.7123630681211415, + "language_loss": 0.79417968, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.81559694, + "num_input_tokens_seen": 167927905, + "step": 7818, + "time_per_iteration": 5.943104028701782 + }, + { + "auxiliary_loss_clip": 0.0113212, + "auxiliary_loss_mlp": 0.01037162, + "balance_loss_clip": 1.04868269, + "balance_loss_mlp": 1.02325034, + "epoch": 0.47010371261085226, + "flos": 20150518642560.0, + "grad_norm": 1.6838154241149696, + "language_loss": 0.83469647, + "learning_rate": 2.289710291512104e-06, + "loss": 0.85638928, + "num_input_tokens_seen": 167945995, + "step": 7819, + "time_per_iteration": 2.6600770950317383 + }, + { + "auxiliary_loss_clip": 0.01101069, + "auxiliary_loss_mlp": 0.0103721, + "balance_loss_clip": 1.04507041, + "balance_loss_mlp": 1.02214193, + "epoch": 0.47016383586352023, + "flos": 15122199582720.0, + "grad_norm": 2.5448578806987974, + "language_loss": 0.7640624, + "learning_rate": 2.289324932042186e-06, + "loss": 0.78544521, + "num_input_tokens_seen": 167963380, + "step": 7820, + "time_per_iteration": 2.720524549484253 + }, + { + "auxiliary_loss_clip": 0.01114996, + "auxiliary_loss_mlp": 0.01040886, + "balance_loss_clip": 1.05066848, + "balance_loss_mlp": 1.02641368, + "epoch": 0.4702239591161882, + "flos": 13552975140480.0, + "grad_norm": 1.835793139157851, + "language_loss": 0.74591041, + "learning_rate": 2.288939561601039e-06, + "loss": 0.76746929, + "num_input_tokens_seen": 167981740, + "step": 7821, + "time_per_iteration": 2.6208953857421875 + }, + { + "auxiliary_loss_clip": 0.0112785, + "auxiliary_loss_mlp": 0.01044502, + "balance_loss_clip": 1.04762793, + "balance_loss_mlp": 1.03104329, + "epoch": 0.47028408236885616, + "flos": 24276511123200.0, + "grad_norm": 1.8086110799443134, + "language_loss": 0.89176404, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.91348755, + "num_input_tokens_seen": 167999380, + "step": 7822, + "time_per_iteration": 2.641425371170044 + }, + { + "auxiliary_loss_clip": 0.01113329, + "auxiliary_loss_mlp": 0.01033656, + "balance_loss_clip": 1.04665482, + "balance_loss_mlp": 1.01981544, + "epoch": 0.4703442056215241, + "flos": 22856926740480.0, + "grad_norm": 1.7930134528553263, + "language_loss": 0.79694283, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.81841266, + "num_input_tokens_seen": 168018395, + "step": 7823, + "time_per_iteration": 2.632756233215332 + }, + { + "auxiliary_loss_clip": 0.01025068, + "auxiliary_loss_mlp": 0.01003424, + "balance_loss_clip": 1.02190793, + "balance_loss_mlp": 1.00163603, + "epoch": 0.4704043288741921, + "flos": 69240227950080.0, + "grad_norm": 0.8086269167579946, + "language_loss": 0.56642514, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.5867101, + "num_input_tokens_seen": 168084080, + "step": 7824, + "time_per_iteration": 3.3140807151794434 + }, + { + "auxiliary_loss_clip": 0.01104679, + "auxiliary_loss_mlp": 0.01042887, + "balance_loss_clip": 1.04395127, + "balance_loss_mlp": 1.02718711, + "epoch": 0.47046445212686006, + "flos": 18041090584320.0, + "grad_norm": 1.8843796036347318, + "language_loss": 0.81223321, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.83370888, + "num_input_tokens_seen": 168101555, + "step": 7825, + "time_per_iteration": 2.700547695159912 + }, + { + "auxiliary_loss_clip": 0.01111276, + "auxiliary_loss_mlp": 0.01036611, + "balance_loss_clip": 1.0480845, + "balance_loss_mlp": 1.02218676, + "epoch": 0.470524575379528, + "flos": 23951448017280.0, + "grad_norm": 1.7729512383292405, + "language_loss": 0.66719514, + "learning_rate": 2.287012545338324e-06, + "loss": 0.68867397, + "num_input_tokens_seen": 168121530, + "step": 7826, + "time_per_iteration": 2.6998069286346436 + }, + { + "auxiliary_loss_clip": 0.01105784, + "auxiliary_loss_mlp": 0.01039915, + "balance_loss_clip": 1.04433072, + "balance_loss_mlp": 1.02479887, + "epoch": 0.470584698632196, + "flos": 18113558273280.0, + "grad_norm": 1.8432989970829954, + "language_loss": 0.84173524, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.86319232, + "num_input_tokens_seen": 168140335, + "step": 7827, + "time_per_iteration": 2.692657709121704 + }, + { + "auxiliary_loss_clip": 0.01024445, + "auxiliary_loss_mlp": 0.01004787, + "balance_loss_clip": 1.01622581, + "balance_loss_mlp": 1.00303495, + "epoch": 0.47064482188486395, + "flos": 57251916224640.0, + "grad_norm": 0.8086690003326286, + "language_loss": 0.5568617, + "learning_rate": 2.286241662546122e-06, + "loss": 0.57715398, + "num_input_tokens_seen": 168200535, + "step": 7828, + "time_per_iteration": 3.184593439102173 + }, + { + "auxiliary_loss_clip": 0.01128245, + "auxiliary_loss_mlp": 0.01033804, + "balance_loss_clip": 1.04770434, + "balance_loss_mlp": 1.02036309, + "epoch": 0.4707049451375319, + "flos": 17895077798400.0, + "grad_norm": 2.799236307786822, + "language_loss": 0.80882025, + "learning_rate": 2.285856204861245e-06, + "loss": 0.8304407, + "num_input_tokens_seen": 168219610, + "step": 7829, + "time_per_iteration": 2.5789284706115723 + }, + { + "auxiliary_loss_clip": 0.01128236, + "auxiliary_loss_mlp": 0.01036042, + "balance_loss_clip": 1.04866183, + "balance_loss_mlp": 1.02311337, + "epoch": 0.47076506839019994, + "flos": 25232669210880.0, + "grad_norm": 1.589084017915349, + "language_loss": 0.76252091, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.78416359, + "num_input_tokens_seen": 168242505, + "step": 7830, + "time_per_iteration": 2.6604039669036865 + }, + { + "auxiliary_loss_clip": 0.01094201, + "auxiliary_loss_mlp": 0.01033866, + "balance_loss_clip": 1.04519463, + "balance_loss_mlp": 1.01907206, + "epoch": 0.4708251916428679, + "flos": 13479681438720.0, + "grad_norm": 1.9041514810278948, + "language_loss": 0.7839942, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.8052749, + "num_input_tokens_seen": 168260220, + "step": 7831, + "time_per_iteration": 2.7709531784057617 + }, + { + "auxiliary_loss_clip": 0.01084793, + "auxiliary_loss_mlp": 0.01045555, + "balance_loss_clip": 1.03967106, + "balance_loss_mlp": 1.0289377, + "epoch": 0.47088531489553587, + "flos": 30147833450880.0, + "grad_norm": 3.4524245779244045, + "language_loss": 0.75518548, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.7764889, + "num_input_tokens_seen": 168277360, + "step": 7832, + "time_per_iteration": 2.9078352451324463 + }, + { + "auxiliary_loss_clip": 0.01100887, + "auxiliary_loss_mlp": 0.01027155, + "balance_loss_clip": 1.04597783, + "balance_loss_mlp": 1.01476312, + "epoch": 0.47094543814820383, + "flos": 21798280172160.0, + "grad_norm": 1.3033633023675582, + "language_loss": 0.74446917, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.76574957, + "num_input_tokens_seen": 168296605, + "step": 7833, + "time_per_iteration": 2.7040505409240723 + }, + { + "auxiliary_loss_clip": 0.01115931, + "auxiliary_loss_mlp": 0.01039232, + "balance_loss_clip": 1.04605532, + "balance_loss_mlp": 1.02489686, + "epoch": 0.4710055614008718, + "flos": 23003011353600.0, + "grad_norm": 1.6784231271486025, + "language_loss": 0.75652939, + "learning_rate": 2.283928754133762e-06, + "loss": 0.778081, + "num_input_tokens_seen": 168316205, + "step": 7834, + "time_per_iteration": 2.651439666748047 + }, + { + "auxiliary_loss_clip": 0.01080958, + "auxiliary_loss_mlp": 0.0104352, + "balance_loss_clip": 1.04571462, + "balance_loss_mlp": 1.02942359, + "epoch": 0.47106568465353976, + "flos": 42741346452480.0, + "grad_norm": 1.5705960877616694, + "language_loss": 0.66198736, + "learning_rate": 2.283543231629972e-06, + "loss": 0.68323219, + "num_input_tokens_seen": 168338935, + "step": 7835, + "time_per_iteration": 2.8833723068237305 + }, + { + "auxiliary_loss_clip": 0.01030822, + "auxiliary_loss_mlp": 0.0075266, + "balance_loss_clip": 1.01354921, + "balance_loss_mlp": 1.00055587, + "epoch": 0.4711258079062077, + "flos": 68554008570240.0, + "grad_norm": 0.8682696962056556, + "language_loss": 0.62114525, + "learning_rate": 2.283157698374194e-06, + "loss": 0.63898003, + "num_input_tokens_seen": 168392800, + "step": 7836, + "time_per_iteration": 3.271106243133545 + }, + { + "auxiliary_loss_clip": 0.01089899, + "auxiliary_loss_mlp": 0.00772396, + "balance_loss_clip": 1.04188919, + "balance_loss_mlp": 1.00066912, + "epoch": 0.4711859311588757, + "flos": 25446588658560.0, + "grad_norm": 2.9726849992756623, + "language_loss": 0.69634271, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.71496564, + "num_input_tokens_seen": 168412940, + "step": 7837, + "time_per_iteration": 2.7227394580841064 + }, + { + "auxiliary_loss_clip": 0.01114908, + "auxiliary_loss_mlp": 0.01040024, + "balance_loss_clip": 1.04658818, + "balance_loss_mlp": 1.02449143, + "epoch": 0.47124605441154366, + "flos": 21981891519360.0, + "grad_norm": 1.834184212780789, + "language_loss": 0.66073495, + "learning_rate": 2.282386599665153e-06, + "loss": 0.68228424, + "num_input_tokens_seen": 168431995, + "step": 7838, + "time_per_iteration": 2.63415265083313 + }, + { + "auxiliary_loss_clip": 0.01101595, + "auxiliary_loss_mlp": 0.01040478, + "balance_loss_clip": 1.04245853, + "balance_loss_mlp": 1.02488542, + "epoch": 0.4713061776642116, + "flos": 25412689198080.0, + "grad_norm": 1.6613879226075605, + "language_loss": 0.77071315, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.79213387, + "num_input_tokens_seen": 168454585, + "step": 7839, + "time_per_iteration": 2.702371835708618 + }, + { + "auxiliary_loss_clip": 0.01089161, + "auxiliary_loss_mlp": 0.01035056, + "balance_loss_clip": 1.04446244, + "balance_loss_mlp": 1.0215137, + "epoch": 0.4713663009168796, + "flos": 26542259170560.0, + "grad_norm": 2.064347613929302, + "language_loss": 0.72607076, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.74731302, + "num_input_tokens_seen": 168471265, + "step": 7840, + "time_per_iteration": 2.7578155994415283 + }, + { + "auxiliary_loss_clip": 0.01098285, + "auxiliary_loss_mlp": 0.01033804, + "balance_loss_clip": 1.04248786, + "balance_loss_mlp": 1.01975548, + "epoch": 0.47142642416954755, + "flos": 23623583650560.0, + "grad_norm": 1.634270857219127, + "language_loss": 0.75153434, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.77285522, + "num_input_tokens_seen": 168491360, + "step": 7841, + "time_per_iteration": 2.7571516036987305 + }, + { + "auxiliary_loss_clip": 0.01097356, + "auxiliary_loss_mlp": 0.01036522, + "balance_loss_clip": 1.04522789, + "balance_loss_mlp": 1.02271175, + "epoch": 0.4714865474222155, + "flos": 22310150935680.0, + "grad_norm": 1.514171980299406, + "language_loss": 0.70372689, + "learning_rate": 2.280844273866501e-06, + "loss": 0.72506565, + "num_input_tokens_seen": 168511335, + "step": 7842, + "time_per_iteration": 2.6693220138549805 + }, + { + "auxiliary_loss_clip": 0.01122506, + "auxiliary_loss_mlp": 0.01036861, + "balance_loss_clip": 1.05041289, + "balance_loss_mlp": 1.02272844, + "epoch": 0.4715466706748835, + "flos": 17822430541440.0, + "grad_norm": 2.3877412842319243, + "language_loss": 0.78754079, + "learning_rate": 2.280458665756177e-06, + "loss": 0.80913448, + "num_input_tokens_seen": 168529920, + "step": 7843, + "time_per_iteration": 2.584821939468384 + }, + { + "auxiliary_loss_clip": 0.01112783, + "auxiliary_loss_mlp": 0.01033598, + "balance_loss_clip": 1.04609227, + "balance_loss_mlp": 1.02013922, + "epoch": 0.4716067939275515, + "flos": 23659530186240.0, + "grad_norm": 1.5083750473310347, + "language_loss": 0.73945224, + "learning_rate": 2.280073047010832e-06, + "loss": 0.76091611, + "num_input_tokens_seen": 168550595, + "step": 7844, + "time_per_iteration": 2.6947662830352783 + }, + { + "auxiliary_loss_clip": 0.01103523, + "auxiliary_loss_mlp": 0.01045426, + "balance_loss_clip": 1.04754925, + "balance_loss_mlp": 1.03077483, + "epoch": 0.47166691718021947, + "flos": 17930162407680.0, + "grad_norm": 1.6596812780951513, + "language_loss": 0.7849918, + "learning_rate": 2.279687417645088e-06, + "loss": 0.8064813, + "num_input_tokens_seen": 168569765, + "step": 7845, + "time_per_iteration": 2.64786434173584 + }, + { + "auxiliary_loss_clip": 0.01116093, + "auxiliary_loss_mlp": 0.01035695, + "balance_loss_clip": 1.04657555, + "balance_loss_mlp": 1.02204597, + "epoch": 0.47172704043288743, + "flos": 26614583205120.0, + "grad_norm": 1.4795134607526772, + "language_loss": 0.73325998, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.75477785, + "num_input_tokens_seen": 168591525, + "step": 7846, + "time_per_iteration": 2.6890015602111816 + }, + { + "auxiliary_loss_clip": 0.01112295, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.04567862, + "balance_loss_mlp": 1.02053618, + "epoch": 0.4717871636855554, + "flos": 27922700707200.0, + "grad_norm": 1.365245213481775, + "language_loss": 0.74306214, + "learning_rate": 2.2789161271109e-06, + "loss": 0.76451898, + "num_input_tokens_seen": 168611235, + "step": 7847, + "time_per_iteration": 2.664600133895874 + }, + { + "auxiliary_loss_clip": 0.01076671, + "auxiliary_loss_mlp": 0.01036211, + "balance_loss_clip": 1.04269147, + "balance_loss_mlp": 1.02244806, + "epoch": 0.47184728693822336, + "flos": 14502237816960.0, + "grad_norm": 1.614512390946798, + "language_loss": 0.80744767, + "learning_rate": 2.278530465971703e-06, + "loss": 0.82857651, + "num_input_tokens_seen": 168628710, + "step": 7848, + "time_per_iteration": 2.7662644386291504 + }, + { + "auxiliary_loss_clip": 0.01118674, + "auxiliary_loss_mlp": 0.01035868, + "balance_loss_clip": 1.04767179, + "balance_loss_mlp": 1.02170014, + "epoch": 0.47190741019089133, + "flos": 17856545483520.0, + "grad_norm": 3.381301580597114, + "language_loss": 0.70282733, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.72437274, + "num_input_tokens_seen": 168645645, + "step": 7849, + "time_per_iteration": 2.628324031829834 + }, + { + "auxiliary_loss_clip": 0.01102555, + "auxiliary_loss_mlp": 0.01043039, + "balance_loss_clip": 1.04688513, + "balance_loss_mlp": 1.02679062, + "epoch": 0.4719675334435593, + "flos": 17895472848000.0, + "grad_norm": 2.2108635677358968, + "language_loss": 0.6920523, + "learning_rate": 2.277759112022224e-06, + "loss": 0.71350825, + "num_input_tokens_seen": 168664165, + "step": 7850, + "time_per_iteration": 2.678515672683716 + }, + { + "auxiliary_loss_clip": 0.01071934, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.04294968, + "balance_loss_mlp": 1.0192523, + "epoch": 0.47202765669622726, + "flos": 20704369426560.0, + "grad_norm": 1.8559154127156776, + "language_loss": 0.75022864, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.77128029, + "num_input_tokens_seen": 168681940, + "step": 7851, + "time_per_iteration": 2.7907421588897705 + }, + { + "auxiliary_loss_clip": 0.01058717, + "auxiliary_loss_mlp": 0.0104416, + "balance_loss_clip": 1.03438354, + "balance_loss_mlp": 1.02636182, + "epoch": 0.4720877799488952, + "flos": 16360255607040.0, + "grad_norm": 1.8954666463572496, + "language_loss": 0.76087546, + "learning_rate": 2.276987715942132e-06, + "loss": 0.78190422, + "num_input_tokens_seen": 168698830, + "step": 7852, + "time_per_iteration": 2.751862049102783 + }, + { + "auxiliary_loss_clip": 0.01090696, + "auxiliary_loss_mlp": 0.01031466, + "balance_loss_clip": 1.0440855, + "balance_loss_mlp": 1.01667845, + "epoch": 0.4721479032015632, + "flos": 20668171495680.0, + "grad_norm": 1.6687991208994266, + "language_loss": 0.69092613, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.71214771, + "num_input_tokens_seen": 168718305, + "step": 7853, + "time_per_iteration": 2.8860716819763184 + }, + { + "auxiliary_loss_clip": 0.01023698, + "auxiliary_loss_mlp": 0.01005171, + "balance_loss_clip": 1.03293765, + "balance_loss_mlp": 1.00360918, + "epoch": 0.47220802645423116, + "flos": 67750438435200.0, + "grad_norm": 0.7060966439190681, + "language_loss": 0.50175303, + "learning_rate": 2.276216277848432e-06, + "loss": 0.52204174, + "num_input_tokens_seen": 168782365, + "step": 7854, + "time_per_iteration": 4.915671110153198 + }, + { + "auxiliary_loss_clip": 0.0112187, + "auxiliary_loss_mlp": 0.01035341, + "balance_loss_clip": 1.04927993, + "balance_loss_mlp": 1.02046967, + "epoch": 0.4722681497068991, + "flos": 20921449271040.0, + "grad_norm": 1.8544471627611243, + "language_loss": 0.63919318, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.66076523, + "num_input_tokens_seen": 168800485, + "step": 7855, + "time_per_iteration": 4.303591728210449 + }, + { + "auxiliary_loss_clip": 0.01115964, + "auxiliary_loss_mlp": 0.01039633, + "balance_loss_clip": 1.04526174, + "balance_loss_mlp": 1.02463675, + "epoch": 0.4723282729595671, + "flos": 28293083798400.0, + "grad_norm": 6.403691145457763, + "language_loss": 0.75835574, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.77991176, + "num_input_tokens_seen": 168818965, + "step": 7856, + "time_per_iteration": 2.669156074523926 + }, + { + "auxiliary_loss_clip": 0.01102045, + "auxiliary_loss_mlp": 0.01036544, + "balance_loss_clip": 1.04435217, + "balance_loss_mlp": 1.02334714, + "epoch": 0.4723883962122351, + "flos": 27125053338240.0, + "grad_norm": 1.8316073665627561, + "language_loss": 0.7513321, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.77271795, + "num_input_tokens_seen": 168840355, + "step": 7857, + "time_per_iteration": 5.926163673400879 + }, + { + "auxiliary_loss_clip": 0.0110506, + "auxiliary_loss_mlp": 0.01044055, + "balance_loss_clip": 1.04619288, + "balance_loss_mlp": 1.03164554, + "epoch": 0.47244851946490307, + "flos": 31537253387520.0, + "grad_norm": 1.4352718890089464, + "language_loss": 0.64871937, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.67021048, + "num_input_tokens_seen": 168861765, + "step": 7858, + "time_per_iteration": 2.7516961097717285 + }, + { + "auxiliary_loss_clip": 0.01115653, + "auxiliary_loss_mlp": 0.00772171, + "balance_loss_clip": 1.04487467, + "balance_loss_mlp": 1.00070405, + "epoch": 0.47250864271757104, + "flos": 20886544229760.0, + "grad_norm": 4.333924209566871, + "language_loss": 0.70584702, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.72472525, + "num_input_tokens_seen": 168881310, + "step": 7859, + "time_per_iteration": 2.63272762298584 + }, + { + "auxiliary_loss_clip": 0.01132339, + "auxiliary_loss_mlp": 0.01038437, + "balance_loss_clip": 1.0472064, + "balance_loss_mlp": 1.02420318, + "epoch": 0.472568765970239, + "flos": 20522086882560.0, + "grad_norm": 1.7578939418215658, + "language_loss": 0.62056947, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.64227724, + "num_input_tokens_seen": 168899470, + "step": 7860, + "time_per_iteration": 2.579881429672241 + }, + { + "auxiliary_loss_clip": 0.01104772, + "auxiliary_loss_mlp": 0.01042498, + "balance_loss_clip": 1.04455113, + "balance_loss_mlp": 1.02835417, + "epoch": 0.47262888922290697, + "flos": 35805200417280.0, + "grad_norm": 2.5847882369160584, + "language_loss": 0.71352196, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.73499465, + "num_input_tokens_seen": 168921495, + "step": 7861, + "time_per_iteration": 2.7616021633148193 + }, + { + "auxiliary_loss_clip": 0.01100093, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.04298115, + "balance_loss_mlp": 1.02136993, + "epoch": 0.47268901247557493, + "flos": 20667740532480.0, + "grad_norm": 1.877615917676971, + "language_loss": 0.85056359, + "learning_rate": 2.273130107677896e-06, + "loss": 0.87191874, + "num_input_tokens_seen": 168940515, + "step": 7862, + "time_per_iteration": 2.730851173400879 + }, + { + "auxiliary_loss_clip": 0.01126067, + "auxiliary_loss_mlp": 0.01032341, + "balance_loss_clip": 1.04310465, + "balance_loss_mlp": 1.01836395, + "epoch": 0.4727491357282429, + "flos": 19573291082880.0, + "grad_norm": 1.8403668162610285, + "language_loss": 0.84233111, + "learning_rate": 2.272744289645927e-06, + "loss": 0.86391521, + "num_input_tokens_seen": 168958340, + "step": 7863, + "time_per_iteration": 2.7247161865234375 + }, + { + "auxiliary_loss_clip": 0.01104075, + "auxiliary_loss_mlp": 0.01041818, + "balance_loss_clip": 1.04576826, + "balance_loss_mlp": 1.02810335, + "epoch": 0.47280925898091086, + "flos": 18217231902720.0, + "grad_norm": 2.0137135318025843, + "language_loss": 0.66243893, + "learning_rate": 2.272358461271467e-06, + "loss": 0.68389785, + "num_input_tokens_seen": 168974850, + "step": 7864, + "time_per_iteration": 2.7027535438537598 + }, + { + "auxiliary_loss_clip": 0.01126031, + "auxiliary_loss_mlp": 0.01038902, + "balance_loss_clip": 1.04373837, + "balance_loss_mlp": 1.02402425, + "epoch": 0.4728693822335788, + "flos": 17821820010240.0, + "grad_norm": 1.9458421333469222, + "language_loss": 0.64846861, + "learning_rate": 2.271972622569147e-06, + "loss": 0.67011791, + "num_input_tokens_seen": 168992860, + "step": 7865, + "time_per_iteration": 2.599947214126587 + }, + { + "auxiliary_loss_clip": 0.01095039, + "auxiliary_loss_mlp": 0.00771615, + "balance_loss_clip": 1.04065597, + "balance_loss_mlp": 1.00069022, + "epoch": 0.4729295054862468, + "flos": 20595057361920.0, + "grad_norm": 1.8988594463693396, + "language_loss": 0.73979223, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.75845885, + "num_input_tokens_seen": 169010325, + "step": 7866, + "time_per_iteration": 2.6904079914093018 + }, + { + "auxiliary_loss_clip": 0.01127633, + "auxiliary_loss_mlp": 0.01036812, + "balance_loss_clip": 1.0444746, + "balance_loss_mlp": 1.02215528, + "epoch": 0.47298962873891476, + "flos": 23368079232000.0, + "grad_norm": 1.7138995799513466, + "language_loss": 0.82882631, + "learning_rate": 2.271200914239451e-06, + "loss": 0.85047078, + "num_input_tokens_seen": 169029840, + "step": 7867, + "time_per_iteration": 2.66166353225708 + }, + { + "auxiliary_loss_clip": 0.01113116, + "auxiliary_loss_mlp": 0.01035066, + "balance_loss_clip": 1.04474282, + "balance_loss_mlp": 1.02197099, + "epoch": 0.4730497519915827, + "flos": 22052240305920.0, + "grad_norm": 1.59304374398017, + "language_loss": 0.79711115, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.81859303, + "num_input_tokens_seen": 169049975, + "step": 7868, + "time_per_iteration": 2.639418363571167 + }, + { + "auxiliary_loss_clip": 0.01048577, + "auxiliary_loss_mlp": 0.01036292, + "balance_loss_clip": 1.03682256, + "balance_loss_mlp": 1.02049041, + "epoch": 0.4731098752442507, + "flos": 21069724613760.0, + "grad_norm": 2.2697646545371772, + "language_loss": 0.74715841, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.7680071, + "num_input_tokens_seen": 169069540, + "step": 7869, + "time_per_iteration": 2.822831153869629 + }, + { + "auxiliary_loss_clip": 0.01108509, + "auxiliary_loss_mlp": 0.01048779, + "balance_loss_clip": 1.04608214, + "balance_loss_mlp": 1.03300154, + "epoch": 0.4731699984969187, + "flos": 22528775064960.0, + "grad_norm": 2.141854382789547, + "language_loss": 0.73684996, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.75842285, + "num_input_tokens_seen": 169089940, + "step": 7870, + "time_per_iteration": 2.7175748348236084 + }, + { + "auxiliary_loss_clip": 0.01133545, + "auxiliary_loss_mlp": 0.01041593, + "balance_loss_clip": 1.04755211, + "balance_loss_mlp": 1.02635777, + "epoch": 0.4732301217495867, + "flos": 24898124914560.0, + "grad_norm": 2.253339307670162, + "language_loss": 0.81085944, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.83261085, + "num_input_tokens_seen": 169109650, + "step": 7871, + "time_per_iteration": 2.6193602085113525 + }, + { + "auxiliary_loss_clip": 0.01113818, + "auxiliary_loss_mlp": 0.01036061, + "balance_loss_clip": 1.04329586, + "balance_loss_mlp": 1.02133834, + "epoch": 0.47329024500225464, + "flos": 22784423137920.0, + "grad_norm": 1.5762073479047713, + "language_loss": 0.75922841, + "learning_rate": 2.269271463701879e-06, + "loss": 0.78072715, + "num_input_tokens_seen": 169128990, + "step": 7872, + "time_per_iteration": 2.6391725540161133 + }, + { + "auxiliary_loss_clip": 0.01091788, + "auxiliary_loss_mlp": 0.01038432, + "balance_loss_clip": 1.04121172, + "balance_loss_mlp": 1.02376986, + "epoch": 0.4733503682549226, + "flos": 38695902220800.0, + "grad_norm": 3.094756801604535, + "language_loss": 0.67562377, + "learning_rate": 2.268885542903428e-06, + "loss": 0.696926, + "num_input_tokens_seen": 169154645, + "step": 7873, + "time_per_iteration": 2.8466758728027344 + }, + { + "auxiliary_loss_clip": 0.01117181, + "auxiliary_loss_mlp": 0.01036678, + "balance_loss_clip": 1.04567063, + "balance_loss_mlp": 1.02267087, + "epoch": 0.47341049150759057, + "flos": 22966849336320.0, + "grad_norm": 1.6392218744116203, + "language_loss": 0.72839928, + "learning_rate": 2.26849961190881e-06, + "loss": 0.74993783, + "num_input_tokens_seen": 169174995, + "step": 7874, + "time_per_iteration": 2.721020221710205 + }, + { + "auxiliary_loss_clip": 0.01113028, + "auxiliary_loss_mlp": 0.01038664, + "balance_loss_clip": 1.04846478, + "balance_loss_mlp": 1.02471697, + "epoch": 0.47347061476025853, + "flos": 14538471661440.0, + "grad_norm": 3.032092549096925, + "language_loss": 0.65002596, + "learning_rate": 2.26811367073266e-06, + "loss": 0.67154288, + "num_input_tokens_seen": 169191815, + "step": 7875, + "time_per_iteration": 2.6652960777282715 + }, + { + "auxiliary_loss_clip": 0.01083743, + "auxiliary_loss_mlp": 0.01035273, + "balance_loss_clip": 1.04805076, + "balance_loss_mlp": 1.02059197, + "epoch": 0.4735307380129265, + "flos": 30263250827520.0, + "grad_norm": 2.768907187204124, + "language_loss": 0.8101728, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.83136296, + "num_input_tokens_seen": 169210430, + "step": 7876, + "time_per_iteration": 2.7860774993896484 + }, + { + "auxiliary_loss_clip": 0.01096604, + "auxiliary_loss_mlp": 0.01049403, + "balance_loss_clip": 1.04034781, + "balance_loss_mlp": 1.03362572, + "epoch": 0.47359086126559446, + "flos": 19391044452480.0, + "grad_norm": 1.718915834241656, + "language_loss": 0.79123086, + "learning_rate": 2.267341757894304e-06, + "loss": 0.81269091, + "num_input_tokens_seen": 169229295, + "step": 7877, + "time_per_iteration": 2.6741349697113037 + }, + { + "auxiliary_loss_clip": 0.01119367, + "auxiliary_loss_mlp": 0.00771148, + "balance_loss_clip": 1.04634619, + "balance_loss_mlp": 1.00065994, + "epoch": 0.47365098451826243, + "flos": 21939408708480.0, + "grad_norm": 1.9321122257733154, + "language_loss": 0.7070595, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.72596461, + "num_input_tokens_seen": 169247855, + "step": 7878, + "time_per_iteration": 2.65336012840271 + }, + { + "auxiliary_loss_clip": 0.01091201, + "auxiliary_loss_mlp": 0.01041141, + "balance_loss_clip": 1.04987168, + "balance_loss_mlp": 1.02767622, + "epoch": 0.4737111077709304, + "flos": 25845053207040.0, + "grad_norm": 1.650502341043129, + "language_loss": 0.75037253, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.77169597, + "num_input_tokens_seen": 169268860, + "step": 7879, + "time_per_iteration": 2.731395721435547 + }, + { + "auxiliary_loss_clip": 0.01030587, + "auxiliary_loss_mlp": 0.01009103, + "balance_loss_clip": 1.02360272, + "balance_loss_mlp": 1.00741053, + "epoch": 0.47377123102359836, + "flos": 67760886314880.0, + "grad_norm": 0.7327852929375173, + "language_loss": 0.61306548, + "learning_rate": 2.266183812641164e-06, + "loss": 0.63346243, + "num_input_tokens_seen": 169331855, + "step": 7880, + "time_per_iteration": 3.224714756011963 + }, + { + "auxiliary_loss_clip": 0.0110857, + "auxiliary_loss_mlp": 0.01041962, + "balance_loss_clip": 1.04677773, + "balance_loss_mlp": 1.02690625, + "epoch": 0.4738313542762663, + "flos": 24315977191680.0, + "grad_norm": 1.5081125335533625, + "language_loss": 0.68397921, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.70548451, + "num_input_tokens_seen": 169352175, + "step": 7881, + "time_per_iteration": 2.7536203861236572 + }, + { + "auxiliary_loss_clip": 0.01068036, + "auxiliary_loss_mlp": 0.01031577, + "balance_loss_clip": 1.04936802, + "balance_loss_mlp": 1.01798737, + "epoch": 0.4738914775289343, + "flos": 20705339093760.0, + "grad_norm": 1.7877053000392102, + "language_loss": 0.77066004, + "learning_rate": 2.265411798646092e-06, + "loss": 0.7916562, + "num_input_tokens_seen": 169371215, + "step": 7882, + "time_per_iteration": 2.873434543609619 + }, + { + "auxiliary_loss_clip": 0.01116489, + "auxiliary_loss_mlp": 0.01035892, + "balance_loss_clip": 1.04511285, + "balance_loss_mlp": 1.02132463, + "epoch": 0.4739516007816023, + "flos": 25446337263360.0, + "grad_norm": 2.3087904075212204, + "language_loss": 0.76111883, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.78264266, + "num_input_tokens_seen": 169391745, + "step": 7883, + "time_per_iteration": 2.7326574325561523 + }, + { + "auxiliary_loss_clip": 0.01107432, + "auxiliary_loss_mlp": 0.01031652, + "balance_loss_clip": 1.04656231, + "balance_loss_mlp": 1.01863456, + "epoch": 0.4740117240342703, + "flos": 19974341410560.0, + "grad_norm": 1.7217647008431887, + "language_loss": 0.72281808, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.74420893, + "num_input_tokens_seen": 169409845, + "step": 7884, + "time_per_iteration": 2.660172462463379 + }, + { + "auxiliary_loss_clip": 0.01123059, + "auxiliary_loss_mlp": 0.01037646, + "balance_loss_clip": 1.04745269, + "balance_loss_mlp": 1.02225614, + "epoch": 0.47407184728693824, + "flos": 15661146222720.0, + "grad_norm": 2.1356892731193557, + "language_loss": 0.82255256, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.8441596, + "num_input_tokens_seen": 169426085, + "step": 7885, + "time_per_iteration": 2.6816513538360596 + }, + { + "auxiliary_loss_clip": 0.01093494, + "auxiliary_loss_mlp": 0.01050029, + "balance_loss_clip": 1.0418942, + "balance_loss_mlp": 1.0338043, + "epoch": 0.4741319705396062, + "flos": 18588800142720.0, + "grad_norm": 1.6528542083339792, + "language_loss": 0.73020607, + "learning_rate": 2.263867649999751e-06, + "loss": 0.75164127, + "num_input_tokens_seen": 169444705, + "step": 7886, + "time_per_iteration": 2.6734073162078857 + }, + { + "auxiliary_loss_clip": 0.01110604, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.04582644, + "balance_loss_mlp": 1.02251315, + "epoch": 0.47419209379227417, + "flos": 13261093223040.0, + "grad_norm": 2.0346146652784327, + "language_loss": 0.74043691, + "learning_rate": 2.263481587786849e-06, + "loss": 0.76192516, + "num_input_tokens_seen": 169460850, + "step": 7887, + "time_per_iteration": 2.6761467456817627 + }, + { + "auxiliary_loss_clip": 0.01118145, + "auxiliary_loss_mlp": 0.01031795, + "balance_loss_clip": 1.0474298, + "balance_loss_mlp": 1.01849771, + "epoch": 0.47425221704494214, + "flos": 20044043752320.0, + "grad_norm": 1.7788052130685665, + "language_loss": 0.77452385, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.79602331, + "num_input_tokens_seen": 169478890, + "step": 7888, + "time_per_iteration": 2.6402924060821533 + }, + { + "auxiliary_loss_clip": 0.01118769, + "auxiliary_loss_mlp": 0.01034654, + "balance_loss_clip": 1.0469296, + "balance_loss_mlp": 1.02044427, + "epoch": 0.4743123402976101, + "flos": 27271892136960.0, + "grad_norm": 4.211713497556063, + "language_loss": 0.72521853, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.7467528, + "num_input_tokens_seen": 169499690, + "step": 7889, + "time_per_iteration": 2.693746566772461 + }, + { + "auxiliary_loss_clip": 0.0104991, + "auxiliary_loss_mlp": 0.01005818, + "balance_loss_clip": 1.02273417, + "balance_loss_mlp": 1.00418472, + "epoch": 0.47437246355027807, + "flos": 55393970261760.0, + "grad_norm": 0.7194077429508707, + "language_loss": 0.5605737, + "learning_rate": 2.262323341259214e-06, + "loss": 0.58113098, + "num_input_tokens_seen": 169560475, + "step": 7890, + "time_per_iteration": 3.180250883102417 + }, + { + "auxiliary_loss_clip": 0.01120493, + "auxiliary_loss_mlp": 0.01032412, + "balance_loss_clip": 1.04944348, + "balance_loss_mlp": 1.01705146, + "epoch": 0.47443258680294603, + "flos": 23878477537920.0, + "grad_norm": 1.9527728253341778, + "language_loss": 0.65866226, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.68019128, + "num_input_tokens_seen": 169580110, + "step": 7891, + "time_per_iteration": 2.6768221855163574 + }, + { + "auxiliary_loss_clip": 0.01135111, + "auxiliary_loss_mlp": 0.01039265, + "balance_loss_clip": 1.04865634, + "balance_loss_mlp": 1.02342188, + "epoch": 0.474492710055614, + "flos": 21977761455360.0, + "grad_norm": 2.2722368949670493, + "language_loss": 0.7100271, + "learning_rate": 2.26155112714642e-06, + "loss": 0.73177087, + "num_input_tokens_seen": 169597510, + "step": 7892, + "time_per_iteration": 2.5857720375061035 + }, + { + "auxiliary_loss_clip": 0.01021432, + "auxiliary_loss_mlp": 0.01001129, + "balance_loss_clip": 1.01879561, + "balance_loss_mlp": 0.99938869, + "epoch": 0.47455283330828196, + "flos": 62557180122240.0, + "grad_norm": 0.8083016633053688, + "language_loss": 0.5854069, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.60563254, + "num_input_tokens_seen": 169660010, + "step": 7893, + "time_per_iteration": 3.298412799835205 + }, + { + "auxiliary_loss_clip": 0.01119918, + "auxiliary_loss_mlp": 0.01040659, + "balance_loss_clip": 1.04893851, + "balance_loss_mlp": 1.02661026, + "epoch": 0.47461295656094993, + "flos": 12093637380480.0, + "grad_norm": 2.1787400532077608, + "language_loss": 0.77515149, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.79675728, + "num_input_tokens_seen": 169678485, + "step": 7894, + "time_per_iteration": 4.300025463104248 + }, + { + "auxiliary_loss_clip": 0.01119579, + "auxiliary_loss_mlp": 0.01038145, + "balance_loss_clip": 1.04634869, + "balance_loss_mlp": 1.02365553, + "epoch": 0.4746730798136179, + "flos": 20884568981760.0, + "grad_norm": 1.6992264056336024, + "language_loss": 0.75134289, + "learning_rate": 2.260392731628497e-06, + "loss": 0.77292013, + "num_input_tokens_seen": 169697335, + "step": 7895, + "time_per_iteration": 4.2042882442474365 + }, + { + "auxiliary_loss_clip": 0.01115221, + "auxiliary_loss_mlp": 0.01035192, + "balance_loss_clip": 1.04379582, + "balance_loss_mlp": 1.02000451, + "epoch": 0.4747332030662859, + "flos": 19974808287360.0, + "grad_norm": 2.3363867956596462, + "language_loss": 0.83016753, + "learning_rate": 2.260006580021429e-06, + "loss": 0.85167164, + "num_input_tokens_seen": 169715395, + "step": 7896, + "time_per_iteration": 2.6993515491485596 + }, + { + "auxiliary_loss_clip": 0.01115945, + "auxiliary_loss_mlp": 0.01033612, + "balance_loss_clip": 1.04578996, + "balance_loss_mlp": 1.01843619, + "epoch": 0.4747933263189539, + "flos": 16034186920320.0, + "grad_norm": 2.109517003677199, + "language_loss": 0.7557857, + "learning_rate": 2.259620418554886e-06, + "loss": 0.77728134, + "num_input_tokens_seen": 169733755, + "step": 7897, + "time_per_iteration": 4.253166198730469 + }, + { + "auxiliary_loss_clip": 0.01108787, + "auxiliary_loss_mlp": 0.01040894, + "balance_loss_clip": 1.04561198, + "balance_loss_mlp": 1.02645135, + "epoch": 0.47485344957162184, + "flos": 13955102876160.0, + "grad_norm": 2.267424442093673, + "language_loss": 0.63623869, + "learning_rate": 2.25923424724351e-06, + "loss": 0.65773547, + "num_input_tokens_seen": 169751390, + "step": 7898, + "time_per_iteration": 2.672621011734009 + }, + { + "auxiliary_loss_clip": 0.01091849, + "auxiliary_loss_mlp": 0.01057132, + "balance_loss_clip": 1.04254556, + "balance_loss_mlp": 1.03949475, + "epoch": 0.4749135728242898, + "flos": 20449080489600.0, + "grad_norm": 3.549969153580447, + "language_loss": 0.70200998, + "learning_rate": 2.258848066101946e-06, + "loss": 0.72349977, + "num_input_tokens_seen": 169769500, + "step": 7899, + "time_per_iteration": 2.6986401081085205 + }, + { + "auxiliary_loss_clip": 0.01119057, + "auxiliary_loss_mlp": 0.01040719, + "balance_loss_clip": 1.04576528, + "balance_loss_mlp": 1.02590108, + "epoch": 0.4749736960769578, + "flos": 28949961767040.0, + "grad_norm": 1.9384177803560112, + "language_loss": 0.68627715, + "learning_rate": 2.258461875144837e-06, + "loss": 0.70787489, + "num_input_tokens_seen": 169789215, + "step": 7900, + "time_per_iteration": 2.695420265197754 + }, + { + "auxiliary_loss_clip": 0.01088615, + "auxiliary_loss_mlp": 0.01048142, + "balance_loss_clip": 1.04223442, + "balance_loss_mlp": 1.0335629, + "epoch": 0.47503381932962574, + "flos": 31938770592000.0, + "grad_norm": 2.214181272016126, + "language_loss": 0.70571202, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.72707957, + "num_input_tokens_seen": 169808825, + "step": 7901, + "time_per_iteration": 2.7880799770355225 + }, + { + "auxiliary_loss_clip": 0.01101024, + "auxiliary_loss_mlp": 0.01063852, + "balance_loss_clip": 1.04344749, + "balance_loss_mlp": 1.04805636, + "epoch": 0.4750939425822937, + "flos": 22127257860480.0, + "grad_norm": 1.723548754677231, + "language_loss": 0.73669708, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.75834584, + "num_input_tokens_seen": 169827590, + "step": 7902, + "time_per_iteration": 2.67350172996521 + }, + { + "auxiliary_loss_clip": 0.01087876, + "auxiliary_loss_mlp": 0.01040789, + "balance_loss_clip": 1.04317856, + "balance_loss_mlp": 1.02710962, + "epoch": 0.47515406583496167, + "flos": 20850094903680.0, + "grad_norm": 1.7450056007143964, + "language_loss": 0.68050694, + "learning_rate": 2.257303243526688e-06, + "loss": 0.70179355, + "num_input_tokens_seen": 169844925, + "step": 7903, + "time_per_iteration": 2.7626256942749023 + }, + { + "auxiliary_loss_clip": 0.01104723, + "auxiliary_loss_mlp": 0.01035743, + "balance_loss_clip": 1.043818, + "balance_loss_mlp": 1.02206981, + "epoch": 0.47521418908762963, + "flos": 17524802448000.0, + "grad_norm": 1.9051075920789844, + "language_loss": 0.72356462, + "learning_rate": 2.256917013453848e-06, + "loss": 0.74496931, + "num_input_tokens_seen": 169862705, + "step": 7904, + "time_per_iteration": 2.6790597438812256 + }, + { + "auxiliary_loss_clip": 0.01065198, + "auxiliary_loss_mlp": 0.01045369, + "balance_loss_clip": 1.03584373, + "balance_loss_mlp": 1.02957416, + "epoch": 0.4752743123402976, + "flos": 20559434048640.0, + "grad_norm": 1.6154437659751681, + "language_loss": 0.86472631, + "learning_rate": 2.25653077363869e-06, + "loss": 0.88583207, + "num_input_tokens_seen": 169880155, + "step": 7905, + "time_per_iteration": 2.733799457550049 + }, + { + "auxiliary_loss_clip": 0.0110676, + "auxiliary_loss_mlp": 0.01037063, + "balance_loss_clip": 1.04021764, + "balance_loss_mlp": 1.02423561, + "epoch": 0.47533443559296557, + "flos": 26360623071360.0, + "grad_norm": 1.7729713006372103, + "language_loss": 0.82212102, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.84355921, + "num_input_tokens_seen": 169901525, + "step": 7906, + "time_per_iteration": 2.6994829177856445 + }, + { + "auxiliary_loss_clip": 0.01029489, + "auxiliary_loss_mlp": 0.01023044, + "balance_loss_clip": 1.03056157, + "balance_loss_mlp": 1.02150619, + "epoch": 0.47539455884563353, + "flos": 65949660967680.0, + "grad_norm": 0.6767545541611142, + "language_loss": 0.58947372, + "learning_rate": 2.255758264840002e-06, + "loss": 0.60999906, + "num_input_tokens_seen": 169970345, + "step": 7907, + "time_per_iteration": 3.409289836883545 + }, + { + "auxiliary_loss_clip": 0.01112328, + "auxiliary_loss_mlp": 0.0103978, + "balance_loss_clip": 1.04298031, + "balance_loss_mlp": 1.02575445, + "epoch": 0.4754546820983015, + "flos": 17238128002560.0, + "grad_norm": 2.5037076646878664, + "language_loss": 0.81147426, + "learning_rate": 2.255371995885765e-06, + "loss": 0.83299541, + "num_input_tokens_seen": 169986440, + "step": 7908, + "time_per_iteration": 2.6126997470855713 + }, + { + "auxiliary_loss_clip": 0.01120375, + "auxiliary_loss_mlp": 0.01045183, + "balance_loss_clip": 1.04887652, + "balance_loss_mlp": 1.03041351, + "epoch": 0.47551480535096946, + "flos": 19825886499840.0, + "grad_norm": 1.7145689882234993, + "language_loss": 0.73805857, + "learning_rate": 2.254985717247797e-06, + "loss": 0.75971419, + "num_input_tokens_seen": 170005705, + "step": 7909, + "time_per_iteration": 2.7153172492980957 + }, + { + "auxiliary_loss_clip": 0.01098915, + "auxiliary_loss_mlp": 0.0103739, + "balance_loss_clip": 1.04232681, + "balance_loss_mlp": 1.02348399, + "epoch": 0.4755749286036375, + "flos": 22163958581760.0, + "grad_norm": 1.5099683944930966, + "language_loss": 0.75533628, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.77669942, + "num_input_tokens_seen": 170023415, + "step": 7910, + "time_per_iteration": 2.7330431938171387 + }, + { + "auxiliary_loss_clip": 0.01113687, + "auxiliary_loss_mlp": 0.01030183, + "balance_loss_clip": 1.04379678, + "balance_loss_mlp": 1.01749897, + "epoch": 0.47563505185630545, + "flos": 21648280976640.0, + "grad_norm": 1.931062443356086, + "language_loss": 0.79401493, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.81545365, + "num_input_tokens_seen": 170042395, + "step": 7911, + "time_per_iteration": 2.6149117946624756 + }, + { + "auxiliary_loss_clip": 0.01098041, + "auxiliary_loss_mlp": 0.00773063, + "balance_loss_clip": 1.04096794, + "balance_loss_mlp": 1.00061882, + "epoch": 0.4756951751089734, + "flos": 20628777254400.0, + "grad_norm": 2.2768804327487113, + "language_loss": 0.75414324, + "learning_rate": 2.253826823377983e-06, + "loss": 0.77285427, + "num_input_tokens_seen": 170061610, + "step": 7912, + "time_per_iteration": 2.680414915084839 + }, + { + "auxiliary_loss_clip": 0.01123715, + "auxiliary_loss_mlp": 0.01037472, + "balance_loss_clip": 1.04319668, + "balance_loss_mlp": 1.02353013, + "epoch": 0.4757552983616414, + "flos": 25848788221440.0, + "grad_norm": 1.4371041113730632, + "language_loss": 0.74065906, + "learning_rate": 2.253440506151569e-06, + "loss": 0.76227093, + "num_input_tokens_seen": 170083505, + "step": 7913, + "time_per_iteration": 2.6565608978271484 + }, + { + "auxiliary_loss_clip": 0.0110748, + "auxiliary_loss_mlp": 0.01031808, + "balance_loss_clip": 1.04591024, + "balance_loss_mlp": 1.01694882, + "epoch": 0.47581542161430934, + "flos": 18223013992320.0, + "grad_norm": 2.17158702079863, + "language_loss": 0.72123522, + "learning_rate": 2.253054179314666e-06, + "loss": 0.7426281, + "num_input_tokens_seen": 170100690, + "step": 7914, + "time_per_iteration": 2.6789934635162354 + }, + { + "auxiliary_loss_clip": 0.01103912, + "auxiliary_loss_mlp": 0.01042984, + "balance_loss_clip": 1.04652143, + "balance_loss_mlp": 1.02944756, + "epoch": 0.4758755448669773, + "flos": 21579763783680.0, + "grad_norm": 2.3786315570139345, + "language_loss": 0.64855683, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.67002577, + "num_input_tokens_seen": 170119240, + "step": 7915, + "time_per_iteration": 2.65608549118042 + }, + { + "auxiliary_loss_clip": 0.01123163, + "auxiliary_loss_mlp": 0.01041838, + "balance_loss_clip": 1.04508734, + "balance_loss_mlp": 1.02774107, + "epoch": 0.47593566811964527, + "flos": 15231152511360.0, + "grad_norm": 1.7019759484121837, + "language_loss": 0.76935744, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.79100746, + "num_input_tokens_seen": 170136450, + "step": 7916, + "time_per_iteration": 2.585491418838501 + }, + { + "auxiliary_loss_clip": 0.01125392, + "auxiliary_loss_mlp": 0.01036553, + "balance_loss_clip": 1.04389, + "balance_loss_mlp": 1.02302265, + "epoch": 0.47599579137231324, + "flos": 21543242630400.0, + "grad_norm": 2.0866631919048175, + "language_loss": 0.63895321, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.66057259, + "num_input_tokens_seen": 170155295, + "step": 7917, + "time_per_iteration": 2.5544540882110596 + }, + { + "auxiliary_loss_clip": 0.01017258, + "auxiliary_loss_mlp": 0.01002335, + "balance_loss_clip": 1.01986837, + "balance_loss_mlp": 1.00074983, + "epoch": 0.4760559146249812, + "flos": 64554602595840.0, + "grad_norm": 0.8370962757635343, + "language_loss": 0.65689212, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.67708808, + "num_input_tokens_seen": 170222325, + "step": 7918, + "time_per_iteration": 3.4263010025024414 + }, + { + "auxiliary_loss_clip": 0.01114985, + "auxiliary_loss_mlp": 0.00771917, + "balance_loss_clip": 1.04313397, + "balance_loss_mlp": 1.00057673, + "epoch": 0.47611603787764917, + "flos": 22233876405120.0, + "grad_norm": 2.4555452771674067, + "language_loss": 0.68450713, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.70337617, + "num_input_tokens_seen": 170241625, + "step": 7919, + "time_per_iteration": 2.7581801414489746 + }, + { + "auxiliary_loss_clip": 0.01105197, + "auxiliary_loss_mlp": 0.01042973, + "balance_loss_clip": 1.04329574, + "balance_loss_mlp": 1.02922797, + "epoch": 0.47617616113031713, + "flos": 22780005765120.0, + "grad_norm": 1.6063666097186406, + "language_loss": 0.75389183, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.77537358, + "num_input_tokens_seen": 170262470, + "step": 7920, + "time_per_iteration": 2.7888362407684326 + }, + { + "auxiliary_loss_clip": 0.01109747, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.04727352, + "balance_loss_mlp": 1.01956415, + "epoch": 0.4762362843829851, + "flos": 24133802388480.0, + "grad_norm": 1.5207523519625543, + "language_loss": 0.7761817, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.79762518, + "num_input_tokens_seen": 170283460, + "step": 7921, + "time_per_iteration": 2.7462785243988037 + }, + { + "auxiliary_loss_clip": 0.01108901, + "auxiliary_loss_mlp": 0.01043608, + "balance_loss_clip": 1.0445503, + "balance_loss_mlp": 1.02778864, + "epoch": 0.47629640763565306, + "flos": 22452069571200.0, + "grad_norm": 3.2907516590332024, + "language_loss": 0.78146785, + "learning_rate": 2.249963220399845e-06, + "loss": 0.80299294, + "num_input_tokens_seen": 170304225, + "step": 7922, + "time_per_iteration": 2.6893417835235596 + }, + { + "auxiliary_loss_clip": 0.01094796, + "auxiliary_loss_mlp": 0.01043063, + "balance_loss_clip": 1.04391539, + "balance_loss_mlp": 1.02719617, + "epoch": 0.4763565308883211, + "flos": 11181398647680.0, + "grad_norm": 1.6628631162014398, + "language_loss": 0.7275365, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.74891508, + "num_input_tokens_seen": 170322110, + "step": 7923, + "time_per_iteration": 2.732468605041504 + }, + { + "auxiliary_loss_clip": 0.01102187, + "auxiliary_loss_mlp": 0.01039061, + "balance_loss_clip": 1.04838657, + "balance_loss_mlp": 1.02511382, + "epoch": 0.47641665414098905, + "flos": 22382151747840.0, + "grad_norm": 1.679365493038583, + "language_loss": 0.82141626, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.84282875, + "num_input_tokens_seen": 170340700, + "step": 7924, + "time_per_iteration": 2.7680320739746094 + }, + { + "auxiliary_loss_clip": 0.01126329, + "auxiliary_loss_mlp": 0.01038575, + "balance_loss_clip": 1.0495019, + "balance_loss_mlp": 1.02264822, + "epoch": 0.476476777393657, + "flos": 25046148862080.0, + "grad_norm": 2.2679110024074705, + "language_loss": 0.80316466, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.82481372, + "num_input_tokens_seen": 170359780, + "step": 7925, + "time_per_iteration": 2.649615526199341 + }, + { + "auxiliary_loss_clip": 0.01101728, + "auxiliary_loss_mlp": 0.01041222, + "balance_loss_clip": 1.04264617, + "balance_loss_mlp": 1.02741158, + "epoch": 0.476536900646325, + "flos": 27269916888960.0, + "grad_norm": 1.5530829773494035, + "language_loss": 0.72051573, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.74194521, + "num_input_tokens_seen": 170381260, + "step": 7926, + "time_per_iteration": 2.7393877506256104 + }, + { + "auxiliary_loss_clip": 0.0111858, + "auxiliary_loss_mlp": 0.01035544, + "balance_loss_clip": 1.04556048, + "balance_loss_mlp": 1.02015448, + "epoch": 0.47659702389899294, + "flos": 25301401885440.0, + "grad_norm": 1.973296217359943, + "language_loss": 0.68039131, + "learning_rate": 2.248031062546432e-06, + "loss": 0.70193255, + "num_input_tokens_seen": 170400595, + "step": 7927, + "time_per_iteration": 2.7364554405212402 + }, + { + "auxiliary_loss_clip": 0.01088729, + "auxiliary_loss_mlp": 0.01031301, + "balance_loss_clip": 1.04246449, + "balance_loss_mlp": 1.01772344, + "epoch": 0.4766571471516609, + "flos": 25992861672960.0, + "grad_norm": 1.624613635266834, + "language_loss": 0.67674315, + "learning_rate": 2.247644602701045e-06, + "loss": 0.69794345, + "num_input_tokens_seen": 170421110, + "step": 7928, + "time_per_iteration": 2.7200751304626465 + }, + { + "auxiliary_loss_clip": 0.01128959, + "auxiliary_loss_mlp": 0.0103446, + "balance_loss_clip": 1.04645658, + "balance_loss_mlp": 1.01979089, + "epoch": 0.4767172704043289, + "flos": 16032211672320.0, + "grad_norm": 2.0796504226810497, + "language_loss": 0.78678215, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.80841631, + "num_input_tokens_seen": 170436700, + "step": 7929, + "time_per_iteration": 2.6817221641540527 + }, + { + "auxiliary_loss_clip": 0.01102478, + "auxiliary_loss_mlp": 0.01039975, + "balance_loss_clip": 1.04257607, + "balance_loss_mlp": 1.0262301, + "epoch": 0.47677739365699684, + "flos": 39235351651200.0, + "grad_norm": 1.8131309373477071, + "language_loss": 0.6663419, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.68776643, + "num_input_tokens_seen": 170459555, + "step": 7930, + "time_per_iteration": 2.856072187423706 + }, + { + "auxiliary_loss_clip": 0.0111358, + "auxiliary_loss_mlp": 0.01036755, + "balance_loss_clip": 1.04616833, + "balance_loss_mlp": 1.02318919, + "epoch": 0.4768375169096648, + "flos": 24717781704960.0, + "grad_norm": 7.611219304969564, + "language_loss": 0.7973817, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.81888509, + "num_input_tokens_seen": 170479175, + "step": 7931, + "time_per_iteration": 2.646108865737915 + }, + { + "auxiliary_loss_clip": 0.01100642, + "auxiliary_loss_mlp": 0.01036826, + "balance_loss_clip": 1.04248762, + "balance_loss_mlp": 1.02181768, + "epoch": 0.47689764016233277, + "flos": 22528667324160.0, + "grad_norm": 1.747640555146421, + "language_loss": 0.76035368, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.78172839, + "num_input_tokens_seen": 170498450, + "step": 7932, + "time_per_iteration": 2.6632022857666016 + }, + { + "auxiliary_loss_clip": 0.01103619, + "auxiliary_loss_mlp": 0.00770594, + "balance_loss_clip": 1.04416108, + "balance_loss_mlp": 1.00076032, + "epoch": 0.47695776341500074, + "flos": 15120619384320.0, + "grad_norm": 1.7743205398157191, + "language_loss": 0.79733002, + "learning_rate": 2.245712162906593e-06, + "loss": 0.81607223, + "num_input_tokens_seen": 170516255, + "step": 7933, + "time_per_iteration": 4.2387471199035645 + }, + { + "auxiliary_loss_clip": 0.01123015, + "auxiliary_loss_mlp": 0.01041506, + "balance_loss_clip": 1.04555225, + "balance_loss_mlp": 1.02532899, + "epoch": 0.4770178866676687, + "flos": 14678917839360.0, + "grad_norm": 1.9828909232489866, + "language_loss": 0.73883361, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.76047885, + "num_input_tokens_seen": 170532705, + "step": 7934, + "time_per_iteration": 4.074187517166138 + }, + { + "auxiliary_loss_clip": 0.01116756, + "auxiliary_loss_mlp": 0.01034977, + "balance_loss_clip": 1.04362082, + "balance_loss_mlp": 1.02075577, + "epoch": 0.47707800992033667, + "flos": 22565583527040.0, + "grad_norm": 1.8305920873714958, + "language_loss": 0.80197936, + "learning_rate": 2.244939121664211e-06, + "loss": 0.8234967, + "num_input_tokens_seen": 170551925, + "step": 7935, + "time_per_iteration": 2.650474786758423 + }, + { + "auxiliary_loss_clip": 0.01101181, + "auxiliary_loss_mlp": 0.01043502, + "balance_loss_clip": 1.04532123, + "balance_loss_mlp": 1.02818346, + "epoch": 0.4771381331730047, + "flos": 30918225375360.0, + "grad_norm": 5.908138115579588, + "language_loss": 0.71829689, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.73974371, + "num_input_tokens_seen": 170572320, + "step": 7936, + "time_per_iteration": 4.428630113601685 + }, + { + "auxiliary_loss_clip": 0.01130752, + "auxiliary_loss_mlp": 0.01039041, + "balance_loss_clip": 1.04646921, + "balance_loss_mlp": 1.02419913, + "epoch": 0.47719825642567265, + "flos": 25738901539200.0, + "grad_norm": 2.4038439056994156, + "language_loss": 0.675704, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.69740188, + "num_input_tokens_seen": 170589470, + "step": 7937, + "time_per_iteration": 4.458148241043091 + }, + { + "auxiliary_loss_clip": 0.01034806, + "auxiliary_loss_mlp": 0.00999407, + "balance_loss_clip": 1.01822138, + "balance_loss_mlp": 0.99804842, + "epoch": 0.4772583796783406, + "flos": 66355128668160.0, + "grad_norm": 0.7105047811157361, + "language_loss": 0.56384945, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.58419156, + "num_input_tokens_seen": 170662265, + "step": 7938, + "time_per_iteration": 3.3967578411102295 + }, + { + "auxiliary_loss_clip": 0.01099667, + "auxiliary_loss_mlp": 0.0104562, + "balance_loss_clip": 1.04193783, + "balance_loss_mlp": 1.02908564, + "epoch": 0.4773185029310086, + "flos": 22051091070720.0, + "grad_norm": 3.053079154163393, + "language_loss": 0.88725203, + "learning_rate": 2.243392927839317e-06, + "loss": 0.90870488, + "num_input_tokens_seen": 170679680, + "step": 7939, + "time_per_iteration": 2.7099897861480713 + }, + { + "auxiliary_loss_clip": 0.01115778, + "auxiliary_loss_mlp": 0.01037609, + "balance_loss_clip": 1.04160845, + "balance_loss_mlp": 1.02400148, + "epoch": 0.47737862618367655, + "flos": 16727801523840.0, + "grad_norm": 1.7393189284877646, + "language_loss": 0.77381486, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.79534876, + "num_input_tokens_seen": 170697340, + "step": 7940, + "time_per_iteration": 2.5913469791412354 + }, + { + "auxiliary_loss_clip": 0.01104457, + "auxiliary_loss_mlp": 0.01036057, + "balance_loss_clip": 1.04589248, + "balance_loss_mlp": 1.02288485, + "epoch": 0.4774387494363445, + "flos": 19609453100160.0, + "grad_norm": 1.5893003235088359, + "language_loss": 0.8474015, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.8688066, + "num_input_tokens_seen": 170714905, + "step": 7941, + "time_per_iteration": 2.605090856552124 + }, + { + "auxiliary_loss_clip": 0.0110803, + "auxiliary_loss_mlp": 0.01041538, + "balance_loss_clip": 1.04433787, + "balance_loss_mlp": 1.02682161, + "epoch": 0.4774988726890125, + "flos": 16653969118080.0, + "grad_norm": 2.1303607813841237, + "language_loss": 0.75943714, + "learning_rate": 2.24223318550976e-06, + "loss": 0.78093278, + "num_input_tokens_seen": 170731810, + "step": 7942, + "time_per_iteration": 2.612901449203491 + }, + { + "auxiliary_loss_clip": 0.01115811, + "auxiliary_loss_mlp": 0.01038801, + "balance_loss_clip": 1.04779172, + "balance_loss_mlp": 1.02491331, + "epoch": 0.47755899594168044, + "flos": 20485565729280.0, + "grad_norm": 1.7564628488897216, + "language_loss": 0.6467554, + "learning_rate": 2.241846586342682e-06, + "loss": 0.66830152, + "num_input_tokens_seen": 170750270, + "step": 7943, + "time_per_iteration": 2.6675846576690674 + }, + { + "auxiliary_loss_clip": 0.01088131, + "auxiliary_loss_mlp": 0.01040732, + "balance_loss_clip": 1.04014313, + "balance_loss_mlp": 1.02544951, + "epoch": 0.4776191191943484, + "flos": 21652806090240.0, + "grad_norm": 3.30514620611564, + "language_loss": 0.73474699, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.75603563, + "num_input_tokens_seen": 170769015, + "step": 7944, + "time_per_iteration": 2.6938626766204834 + }, + { + "auxiliary_loss_clip": 0.01116316, + "auxiliary_loss_mlp": 0.01035661, + "balance_loss_clip": 1.04835653, + "balance_loss_mlp": 1.01982975, + "epoch": 0.4776792424470164, + "flos": 18770220760320.0, + "grad_norm": 2.01255819211095, + "language_loss": 0.67873627, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.70025599, + "num_input_tokens_seen": 170785725, + "step": 7945, + "time_per_iteration": 2.5940043926239014 + }, + { + "auxiliary_loss_clip": 0.0108787, + "auxiliary_loss_mlp": 0.00774963, + "balance_loss_clip": 1.03865957, + "balance_loss_mlp": 1.00055337, + "epoch": 0.47773936569968434, + "flos": 29715828577920.0, + "grad_norm": 1.9730762461064726, + "language_loss": 0.75473535, + "learning_rate": 2.240686733875009e-06, + "loss": 0.77336371, + "num_input_tokens_seen": 170804600, + "step": 7946, + "time_per_iteration": 2.762983560562134 + }, + { + "auxiliary_loss_clip": 0.01105207, + "auxiliary_loss_mlp": 0.01042769, + "balance_loss_clip": 1.04477096, + "balance_loss_mlp": 1.0274632, + "epoch": 0.4777994889523523, + "flos": 24791542283520.0, + "grad_norm": 2.190560640838335, + "language_loss": 0.79071236, + "learning_rate": 2.240300098112506e-06, + "loss": 0.81219208, + "num_input_tokens_seen": 170824230, + "step": 7947, + "time_per_iteration": 2.692763328552246 + }, + { + "auxiliary_loss_clip": 0.010955, + "auxiliary_loss_mlp": 0.01037272, + "balance_loss_clip": 1.0440042, + "balance_loss_mlp": 1.02317524, + "epoch": 0.47785961220502027, + "flos": 17858161595520.0, + "grad_norm": 2.294285239078615, + "language_loss": 0.7329706, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.75429833, + "num_input_tokens_seen": 170843365, + "step": 7948, + "time_per_iteration": 2.6743998527526855 + }, + { + "auxiliary_loss_clip": 0.01106692, + "auxiliary_loss_mlp": 0.01038667, + "balance_loss_clip": 1.04329944, + "balance_loss_mlp": 1.0235039, + "epoch": 0.4779197354576883, + "flos": 20266546550400.0, + "grad_norm": 1.7991446580624026, + "language_loss": 0.78139675, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.80285037, + "num_input_tokens_seen": 170863515, + "step": 7949, + "time_per_iteration": 2.694549560546875 + }, + { + "auxiliary_loss_clip": 0.01096582, + "auxiliary_loss_mlp": 0.0104007, + "balance_loss_clip": 1.04018211, + "balance_loss_mlp": 1.0263133, + "epoch": 0.47797985871035625, + "flos": 17056599644160.0, + "grad_norm": 2.242348781817659, + "language_loss": 0.74315739, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.76452386, + "num_input_tokens_seen": 170881245, + "step": 7950, + "time_per_iteration": 2.718254327774048 + }, + { + "auxiliary_loss_clip": 0.01095843, + "auxiliary_loss_mlp": 0.01046859, + "balance_loss_clip": 1.04172587, + "balance_loss_mlp": 1.03179109, + "epoch": 0.4780399819630242, + "flos": 31358418549120.0, + "grad_norm": 1.9896022122587003, + "language_loss": 0.74343586, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.7648629, + "num_input_tokens_seen": 170901285, + "step": 7951, + "time_per_iteration": 2.7827391624450684 + }, + { + "auxiliary_loss_clip": 0.01094802, + "auxiliary_loss_mlp": 0.01036097, + "balance_loss_clip": 1.04424548, + "balance_loss_mlp": 1.02120781, + "epoch": 0.4781001052156922, + "flos": 24899597372160.0, + "grad_norm": 2.198904574593956, + "language_loss": 0.80032581, + "learning_rate": 2.238366782910174e-06, + "loss": 0.82163477, + "num_input_tokens_seen": 170919740, + "step": 7952, + "time_per_iteration": 2.812988519668579 + }, + { + "auxiliary_loss_clip": 0.01107213, + "auxiliary_loss_mlp": 0.01044893, + "balance_loss_clip": 1.04275584, + "balance_loss_mlp": 1.03007555, + "epoch": 0.47816022846836015, + "flos": 18697717157760.0, + "grad_norm": 1.8177204893019177, + "language_loss": 0.7794894, + "learning_rate": 2.23798009269438e-06, + "loss": 0.80101049, + "num_input_tokens_seen": 170938510, + "step": 7953, + "time_per_iteration": 2.6617591381073 + }, + { + "auxiliary_loss_clip": 0.01120456, + "auxiliary_loss_mlp": 0.01038164, + "balance_loss_clip": 1.04588997, + "balance_loss_mlp": 1.0237813, + "epoch": 0.4782203517210281, + "flos": 11977573559040.0, + "grad_norm": 2.347215604083738, + "language_loss": 0.84714645, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.86873269, + "num_input_tokens_seen": 170951170, + "step": 7954, + "time_per_iteration": 2.6208479404449463 + }, + { + "auxiliary_loss_clip": 0.01097068, + "auxiliary_loss_mlp": 0.01038972, + "balance_loss_clip": 1.0426054, + "balance_loss_mlp": 1.0248698, + "epoch": 0.4782804749736961, + "flos": 20813501923200.0, + "grad_norm": 1.4277916214046864, + "language_loss": 0.70472121, + "learning_rate": 2.237206685204768e-06, + "loss": 0.72608161, + "num_input_tokens_seen": 170970990, + "step": 7955, + "time_per_iteration": 2.821913719177246 + }, + { + "auxiliary_loss_clip": 0.0110203, + "auxiliary_loss_mlp": 0.01041668, + "balance_loss_clip": 1.04433143, + "balance_loss_mlp": 1.0281322, + "epoch": 0.47834059822636404, + "flos": 23840304359040.0, + "grad_norm": 1.5047634516845327, + "language_loss": 0.82269239, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.84412932, + "num_input_tokens_seen": 170991215, + "step": 7956, + "time_per_iteration": 2.683924913406372 + }, + { + "auxiliary_loss_clip": 0.01105668, + "auxiliary_loss_mlp": 0.01036371, + "balance_loss_clip": 1.04529083, + "balance_loss_mlp": 1.02021837, + "epoch": 0.478400721479032, + "flos": 22633777497600.0, + "grad_norm": 2.448858103633137, + "language_loss": 0.84977531, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.87119567, + "num_input_tokens_seen": 171007325, + "step": 7957, + "time_per_iteration": 2.6371145248413086 + }, + { + "auxiliary_loss_clip": 0.01118227, + "auxiliary_loss_mlp": 0.01040317, + "balance_loss_clip": 1.04562736, + "balance_loss_mlp": 1.02653635, + "epoch": 0.4784608447317, + "flos": 19354954262400.0, + "grad_norm": 1.5628888100251457, + "language_loss": 0.79777038, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.81935579, + "num_input_tokens_seen": 171025650, + "step": 7958, + "time_per_iteration": 2.639721632003784 + }, + { + "auxiliary_loss_clip": 0.01085054, + "auxiliary_loss_mlp": 0.0077548, + "balance_loss_clip": 1.03763032, + "balance_loss_mlp": 1.00064015, + "epoch": 0.47852096798436794, + "flos": 24021114445440.0, + "grad_norm": 1.8018566992199279, + "language_loss": 0.82972836, + "learning_rate": 2.235659762404047e-06, + "loss": 0.84833372, + "num_input_tokens_seen": 171045045, + "step": 7959, + "time_per_iteration": 2.733668565750122 + }, + { + "auxiliary_loss_clip": 0.01090487, + "auxiliary_loss_mlp": 0.01036767, + "balance_loss_clip": 1.04364586, + "balance_loss_mlp": 1.02436292, + "epoch": 0.4785810912370359, + "flos": 25666433850240.0, + "grad_norm": 2.7562627438628504, + "language_loss": 0.73275614, + "learning_rate": 2.235273009326599e-06, + "loss": 0.75402862, + "num_input_tokens_seen": 171062910, + "step": 7960, + "time_per_iteration": 2.6994166374206543 + }, + { + "auxiliary_loss_clip": 0.0109086, + "auxiliary_loss_mlp": 0.0103472, + "balance_loss_clip": 1.04504585, + "balance_loss_mlp": 1.02170801, + "epoch": 0.47864121448970387, + "flos": 21432134885760.0, + "grad_norm": 1.6649690841938434, + "language_loss": 0.76878142, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.79003716, + "num_input_tokens_seen": 171080875, + "step": 7961, + "time_per_iteration": 2.7051572799682617 + }, + { + "auxiliary_loss_clip": 0.01087757, + "auxiliary_loss_mlp": 0.0103463, + "balance_loss_clip": 1.04447055, + "balance_loss_mlp": 1.02050352, + "epoch": 0.47870133774237184, + "flos": 16143894034560.0, + "grad_norm": 7.35679067145723, + "language_loss": 0.7769649, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.79818881, + "num_input_tokens_seen": 171099190, + "step": 7962, + "time_per_iteration": 2.7466347217559814 + }, + { + "auxiliary_loss_clip": 0.0110573, + "auxiliary_loss_mlp": 0.01042926, + "balance_loss_clip": 1.04702401, + "balance_loss_mlp": 1.02871001, + "epoch": 0.47876146099503986, + "flos": 26906788344960.0, + "grad_norm": 1.6387698321198922, + "language_loss": 0.64764994, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.66913652, + "num_input_tokens_seen": 171119060, + "step": 7963, + "time_per_iteration": 2.77663516998291 + }, + { + "auxiliary_loss_clip": 0.01117113, + "auxiliary_loss_mlp": 0.01035904, + "balance_loss_clip": 1.04389668, + "balance_loss_mlp": 1.02196217, + "epoch": 0.4788215842477078, + "flos": 45332085778560.0, + "grad_norm": 1.655648847764305, + "language_loss": 0.77503848, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.79656863, + "num_input_tokens_seen": 171141900, + "step": 7964, + "time_per_iteration": 2.9196712970733643 + }, + { + "auxiliary_loss_clip": 0.01120902, + "auxiliary_loss_mlp": 0.01036482, + "balance_loss_clip": 1.04660964, + "balance_loss_mlp": 1.02042508, + "epoch": 0.4788817075003758, + "flos": 22237180456320.0, + "grad_norm": 2.8996801764774505, + "language_loss": 0.76540697, + "learning_rate": 2.233339110409044e-06, + "loss": 0.78698087, + "num_input_tokens_seen": 171161045, + "step": 7965, + "time_per_iteration": 2.6720781326293945 + }, + { + "auxiliary_loss_clip": 0.0106828, + "auxiliary_loss_mlp": 0.0105005, + "balance_loss_clip": 1.03929722, + "balance_loss_mlp": 1.03433788, + "epoch": 0.47894183075304375, + "flos": 16471183783680.0, + "grad_norm": 1.712219755604538, + "language_loss": 0.74560332, + "learning_rate": 2.232952304022137e-06, + "loss": 0.76678663, + "num_input_tokens_seen": 171179675, + "step": 7966, + "time_per_iteration": 2.7669286727905273 + }, + { + "auxiliary_loss_clip": 0.01101486, + "auxiliary_loss_mlp": 0.0103808, + "balance_loss_clip": 1.0444622, + "balance_loss_mlp": 1.02388787, + "epoch": 0.4790019540057117, + "flos": 24282688262400.0, + "grad_norm": 2.605899190258409, + "language_loss": 0.73308432, + "learning_rate": 2.232565488801655e-06, + "loss": 0.75448, + "num_input_tokens_seen": 171201175, + "step": 7967, + "time_per_iteration": 2.7271900177001953 + }, + { + "auxiliary_loss_clip": 0.01102984, + "auxiliary_loss_mlp": 0.01032784, + "balance_loss_clip": 1.04409146, + "balance_loss_mlp": 1.01838326, + "epoch": 0.4790620772583797, + "flos": 25666469763840.0, + "grad_norm": 2.103515425969552, + "language_loss": 0.79279423, + "learning_rate": 2.232178664762267e-06, + "loss": 0.81415194, + "num_input_tokens_seen": 171221750, + "step": 7968, + "time_per_iteration": 2.707740545272827 + }, + { + "auxiliary_loss_clip": 0.0102077, + "auxiliary_loss_mlp": 0.01020427, + "balance_loss_clip": 1.02207994, + "balance_loss_mlp": 1.01903248, + "epoch": 0.47912220051104765, + "flos": 69428077102080.0, + "grad_norm": 0.7660555925772923, + "language_loss": 0.62198806, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.64240003, + "num_input_tokens_seen": 171292235, + "step": 7969, + "time_per_iteration": 3.3662569522857666 + }, + { + "auxiliary_loss_clip": 0.01087594, + "auxiliary_loss_mlp": 0.01029477, + "balance_loss_clip": 1.04418397, + "balance_loss_mlp": 1.01662636, + "epoch": 0.4791823237637156, + "flos": 24168922911360.0, + "grad_norm": 1.7596129166374368, + "language_loss": 0.77306086, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.79423159, + "num_input_tokens_seen": 171312215, + "step": 7970, + "time_per_iteration": 2.69364857673645 + }, + { + "auxiliary_loss_clip": 0.01116664, + "auxiliary_loss_mlp": 0.01038161, + "balance_loss_clip": 1.04511642, + "balance_loss_mlp": 1.0235939, + "epoch": 0.4792424470163836, + "flos": 24751465683840.0, + "grad_norm": 1.5706742055007812, + "language_loss": 0.70431626, + "learning_rate": 2.231018139877349e-06, + "loss": 0.72586453, + "num_input_tokens_seen": 171332975, + "step": 7971, + "time_per_iteration": 2.690791130065918 + }, + { + "auxiliary_loss_clip": 0.01072275, + "auxiliary_loss_mlp": 0.01033862, + "balance_loss_clip": 1.03982508, + "balance_loss_mlp": 1.01899683, + "epoch": 0.47930257026905154, + "flos": 23257905240960.0, + "grad_norm": 1.30993945009872, + "language_loss": 0.79995155, + "learning_rate": 2.230631280709021e-06, + "loss": 0.82101291, + "num_input_tokens_seen": 171353880, + "step": 7972, + "time_per_iteration": 2.829455852508545 + }, + { + "auxiliary_loss_clip": 0.0111891, + "auxiliary_loss_mlp": 0.01028077, + "balance_loss_clip": 1.0466361, + "balance_loss_mlp": 1.01299727, + "epoch": 0.4793626935217195, + "flos": 14064091718400.0, + "grad_norm": 2.2411370214837807, + "language_loss": 0.69401908, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.71548891, + "num_input_tokens_seen": 171370930, + "step": 7973, + "time_per_iteration": 4.2368669509887695 + }, + { + "auxiliary_loss_clip": 0.01120125, + "auxiliary_loss_mlp": 0.01039183, + "balance_loss_clip": 1.05002046, + "balance_loss_mlp": 1.02575445, + "epoch": 0.4794228167743875, + "flos": 21798854789760.0, + "grad_norm": 1.967830357691446, + "language_loss": 0.78792048, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.80951357, + "num_input_tokens_seen": 171387575, + "step": 7974, + "time_per_iteration": 2.666619300842285 + }, + { + "auxiliary_loss_clip": 0.01029245, + "auxiliary_loss_mlp": 0.01003452, + "balance_loss_clip": 1.02188838, + "balance_loss_mlp": 1.00173593, + "epoch": 0.47948294002705544, + "flos": 66968805553920.0, + "grad_norm": 0.7538441683533625, + "language_loss": 0.54051983, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.56084681, + "num_input_tokens_seen": 171449980, + "step": 7975, + "time_per_iteration": 4.964555501937866 + }, + { + "auxiliary_loss_clip": 0.01114672, + "auxiliary_loss_mlp": 0.01039108, + "balance_loss_clip": 1.04530835, + "balance_loss_mlp": 1.02287221, + "epoch": 0.47954306327972346, + "flos": 12422471414400.0, + "grad_norm": 2.0524308160251707, + "language_loss": 0.89917016, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.92070794, + "num_input_tokens_seen": 171465290, + "step": 7976, + "time_per_iteration": 4.202557802200317 + }, + { + "auxiliary_loss_clip": 0.01135185, + "auxiliary_loss_mlp": 0.01039598, + "balance_loss_clip": 1.05056477, + "balance_loss_mlp": 1.02431524, + "epoch": 0.4796031865323914, + "flos": 18361951799040.0, + "grad_norm": 2.222330138734667, + "language_loss": 0.73720783, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.75895566, + "num_input_tokens_seen": 171481130, + "step": 7977, + "time_per_iteration": 2.653036117553711 + }, + { + "auxiliary_loss_clip": 0.01112997, + "auxiliary_loss_mlp": 0.00772063, + "balance_loss_clip": 1.0468123, + "balance_loss_mlp": 1.00047266, + "epoch": 0.4796633097850594, + "flos": 21835088634240.0, + "grad_norm": 1.5823767711410588, + "language_loss": 0.78372079, + "learning_rate": 2.228309942555734e-06, + "loss": 0.80257142, + "num_input_tokens_seen": 171501140, + "step": 7978, + "time_per_iteration": 2.7036852836608887 + }, + { + "auxiliary_loss_clip": 0.01106382, + "auxiliary_loss_mlp": 0.01039525, + "balance_loss_clip": 1.04519784, + "balance_loss_mlp": 1.02526784, + "epoch": 0.47972343303772735, + "flos": 23437350610560.0, + "grad_norm": 2.6635738232298944, + "language_loss": 0.89488423, + "learning_rate": 2.22792302247656e-06, + "loss": 0.91634321, + "num_input_tokens_seen": 171519835, + "step": 7979, + "time_per_iteration": 2.653221845626831 + }, + { + "auxiliary_loss_clip": 0.01122392, + "auxiliary_loss_mlp": 0.01040662, + "balance_loss_clip": 1.04798067, + "balance_loss_mlp": 1.02475905, + "epoch": 0.4797835562903953, + "flos": 24899776940160.0, + "grad_norm": 1.5901773617653536, + "language_loss": 0.76710582, + "learning_rate": 2.227536093754523e-06, + "loss": 0.78873634, + "num_input_tokens_seen": 171540980, + "step": 7980, + "time_per_iteration": 2.6700520515441895 + }, + { + "auxiliary_loss_clip": 0.01103639, + "auxiliary_loss_mlp": 0.01039114, + "balance_loss_clip": 1.04525447, + "balance_loss_mlp": 1.02261567, + "epoch": 0.4798436795430633, + "flos": 35042996793600.0, + "grad_norm": 1.9068781398056245, + "language_loss": 0.7128244, + "learning_rate": 2.227149156404295e-06, + "loss": 0.73425198, + "num_input_tokens_seen": 171563600, + "step": 7981, + "time_per_iteration": 2.817458391189575 + }, + { + "auxiliary_loss_clip": 0.01130721, + "auxiliary_loss_mlp": 0.01034361, + "balance_loss_clip": 1.05059981, + "balance_loss_mlp": 1.02040792, + "epoch": 0.47990380279573125, + "flos": 20590209025920.0, + "grad_norm": 2.189836625005686, + "language_loss": 0.70604527, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.72769606, + "num_input_tokens_seen": 171580700, + "step": 7982, + "time_per_iteration": 2.639772891998291 + }, + { + "auxiliary_loss_clip": 0.01101365, + "auxiliary_loss_mlp": 0.01031884, + "balance_loss_clip": 1.04456162, + "balance_loss_mlp": 1.01928937, + "epoch": 0.4799639260483992, + "flos": 26359402008960.0, + "grad_norm": 6.366705109750511, + "language_loss": 0.71019757, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.73153007, + "num_input_tokens_seen": 171602035, + "step": 7983, + "time_per_iteration": 2.7794454097747803 + }, + { + "auxiliary_loss_clip": 0.01038182, + "auxiliary_loss_mlp": 0.00752618, + "balance_loss_clip": 1.0209136, + "balance_loss_mlp": 1.00015247, + "epoch": 0.4800240493010672, + "flos": 70979021521920.0, + "grad_norm": 0.8025064053403466, + "language_loss": 0.59461898, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.61252695, + "num_input_tokens_seen": 171659215, + "step": 7984, + "time_per_iteration": 3.1715712547302246 + }, + { + "auxiliary_loss_clip": 0.01068728, + "auxiliary_loss_mlp": 0.01050145, + "balance_loss_clip": 1.03732944, + "balance_loss_mlp": 1.03350329, + "epoch": 0.48008417255373514, + "flos": 17086656349440.0, + "grad_norm": 1.9659657952718743, + "language_loss": 0.66784835, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.68903708, + "num_input_tokens_seen": 171675710, + "step": 7985, + "time_per_iteration": 2.8482425212860107 + }, + { + "auxiliary_loss_clip": 0.01105712, + "auxiliary_loss_mlp": 0.010384, + "balance_loss_clip": 1.04168916, + "balance_loss_mlp": 1.02367198, + "epoch": 0.4801442958064031, + "flos": 15413435055360.0, + "grad_norm": 1.731655766205416, + "language_loss": 0.69907761, + "learning_rate": 2.225214340743835e-06, + "loss": 0.72051871, + "num_input_tokens_seen": 171692510, + "step": 7986, + "time_per_iteration": 2.78254771232605 + }, + { + "auxiliary_loss_clip": 0.01094439, + "auxiliary_loss_mlp": 0.0104069, + "balance_loss_clip": 1.04537976, + "balance_loss_mlp": 1.02534223, + "epoch": 0.4802044190590711, + "flos": 11473747441920.0, + "grad_norm": 2.3008677930118844, + "language_loss": 0.78930938, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.81066066, + "num_input_tokens_seen": 171710235, + "step": 7987, + "time_per_iteration": 2.8055880069732666 + }, + { + "auxiliary_loss_clip": 0.01076423, + "auxiliary_loss_mlp": 0.01042206, + "balance_loss_clip": 1.04216504, + "balance_loss_mlp": 1.02793634, + "epoch": 0.48026454231173904, + "flos": 20951003185920.0, + "grad_norm": 2.0041399034857537, + "language_loss": 0.75381374, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.77499998, + "num_input_tokens_seen": 171726715, + "step": 7988, + "time_per_iteration": 2.7931642532348633 + }, + { + "auxiliary_loss_clip": 0.01099185, + "auxiliary_loss_mlp": 0.01033353, + "balance_loss_clip": 1.04829884, + "balance_loss_mlp": 1.01920891, + "epoch": 0.48032466556440706, + "flos": 20448110822400.0, + "grad_norm": 2.2052350267481984, + "language_loss": 0.79056877, + "learning_rate": 2.224053348748365e-06, + "loss": 0.81189418, + "num_input_tokens_seen": 171743605, + "step": 7989, + "time_per_iteration": 2.7195966243743896 + }, + { + "auxiliary_loss_clip": 0.01109361, + "auxiliary_loss_mlp": 0.01046506, + "balance_loss_clip": 1.04376316, + "balance_loss_mlp": 1.03094292, + "epoch": 0.480384788817075, + "flos": 37120823861760.0, + "grad_norm": 1.9525154549019321, + "language_loss": 0.73684812, + "learning_rate": 2.223666334404724e-06, + "loss": 0.75840676, + "num_input_tokens_seen": 171765445, + "step": 7990, + "time_per_iteration": 2.8826913833618164 + }, + { + "auxiliary_loss_clip": 0.01039921, + "auxiliary_loss_mlp": 0.00752733, + "balance_loss_clip": 1.02231336, + "balance_loss_mlp": 1.00023639, + "epoch": 0.480444912069743, + "flos": 69552577641600.0, + "grad_norm": 0.7651324576674445, + "language_loss": 0.59016085, + "learning_rate": 2.223279311579633e-06, + "loss": 0.60808742, + "num_input_tokens_seen": 171830115, + "step": 7991, + "time_per_iteration": 3.325892448425293 + }, + { + "auxiliary_loss_clip": 0.01119355, + "auxiliary_loss_mlp": 0.00772289, + "balance_loss_clip": 1.04751837, + "balance_loss_mlp": 1.00058734, + "epoch": 0.48050503532241096, + "flos": 29822231640960.0, + "grad_norm": 2.03548436048953, + "language_loss": 0.67551184, + "learning_rate": 2.222892280287768e-06, + "loss": 0.69442832, + "num_input_tokens_seen": 171849135, + "step": 7992, + "time_per_iteration": 2.7717204093933105 + }, + { + "auxiliary_loss_clip": 0.01102719, + "auxiliary_loss_mlp": 0.01037971, + "balance_loss_clip": 1.04047358, + "balance_loss_mlp": 1.02267683, + "epoch": 0.4805651585750789, + "flos": 23948539015680.0, + "grad_norm": 1.7261557593206558, + "language_loss": 0.76166683, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.78307372, + "num_input_tokens_seen": 171868880, + "step": 7993, + "time_per_iteration": 2.739190101623535 + }, + { + "auxiliary_loss_clip": 0.01080291, + "auxiliary_loss_mlp": 0.01038498, + "balance_loss_clip": 1.04301596, + "balance_loss_mlp": 1.02469933, + "epoch": 0.4806252818277469, + "flos": 25665428269440.0, + "grad_norm": 1.8324818551458955, + "language_loss": 0.79029763, + "learning_rate": 2.222118192362422e-06, + "loss": 0.81148541, + "num_input_tokens_seen": 171889455, + "step": 7994, + "time_per_iteration": 2.775120973587036 + }, + { + "auxiliary_loss_clip": 0.01107812, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.04342794, + "balance_loss_mlp": 1.01851845, + "epoch": 0.48068540508041485, + "flos": 13151996640000.0, + "grad_norm": 2.168964016546684, + "language_loss": 0.79452056, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.81592381, + "num_input_tokens_seen": 171906070, + "step": 7995, + "time_per_iteration": 2.684086561203003 + }, + { + "auxiliary_loss_clip": 0.01071477, + "auxiliary_loss_mlp": 0.01034963, + "balance_loss_clip": 1.04075575, + "balance_loss_mlp": 1.02081871, + "epoch": 0.4807455283330828, + "flos": 21176738208000.0, + "grad_norm": 1.4272883159105954, + "language_loss": 0.82732481, + "learning_rate": 2.2213440707461e-06, + "loss": 0.84838915, + "num_input_tokens_seen": 171926515, + "step": 7996, + "time_per_iteration": 2.801893711090088 + }, + { + "auxiliary_loss_clip": 0.0105538, + "auxiliary_loss_mlp": 0.01038724, + "balance_loss_clip": 1.03635919, + "balance_loss_mlp": 1.02432358, + "epoch": 0.4808056515857508, + "flos": 12275991751680.0, + "grad_norm": 1.7665973767451764, + "language_loss": 0.81008822, + "learning_rate": 2.220956997340516e-06, + "loss": 0.8310293, + "num_input_tokens_seen": 171943845, + "step": 7997, + "time_per_iteration": 2.7309181690216064 + }, + { + "auxiliary_loss_clip": 0.01079437, + "auxiliary_loss_mlp": 0.0103905, + "balance_loss_clip": 1.04144287, + "balance_loss_mlp": 1.0246973, + "epoch": 0.48086577483841875, + "flos": 24826052275200.0, + "grad_norm": 4.4511101438837555, + "language_loss": 0.7285195, + "learning_rate": 2.220569915556221e-06, + "loss": 0.74970436, + "num_input_tokens_seen": 171964970, + "step": 7998, + "time_per_iteration": 2.793765068054199 + }, + { + "auxiliary_loss_clip": 0.01129175, + "auxiliary_loss_mlp": 0.01042213, + "balance_loss_clip": 1.04769647, + "balance_loss_mlp": 1.02756786, + "epoch": 0.4809258980910867, + "flos": 24465365856000.0, + "grad_norm": 1.6928626075088686, + "language_loss": 0.71266204, + "learning_rate": 2.220182825407892e-06, + "loss": 0.73437595, + "num_input_tokens_seen": 171986340, + "step": 7999, + "time_per_iteration": 2.698373556137085 + }, + { + "auxiliary_loss_clip": 0.01120573, + "auxiliary_loss_mlp": 0.01049678, + "balance_loss_clip": 1.04650939, + "balance_loss_mlp": 1.035707, + "epoch": 0.4809860213437547, + "flos": 21215952881280.0, + "grad_norm": 3.5525090623309525, + "language_loss": 0.71445537, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.73615789, + "num_input_tokens_seen": 172007300, + "step": 8000, + "time_per_iteration": 2.677906036376953 + }, + { + "auxiliary_loss_clip": 0.01120936, + "auxiliary_loss_mlp": 0.01045001, + "balance_loss_clip": 1.04962945, + "balance_loss_mlp": 1.03024244, + "epoch": 0.48104614459642264, + "flos": 37632084094080.0, + "grad_norm": 1.397364252260559, + "language_loss": 0.75031364, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.77197301, + "num_input_tokens_seen": 172029585, + "step": 8001, + "time_per_iteration": 2.8079638481140137 + }, + { + "auxiliary_loss_clip": 0.01120097, + "auxiliary_loss_mlp": 0.01045878, + "balance_loss_clip": 1.04740191, + "balance_loss_mlp": 1.03150105, + "epoch": 0.48110626784909066, + "flos": 18406122549120.0, + "grad_norm": 1.760961408245497, + "language_loss": 0.8157444, + "learning_rate": 2.219021504925493e-06, + "loss": 0.83740413, + "num_input_tokens_seen": 172047495, + "step": 8002, + "time_per_iteration": 2.6615140438079834 + }, + { + "auxiliary_loss_clip": 0.01127724, + "auxiliary_loss_mlp": 0.01043569, + "balance_loss_clip": 1.05275476, + "balance_loss_mlp": 1.02780938, + "epoch": 0.48116639110175863, + "flos": 28439814856320.0, + "grad_norm": 1.7356718355873448, + "language_loss": 0.71858382, + "learning_rate": 2.218634381467819e-06, + "loss": 0.74029678, + "num_input_tokens_seen": 172067625, + "step": 8003, + "time_per_iteration": 2.7304186820983887 + }, + { + "auxiliary_loss_clip": 0.01114781, + "auxiliary_loss_mlp": 0.01040333, + "balance_loss_clip": 1.04751146, + "balance_loss_mlp": 1.02654088, + "epoch": 0.4812265143544266, + "flos": 21725237865600.0, + "grad_norm": 1.7533221004579713, + "language_loss": 0.82598346, + "learning_rate": 2.218247249719507e-06, + "loss": 0.84753454, + "num_input_tokens_seen": 172087885, + "step": 8004, + "time_per_iteration": 2.718576192855835 + }, + { + "auxiliary_loss_clip": 0.01110853, + "auxiliary_loss_mlp": 0.01042863, + "balance_loss_clip": 1.04705787, + "balance_loss_mlp": 1.02601874, + "epoch": 0.48128663760709456, + "flos": 13224679810560.0, + "grad_norm": 2.3721289724239587, + "language_loss": 0.77786469, + "learning_rate": 2.217860109695239e-06, + "loss": 0.79940188, + "num_input_tokens_seen": 172105815, + "step": 8005, + "time_per_iteration": 2.7602009773254395 + }, + { + "auxiliary_loss_clip": 0.01116298, + "auxiliary_loss_mlp": 0.01040763, + "balance_loss_clip": 1.04861951, + "balance_loss_mlp": 1.02662444, + "epoch": 0.4813467608597625, + "flos": 24243437675520.0, + "grad_norm": 1.8330364183017236, + "language_loss": 0.70666707, + "learning_rate": 2.217472961409692e-06, + "loss": 0.72823763, + "num_input_tokens_seen": 172126125, + "step": 8006, + "time_per_iteration": 2.7916948795318604 + }, + { + "auxiliary_loss_clip": 0.01101733, + "auxiliary_loss_mlp": 0.01039864, + "balance_loss_clip": 1.04409337, + "balance_loss_mlp": 1.02521324, + "epoch": 0.4814068841124305, + "flos": 27480424544640.0, + "grad_norm": 1.7951056960252978, + "language_loss": 0.70724428, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.72866029, + "num_input_tokens_seen": 172141945, + "step": 8007, + "time_per_iteration": 2.7661349773406982 + }, + { + "auxiliary_loss_clip": 0.01130133, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.0476191, + "balance_loss_mlp": 1.01881254, + "epoch": 0.48146700736509845, + "flos": 19572896033280.0, + "grad_norm": 11.665968104344772, + "language_loss": 0.71553946, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.73716843, + "num_input_tokens_seen": 172161095, + "step": 8008, + "time_per_iteration": 2.7019124031066895 + }, + { + "auxiliary_loss_clip": 0.01096611, + "auxiliary_loss_mlp": 0.01050794, + "balance_loss_clip": 1.04696894, + "balance_loss_mlp": 1.03467739, + "epoch": 0.4815271306177664, + "flos": 20627771673600.0, + "grad_norm": 2.289909942865545, + "language_loss": 0.60779428, + "learning_rate": 2.216311467132199e-06, + "loss": 0.62926841, + "num_input_tokens_seen": 172178750, + "step": 8009, + "time_per_iteration": 2.713092088699341 + }, + { + "auxiliary_loss_clip": 0.01022233, + "auxiliary_loss_mlp": 0.01005627, + "balance_loss_clip": 1.02350807, + "balance_loss_mlp": 1.00431013, + "epoch": 0.4815872538704344, + "flos": 67691076232320.0, + "grad_norm": 0.8584252589427176, + "language_loss": 0.61326265, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.63354123, + "num_input_tokens_seen": 172240235, + "step": 8010, + "time_per_iteration": 3.2182729244232178 + }, + { + "auxiliary_loss_clip": 0.01123367, + "auxiliary_loss_mlp": 0.01044563, + "balance_loss_clip": 1.0506475, + "balance_loss_mlp": 1.02956653, + "epoch": 0.48164737712310235, + "flos": 22820764723200.0, + "grad_norm": 1.7901877328371896, + "language_loss": 0.73432398, + "learning_rate": 2.215537096576639e-06, + "loss": 0.75600326, + "num_input_tokens_seen": 172259875, + "step": 8011, + "time_per_iteration": 2.671487331390381 + }, + { + "auxiliary_loss_clip": 0.01103596, + "auxiliary_loss_mlp": 0.01035148, + "balance_loss_clip": 1.04422355, + "balance_loss_mlp": 1.02199948, + "epoch": 0.4817075003757703, + "flos": 23733865382400.0, + "grad_norm": 1.7774743588215727, + "language_loss": 0.79526579, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.81665325, + "num_input_tokens_seen": 172280150, + "step": 8012, + "time_per_iteration": 5.769195079803467 + }, + { + "auxiliary_loss_clip": 0.01092738, + "auxiliary_loss_mlp": 0.01042222, + "balance_loss_clip": 1.04738641, + "balance_loss_mlp": 1.02718425, + "epoch": 0.4817676236284383, + "flos": 28182909807360.0, + "grad_norm": 1.8494845342416013, + "language_loss": 0.73714077, + "learning_rate": 2.214762693328326e-06, + "loss": 0.75849032, + "num_input_tokens_seen": 172300810, + "step": 8013, + "time_per_iteration": 2.77451491355896 + }, + { + "auxiliary_loss_clip": 0.01105203, + "auxiliary_loss_mlp": 0.0103627, + "balance_loss_clip": 1.05056131, + "balance_loss_mlp": 1.02266848, + "epoch": 0.48182774688110624, + "flos": 17091756080640.0, + "grad_norm": 2.3240899529345858, + "language_loss": 0.90755451, + "learning_rate": 2.214375479481094e-06, + "loss": 0.92896926, + "num_input_tokens_seen": 172317930, + "step": 8014, + "time_per_iteration": 4.2677695751190186 + }, + { + "auxiliary_loss_clip": 0.0113526, + "auxiliary_loss_mlp": 0.01039779, + "balance_loss_clip": 1.04945207, + "balance_loss_mlp": 1.02497888, + "epoch": 0.4818878701337742, + "flos": 12567873669120.0, + "grad_norm": 3.070306284191698, + "language_loss": 0.7404421, + "learning_rate": 2.213988257504722e-06, + "loss": 0.76219249, + "num_input_tokens_seen": 172336340, + "step": 8015, + "time_per_iteration": 4.188862085342407 + }, + { + "auxiliary_loss_clip": 0.01113922, + "auxiliary_loss_mlp": 0.01040149, + "balance_loss_clip": 1.04792023, + "balance_loss_mlp": 1.02514613, + "epoch": 0.48194799338644223, + "flos": 24608505553920.0, + "grad_norm": 2.1594847398910164, + "language_loss": 0.80143541, + "learning_rate": 2.213601027413894e-06, + "loss": 0.82297611, + "num_input_tokens_seen": 172354315, + "step": 8016, + "time_per_iteration": 2.745352268218994 + }, + { + "auxiliary_loss_clip": 0.01115904, + "auxiliary_loss_mlp": 0.010317, + "balance_loss_clip": 1.04995775, + "balance_loss_mlp": 1.01803231, + "epoch": 0.4820081166391102, + "flos": 21105204272640.0, + "grad_norm": 1.9897571760317019, + "language_loss": 0.77120233, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.79267836, + "num_input_tokens_seen": 172372695, + "step": 8017, + "time_per_iteration": 2.7234907150268555 + }, + { + "auxiliary_loss_clip": 0.01117431, + "auxiliary_loss_mlp": 0.01033072, + "balance_loss_clip": 1.05067015, + "balance_loss_mlp": 1.01848102, + "epoch": 0.48206823989177816, + "flos": 25264593423360.0, + "grad_norm": 2.391907623354337, + "language_loss": 0.80211884, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.8236239, + "num_input_tokens_seen": 172390905, + "step": 8018, + "time_per_iteration": 2.805011749267578 + }, + { + "auxiliary_loss_clip": 0.01113573, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.05918038, + "balance_loss_mlp": 1.01767111, + "epoch": 0.4821283631444461, + "flos": 24645062620800.0, + "grad_norm": 1.818966225047076, + "language_loss": 0.75859058, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.78003991, + "num_input_tokens_seen": 172412295, + "step": 8019, + "time_per_iteration": 2.767993688583374 + }, + { + "auxiliary_loss_clip": 0.01092977, + "auxiliary_loss_mlp": 0.01036734, + "balance_loss_clip": 1.04580545, + "balance_loss_mlp": 1.02204108, + "epoch": 0.4821884863971141, + "flos": 23952094462080.0, + "grad_norm": 1.8745546244507358, + "language_loss": 0.7907865, + "learning_rate": 2.212052026199701e-06, + "loss": 0.8120836, + "num_input_tokens_seen": 172432625, + "step": 8020, + "time_per_iteration": 2.708779811859131 + }, + { + "auxiliary_loss_clip": 0.01127117, + "auxiliary_loss_mlp": 0.01036574, + "balance_loss_clip": 1.04847205, + "balance_loss_mlp": 1.02219176, + "epoch": 0.48224860964978206, + "flos": 17160668323200.0, + "grad_norm": 2.712415162483374, + "language_loss": 0.69893312, + "learning_rate": 2.211664755756855e-06, + "loss": 0.72057003, + "num_input_tokens_seen": 172450010, + "step": 8021, + "time_per_iteration": 2.6083900928497314 + }, + { + "auxiliary_loss_clip": 0.01102125, + "auxiliary_loss_mlp": 0.01031516, + "balance_loss_clip": 1.04406881, + "balance_loss_mlp": 1.01672244, + "epoch": 0.48230873290245, + "flos": 23075838178560.0, + "grad_norm": 1.7410194963021717, + "language_loss": 0.62778926, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.6491257, + "num_input_tokens_seen": 172469080, + "step": 8022, + "time_per_iteration": 2.677368640899658 + }, + { + "auxiliary_loss_clip": 0.01108316, + "auxiliary_loss_mlp": 0.00770954, + "balance_loss_clip": 1.04996586, + "balance_loss_mlp": 1.00044918, + "epoch": 0.482368856155118, + "flos": 19353517718400.0, + "grad_norm": 2.505400955117215, + "language_loss": 0.66446078, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.68325341, + "num_input_tokens_seen": 172484850, + "step": 8023, + "time_per_iteration": 2.6412739753723145 + }, + { + "auxiliary_loss_clip": 0.01054811, + "auxiliary_loss_mlp": 0.0104073, + "balance_loss_clip": 1.03875041, + "balance_loss_mlp": 1.02531052, + "epoch": 0.48242897940778595, + "flos": 20078984707200.0, + "grad_norm": 1.7010312143912936, + "language_loss": 0.76777267, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.78872806, + "num_input_tokens_seen": 172503525, + "step": 8024, + "time_per_iteration": 2.858891010284424 + }, + { + "auxiliary_loss_clip": 0.01109606, + "auxiliary_loss_mlp": 0.01039089, + "balance_loss_clip": 1.04908574, + "balance_loss_mlp": 1.02432442, + "epoch": 0.4824891026604539, + "flos": 23403989854080.0, + "grad_norm": 1.4778625856906076, + "language_loss": 0.75417542, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.77566242, + "num_input_tokens_seen": 172524360, + "step": 8025, + "time_per_iteration": 2.6743719577789307 + }, + { + "auxiliary_loss_clip": 0.01129031, + "auxiliary_loss_mlp": 0.01034028, + "balance_loss_clip": 1.04835987, + "balance_loss_mlp": 1.01994324, + "epoch": 0.4825492259131219, + "flos": 20368675895040.0, + "grad_norm": 1.785974704334164, + "language_loss": 0.71310222, + "learning_rate": 2.209728283441112e-06, + "loss": 0.73473275, + "num_input_tokens_seen": 172541480, + "step": 8026, + "time_per_iteration": 2.5739991664886475 + }, + { + "auxiliary_loss_clip": 0.01115668, + "auxiliary_loss_mlp": 0.01045724, + "balance_loss_clip": 1.04429471, + "balance_loss_mlp": 1.02949929, + "epoch": 0.48260934916578985, + "flos": 14319021519360.0, + "grad_norm": 2.0186797289800182, + "language_loss": 0.74956793, + "learning_rate": 2.209340965060465e-06, + "loss": 0.77118182, + "num_input_tokens_seen": 172559005, + "step": 8027, + "time_per_iteration": 2.7139828205108643 + }, + { + "auxiliary_loss_clip": 0.01105318, + "auxiliary_loss_mlp": 0.01037258, + "balance_loss_clip": 1.04597318, + "balance_loss_mlp": 1.02348971, + "epoch": 0.4826694724184578, + "flos": 22121152548480.0, + "grad_norm": 1.6779938031508344, + "language_loss": 0.67332339, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.69474924, + "num_input_tokens_seen": 172578435, + "step": 8028, + "time_per_iteration": 2.809757709503174 + }, + { + "auxiliary_loss_clip": 0.01105459, + "auxiliary_loss_mlp": 0.01039975, + "balance_loss_clip": 1.04472148, + "balance_loss_mlp": 1.02583039, + "epoch": 0.48272959567112583, + "flos": 16181169373440.0, + "grad_norm": 1.5400710398474027, + "language_loss": 0.72719157, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.7486459, + "num_input_tokens_seen": 172596095, + "step": 8029, + "time_per_iteration": 2.692643165588379 + }, + { + "auxiliary_loss_clip": 0.01103521, + "auxiliary_loss_mlp": 0.01033131, + "balance_loss_clip": 1.04666233, + "balance_loss_mlp": 1.01880252, + "epoch": 0.4827897189237938, + "flos": 23180445561600.0, + "grad_norm": 1.8484439777749806, + "language_loss": 0.84841061, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.86977708, + "num_input_tokens_seen": 172615255, + "step": 8030, + "time_per_iteration": 2.6717677116394043 + }, + { + "auxiliary_loss_clip": 0.01094989, + "auxiliary_loss_mlp": 0.01034714, + "balance_loss_clip": 1.04217124, + "balance_loss_mlp": 1.02120733, + "epoch": 0.48284984217646176, + "flos": 21652626522240.0, + "grad_norm": 2.0183604756392715, + "language_loss": 0.74026352, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.76156056, + "num_input_tokens_seen": 172633185, + "step": 8031, + "time_per_iteration": 2.640707015991211 + }, + { + "auxiliary_loss_clip": 0.01099826, + "auxiliary_loss_mlp": 0.0104306, + "balance_loss_clip": 1.04307055, + "balance_loss_mlp": 1.02747965, + "epoch": 0.48290996542912973, + "flos": 31467443304960.0, + "grad_norm": 1.5998759210668847, + "language_loss": 0.71785772, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.7392866, + "num_input_tokens_seen": 172654280, + "step": 8032, + "time_per_iteration": 2.803567886352539 + }, + { + "auxiliary_loss_clip": 0.0110819, + "auxiliary_loss_mlp": 0.01037978, + "balance_loss_clip": 1.04093766, + "balance_loss_mlp": 1.02310669, + "epoch": 0.4829700886817977, + "flos": 24461954064000.0, + "grad_norm": 1.7179702458807065, + "language_loss": 0.73965132, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.76111305, + "num_input_tokens_seen": 172675545, + "step": 8033, + "time_per_iteration": 2.7292799949645996 + }, + { + "auxiliary_loss_clip": 0.01073662, + "auxiliary_loss_mlp": 0.01036715, + "balance_loss_clip": 1.04669857, + "balance_loss_mlp": 1.0225054, + "epoch": 0.48303021193446566, + "flos": 25702164904320.0, + "grad_norm": 1.7431687715385025, + "language_loss": 0.83544624, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.85655004, + "num_input_tokens_seen": 172696455, + "step": 8034, + "time_per_iteration": 2.807359218597412 + }, + { + "auxiliary_loss_clip": 0.01095417, + "auxiliary_loss_mlp": 0.01031736, + "balance_loss_clip": 1.04668856, + "balance_loss_mlp": 1.01843822, + "epoch": 0.4830903351871336, + "flos": 20085233673600.0, + "grad_norm": 1.6936524854207098, + "language_loss": 0.79185474, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.81312621, + "num_input_tokens_seen": 172716720, + "step": 8035, + "time_per_iteration": 2.7641072273254395 + }, + { + "auxiliary_loss_clip": 0.01102103, + "auxiliary_loss_mlp": 0.00772882, + "balance_loss_clip": 1.04296494, + "balance_loss_mlp": 1.00034285, + "epoch": 0.4831504584398016, + "flos": 39452216014080.0, + "grad_norm": 1.8720500560152205, + "language_loss": 0.69804895, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.71679878, + "num_input_tokens_seen": 172737435, + "step": 8036, + "time_per_iteration": 2.8137052059173584 + }, + { + "auxiliary_loss_clip": 0.01112606, + "auxiliary_loss_mlp": 0.01031964, + "balance_loss_clip": 1.04274416, + "balance_loss_mlp": 1.01839805, + "epoch": 0.48321058169246955, + "flos": 20006588845440.0, + "grad_norm": 1.9208219105474362, + "language_loss": 0.72910142, + "learning_rate": 2.205467347074847e-06, + "loss": 0.75054711, + "num_input_tokens_seen": 172755700, + "step": 8037, + "time_per_iteration": 2.635277271270752 + }, + { + "auxiliary_loss_clip": 0.01078506, + "auxiliary_loss_mlp": 0.0104898, + "balance_loss_clip": 1.04335546, + "balance_loss_mlp": 1.03224301, + "epoch": 0.4832707049451375, + "flos": 20741465197440.0, + "grad_norm": 3.147603880487906, + "language_loss": 0.68890101, + "learning_rate": 2.205079942181525e-06, + "loss": 0.71017587, + "num_input_tokens_seen": 172775185, + "step": 8038, + "time_per_iteration": 2.782864570617676 + }, + { + "auxiliary_loss_clip": 0.01090364, + "auxiliary_loss_mlp": 0.01038251, + "balance_loss_clip": 1.04244566, + "balance_loss_mlp": 1.02438653, + "epoch": 0.4833308281978055, + "flos": 33145584762240.0, + "grad_norm": 1.8173480840244864, + "language_loss": 0.79258525, + "learning_rate": 2.20469252951155e-06, + "loss": 0.81387138, + "num_input_tokens_seen": 172796990, + "step": 8039, + "time_per_iteration": 2.7726707458496094 + }, + { + "auxiliary_loss_clip": 0.01115294, + "auxiliary_loss_mlp": 0.01034301, + "balance_loss_clip": 1.04610348, + "balance_loss_mlp": 1.02035379, + "epoch": 0.48339095145047345, + "flos": 19099234362240.0, + "grad_norm": 1.6327731998252513, + "language_loss": 0.77608567, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.79758161, + "num_input_tokens_seen": 172814915, + "step": 8040, + "time_per_iteration": 2.634373903274536 + }, + { + "auxiliary_loss_clip": 0.01117481, + "auxiliary_loss_mlp": 0.01034936, + "balance_loss_clip": 1.04517746, + "balance_loss_mlp": 1.02007651, + "epoch": 0.4834510747031414, + "flos": 34459448440320.0, + "grad_norm": 1.603418513383397, + "language_loss": 0.75737631, + "learning_rate": 2.203917680900409e-06, + "loss": 0.7789005, + "num_input_tokens_seen": 172837060, + "step": 8041, + "time_per_iteration": 2.7551445960998535 + }, + { + "auxiliary_loss_clip": 0.01089791, + "auxiliary_loss_mlp": 0.01038452, + "balance_loss_clip": 1.04363966, + "balance_loss_mlp": 1.02388501, + "epoch": 0.48351119795580944, + "flos": 27380845065600.0, + "grad_norm": 1.7873938615261085, + "language_loss": 0.6681267, + "learning_rate": 2.203530244988624e-06, + "loss": 0.6894092, + "num_input_tokens_seen": 172856545, + "step": 8042, + "time_per_iteration": 2.7318594455718994 + }, + { + "auxiliary_loss_clip": 0.01029662, + "auxiliary_loss_mlp": 0.0100431, + "balance_loss_clip": 1.0224936, + "balance_loss_mlp": 1.00289762, + "epoch": 0.4835713212084774, + "flos": 67143941291520.0, + "grad_norm": 0.6894070214322334, + "language_loss": 0.5854131, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.60575283, + "num_input_tokens_seen": 172923055, + "step": 8043, + "time_per_iteration": 3.2759408950805664 + }, + { + "auxiliary_loss_clip": 0.01104355, + "auxiliary_loss_mlp": 0.01041979, + "balance_loss_clip": 1.04400086, + "balance_loss_mlp": 1.02605903, + "epoch": 0.48363144446114537, + "flos": 17967473660160.0, + "grad_norm": 1.92903629391714, + "language_loss": 0.71673858, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.73820192, + "num_input_tokens_seen": 172940700, + "step": 8044, + "time_per_iteration": 2.6627197265625 + }, + { + "auxiliary_loss_clip": 0.01073602, + "auxiliary_loss_mlp": 0.01033421, + "balance_loss_clip": 1.04103553, + "balance_loss_mlp": 1.01863277, + "epoch": 0.48369156771381333, + "flos": 20593513077120.0, + "grad_norm": 1.3783700874379357, + "language_loss": 0.75982356, + "learning_rate": 2.202367891004714e-06, + "loss": 0.7808938, + "num_input_tokens_seen": 172961125, + "step": 8045, + "time_per_iteration": 2.7301156520843506 + }, + { + "auxiliary_loss_clip": 0.01083343, + "auxiliary_loss_mlp": 0.01040882, + "balance_loss_clip": 1.04626942, + "balance_loss_mlp": 1.02615929, + "epoch": 0.4837516909664813, + "flos": 22675075159680.0, + "grad_norm": 1.8085917066759625, + "language_loss": 0.70038342, + "learning_rate": 2.201980424309533e-06, + "loss": 0.72162569, + "num_input_tokens_seen": 172980405, + "step": 8046, + "time_per_iteration": 2.853160858154297 + }, + { + "auxiliary_loss_clip": 0.01127438, + "auxiliary_loss_mlp": 0.0103679, + "balance_loss_clip": 1.04603601, + "balance_loss_mlp": 1.02220488, + "epoch": 0.48381181421914926, + "flos": 25518625384320.0, + "grad_norm": 2.1605387354357193, + "language_loss": 0.82558095, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.84722322, + "num_input_tokens_seen": 172999105, + "step": 8047, + "time_per_iteration": 2.711172103881836 + }, + { + "auxiliary_loss_clip": 0.01095021, + "auxiliary_loss_mlp": 0.01034535, + "balance_loss_clip": 1.04198444, + "balance_loss_mlp": 1.02066541, + "epoch": 0.4838719374718172, + "flos": 24207491139840.0, + "grad_norm": 1.6956601095110444, + "language_loss": 0.80573416, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.82702971, + "num_input_tokens_seen": 173019935, + "step": 8048, + "time_per_iteration": 2.714733839035034 + }, + { + "auxiliary_loss_clip": 0.01119221, + "auxiliary_loss_mlp": 0.01039156, + "balance_loss_clip": 1.04571271, + "balance_loss_mlp": 1.02458251, + "epoch": 0.4839320607244852, + "flos": 26724577628160.0, + "grad_norm": 1.6136989522042802, + "language_loss": 0.81565118, + "learning_rate": 2.200817978328054e-06, + "loss": 0.83723497, + "num_input_tokens_seen": 173039700, + "step": 8049, + "time_per_iteration": 2.740396738052368 + }, + { + "auxiliary_loss_clip": 0.0110148, + "auxiliary_loss_mlp": 0.01032329, + "balance_loss_clip": 1.04652369, + "balance_loss_mlp": 1.01979959, + "epoch": 0.48399218397715316, + "flos": 20448900921600.0, + "grad_norm": 1.738899019363266, + "language_loss": 0.72696805, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.74830616, + "num_input_tokens_seen": 173059170, + "step": 8050, + "time_per_iteration": 2.671696424484253 + }, + { + "auxiliary_loss_clip": 0.01036049, + "auxiliary_loss_mlp": 0.00752282, + "balance_loss_clip": 1.01914835, + "balance_loss_mlp": 1.00025868, + "epoch": 0.4840523072298211, + "flos": 67180570185600.0, + "grad_norm": 0.6909377773009905, + "language_loss": 0.562814, + "learning_rate": 2.200042976240723e-06, + "loss": 0.5806973, + "num_input_tokens_seen": 173119000, + "step": 8051, + "time_per_iteration": 6.922944784164429 + }, + { + "auxiliary_loss_clip": 0.01088902, + "auxiliary_loss_mlp": 0.01035544, + "balance_loss_clip": 1.04290557, + "balance_loss_mlp": 1.0208869, + "epoch": 0.4841124304824891, + "flos": 22411490181120.0, + "grad_norm": 1.8410570377760342, + "language_loss": 0.75224304, + "learning_rate": 2.199655463811236e-06, + "loss": 0.77348751, + "num_input_tokens_seen": 173137570, + "step": 8052, + "time_per_iteration": 2.7672088146209717 + }, + { + "auxiliary_loss_clip": 0.01115072, + "auxiliary_loss_mlp": 0.01037343, + "balance_loss_clip": 1.04730511, + "balance_loss_mlp": 1.02388382, + "epoch": 0.48417255373515705, + "flos": 13843959217920.0, + "grad_norm": 2.7757616025011296, + "language_loss": 0.6599009, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.68142503, + "num_input_tokens_seen": 173154355, + "step": 8053, + "time_per_iteration": 2.7092020511627197 + }, + { + "auxiliary_loss_clip": 0.01118659, + "auxiliary_loss_mlp": 0.0103362, + "balance_loss_clip": 1.04970407, + "balance_loss_mlp": 1.02048898, + "epoch": 0.484232676987825, + "flos": 31649689935360.0, + "grad_norm": 1.9021914395644282, + "language_loss": 0.69075954, + "learning_rate": 2.198880416254091e-06, + "loss": 0.7122823, + "num_input_tokens_seen": 173174845, + "step": 8054, + "time_per_iteration": 5.934173583984375 + }, + { + "auxiliary_loss_clip": 0.01055753, + "auxiliary_loss_mlp": 0.01032099, + "balance_loss_clip": 1.03702974, + "balance_loss_mlp": 1.01789522, + "epoch": 0.48429280024049304, + "flos": 24095377814400.0, + "grad_norm": 1.7332498206286664, + "language_loss": 0.69624376, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.71712232, + "num_input_tokens_seen": 173195025, + "step": 8055, + "time_per_iteration": 2.811734676361084 + }, + { + "auxiliary_loss_clip": 0.01121016, + "auxiliary_loss_mlp": 0.01038771, + "balance_loss_clip": 1.04966474, + "balance_loss_mlp": 1.02396512, + "epoch": 0.484352923493161, + "flos": 17530081747200.0, + "grad_norm": 2.8015304711701154, + "language_loss": 0.63522434, + "learning_rate": 2.198105338530685e-06, + "loss": 0.6568222, + "num_input_tokens_seen": 173213065, + "step": 8056, + "time_per_iteration": 2.6111772060394287 + }, + { + "auxiliary_loss_clip": 0.01115568, + "auxiliary_loss_mlp": 0.01036144, + "balance_loss_clip": 1.04465592, + "balance_loss_mlp": 1.0212791, + "epoch": 0.48441304674582897, + "flos": 29166862043520.0, + "grad_norm": 2.044514393553715, + "language_loss": 0.67968506, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.70120221, + "num_input_tokens_seen": 173234545, + "step": 8057, + "time_per_iteration": 2.678311824798584 + }, + { + "auxiliary_loss_clip": 0.01089017, + "auxiliary_loss_mlp": 0.01041569, + "balance_loss_clip": 1.04114962, + "balance_loss_mlp": 1.02560723, + "epoch": 0.48447316999849693, + "flos": 15886701676800.0, + "grad_norm": 1.6304795591829788, + "language_loss": 0.8145591, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.83586496, + "num_input_tokens_seen": 173252175, + "step": 8058, + "time_per_iteration": 2.676553964614868 + }, + { + "auxiliary_loss_clip": 0.0111574, + "auxiliary_loss_mlp": 0.01037327, + "balance_loss_clip": 1.04488969, + "balance_loss_mlp": 1.02229452, + "epoch": 0.4845332932511649, + "flos": 24381405815040.0, + "grad_norm": 1.66967797618368, + "language_loss": 0.79851902, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.82004976, + "num_input_tokens_seen": 173268790, + "step": 8059, + "time_per_iteration": 2.672071933746338 + }, + { + "auxiliary_loss_clip": 0.01134552, + "auxiliary_loss_mlp": 0.010436, + "balance_loss_clip": 1.04998326, + "balance_loss_mlp": 1.02804327, + "epoch": 0.48459341650383286, + "flos": 37116478316160.0, + "grad_norm": 1.8700605031219397, + "language_loss": 0.6685822, + "learning_rate": 2.196555093055352e-06, + "loss": 0.69036371, + "num_input_tokens_seen": 173288030, + "step": 8060, + "time_per_iteration": 2.7481517791748047 + }, + { + "auxiliary_loss_clip": 0.01115717, + "auxiliary_loss_mlp": 0.01047797, + "balance_loss_clip": 1.04782832, + "balance_loss_mlp": 1.03283644, + "epoch": 0.48465353975650083, + "flos": 22966777509120.0, + "grad_norm": 1.918934253409618, + "language_loss": 0.67403054, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.69566566, + "num_input_tokens_seen": 173305965, + "step": 8061, + "time_per_iteration": 2.6991710662841797 + }, + { + "auxiliary_loss_clip": 0.01112971, + "auxiliary_loss_mlp": 0.01047446, + "balance_loss_clip": 1.0495888, + "balance_loss_mlp": 1.03176975, + "epoch": 0.4847136630091688, + "flos": 17707695523200.0, + "grad_norm": 2.027913918653662, + "language_loss": 0.82387316, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.84547728, + "num_input_tokens_seen": 173321985, + "step": 8062, + "time_per_iteration": 2.6427886486053467 + }, + { + "auxiliary_loss_clip": 0.01062707, + "auxiliary_loss_mlp": 0.0103913, + "balance_loss_clip": 1.04044425, + "balance_loss_mlp": 1.02433586, + "epoch": 0.48477378626183676, + "flos": 22018269018240.0, + "grad_norm": 1.5908761940571217, + "language_loss": 0.74599862, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.76701701, + "num_input_tokens_seen": 173341315, + "step": 8063, + "time_per_iteration": 2.767857313156128 + }, + { + "auxiliary_loss_clip": 0.01103538, + "auxiliary_loss_mlp": 0.01036681, + "balance_loss_clip": 1.04380846, + "balance_loss_mlp": 1.02177346, + "epoch": 0.4848339095145047, + "flos": 27962956874880.0, + "grad_norm": 1.679199539296889, + "language_loss": 0.7897141, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.81111628, + "num_input_tokens_seen": 173361055, + "step": 8064, + "time_per_iteration": 2.702838182449341 + }, + { + "auxiliary_loss_clip": 0.01127143, + "auxiliary_loss_mlp": 0.0077039, + "balance_loss_clip": 1.04982877, + "balance_loss_mlp": 1.00042999, + "epoch": 0.4848940327671727, + "flos": 21688752625920.0, + "grad_norm": 1.758395032785765, + "language_loss": 0.78960353, + "learning_rate": 2.194617118620173e-06, + "loss": 0.80857891, + "num_input_tokens_seen": 173379255, + "step": 8065, + "time_per_iteration": 2.6464266777038574 + }, + { + "auxiliary_loss_clip": 0.01109206, + "auxiliary_loss_mlp": 0.00771166, + "balance_loss_clip": 1.04239869, + "balance_loss_mlp": 1.00034332, + "epoch": 0.48495415601984065, + "flos": 20631578515200.0, + "grad_norm": 1.717828669503626, + "language_loss": 0.76373905, + "learning_rate": 2.194229501534644e-06, + "loss": 0.78254277, + "num_input_tokens_seen": 173398370, + "step": 8066, + "time_per_iteration": 2.622279405593872 + }, + { + "auxiliary_loss_clip": 0.01129705, + "auxiliary_loss_mlp": 0.01032468, + "balance_loss_clip": 1.05031133, + "balance_loss_mlp": 1.0188905, + "epoch": 0.4850142792725086, + "flos": 25628152930560.0, + "grad_norm": 1.606995638926956, + "language_loss": 0.7245208, + "learning_rate": 2.193841877083912e-06, + "loss": 0.74614257, + "num_input_tokens_seen": 173419595, + "step": 8067, + "time_per_iteration": 2.6863858699798584 + }, + { + "auxiliary_loss_clip": 0.01062315, + "auxiliary_loss_mlp": 0.01036403, + "balance_loss_clip": 1.04658556, + "balance_loss_mlp": 1.02155542, + "epoch": 0.4850744025251766, + "flos": 13771958405760.0, + "grad_norm": 2.9723717970034826, + "language_loss": 0.79098403, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.81197119, + "num_input_tokens_seen": 173435390, + "step": 8068, + "time_per_iteration": 2.736361503601074 + }, + { + "auxiliary_loss_clip": 0.01096742, + "auxiliary_loss_mlp": 0.01035763, + "balance_loss_clip": 1.04122019, + "balance_loss_mlp": 1.02254295, + "epoch": 0.4851345257778446, + "flos": 20261339078400.0, + "grad_norm": 1.4037595191012704, + "language_loss": 0.84329617, + "learning_rate": 2.193066606145638e-06, + "loss": 0.86462128, + "num_input_tokens_seen": 173454095, + "step": 8069, + "time_per_iteration": 2.6671814918518066 + }, + { + "auxiliary_loss_clip": 0.01091404, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.04400659, + "balance_loss_mlp": 1.01972818, + "epoch": 0.48519464903051257, + "flos": 27089681420160.0, + "grad_norm": 1.7638547734342187, + "language_loss": 0.78171504, + "learning_rate": 2.192678959687493e-06, + "loss": 0.80295968, + "num_input_tokens_seen": 173475300, + "step": 8070, + "time_per_iteration": 2.7715907096862793 + }, + { + "auxiliary_loss_clip": 0.01066151, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.04079247, + "balance_loss_mlp": 1.01808023, + "epoch": 0.48525477228318054, + "flos": 17127235739520.0, + "grad_norm": 1.9176398781406192, + "language_loss": 0.78054178, + "learning_rate": 2.192291305922943e-06, + "loss": 0.80152905, + "num_input_tokens_seen": 173492005, + "step": 8071, + "time_per_iteration": 2.7427566051483154 + }, + { + "auxiliary_loss_clip": 0.01063848, + "auxiliary_loss_mlp": 0.0103312, + "balance_loss_clip": 1.04013515, + "balance_loss_mlp": 1.01852274, + "epoch": 0.4853148955358485, + "flos": 28180324028160.0, + "grad_norm": 1.9286974806008035, + "language_loss": 0.72312587, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.7440955, + "num_input_tokens_seen": 173511995, + "step": 8072, + "time_per_iteration": 2.8457834720611572 + }, + { + "auxiliary_loss_clip": 0.01077736, + "auxiliary_loss_mlp": 0.01038365, + "balance_loss_clip": 1.04195118, + "balance_loss_mlp": 1.02361333, + "epoch": 0.48537501878851647, + "flos": 17493309198720.0, + "grad_norm": 2.206546835183074, + "language_loss": 0.87933266, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.90049368, + "num_input_tokens_seen": 173530215, + "step": 8073, + "time_per_iteration": 2.7190656661987305 + }, + { + "auxiliary_loss_clip": 0.01081944, + "auxiliary_loss_mlp": 0.01041597, + "balance_loss_clip": 1.03932655, + "balance_loss_mlp": 1.02555168, + "epoch": 0.48543514204118443, + "flos": 28584857975040.0, + "grad_norm": 1.6453725477912577, + "language_loss": 0.60954368, + "learning_rate": 2.19112830093786e-06, + "loss": 0.63077909, + "num_input_tokens_seen": 173550920, + "step": 8074, + "time_per_iteration": 2.757408857345581 + }, + { + "auxiliary_loss_clip": 0.01088022, + "auxiliary_loss_mlp": 0.00773092, + "balance_loss_clip": 1.0409627, + "balance_loss_mlp": 1.00044906, + "epoch": 0.4854952652938524, + "flos": 20959981585920.0, + "grad_norm": 1.6130644581425704, + "language_loss": 0.735416, + "learning_rate": 2.19074061809469e-06, + "loss": 0.75402713, + "num_input_tokens_seen": 173569065, + "step": 8075, + "time_per_iteration": 2.8191847801208496 + }, + { + "auxiliary_loss_clip": 0.01121809, + "auxiliary_loss_mlp": 0.01039314, + "balance_loss_clip": 1.04537582, + "balance_loss_mlp": 1.02567613, + "epoch": 0.48555538854652036, + "flos": 66529543155840.0, + "grad_norm": 2.2867687714704665, + "language_loss": 0.81751764, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.83912885, + "num_input_tokens_seen": 173596085, + "step": 8076, + "time_per_iteration": 3.0270113945007324 + }, + { + "auxiliary_loss_clip": 0.01107841, + "auxiliary_loss_mlp": 0.01038327, + "balance_loss_clip": 1.04600549, + "balance_loss_mlp": 1.02161372, + "epoch": 0.4856155117991883, + "flos": 15924982596480.0, + "grad_norm": 2.702312951735234, + "language_loss": 0.86105502, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.88251674, + "num_input_tokens_seen": 173613900, + "step": 8077, + "time_per_iteration": 2.6272876262664795 + }, + { + "auxiliary_loss_clip": 0.01006449, + "auxiliary_loss_mlp": 0.0100721, + "balance_loss_clip": 1.01856184, + "balance_loss_mlp": 1.00564885, + "epoch": 0.4856756350518563, + "flos": 71047395060480.0, + "grad_norm": 0.8998346956373826, + "language_loss": 0.58465588, + "learning_rate": 2.189577526226564e-06, + "loss": 0.60479248, + "num_input_tokens_seen": 173671305, + "step": 8078, + "time_per_iteration": 3.254561424255371 + }, + { + "auxiliary_loss_clip": 0.01132159, + "auxiliary_loss_mlp": 0.01033911, + "balance_loss_clip": 1.04961872, + "balance_loss_mlp": 1.01946878, + "epoch": 0.48573575830452426, + "flos": 29825679346560.0, + "grad_norm": 1.7198368274974891, + "language_loss": 0.72365242, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.74531311, + "num_input_tokens_seen": 173692070, + "step": 8079, + "time_per_iteration": 2.6532506942749023 + }, + { + "auxiliary_loss_clip": 0.01088509, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.04440176, + "balance_loss_mlp": 1.01868141, + "epoch": 0.4857958815571922, + "flos": 17639501552640.0, + "grad_norm": 2.749999314487442, + "language_loss": 0.79557705, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.81678975, + "num_input_tokens_seen": 173709785, + "step": 8080, + "time_per_iteration": 2.6242940425872803 + }, + { + "auxiliary_loss_clip": 0.01097632, + "auxiliary_loss_mlp": 0.01033589, + "balance_loss_clip": 1.04023981, + "balance_loss_mlp": 1.01881862, + "epoch": 0.4858560048098602, + "flos": 21105491581440.0, + "grad_norm": 1.9603729393952303, + "language_loss": 0.84016395, + "learning_rate": 2.188414369659251e-06, + "loss": 0.86147618, + "num_input_tokens_seen": 173728770, + "step": 8081, + "time_per_iteration": 2.6701998710632324 + }, + { + "auxiliary_loss_clip": 0.01110096, + "auxiliary_loss_mlp": 0.01036956, + "balance_loss_clip": 1.04121375, + "balance_loss_mlp": 1.02081513, + "epoch": 0.4859161280625282, + "flos": 22090844448000.0, + "grad_norm": 1.4026106187948555, + "language_loss": 0.83353597, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.85500646, + "num_input_tokens_seen": 173747355, + "step": 8082, + "time_per_iteration": 2.6535134315490723 + }, + { + "auxiliary_loss_clip": 0.01102933, + "auxiliary_loss_mlp": 0.01034217, + "balance_loss_clip": 1.04525304, + "balance_loss_mlp": 1.02117527, + "epoch": 0.4859762513151962, + "flos": 17493452853120.0, + "grad_norm": 1.9462739217424578, + "language_loss": 0.87314546, + "learning_rate": 2.187638896199746e-06, + "loss": 0.89451694, + "num_input_tokens_seen": 173764825, + "step": 8083, + "time_per_iteration": 2.6324520111083984 + }, + { + "auxiliary_loss_clip": 0.01080799, + "auxiliary_loss_mlp": 0.01047109, + "balance_loss_clip": 1.04719186, + "balance_loss_mlp": 1.03410375, + "epoch": 0.48603637456786414, + "flos": 18004246208640.0, + "grad_norm": 1.6025248177358018, + "language_loss": 0.80759108, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.82887018, + "num_input_tokens_seen": 173783215, + "step": 8084, + "time_per_iteration": 2.679032325744629 + }, + { + "auxiliary_loss_clip": 0.01114846, + "auxiliary_loss_mlp": 0.01035804, + "balance_loss_clip": 1.04544878, + "balance_loss_mlp": 1.02149308, + "epoch": 0.4860964978205321, + "flos": 22492038430080.0, + "grad_norm": 1.9539653340908196, + "language_loss": 0.68145066, + "learning_rate": 2.186863394279098e-06, + "loss": 0.70295715, + "num_input_tokens_seen": 173801905, + "step": 8085, + "time_per_iteration": 2.6305296421051025 + }, + { + "auxiliary_loss_clip": 0.01113875, + "auxiliary_loss_mlp": 0.01040894, + "balance_loss_clip": 1.04487717, + "balance_loss_mlp": 1.02714896, + "epoch": 0.48615662107320007, + "flos": 23372532518400.0, + "grad_norm": 1.3763064439222144, + "language_loss": 0.77494752, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.79649526, + "num_input_tokens_seen": 173824690, + "step": 8086, + "time_per_iteration": 2.6941890716552734 + }, + { + "auxiliary_loss_clip": 0.01125139, + "auxiliary_loss_mlp": 0.01028743, + "balance_loss_clip": 1.04536629, + "balance_loss_mlp": 1.01461661, + "epoch": 0.48621674432586803, + "flos": 34418833136640.0, + "grad_norm": 2.3947564981199347, + "language_loss": 0.7014342, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.72297299, + "num_input_tokens_seen": 173844450, + "step": 8087, + "time_per_iteration": 2.7329354286193848 + }, + { + "auxiliary_loss_clip": 0.01119086, + "auxiliary_loss_mlp": 0.01040298, + "balance_loss_clip": 1.04627323, + "balance_loss_mlp": 1.0251466, + "epoch": 0.486276867578536, + "flos": 33107555237760.0, + "grad_norm": 1.710106545545042, + "language_loss": 0.72521967, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.74681354, + "num_input_tokens_seen": 173864975, + "step": 8088, + "time_per_iteration": 2.747058391571045 + }, + { + "auxiliary_loss_clip": 0.01103115, + "auxiliary_loss_mlp": 0.01037287, + "balance_loss_clip": 1.04365635, + "balance_loss_mlp": 1.02306569, + "epoch": 0.48633699083120396, + "flos": 21470703114240.0, + "grad_norm": 1.7297894528285667, + "language_loss": 0.7543239, + "learning_rate": 2.185312305524892e-06, + "loss": 0.77572793, + "num_input_tokens_seen": 173883805, + "step": 8089, + "time_per_iteration": 2.6639740467071533 + }, + { + "auxiliary_loss_clip": 0.01092992, + "auxiliary_loss_mlp": 0.01031661, + "balance_loss_clip": 1.04379344, + "balance_loss_mlp": 1.01733255, + "epoch": 0.48639711408387193, + "flos": 20084335833600.0, + "grad_norm": 1.6351614757671693, + "language_loss": 0.84245062, + "learning_rate": 2.184924515731926e-06, + "loss": 0.86369717, + "num_input_tokens_seen": 173903520, + "step": 8090, + "time_per_iteration": 4.404139757156372 + }, + { + "auxiliary_loss_clip": 0.01122239, + "auxiliary_loss_mlp": 0.01033955, + "balance_loss_clip": 1.04544723, + "balance_loss_mlp": 1.0203594, + "epoch": 0.4864572373365399, + "flos": 20778884190720.0, + "grad_norm": 1.7197214823091769, + "language_loss": 0.76290631, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.78446829, + "num_input_tokens_seen": 173924255, + "step": 8091, + "time_per_iteration": 2.7133665084838867 + }, + { + "auxiliary_loss_clip": 0.01115621, + "auxiliary_loss_mlp": 0.01029044, + "balance_loss_clip": 1.04440069, + "balance_loss_mlp": 1.01553202, + "epoch": 0.48651736058920786, + "flos": 26025360503040.0, + "grad_norm": 1.4953838782762103, + "language_loss": 0.80510461, + "learning_rate": 2.184148915123631e-06, + "loss": 0.82655126, + "num_input_tokens_seen": 173943285, + "step": 8092, + "time_per_iteration": 2.682349920272827 + }, + { + "auxiliary_loss_clip": 0.0110052, + "auxiliary_loss_mlp": 0.00775072, + "balance_loss_clip": 1.04398346, + "balance_loss_mlp": 1.00031447, + "epoch": 0.4865774838418758, + "flos": 20485601642880.0, + "grad_norm": 1.434156215667662, + "language_loss": 0.71867287, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.73742878, + "num_input_tokens_seen": 173962205, + "step": 8093, + "time_per_iteration": 5.686015367507935 + }, + { + "auxiliary_loss_clip": 0.01123791, + "auxiliary_loss_mlp": 0.01034202, + "balance_loss_clip": 1.04521751, + "balance_loss_mlp": 1.02074885, + "epoch": 0.4866376070945438, + "flos": 23547704169600.0, + "grad_norm": 1.581585117496142, + "language_loss": 0.67704266, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.69862258, + "num_input_tokens_seen": 173980945, + "step": 8094, + "time_per_iteration": 2.5890355110168457 + }, + { + "auxiliary_loss_clip": 0.01109259, + "auxiliary_loss_mlp": 0.01038119, + "balance_loss_clip": 1.04752278, + "balance_loss_mlp": 1.02342701, + "epoch": 0.4866977303472118, + "flos": 16690598012160.0, + "grad_norm": 2.317379685093866, + "language_loss": 0.66784161, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.68931544, + "num_input_tokens_seen": 173998860, + "step": 8095, + "time_per_iteration": 2.640468120574951 + }, + { + "auxiliary_loss_clip": 0.01110152, + "auxiliary_loss_mlp": 0.0103636, + "balance_loss_clip": 1.04456031, + "balance_loss_mlp": 1.02123296, + "epoch": 0.4867578535998798, + "flos": 17896011552000.0, + "grad_norm": 2.1481069791390346, + "language_loss": 0.78540075, + "learning_rate": 2.182597630229345e-06, + "loss": 0.80686581, + "num_input_tokens_seen": 174016665, + "step": 8096, + "time_per_iteration": 2.585015058517456 + }, + { + "auxiliary_loss_clip": 0.01092726, + "auxiliary_loss_mlp": 0.01036143, + "balance_loss_clip": 1.03732872, + "balance_loss_mlp": 1.02165902, + "epoch": 0.48681797685254774, + "flos": 22637799820800.0, + "grad_norm": 1.880706326191671, + "language_loss": 0.67753577, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.69882447, + "num_input_tokens_seen": 174034800, + "step": 8097, + "time_per_iteration": 2.6526336669921875 + }, + { + "auxiliary_loss_clip": 0.01097124, + "auxiliary_loss_mlp": 0.01039294, + "balance_loss_clip": 1.04311764, + "balance_loss_mlp": 1.02491093, + "epoch": 0.4868781001052157, + "flos": 20886077352960.0, + "grad_norm": 1.6144910396326548, + "language_loss": 0.71414316, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.73550731, + "num_input_tokens_seen": 174054445, + "step": 8098, + "time_per_iteration": 2.6669986248016357 + }, + { + "auxiliary_loss_clip": 0.01119656, + "auxiliary_loss_mlp": 0.01037345, + "balance_loss_clip": 1.04642081, + "balance_loss_mlp": 1.02226543, + "epoch": 0.48693822335788367, + "flos": 41974940937600.0, + "grad_norm": 2.9804894060925458, + "language_loss": 0.66267806, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.68424809, + "num_input_tokens_seen": 174077890, + "step": 8099, + "time_per_iteration": 2.7542026042938232 + }, + { + "auxiliary_loss_clip": 0.01070284, + "auxiliary_loss_mlp": 0.01040695, + "balance_loss_clip": 1.0372566, + "balance_loss_mlp": 1.02712917, + "epoch": 0.48699834661055164, + "flos": 24243294021120.0, + "grad_norm": 1.700994432394141, + "language_loss": 0.66787708, + "learning_rate": 2.181046234549138e-06, + "loss": 0.6889869, + "num_input_tokens_seen": 174097460, + "step": 8100, + "time_per_iteration": 2.7499735355377197 + }, + { + "auxiliary_loss_clip": 0.01087635, + "auxiliary_loss_mlp": 0.01033762, + "balance_loss_clip": 1.04155445, + "balance_loss_mlp": 1.02084517, + "epoch": 0.4870584698632196, + "flos": 25923877603200.0, + "grad_norm": 1.427277688843355, + "language_loss": 0.76812327, + "learning_rate": 2.180658368429088e-06, + "loss": 0.78933728, + "num_input_tokens_seen": 174120775, + "step": 8101, + "time_per_iteration": 2.7710418701171875 + }, + { + "auxiliary_loss_clip": 0.010432, + "auxiliary_loss_mlp": 0.00999689, + "balance_loss_clip": 1.01742899, + "balance_loss_mlp": 0.99847281, + "epoch": 0.48711859311588757, + "flos": 70211933648640.0, + "grad_norm": 0.6877166097191185, + "language_loss": 0.52341712, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.54384601, + "num_input_tokens_seen": 174189135, + "step": 8102, + "time_per_iteration": 3.3232975006103516 + }, + { + "auxiliary_loss_clip": 0.0109639, + "auxiliary_loss_mlp": 0.0103608, + "balance_loss_clip": 1.04584694, + "balance_loss_mlp": 1.02250218, + "epoch": 0.48717871636855553, + "flos": 12342964659840.0, + "grad_norm": 2.1242457938350885, + "language_loss": 0.7405737, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.7618984, + "num_input_tokens_seen": 174203250, + "step": 8103, + "time_per_iteration": 2.6988277435302734 + }, + { + "auxiliary_loss_clip": 0.01116672, + "auxiliary_loss_mlp": 0.01043644, + "balance_loss_clip": 1.04631233, + "balance_loss_mlp": 1.0288384, + "epoch": 0.4872388396212235, + "flos": 23477139901440.0, + "grad_norm": 1.6106517558680102, + "language_loss": 0.63064033, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.65224349, + "num_input_tokens_seen": 174224145, + "step": 8104, + "time_per_iteration": 2.629725456237793 + }, + { + "auxiliary_loss_clip": 0.01125564, + "auxiliary_loss_mlp": 0.01032477, + "balance_loss_clip": 1.04695344, + "balance_loss_mlp": 1.01885152, + "epoch": 0.48729896287389146, + "flos": 31427582186880.0, + "grad_norm": 2.7588286364308217, + "language_loss": 0.69136071, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.71294117, + "num_input_tokens_seen": 174244435, + "step": 8105, + "time_per_iteration": 2.6670045852661133 + }, + { + "auxiliary_loss_clip": 0.01084626, + "auxiliary_loss_mlp": 0.01030665, + "balance_loss_clip": 1.04264283, + "balance_loss_mlp": 1.01766491, + "epoch": 0.4873590861265594, + "flos": 19057936700160.0, + "grad_norm": 2.072109036230495, + "language_loss": 0.73534381, + "learning_rate": 2.178718935364259e-06, + "loss": 0.75649679, + "num_input_tokens_seen": 174262710, + "step": 8106, + "time_per_iteration": 2.679194927215576 + }, + { + "auxiliary_loss_clip": 0.01107932, + "auxiliary_loss_mlp": 0.00772241, + "balance_loss_clip": 1.04675412, + "balance_loss_mlp": 1.00038791, + "epoch": 0.4874192093792274, + "flos": 24348296453760.0, + "grad_norm": 2.6438945384360157, + "language_loss": 0.76877642, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.78757817, + "num_input_tokens_seen": 174281545, + "step": 8107, + "time_per_iteration": 2.6732285022735596 + }, + { + "auxiliary_loss_clip": 0.01071333, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.04327512, + "balance_loss_mlp": 1.01932359, + "epoch": 0.4874793326318954, + "flos": 23112610727040.0, + "grad_norm": 3.5135482389125583, + "language_loss": 0.75034302, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.77137709, + "num_input_tokens_seen": 174300290, + "step": 8108, + "time_per_iteration": 2.8071932792663574 + }, + { + "auxiliary_loss_clip": 0.01111368, + "auxiliary_loss_mlp": 0.01030979, + "balance_loss_clip": 1.04524517, + "balance_loss_mlp": 1.01917136, + "epoch": 0.4875394558845634, + "flos": 19026156142080.0, + "grad_norm": 1.7033835018380465, + "language_loss": 0.73611033, + "learning_rate": 2.177555194083212e-06, + "loss": 0.75753379, + "num_input_tokens_seen": 174318490, + "step": 8109, + "time_per_iteration": 2.642854928970337 + }, + { + "auxiliary_loss_clip": 0.01108586, + "auxiliary_loss_mlp": 0.01031639, + "balance_loss_clip": 1.04274952, + "balance_loss_mlp": 1.01813245, + "epoch": 0.48759957913723134, + "flos": 21433607343360.0, + "grad_norm": 1.8383730211114537, + "language_loss": 0.78698927, + "learning_rate": 2.177167266837428e-06, + "loss": 0.80839157, + "num_input_tokens_seen": 174335505, + "step": 8110, + "time_per_iteration": 2.6471641063690186 + }, + { + "auxiliary_loss_clip": 0.01114056, + "auxiliary_loss_mlp": 0.01041552, + "balance_loss_clip": 1.04712057, + "balance_loss_mlp": 1.02802181, + "epoch": 0.4876597023898993, + "flos": 17748669962880.0, + "grad_norm": 1.8514316559502986, + "language_loss": 0.72086185, + "learning_rate": 2.176779332873444e-06, + "loss": 0.74241793, + "num_input_tokens_seen": 174353990, + "step": 8111, + "time_per_iteration": 2.6277401447296143 + }, + { + "auxiliary_loss_clip": 0.01113402, + "auxiliary_loss_mlp": 0.01036579, + "balance_loss_clip": 1.04676926, + "balance_loss_mlp": 1.02329946, + "epoch": 0.4877198256425673, + "flos": 17019647527680.0, + "grad_norm": 1.5795214961704311, + "language_loss": 0.76318377, + "learning_rate": 2.17639139220597e-06, + "loss": 0.78468353, + "num_input_tokens_seen": 174373425, + "step": 8112, + "time_per_iteration": 2.598010301589966 + }, + { + "auxiliary_loss_clip": 0.01117365, + "auxiliary_loss_mlp": 0.01038377, + "balance_loss_clip": 1.04562628, + "balance_loss_mlp": 1.02425683, + "epoch": 0.48777994889523524, + "flos": 22384091082240.0, + "grad_norm": 1.710789031048389, + "language_loss": 0.75035822, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.77191567, + "num_input_tokens_seen": 174393070, + "step": 8113, + "time_per_iteration": 2.6348531246185303 + }, + { + "auxiliary_loss_clip": 0.01028141, + "auxiliary_loss_mlp": 0.0075288, + "balance_loss_clip": 1.02038229, + "balance_loss_mlp": 1.0004046, + "epoch": 0.4878400721479032, + "flos": 61241772159360.0, + "grad_norm": 0.77879843500845, + "language_loss": 0.4887349, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.50654507, + "num_input_tokens_seen": 174446880, + "step": 8114, + "time_per_iteration": 3.1273062229156494 + }, + { + "auxiliary_loss_clip": 0.0109717, + "auxiliary_loss_mlp": 0.01040496, + "balance_loss_clip": 1.04649258, + "balance_loss_mlp": 1.02591658, + "epoch": 0.48790019540057117, + "flos": 24536612482560.0, + "grad_norm": 1.616579350296871, + "language_loss": 0.76760268, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.78897941, + "num_input_tokens_seen": 174468485, + "step": 8115, + "time_per_iteration": 2.759444236755371 + }, + { + "auxiliary_loss_clip": 0.01107443, + "auxiliary_loss_mlp": 0.01033169, + "balance_loss_clip": 1.0478245, + "balance_loss_mlp": 1.01930535, + "epoch": 0.48796031865323913, + "flos": 21833939399040.0, + "grad_norm": 2.031601085778298, + "language_loss": 0.71910083, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.74050689, + "num_input_tokens_seen": 174486360, + "step": 8116, + "time_per_iteration": 2.7063751220703125 + }, + { + "auxiliary_loss_clip": 0.01088547, + "auxiliary_loss_mlp": 0.01035995, + "balance_loss_clip": 1.04164481, + "balance_loss_mlp": 1.02276874, + "epoch": 0.4880204419059071, + "flos": 18588907883520.0, + "grad_norm": 3.4734402196051, + "language_loss": 0.63002747, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.65127283, + "num_input_tokens_seen": 174505075, + "step": 8117, + "time_per_iteration": 2.713792562484741 + }, + { + "auxiliary_loss_clip": 0.01093551, + "auxiliary_loss_mlp": 0.01042447, + "balance_loss_clip": 1.04097366, + "balance_loss_mlp": 1.02740264, + "epoch": 0.48808056515857506, + "flos": 19172168928000.0, + "grad_norm": 1.6679530296862457, + "language_loss": 0.79487926, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.81623924, + "num_input_tokens_seen": 174523385, + "step": 8118, + "time_per_iteration": 2.6479125022888184 + }, + { + "auxiliary_loss_clip": 0.01102071, + "auxiliary_loss_mlp": 0.01036823, + "balance_loss_clip": 1.04363036, + "balance_loss_mlp": 1.02295303, + "epoch": 0.48814068841124303, + "flos": 20120497850880.0, + "grad_norm": 1.8682176240686432, + "language_loss": 0.6328088, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.65419775, + "num_input_tokens_seen": 174542200, + "step": 8119, + "time_per_iteration": 2.6599643230438232 + }, + { + "auxiliary_loss_clip": 0.01061047, + "auxiliary_loss_mlp": 0.00770426, + "balance_loss_clip": 1.04209542, + "balance_loss_mlp": 1.00037444, + "epoch": 0.488200811663911, + "flos": 22965592360320.0, + "grad_norm": 1.676805190577927, + "language_loss": 0.72166741, + "learning_rate": 2.173287627305878e-06, + "loss": 0.73998219, + "num_input_tokens_seen": 174563620, + "step": 8120, + "time_per_iteration": 2.795185089111328 + }, + { + "auxiliary_loss_clip": 0.01118613, + "auxiliary_loss_mlp": 0.01031295, + "balance_loss_clip": 1.0469954, + "balance_loss_mlp": 1.01728177, + "epoch": 0.48826093491657896, + "flos": 33910697387520.0, + "grad_norm": 2.388334225725702, + "language_loss": 0.63951784, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.66101694, + "num_input_tokens_seen": 174586465, + "step": 8121, + "time_per_iteration": 2.7527153491973877 + }, + { + "auxiliary_loss_clip": 0.01112786, + "auxiliary_loss_mlp": 0.01036976, + "balance_loss_clip": 1.04261351, + "balance_loss_mlp": 1.02283835, + "epoch": 0.488321058169247, + "flos": 23070307484160.0, + "grad_norm": 1.985568603421553, + "language_loss": 0.82805705, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.84955472, + "num_input_tokens_seen": 174604035, + "step": 8122, + "time_per_iteration": 2.668754816055298 + }, + { + "auxiliary_loss_clip": 0.0111403, + "auxiliary_loss_mlp": 0.01043394, + "balance_loss_clip": 1.04526711, + "balance_loss_mlp": 1.02749181, + "epoch": 0.48838118142191494, + "flos": 19317714837120.0, + "grad_norm": 1.7149683973709622, + "language_loss": 0.85272485, + "learning_rate": 2.172123606640866e-06, + "loss": 0.87429905, + "num_input_tokens_seen": 174621715, + "step": 8123, + "time_per_iteration": 2.6014883518218994 + }, + { + "auxiliary_loss_clip": 0.01090574, + "auxiliary_loss_mlp": 0.01031767, + "balance_loss_clip": 1.04448855, + "balance_loss_mlp": 1.0185523, + "epoch": 0.4884413046745829, + "flos": 25410678036480.0, + "grad_norm": 1.3909354864913257, + "language_loss": 0.85614896, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.87737238, + "num_input_tokens_seen": 174643835, + "step": 8124, + "time_per_iteration": 2.754786968231201 + }, + { + "auxiliary_loss_clip": 0.01103222, + "auxiliary_loss_mlp": 0.01031579, + "balance_loss_clip": 1.04439664, + "balance_loss_mlp": 1.0179534, + "epoch": 0.4885014279272509, + "flos": 20991546662400.0, + "grad_norm": 1.926010658269172, + "language_loss": 0.79547518, + "learning_rate": 2.171347560204948e-06, + "loss": 0.81682324, + "num_input_tokens_seen": 174660955, + "step": 8125, + "time_per_iteration": 2.667335271835327 + }, + { + "auxiliary_loss_clip": 0.01078395, + "auxiliary_loss_mlp": 0.01040727, + "balance_loss_clip": 1.04347515, + "balance_loss_mlp": 1.0263145, + "epoch": 0.48856155117991884, + "flos": 13771599269760.0, + "grad_norm": 2.02778788313487, + "language_loss": 0.72584462, + "learning_rate": 2.170959527233356e-06, + "loss": 0.74703586, + "num_input_tokens_seen": 174678270, + "step": 8126, + "time_per_iteration": 2.7370314598083496 + }, + { + "auxiliary_loss_clip": 0.0111111, + "auxiliary_loss_mlp": 0.01038149, + "balance_loss_clip": 1.0410614, + "balance_loss_mlp": 1.02405286, + "epoch": 0.4886216744325868, + "flos": 32087764206720.0, + "grad_norm": 1.7703486674415694, + "language_loss": 0.68917644, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.71066898, + "num_input_tokens_seen": 174698360, + "step": 8127, + "time_per_iteration": 2.811074733734131 + }, + { + "auxiliary_loss_clip": 0.01125381, + "auxiliary_loss_mlp": 0.01033584, + "balance_loss_clip": 1.04334533, + "balance_loss_mlp": 1.01993454, + "epoch": 0.48868179768525477, + "flos": 19610063631360.0, + "grad_norm": 1.5960676368468543, + "language_loss": 0.76178646, + "learning_rate": 2.170183441856481e-06, + "loss": 0.78337616, + "num_input_tokens_seen": 174716755, + "step": 8128, + "time_per_iteration": 2.5751638412475586 + }, + { + "auxiliary_loss_clip": 0.01126548, + "auxiliary_loss_mlp": 0.01031229, + "balance_loss_clip": 1.04598355, + "balance_loss_mlp": 1.01818776, + "epoch": 0.48874192093792274, + "flos": 21286912199040.0, + "grad_norm": 1.5334009671548041, + "language_loss": 0.7574327, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.77901042, + "num_input_tokens_seen": 174735560, + "step": 8129, + "time_per_iteration": 4.080120325088501 + }, + { + "auxiliary_loss_clip": 0.01113338, + "auxiliary_loss_mlp": 0.01031411, + "balance_loss_clip": 1.04372275, + "balance_loss_mlp": 1.0174098, + "epoch": 0.4888020441905907, + "flos": 14173439696640.0, + "grad_norm": 2.756799094025314, + "language_loss": 0.64951944, + "learning_rate": 2.169407330666114e-06, + "loss": 0.67096692, + "num_input_tokens_seen": 174752730, + "step": 8130, + "time_per_iteration": 4.153359413146973 + }, + { + "auxiliary_loss_clip": 0.01087218, + "auxiliary_loss_mlp": 0.01036252, + "balance_loss_clip": 1.0399828, + "balance_loss_mlp": 1.02282333, + "epoch": 0.48886216744325867, + "flos": 24097891766400.0, + "grad_norm": 1.9114203912665453, + "language_loss": 0.72505724, + "learning_rate": 2.169019265427658e-06, + "loss": 0.746292, + "num_input_tokens_seen": 174772520, + "step": 8131, + "time_per_iteration": 2.751070499420166 + }, + { + "auxiliary_loss_clip": 0.0111646, + "auxiliary_loss_mlp": 0.01041385, + "balance_loss_clip": 1.04625905, + "balance_loss_mlp": 1.0270561, + "epoch": 0.48892229069592663, + "flos": 38431419402240.0, + "grad_norm": 1.3981624070335212, + "language_loss": 0.69684219, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.71842068, + "num_input_tokens_seen": 174796540, + "step": 8132, + "time_per_iteration": 4.478942632675171 + }, + { + "auxiliary_loss_clip": 0.01109765, + "auxiliary_loss_mlp": 0.01030128, + "balance_loss_clip": 1.04673529, + "balance_loss_mlp": 1.01630616, + "epoch": 0.4889824139485946, + "flos": 23843321101440.0, + "grad_norm": 1.328560083390073, + "language_loss": 0.69882882, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.72022772, + "num_input_tokens_seen": 174817840, + "step": 8133, + "time_per_iteration": 4.2415807247161865 + }, + { + "auxiliary_loss_clip": 0.01062397, + "auxiliary_loss_mlp": 0.01042948, + "balance_loss_clip": 1.03593254, + "balance_loss_mlp": 1.02922726, + "epoch": 0.48904253720126256, + "flos": 24425827960320.0, + "grad_norm": 1.919712430573748, + "language_loss": 0.70950568, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.73055917, + "num_input_tokens_seen": 174837885, + "step": 8134, + "time_per_iteration": 2.772383689880371 + }, + { + "auxiliary_loss_clip": 0.01084139, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.04342508, + "balance_loss_mlp": 1.02082086, + "epoch": 0.4891026604539306, + "flos": 24170682677760.0, + "grad_norm": 1.9244253075686233, + "language_loss": 0.80356431, + "learning_rate": 2.167466940528718e-06, + "loss": 0.82475942, + "num_input_tokens_seen": 174855240, + "step": 8135, + "time_per_iteration": 2.7362964153289795 + }, + { + "auxiliary_loss_clip": 0.01124035, + "auxiliary_loss_mlp": 0.01035694, + "balance_loss_clip": 1.04567957, + "balance_loss_mlp": 1.0232842, + "epoch": 0.48916278370659855, + "flos": 21470954509440.0, + "grad_norm": 1.8037329109010316, + "language_loss": 0.74794912, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.76954633, + "num_input_tokens_seen": 174875145, + "step": 8136, + "time_per_iteration": 2.766477346420288 + }, + { + "auxiliary_loss_clip": 0.01097387, + "auxiliary_loss_mlp": 0.01043558, + "balance_loss_clip": 1.04352307, + "balance_loss_mlp": 1.02971756, + "epoch": 0.4892229069592665, + "flos": 22309755886080.0, + "grad_norm": 1.6588593954338173, + "language_loss": 0.73403543, + "learning_rate": 2.166690739918204e-06, + "loss": 0.75544488, + "num_input_tokens_seen": 174894770, + "step": 8137, + "time_per_iteration": 2.720778703689575 + }, + { + "auxiliary_loss_clip": 0.01051073, + "auxiliary_loss_mlp": 0.01031061, + "balance_loss_clip": 1.03699243, + "balance_loss_mlp": 1.01726234, + "epoch": 0.4892830302119345, + "flos": 12786856934400.0, + "grad_norm": 2.090077124931452, + "language_loss": 0.75336611, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.77418739, + "num_input_tokens_seen": 174912780, + "step": 8138, + "time_per_iteration": 2.7975735664367676 + }, + { + "auxiliary_loss_clip": 0.01091927, + "auxiliary_loss_mlp": 0.01038351, + "balance_loss_clip": 1.04700375, + "balance_loss_mlp": 1.02536893, + "epoch": 0.48934315346460244, + "flos": 20813896972800.0, + "grad_norm": 1.6152276292204855, + "language_loss": 0.74018902, + "learning_rate": 2.165914514023972e-06, + "loss": 0.76149184, + "num_input_tokens_seen": 174931250, + "step": 8139, + "time_per_iteration": 2.7135186195373535 + }, + { + "auxiliary_loss_clip": 0.01115319, + "auxiliary_loss_mlp": 0.0103739, + "balance_loss_clip": 1.04502773, + "balance_loss_mlp": 1.02416921, + "epoch": 0.4894032767172704, + "flos": 19755537713280.0, + "grad_norm": 1.878714628680016, + "language_loss": 0.62168998, + "learning_rate": 2.165526391632255e-06, + "loss": 0.64321709, + "num_input_tokens_seen": 174951105, + "step": 8140, + "time_per_iteration": 2.6594550609588623 + }, + { + "auxiliary_loss_clip": 0.0109215, + "auxiliary_loss_mlp": 0.01040102, + "balance_loss_clip": 1.04310822, + "balance_loss_mlp": 1.02509928, + "epoch": 0.4894633999699384, + "flos": 17818982835840.0, + "grad_norm": 1.7004882369900214, + "language_loss": 0.82400143, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.84532392, + "num_input_tokens_seen": 174969120, + "step": 8141, + "time_per_iteration": 2.648696184158325 + }, + { + "auxiliary_loss_clip": 0.01095522, + "auxiliary_loss_mlp": 0.01034005, + "balance_loss_clip": 1.04897892, + "balance_loss_mlp": 1.01975965, + "epoch": 0.48952352322260634, + "flos": 25523222325120.0, + "grad_norm": 1.6750975318537598, + "language_loss": 0.72031653, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.74161184, + "num_input_tokens_seen": 174991295, + "step": 8142, + "time_per_iteration": 2.770524740219116 + }, + { + "auxiliary_loss_clip": 0.01124129, + "auxiliary_loss_mlp": 0.01033852, + "balance_loss_clip": 1.04588366, + "balance_loss_mlp": 1.02094769, + "epoch": 0.4895836464752743, + "flos": 29055502903680.0, + "grad_norm": 8.902000760681485, + "language_loss": 0.66877794, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.6903578, + "num_input_tokens_seen": 175012830, + "step": 8143, + "time_per_iteration": 2.717714786529541 + }, + { + "auxiliary_loss_clip": 0.01116098, + "auxiliary_loss_mlp": 0.00770078, + "balance_loss_clip": 1.04774415, + "balance_loss_mlp": 1.00015235, + "epoch": 0.48964376972794227, + "flos": 33546958312320.0, + "grad_norm": 1.880195910988658, + "language_loss": 0.75596797, + "learning_rate": 2.163973839444793e-06, + "loss": 0.77482975, + "num_input_tokens_seen": 175035695, + "step": 8144, + "time_per_iteration": 2.801825761795044 + }, + { + "auxiliary_loss_clip": 0.01099436, + "auxiliary_loss_mlp": 0.01031587, + "balance_loss_clip": 1.04169714, + "balance_loss_mlp": 1.01753187, + "epoch": 0.48970389298061023, + "flos": 22054035985920.0, + "grad_norm": 1.9123659180679726, + "language_loss": 0.75693774, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.77824795, + "num_input_tokens_seen": 175056425, + "step": 8145, + "time_per_iteration": 2.781550168991089 + }, + { + "auxiliary_loss_clip": 0.01108869, + "auxiliary_loss_mlp": 0.0077212, + "balance_loss_clip": 1.04549527, + "balance_loss_mlp": 1.00018287, + "epoch": 0.4897640162332782, + "flos": 20084299920000.0, + "grad_norm": 1.6675270752681912, + "language_loss": 0.80437362, + "learning_rate": 2.163197525984761e-06, + "loss": 0.82318354, + "num_input_tokens_seen": 175074800, + "step": 8146, + "time_per_iteration": 2.699277400970459 + }, + { + "auxiliary_loss_clip": 0.01109996, + "auxiliary_loss_mlp": 0.01033581, + "balance_loss_clip": 1.04312873, + "balance_loss_mlp": 1.02007508, + "epoch": 0.48982413948594616, + "flos": 23806225330560.0, + "grad_norm": 2.022171046548427, + "language_loss": 0.74193209, + "learning_rate": 2.162809359964687e-06, + "loss": 0.76336789, + "num_input_tokens_seen": 175094500, + "step": 8147, + "time_per_iteration": 2.732973337173462 + }, + { + "auxiliary_loss_clip": 0.01095071, + "auxiliary_loss_mlp": 0.01032519, + "balance_loss_clip": 1.0448947, + "balance_loss_mlp": 1.0193938, + "epoch": 0.4898842627386142, + "flos": 17639645207040.0, + "grad_norm": 2.1017800501084882, + "language_loss": 0.8286857, + "learning_rate": 2.162421187770864e-06, + "loss": 0.84996164, + "num_input_tokens_seen": 175112920, + "step": 8148, + "time_per_iteration": 2.662179708480835 + }, + { + "auxiliary_loss_clip": 0.01091374, + "auxiliary_loss_mlp": 0.01033444, + "balance_loss_clip": 1.04345882, + "balance_loss_mlp": 1.0213387, + "epoch": 0.48994438599128215, + "flos": 16617914841600.0, + "grad_norm": 1.9007753197415815, + "language_loss": 0.74256468, + "learning_rate": 2.162033009418015e-06, + "loss": 0.76381284, + "num_input_tokens_seen": 175129910, + "step": 8149, + "time_per_iteration": 2.7373321056365967 + }, + { + "auxiliary_loss_clip": 0.01130985, + "auxiliary_loss_mlp": 0.01037014, + "balance_loss_clip": 1.04766726, + "balance_loss_mlp": 1.02247095, + "epoch": 0.4900045092439501, + "flos": 26614834600320.0, + "grad_norm": 1.7000980888808985, + "language_loss": 0.76319683, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.78487676, + "num_input_tokens_seen": 175148705, + "step": 8150, + "time_per_iteration": 2.653003692626953 + }, + { + "auxiliary_loss_clip": 0.01103787, + "auxiliary_loss_mlp": 0.01035673, + "balance_loss_clip": 1.04736936, + "balance_loss_mlp": 1.02152276, + "epoch": 0.4900646324966181, + "flos": 19902125116800.0, + "grad_norm": 2.127966402053614, + "language_loss": 0.72754669, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.7489413, + "num_input_tokens_seen": 175167425, + "step": 8151, + "time_per_iteration": 2.7142715454101562 + }, + { + "auxiliary_loss_clip": 0.01018676, + "auxiliary_loss_mlp": 0.01008139, + "balance_loss_clip": 1.02870607, + "balance_loss_mlp": 1.00680435, + "epoch": 0.49012475574928605, + "flos": 59189620337280.0, + "grad_norm": 0.8300028938034224, + "language_loss": 0.54350889, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.56377703, + "num_input_tokens_seen": 175227985, + "step": 8152, + "time_per_iteration": 3.218646764755249 + }, + { + "auxiliary_loss_clip": 0.01066533, + "auxiliary_loss_mlp": 0.01034489, + "balance_loss_clip": 1.04041779, + "balance_loss_mlp": 1.02058959, + "epoch": 0.490184879001954, + "flos": 45259797657600.0, + "grad_norm": 1.9767488244056508, + "language_loss": 0.61212152, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.6331318, + "num_input_tokens_seen": 175251895, + "step": 8153, + "time_per_iteration": 3.043501615524292 + }, + { + "auxiliary_loss_clip": 0.01091315, + "auxiliary_loss_mlp": 0.01034977, + "balance_loss_clip": 1.04408598, + "balance_loss_mlp": 1.02139306, + "epoch": 0.490245002254622, + "flos": 28002135634560.0, + "grad_norm": 1.494326859026801, + "language_loss": 0.767699, + "learning_rate": 2.160092025783549e-06, + "loss": 0.78896195, + "num_input_tokens_seen": 175272770, + "step": 8154, + "time_per_iteration": 2.783686399459839 + }, + { + "auxiliary_loss_clip": 0.01032948, + "auxiliary_loss_mlp": 0.01009488, + "balance_loss_clip": 1.02573824, + "balance_loss_mlp": 1.00805795, + "epoch": 0.49030512550728994, + "flos": 58951318533120.0, + "grad_norm": 0.9569310885457037, + "language_loss": 0.6699397, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.69036406, + "num_input_tokens_seen": 175336320, + "step": 8155, + "time_per_iteration": 3.2836861610412598 + }, + { + "auxiliary_loss_clip": 0.01128627, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.04858041, + "balance_loss_mlp": 1.01990271, + "epoch": 0.4903652487599579, + "flos": 19791843384960.0, + "grad_norm": 1.7952288566158678, + "language_loss": 0.76406527, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.78568316, + "num_input_tokens_seen": 175353540, + "step": 8156, + "time_per_iteration": 2.77978515625 + }, + { + "auxiliary_loss_clip": 0.01115952, + "auxiliary_loss_mlp": 0.01033945, + "balance_loss_clip": 1.04693031, + "balance_loss_mlp": 1.02066517, + "epoch": 0.49042537201262587, + "flos": 21762082241280.0, + "grad_norm": 2.671892010748055, + "language_loss": 0.83756495, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.85906386, + "num_input_tokens_seen": 175370445, + "step": 8157, + "time_per_iteration": 2.6860296726226807 + }, + { + "auxiliary_loss_clip": 0.01116981, + "auxiliary_loss_mlp": 0.0103483, + "balance_loss_clip": 1.04626417, + "balance_loss_mlp": 1.02103734, + "epoch": 0.49048549526529384, + "flos": 18953042008320.0, + "grad_norm": 1.6916175452091182, + "language_loss": 0.79447746, + "learning_rate": 2.158539129514956e-06, + "loss": 0.81599557, + "num_input_tokens_seen": 175389020, + "step": 8158, + "time_per_iteration": 2.723398208618164 + }, + { + "auxiliary_loss_clip": 0.01130092, + "auxiliary_loss_mlp": 0.01036013, + "balance_loss_clip": 1.0493114, + "balance_loss_mlp": 1.02237535, + "epoch": 0.4905456185179618, + "flos": 26906393295360.0, + "grad_norm": 1.5924994780725177, + "language_loss": 0.69469124, + "learning_rate": 2.158150890381454e-06, + "loss": 0.71635228, + "num_input_tokens_seen": 175409545, + "step": 8159, + "time_per_iteration": 2.685887575149536 + }, + { + "auxiliary_loss_clip": 0.01109209, + "auxiliary_loss_mlp": 0.01041597, + "balance_loss_clip": 1.04416955, + "balance_loss_mlp": 1.02719688, + "epoch": 0.49060574177062977, + "flos": 20412343854720.0, + "grad_norm": 1.8488353997421354, + "language_loss": 0.73372805, + "learning_rate": 2.157762645250854e-06, + "loss": 0.75523615, + "num_input_tokens_seen": 175429335, + "step": 8160, + "time_per_iteration": 2.7002642154693604 + }, + { + "auxiliary_loss_clip": 0.01111433, + "auxiliary_loss_mlp": 0.01040851, + "balance_loss_clip": 1.04374194, + "balance_loss_mlp": 1.02655184, + "epoch": 0.4906658650232978, + "flos": 17493704248320.0, + "grad_norm": 4.058452856445761, + "language_loss": 0.71791285, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.73943567, + "num_input_tokens_seen": 175446955, + "step": 8161, + "time_per_iteration": 2.641211748123169 + }, + { + "auxiliary_loss_clip": 0.01077408, + "auxiliary_loss_mlp": 0.01036857, + "balance_loss_clip": 1.04114866, + "balance_loss_mlp": 1.02337408, + "epoch": 0.49072598827596575, + "flos": 26614439550720.0, + "grad_norm": 1.5881872934975843, + "language_loss": 0.68676394, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.7079066, + "num_input_tokens_seen": 175468195, + "step": 8162, + "time_per_iteration": 2.7768666744232178 + }, + { + "auxiliary_loss_clip": 0.01114289, + "auxiliary_loss_mlp": 0.01037181, + "balance_loss_clip": 1.04699993, + "balance_loss_mlp": 1.02219641, + "epoch": 0.4907861115286337, + "flos": 20412595249920.0, + "grad_norm": 1.6090900616469643, + "language_loss": 0.63697332, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.65848798, + "num_input_tokens_seen": 175487455, + "step": 8163, + "time_per_iteration": 2.658141851425171 + }, + { + "auxiliary_loss_clip": 0.01086004, + "auxiliary_loss_mlp": 0.01032891, + "balance_loss_clip": 1.03996313, + "balance_loss_mlp": 1.01987886, + "epoch": 0.4908462347813017, + "flos": 14064271286400.0, + "grad_norm": 2.5242130171230954, + "language_loss": 0.77383208, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.79502106, + "num_input_tokens_seen": 175504450, + "step": 8164, + "time_per_iteration": 2.6626484394073486 + }, + { + "auxiliary_loss_clip": 0.01110027, + "auxiliary_loss_mlp": 0.01037706, + "balance_loss_clip": 1.04298282, + "balance_loss_mlp": 1.0221138, + "epoch": 0.49090635803396965, + "flos": 18735100237440.0, + "grad_norm": 1.6753117148295888, + "language_loss": 0.76749474, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.78897208, + "num_input_tokens_seen": 175523600, + "step": 8165, + "time_per_iteration": 2.5757079124450684 + }, + { + "auxiliary_loss_clip": 0.0110394, + "auxiliary_loss_mlp": 0.01035745, + "balance_loss_clip": 1.04666007, + "balance_loss_mlp": 1.02205336, + "epoch": 0.4909664812866376, + "flos": 20558500295040.0, + "grad_norm": 1.5531816235742995, + "language_loss": 0.77461708, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.79601395, + "num_input_tokens_seen": 175542720, + "step": 8166, + "time_per_iteration": 2.7169244289398193 + }, + { + "auxiliary_loss_clip": 0.01040608, + "auxiliary_loss_mlp": 0.00998968, + "balance_loss_clip": 1.02393854, + "balance_loss_mlp": 0.99768084, + "epoch": 0.4910266045393056, + "flos": 54684017948160.0, + "grad_norm": 0.7914566078875801, + "language_loss": 0.54175258, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.56214833, + "num_input_tokens_seen": 175598640, + "step": 8167, + "time_per_iteration": 3.192706823348999 + }, + { + "auxiliary_loss_clip": 0.01081549, + "auxiliary_loss_mlp": 0.01036447, + "balance_loss_clip": 1.04554164, + "balance_loss_mlp": 1.02288687, + "epoch": 0.49108672779197354, + "flos": 16246454342400.0, + "grad_norm": 1.702915470367474, + "language_loss": 0.85894108, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.88012105, + "num_input_tokens_seen": 175615675, + "step": 8168, + "time_per_iteration": 2.7353274822235107 + }, + { + "auxiliary_loss_clip": 0.01107152, + "auxiliary_loss_mlp": 0.01045094, + "balance_loss_clip": 1.04374826, + "balance_loss_mlp": 1.030586, + "epoch": 0.4911468510446415, + "flos": 19825419623040.0, + "grad_norm": 1.7298624053450853, + "language_loss": 0.73407066, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.75559318, + "num_input_tokens_seen": 175632255, + "step": 8169, + "time_per_iteration": 5.773583173751831 + }, + { + "auxiliary_loss_clip": 0.01112799, + "auxiliary_loss_mlp": 0.01029653, + "balance_loss_clip": 1.04443777, + "balance_loss_mlp": 1.01692092, + "epoch": 0.4912069742973095, + "flos": 21212684743680.0, + "grad_norm": 1.4410309608870682, + "language_loss": 0.77824241, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.79966694, + "num_input_tokens_seen": 175651625, + "step": 8170, + "time_per_iteration": 2.6583240032196045 + }, + { + "auxiliary_loss_clip": 0.01096689, + "auxiliary_loss_mlp": 0.010389, + "balance_loss_clip": 1.04164565, + "balance_loss_mlp": 1.02643037, + "epoch": 0.49126709754997744, + "flos": 19537129065600.0, + "grad_norm": 2.2423824181328853, + "language_loss": 0.76314211, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.78449798, + "num_input_tokens_seen": 175669265, + "step": 8171, + "time_per_iteration": 4.3524169921875 + }, + { + "auxiliary_loss_clip": 0.01104096, + "auxiliary_loss_mlp": 0.01036347, + "balance_loss_clip": 1.04284763, + "balance_loss_mlp": 1.02299559, + "epoch": 0.4913272208026454, + "flos": 12239686080000.0, + "grad_norm": 1.898078833449508, + "language_loss": 0.82055932, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.84196377, + "num_input_tokens_seen": 175686065, + "step": 8172, + "time_per_iteration": 4.201699495315552 + }, + { + "auxiliary_loss_clip": 0.0104227, + "auxiliary_loss_mlp": 0.01009809, + "balance_loss_clip": 1.02604604, + "balance_loss_mlp": 1.00842655, + "epoch": 0.49138734405531337, + "flos": 65465871661440.0, + "grad_norm": 0.6872688544677212, + "language_loss": 0.53258997, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.55311078, + "num_input_tokens_seen": 175748595, + "step": 8173, + "time_per_iteration": 3.1827917098999023 + }, + { + "auxiliary_loss_clip": 0.0111451, + "auxiliary_loss_mlp": 0.00771219, + "balance_loss_clip": 1.04312336, + "balance_loss_mlp": 1.00013208, + "epoch": 0.4914474673079814, + "flos": 18439052342400.0, + "grad_norm": 2.1937948702767054, + "language_loss": 0.63081181, + "learning_rate": 2.152326591972107e-06, + "loss": 0.64966911, + "num_input_tokens_seen": 175766770, + "step": 8174, + "time_per_iteration": 2.591662883758545 + }, + { + "auxiliary_loss_clip": 0.01086287, + "auxiliary_loss_mlp": 0.01044728, + "balance_loss_clip": 1.04296112, + "balance_loss_mlp": 1.02985096, + "epoch": 0.49150759056064935, + "flos": 21685053525120.0, + "grad_norm": 1.9252900771693722, + "language_loss": 0.69252932, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.71383941, + "num_input_tokens_seen": 175783605, + "step": 8175, + "time_per_iteration": 2.7286670207977295 + }, + { + "auxiliary_loss_clip": 0.01112428, + "auxiliary_loss_mlp": 0.01032945, + "balance_loss_clip": 1.0438236, + "balance_loss_mlp": 1.02018952, + "epoch": 0.4915677138133173, + "flos": 22382439056640.0, + "grad_norm": 1.7316891792167346, + "language_loss": 0.74424642, + "learning_rate": 2.151549919570068e-06, + "loss": 0.76570022, + "num_input_tokens_seen": 175801390, + "step": 8176, + "time_per_iteration": 2.623328685760498 + }, + { + "auxiliary_loss_clip": 0.01117272, + "auxiliary_loss_mlp": 0.0104375, + "balance_loss_clip": 1.04691124, + "balance_loss_mlp": 1.03022528, + "epoch": 0.4916278370659853, + "flos": 18402890325120.0, + "grad_norm": 1.776030453931397, + "language_loss": 0.70309961, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.72470981, + "num_input_tokens_seen": 175819830, + "step": 8177, + "time_per_iteration": 2.642073154449463 + }, + { + "auxiliary_loss_clip": 0.01031811, + "auxiliary_loss_mlp": 0.00752155, + "balance_loss_clip": 1.02581143, + "balance_loss_mlp": 0.99997473, + "epoch": 0.49168796031865325, + "flos": 66609124715520.0, + "grad_norm": 0.6890109431226723, + "language_loss": 0.46192822, + "learning_rate": 2.150773224180877e-06, + "loss": 0.47976786, + "num_input_tokens_seen": 175881765, + "step": 8178, + "time_per_iteration": 3.195594072341919 + }, + { + "auxiliary_loss_clip": 0.0112992, + "auxiliary_loss_mlp": 0.01036689, + "balance_loss_clip": 1.04735565, + "balance_loss_mlp": 1.02215147, + "epoch": 0.4917480835713212, + "flos": 20959335141120.0, + "grad_norm": 1.748461689040465, + "language_loss": 0.65961659, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.6812827, + "num_input_tokens_seen": 175901795, + "step": 8179, + "time_per_iteration": 2.675170421600342 + }, + { + "auxiliary_loss_clip": 0.01036062, + "auxiliary_loss_mlp": 0.01047888, + "balance_loss_clip": 1.03444839, + "balance_loss_mlp": 1.031497, + "epoch": 0.4918082068239892, + "flos": 15772900412160.0, + "grad_norm": 2.3413868243180493, + "language_loss": 0.70163, + "learning_rate": 2.149996505922343e-06, + "loss": 0.72246957, + "num_input_tokens_seen": 175917770, + "step": 8180, + "time_per_iteration": 2.9436681270599365 + }, + { + "auxiliary_loss_clip": 0.01099418, + "auxiliary_loss_mlp": 0.01037201, + "balance_loss_clip": 1.04268646, + "balance_loss_mlp": 1.02306247, + "epoch": 0.49186833007665715, + "flos": 24604806453120.0, + "grad_norm": 1.915055420772654, + "language_loss": 0.84369922, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.86506534, + "num_input_tokens_seen": 175937000, + "step": 8181, + "time_per_iteration": 2.8556039333343506 + }, + { + "auxiliary_loss_clip": 0.01125975, + "auxiliary_loss_mlp": 0.010356, + "balance_loss_clip": 1.04886341, + "balance_loss_mlp": 1.0226841, + "epoch": 0.4919284533293251, + "flos": 22090557139200.0, + "grad_norm": 2.841846979456106, + "language_loss": 0.72482812, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.74644387, + "num_input_tokens_seen": 175955170, + "step": 8182, + "time_per_iteration": 2.5908985137939453 + }, + { + "auxiliary_loss_clip": 0.01088743, + "auxiliary_loss_mlp": 0.01035989, + "balance_loss_clip": 1.04323542, + "balance_loss_mlp": 1.0227685, + "epoch": 0.4919885765819931, + "flos": 23368043318400.0, + "grad_norm": 2.038591418033226, + "language_loss": 0.72608387, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.74733126, + "num_input_tokens_seen": 175973725, + "step": 8183, + "time_per_iteration": 2.7704007625579834 + }, + { + "auxiliary_loss_clip": 0.0106529, + "auxiliary_loss_mlp": 0.01035357, + "balance_loss_clip": 1.03853834, + "balance_loss_mlp": 1.0204556, + "epoch": 0.49204869983466104, + "flos": 21360493209600.0, + "grad_norm": 3.5725391309360406, + "language_loss": 0.77354276, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.79454923, + "num_input_tokens_seen": 175993885, + "step": 8184, + "time_per_iteration": 2.8195126056671143 + }, + { + "auxiliary_loss_clip": 0.01094147, + "auxiliary_loss_mlp": 0.01040773, + "balance_loss_clip": 1.04233742, + "balance_loss_mlp": 1.02739143, + "epoch": 0.492108823087329, + "flos": 21142695093120.0, + "grad_norm": 1.8939343643350832, + "language_loss": 0.70917577, + "learning_rate": 2.148054610995789e-06, + "loss": 0.73052496, + "num_input_tokens_seen": 176014210, + "step": 8185, + "time_per_iteration": 2.678464412689209 + }, + { + "auxiliary_loss_clip": 0.01108334, + "auxiliary_loss_mlp": 0.01037918, + "balance_loss_clip": 1.0468477, + "balance_loss_mlp": 1.02306461, + "epoch": 0.49216894633999697, + "flos": 25116605389440.0, + "grad_norm": 1.7900274786799464, + "language_loss": 0.75134045, + "learning_rate": 2.147666215108831e-06, + "loss": 0.77280295, + "num_input_tokens_seen": 176033890, + "step": 8186, + "time_per_iteration": 2.754204273223877 + }, + { + "auxiliary_loss_clip": 0.01116557, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.04770708, + "balance_loss_mlp": 1.02050531, + "epoch": 0.49222906959266494, + "flos": 22637943475200.0, + "grad_norm": 2.9803647414716945, + "language_loss": 0.67526996, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.69678307, + "num_input_tokens_seen": 176052720, + "step": 8187, + "time_per_iteration": 2.6845459938049316 + }, + { + "auxiliary_loss_clip": 0.0108036, + "auxiliary_loss_mlp": 0.01036841, + "balance_loss_clip": 1.04077077, + "balance_loss_mlp": 1.02301288, + "epoch": 0.49228919284533296, + "flos": 20410548174720.0, + "grad_norm": 1.410632675975, + "language_loss": 0.67109811, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.6922701, + "num_input_tokens_seen": 176072545, + "step": 8188, + "time_per_iteration": 2.8322603702545166 + }, + { + "auxiliary_loss_clip": 0.01119978, + "auxiliary_loss_mlp": 0.01034509, + "balance_loss_clip": 1.04967701, + "balance_loss_mlp": 1.02131248, + "epoch": 0.4923493160980009, + "flos": 27122359818240.0, + "grad_norm": 1.8145698664310643, + "language_loss": 0.74643195, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.76797676, + "num_input_tokens_seen": 176091490, + "step": 8189, + "time_per_iteration": 2.700728178024292 + }, + { + "auxiliary_loss_clip": 0.01102804, + "auxiliary_loss_mlp": 0.01027439, + "balance_loss_clip": 1.04349804, + "balance_loss_mlp": 1.0138967, + "epoch": 0.4924094393506689, + "flos": 35736683224320.0, + "grad_norm": 1.5012400452063497, + "language_loss": 0.63989937, + "learning_rate": 2.146112575713104e-06, + "loss": 0.66120183, + "num_input_tokens_seen": 176113200, + "step": 8190, + "time_per_iteration": 2.781034231185913 + }, + { + "auxiliary_loss_clip": 0.01127618, + "auxiliary_loss_mlp": 0.0103068, + "balance_loss_clip": 1.04802811, + "balance_loss_mlp": 1.01666641, + "epoch": 0.49246956260333685, + "flos": 20412487509120.0, + "grad_norm": 2.59854956867769, + "language_loss": 0.71723747, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.73882031, + "num_input_tokens_seen": 176132485, + "step": 8191, + "time_per_iteration": 2.6378936767578125 + }, + { + "auxiliary_loss_clip": 0.01125365, + "auxiliary_loss_mlp": 0.00771087, + "balance_loss_clip": 1.04543817, + "balance_loss_mlp": 1.00005293, + "epoch": 0.4925296858560048, + "flos": 38976938231040.0, + "grad_norm": 1.5444009886503365, + "language_loss": 0.71964842, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.73861289, + "num_input_tokens_seen": 176155755, + "step": 8192, + "time_per_iteration": 2.748840570449829 + }, + { + "auxiliary_loss_clip": 0.01029185, + "auxiliary_loss_mlp": 0.01001084, + "balance_loss_clip": 1.02257538, + "balance_loss_mlp": 0.9996711, + "epoch": 0.4925898091086728, + "flos": 64278917712000.0, + "grad_norm": 0.718294486843201, + "language_loss": 0.52137887, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.54168153, + "num_input_tokens_seen": 176216295, + "step": 8193, + "time_per_iteration": 3.264312267303467 + }, + { + "auxiliary_loss_clip": 0.01125829, + "auxiliary_loss_mlp": 0.01041308, + "balance_loss_clip": 1.04740691, + "balance_loss_mlp": 1.02760482, + "epoch": 0.49264993236134075, + "flos": 23036372110080.0, + "grad_norm": 1.4111181716707888, + "language_loss": 0.76839447, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.79006582, + "num_input_tokens_seen": 176235925, + "step": 8194, + "time_per_iteration": 2.7086539268493652 + }, + { + "auxiliary_loss_clip": 0.01098073, + "auxiliary_loss_mlp": 0.0103376, + "balance_loss_clip": 1.04026222, + "balance_loss_mlp": 1.02031338, + "epoch": 0.4927100556140087, + "flos": 24718212668160.0, + "grad_norm": 1.9420104205554047, + "language_loss": 0.70233512, + "learning_rate": 2.144170401915341e-06, + "loss": 0.72365344, + "num_input_tokens_seen": 176253865, + "step": 8195, + "time_per_iteration": 2.6881814002990723 + }, + { + "auxiliary_loss_clip": 0.01087059, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.04724264, + "balance_loss_mlp": 1.02013052, + "epoch": 0.4927701788666767, + "flos": 23505544581120.0, + "grad_norm": 2.097647655801467, + "language_loss": 0.81090224, + "learning_rate": 2.143781950696001e-06, + "loss": 0.83210671, + "num_input_tokens_seen": 176271525, + "step": 8196, + "time_per_iteration": 2.7997779846191406 + }, + { + "auxiliary_loss_clip": 0.01092387, + "auxiliary_loss_mlp": 0.01036049, + "balance_loss_clip": 1.04048955, + "balance_loss_mlp": 1.0212965, + "epoch": 0.49283030211934464, + "flos": 22928891639040.0, + "grad_norm": 1.9754651417860998, + "language_loss": 0.70963365, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.73091799, + "num_input_tokens_seen": 176290810, + "step": 8197, + "time_per_iteration": 2.687640428543091 + }, + { + "auxiliary_loss_clip": 0.01113685, + "auxiliary_loss_mlp": 0.01037816, + "balance_loss_clip": 1.04734302, + "balance_loss_mlp": 1.0245595, + "epoch": 0.4928904253720126, + "flos": 16873024210560.0, + "grad_norm": 2.0854468186505133, + "language_loss": 0.84519106, + "learning_rate": 2.143005031915374e-06, + "loss": 0.86670601, + "num_input_tokens_seen": 176309165, + "step": 8198, + "time_per_iteration": 2.660125255584717 + }, + { + "auxiliary_loss_clip": 0.01120431, + "auxiliary_loss_mlp": 0.01037402, + "balance_loss_clip": 1.04784405, + "balance_loss_mlp": 1.02326965, + "epoch": 0.4929505486246806, + "flos": 14866551509760.0, + "grad_norm": 1.8081780640264744, + "language_loss": 0.76137328, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.78295165, + "num_input_tokens_seen": 176324960, + "step": 8199, + "time_per_iteration": 2.6528286933898926 + }, + { + "auxiliary_loss_clip": 0.0110111, + "auxiliary_loss_mlp": 0.0103715, + "balance_loss_clip": 1.0420711, + "balance_loss_mlp": 1.02215934, + "epoch": 0.49301067187734854, + "flos": 23842351434240.0, + "grad_norm": 1.5743655623972015, + "language_loss": 0.60060918, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.62199175, + "num_input_tokens_seen": 176346195, + "step": 8200, + "time_per_iteration": 2.725208044052124 + }, + { + "auxiliary_loss_clip": 0.01112367, + "auxiliary_loss_mlp": 0.01042559, + "balance_loss_clip": 1.04529691, + "balance_loss_mlp": 1.0288918, + "epoch": 0.49307079513001656, + "flos": 22491284244480.0, + "grad_norm": 1.489817328340962, + "language_loss": 0.79219347, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.81374276, + "num_input_tokens_seen": 176366735, + "step": 8201, + "time_per_iteration": 2.6749329566955566 + }, + { + "auxiliary_loss_clip": 0.0112059, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.04529119, + "balance_loss_mlp": 1.02063608, + "epoch": 0.4931309183826845, + "flos": 15924587546880.0, + "grad_norm": 2.8764138588073527, + "language_loss": 0.67214566, + "learning_rate": 2.141451129398785e-06, + "loss": 0.69371456, + "num_input_tokens_seen": 176384475, + "step": 8202, + "time_per_iteration": 2.6964852809906006 + }, + { + "auxiliary_loss_clip": 0.01101254, + "auxiliary_loss_mlp": 0.01032575, + "balance_loss_clip": 1.04416037, + "balance_loss_mlp": 1.01929486, + "epoch": 0.4931910416353525, + "flos": 27309059735040.0, + "grad_norm": 2.180124290012348, + "language_loss": 0.75387114, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.77520943, + "num_input_tokens_seen": 176402645, + "step": 8203, + "time_per_iteration": 2.725586175918579 + }, + { + "auxiliary_loss_clip": 0.01070891, + "auxiliary_loss_mlp": 0.01037718, + "balance_loss_clip": 1.04055309, + "balance_loss_mlp": 1.02355599, + "epoch": 0.49325116488802045, + "flos": 20806139635200.0, + "grad_norm": 2.514240753505036, + "language_loss": 0.8037259, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.82481205, + "num_input_tokens_seen": 176416715, + "step": 8204, + "time_per_iteration": 2.6802115440368652 + }, + { + "auxiliary_loss_clip": 0.01112932, + "auxiliary_loss_mlp": 0.01040079, + "balance_loss_clip": 1.04543495, + "balance_loss_mlp": 1.02662015, + "epoch": 0.4933112881406884, + "flos": 19865963099520.0, + "grad_norm": 1.919360097124168, + "language_loss": 0.65891969, + "learning_rate": 2.140285646139455e-06, + "loss": 0.68044984, + "num_input_tokens_seen": 176435755, + "step": 8205, + "time_per_iteration": 2.6556243896484375 + }, + { + "auxiliary_loss_clip": 0.01131728, + "auxiliary_loss_mlp": 0.01037034, + "balance_loss_clip": 1.04643822, + "balance_loss_mlp": 1.02157259, + "epoch": 0.4933714113933564, + "flos": 21827977741440.0, + "grad_norm": 2.0939603582763207, + "language_loss": 0.66682738, + "learning_rate": 2.139897141060744e-06, + "loss": 0.68851495, + "num_input_tokens_seen": 176453915, + "step": 8206, + "time_per_iteration": 2.6004998683929443 + }, + { + "auxiliary_loss_clip": 0.01078434, + "auxiliary_loss_mlp": 0.01042651, + "balance_loss_clip": 1.04006064, + "balance_loss_mlp": 1.02803612, + "epoch": 0.49343153464602435, + "flos": 27890130049920.0, + "grad_norm": 1.7303473596412533, + "language_loss": 0.76393557, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.78514642, + "num_input_tokens_seen": 176475175, + "step": 8207, + "time_per_iteration": 2.7545268535614014 + }, + { + "auxiliary_loss_clip": 0.01104435, + "auxiliary_loss_mlp": 0.01037384, + "balance_loss_clip": 1.04703426, + "balance_loss_mlp": 1.02245331, + "epoch": 0.4934916578986923, + "flos": 24681080983680.0, + "grad_norm": 2.36511926609042, + "language_loss": 0.60212123, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.62353945, + "num_input_tokens_seen": 176494250, + "step": 8208, + "time_per_iteration": 4.556094408035278 + }, + { + "auxiliary_loss_clip": 0.01108642, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.04619265, + "balance_loss_mlp": 1.01990545, + "epoch": 0.4935517811513603, + "flos": 23405139089280.0, + "grad_norm": 1.7507431161374047, + "language_loss": 0.78938925, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.81082511, + "num_input_tokens_seen": 176513325, + "step": 8209, + "time_per_iteration": 4.171698093414307 + }, + { + "auxiliary_loss_clip": 0.01094204, + "auxiliary_loss_mlp": 0.00774879, + "balance_loss_clip": 1.03905034, + "balance_loss_mlp": 1.00007224, + "epoch": 0.49361190440402825, + "flos": 21944508439680.0, + "grad_norm": 2.001694580419455, + "language_loss": 0.79098332, + "learning_rate": 2.138343067844089e-06, + "loss": 0.80967414, + "num_input_tokens_seen": 176532915, + "step": 8210, + "time_per_iteration": 4.38470196723938 + }, + { + "auxiliary_loss_clip": 0.01113566, + "auxiliary_loss_mlp": 0.01039269, + "balance_loss_clip": 1.04458427, + "balance_loss_mlp": 1.02467823, + "epoch": 0.4936720276566962, + "flos": 25115671635840.0, + "grad_norm": 1.6707024820379262, + "language_loss": 0.81313854, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.83466691, + "num_input_tokens_seen": 176552775, + "step": 8211, + "time_per_iteration": 4.290592193603516 + }, + { + "auxiliary_loss_clip": 0.01082515, + "auxiliary_loss_mlp": 0.01050398, + "balance_loss_clip": 1.04066169, + "balance_loss_mlp": 1.03376865, + "epoch": 0.4937321509093642, + "flos": 26358935132160.0, + "grad_norm": 2.2904212815365477, + "language_loss": 0.9144789, + "learning_rate": 2.137565999700933e-06, + "loss": 0.93580806, + "num_input_tokens_seen": 176572185, + "step": 8212, + "time_per_iteration": 2.77516508102417 + }, + { + "auxiliary_loss_clip": 0.010785, + "auxiliary_loss_mlp": 0.01041938, + "balance_loss_clip": 1.03849816, + "balance_loss_mlp": 1.02666783, + "epoch": 0.49379227416203214, + "flos": 22961390469120.0, + "grad_norm": 2.314209741920176, + "language_loss": 0.65430582, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.67551017, + "num_input_tokens_seen": 176591490, + "step": 8213, + "time_per_iteration": 2.844672203063965 + }, + { + "auxiliary_loss_clip": 0.01074353, + "auxiliary_loss_mlp": 0.00772712, + "balance_loss_clip": 1.03954375, + "balance_loss_mlp": 1.00013876, + "epoch": 0.49385239741470016, + "flos": 32489101843200.0, + "grad_norm": 1.8844803433311228, + "language_loss": 0.7592994, + "learning_rate": 2.136788910691711e-06, + "loss": 0.77777004, + "num_input_tokens_seen": 176612715, + "step": 8214, + "time_per_iteration": 2.828538179397583 + }, + { + "auxiliary_loss_clip": 0.01131168, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.0492506, + "balance_loss_mlp": 1.02410388, + "epoch": 0.4939125206673681, + "flos": 22492864442880.0, + "grad_norm": 2.152096163807918, + "language_loss": 0.84490359, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.86660123, + "num_input_tokens_seen": 176631950, + "step": 8215, + "time_per_iteration": 2.6413228511810303 + }, + { + "auxiliary_loss_clip": 0.01108159, + "auxiliary_loss_mlp": 0.01033701, + "balance_loss_clip": 1.04206347, + "balance_loss_mlp": 1.02092147, + "epoch": 0.4939726439200361, + "flos": 31176351486720.0, + "grad_norm": 1.5888417840027016, + "language_loss": 0.83245987, + "learning_rate": 2.136011800934292e-06, + "loss": 0.8538785, + "num_input_tokens_seen": 176653060, + "step": 8216, + "time_per_iteration": 2.67913818359375 + }, + { + "auxiliary_loss_clip": 0.01097989, + "auxiliary_loss_mlp": 0.01034559, + "balance_loss_clip": 1.04419255, + "balance_loss_mlp": 1.02112412, + "epoch": 0.49403276717270406, + "flos": 22674213233280.0, + "grad_norm": 2.8019860461659087, + "language_loss": 0.74546432, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.76678985, + "num_input_tokens_seen": 176673895, + "step": 8217, + "time_per_iteration": 2.686866283416748 + }, + { + "auxiliary_loss_clip": 0.0112431, + "auxiliary_loss_mlp": 0.00771315, + "balance_loss_clip": 1.04717755, + "balance_loss_mlp": 1.00011575, + "epoch": 0.494092890425372, + "flos": 20741070147840.0, + "grad_norm": 1.5679905275329922, + "language_loss": 0.78933907, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.80829537, + "num_input_tokens_seen": 176692550, + "step": 8218, + "time_per_iteration": 2.6126081943511963 + }, + { + "auxiliary_loss_clip": 0.01073156, + "auxiliary_loss_mlp": 0.00770777, + "balance_loss_clip": 1.03962803, + "balance_loss_mlp": 1.000103, + "epoch": 0.49415301367804, + "flos": 18369026778240.0, + "grad_norm": 2.059466953332075, + "language_loss": 0.77003837, + "learning_rate": 2.134846097653142e-06, + "loss": 0.78847766, + "num_input_tokens_seen": 176709335, + "step": 8219, + "time_per_iteration": 2.705432176589966 + }, + { + "auxiliary_loss_clip": 0.01103123, + "auxiliary_loss_mlp": 0.01034129, + "balance_loss_clip": 1.04458046, + "balance_loss_mlp": 1.02009845, + "epoch": 0.49421313693070795, + "flos": 17530620451200.0, + "grad_norm": 1.9177646932354293, + "language_loss": 0.62838733, + "learning_rate": 2.134457519646357e-06, + "loss": 0.64975989, + "num_input_tokens_seen": 176727715, + "step": 8220, + "time_per_iteration": 2.615745782852173 + }, + { + "auxiliary_loss_clip": 0.01124834, + "auxiliary_loss_mlp": 0.01032605, + "balance_loss_clip": 1.04509032, + "balance_loss_mlp": 1.01844347, + "epoch": 0.4942732601833759, + "flos": 20812173120000.0, + "grad_norm": 1.9687050610151906, + "language_loss": 0.72233951, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.74391389, + "num_input_tokens_seen": 176747530, + "step": 8221, + "time_per_iteration": 2.6178054809570312 + }, + { + "auxiliary_loss_clip": 0.01085939, + "auxiliary_loss_mlp": 0.01035447, + "balance_loss_clip": 1.04544675, + "balance_loss_mlp": 1.02218497, + "epoch": 0.4943333834360439, + "flos": 15048941794560.0, + "grad_norm": 1.861092907129918, + "language_loss": 0.792252, + "learning_rate": 2.133680348351595e-06, + "loss": 0.81346589, + "num_input_tokens_seen": 176765260, + "step": 8222, + "time_per_iteration": 2.679504632949829 + }, + { + "auxiliary_loss_clip": 0.01115599, + "auxiliary_loss_mlp": 0.01036496, + "balance_loss_clip": 1.04686999, + "balance_loss_mlp": 1.022048, + "epoch": 0.49439350668871185, + "flos": 16070420764800.0, + "grad_norm": 2.9899447612273784, + "language_loss": 0.72679973, + "learning_rate": 2.133291755093088e-06, + "loss": 0.7483207, + "num_input_tokens_seen": 176781770, + "step": 8223, + "time_per_iteration": 2.581552028656006 + }, + { + "auxiliary_loss_clip": 0.01116938, + "auxiliary_loss_mlp": 0.01040425, + "balance_loss_clip": 1.04635167, + "balance_loss_mlp": 1.0257324, + "epoch": 0.4944536299413798, + "flos": 20880079781760.0, + "grad_norm": 2.0609443486265784, + "language_loss": 0.75248039, + "learning_rate": 2.132903156780144e-06, + "loss": 0.77405405, + "num_input_tokens_seen": 176800655, + "step": 8224, + "time_per_iteration": 2.6427581310272217 + }, + { + "auxiliary_loss_clip": 0.0110423, + "auxiliary_loss_mlp": 0.01033189, + "balance_loss_clip": 1.04815972, + "balance_loss_mlp": 1.01925385, + "epoch": 0.4945137531940478, + "flos": 26608908856320.0, + "grad_norm": 2.070444808683487, + "language_loss": 0.6428299, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.66420412, + "num_input_tokens_seen": 176820610, + "step": 8225, + "time_per_iteration": 2.685084104537964 + }, + { + "auxiliary_loss_clip": 0.01105728, + "auxiliary_loss_mlp": 0.01034446, + "balance_loss_clip": 1.04689407, + "balance_loss_mlp": 1.02097511, + "epoch": 0.49457387644671574, + "flos": 23988148738560.0, + "grad_norm": 2.0654038990553834, + "language_loss": 0.76539797, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.78679967, + "num_input_tokens_seen": 176840520, + "step": 8226, + "time_per_iteration": 2.776888132095337 + }, + { + "auxiliary_loss_clip": 0.01130995, + "auxiliary_loss_mlp": 0.01043657, + "balance_loss_clip": 1.04843736, + "balance_loss_mlp": 1.02849376, + "epoch": 0.49463399969938376, + "flos": 26976598427520.0, + "grad_norm": 1.7138853183765776, + "language_loss": 0.71274078, + "learning_rate": 2.131737331662051e-06, + "loss": 0.7344873, + "num_input_tokens_seen": 176860265, + "step": 8227, + "time_per_iteration": 2.6920416355133057 + }, + { + "auxiliary_loss_clip": 0.01109805, + "auxiliary_loss_mlp": 0.01042947, + "balance_loss_clip": 1.04749131, + "balance_loss_mlp": 1.02879047, + "epoch": 0.49469412295205173, + "flos": 29681534067840.0, + "grad_norm": 1.5610614491128025, + "language_loss": 0.7156117, + "learning_rate": 2.131348713278718e-06, + "loss": 0.73713928, + "num_input_tokens_seen": 176882910, + "step": 8228, + "time_per_iteration": 2.7586421966552734 + }, + { + "auxiliary_loss_clip": 0.01126513, + "auxiliary_loss_mlp": 0.01030651, + "balance_loss_clip": 1.04834974, + "balance_loss_mlp": 1.01664948, + "epoch": 0.4947542462047197, + "flos": 24131791226880.0, + "grad_norm": 1.7062154527873281, + "language_loss": 0.83690989, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.85848153, + "num_input_tokens_seen": 176903030, + "step": 8229, + "time_per_iteration": 2.643385887145996 + }, + { + "auxiliary_loss_clip": 0.01117283, + "auxiliary_loss_mlp": 0.01035461, + "balance_loss_clip": 1.04470325, + "balance_loss_mlp": 1.0201304, + "epoch": 0.49481436945738766, + "flos": 20045049333120.0, + "grad_norm": 1.8291146066570236, + "language_loss": 0.74686736, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.76839477, + "num_input_tokens_seen": 176919025, + "step": 8230, + "time_per_iteration": 2.6726033687591553 + }, + { + "auxiliary_loss_clip": 0.01112312, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.04797947, + "balance_loss_mlp": 1.01941717, + "epoch": 0.4948744927100556, + "flos": 15669550005120.0, + "grad_norm": 1.946821067065893, + "language_loss": 0.79830235, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.81975138, + "num_input_tokens_seen": 176937945, + "step": 8231, + "time_per_iteration": 2.627202272415161 + }, + { + "auxiliary_loss_clip": 0.01038701, + "auxiliary_loss_mlp": 0.01000467, + "balance_loss_clip": 1.02304196, + "balance_loss_mlp": 0.99924535, + "epoch": 0.4949346159627236, + "flos": 68872071502080.0, + "grad_norm": 0.7441317598934056, + "language_loss": 0.60252988, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.62292159, + "num_input_tokens_seen": 177004575, + "step": 8232, + "time_per_iteration": 3.299022912979126 + }, + { + "auxiliary_loss_clip": 0.01103975, + "auxiliary_loss_mlp": 0.01036844, + "balance_loss_clip": 1.04270494, + "balance_loss_mlp": 1.0220201, + "epoch": 0.49499473921539155, + "flos": 24790285307520.0, + "grad_norm": 1.6243536723265515, + "language_loss": 0.69376481, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.71517295, + "num_input_tokens_seen": 177024155, + "step": 8233, + "time_per_iteration": 2.7124898433685303 + }, + { + "auxiliary_loss_clip": 0.01069129, + "auxiliary_loss_mlp": 0.01041459, + "balance_loss_clip": 1.03902805, + "balance_loss_mlp": 1.02584291, + "epoch": 0.4950548624680595, + "flos": 32707905540480.0, + "grad_norm": 1.998308286461765, + "language_loss": 0.66344726, + "learning_rate": 2.129016898898633e-06, + "loss": 0.68455309, + "num_input_tokens_seen": 177046185, + "step": 8234, + "time_per_iteration": 2.7932980060577393 + }, + { + "auxiliary_loss_clip": 0.01031932, + "auxiliary_loss_mlp": 0.01001723, + "balance_loss_clip": 1.02630067, + "balance_loss_mlp": 1.00048304, + "epoch": 0.4951149857207275, + "flos": 50082173066880.0, + "grad_norm": 0.7974470380945157, + "language_loss": 0.58048564, + "learning_rate": 2.128628245959482e-06, + "loss": 0.60082221, + "num_input_tokens_seen": 177099025, + "step": 8235, + "time_per_iteration": 3.095088481903076 + }, + { + "auxiliary_loss_clip": 0.01096356, + "auxiliary_loss_mlp": 0.01043085, + "balance_loss_clip": 1.0431416, + "balance_loss_mlp": 1.02861345, + "epoch": 0.49517510897339545, + "flos": 22236785406720.0, + "grad_norm": 1.5745194755893521, + "language_loss": 0.77200663, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.793401, + "num_input_tokens_seen": 177118365, + "step": 8236, + "time_per_iteration": 2.7678022384643555 + }, + { + "auxiliary_loss_clip": 0.01081616, + "auxiliary_loss_mlp": 0.01037859, + "balance_loss_clip": 1.0420413, + "balance_loss_mlp": 1.02397156, + "epoch": 0.4952352322260634, + "flos": 25374120969600.0, + "grad_norm": 1.6979000405196067, + "language_loss": 0.73080051, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.75199521, + "num_input_tokens_seen": 177136415, + "step": 8237, + "time_per_iteration": 2.764728307723999 + }, + { + "auxiliary_loss_clip": 0.01124754, + "auxiliary_loss_mlp": 0.01035631, + "balance_loss_clip": 1.04693317, + "balance_loss_mlp": 1.02215445, + "epoch": 0.4952953554787314, + "flos": 24608721035520.0, + "grad_norm": 1.914497446494958, + "language_loss": 0.75439888, + "learning_rate": 2.127462257935406e-06, + "loss": 0.77600276, + "num_input_tokens_seen": 177155690, + "step": 8238, + "time_per_iteration": 2.66549015045166 + }, + { + "auxiliary_loss_clip": 0.01084433, + "auxiliary_loss_mlp": 0.0104692, + "balance_loss_clip": 1.04372036, + "balance_loss_mlp": 1.03062415, + "epoch": 0.49535547873139935, + "flos": 17311278049920.0, + "grad_norm": 2.2478036902932508, + "language_loss": 0.73706102, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.75837457, + "num_input_tokens_seen": 177173350, + "step": 8239, + "time_per_iteration": 2.703118324279785 + }, + { + "auxiliary_loss_clip": 0.0104307, + "auxiliary_loss_mlp": 0.01038928, + "balance_loss_clip": 1.04188919, + "balance_loss_mlp": 1.0223105, + "epoch": 0.4954156019840673, + "flos": 20740315962240.0, + "grad_norm": 2.5033228354450667, + "language_loss": 0.7926327, + "learning_rate": 2.126684908394552e-06, + "loss": 0.8134526, + "num_input_tokens_seen": 177191115, + "step": 8240, + "time_per_iteration": 2.9256656169891357 + }, + { + "auxiliary_loss_clip": 0.01116686, + "auxiliary_loss_mlp": 0.01040866, + "balance_loss_clip": 1.04832554, + "balance_loss_mlp": 1.0278666, + "epoch": 0.49547572523673533, + "flos": 12820684567680.0, + "grad_norm": 2.1558656465787367, + "language_loss": 0.8547368, + "learning_rate": 2.126296226410898e-06, + "loss": 0.87631238, + "num_input_tokens_seen": 177206155, + "step": 8241, + "time_per_iteration": 2.9096901416778564 + }, + { + "auxiliary_loss_clip": 0.01067537, + "auxiliary_loss_mlp": 0.01039414, + "balance_loss_clip": 1.04159331, + "balance_loss_mlp": 1.02591348, + "epoch": 0.4955358484894033, + "flos": 15597046402560.0, + "grad_norm": 1.820610909823573, + "language_loss": 0.77092397, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.7919935, + "num_input_tokens_seen": 177224815, + "step": 8242, + "time_per_iteration": 2.6902410984039307 + }, + { + "auxiliary_loss_clip": 0.01104403, + "auxiliary_loss_mlp": 0.00771127, + "balance_loss_clip": 1.04569447, + "balance_loss_mlp": 1.00017774, + "epoch": 0.49559597174207126, + "flos": 26464368528000.0, + "grad_norm": 1.9730293387874334, + "language_loss": 0.67737073, + "learning_rate": 2.125518848090833e-06, + "loss": 0.69612604, + "num_input_tokens_seen": 177244490, + "step": 8243, + "time_per_iteration": 2.6972243785858154 + }, + { + "auxiliary_loss_clip": 0.01112124, + "auxiliary_loss_mlp": 0.01034088, + "balance_loss_clip": 1.04816341, + "balance_loss_mlp": 1.02076697, + "epoch": 0.4956560949947392, + "flos": 23148234040320.0, + "grad_norm": 2.2375947263106526, + "language_loss": 0.67908239, + "learning_rate": 2.125130151783901e-06, + "loss": 0.70054448, + "num_input_tokens_seen": 177264340, + "step": 8244, + "time_per_iteration": 2.762528419494629 + }, + { + "auxiliary_loss_clip": 0.01097015, + "auxiliary_loss_mlp": 0.01040284, + "balance_loss_clip": 1.04337358, + "balance_loss_mlp": 1.02460194, + "epoch": 0.4957162182474072, + "flos": 20773461237120.0, + "grad_norm": 1.8772229473228363, + "language_loss": 0.74776495, + "learning_rate": 2.12474145073202e-06, + "loss": 0.76913798, + "num_input_tokens_seen": 177283055, + "step": 8245, + "time_per_iteration": 2.7792561054229736 + }, + { + "auxiliary_loss_clip": 0.01115174, + "auxiliary_loss_mlp": 0.01036156, + "balance_loss_clip": 1.04705966, + "balance_loss_mlp": 1.02214909, + "epoch": 0.49577634150007516, + "flos": 18734202397440.0, + "grad_norm": 1.8990901453025917, + "language_loss": 0.8153336, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.83684695, + "num_input_tokens_seen": 177301140, + "step": 8246, + "time_per_iteration": 2.5740935802459717 + }, + { + "auxiliary_loss_clip": 0.01090358, + "auxiliary_loss_mlp": 0.0104326, + "balance_loss_clip": 1.04562306, + "balance_loss_mlp": 1.02767944, + "epoch": 0.4958364647527431, + "flos": 25554176870400.0, + "grad_norm": 1.8707658617569873, + "language_loss": 0.83808625, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.85942245, + "num_input_tokens_seen": 177323095, + "step": 8247, + "time_per_iteration": 4.410465955734253 + }, + { + "auxiliary_loss_clip": 0.01102086, + "auxiliary_loss_mlp": 0.01030712, + "balance_loss_clip": 1.05016184, + "balance_loss_mlp": 1.01716995, + "epoch": 0.4958965880054111, + "flos": 24425325169920.0, + "grad_norm": 1.9625896451991354, + "language_loss": 0.83650881, + "learning_rate": 2.123575319254087e-06, + "loss": 0.85783684, + "num_input_tokens_seen": 177339845, + "step": 8248, + "time_per_iteration": 4.395894289016724 + }, + { + "auxiliary_loss_clip": 0.01118567, + "auxiliary_loss_mlp": 0.01032735, + "balance_loss_clip": 1.04729056, + "balance_loss_mlp": 1.01836419, + "epoch": 0.49595671125807905, + "flos": 25083460114560.0, + "grad_norm": 1.8247689581014963, + "language_loss": 0.73558569, + "learning_rate": 2.123186599369812e-06, + "loss": 0.75709867, + "num_input_tokens_seen": 177359980, + "step": 8249, + "time_per_iteration": 4.36426305770874 + }, + { + "auxiliary_loss_clip": 0.01110094, + "auxiliary_loss_mlp": 0.01046161, + "balance_loss_clip": 1.04773486, + "balance_loss_mlp": 1.03169477, + "epoch": 0.496016834510747, + "flos": 16435883692800.0, + "grad_norm": 1.900690676640245, + "language_loss": 0.75902295, + "learning_rate": 2.122797874814289e-06, + "loss": 0.78058553, + "num_input_tokens_seen": 177378580, + "step": 8250, + "time_per_iteration": 4.203567266464233 + }, + { + "auxiliary_loss_clip": 0.011299, + "auxiliary_loss_mlp": 0.01042712, + "balance_loss_clip": 1.04861271, + "balance_loss_mlp": 1.02788305, + "epoch": 0.496076957763415, + "flos": 23437925228160.0, + "grad_norm": 1.7086851316152774, + "language_loss": 0.69983917, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.72156531, + "num_input_tokens_seen": 177398790, + "step": 8251, + "time_per_iteration": 2.6825788021087646 + }, + { + "auxiliary_loss_clip": 0.01092939, + "auxiliary_loss_mlp": 0.00771421, + "balance_loss_clip": 1.04950809, + "balance_loss_mlp": 1.00016773, + "epoch": 0.49613708101608295, + "flos": 16909509450240.0, + "grad_norm": 1.9257049963935782, + "language_loss": 0.80088174, + "learning_rate": 2.122020411748461e-06, + "loss": 0.81952536, + "num_input_tokens_seen": 177416515, + "step": 8252, + "time_per_iteration": 2.7017300128936768 + }, + { + "auxiliary_loss_clip": 0.01130139, + "auxiliary_loss_mlp": 0.01033677, + "balance_loss_clip": 1.04937637, + "balance_loss_mlp": 1.01769102, + "epoch": 0.4961972042687509, + "flos": 16618094409600.0, + "grad_norm": 1.7413302103337327, + "language_loss": 0.81005448, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.83169258, + "num_input_tokens_seen": 177434425, + "step": 8253, + "time_per_iteration": 2.5844311714172363 + }, + { + "auxiliary_loss_clip": 0.01092121, + "auxiliary_loss_mlp": 0.01031077, + "balance_loss_clip": 1.04245412, + "balance_loss_mlp": 1.01743925, + "epoch": 0.49625732752141893, + "flos": 28956749437440.0, + "grad_norm": 1.4814612406319185, + "language_loss": 0.67246485, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.69369686, + "num_input_tokens_seen": 177459675, + "step": 8254, + "time_per_iteration": 2.815851926803589 + }, + { + "auxiliary_loss_clip": 0.01091336, + "auxiliary_loss_mlp": 0.01052712, + "balance_loss_clip": 1.04560924, + "balance_loss_mlp": 1.03665471, + "epoch": 0.4963174507740869, + "flos": 23112359331840.0, + "grad_norm": 1.7981030707772934, + "language_loss": 0.74278247, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.76422298, + "num_input_tokens_seen": 177478895, + "step": 8255, + "time_per_iteration": 2.7599687576293945 + }, + { + "auxiliary_loss_clip": 0.01098276, + "auxiliary_loss_mlp": 0.01036505, + "balance_loss_clip": 1.04286051, + "balance_loss_mlp": 1.02203858, + "epoch": 0.49637757402675486, + "flos": 13917863450880.0, + "grad_norm": 1.736601635944992, + "language_loss": 0.81702995, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.83837777, + "num_input_tokens_seen": 177494920, + "step": 8256, + "time_per_iteration": 2.640913724899292 + }, + { + "auxiliary_loss_clip": 0.01096211, + "auxiliary_loss_mlp": 0.01033961, + "balance_loss_clip": 1.04346132, + "balance_loss_mlp": 1.02055597, + "epoch": 0.49643769727942283, + "flos": 22309001700480.0, + "grad_norm": 1.6034861047711904, + "language_loss": 0.81197649, + "learning_rate": 2.120076673368901e-06, + "loss": 0.83327824, + "num_input_tokens_seen": 177515455, + "step": 8257, + "time_per_iteration": 2.724745512008667 + }, + { + "auxiliary_loss_clip": 0.01133163, + "auxiliary_loss_mlp": 0.01039711, + "balance_loss_clip": 1.04763043, + "balance_loss_mlp": 1.02435732, + "epoch": 0.4964978205320908, + "flos": 19500248776320.0, + "grad_norm": 1.9280789180083706, + "language_loss": 0.66280329, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.68453205, + "num_input_tokens_seen": 177534040, + "step": 8258, + "time_per_iteration": 2.570275068283081 + }, + { + "auxiliary_loss_clip": 0.01110241, + "auxiliary_loss_mlp": 0.01032185, + "balance_loss_clip": 1.04361916, + "balance_loss_mlp": 1.01942396, + "epoch": 0.49655794378475876, + "flos": 23436524597760.0, + "grad_norm": 1.42579071834104, + "language_loss": 0.77627164, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.79769588, + "num_input_tokens_seen": 177554510, + "step": 8259, + "time_per_iteration": 2.676722288131714 + }, + { + "auxiliary_loss_clip": 0.01097253, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.04436278, + "balance_loss_mlp": 1.02406085, + "epoch": 0.4966180670374267, + "flos": 26831124345600.0, + "grad_norm": 1.5162865829701626, + "language_loss": 0.78461975, + "learning_rate": 2.1189103755834e-06, + "loss": 0.80597448, + "num_input_tokens_seen": 177575780, + "step": 8260, + "time_per_iteration": 2.7226130962371826 + }, + { + "auxiliary_loss_clip": 0.01100503, + "auxiliary_loss_mlp": 0.01035808, + "balance_loss_clip": 1.04154015, + "balance_loss_mlp": 1.02135992, + "epoch": 0.4966781902900947, + "flos": 22009326531840.0, + "grad_norm": 3.0057343325073456, + "language_loss": 0.76335442, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.78471756, + "num_input_tokens_seen": 177588965, + "step": 8261, + "time_per_iteration": 2.6477174758911133 + }, + { + "auxiliary_loss_clip": 0.01071745, + "auxiliary_loss_mlp": 0.0103251, + "balance_loss_clip": 1.03892851, + "balance_loss_mlp": 1.01939654, + "epoch": 0.49673831354276266, + "flos": 26213353309440.0, + "grad_norm": 1.835251427236856, + "language_loss": 0.89503151, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.9160741, + "num_input_tokens_seen": 177608425, + "step": 8262, + "time_per_iteration": 2.757200241088867 + }, + { + "auxiliary_loss_clip": 0.01068117, + "auxiliary_loss_mlp": 0.01035611, + "balance_loss_clip": 1.04000998, + "balance_loss_mlp": 1.0223608, + "epoch": 0.4967984367954306, + "flos": 23182277155200.0, + "grad_norm": 1.5869779774184047, + "language_loss": 0.73859417, + "learning_rate": 2.11774403721606e-06, + "loss": 0.7596314, + "num_input_tokens_seen": 177628240, + "step": 8263, + "time_per_iteration": 2.799468994140625 + }, + { + "auxiliary_loss_clip": 0.0108327, + "auxiliary_loss_mlp": 0.01039108, + "balance_loss_clip": 1.0480659, + "balance_loss_mlp": 1.02325881, + "epoch": 0.4968585600480986, + "flos": 19281445079040.0, + "grad_norm": 3.1164108836460036, + "language_loss": 0.70163679, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.72286057, + "num_input_tokens_seen": 177645920, + "step": 8264, + "time_per_iteration": 2.720449447631836 + }, + { + "auxiliary_loss_clip": 0.01098192, + "auxiliary_loss_mlp": 0.01032461, + "balance_loss_clip": 1.04328251, + "balance_loss_mlp": 1.01837087, + "epoch": 0.49691868330076655, + "flos": 22528703237760.0, + "grad_norm": 1.6446636391121152, + "language_loss": 0.65104395, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.67235053, + "num_input_tokens_seen": 177667185, + "step": 8265, + "time_per_iteration": 2.683858633041382 + }, + { + "auxiliary_loss_clip": 0.01028918, + "auxiliary_loss_mlp": 0.01002907, + "balance_loss_clip": 1.0220778, + "balance_loss_mlp": 1.00148249, + "epoch": 0.4969788065534345, + "flos": 66577128675840.0, + "grad_norm": 0.930084427968553, + "language_loss": 0.53491867, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.55523694, + "num_input_tokens_seen": 177733020, + "step": 8266, + "time_per_iteration": 3.2566375732421875 + }, + { + "auxiliary_loss_clip": 0.01113371, + "auxiliary_loss_mlp": 0.01032636, + "balance_loss_clip": 1.04611242, + "balance_loss_mlp": 1.01834857, + "epoch": 0.49703892980610254, + "flos": 24059503105920.0, + "grad_norm": 1.764439361537035, + "language_loss": 0.79587245, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.81733251, + "num_input_tokens_seen": 177753370, + "step": 8267, + "time_per_iteration": 2.6278576850891113 + }, + { + "auxiliary_loss_clip": 0.01102001, + "auxiliary_loss_mlp": 0.01039107, + "balance_loss_clip": 1.04590034, + "balance_loss_mlp": 1.02316856, + "epoch": 0.4970990530587705, + "flos": 29126174912640.0, + "grad_norm": 2.2169439003129385, + "language_loss": 0.74835396, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.76976496, + "num_input_tokens_seen": 177771530, + "step": 8268, + "time_per_iteration": 2.734259843826294 + }, + { + "auxiliary_loss_clip": 0.011141, + "auxiliary_loss_mlp": 0.00771431, + "balance_loss_clip": 1.04348183, + "balance_loss_mlp": 1.00014162, + "epoch": 0.49715917631143847, + "flos": 46026167258880.0, + "grad_norm": 4.0839840126254225, + "language_loss": 0.68041855, + "learning_rate": 2.115411240328073e-06, + "loss": 0.69927382, + "num_input_tokens_seen": 177796355, + "step": 8269, + "time_per_iteration": 2.90146541595459 + }, + { + "auxiliary_loss_clip": 0.01097171, + "auxiliary_loss_mlp": 0.01041712, + "balance_loss_clip": 1.04262531, + "balance_loss_mlp": 1.02837276, + "epoch": 0.49721929956410643, + "flos": 20191277600640.0, + "grad_norm": 2.5681883642378436, + "language_loss": 0.85533005, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.87671888, + "num_input_tokens_seen": 177814300, + "step": 8270, + "time_per_iteration": 2.8005404472351074 + }, + { + "auxiliary_loss_clip": 0.01081529, + "auxiliary_loss_mlp": 0.00771255, + "balance_loss_clip": 1.04315615, + "balance_loss_mlp": 1.00016665, + "epoch": 0.4972794228167744, + "flos": 21653560275840.0, + "grad_norm": 1.8215552302583695, + "language_loss": 0.70831466, + "learning_rate": 2.114633606196899e-06, + "loss": 0.72684252, + "num_input_tokens_seen": 177833615, + "step": 8271, + "time_per_iteration": 2.91554594039917 + }, + { + "auxiliary_loss_clip": 0.01112057, + "auxiliary_loss_mlp": 0.01035877, + "balance_loss_clip": 1.04666567, + "balance_loss_mlp": 1.02128029, + "epoch": 0.49733954606944236, + "flos": 24279743347200.0, + "grad_norm": 1.5312065445139798, + "language_loss": 0.78403968, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.80551904, + "num_input_tokens_seen": 177855315, + "step": 8272, + "time_per_iteration": 2.6702592372894287 + }, + { + "auxiliary_loss_clip": 0.01090488, + "auxiliary_loss_mlp": 0.01040546, + "balance_loss_clip": 1.0464623, + "balance_loss_mlp": 1.02679515, + "epoch": 0.4973996693221103, + "flos": 37852575730560.0, + "grad_norm": 2.547664660385474, + "language_loss": 0.6682387, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.68954909, + "num_input_tokens_seen": 177875590, + "step": 8273, + "time_per_iteration": 2.8257791996002197 + }, + { + "auxiliary_loss_clip": 0.01089829, + "auxiliary_loss_mlp": 0.01037205, + "balance_loss_clip": 1.04431605, + "balance_loss_mlp": 1.02347827, + "epoch": 0.4974597925747783, + "flos": 21361426963200.0, + "grad_norm": 1.5692617693087136, + "language_loss": 0.78097814, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.80224848, + "num_input_tokens_seen": 177894175, + "step": 8274, + "time_per_iteration": 2.6786539554595947 + }, + { + "auxiliary_loss_clip": 0.01087892, + "auxiliary_loss_mlp": 0.01037968, + "balance_loss_clip": 1.04171109, + "balance_loss_mlp": 1.02315021, + "epoch": 0.49751991582744626, + "flos": 30738133560960.0, + "grad_norm": 1.7539763145915706, + "language_loss": 0.75727397, + "learning_rate": 2.113078285889493e-06, + "loss": 0.77853251, + "num_input_tokens_seen": 177913920, + "step": 8275, + "time_per_iteration": 2.7289958000183105 + }, + { + "auxiliary_loss_clip": 0.01117048, + "auxiliary_loss_mlp": 0.01038819, + "balance_loss_clip": 1.04600728, + "balance_loss_mlp": 1.02240443, + "epoch": 0.4975800390801142, + "flos": 14100541044480.0, + "grad_norm": 2.0869085379368717, + "language_loss": 0.84277642, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.86433506, + "num_input_tokens_seen": 177930425, + "step": 8276, + "time_per_iteration": 2.612114667892456 + }, + { + "auxiliary_loss_clip": 0.01122283, + "auxiliary_loss_mlp": 0.00770821, + "balance_loss_clip": 1.04578209, + "balance_loss_mlp": 1.00012255, + "epoch": 0.4976401623327822, + "flos": 24207275658240.0, + "grad_norm": 2.0722406374843283, + "language_loss": 0.70213616, + "learning_rate": 2.112300599949172e-06, + "loss": 0.72106719, + "num_input_tokens_seen": 177949885, + "step": 8277, + "time_per_iteration": 2.627364158630371 + }, + { + "auxiliary_loss_clip": 0.01109969, + "auxiliary_loss_mlp": 0.01038763, + "balance_loss_clip": 1.04542017, + "balance_loss_mlp": 1.02430928, + "epoch": 0.49770028558545015, + "flos": 21136769349120.0, + "grad_norm": 1.855614041136712, + "language_loss": 0.82644826, + "learning_rate": 2.111911750583964e-06, + "loss": 0.84793556, + "num_input_tokens_seen": 177965720, + "step": 8278, + "time_per_iteration": 2.653998613357544 + }, + { + "auxiliary_loss_clip": 0.01117237, + "auxiliary_loss_mlp": 0.01041122, + "balance_loss_clip": 1.04625261, + "balance_loss_mlp": 1.02723408, + "epoch": 0.4977604088381181, + "flos": 16763927627520.0, + "grad_norm": 2.0212653893375276, + "language_loss": 0.67471039, + "learning_rate": 2.111522896975052e-06, + "loss": 0.69629395, + "num_input_tokens_seen": 177983190, + "step": 8279, + "time_per_iteration": 2.607090473175049 + }, + { + "auxiliary_loss_clip": 0.01115839, + "auxiliary_loss_mlp": 0.01041996, + "balance_loss_clip": 1.04406691, + "balance_loss_mlp": 1.02692842, + "epoch": 0.49782053209078614, + "flos": 15703521292800.0, + "grad_norm": 2.1427811758671527, + "language_loss": 0.70507026, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.72664863, + "num_input_tokens_seen": 178000155, + "step": 8280, + "time_per_iteration": 2.636384963989258 + }, + { + "auxiliary_loss_clip": 0.01090186, + "auxiliary_loss_mlp": 0.01035589, + "balance_loss_clip": 1.04237318, + "balance_loss_mlp": 1.02177858, + "epoch": 0.4978806553434541, + "flos": 24753692327040.0, + "grad_norm": 2.860421271049928, + "language_loss": 0.64889467, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.67015243, + "num_input_tokens_seen": 178021060, + "step": 8281, + "time_per_iteration": 2.6961820125579834 + }, + { + "auxiliary_loss_clip": 0.0111999, + "auxiliary_loss_mlp": 0.01036047, + "balance_loss_clip": 1.0478642, + "balance_loss_mlp": 1.02102113, + "epoch": 0.49794077859612207, + "flos": 13115726881920.0, + "grad_norm": 2.7426965878502845, + "language_loss": 0.73226738, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.75382769, + "num_input_tokens_seen": 178038180, + "step": 8282, + "time_per_iteration": 2.7749152183532715 + }, + { + "auxiliary_loss_clip": 0.01095648, + "auxiliary_loss_mlp": 0.01033955, + "balance_loss_clip": 1.04499686, + "balance_loss_mlp": 1.02106822, + "epoch": 0.49800090184879003, + "flos": 27525133998720.0, + "grad_norm": 1.749404235674241, + "language_loss": 0.73327482, + "learning_rate": 2.109967440397263e-06, + "loss": 0.75457078, + "num_input_tokens_seen": 178057565, + "step": 8283, + "time_per_iteration": 2.7039520740509033 + }, + { + "auxiliary_loss_clip": 0.01068275, + "auxiliary_loss_mlp": 0.01054525, + "balance_loss_clip": 1.0405463, + "balance_loss_mlp": 1.03883147, + "epoch": 0.498061025101458, + "flos": 19792489829760.0, + "grad_norm": 2.5573951668279102, + "language_loss": 0.7842927, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.80552071, + "num_input_tokens_seen": 178076965, + "step": 8284, + "time_per_iteration": 2.7534518241882324 + }, + { + "auxiliary_loss_clip": 0.01103825, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.045488, + "balance_loss_mlp": 1.02733326, + "epoch": 0.49812114835412596, + "flos": 29893909230720.0, + "grad_norm": 1.7317298938274186, + "language_loss": 0.73607123, + "learning_rate": 2.109189687029526e-06, + "loss": 0.75753278, + "num_input_tokens_seen": 178095105, + "step": 8285, + "time_per_iteration": 2.696913719177246 + }, + { + "auxiliary_loss_clip": 0.01114659, + "auxiliary_loss_mlp": 0.01033722, + "balance_loss_clip": 1.0496074, + "balance_loss_mlp": 1.01902318, + "epoch": 0.49818127160679393, + "flos": 23147048891520.0, + "grad_norm": 1.6428187648074233, + "language_loss": 0.74194658, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.76343036, + "num_input_tokens_seen": 178114505, + "step": 8286, + "time_per_iteration": 4.164494752883911 + }, + { + "auxiliary_loss_clip": 0.01106668, + "auxiliary_loss_mlp": 0.0104423, + "balance_loss_clip": 1.04752493, + "balance_loss_mlp": 1.02986491, + "epoch": 0.4982413948594619, + "flos": 21652806090240.0, + "grad_norm": 1.7990587687461415, + "language_loss": 0.85529351, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.87680244, + "num_input_tokens_seen": 178131595, + "step": 8287, + "time_per_iteration": 4.236407279968262 + }, + { + "auxiliary_loss_clip": 0.01076576, + "auxiliary_loss_mlp": 0.01032511, + "balance_loss_clip": 1.04194725, + "balance_loss_mlp": 1.01822948, + "epoch": 0.49830151811212986, + "flos": 32486982940800.0, + "grad_norm": 1.6860437652999367, + "language_loss": 0.72530627, + "learning_rate": 2.108023025961159e-06, + "loss": 0.74639714, + "num_input_tokens_seen": 178152055, + "step": 8288, + "time_per_iteration": 4.404609680175781 + }, + { + "auxiliary_loss_clip": 0.01106449, + "auxiliary_loss_mlp": 0.01040352, + "balance_loss_clip": 1.04326916, + "balance_loss_mlp": 1.02459288, + "epoch": 0.4983616413647978, + "flos": 18142358002560.0, + "grad_norm": 3.334734045415943, + "language_loss": 0.79885554, + "learning_rate": 2.10763413072622e-06, + "loss": 0.82032353, + "num_input_tokens_seen": 178168150, + "step": 8289, + "time_per_iteration": 2.6629836559295654 + }, + { + "auxiliary_loss_clip": 0.01114454, + "auxiliary_loss_mlp": 0.0103885, + "balance_loss_clip": 1.0446074, + "balance_loss_mlp": 1.02460992, + "epoch": 0.4984217646174658, + "flos": 19718836992000.0, + "grad_norm": 2.0640091139098256, + "language_loss": 0.72874933, + "learning_rate": 2.107245231409784e-06, + "loss": 0.75028241, + "num_input_tokens_seen": 178186150, + "step": 8290, + "time_per_iteration": 4.18574333190918 + }, + { + "auxiliary_loss_clip": 0.0112064, + "auxiliary_loss_mlp": 0.01040925, + "balance_loss_clip": 1.04972208, + "balance_loss_mlp": 1.02428377, + "epoch": 0.49848188787013376, + "flos": 24936549488640.0, + "grad_norm": 1.4927804425375188, + "language_loss": 0.8397218, + "learning_rate": 2.106856328026598e-06, + "loss": 0.86133754, + "num_input_tokens_seen": 178207665, + "step": 8291, + "time_per_iteration": 2.716386556625366 + }, + { + "auxiliary_loss_clip": 0.01103944, + "auxiliary_loss_mlp": 0.01046379, + "balance_loss_clip": 1.04420066, + "balance_loss_mlp": 1.02930808, + "epoch": 0.4985420111228017, + "flos": 22382439056640.0, + "grad_norm": 1.6316694600084898, + "language_loss": 0.67022264, + "learning_rate": 2.106467420591409e-06, + "loss": 0.69172579, + "num_input_tokens_seen": 178226325, + "step": 8292, + "time_per_iteration": 2.7027721405029297 + }, + { + "auxiliary_loss_clip": 0.01127175, + "auxiliary_loss_mlp": 0.01039323, + "balance_loss_clip": 1.04806566, + "balance_loss_mlp": 1.02625203, + "epoch": 0.4986021343754697, + "flos": 16216469464320.0, + "grad_norm": 1.6633361946509924, + "language_loss": 0.66995132, + "learning_rate": 2.106078509118965e-06, + "loss": 0.6916163, + "num_input_tokens_seen": 178244960, + "step": 8293, + "time_per_iteration": 2.5719261169433594 + }, + { + "auxiliary_loss_clip": 0.01111406, + "auxiliary_loss_mlp": 0.01029657, + "balance_loss_clip": 1.04379749, + "balance_loss_mlp": 1.01533389, + "epoch": 0.4986622576281377, + "flos": 23403594804480.0, + "grad_norm": 1.8610494318021187, + "language_loss": 0.82020485, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.84161556, + "num_input_tokens_seen": 178265400, + "step": 8294, + "time_per_iteration": 2.6504080295562744 + }, + { + "auxiliary_loss_clip": 0.01116097, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.04479063, + "balance_loss_mlp": 1.01604557, + "epoch": 0.49872238088080567, + "flos": 19974556892160.0, + "grad_norm": 2.2309244250260183, + "language_loss": 0.72901344, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.75048614, + "num_input_tokens_seen": 178284535, + "step": 8295, + "time_per_iteration": 2.6195027828216553 + }, + { + "auxiliary_loss_clip": 0.01059073, + "auxiliary_loss_mlp": 0.01038092, + "balance_loss_clip": 1.03994107, + "balance_loss_mlp": 1.02466345, + "epoch": 0.49878250413347364, + "flos": 22893016930560.0, + "grad_norm": 1.8092757241660187, + "language_loss": 0.67607826, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.69704998, + "num_input_tokens_seen": 178302425, + "step": 8296, + "time_per_iteration": 2.755263090133667 + }, + { + "auxiliary_loss_clip": 0.01104221, + "auxiliary_loss_mlp": 0.01042078, + "balance_loss_clip": 1.04649234, + "balance_loss_mlp": 1.02715254, + "epoch": 0.4988426273861416, + "flos": 32598449821440.0, + "grad_norm": 2.862724254512052, + "language_loss": 0.64573205, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.66719502, + "num_input_tokens_seen": 178323065, + "step": 8297, + "time_per_iteration": 2.77134108543396 + }, + { + "auxiliary_loss_clip": 0.01068772, + "auxiliary_loss_mlp": 0.01035076, + "balance_loss_clip": 1.04186463, + "balance_loss_mlp": 1.02241552, + "epoch": 0.49890275063880957, + "flos": 20923604087040.0, + "grad_norm": 1.6802177929429785, + "language_loss": 0.70005518, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.72109365, + "num_input_tokens_seen": 178343985, + "step": 8298, + "time_per_iteration": 2.7644965648651123 + }, + { + "auxiliary_loss_clip": 0.01123634, + "auxiliary_loss_mlp": 0.01037158, + "balance_loss_clip": 1.04611015, + "balance_loss_mlp": 1.02383053, + "epoch": 0.49896287389147753, + "flos": 18624459369600.0, + "grad_norm": 2.15895128631453, + "language_loss": 0.85060012, + "learning_rate": 2.103744956327814e-06, + "loss": 0.87220806, + "num_input_tokens_seen": 178362345, + "step": 8299, + "time_per_iteration": 2.6582682132720947 + }, + { + "auxiliary_loss_clip": 0.0109908, + "auxiliary_loss_mlp": 0.01042644, + "balance_loss_clip": 1.04576635, + "balance_loss_mlp": 1.02676535, + "epoch": 0.4990229971441455, + "flos": 24826555065600.0, + "grad_norm": 3.5746156367417177, + "language_loss": 0.69598472, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.71740198, + "num_input_tokens_seen": 178383190, + "step": 8300, + "time_per_iteration": 2.725041151046753 + }, + { + "auxiliary_loss_clip": 0.01026277, + "auxiliary_loss_mlp": 0.01006258, + "balance_loss_clip": 1.02488732, + "balance_loss_mlp": 1.00483894, + "epoch": 0.49908312039681346, + "flos": 71384525136000.0, + "grad_norm": 0.7557607717879434, + "language_loss": 0.51092541, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.53125077, + "num_input_tokens_seen": 178444250, + "step": 8301, + "time_per_iteration": 3.2866220474243164 + }, + { + "auxiliary_loss_clip": 0.01096877, + "auxiliary_loss_mlp": 0.01045659, + "balance_loss_clip": 1.04223108, + "balance_loss_mlp": 1.03140736, + "epoch": 0.4991432436494814, + "flos": 19828651847040.0, + "grad_norm": 1.7177443948136444, + "language_loss": 0.84648693, + "learning_rate": 2.102578126623879e-06, + "loss": 0.86791229, + "num_input_tokens_seen": 178463250, + "step": 8302, + "time_per_iteration": 2.66215181350708 + }, + { + "auxiliary_loss_clip": 0.01112659, + "auxiliary_loss_mlp": 0.01034193, + "balance_loss_clip": 1.04628754, + "balance_loss_mlp": 1.02111602, + "epoch": 0.4992033669021494, + "flos": 15121912273920.0, + "grad_norm": 5.640508686379792, + "language_loss": 0.68928391, + "learning_rate": 2.102189175590024e-06, + "loss": 0.71075243, + "num_input_tokens_seen": 178481340, + "step": 8303, + "time_per_iteration": 2.6031181812286377 + }, + { + "auxiliary_loss_clip": 0.01126853, + "auxiliary_loss_mlp": 0.01035164, + "balance_loss_clip": 1.04641497, + "balance_loss_mlp": 1.02095485, + "epoch": 0.49926349015481736, + "flos": 31207952476800.0, + "grad_norm": 1.6560759996443648, + "language_loss": 0.72727203, + "learning_rate": 2.101800220681144e-06, + "loss": 0.74889231, + "num_input_tokens_seen": 178501545, + "step": 8304, + "time_per_iteration": 2.706022262573242 + }, + { + "auxiliary_loss_clip": 0.01116141, + "auxiliary_loss_mlp": 0.01037357, + "balance_loss_clip": 1.0475409, + "balance_loss_mlp": 1.02420211, + "epoch": 0.4993236134074853, + "flos": 24900207903360.0, + "grad_norm": 2.1644384364092684, + "language_loss": 0.81342846, + "learning_rate": 2.10141126191199e-06, + "loss": 0.83496344, + "num_input_tokens_seen": 178519700, + "step": 8305, + "time_per_iteration": 2.6671528816223145 + }, + { + "auxiliary_loss_clip": 0.01024768, + "auxiliary_loss_mlp": 0.01003944, + "balance_loss_clip": 1.02671385, + "balance_loss_mlp": 1.00258529, + "epoch": 0.4993837366601533, + "flos": 70420573797120.0, + "grad_norm": 0.7597400638433706, + "language_loss": 0.56867081, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.58895797, + "num_input_tokens_seen": 178576740, + "step": 8306, + "time_per_iteration": 3.322448492050171 + }, + { + "auxiliary_loss_clip": 0.01127996, + "auxiliary_loss_mlp": 0.01039143, + "balance_loss_clip": 1.04948568, + "balance_loss_mlp": 1.02432525, + "epoch": 0.4994438599128213, + "flos": 15961216440960.0, + "grad_norm": 2.2302114161499236, + "language_loss": 0.82741839, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.84908974, + "num_input_tokens_seen": 178594745, + "step": 8307, + "time_per_iteration": 2.583996295928955 + }, + { + "auxiliary_loss_clip": 0.01126994, + "auxiliary_loss_mlp": 0.0103608, + "balance_loss_clip": 1.04805601, + "balance_loss_mlp": 1.02157855, + "epoch": 0.4995039831654893, + "flos": 27928303228800.0, + "grad_norm": 1.7094622949229625, + "language_loss": 0.60939324, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.63102394, + "num_input_tokens_seen": 178614110, + "step": 8308, + "time_per_iteration": 2.6170315742492676 + }, + { + "auxiliary_loss_clip": 0.01120806, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.04421234, + "balance_loss_mlp": 1.01890182, + "epoch": 0.49956410641815724, + "flos": 24204797619840.0, + "grad_norm": 1.8375312667766532, + "language_loss": 0.74889386, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.77042031, + "num_input_tokens_seen": 178634170, + "step": 8309, + "time_per_iteration": 2.6147258281707764 + }, + { + "auxiliary_loss_clip": 0.01102514, + "auxiliary_loss_mlp": 0.0103405, + "balance_loss_clip": 1.04401636, + "balance_loss_mlp": 1.02106261, + "epoch": 0.4996242296708252, + "flos": 16180127879040.0, + "grad_norm": 3.148005555228763, + "language_loss": 0.79502416, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.8163898, + "num_input_tokens_seen": 178651775, + "step": 8310, + "time_per_iteration": 2.6420629024505615 + }, + { + "auxiliary_loss_clip": 0.01111922, + "auxiliary_loss_mlp": 0.01040825, + "balance_loss_clip": 1.04564738, + "balance_loss_mlp": 1.02757514, + "epoch": 0.49968435292349317, + "flos": 16873527000960.0, + "grad_norm": 1.4976626914983278, + "language_loss": 0.70989597, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.73142344, + "num_input_tokens_seen": 178669720, + "step": 8311, + "time_per_iteration": 2.5778110027313232 + }, + { + "auxiliary_loss_clip": 0.01098554, + "auxiliary_loss_mlp": 0.01036482, + "balance_loss_clip": 1.04628289, + "balance_loss_mlp": 1.02355957, + "epoch": 0.49974447617616113, + "flos": 14939521989120.0, + "grad_norm": 2.0443790290482498, + "language_loss": 0.77375191, + "learning_rate": 2.098688443679187e-06, + "loss": 0.79510236, + "num_input_tokens_seen": 178686765, + "step": 8312, + "time_per_iteration": 2.6517751216888428 + }, + { + "auxiliary_loss_clip": 0.01095231, + "auxiliary_loss_mlp": 0.01035354, + "balance_loss_clip": 1.04635751, + "balance_loss_mlp": 1.02135265, + "epoch": 0.4998045994288291, + "flos": 26651535321600.0, + "grad_norm": 1.7937215644313522, + "language_loss": 0.84479403, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.86609983, + "num_input_tokens_seen": 178705845, + "step": 8313, + "time_per_iteration": 2.7882683277130127 + }, + { + "auxiliary_loss_clip": 0.01098533, + "auxiliary_loss_mlp": 0.01032666, + "balance_loss_clip": 1.04393864, + "balance_loss_mlp": 1.01856351, + "epoch": 0.49986472268149706, + "flos": 20953768533120.0, + "grad_norm": 1.8469644022391951, + "language_loss": 0.80625784, + "learning_rate": 2.097910461710939e-06, + "loss": 0.82756978, + "num_input_tokens_seen": 178723410, + "step": 8314, + "time_per_iteration": 2.6792070865631104 + }, + { + "auxiliary_loss_clip": 0.01093189, + "auxiliary_loss_mlp": 0.00772869, + "balance_loss_clip": 1.04282761, + "balance_loss_mlp": 1.00018048, + "epoch": 0.49992484593416503, + "flos": 22783884433920.0, + "grad_norm": 1.9116629548957604, + "language_loss": 0.79824436, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.8169049, + "num_input_tokens_seen": 178743560, + "step": 8315, + "time_per_iteration": 2.885185718536377 + }, + { + "auxiliary_loss_clip": 0.01126333, + "auxiliary_loss_mlp": 0.01033363, + "balance_loss_clip": 1.04775071, + "balance_loss_mlp": 1.02025628, + "epoch": 0.499984969186833, + "flos": 46786970252160.0, + "grad_norm": 1.6207947092177402, + "language_loss": 0.74976832, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.77136528, + "num_input_tokens_seen": 178767225, + "step": 8316, + "time_per_iteration": 2.865182399749756 + }, + { + "auxiliary_loss_clip": 0.01104962, + "auxiliary_loss_mlp": 0.0103454, + "balance_loss_clip": 1.04472423, + "balance_loss_mlp": 1.02195168, + "epoch": 0.500045092439501, + "flos": 25556978131200.0, + "grad_norm": 1.839667572981257, + "language_loss": 0.81122506, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.83262014, + "num_input_tokens_seen": 178786810, + "step": 8317, + "time_per_iteration": 2.781627893447876 + }, + { + "auxiliary_loss_clip": 0.011005, + "auxiliary_loss_mlp": 0.01038819, + "balance_loss_clip": 1.04331255, + "balance_loss_mlp": 1.02368522, + "epoch": 0.5001052156921689, + "flos": 20704764476160.0, + "grad_norm": 1.6607654789374993, + "language_loss": 0.83369392, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.8550871, + "num_input_tokens_seen": 178805660, + "step": 8318, + "time_per_iteration": 2.790937662124634 + }, + { + "auxiliary_loss_clip": 0.01114137, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.04552984, + "balance_loss_mlp": 1.01974225, + "epoch": 0.500165338944837, + "flos": 21251109317760.0, + "grad_norm": 1.7594247797212967, + "language_loss": 0.81800634, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.83947688, + "num_input_tokens_seen": 178824780, + "step": 8319, + "time_per_iteration": 2.6710760593414307 + }, + { + "auxiliary_loss_clip": 0.01080263, + "auxiliary_loss_mlp": 0.01030013, + "balance_loss_clip": 1.03828013, + "balance_loss_mlp": 1.01689363, + "epoch": 0.5002254621975049, + "flos": 27854398995840.0, + "grad_norm": 1.5279258864896563, + "language_loss": 0.71943277, + "learning_rate": 2.095576427171635e-06, + "loss": 0.7405355, + "num_input_tokens_seen": 178845640, + "step": 8320, + "time_per_iteration": 2.7864880561828613 + }, + { + "auxiliary_loss_clip": 0.01093478, + "auxiliary_loss_mlp": 0.01044698, + "balance_loss_clip": 1.04542255, + "balance_loss_mlp": 1.02964222, + "epoch": 0.5002855854501729, + "flos": 15551941898880.0, + "grad_norm": 2.783304711521318, + "language_loss": 0.76481223, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.78619403, + "num_input_tokens_seen": 178862290, + "step": 8321, + "time_per_iteration": 2.7580785751342773 + }, + { + "auxiliary_loss_clip": 0.01115908, + "auxiliary_loss_mlp": 0.00771212, + "balance_loss_clip": 1.04681301, + "balance_loss_mlp": 1.00017464, + "epoch": 0.5003457087028408, + "flos": 16107408794880.0, + "grad_norm": 6.807525102727238, + "language_loss": 0.82965297, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.84852415, + "num_input_tokens_seen": 178879805, + "step": 8322, + "time_per_iteration": 2.6580779552459717 + }, + { + "auxiliary_loss_clip": 0.01117442, + "auxiliary_loss_mlp": 0.0103527, + "balance_loss_clip": 1.0458411, + "balance_loss_mlp": 1.02163804, + "epoch": 0.5004058319555088, + "flos": 22710518904960.0, + "grad_norm": 2.2579769372834257, + "language_loss": 0.73329234, + "learning_rate": 2.094409360775228e-06, + "loss": 0.75481945, + "num_input_tokens_seen": 178896985, + "step": 8323, + "time_per_iteration": 2.6743083000183105 + }, + { + "auxiliary_loss_clip": 0.01086486, + "auxiliary_loss_mlp": 0.01036398, + "balance_loss_clip": 1.04470778, + "balance_loss_mlp": 1.02264738, + "epoch": 0.5004659552081767, + "flos": 30117956313600.0, + "grad_norm": 1.846103580376976, + "language_loss": 0.69483137, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.71606022, + "num_input_tokens_seen": 178920605, + "step": 8324, + "time_per_iteration": 2.783973217010498 + }, + { + "auxiliary_loss_clip": 0.01106501, + "auxiliary_loss_mlp": 0.00771259, + "balance_loss_clip": 1.0422833, + "balance_loss_mlp": 1.0000751, + "epoch": 0.5005260784608447, + "flos": 18624710764800.0, + "grad_norm": 3.4520936591258224, + "language_loss": 0.72325313, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.74203074, + "num_input_tokens_seen": 178937760, + "step": 8325, + "time_per_iteration": 4.274277448654175 + }, + { + "auxiliary_loss_clip": 0.01089915, + "auxiliary_loss_mlp": 0.01041836, + "balance_loss_clip": 1.04158878, + "balance_loss_mlp": 1.02669656, + "epoch": 0.5005862017135126, + "flos": 24859987649280.0, + "grad_norm": 1.7422514730064806, + "language_loss": 0.73518062, + "learning_rate": 2.093242262158709e-06, + "loss": 0.7564981, + "num_input_tokens_seen": 178957985, + "step": 8326, + "time_per_iteration": 4.3523108959198 + }, + { + "auxiliary_loss_clip": 0.01094661, + "auxiliary_loss_mlp": 0.01032547, + "balance_loss_clip": 1.04201293, + "balance_loss_mlp": 1.01984525, + "epoch": 0.5006463249661807, + "flos": 18734381965440.0, + "grad_norm": 1.5476902232241379, + "language_loss": 0.78111005, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.80238211, + "num_input_tokens_seen": 178977070, + "step": 8327, + "time_per_iteration": 4.4682557582855225 + }, + { + "auxiliary_loss_clip": 0.01128169, + "auxiliary_loss_mlp": 0.01040162, + "balance_loss_clip": 1.04810429, + "balance_loss_mlp": 1.02641153, + "epoch": 0.5007064482188487, + "flos": 13042145871360.0, + "grad_norm": 2.1714411479157296, + "language_loss": 0.88089001, + "learning_rate": 2.092464178710997e-06, + "loss": 0.90257335, + "num_input_tokens_seen": 178994175, + "step": 8328, + "time_per_iteration": 2.5710413455963135 + }, + { + "auxiliary_loss_clip": 0.01091641, + "auxiliary_loss_mlp": 0.01034728, + "balance_loss_clip": 1.04136801, + "balance_loss_mlp": 1.02050591, + "epoch": 0.5007665714715166, + "flos": 21288671965440.0, + "grad_norm": 2.863428491996577, + "language_loss": 0.73827946, + "learning_rate": 2.092075131720388e-06, + "loss": 0.75954318, + "num_input_tokens_seen": 179013710, + "step": 8329, + "time_per_iteration": 2.7770020961761475 + }, + { + "auxiliary_loss_clip": 0.01124061, + "auxiliary_loss_mlp": 0.0103094, + "balance_loss_clip": 1.04667771, + "balance_loss_mlp": 1.01824427, + "epoch": 0.5008266947241846, + "flos": 29754576374400.0, + "grad_norm": 1.6131098934363575, + "language_loss": 0.79715234, + "learning_rate": 2.091686081238281e-06, + "loss": 0.81870234, + "num_input_tokens_seen": 179035255, + "step": 8330, + "time_per_iteration": 4.167505979537964 + }, + { + "auxiliary_loss_clip": 0.01021039, + "auxiliary_loss_mlp": 0.00752271, + "balance_loss_clip": 1.02094173, + "balance_loss_mlp": 0.9997682, + "epoch": 0.5008868179768525, + "flos": 63557829204480.0, + "grad_norm": 0.7263095406539528, + "language_loss": 0.5601325, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.5778656, + "num_input_tokens_seen": 179090915, + "step": 8331, + "time_per_iteration": 3.008077621459961 + }, + { + "auxiliary_loss_clip": 0.01112181, + "auxiliary_loss_mlp": 0.01035155, + "balance_loss_clip": 1.04617071, + "balance_loss_mlp": 1.02216136, + "epoch": 0.5009469412295205, + "flos": 27375637593600.0, + "grad_norm": 2.025315423078993, + "language_loss": 0.65264666, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.67412001, + "num_input_tokens_seen": 179109160, + "step": 8332, + "time_per_iteration": 2.6730518341064453 + }, + { + "auxiliary_loss_clip": 0.01120357, + "auxiliary_loss_mlp": 0.01033936, + "balance_loss_clip": 1.04410577, + "balance_loss_mlp": 1.02124023, + "epoch": 0.5010070644821885, + "flos": 27378833904000.0, + "grad_norm": 1.5954618594032755, + "language_loss": 0.75023079, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.7717737, + "num_input_tokens_seen": 179130610, + "step": 8333, + "time_per_iteration": 2.685154914855957 + }, + { + "auxiliary_loss_clip": 0.01125291, + "auxiliary_loss_mlp": 0.01035107, + "balance_loss_clip": 1.04558921, + "balance_loss_mlp": 1.02145159, + "epoch": 0.5010671877348565, + "flos": 20662748542080.0, + "grad_norm": 1.9338828530124208, + "language_loss": 0.80424768, + "learning_rate": 2.090129844689929e-06, + "loss": 0.82585168, + "num_input_tokens_seen": 179147860, + "step": 8334, + "time_per_iteration": 2.627230405807495 + }, + { + "auxiliary_loss_clip": 0.01037349, + "auxiliary_loss_mlp": 0.01004574, + "balance_loss_clip": 1.02146554, + "balance_loss_mlp": 1.00316703, + "epoch": 0.5011273109875244, + "flos": 59128645000320.0, + "grad_norm": 0.8902108893007158, + "language_loss": 0.62708843, + "learning_rate": 2.089740776971626e-06, + "loss": 0.64750767, + "num_input_tokens_seen": 179210490, + "step": 8335, + "time_per_iteration": 3.2171308994293213 + }, + { + "auxiliary_loss_clip": 0.01110054, + "auxiliary_loss_mlp": 0.01029223, + "balance_loss_clip": 1.04289985, + "balance_loss_mlp": 1.01612818, + "epoch": 0.5011874342401924, + "flos": 25336342840320.0, + "grad_norm": 1.3859166459285381, + "language_loss": 0.79553854, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.81693137, + "num_input_tokens_seen": 179231360, + "step": 8336, + "time_per_iteration": 2.6930394172668457 + }, + { + "auxiliary_loss_clip": 0.01082861, + "auxiliary_loss_mlp": 0.01032761, + "balance_loss_clip": 1.03948808, + "balance_loss_mlp": 1.01899827, + "epoch": 0.5012475574928603, + "flos": 20229953569920.0, + "grad_norm": 2.2337029404169457, + "language_loss": 0.80255198, + "learning_rate": 2.088962631340836e-06, + "loss": 0.82370824, + "num_input_tokens_seen": 179250625, + "step": 8337, + "time_per_iteration": 2.725379467010498 + }, + { + "auxiliary_loss_clip": 0.01129165, + "auxiliary_loss_mlp": 0.01038167, + "balance_loss_clip": 1.04644942, + "balance_loss_mlp": 1.0239507, + "epoch": 0.5013076807455283, + "flos": 22710123855360.0, + "grad_norm": 2.0126131839523835, + "language_loss": 0.79470736, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.81638074, + "num_input_tokens_seen": 179267360, + "step": 8338, + "time_per_iteration": 2.6641087532043457 + }, + { + "auxiliary_loss_clip": 0.01100565, + "auxiliary_loss_mlp": 0.01029861, + "balance_loss_clip": 1.04381251, + "balance_loss_mlp": 1.01617527, + "epoch": 0.5013678039981962, + "flos": 24245161528320.0, + "grad_norm": 1.6605427604759349, + "language_loss": 0.85052264, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.87182683, + "num_input_tokens_seen": 179289810, + "step": 8339, + "time_per_iteration": 2.7899603843688965 + }, + { + "auxiliary_loss_clip": 0.0111167, + "auxiliary_loss_mlp": 0.01037127, + "balance_loss_clip": 1.04381561, + "balance_loss_mlp": 1.02343023, + "epoch": 0.5014279272508643, + "flos": 26176688501760.0, + "grad_norm": 1.4822129376950433, + "language_loss": 0.70713747, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.72862542, + "num_input_tokens_seen": 179310620, + "step": 8340, + "time_per_iteration": 2.773681402206421 + }, + { + "auxiliary_loss_clip": 0.01088541, + "auxiliary_loss_mlp": 0.01043525, + "balance_loss_clip": 1.04147744, + "balance_loss_mlp": 1.02764666, + "epoch": 0.5014880505035323, + "flos": 21430446946560.0, + "grad_norm": 1.9911594693512178, + "language_loss": 0.78301972, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.80434036, + "num_input_tokens_seen": 179329005, + "step": 8341, + "time_per_iteration": 2.7607786655426025 + }, + { + "auxiliary_loss_clip": 0.01096808, + "auxiliary_loss_mlp": 0.01038511, + "balance_loss_clip": 1.04584622, + "balance_loss_mlp": 1.02391934, + "epoch": 0.5015481737562002, + "flos": 15770745596160.0, + "grad_norm": 4.243666050944008, + "language_loss": 0.89054161, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.9118948, + "num_input_tokens_seen": 179343785, + "step": 8342, + "time_per_iteration": 2.7108232975006104 + }, + { + "auxiliary_loss_clip": 0.01103427, + "auxiliary_loss_mlp": 0.01036162, + "balance_loss_clip": 1.04467797, + "balance_loss_mlp": 1.02273893, + "epoch": 0.5016082970088682, + "flos": 26830801123200.0, + "grad_norm": 1.768885433843204, + "language_loss": 0.76325786, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.78465378, + "num_input_tokens_seen": 179364070, + "step": 8343, + "time_per_iteration": 2.6551196575164795 + }, + { + "auxiliary_loss_clip": 0.01113632, + "auxiliary_loss_mlp": 0.01028707, + "balance_loss_clip": 1.04612589, + "balance_loss_mlp": 1.01574898, + "epoch": 0.5016684202615361, + "flos": 21470595373440.0, + "grad_norm": 1.8502078003194165, + "language_loss": 0.6725269, + "learning_rate": 2.086239016143293e-06, + "loss": 0.6939503, + "num_input_tokens_seen": 179384225, + "step": 8344, + "time_per_iteration": 2.634850263595581 + }, + { + "auxiliary_loss_clip": 0.01104392, + "auxiliary_loss_mlp": 0.0103805, + "balance_loss_clip": 1.04439509, + "balance_loss_mlp": 1.025056, + "epoch": 0.5017285435142042, + "flos": 26246821806720.0, + "grad_norm": 2.403480744645997, + "language_loss": 0.75519335, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.77661783, + "num_input_tokens_seen": 179402595, + "step": 8345, + "time_per_iteration": 2.7551872730255127 + }, + { + "auxiliary_loss_clip": 0.01111042, + "auxiliary_loss_mlp": 0.01031467, + "balance_loss_clip": 1.04757214, + "balance_loss_mlp": 1.01661348, + "epoch": 0.5017886667668721, + "flos": 20777555387520.0, + "grad_norm": 2.18282722391055, + "language_loss": 0.78664625, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.80807132, + "num_input_tokens_seen": 179419635, + "step": 8346, + "time_per_iteration": 2.661569833755493 + }, + { + "auxiliary_loss_clip": 0.01102528, + "auxiliary_loss_mlp": 0.00770029, + "balance_loss_clip": 1.04322028, + "balance_loss_mlp": 1.00006032, + "epoch": 0.5018487900195401, + "flos": 20156408472960.0, + "grad_norm": 1.5952257408001917, + "language_loss": 0.69384575, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.71257138, + "num_input_tokens_seen": 179438770, + "step": 8347, + "time_per_iteration": 2.7273542881011963 + }, + { + "auxiliary_loss_clip": 0.0108784, + "auxiliary_loss_mlp": 0.01037703, + "balance_loss_clip": 1.04173744, + "balance_loss_mlp": 1.02352858, + "epoch": 0.501908913272208, + "flos": 18150689957760.0, + "grad_norm": 1.852088117198485, + "language_loss": 0.70635176, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.72760713, + "num_input_tokens_seen": 179457475, + "step": 8348, + "time_per_iteration": 2.7395875453948975 + }, + { + "auxiliary_loss_clip": 0.01110808, + "auxiliary_loss_mlp": 0.01035347, + "balance_loss_clip": 1.04538929, + "balance_loss_mlp": 1.02306843, + "epoch": 0.501969036524876, + "flos": 23112287504640.0, + "grad_norm": 1.775170825025465, + "language_loss": 0.74760187, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.76906341, + "num_input_tokens_seen": 179478140, + "step": 8349, + "time_per_iteration": 2.6996099948883057 + }, + { + "auxiliary_loss_clip": 0.01112401, + "auxiliary_loss_mlp": 0.01034238, + "balance_loss_clip": 1.0427202, + "balance_loss_mlp": 1.01971805, + "epoch": 0.5020291597775439, + "flos": 11363214314880.0, + "grad_norm": 2.078287176668375, + "language_loss": 0.63625813, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.6577245, + "num_input_tokens_seen": 179494325, + "step": 8350, + "time_per_iteration": 2.6264822483062744 + }, + { + "auxiliary_loss_clip": 0.01015981, + "auxiliary_loss_mlp": 0.01015388, + "balance_loss_clip": 1.01908755, + "balance_loss_mlp": 1.01377916, + "epoch": 0.5020892830302119, + "flos": 64011094928640.0, + "grad_norm": 0.7752505604108973, + "language_loss": 0.59761232, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.617926, + "num_input_tokens_seen": 179553545, + "step": 8351, + "time_per_iteration": 3.4168505668640137 + }, + { + "auxiliary_loss_clip": 0.01100468, + "auxiliary_loss_mlp": 0.0103649, + "balance_loss_clip": 1.04387021, + "balance_loss_mlp": 1.02232814, + "epoch": 0.5021494062828799, + "flos": 23732859801600.0, + "grad_norm": 1.746970205481512, + "language_loss": 0.74981982, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.77118939, + "num_input_tokens_seen": 179573645, + "step": 8352, + "time_per_iteration": 2.7219762802124023 + }, + { + "auxiliary_loss_clip": 0.01097371, + "auxiliary_loss_mlp": 0.0103593, + "balance_loss_clip": 1.04458284, + "balance_loss_mlp": 1.02211952, + "epoch": 0.5022095295355479, + "flos": 21576747041280.0, + "grad_norm": 1.6929263676943664, + "language_loss": 0.71971965, + "learning_rate": 2.082736990429464e-06, + "loss": 0.74105263, + "num_input_tokens_seen": 179591435, + "step": 8353, + "time_per_iteration": 2.6912848949432373 + }, + { + "auxiliary_loss_clip": 0.01123337, + "auxiliary_loss_mlp": 0.01037374, + "balance_loss_clip": 1.05196476, + "balance_loss_mlp": 1.02265787, + "epoch": 0.5022696527882159, + "flos": 21397229844480.0, + "grad_norm": 1.8297806631316527, + "language_loss": 0.74025398, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.76186109, + "num_input_tokens_seen": 179609955, + "step": 8354, + "time_per_iteration": 2.7325775623321533 + }, + { + "auxiliary_loss_clip": 0.0110051, + "auxiliary_loss_mlp": 0.01042571, + "balance_loss_clip": 1.04367399, + "balance_loss_mlp": 1.02817094, + "epoch": 0.5023297760408838, + "flos": 27160712565120.0, + "grad_norm": 1.8324523966840642, + "language_loss": 0.72395205, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.74538279, + "num_input_tokens_seen": 179630875, + "step": 8355, + "time_per_iteration": 2.717954158782959 + }, + { + "auxiliary_loss_clip": 0.01117118, + "auxiliary_loss_mlp": 0.01041207, + "balance_loss_clip": 1.0459739, + "balance_loss_mlp": 1.02644253, + "epoch": 0.5023898992935518, + "flos": 26213820186240.0, + "grad_norm": 1.6992540953340016, + "language_loss": 0.81400853, + "learning_rate": 2.081569591520548e-06, + "loss": 0.83559179, + "num_input_tokens_seen": 179649835, + "step": 8356, + "time_per_iteration": 2.7149479389190674 + }, + { + "auxiliary_loss_clip": 0.01117006, + "auxiliary_loss_mlp": 0.01044256, + "balance_loss_clip": 1.04384911, + "balance_loss_mlp": 1.02906859, + "epoch": 0.5024500225462197, + "flos": 13440323111040.0, + "grad_norm": 2.281950898223197, + "language_loss": 0.76235557, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.78396809, + "num_input_tokens_seen": 179667605, + "step": 8357, + "time_per_iteration": 2.6641504764556885 + }, + { + "auxiliary_loss_clip": 0.01115092, + "auxiliary_loss_mlp": 0.01038737, + "balance_loss_clip": 1.04538774, + "balance_loss_mlp": 1.02369308, + "epoch": 0.5025101457988878, + "flos": 21579584215680.0, + "grad_norm": 1.606830870939079, + "language_loss": 0.766074, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.78761232, + "num_input_tokens_seen": 179686910, + "step": 8358, + "time_per_iteration": 2.715304136276245 + }, + { + "auxiliary_loss_clip": 0.01101769, + "auxiliary_loss_mlp": 0.0103829, + "balance_loss_clip": 1.04243326, + "balance_loss_mlp": 1.02330494, + "epoch": 0.5025702690515557, + "flos": 24645134448000.0, + "grad_norm": 2.4091387510851354, + "language_loss": 0.72286153, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.7442621, + "num_input_tokens_seen": 179706395, + "step": 8359, + "time_per_iteration": 2.7783002853393555 + }, + { + "auxiliary_loss_clip": 0.01097913, + "auxiliary_loss_mlp": 0.01045718, + "balance_loss_clip": 1.04463625, + "balance_loss_mlp": 1.03208613, + "epoch": 0.5026303923042237, + "flos": 22090162089600.0, + "grad_norm": 1.9040983502257391, + "language_loss": 0.76839483, + "learning_rate": 2.080013016407077e-06, + "loss": 0.7898311, + "num_input_tokens_seen": 179725735, + "step": 8360, + "time_per_iteration": 2.6632778644561768 + }, + { + "auxiliary_loss_clip": 0.01085631, + "auxiliary_loss_mlp": 0.01038787, + "balance_loss_clip": 1.04737091, + "balance_loss_mlp": 1.02541208, + "epoch": 0.5026905155568916, + "flos": 23697200574720.0, + "grad_norm": 1.9221287440607566, + "language_loss": 0.7667141, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.78795838, + "num_input_tokens_seen": 179746150, + "step": 8361, + "time_per_iteration": 2.7411348819732666 + }, + { + "auxiliary_loss_clip": 0.01096697, + "auxiliary_loss_mlp": 0.01034867, + "balance_loss_clip": 1.04426289, + "balance_loss_mlp": 1.01988244, + "epoch": 0.5027506388095596, + "flos": 25812410722560.0, + "grad_norm": 1.5686217043676736, + "language_loss": 0.85069525, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.87201089, + "num_input_tokens_seen": 179767550, + "step": 8362, + "time_per_iteration": 2.827319622039795 + }, + { + "auxiliary_loss_clip": 0.01102707, + "auxiliary_loss_mlp": 0.01033879, + "balance_loss_clip": 1.0435946, + "balance_loss_mlp": 1.02022946, + "epoch": 0.5028107620622275, + "flos": 27526606456320.0, + "grad_norm": 1.54737690881779, + "language_loss": 0.78134143, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.80270725, + "num_input_tokens_seen": 179790075, + "step": 8363, + "time_per_iteration": 2.76174259185791 + }, + { + "auxiliary_loss_clip": 0.01111576, + "auxiliary_loss_mlp": 0.01035086, + "balance_loss_clip": 1.04562223, + "balance_loss_mlp": 1.02060819, + "epoch": 0.5028708853148955, + "flos": 24534278098560.0, + "grad_norm": 3.229087026174198, + "language_loss": 0.75995886, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.78142548, + "num_input_tokens_seen": 179806515, + "step": 8364, + "time_per_iteration": 4.35154914855957 + }, + { + "auxiliary_loss_clip": 0.01124922, + "auxiliary_loss_mlp": 0.01030963, + "balance_loss_clip": 1.04685044, + "balance_loss_mlp": 1.01810658, + "epoch": 0.5029310085675635, + "flos": 20813609664000.0, + "grad_norm": 1.5241312757107228, + "language_loss": 0.69465041, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.71620929, + "num_input_tokens_seen": 179826450, + "step": 8365, + "time_per_iteration": 2.619415283203125 + }, + { + "auxiliary_loss_clip": 0.01103666, + "auxiliary_loss_mlp": 0.01034829, + "balance_loss_clip": 1.04435158, + "balance_loss_mlp": 1.0207144, + "epoch": 0.5029911318202315, + "flos": 22342470197760.0, + "grad_norm": 1.4884180792885182, + "language_loss": 0.73293805, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.75432301, + "num_input_tokens_seen": 179846770, + "step": 8366, + "time_per_iteration": 4.228264331817627 + }, + { + "auxiliary_loss_clip": 0.01113401, + "auxiliary_loss_mlp": 0.01032302, + "balance_loss_clip": 1.04693627, + "balance_loss_mlp": 1.0189749, + "epoch": 0.5030512550728995, + "flos": 24352713826560.0, + "grad_norm": 1.4343945223262573, + "language_loss": 0.7806654, + "learning_rate": 2.077288893713735e-06, + "loss": 0.80212247, + "num_input_tokens_seen": 179866585, + "step": 8367, + "time_per_iteration": 4.1336071491241455 + }, + { + "auxiliary_loss_clip": 0.01113589, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.0443697, + "balance_loss_mlp": 1.01778555, + "epoch": 0.5031113783255674, + "flos": 18259930195200.0, + "grad_norm": 1.686368676940742, + "language_loss": 0.69880998, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.72025627, + "num_input_tokens_seen": 179885575, + "step": 8368, + "time_per_iteration": 2.5836374759674072 + }, + { + "auxiliary_loss_clip": 0.01036914, + "auxiliary_loss_mlp": 0.01003217, + "balance_loss_clip": 1.0201298, + "balance_loss_mlp": 1.00156045, + "epoch": 0.5031715015782354, + "flos": 57253173200640.0, + "grad_norm": 0.8467965026864039, + "language_loss": 0.63315928, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.65356052, + "num_input_tokens_seen": 179939650, + "step": 8369, + "time_per_iteration": 4.438805103302002 + }, + { + "auxiliary_loss_clip": 0.011076, + "auxiliary_loss_mlp": 0.01034663, + "balance_loss_clip": 1.04427028, + "balance_loss_mlp": 1.0215379, + "epoch": 0.5032316248309033, + "flos": 27527360641920.0, + "grad_norm": 2.0752589468807043, + "language_loss": 0.60782373, + "learning_rate": 2.076121368302263e-06, + "loss": 0.62924629, + "num_input_tokens_seen": 179961765, + "step": 8370, + "time_per_iteration": 2.65816330909729 + }, + { + "auxiliary_loss_clip": 0.01076531, + "auxiliary_loss_mlp": 0.01043773, + "balance_loss_clip": 1.04144311, + "balance_loss_mlp": 1.02868104, + "epoch": 0.5032917480835714, + "flos": 34495825939200.0, + "grad_norm": 1.8954281033433134, + "language_loss": 0.68462563, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.70582867, + "num_input_tokens_seen": 179983015, + "step": 8371, + "time_per_iteration": 2.8479132652282715 + }, + { + "auxiliary_loss_clip": 0.01097422, + "auxiliary_loss_mlp": 0.01034396, + "balance_loss_clip": 1.04120922, + "balance_loss_mlp": 1.019364, + "epoch": 0.5033518713362393, + "flos": 33656773167360.0, + "grad_norm": 1.6611598690743674, + "language_loss": 0.67656618, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.69788438, + "num_input_tokens_seen": 180003210, + "step": 8372, + "time_per_iteration": 2.767489194869995 + }, + { + "auxiliary_loss_clip": 0.01085092, + "auxiliary_loss_mlp": 0.01043333, + "balance_loss_clip": 1.04139996, + "balance_loss_mlp": 1.02714443, + "epoch": 0.5034119945889073, + "flos": 28185495586560.0, + "grad_norm": 1.9018001021824607, + "language_loss": 0.66726547, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.68854976, + "num_input_tokens_seen": 180025530, + "step": 8373, + "time_per_iteration": 2.7779579162597656 + }, + { + "auxiliary_loss_clip": 0.0109703, + "auxiliary_loss_mlp": 0.01035617, + "balance_loss_clip": 1.04184651, + "balance_loss_mlp": 1.02208042, + "epoch": 0.5034721178415752, + "flos": 21358697529600.0, + "grad_norm": 1.7065424378128664, + "language_loss": 0.74679291, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.76811939, + "num_input_tokens_seen": 180043180, + "step": 8374, + "time_per_iteration": 2.673182487487793 + }, + { + "auxiliary_loss_clip": 0.01100104, + "auxiliary_loss_mlp": 0.01040932, + "balance_loss_clip": 1.04264212, + "balance_loss_mlp": 1.02604842, + "epoch": 0.5035322410942432, + "flos": 22674823764480.0, + "grad_norm": 1.5424981365737231, + "language_loss": 0.68154198, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.70295238, + "num_input_tokens_seen": 180062905, + "step": 8375, + "time_per_iteration": 2.6842665672302246 + }, + { + "auxiliary_loss_clip": 0.01077033, + "auxiliary_loss_mlp": 0.01034566, + "balance_loss_clip": 1.04517126, + "balance_loss_mlp": 1.0195334, + "epoch": 0.5035923643469111, + "flos": 19828723674240.0, + "grad_norm": 3.5828954699990656, + "language_loss": 0.79316169, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.81427765, + "num_input_tokens_seen": 180082000, + "step": 8376, + "time_per_iteration": 2.7780654430389404 + }, + { + "auxiliary_loss_clip": 0.01117369, + "auxiliary_loss_mlp": 0.00771622, + "balance_loss_clip": 1.04441619, + "balance_loss_mlp": 1.00016475, + "epoch": 0.5036524875995791, + "flos": 30514625182080.0, + "grad_norm": 2.6140774214814693, + "language_loss": 0.59478593, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.61367583, + "num_input_tokens_seen": 180101340, + "step": 8377, + "time_per_iteration": 2.8071539402008057 + }, + { + "auxiliary_loss_clip": 0.01101437, + "auxiliary_loss_mlp": 0.01036815, + "balance_loss_clip": 1.04309344, + "balance_loss_mlp": 1.02237928, + "epoch": 0.5037126108522471, + "flos": 14720574637440.0, + "grad_norm": 2.0235166884987663, + "language_loss": 0.76598781, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.78737032, + "num_input_tokens_seen": 180119160, + "step": 8378, + "time_per_iteration": 2.7332303524017334 + }, + { + "auxiliary_loss_clip": 0.01086538, + "auxiliary_loss_mlp": 0.01035008, + "balance_loss_clip": 1.04592919, + "balance_loss_mlp": 1.02190685, + "epoch": 0.5037727341049151, + "flos": 25297702784640.0, + "grad_norm": 1.7029006786118923, + "language_loss": 0.75000858, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.77122402, + "num_input_tokens_seen": 180138730, + "step": 8379, + "time_per_iteration": 2.8803420066833496 + }, + { + "auxiliary_loss_clip": 0.0111301, + "auxiliary_loss_mlp": 0.01035016, + "balance_loss_clip": 1.04890418, + "balance_loss_mlp": 1.02151, + "epoch": 0.5038328573575831, + "flos": 28541764632960.0, + "grad_norm": 2.071075437448324, + "language_loss": 0.67026305, + "learning_rate": 2.072229431544548e-06, + "loss": 0.69174337, + "num_input_tokens_seen": 180158810, + "step": 8380, + "time_per_iteration": 2.7347092628479004 + }, + { + "auxiliary_loss_clip": 0.01070606, + "auxiliary_loss_mlp": 0.01037412, + "balance_loss_clip": 1.04154301, + "balance_loss_mlp": 1.02420914, + "epoch": 0.503892980610251, + "flos": 31649869503360.0, + "grad_norm": 1.7540511910669407, + "language_loss": 0.63245583, + "learning_rate": 2.071840222561051e-06, + "loss": 0.65353596, + "num_input_tokens_seen": 180179700, + "step": 8381, + "time_per_iteration": 2.836247444152832 + }, + { + "auxiliary_loss_clip": 0.01101604, + "auxiliary_loss_mlp": 0.01039312, + "balance_loss_clip": 1.04428375, + "balance_loss_mlp": 1.02624631, + "epoch": 0.503953103862919, + "flos": 27089358197760.0, + "grad_norm": 1.4852984664170332, + "language_loss": 0.67586917, + "learning_rate": 2.071451010853365e-06, + "loss": 0.69727832, + "num_input_tokens_seen": 180199890, + "step": 8382, + "time_per_iteration": 2.776895523071289 + }, + { + "auxiliary_loss_clip": 0.01115945, + "auxiliary_loss_mlp": 0.010349, + "balance_loss_clip": 1.04923749, + "balance_loss_mlp": 1.02039194, + "epoch": 0.5040132271155869, + "flos": 15632957024640.0, + "grad_norm": 2.370012953933875, + "language_loss": 0.62379169, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.64530009, + "num_input_tokens_seen": 180217840, + "step": 8383, + "time_per_iteration": 2.7200045585632324 + }, + { + "auxiliary_loss_clip": 0.0108883, + "auxiliary_loss_mlp": 0.01037077, + "balance_loss_clip": 1.04611087, + "balance_loss_mlp": 1.02349341, + "epoch": 0.504073350368255, + "flos": 13590106824960.0, + "grad_norm": 1.70449565256652, + "language_loss": 0.66918409, + "learning_rate": 2.070672579324465e-06, + "loss": 0.69044316, + "num_input_tokens_seen": 180236465, + "step": 8384, + "time_per_iteration": 2.7442476749420166 + }, + { + "auxiliary_loss_clip": 0.01108405, + "auxiliary_loss_mlp": 0.01040675, + "balance_loss_clip": 1.04502487, + "balance_loss_mlp": 1.02765775, + "epoch": 0.5041334736209229, + "flos": 29058160510080.0, + "grad_norm": 3.2853523964565072, + "language_loss": 0.7103979, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.73188871, + "num_input_tokens_seen": 180258025, + "step": 8385, + "time_per_iteration": 2.7480194568634033 + }, + { + "auxiliary_loss_clip": 0.01110668, + "auxiliary_loss_mlp": 0.01029456, + "balance_loss_clip": 1.0450182, + "balance_loss_mlp": 1.01644468, + "epoch": 0.5041935968735909, + "flos": 24608361899520.0, + "grad_norm": 1.9814049774657359, + "language_loss": 0.83344412, + "learning_rate": 2.069894137075919e-06, + "loss": 0.8548454, + "num_input_tokens_seen": 180277825, + "step": 8386, + "time_per_iteration": 2.703789234161377 + }, + { + "auxiliary_loss_clip": 0.01108831, + "auxiliary_loss_mlp": 0.01037004, + "balance_loss_clip": 1.04437232, + "balance_loss_mlp": 1.02313972, + "epoch": 0.5042537201262588, + "flos": 26286934320000.0, + "grad_norm": 1.592773103928685, + "language_loss": 0.66832674, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.68978512, + "num_input_tokens_seen": 180300465, + "step": 8387, + "time_per_iteration": 2.8348472118377686 + }, + { + "auxiliary_loss_clip": 0.0106703, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.03972006, + "balance_loss_mlp": 1.02091861, + "epoch": 0.5043138433789268, + "flos": 22017371178240.0, + "grad_norm": 1.386335560273684, + "language_loss": 0.80273068, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.82373804, + "num_input_tokens_seen": 180321050, + "step": 8388, + "time_per_iteration": 2.8797311782836914 + }, + { + "auxiliary_loss_clip": 0.01112016, + "auxiliary_loss_mlp": 0.01032606, + "balance_loss_clip": 1.04459918, + "balance_loss_mlp": 1.01927233, + "epoch": 0.5043739666315947, + "flos": 28767104605440.0, + "grad_norm": 2.1659708262729436, + "language_loss": 0.69815123, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.7195974, + "num_input_tokens_seen": 180338870, + "step": 8389, + "time_per_iteration": 2.7739861011505127 + }, + { + "auxiliary_loss_clip": 0.01090981, + "auxiliary_loss_mlp": 0.01040643, + "balance_loss_clip": 1.04124045, + "balance_loss_mlp": 1.02756596, + "epoch": 0.5044340898842627, + "flos": 27599253713280.0, + "grad_norm": 1.6276843858059296, + "language_loss": 0.6986587, + "learning_rate": 2.068337220892191e-06, + "loss": 0.71997494, + "num_input_tokens_seen": 180361285, + "step": 8390, + "time_per_iteration": 2.844275712966919 + }, + { + "auxiliary_loss_clip": 0.01033792, + "auxiliary_loss_mlp": 0.01003101, + "balance_loss_clip": 1.02656126, + "balance_loss_mlp": 1.00192666, + "epoch": 0.5044942131369307, + "flos": 67458050749440.0, + "grad_norm": 0.9139771068710668, + "language_loss": 0.52933067, + "learning_rate": 2.067947985330974e-06, + "loss": 0.54969966, + "num_input_tokens_seen": 180415170, + "step": 8391, + "time_per_iteration": 3.054262638092041 + }, + { + "auxiliary_loss_clip": 0.01015619, + "auxiliary_loss_mlp": 0.01001074, + "balance_loss_clip": 1.02201819, + "balance_loss_mlp": 0.99963111, + "epoch": 0.5045543363895987, + "flos": 58630849390080.0, + "grad_norm": 0.853635093218063, + "language_loss": 0.60675329, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.62692022, + "num_input_tokens_seen": 180468060, + "step": 8392, + "time_per_iteration": 3.0727028846740723 + }, + { + "auxiliary_loss_clip": 0.01085218, + "auxiliary_loss_mlp": 0.01036141, + "balance_loss_clip": 1.04148042, + "balance_loss_mlp": 1.02351034, + "epoch": 0.5046144596422667, + "flos": 22526620248960.0, + "grad_norm": 2.343143032045354, + "language_loss": 0.84343797, + "learning_rate": 2.067169506493517e-06, + "loss": 0.86465156, + "num_input_tokens_seen": 180486610, + "step": 8393, + "time_per_iteration": 2.7260749340057373 + }, + { + "auxiliary_loss_clip": 0.01087949, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.04098725, + "balance_loss_mlp": 1.02107096, + "epoch": 0.5046745828949346, + "flos": 27454246508160.0, + "grad_norm": 1.8418334138160795, + "language_loss": 0.50936127, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.53057826, + "num_input_tokens_seen": 180508135, + "step": 8394, + "time_per_iteration": 2.827000617980957 + }, + { + "auxiliary_loss_clip": 0.01121524, + "auxiliary_loss_mlp": 0.0103809, + "balance_loss_clip": 1.04323471, + "balance_loss_mlp": 1.02311766, + "epoch": 0.5047347061476026, + "flos": 17274541415040.0, + "grad_norm": 1.5679941994223312, + "language_loss": 0.75414777, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.7757439, + "num_input_tokens_seen": 180527000, + "step": 8395, + "time_per_iteration": 2.6535708904266357 + }, + { + "auxiliary_loss_clip": 0.01106012, + "auxiliary_loss_mlp": 0.01041618, + "balance_loss_clip": 1.04312563, + "balance_loss_mlp": 1.02860057, + "epoch": 0.5047948294002705, + "flos": 16649515831680.0, + "grad_norm": 2.0910564250698562, + "language_loss": 0.68781769, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.70929396, + "num_input_tokens_seen": 180544715, + "step": 8396, + "time_per_iteration": 2.700747013092041 + }, + { + "auxiliary_loss_clip": 0.01111788, + "auxiliary_loss_mlp": 0.0103291, + "balance_loss_clip": 1.04604292, + "balance_loss_mlp": 1.02059579, + "epoch": 0.5048549526529386, + "flos": 26865706164480.0, + "grad_norm": 3.479269791703844, + "language_loss": 0.78899479, + "learning_rate": 2.065612518371792e-06, + "loss": 0.81044173, + "num_input_tokens_seen": 180565365, + "step": 8397, + "time_per_iteration": 2.716320514678955 + }, + { + "auxiliary_loss_clip": 0.01078686, + "auxiliary_loss_mlp": 0.01033767, + "balance_loss_clip": 1.04137075, + "balance_loss_mlp": 1.02079701, + "epoch": 0.5049150759056065, + "flos": 21833939399040.0, + "grad_norm": 3.435063442023246, + "language_loss": 0.66291559, + "learning_rate": 2.065223265084376e-06, + "loss": 0.68404007, + "num_input_tokens_seen": 180586670, + "step": 8398, + "time_per_iteration": 2.773245334625244 + }, + { + "auxiliary_loss_clip": 0.01113858, + "auxiliary_loss_mlp": 0.00770983, + "balance_loss_clip": 1.04783058, + "balance_loss_mlp": 1.00018215, + "epoch": 0.5049751991582745, + "flos": 21685807710720.0, + "grad_norm": 1.5640615321007765, + "language_loss": 0.720043, + "learning_rate": 2.064834009323688e-06, + "loss": 0.73889136, + "num_input_tokens_seen": 180605085, + "step": 8399, + "time_per_iteration": 2.697688341140747 + }, + { + "auxiliary_loss_clip": 0.01091578, + "auxiliary_loss_mlp": 0.01053063, + "balance_loss_clip": 1.04215539, + "balance_loss_mlp": 1.03741038, + "epoch": 0.5050353224109424, + "flos": 21359379888000.0, + "grad_norm": 3.5795224523825695, + "language_loss": 0.81615806, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.8376044, + "num_input_tokens_seen": 180624370, + "step": 8400, + "time_per_iteration": 2.7172608375549316 + }, + { + "auxiliary_loss_clip": 0.01084985, + "auxiliary_loss_mlp": 0.01039311, + "balance_loss_clip": 1.04359269, + "balance_loss_mlp": 1.02413547, + "epoch": 0.5050954456636104, + "flos": 22820082364800.0, + "grad_norm": 1.9975954417395212, + "language_loss": 0.78901821, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.81026119, + "num_input_tokens_seen": 180642450, + "step": 8401, + "time_per_iteration": 2.790361166000366 + }, + { + "auxiliary_loss_clip": 0.01125612, + "auxiliary_loss_mlp": 0.00770602, + "balance_loss_clip": 1.04576373, + "balance_loss_mlp": 1.00024748, + "epoch": 0.5051555689162783, + "flos": 30448226891520.0, + "grad_norm": 1.6142524162989784, + "language_loss": 0.70102769, + "learning_rate": 2.063666227349593e-06, + "loss": 0.7199899, + "num_input_tokens_seen": 180665250, + "step": 8402, + "time_per_iteration": 2.6950721740722656 + }, + { + "auxiliary_loss_clip": 0.01112822, + "auxiliary_loss_mlp": 0.00771289, + "balance_loss_clip": 1.04341567, + "balance_loss_mlp": 1.00022268, + "epoch": 0.5052156921689464, + "flos": 21287953693440.0, + "grad_norm": 2.3922403816883433, + "language_loss": 0.69298434, + "learning_rate": 2.063276961843422e-06, + "loss": 0.71182549, + "num_input_tokens_seen": 180687425, + "step": 8403, + "time_per_iteration": 4.257136344909668 + }, + { + "auxiliary_loss_clip": 0.01109967, + "auxiliary_loss_mlp": 0.01043124, + "balance_loss_clip": 1.04455948, + "balance_loss_mlp": 1.03021932, + "epoch": 0.5052758154216143, + "flos": 25081305298560.0, + "grad_norm": 1.6578366313908228, + "language_loss": 0.85693455, + "learning_rate": 2.062887693937781e-06, + "loss": 0.87846541, + "num_input_tokens_seen": 180708725, + "step": 8404, + "time_per_iteration": 2.725935459136963 + }, + { + "auxiliary_loss_clip": 0.01087696, + "auxiliary_loss_mlp": 0.00769912, + "balance_loss_clip": 1.04370379, + "balance_loss_mlp": 1.00018847, + "epoch": 0.5053359386742823, + "flos": 20885502735360.0, + "grad_norm": 1.5507323053673605, + "language_loss": 0.75329977, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.77187586, + "num_input_tokens_seen": 180727990, + "step": 8405, + "time_per_iteration": 4.237490653991699 + }, + { + "auxiliary_loss_clip": 0.01124188, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.04560125, + "balance_loss_mlp": 1.01756775, + "epoch": 0.5053960619269503, + "flos": 37743335493120.0, + "grad_norm": 1.5851552924914987, + "language_loss": 0.73046809, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.75202894, + "num_input_tokens_seen": 180749765, + "step": 8406, + "time_per_iteration": 4.387450218200684 + }, + { + "auxiliary_loss_clip": 0.0108276, + "auxiliary_loss_mlp": 0.01031932, + "balance_loss_clip": 1.04293895, + "balance_loss_mlp": 1.01945066, + "epoch": 0.5054561851796182, + "flos": 23513840622720.0, + "grad_norm": 1.8244341787972256, + "language_loss": 0.76631331, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.78746021, + "num_input_tokens_seen": 180769580, + "step": 8407, + "time_per_iteration": 2.765031099319458 + }, + { + "auxiliary_loss_clip": 0.01085678, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.04038286, + "balance_loss_mlp": 1.01838887, + "epoch": 0.5055163084322862, + "flos": 30410233280640.0, + "grad_norm": 1.769865286909125, + "language_loss": 0.63482308, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.65598726, + "num_input_tokens_seen": 180790295, + "step": 8408, + "time_per_iteration": 2.7497997283935547 + }, + { + "auxiliary_loss_clip": 0.01094613, + "auxiliary_loss_mlp": 0.01046938, + "balance_loss_clip": 1.04494774, + "balance_loss_mlp": 1.03097582, + "epoch": 0.5055764316849541, + "flos": 20259651139200.0, + "grad_norm": 1.9259425074827412, + "language_loss": 0.63427341, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.655689, + "num_input_tokens_seen": 180807875, + "step": 8409, + "time_per_iteration": 4.083381652832031 + }, + { + "auxiliary_loss_clip": 0.01099903, + "auxiliary_loss_mlp": 0.01029856, + "balance_loss_clip": 1.04535913, + "balance_loss_mlp": 1.01790488, + "epoch": 0.5056365549376222, + "flos": 26070895969920.0, + "grad_norm": 2.0381050127162528, + "language_loss": 0.71175253, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.73305017, + "num_input_tokens_seen": 180831300, + "step": 8410, + "time_per_iteration": 2.7279632091522217 + }, + { + "auxiliary_loss_clip": 0.01097675, + "auxiliary_loss_mlp": 0.0104194, + "balance_loss_clip": 1.04237318, + "balance_loss_mlp": 1.02787888, + "epoch": 0.5056966781902901, + "flos": 19279074781440.0, + "grad_norm": 1.4485544779958848, + "language_loss": 0.79037184, + "learning_rate": 2.060162752653113e-06, + "loss": 0.81176794, + "num_input_tokens_seen": 180849055, + "step": 8411, + "time_per_iteration": 2.6837332248687744 + }, + { + "auxiliary_loss_clip": 0.01125313, + "auxiliary_loss_mlp": 0.01039106, + "balance_loss_clip": 1.04655755, + "balance_loss_mlp": 1.02372837, + "epoch": 0.5057568014429581, + "flos": 21323325611520.0, + "grad_norm": 1.8986612146492552, + "language_loss": 0.81808418, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.83972836, + "num_input_tokens_seen": 180867395, + "step": 8412, + "time_per_iteration": 2.615809679031372 + }, + { + "auxiliary_loss_clip": 0.01103779, + "auxiliary_loss_mlp": 0.01041145, + "balance_loss_clip": 1.04390502, + "balance_loss_mlp": 1.02739954, + "epoch": 0.505816924695626, + "flos": 17493596507520.0, + "grad_norm": 1.9029826105260268, + "language_loss": 0.80660832, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.82805753, + "num_input_tokens_seen": 180886670, + "step": 8413, + "time_per_iteration": 2.7692911624908447 + }, + { + "auxiliary_loss_clip": 0.0109162, + "auxiliary_loss_mlp": 0.00771431, + "balance_loss_clip": 1.04406643, + "balance_loss_mlp": 1.00020945, + "epoch": 0.505877047948294, + "flos": 21142084561920.0, + "grad_norm": 1.9410580169313951, + "language_loss": 0.80582374, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.82445419, + "num_input_tokens_seen": 180904645, + "step": 8414, + "time_per_iteration": 2.6970348358154297 + }, + { + "auxiliary_loss_clip": 0.01107406, + "auxiliary_loss_mlp": 0.01030257, + "balance_loss_clip": 1.03923571, + "balance_loss_mlp": 1.0169946, + "epoch": 0.5059371712009619, + "flos": 36350036887680.0, + "grad_norm": 2.0609800291463225, + "language_loss": 0.62233627, + "learning_rate": 2.058605592832528e-06, + "loss": 0.64371288, + "num_input_tokens_seen": 180922340, + "step": 8415, + "time_per_iteration": 2.7197422981262207 + }, + { + "auxiliary_loss_clip": 0.01087332, + "auxiliary_loss_mlp": 0.01032316, + "balance_loss_clip": 1.04092574, + "balance_loss_mlp": 1.01899433, + "epoch": 0.50599729445363, + "flos": 22673387220480.0, + "grad_norm": 1.6231002317718672, + "language_loss": 0.81935573, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.84055215, + "num_input_tokens_seen": 180941350, + "step": 8416, + "time_per_iteration": 2.782719612121582 + }, + { + "auxiliary_loss_clip": 0.01091272, + "auxiliary_loss_mlp": 0.0103737, + "balance_loss_clip": 1.04698849, + "balance_loss_mlp": 1.02498984, + "epoch": 0.5060574177062979, + "flos": 22747866071040.0, + "grad_norm": 1.5803053727793945, + "language_loss": 0.78981423, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.81110072, + "num_input_tokens_seen": 180960720, + "step": 8417, + "time_per_iteration": 2.7089340686798096 + }, + { + "auxiliary_loss_clip": 0.01070059, + "auxiliary_loss_mlp": 0.01039058, + "balance_loss_clip": 1.0394783, + "balance_loss_mlp": 1.02599227, + "epoch": 0.5061175409589659, + "flos": 21653201139840.0, + "grad_norm": 1.8562945560748794, + "language_loss": 0.62433213, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.64542329, + "num_input_tokens_seen": 180979725, + "step": 8418, + "time_per_iteration": 2.719282388687134 + }, + { + "auxiliary_loss_clip": 0.0109094, + "auxiliary_loss_mlp": 0.01035325, + "balance_loss_clip": 1.04258347, + "balance_loss_mlp": 1.02194929, + "epoch": 0.5061776642116339, + "flos": 21616249023360.0, + "grad_norm": 2.2787836153634724, + "language_loss": 0.77394211, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.79520482, + "num_input_tokens_seen": 180998980, + "step": 8419, + "time_per_iteration": 2.741727113723755 + }, + { + "auxiliary_loss_clip": 0.01062039, + "auxiliary_loss_mlp": 0.0103574, + "balance_loss_clip": 1.04027188, + "balance_loss_mlp": 1.02160168, + "epoch": 0.5062377874643018, + "flos": 24426294837120.0, + "grad_norm": 1.7570247471688223, + "language_loss": 0.77180004, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.79277784, + "num_input_tokens_seen": 181019165, + "step": 8420, + "time_per_iteration": 2.8240675926208496 + }, + { + "auxiliary_loss_clip": 0.01123562, + "auxiliary_loss_mlp": 0.01036164, + "balance_loss_clip": 1.04462767, + "balance_loss_mlp": 1.02188277, + "epoch": 0.5062979107169698, + "flos": 22524429519360.0, + "grad_norm": 1.730716034871051, + "language_loss": 0.77317429, + "learning_rate": 2.056269786726999e-06, + "loss": 0.79477155, + "num_input_tokens_seen": 181037110, + "step": 8421, + "time_per_iteration": 2.6797008514404297 + }, + { + "auxiliary_loss_clip": 0.01106529, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.04212284, + "balance_loss_mlp": 1.01860261, + "epoch": 0.5063580339696377, + "flos": 24571984400640.0, + "grad_norm": 1.4584078249019805, + "language_loss": 0.66635919, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.68774974, + "num_input_tokens_seen": 181057775, + "step": 8422, + "time_per_iteration": 2.80218505859375 + }, + { + "auxiliary_loss_clip": 0.01123775, + "auxiliary_loss_mlp": 0.01032917, + "balance_loss_clip": 1.04679537, + "balance_loss_mlp": 1.01939869, + "epoch": 0.5064181572223058, + "flos": 22596143022720.0, + "grad_norm": 1.7069001340883154, + "language_loss": 0.818717, + "learning_rate": 2.05549116746431e-06, + "loss": 0.84028399, + "num_input_tokens_seen": 181078260, + "step": 8423, + "time_per_iteration": 2.6722168922424316 + }, + { + "auxiliary_loss_clip": 0.01124994, + "auxiliary_loss_mlp": 0.00771759, + "balance_loss_clip": 1.04458904, + "balance_loss_mlp": 1.00021005, + "epoch": 0.5064782804749737, + "flos": 25994944661760.0, + "grad_norm": 1.7762047243227106, + "language_loss": 0.74689841, + "learning_rate": 2.055101854669237e-06, + "loss": 0.76586592, + "num_input_tokens_seen": 181098755, + "step": 8424, + "time_per_iteration": 2.657538652420044 + }, + { + "auxiliary_loss_clip": 0.01121266, + "auxiliary_loss_mlp": 0.01037955, + "balance_loss_clip": 1.04494393, + "balance_loss_mlp": 1.02427602, + "epoch": 0.5065384037276417, + "flos": 28553041503360.0, + "grad_norm": 1.7147939268792267, + "language_loss": 0.71541035, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.73700261, + "num_input_tokens_seen": 181121570, + "step": 8425, + "time_per_iteration": 2.6696951389312744 + }, + { + "auxiliary_loss_clip": 0.0108314, + "auxiliary_loss_mlp": 0.01043142, + "balance_loss_clip": 1.04042649, + "balance_loss_mlp": 1.02828813, + "epoch": 0.5065985269803096, + "flos": 22966023323520.0, + "grad_norm": 1.7834107132976578, + "language_loss": 0.7868796, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.80814242, + "num_input_tokens_seen": 181140240, + "step": 8426, + "time_per_iteration": 2.702861785888672 + }, + { + "auxiliary_loss_clip": 0.01116039, + "auxiliary_loss_mlp": 0.01039376, + "balance_loss_clip": 1.0481956, + "balance_loss_mlp": 1.0255599, + "epoch": 0.5066586502329776, + "flos": 21608563512960.0, + "grad_norm": 2.9338643206598713, + "language_loss": 0.7762264, + "learning_rate": 2.053933903806265e-06, + "loss": 0.79778051, + "num_input_tokens_seen": 181158630, + "step": 8427, + "time_per_iteration": 2.5964066982269287 + }, + { + "auxiliary_loss_clip": 0.0112123, + "auxiliary_loss_mlp": 0.01028788, + "balance_loss_clip": 1.04505837, + "balance_loss_mlp": 1.014763, + "epoch": 0.5067187734856455, + "flos": 20339912079360.0, + "grad_norm": 2.519773325925209, + "language_loss": 0.71591479, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.73741496, + "num_input_tokens_seen": 181176405, + "step": 8428, + "time_per_iteration": 2.5878183841705322 + }, + { + "auxiliary_loss_clip": 0.01105053, + "auxiliary_loss_mlp": 0.00769921, + "balance_loss_clip": 1.041857, + "balance_loss_mlp": 1.00016701, + "epoch": 0.5067788967383136, + "flos": 28841080665600.0, + "grad_norm": 1.637474951892814, + "language_loss": 0.83266222, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.85141206, + "num_input_tokens_seen": 181197595, + "step": 8429, + "time_per_iteration": 2.6528842449188232 + }, + { + "auxiliary_loss_clip": 0.01094205, + "auxiliary_loss_mlp": 0.0103555, + "balance_loss_clip": 1.04527116, + "balance_loss_mlp": 1.02068472, + "epoch": 0.5068390199909815, + "flos": 32450174478720.0, + "grad_norm": 1.986559953193462, + "language_loss": 0.73507559, + "learning_rate": 2.052765934536682e-06, + "loss": 0.75637317, + "num_input_tokens_seen": 181218560, + "step": 8430, + "time_per_iteration": 2.8031511306762695 + }, + { + "auxiliary_loss_clip": 0.01057925, + "auxiliary_loss_mlp": 0.01041942, + "balance_loss_clip": 1.03520572, + "balance_loss_mlp": 1.02702332, + "epoch": 0.5068991432436495, + "flos": 23146582014720.0, + "grad_norm": 2.0458094547910766, + "language_loss": 0.77132332, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.79232198, + "num_input_tokens_seen": 181237095, + "step": 8431, + "time_per_iteration": 2.7593939304351807 + }, + { + "auxiliary_loss_clip": 0.01108688, + "auxiliary_loss_mlp": 0.01035857, + "balance_loss_clip": 1.04256523, + "balance_loss_mlp": 1.02171338, + "epoch": 0.5069592664963174, + "flos": 19936096404480.0, + "grad_norm": 1.5904348009832192, + "language_loss": 0.72110546, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.74255085, + "num_input_tokens_seen": 181255940, + "step": 8432, + "time_per_iteration": 2.6104278564453125 + }, + { + "auxiliary_loss_clip": 0.0100252, + "auxiliary_loss_mlp": 0.01010781, + "balance_loss_clip": 1.01845694, + "balance_loss_mlp": 1.00870693, + "epoch": 0.5070193897489854, + "flos": 65793771941760.0, + "grad_norm": 0.7570764213883562, + "language_loss": 0.63648349, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.65661651, + "num_input_tokens_seen": 181316945, + "step": 8433, + "time_per_iteration": 3.395040273666382 + }, + { + "auxiliary_loss_clip": 0.01089015, + "auxiliary_loss_mlp": 0.01040915, + "balance_loss_clip": 1.04288781, + "balance_loss_mlp": 1.02685428, + "epoch": 0.5070795130016534, + "flos": 17275331514240.0, + "grad_norm": 2.2603713431070194, + "language_loss": 0.78218484, + "learning_rate": 2.051208614233681e-06, + "loss": 0.80348414, + "num_input_tokens_seen": 181335555, + "step": 8434, + "time_per_iteration": 2.705864667892456 + }, + { + "auxiliary_loss_clip": 0.01099616, + "auxiliary_loss_mlp": 0.01035206, + "balance_loss_clip": 1.04088449, + "balance_loss_mlp": 1.02169967, + "epoch": 0.5071396362543213, + "flos": 21069940095360.0, + "grad_norm": 1.6177485307205706, + "language_loss": 0.70698971, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.72833788, + "num_input_tokens_seen": 181354580, + "step": 8435, + "time_per_iteration": 2.699631929397583 + }, + { + "auxiliary_loss_clip": 0.01115814, + "auxiliary_loss_mlp": 0.01036717, + "balance_loss_clip": 1.04539943, + "balance_loss_mlp": 1.02220905, + "epoch": 0.5071997595069894, + "flos": 23144822248320.0, + "grad_norm": 1.8141877812584497, + "language_loss": 0.72254074, + "learning_rate": 2.050429942372112e-06, + "loss": 0.74406612, + "num_input_tokens_seen": 181374320, + "step": 8436, + "time_per_iteration": 2.6646859645843506 + }, + { + "auxiliary_loss_clip": 0.01124514, + "auxiliary_loss_mlp": 0.01034184, + "balance_loss_clip": 1.04597569, + "balance_loss_mlp": 1.01978946, + "epoch": 0.5072598827596573, + "flos": 22747183712640.0, + "grad_norm": 1.5423854267163515, + "language_loss": 0.83801168, + "learning_rate": 2.050040603565483e-06, + "loss": 0.85959864, + "num_input_tokens_seen": 181392190, + "step": 8437, + "time_per_iteration": 2.6614348888397217 + }, + { + "auxiliary_loss_clip": 0.01110359, + "auxiliary_loss_mlp": 0.01028112, + "balance_loss_clip": 1.04387856, + "balance_loss_mlp": 1.01448607, + "epoch": 0.5073200060123253, + "flos": 22566301799040.0, + "grad_norm": 2.7232019997829924, + "language_loss": 0.80638587, + "learning_rate": 2.049651262861309e-06, + "loss": 0.82777059, + "num_input_tokens_seen": 181413890, + "step": 8438, + "time_per_iteration": 2.6778056621551514 + }, + { + "auxiliary_loss_clip": 0.01081177, + "auxiliary_loss_mlp": 0.01037532, + "balance_loss_clip": 1.04218078, + "balance_loss_mlp": 1.02235103, + "epoch": 0.5073801292649932, + "flos": 25806341324160.0, + "grad_norm": 1.4751942737164592, + "language_loss": 0.7943362, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.81552327, + "num_input_tokens_seen": 181433240, + "step": 8439, + "time_per_iteration": 2.694603443145752 + }, + { + "auxiliary_loss_clip": 0.01088705, + "auxiliary_loss_mlp": 0.0077357, + "balance_loss_clip": 1.04178834, + "balance_loss_mlp": 1.00020123, + "epoch": 0.5074402525176612, + "flos": 25373941401600.0, + "grad_norm": 1.5360675692672114, + "language_loss": 0.71413541, + "learning_rate": 2.048872575819383e-06, + "loss": 0.7327581, + "num_input_tokens_seen": 181453535, + "step": 8440, + "time_per_iteration": 2.68709397315979 + }, + { + "auxiliary_loss_clip": 0.01096271, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.04103327, + "balance_loss_mlp": 1.0227561, + "epoch": 0.5075003757703291, + "flos": 26064431521920.0, + "grad_norm": 1.6763306182018036, + "language_loss": 0.7087847, + "learning_rate": 2.048483229511158e-06, + "loss": 0.73011339, + "num_input_tokens_seen": 181474195, + "step": 8441, + "time_per_iteration": 2.728649377822876 + }, + { + "auxiliary_loss_clip": 0.01113949, + "auxiliary_loss_mlp": 0.00771406, + "balance_loss_clip": 1.04312265, + "balance_loss_mlp": 1.00021851, + "epoch": 0.5075604990229972, + "flos": 21835447770240.0, + "grad_norm": 1.794299641086803, + "language_loss": 0.63846874, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.65732235, + "num_input_tokens_seen": 181494000, + "step": 8442, + "time_per_iteration": 4.1495561599731445 + }, + { + "auxiliary_loss_clip": 0.01065064, + "auxiliary_loss_mlp": 0.01028245, + "balance_loss_clip": 1.03900802, + "balance_loss_mlp": 1.01582956, + "epoch": 0.5076206222756651, + "flos": 31978703537280.0, + "grad_norm": 1.7729718848020288, + "language_loss": 0.7149542, + "learning_rate": 2.047704531394006e-06, + "loss": 0.73588729, + "num_input_tokens_seen": 181515955, + "step": 8443, + "time_per_iteration": 2.84781551361084 + }, + { + "auxiliary_loss_clip": 0.01033895, + "auxiliary_loss_mlp": 0.01036606, + "balance_loss_clip": 1.03034997, + "balance_loss_mlp": 1.02093554, + "epoch": 0.5076807455283331, + "flos": 36904031326080.0, + "grad_norm": 1.237062481884337, + "language_loss": 0.62134659, + "learning_rate": 2.047315179614607e-06, + "loss": 0.64205158, + "num_input_tokens_seen": 181540225, + "step": 8444, + "time_per_iteration": 3.2103631496429443 + }, + { + "auxiliary_loss_clip": 0.01086312, + "auxiliary_loss_mlp": 0.01030312, + "balance_loss_clip": 1.04043984, + "balance_loss_mlp": 1.0172112, + "epoch": 0.507740868781001, + "flos": 29862415981440.0, + "grad_norm": 1.7245082556223335, + "language_loss": 0.64173615, + "learning_rate": 2.046925826041012e-06, + "loss": 0.66290236, + "num_input_tokens_seen": 181560125, + "step": 8445, + "time_per_iteration": 4.46838903427124 + }, + { + "auxiliary_loss_clip": 0.01013224, + "auxiliary_loss_mlp": 0.01008254, + "balance_loss_clip": 1.02398801, + "balance_loss_mlp": 1.00686538, + "epoch": 0.507800992033669, + "flos": 61918974247680.0, + "grad_norm": 0.8265855466772786, + "language_loss": 0.61854541, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.63876021, + "num_input_tokens_seen": 181618830, + "step": 8446, + "time_per_iteration": 3.267681121826172 + }, + { + "auxiliary_loss_clip": 0.01080886, + "auxiliary_loss_mlp": 0.01028563, + "balance_loss_clip": 1.0391748, + "balance_loss_mlp": 1.0157063, + "epoch": 0.507861115286337, + "flos": 20700490757760.0, + "grad_norm": 1.574417237275669, + "language_loss": 0.8065623, + "learning_rate": 2.04614711357029e-06, + "loss": 0.82765681, + "num_input_tokens_seen": 181637120, + "step": 8447, + "time_per_iteration": 2.761584758758545 + }, + { + "auxiliary_loss_clip": 0.01111406, + "auxiliary_loss_mlp": 0.01031653, + "balance_loss_clip": 1.04490948, + "balance_loss_mlp": 1.01859963, + "epoch": 0.507921238539005, + "flos": 30847050576000.0, + "grad_norm": 1.8510365938740598, + "language_loss": 0.70990604, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.73133665, + "num_input_tokens_seen": 181659965, + "step": 8448, + "time_per_iteration": 4.335421085357666 + }, + { + "auxiliary_loss_clip": 0.01121931, + "auxiliary_loss_mlp": 0.00769587, + "balance_loss_clip": 1.04565167, + "balance_loss_mlp": 1.00020599, + "epoch": 0.507981361791673, + "flos": 35700197984640.0, + "grad_norm": 3.0099403095172557, + "language_loss": 0.71958399, + "learning_rate": 2.045368394099955e-06, + "loss": 0.73849922, + "num_input_tokens_seen": 181685290, + "step": 8449, + "time_per_iteration": 2.7780673503875732 + }, + { + "auxiliary_loss_clip": 0.01094628, + "auxiliary_loss_mlp": 0.0103001, + "balance_loss_clip": 1.04017317, + "balance_loss_mlp": 1.01767778, + "epoch": 0.5080414850443409, + "flos": 27161466750720.0, + "grad_norm": 1.5810099588314865, + "language_loss": 0.73045403, + "learning_rate": 2.044979031776844e-06, + "loss": 0.7517004, + "num_input_tokens_seen": 181706080, + "step": 8450, + "time_per_iteration": 2.744396448135376 + }, + { + "auxiliary_loss_clip": 0.01123333, + "auxiliary_loss_mlp": 0.01027947, + "balance_loss_clip": 1.04468369, + "balance_loss_mlp": 1.01485837, + "epoch": 0.5081016082970089, + "flos": 27085192220160.0, + "grad_norm": 1.7103931675901212, + "language_loss": 0.77190459, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.79341733, + "num_input_tokens_seen": 181724805, + "step": 8451, + "time_per_iteration": 2.683182716369629 + }, + { + "auxiliary_loss_clip": 0.01122238, + "auxiliary_loss_mlp": 0.01037138, + "balance_loss_clip": 1.04372776, + "balance_loss_mlp": 1.02413273, + "epoch": 0.5081617315496768, + "flos": 22856531690880.0, + "grad_norm": 1.9627256153454082, + "language_loss": 0.85055304, + "learning_rate": 2.044200302028559e-06, + "loss": 0.87214684, + "num_input_tokens_seen": 181743725, + "step": 8452, + "time_per_iteration": 2.684624671936035 + }, + { + "auxiliary_loss_clip": 0.01126785, + "auxiliary_loss_mlp": 0.01034895, + "balance_loss_clip": 1.04584098, + "balance_loss_mlp": 1.02078056, + "epoch": 0.5082218548023448, + "flos": 16281898087680.0, + "grad_norm": 4.065129026902181, + "language_loss": 0.77099299, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.79260981, + "num_input_tokens_seen": 181757720, + "step": 8453, + "time_per_iteration": 2.572178602218628 + }, + { + "auxiliary_loss_clip": 0.01084848, + "auxiliary_loss_mlp": 0.01032198, + "balance_loss_clip": 1.04113591, + "balance_loss_mlp": 1.02010989, + "epoch": 0.5082819780550127, + "flos": 24460768915200.0, + "grad_norm": 1.6244227223176155, + "language_loss": 0.76530403, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.78647447, + "num_input_tokens_seen": 181778545, + "step": 8454, + "time_per_iteration": 2.8153836727142334 + }, + { + "auxiliary_loss_clip": 0.01097667, + "auxiliary_loss_mlp": 0.01036941, + "balance_loss_clip": 1.03992426, + "balance_loss_mlp": 1.02275562, + "epoch": 0.5083421013076808, + "flos": 23403271582080.0, + "grad_norm": 1.5351507829324025, + "language_loss": 0.89199609, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.91334224, + "num_input_tokens_seen": 181799495, + "step": 8455, + "time_per_iteration": 2.7793357372283936 + }, + { + "auxiliary_loss_clip": 0.01106838, + "auxiliary_loss_mlp": 0.00772606, + "balance_loss_clip": 1.04346323, + "balance_loss_mlp": 1.00026703, + "epoch": 0.5084022245603487, + "flos": 23872695448320.0, + "grad_norm": 1.6166334009695327, + "language_loss": 0.62119138, + "learning_rate": 2.042642822537149e-06, + "loss": 0.63998592, + "num_input_tokens_seen": 181818400, + "step": 8456, + "time_per_iteration": 2.7200372219085693 + }, + { + "auxiliary_loss_clip": 0.01034029, + "auxiliary_loss_mlp": 0.01006279, + "balance_loss_clip": 1.01840019, + "balance_loss_mlp": 1.00490177, + "epoch": 0.5084623478130167, + "flos": 62873336655360.0, + "grad_norm": 0.8116383799523507, + "language_loss": 0.6243, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.64470303, + "num_input_tokens_seen": 181875975, + "step": 8457, + "time_per_iteration": 3.087890625 + }, + { + "auxiliary_loss_clip": 0.01113045, + "auxiliary_loss_mlp": 0.01032551, + "balance_loss_clip": 1.0439477, + "balance_loss_mlp": 1.01853776, + "epoch": 0.5085224710656846, + "flos": 22346133384960.0, + "grad_norm": 1.6206653077395385, + "language_loss": 0.67609936, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.6975553, + "num_input_tokens_seen": 181896450, + "step": 8458, + "time_per_iteration": 2.6950957775115967 + }, + { + "auxiliary_loss_clip": 0.011096, + "auxiliary_loss_mlp": 0.01034441, + "balance_loss_clip": 1.04140186, + "balance_loss_mlp": 1.01998079, + "epoch": 0.5085825943183526, + "flos": 26066263115520.0, + "grad_norm": 1.6983738136244226, + "language_loss": 0.77766174, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.79910213, + "num_input_tokens_seen": 181916770, + "step": 8459, + "time_per_iteration": 2.699784278869629 + }, + { + "auxiliary_loss_clip": 0.01127851, + "auxiliary_loss_mlp": 0.01035156, + "balance_loss_clip": 1.04686987, + "balance_loss_mlp": 1.02099431, + "epoch": 0.5086427175710206, + "flos": 17420733768960.0, + "grad_norm": 10.198892862393663, + "language_loss": 0.8050856, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.82671559, + "num_input_tokens_seen": 181932710, + "step": 8460, + "time_per_iteration": 2.632998466491699 + }, + { + "auxiliary_loss_clip": 0.01101605, + "auxiliary_loss_mlp": 0.01038577, + "balance_loss_clip": 1.04293346, + "balance_loss_mlp": 1.0255115, + "epoch": 0.5087028408236886, + "flos": 20631758083200.0, + "grad_norm": 1.5613520556763807, + "language_loss": 0.68347144, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.70487332, + "num_input_tokens_seen": 181950665, + "step": 8461, + "time_per_iteration": 2.7463462352752686 + }, + { + "auxiliary_loss_clip": 0.01118492, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.04215729, + "balance_loss_mlp": 1.01677442, + "epoch": 0.5087629640763566, + "flos": 25593822506880.0, + "grad_norm": 1.9214201788253797, + "language_loss": 0.76016432, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.7816515, + "num_input_tokens_seen": 181971270, + "step": 8462, + "time_per_iteration": 2.780043363571167 + }, + { + "auxiliary_loss_clip": 0.01081215, + "auxiliary_loss_mlp": 0.01037857, + "balance_loss_clip": 1.0401057, + "balance_loss_mlp": 1.02322388, + "epoch": 0.5088230873290245, + "flos": 13261631927040.0, + "grad_norm": 2.117801536001897, + "language_loss": 0.81441897, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.83560967, + "num_input_tokens_seen": 181988410, + "step": 8463, + "time_per_iteration": 2.7101564407348633 + }, + { + "auxiliary_loss_clip": 0.0110518, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.04148602, + "balance_loss_mlp": 1.02201426, + "epoch": 0.5088832105816925, + "flos": 20043469134720.0, + "grad_norm": 2.6576302734312733, + "language_loss": 0.76305163, + "learning_rate": 2.039527786882341e-06, + "loss": 0.78445399, + "num_input_tokens_seen": 182006530, + "step": 8464, + "time_per_iteration": 2.6081295013427734 + }, + { + "auxiliary_loss_clip": 0.01034964, + "auxiliary_loss_mlp": 0.0100043, + "balance_loss_clip": 1.01882601, + "balance_loss_mlp": 0.99929708, + "epoch": 0.5089433338343604, + "flos": 67422179018880.0, + "grad_norm": 0.6843560168430419, + "language_loss": 0.59347767, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.61383158, + "num_input_tokens_seen": 182074240, + "step": 8465, + "time_per_iteration": 3.308885097503662 + }, + { + "auxiliary_loss_clip": 0.0111949, + "auxiliary_loss_mlp": 0.01033343, + "balance_loss_clip": 1.04262543, + "balance_loss_mlp": 1.0203197, + "epoch": 0.5090034570870284, + "flos": 22710339336960.0, + "grad_norm": 2.5778248190048787, + "language_loss": 0.80206662, + "learning_rate": 2.038749012684354e-06, + "loss": 0.82359493, + "num_input_tokens_seen": 182093360, + "step": 8466, + "time_per_iteration": 2.6912481784820557 + }, + { + "auxiliary_loss_clip": 0.01107512, + "auxiliary_loss_mlp": 0.0102939, + "balance_loss_clip": 1.03987598, + "balance_loss_mlp": 1.01634204, + "epoch": 0.5090635803396963, + "flos": 20445812352000.0, + "grad_norm": 1.5056043379234754, + "language_loss": 0.78307688, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.80444586, + "num_input_tokens_seen": 182110170, + "step": 8467, + "time_per_iteration": 2.61828875541687 + }, + { + "auxiliary_loss_clip": 0.01119026, + "auxiliary_loss_mlp": 0.01034745, + "balance_loss_clip": 1.04424381, + "balance_loss_mlp": 1.02226961, + "epoch": 0.5091237035923644, + "flos": 23768878164480.0, + "grad_norm": 1.9340722959801353, + "language_loss": 0.74676347, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.76830113, + "num_input_tokens_seen": 182129570, + "step": 8468, + "time_per_iteration": 2.6233344078063965 + }, + { + "auxiliary_loss_clip": 0.01119943, + "auxiliary_loss_mlp": 0.01029058, + "balance_loss_clip": 1.04366863, + "balance_loss_mlp": 1.01651728, + "epoch": 0.5091838268450323, + "flos": 18327908684160.0, + "grad_norm": 1.884666390581893, + "language_loss": 0.77613342, + "learning_rate": 2.03758084040404e-06, + "loss": 0.7976234, + "num_input_tokens_seen": 182147565, + "step": 8469, + "time_per_iteration": 2.579117774963379 + }, + { + "auxiliary_loss_clip": 0.01107532, + "auxiliary_loss_mlp": 0.01038411, + "balance_loss_clip": 1.04521155, + "balance_loss_mlp": 1.02425504, + "epoch": 0.5092439500977003, + "flos": 29057621806080.0, + "grad_norm": 1.5718905230515574, + "language_loss": 0.69481277, + "learning_rate": 2.037191446774109e-06, + "loss": 0.71627223, + "num_input_tokens_seen": 182169695, + "step": 8470, + "time_per_iteration": 2.6437594890594482 + }, + { + "auxiliary_loss_clip": 0.01096004, + "auxiliary_loss_mlp": 0.01045395, + "balance_loss_clip": 1.04067326, + "balance_loss_mlp": 1.02993393, + "epoch": 0.5093040733503682, + "flos": 13553908894080.0, + "grad_norm": 2.534594931806725, + "language_loss": 0.73583853, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.75725245, + "num_input_tokens_seen": 182186385, + "step": 8471, + "time_per_iteration": 2.6213905811309814 + }, + { + "auxiliary_loss_clip": 0.01043282, + "auxiliary_loss_mlp": 0.00999685, + "balance_loss_clip": 1.01733398, + "balance_loss_mlp": 0.99825424, + "epoch": 0.5093641966030362, + "flos": 68906617407360.0, + "grad_norm": 0.7545989611287492, + "language_loss": 0.58065605, + "learning_rate": 2.036412655298103e-06, + "loss": 0.60108572, + "num_input_tokens_seen": 182247095, + "step": 8472, + "time_per_iteration": 3.1640241146087646 + }, + { + "auxiliary_loss_clip": 0.01069354, + "auxiliary_loss_mlp": 0.01036283, + "balance_loss_clip": 1.03772914, + "balance_loss_mlp": 1.0235815, + "epoch": 0.5094243198557042, + "flos": 21580948932480.0, + "grad_norm": 2.4665832849090994, + "language_loss": 0.68956393, + "learning_rate": 2.03602325748156e-06, + "loss": 0.71062028, + "num_input_tokens_seen": 182266380, + "step": 8473, + "time_per_iteration": 2.806593179702759 + }, + { + "auxiliary_loss_clip": 0.01097364, + "auxiliary_loss_mlp": 0.01035609, + "balance_loss_clip": 1.04190159, + "balance_loss_mlp": 1.02250814, + "epoch": 0.5094844431083722, + "flos": 28840721529600.0, + "grad_norm": 1.8851162187904098, + "language_loss": 0.85464561, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.87597537, + "num_input_tokens_seen": 182284685, + "step": 8474, + "time_per_iteration": 2.7467737197875977 + }, + { + "auxiliary_loss_clip": 0.01097916, + "auxiliary_loss_mlp": 0.01035284, + "balance_loss_clip": 1.04213905, + "balance_loss_mlp": 1.02201009, + "epoch": 0.5095445663610402, + "flos": 14976114969600.0, + "grad_norm": 2.1580860587409867, + "language_loss": 0.65563238, + "learning_rate": 2.035244457765222e-06, + "loss": 0.6769644, + "num_input_tokens_seen": 182301810, + "step": 8475, + "time_per_iteration": 2.653343439102173 + }, + { + "auxiliary_loss_clip": 0.01101978, + "auxiliary_loss_mlp": 0.01044707, + "balance_loss_clip": 1.04155195, + "balance_loss_mlp": 1.03043771, + "epoch": 0.5096046896137081, + "flos": 20777088510720.0, + "grad_norm": 2.3692417745384886, + "language_loss": 0.82122153, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.84268838, + "num_input_tokens_seen": 182320285, + "step": 8476, + "time_per_iteration": 2.735163927078247 + }, + { + "auxiliary_loss_clip": 0.01069648, + "auxiliary_loss_mlp": 0.01043833, + "balance_loss_clip": 1.03814852, + "balance_loss_mlp": 1.02698851, + "epoch": 0.5096648128663761, + "flos": 23185078416000.0, + "grad_norm": 5.724576330634238, + "language_loss": 0.80651575, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.82765061, + "num_input_tokens_seen": 182339465, + "step": 8477, + "time_per_iteration": 2.8972108364105225 + }, + { + "auxiliary_loss_clip": 0.01096525, + "auxiliary_loss_mlp": 0.01028419, + "balance_loss_clip": 1.04044962, + "balance_loss_mlp": 1.01321959, + "epoch": 0.509724936119044, + "flos": 22309432663680.0, + "grad_norm": 1.8365176357872317, + "language_loss": 0.6178633, + "learning_rate": 2.034076248204082e-06, + "loss": 0.63911271, + "num_input_tokens_seen": 182358375, + "step": 8478, + "time_per_iteration": 2.77237606048584 + }, + { + "auxiliary_loss_clip": 0.01105596, + "auxiliary_loss_mlp": 0.01039662, + "balance_loss_clip": 1.04185414, + "balance_loss_mlp": 1.02667403, + "epoch": 0.509785059371712, + "flos": 26287077974400.0, + "grad_norm": 1.8436515105252975, + "language_loss": 0.66209054, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.68354309, + "num_input_tokens_seen": 182377935, + "step": 8479, + "time_per_iteration": 2.667865514755249 + }, + { + "auxiliary_loss_clip": 0.01108822, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.0434258, + "balance_loss_mlp": 1.02382231, + "epoch": 0.50984518262438, + "flos": 22964586779520.0, + "grad_norm": 1.5755275700627138, + "language_loss": 0.69447386, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.71592748, + "num_input_tokens_seen": 182396440, + "step": 8480, + "time_per_iteration": 2.630505323410034 + }, + { + "auxiliary_loss_clip": 0.01124122, + "auxiliary_loss_mlp": 0.0103478, + "balance_loss_clip": 1.04386926, + "balance_loss_mlp": 1.02133346, + "epoch": 0.509905305877048, + "flos": 26213389223040.0, + "grad_norm": 1.7899171043779052, + "language_loss": 0.79267204, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.81426102, + "num_input_tokens_seen": 182415890, + "step": 8481, + "time_per_iteration": 2.6193926334381104 + }, + { + "auxiliary_loss_clip": 0.01104496, + "auxiliary_loss_mlp": 0.01034587, + "balance_loss_clip": 1.04157507, + "balance_loss_mlp": 1.02124786, + "epoch": 0.5099654291297159, + "flos": 20340055733760.0, + "grad_norm": 1.468990392476105, + "language_loss": 0.83301556, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.85440642, + "num_input_tokens_seen": 182434235, + "step": 8482, + "time_per_iteration": 4.149403095245361 + }, + { + "auxiliary_loss_clip": 0.01113898, + "auxiliary_loss_mlp": 0.00771464, + "balance_loss_clip": 1.04287457, + "balance_loss_mlp": 1.00025356, + "epoch": 0.5100255523823839, + "flos": 29054820545280.0, + "grad_norm": 1.9010351115161617, + "language_loss": 0.85379988, + "learning_rate": 2.032129206622238e-06, + "loss": 0.87265354, + "num_input_tokens_seen": 182454360, + "step": 8483, + "time_per_iteration": 2.7000234127044678 + }, + { + "auxiliary_loss_clip": 0.01109801, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.04242575, + "balance_loss_mlp": 1.0214082, + "epoch": 0.5100856756350518, + "flos": 22455912326400.0, + "grad_norm": 2.079288328100567, + "language_loss": 0.82931423, + "learning_rate": 2.031739794591775e-06, + "loss": 0.85075212, + "num_input_tokens_seen": 182471940, + "step": 8484, + "time_per_iteration": 4.3401288986206055 + }, + { + "auxiliary_loss_clip": 0.01095037, + "auxiliary_loss_mlp": 0.01033642, + "balance_loss_clip": 1.0400697, + "balance_loss_mlp": 1.0194087, + "epoch": 0.5101457988877198, + "flos": 19171055606400.0, + "grad_norm": 2.530206097433835, + "language_loss": 0.81594586, + "learning_rate": 2.031350381357736e-06, + "loss": 0.83723271, + "num_input_tokens_seen": 182490685, + "step": 8485, + "time_per_iteration": 2.6573400497436523 + }, + { + "auxiliary_loss_clip": 0.01092909, + "auxiliary_loss_mlp": 0.01038281, + "balance_loss_clip": 1.03726983, + "balance_loss_mlp": 1.02494788, + "epoch": 0.5102059221403878, + "flos": 14866371941760.0, + "grad_norm": 1.9374375358888782, + "language_loss": 0.74155819, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.76287007, + "num_input_tokens_seen": 182508325, + "step": 8486, + "time_per_iteration": 2.676863670349121 + }, + { + "auxiliary_loss_clip": 0.01078995, + "auxiliary_loss_mlp": 0.01037671, + "balance_loss_clip": 1.03769588, + "balance_loss_mlp": 1.0228231, + "epoch": 0.5102660453930558, + "flos": 22961103160320.0, + "grad_norm": 1.4946123985675848, + "language_loss": 0.70439661, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.72556329, + "num_input_tokens_seen": 182527020, + "step": 8487, + "time_per_iteration": 2.740612030029297 + }, + { + "auxiliary_loss_clip": 0.01099488, + "auxiliary_loss_mlp": 0.01039832, + "balance_loss_clip": 1.04223216, + "balance_loss_mlp": 1.02521729, + "epoch": 0.5103261686457238, + "flos": 23149311448320.0, + "grad_norm": 2.286550245787084, + "language_loss": 0.73022705, + "learning_rate": 2.030182134581827e-06, + "loss": 0.75162029, + "num_input_tokens_seen": 182543505, + "step": 8488, + "time_per_iteration": 4.345505714416504 + }, + { + "auxiliary_loss_clip": 0.01081446, + "auxiliary_loss_mlp": 0.00771801, + "balance_loss_clip": 1.04138601, + "balance_loss_mlp": 1.00030088, + "epoch": 0.5103862918983917, + "flos": 14319237000960.0, + "grad_norm": 1.7726796746163496, + "language_loss": 0.69465196, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.71318448, + "num_input_tokens_seen": 182562250, + "step": 8489, + "time_per_iteration": 2.7057676315307617 + }, + { + "auxiliary_loss_clip": 0.01096056, + "auxiliary_loss_mlp": 0.01035357, + "balance_loss_clip": 1.04011536, + "balance_loss_mlp": 1.02176082, + "epoch": 0.5104464151510597, + "flos": 25848536826240.0, + "grad_norm": 2.097372581248088, + "language_loss": 0.73219633, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.75351048, + "num_input_tokens_seen": 182581910, + "step": 8490, + "time_per_iteration": 2.7062344551086426 + }, + { + "auxiliary_loss_clip": 0.01093699, + "auxiliary_loss_mlp": 0.01030609, + "balance_loss_clip": 1.04015577, + "balance_loss_mlp": 1.01796126, + "epoch": 0.5105065384037276, + "flos": 21652913831040.0, + "grad_norm": 1.454492701867694, + "language_loss": 0.80228478, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.82352787, + "num_input_tokens_seen": 182601350, + "step": 8491, + "time_per_iteration": 2.670520782470703 + }, + { + "auxiliary_loss_clip": 0.01108835, + "auxiliary_loss_mlp": 0.01031094, + "balance_loss_clip": 1.04258561, + "balance_loss_mlp": 1.01813614, + "epoch": 0.5105666616563956, + "flos": 22491571553280.0, + "grad_norm": 1.8545470770344947, + "language_loss": 0.78970987, + "learning_rate": 2.028624456259728e-06, + "loss": 0.81110907, + "num_input_tokens_seen": 182619660, + "step": 8492, + "time_per_iteration": 2.681852102279663 + }, + { + "auxiliary_loss_clip": 0.01088193, + "auxiliary_loss_mlp": 0.01045644, + "balance_loss_clip": 1.04025435, + "balance_loss_mlp": 1.03187561, + "epoch": 0.5106267849090635, + "flos": 22455768672000.0, + "grad_norm": 1.9312934890574833, + "language_loss": 0.77364743, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.79498577, + "num_input_tokens_seen": 182639815, + "step": 8493, + "time_per_iteration": 2.71234393119812 + }, + { + "auxiliary_loss_clip": 0.01079322, + "auxiliary_loss_mlp": 0.01035175, + "balance_loss_clip": 1.04074192, + "balance_loss_mlp": 1.02040458, + "epoch": 0.5106869081617316, + "flos": 23547093638400.0, + "grad_norm": 1.7772442138937719, + "language_loss": 0.84122825, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.86237323, + "num_input_tokens_seen": 182659655, + "step": 8494, + "time_per_iteration": 2.737844944000244 + }, + { + "auxiliary_loss_clip": 0.0112627, + "auxiliary_loss_mlp": 0.01037758, + "balance_loss_clip": 1.04641843, + "balance_loss_mlp": 1.02502632, + "epoch": 0.5107470314143995, + "flos": 26792987080320.0, + "grad_norm": 1.9716326999087717, + "language_loss": 0.78846836, + "learning_rate": 2.027456186069326e-06, + "loss": 0.81010866, + "num_input_tokens_seen": 182677075, + "step": 8495, + "time_per_iteration": 2.5992324352264404 + }, + { + "auxiliary_loss_clip": 0.01088486, + "auxiliary_loss_mlp": 0.0103671, + "balance_loss_clip": 1.04210663, + "balance_loss_mlp": 1.02254176, + "epoch": 0.5108071546670675, + "flos": 25739691638400.0, + "grad_norm": 1.7860993635097173, + "language_loss": 0.78245926, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.80371118, + "num_input_tokens_seen": 182699625, + "step": 8496, + "time_per_iteration": 2.764511823654175 + }, + { + "auxiliary_loss_clip": 0.01107232, + "auxiliary_loss_mlp": 0.01031296, + "balance_loss_clip": 1.04186177, + "balance_loss_mlp": 1.01885021, + "epoch": 0.5108672779197354, + "flos": 18697537589760.0, + "grad_norm": 2.583960220786706, + "language_loss": 0.78615016, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.80753547, + "num_input_tokens_seen": 182717020, + "step": 8497, + "time_per_iteration": 2.614715337753296 + }, + { + "auxiliary_loss_clip": 0.01119749, + "auxiliary_loss_mlp": 0.01032774, + "balance_loss_clip": 1.04238069, + "balance_loss_mlp": 1.01958323, + "epoch": 0.5109274011724034, + "flos": 26688164215680.0, + "grad_norm": 1.8043712312754003, + "language_loss": 0.81731009, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.83883524, + "num_input_tokens_seen": 182736955, + "step": 8498, + "time_per_iteration": 2.670713186264038 + }, + { + "auxiliary_loss_clip": 0.01086895, + "auxiliary_loss_mlp": 0.00771568, + "balance_loss_clip": 1.03893542, + "balance_loss_mlp": 1.00021791, + "epoch": 0.5109875244250714, + "flos": 22784028088320.0, + "grad_norm": 1.9502410959783398, + "language_loss": 0.70963287, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.72821754, + "num_input_tokens_seen": 182757620, + "step": 8499, + "time_per_iteration": 2.6890597343444824 + }, + { + "auxiliary_loss_clip": 0.01063023, + "auxiliary_loss_mlp": 0.01039504, + "balance_loss_clip": 1.03797197, + "balance_loss_mlp": 1.0247463, + "epoch": 0.5110476476777394, + "flos": 35588515622400.0, + "grad_norm": 1.532594294583486, + "language_loss": 0.72400367, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.74502897, + "num_input_tokens_seen": 182780195, + "step": 8500, + "time_per_iteration": 2.8889389038085938 + }, + { + "auxiliary_loss_clip": 0.01113898, + "auxiliary_loss_mlp": 0.01039834, + "balance_loss_clip": 1.04150367, + "balance_loss_mlp": 1.0244801, + "epoch": 0.5111077709304074, + "flos": 19280798634240.0, + "grad_norm": 2.6334939898019867, + "language_loss": 0.62424856, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.64578593, + "num_input_tokens_seen": 182795765, + "step": 8501, + "time_per_iteration": 2.564922571182251 + }, + { + "auxiliary_loss_clip": 0.01120814, + "auxiliary_loss_mlp": 0.01040593, + "balance_loss_clip": 1.04017985, + "balance_loss_mlp": 1.0265801, + "epoch": 0.5111678941830753, + "flos": 20668207409280.0, + "grad_norm": 2.184561184824311, + "language_loss": 0.87622821, + "learning_rate": 2.024730186540907e-06, + "loss": 0.89784235, + "num_input_tokens_seen": 182813120, + "step": 8502, + "time_per_iteration": 2.6287243366241455 + }, + { + "auxiliary_loss_clip": 0.01106628, + "auxiliary_loss_mlp": 0.01038615, + "balance_loss_clip": 1.04065216, + "balance_loss_mlp": 1.02592492, + "epoch": 0.5112280174357433, + "flos": 26287903987200.0, + "grad_norm": 1.480449524900748, + "language_loss": 0.82794261, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.84939504, + "num_input_tokens_seen": 182835745, + "step": 8503, + "time_per_iteration": 2.682711124420166 + }, + { + "auxiliary_loss_clip": 0.01025632, + "auxiliary_loss_mlp": 0.01004613, + "balance_loss_clip": 1.0205853, + "balance_loss_mlp": 1.00336099, + "epoch": 0.5112881406884112, + "flos": 59474247707520.0, + "grad_norm": 0.8583626669635097, + "language_loss": 0.63898063, + "learning_rate": 2.023951320871339e-06, + "loss": 0.65928316, + "num_input_tokens_seen": 182892540, + "step": 8504, + "time_per_iteration": 3.216397523880005 + }, + { + "auxiliary_loss_clip": 0.01091882, + "auxiliary_loss_mlp": 0.00771622, + "balance_loss_clip": 1.04488444, + "balance_loss_mlp": 1.00014472, + "epoch": 0.5113482639410792, + "flos": 26468857728000.0, + "grad_norm": 1.826391287063558, + "language_loss": 0.84206301, + "learning_rate": 2.023561886666816e-06, + "loss": 0.86069804, + "num_input_tokens_seen": 182911515, + "step": 8505, + "time_per_iteration": 2.8032052516937256 + }, + { + "auxiliary_loss_clip": 0.0110904, + "auxiliary_loss_mlp": 0.01030264, + "balance_loss_clip": 1.04468179, + "balance_loss_mlp": 1.01698971, + "epoch": 0.5114083871937471, + "flos": 29895848565120.0, + "grad_norm": 1.983310033112748, + "language_loss": 0.75608075, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.77747381, + "num_input_tokens_seen": 182930860, + "step": 8506, + "time_per_iteration": 2.699448347091675 + }, + { + "auxiliary_loss_clip": 0.01122646, + "auxiliary_loss_mlp": 0.01034693, + "balance_loss_clip": 1.04428148, + "balance_loss_mlp": 1.01986337, + "epoch": 0.5114685104464152, + "flos": 24314576561280.0, + "grad_norm": 1.918965700593569, + "language_loss": 0.58023655, + "learning_rate": 2.022783015592131e-06, + "loss": 0.60180998, + "num_input_tokens_seen": 182949960, + "step": 8507, + "time_per_iteration": 2.5828280448913574 + }, + { + "auxiliary_loss_clip": 0.01114406, + "auxiliary_loss_mlp": 0.01042669, + "balance_loss_clip": 1.04659033, + "balance_loss_mlp": 1.02820277, + "epoch": 0.5115286336990831, + "flos": 17019288391680.0, + "grad_norm": 1.7197846358145388, + "language_loss": 0.85691231, + "learning_rate": 2.022393578751503e-06, + "loss": 0.87848306, + "num_input_tokens_seen": 182968085, + "step": 8508, + "time_per_iteration": 2.691185235977173 + }, + { + "auxiliary_loss_clip": 0.01090388, + "auxiliary_loss_mlp": 0.00770619, + "balance_loss_clip": 1.04480338, + "balance_loss_mlp": 1.00018072, + "epoch": 0.5115887569517511, + "flos": 23659386531840.0, + "grad_norm": 1.8624731533798382, + "language_loss": 0.72326827, + "learning_rate": 2.022004141061709e-06, + "loss": 0.74187839, + "num_input_tokens_seen": 182987275, + "step": 8509, + "time_per_iteration": 2.7239418029785156 + }, + { + "auxiliary_loss_clip": 0.01120525, + "auxiliary_loss_mlp": 0.00770526, + "balance_loss_clip": 1.04470599, + "balance_loss_mlp": 1.00009036, + "epoch": 0.511648880204419, + "flos": 16107193313280.0, + "grad_norm": 2.5868792605641477, + "language_loss": 0.76204944, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.78095996, + "num_input_tokens_seen": 183004700, + "step": 8510, + "time_per_iteration": 2.6135294437408447 + }, + { + "auxiliary_loss_clip": 0.0112199, + "auxiliary_loss_mlp": 0.01035525, + "balance_loss_clip": 1.04560411, + "balance_loss_mlp": 1.022668, + "epoch": 0.511709003457087, + "flos": 32634970974720.0, + "grad_norm": 4.709097064233808, + "language_loss": 0.70997655, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.73155165, + "num_input_tokens_seen": 183025830, + "step": 8511, + "time_per_iteration": 2.7760493755340576 + }, + { + "auxiliary_loss_clip": 0.01095679, + "auxiliary_loss_mlp": 0.01029146, + "balance_loss_clip": 1.04216874, + "balance_loss_mlp": 1.01593149, + "epoch": 0.511769126709755, + "flos": 21762082241280.0, + "grad_norm": 2.953853433531297, + "language_loss": 0.66357356, + "learning_rate": 2.020835823045001e-06, + "loss": 0.68482178, + "num_input_tokens_seen": 183045140, + "step": 8512, + "time_per_iteration": 2.723987340927124 + }, + { + "auxiliary_loss_clip": 0.01060265, + "auxiliary_loss_mlp": 0.01037084, + "balance_loss_clip": 1.0384953, + "balance_loss_mlp": 1.02158666, + "epoch": 0.511829249962423, + "flos": 23915357827200.0, + "grad_norm": 1.7575723482240548, + "language_loss": 0.67203867, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.69301212, + "num_input_tokens_seen": 183063935, + "step": 8513, + "time_per_iteration": 2.759958505630493 + }, + { + "auxiliary_loss_clip": 0.01083159, + "auxiliary_loss_mlp": 0.01036169, + "balance_loss_clip": 1.04507256, + "balance_loss_mlp": 1.02201903, + "epoch": 0.511889373215091, + "flos": 23727005884800.0, + "grad_norm": 2.3341144576485116, + "language_loss": 0.68508673, + "learning_rate": 2.0200569403921e-06, + "loss": 0.70627999, + "num_input_tokens_seen": 183084135, + "step": 8514, + "time_per_iteration": 2.7791545391082764 + }, + { + "auxiliary_loss_clip": 0.01119085, + "auxiliary_loss_mlp": 0.0102933, + "balance_loss_clip": 1.04411948, + "balance_loss_mlp": 1.01689076, + "epoch": 0.5119494964677589, + "flos": 28111519526400.0, + "grad_norm": 1.6536407135597841, + "language_loss": 0.66139281, + "learning_rate": 2.019667497917424e-06, + "loss": 0.68287694, + "num_input_tokens_seen": 183104570, + "step": 8515, + "time_per_iteration": 2.6567435264587402 + }, + { + "auxiliary_loss_clip": 0.01109629, + "auxiliary_loss_mlp": 0.01035907, + "balance_loss_clip": 1.04417586, + "balance_loss_mlp": 1.02317524, + "epoch": 0.5120096197204269, + "flos": 24973214296320.0, + "grad_norm": 1.939516836327544, + "language_loss": 0.7526269, + "learning_rate": 2.019278054696955e-06, + "loss": 0.77408224, + "num_input_tokens_seen": 183123850, + "step": 8516, + "time_per_iteration": 2.7218270301818848 + }, + { + "auxiliary_loss_clip": 0.01093123, + "auxiliary_loss_mlp": 0.01039766, + "balance_loss_clip": 1.04275799, + "balance_loss_mlp": 1.02562129, + "epoch": 0.5120697429730948, + "flos": 17968012364160.0, + "grad_norm": 2.066446678045309, + "language_loss": 0.78090644, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.80223525, + "num_input_tokens_seen": 183141725, + "step": 8517, + "time_per_iteration": 2.6922826766967773 + }, + { + "auxiliary_loss_clip": 0.01114661, + "auxiliary_loss_mlp": 0.01034987, + "balance_loss_clip": 1.0449543, + "balance_loss_mlp": 1.02086043, + "epoch": 0.5121298662257628, + "flos": 23292343405440.0, + "grad_norm": 1.7160803061965533, + "language_loss": 0.74111056, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.76260698, + "num_input_tokens_seen": 183161300, + "step": 8518, + "time_per_iteration": 2.6781773567199707 + }, + { + "auxiliary_loss_clip": 0.01107849, + "auxiliary_loss_mlp": 0.0104112, + "balance_loss_clip": 1.04497719, + "balance_loss_mlp": 1.02699947, + "epoch": 0.5121899894784308, + "flos": 17311062568320.0, + "grad_norm": 1.7790366802945887, + "language_loss": 0.78405094, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.80554068, + "num_input_tokens_seen": 183180495, + "step": 8519, + "time_per_iteration": 2.634488582611084 + }, + { + "auxiliary_loss_clip": 0.01126735, + "auxiliary_loss_mlp": 0.01036152, + "balance_loss_clip": 1.04811025, + "balance_loss_mlp": 1.02241898, + "epoch": 0.5122501127310988, + "flos": 24930085040640.0, + "grad_norm": 1.8142627745056843, + "language_loss": 0.79518384, + "learning_rate": 2.017720274652497e-06, + "loss": 0.81681275, + "num_input_tokens_seen": 183200330, + "step": 8520, + "time_per_iteration": 2.6977620124816895 + }, + { + "auxiliary_loss_clip": 0.01104965, + "auxiliary_loss_mlp": 0.01041606, + "balance_loss_clip": 1.0438292, + "balance_loss_mlp": 1.02683616, + "epoch": 0.5123102359837667, + "flos": 18442859184000.0, + "grad_norm": 2.180675544150299, + "language_loss": 0.81294155, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.83440727, + "num_input_tokens_seen": 183218230, + "step": 8521, + "time_per_iteration": 4.264198303222656 + }, + { + "auxiliary_loss_clip": 0.0111372, + "auxiliary_loss_mlp": 0.01032737, + "balance_loss_clip": 1.04381251, + "balance_loss_mlp": 1.01808071, + "epoch": 0.5123703592364347, + "flos": 26684860164480.0, + "grad_norm": 1.8350455385455566, + "language_loss": 0.68333864, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.70480323, + "num_input_tokens_seen": 183236735, + "step": 8522, + "time_per_iteration": 2.755563735961914 + }, + { + "auxiliary_loss_clip": 0.0109986, + "auxiliary_loss_mlp": 0.01043615, + "balance_loss_clip": 1.04744244, + "balance_loss_mlp": 1.02636552, + "epoch": 0.5124304824891026, + "flos": 28803948981120.0, + "grad_norm": 1.6735611690288588, + "language_loss": 0.61849087, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.6399256, + "num_input_tokens_seen": 183257550, + "step": 8523, + "time_per_iteration": 2.752614974975586 + }, + { + "auxiliary_loss_clip": 0.01088964, + "auxiliary_loss_mlp": 0.01041136, + "balance_loss_clip": 1.04488027, + "balance_loss_mlp": 1.02776718, + "epoch": 0.5124906057417706, + "flos": 21761830846080.0, + "grad_norm": 2.1631882282248966, + "language_loss": 0.7807008, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.80200177, + "num_input_tokens_seen": 183275515, + "step": 8524, + "time_per_iteration": 5.938940763473511 + }, + { + "auxiliary_loss_clip": 0.0110059, + "auxiliary_loss_mlp": 0.01035868, + "balance_loss_clip": 1.04444933, + "balance_loss_mlp": 1.02287436, + "epoch": 0.5125507289944387, + "flos": 18880538405760.0, + "grad_norm": 2.5285806743725834, + "language_loss": 0.7489953, + "learning_rate": 2.015773034588706e-06, + "loss": 0.77035987, + "num_input_tokens_seen": 183293880, + "step": 8525, + "time_per_iteration": 2.6603550910949707 + }, + { + "auxiliary_loss_clip": 0.01100341, + "auxiliary_loss_mlp": 0.01045872, + "balance_loss_clip": 1.04424882, + "balance_loss_mlp": 1.02996945, + "epoch": 0.5126108522471066, + "flos": 35627838036480.0, + "grad_norm": 1.6545403659553666, + "language_loss": 0.74193799, + "learning_rate": 2.015383584722531e-06, + "loss": 0.76340014, + "num_input_tokens_seen": 183315860, + "step": 8526, + "time_per_iteration": 2.7631187438964844 + }, + { + "auxiliary_loss_clip": 0.01117967, + "auxiliary_loss_mlp": 0.010412, + "balance_loss_clip": 1.04805541, + "balance_loss_mlp": 1.02755094, + "epoch": 0.5126709754997746, + "flos": 20190918464640.0, + "grad_norm": 1.7970307477050764, + "language_loss": 0.65624464, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.6778363, + "num_input_tokens_seen": 183335480, + "step": 8527, + "time_per_iteration": 4.185753107070923 + }, + { + "auxiliary_loss_clip": 0.01099112, + "auxiliary_loss_mlp": 0.01038782, + "balance_loss_clip": 1.04767573, + "balance_loss_mlp": 1.02663493, + "epoch": 0.5127310987524425, + "flos": 18588548747520.0, + "grad_norm": 1.4652981434759074, + "language_loss": 0.74246556, + "learning_rate": 2.014604683254908e-06, + "loss": 0.76384449, + "num_input_tokens_seen": 183354395, + "step": 8528, + "time_per_iteration": 2.647268056869507 + }, + { + "auxiliary_loss_clip": 0.01110552, + "auxiliary_loss_mlp": 0.01034843, + "balance_loss_clip": 1.04382324, + "balance_loss_mlp": 1.02143764, + "epoch": 0.5127912220051105, + "flos": 22454691264000.0, + "grad_norm": 1.6345499952693072, + "language_loss": 0.82838154, + "learning_rate": 2.014215231682995e-06, + "loss": 0.84983552, + "num_input_tokens_seen": 183372980, + "step": 8529, + "time_per_iteration": 2.6546859741210938 + }, + { + "auxiliary_loss_clip": 0.0107231, + "auxiliary_loss_mlp": 0.01034968, + "balance_loss_clip": 1.04131067, + "balance_loss_mlp": 1.02149725, + "epoch": 0.5128513452577784, + "flos": 19093703667840.0, + "grad_norm": 2.6019709601767866, + "language_loss": 0.73687661, + "learning_rate": 2.01382577957204e-06, + "loss": 0.75794935, + "num_input_tokens_seen": 183390160, + "step": 8530, + "time_per_iteration": 2.754840612411499 + }, + { + "auxiliary_loss_clip": 0.01018433, + "auxiliary_loss_mlp": 0.01003338, + "balance_loss_clip": 1.02142978, + "balance_loss_mlp": 1.00163293, + "epoch": 0.5129114685104464, + "flos": 67892285243520.0, + "grad_norm": 0.7482622882096543, + "language_loss": 0.60775113, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.62796879, + "num_input_tokens_seen": 183455280, + "step": 8531, + "time_per_iteration": 3.331425666809082 + }, + { + "auxiliary_loss_clip": 0.01096599, + "auxiliary_loss_mlp": 0.01039227, + "balance_loss_clip": 1.04599643, + "balance_loss_mlp": 1.02387309, + "epoch": 0.5129715917631144, + "flos": 20449152316800.0, + "grad_norm": 1.6723134032232012, + "language_loss": 0.76866412, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.79002237, + "num_input_tokens_seen": 183473955, + "step": 8532, + "time_per_iteration": 2.8071939945220947 + }, + { + "auxiliary_loss_clip": 0.0110043, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.0434345, + "balance_loss_mlp": 1.02273178, + "epoch": 0.5130317150157824, + "flos": 35116146840960.0, + "grad_norm": 4.28948987854823, + "language_loss": 0.67031407, + "learning_rate": 2.012657420152597e-06, + "loss": 0.69168431, + "num_input_tokens_seen": 183497195, + "step": 8533, + "time_per_iteration": 2.7799179553985596 + }, + { + "auxiliary_loss_clip": 0.01094678, + "auxiliary_loss_mlp": 0.01039401, + "balance_loss_clip": 1.04602468, + "balance_loss_mlp": 1.02452362, + "epoch": 0.5130918382684503, + "flos": 19791627903360.0, + "grad_norm": 1.9915175591272611, + "language_loss": 0.8200537, + "learning_rate": 2.01226796603315e-06, + "loss": 0.84139454, + "num_input_tokens_seen": 183513675, + "step": 8534, + "time_per_iteration": 2.6692066192626953 + }, + { + "auxiliary_loss_clip": 0.01111793, + "auxiliary_loss_mlp": 0.01038613, + "balance_loss_clip": 1.04316652, + "balance_loss_mlp": 1.02398574, + "epoch": 0.5131519615211183, + "flos": 26323096337280.0, + "grad_norm": 1.4683279633381257, + "language_loss": 0.63850307, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.66000712, + "num_input_tokens_seen": 183535165, + "step": 8535, + "time_per_iteration": 2.6881463527679443 + }, + { + "auxiliary_loss_clip": 0.01118055, + "auxiliary_loss_mlp": 0.01031488, + "balance_loss_clip": 1.04930139, + "balance_loss_mlp": 1.01707554, + "epoch": 0.5132120847737862, + "flos": 19171917532800.0, + "grad_norm": 1.558826189326605, + "language_loss": 0.69832361, + "learning_rate": 2.011489056413418e-06, + "loss": 0.71981907, + "num_input_tokens_seen": 183553780, + "step": 8536, + "time_per_iteration": 2.7181568145751953 + }, + { + "auxiliary_loss_clip": 0.01116762, + "auxiliary_loss_mlp": 0.01038725, + "balance_loss_clip": 1.04751253, + "balance_loss_mlp": 1.02378178, + "epoch": 0.5132722080264542, + "flos": 20230420446720.0, + "grad_norm": 1.9464397996960447, + "language_loss": 0.70725036, + "learning_rate": 2.011099600942669e-06, + "loss": 0.72880518, + "num_input_tokens_seen": 183572285, + "step": 8537, + "time_per_iteration": 2.6996657848358154 + }, + { + "auxiliary_loss_clip": 0.01080908, + "auxiliary_loss_mlp": 0.01034474, + "balance_loss_clip": 1.04291606, + "balance_loss_mlp": 1.02007353, + "epoch": 0.5133323312791223, + "flos": 16469459930880.0, + "grad_norm": 1.8282608051087097, + "language_loss": 0.8028723, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.82402611, + "num_input_tokens_seen": 183589330, + "step": 8538, + "time_per_iteration": 2.752685308456421 + }, + { + "auxiliary_loss_clip": 0.01113197, + "auxiliary_loss_mlp": 0.01031357, + "balance_loss_clip": 1.0443325, + "balance_loss_mlp": 1.01739144, + "epoch": 0.5133924545317902, + "flos": 26068094709120.0, + "grad_norm": 2.0083592119837403, + "language_loss": 0.78388107, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.80532658, + "num_input_tokens_seen": 183609205, + "step": 8539, + "time_per_iteration": 2.6856329441070557 + }, + { + "auxiliary_loss_clip": 0.0109867, + "auxiliary_loss_mlp": 0.01033877, + "balance_loss_clip": 1.04138374, + "balance_loss_mlp": 1.01994729, + "epoch": 0.5134525777844582, + "flos": 29131023248640.0, + "grad_norm": 1.7382927125385157, + "language_loss": 0.76111883, + "learning_rate": 2.009931232064105e-06, + "loss": 0.78244424, + "num_input_tokens_seen": 183629985, + "step": 8540, + "time_per_iteration": 2.780198574066162 + }, + { + "auxiliary_loss_clip": 0.01074682, + "auxiliary_loss_mlp": 0.01038818, + "balance_loss_clip": 1.04355264, + "balance_loss_mlp": 1.02344, + "epoch": 0.5135127010371261, + "flos": 17454776883840.0, + "grad_norm": 1.7132610384814069, + "language_loss": 0.746566, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.76770097, + "num_input_tokens_seen": 183648220, + "step": 8541, + "time_per_iteration": 2.6982674598693848 + }, + { + "auxiliary_loss_clip": 0.01060333, + "auxiliary_loss_mlp": 0.01039276, + "balance_loss_clip": 1.0412941, + "balance_loss_mlp": 1.02475083, + "epoch": 0.5135728242897941, + "flos": 21944975316480.0, + "grad_norm": 1.5289233613121331, + "language_loss": 0.70432508, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.72532117, + "num_input_tokens_seen": 183668230, + "step": 8542, + "time_per_iteration": 2.783440113067627 + }, + { + "auxiliary_loss_clip": 0.01102439, + "auxiliary_loss_mlp": 0.01029643, + "balance_loss_clip": 1.04426861, + "balance_loss_mlp": 1.01601708, + "epoch": 0.513632947542462, + "flos": 22674859678080.0, + "grad_norm": 1.886898343071389, + "language_loss": 0.79691696, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.81823772, + "num_input_tokens_seen": 183687800, + "step": 8543, + "time_per_iteration": 2.906564950942993 + }, + { + "auxiliary_loss_clip": 0.01101285, + "auxiliary_loss_mlp": 0.01044679, + "balance_loss_clip": 1.04514194, + "balance_loss_mlp": 1.03012979, + "epoch": 0.51369307079513, + "flos": 29457163762560.0, + "grad_norm": 1.7217499667212701, + "language_loss": 0.67941636, + "learning_rate": 2.008373401689299e-06, + "loss": 0.700876, + "num_input_tokens_seen": 183709025, + "step": 8544, + "time_per_iteration": 2.815377950668335 + }, + { + "auxiliary_loss_clip": 0.01086355, + "auxiliary_loss_mlp": 0.01049073, + "balance_loss_clip": 1.03878117, + "balance_loss_mlp": 1.03430903, + "epoch": 0.513753194047798, + "flos": 18989347680000.0, + "grad_norm": 2.2112374430559214, + "language_loss": 0.72265953, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.74401385, + "num_input_tokens_seen": 183725740, + "step": 8545, + "time_per_iteration": 2.7677536010742188 + }, + { + "auxiliary_loss_clip": 0.01115821, + "auxiliary_loss_mlp": 0.01045255, + "balance_loss_clip": 1.04458177, + "balance_loss_mlp": 1.03013897, + "epoch": 0.513813317300466, + "flos": 17821855923840.0, + "grad_norm": 2.431720560794894, + "language_loss": 0.82277304, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.84438378, + "num_input_tokens_seen": 183743995, + "step": 8546, + "time_per_iteration": 2.6764519214630127 + }, + { + "auxiliary_loss_clip": 0.01110159, + "auxiliary_loss_mlp": 0.01037047, + "balance_loss_clip": 1.0421015, + "balance_loss_mlp": 1.02272379, + "epoch": 0.5138734405531339, + "flos": 24061191045120.0, + "grad_norm": 1.829642419824105, + "language_loss": 0.73038638, + "learning_rate": 2.007205025522544e-06, + "loss": 0.75185841, + "num_input_tokens_seen": 183764150, + "step": 8547, + "time_per_iteration": 2.664536714553833 + }, + { + "auxiliary_loss_clip": 0.01112692, + "auxiliary_loss_mlp": 0.01048016, + "balance_loss_clip": 1.04215682, + "balance_loss_mlp": 1.03369892, + "epoch": 0.5139335638058019, + "flos": 26097253574400.0, + "grad_norm": 1.6776951969003835, + "language_loss": 0.73548347, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.75709057, + "num_input_tokens_seen": 183783280, + "step": 8548, + "time_per_iteration": 2.6639697551727295 + }, + { + "auxiliary_loss_clip": 0.01086334, + "auxiliary_loss_mlp": 0.01037281, + "balance_loss_clip": 1.03931546, + "balance_loss_mlp": 1.02296984, + "epoch": 0.5139936870584698, + "flos": 18917095472640.0, + "grad_norm": 1.6001321585074282, + "language_loss": 0.82261604, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.84385222, + "num_input_tokens_seen": 183800725, + "step": 8549, + "time_per_iteration": 2.748581886291504 + }, + { + "auxiliary_loss_clip": 0.01115178, + "auxiliary_loss_mlp": 0.01033379, + "balance_loss_clip": 1.04665935, + "balance_loss_mlp": 1.0205524, + "epoch": 0.5140538103111378, + "flos": 16144001775360.0, + "grad_norm": 1.9742432137522015, + "language_loss": 0.71977437, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.74125993, + "num_input_tokens_seen": 183818735, + "step": 8550, + "time_per_iteration": 2.651068687438965 + }, + { + "auxiliary_loss_clip": 0.01112958, + "auxiliary_loss_mlp": 0.01041915, + "balance_loss_clip": 1.04612732, + "balance_loss_mlp": 1.02725196, + "epoch": 0.5141139335638057, + "flos": 22420145358720.0, + "grad_norm": 1.8069208573649895, + "language_loss": 0.75043917, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.77198792, + "num_input_tokens_seen": 183840015, + "step": 8551, + "time_per_iteration": 2.7058589458465576 + }, + { + "auxiliary_loss_clip": 0.01093993, + "auxiliary_loss_mlp": 0.01037756, + "balance_loss_clip": 1.0459106, + "balance_loss_mlp": 1.0240587, + "epoch": 0.5141740568164738, + "flos": 27089645506560.0, + "grad_norm": 1.6630090206247619, + "language_loss": 0.69182396, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.71314144, + "num_input_tokens_seen": 183860145, + "step": 8552, + "time_per_iteration": 2.7040834426879883 + }, + { + "auxiliary_loss_clip": 0.01114038, + "auxiliary_loss_mlp": 0.01039378, + "balance_loss_clip": 1.04381299, + "balance_loss_mlp": 1.02445841, + "epoch": 0.5142341800691418, + "flos": 24973250209920.0, + "grad_norm": 2.1567314432200364, + "language_loss": 0.753088, + "learning_rate": 2.004868266210965e-06, + "loss": 0.7746222, + "num_input_tokens_seen": 183880540, + "step": 8553, + "time_per_iteration": 2.6321310997009277 + }, + { + "auxiliary_loss_clip": 0.01125852, + "auxiliary_loss_mlp": 0.0104126, + "balance_loss_clip": 1.04767513, + "balance_loss_mlp": 1.02800989, + "epoch": 0.5142943033218097, + "flos": 20704513080960.0, + "grad_norm": 1.7807872167537822, + "language_loss": 0.67740041, + "learning_rate": 2.004478805593435e-06, + "loss": 0.69907153, + "num_input_tokens_seen": 183900895, + "step": 8554, + "time_per_iteration": 2.5353291034698486 + }, + { + "auxiliary_loss_clip": 0.01118225, + "auxiliary_loss_mlp": 0.01040414, + "balance_loss_clip": 1.04483485, + "balance_loss_mlp": 1.02390337, + "epoch": 0.5143544265744777, + "flos": 22925479847040.0, + "grad_norm": 1.822401657137422, + "language_loss": 0.73321033, + "learning_rate": 2.004089344806068e-06, + "loss": 0.75479674, + "num_input_tokens_seen": 183920335, + "step": 8555, + "time_per_iteration": 2.8193295001983643 + }, + { + "auxiliary_loss_clip": 0.01089525, + "auxiliary_loss_mlp": 0.01039524, + "balance_loss_clip": 1.04645813, + "balance_loss_mlp": 1.02570128, + "epoch": 0.5144145498271456, + "flos": 15921391236480.0, + "grad_norm": 2.4707318139003327, + "language_loss": 0.74175709, + "learning_rate": 2.003699883863633e-06, + "loss": 0.76304758, + "num_input_tokens_seen": 183936220, + "step": 8556, + "time_per_iteration": 2.721573829650879 + }, + { + "auxiliary_loss_clip": 0.0109284, + "auxiliary_loss_mlp": 0.01036355, + "balance_loss_clip": 1.04400861, + "balance_loss_mlp": 1.02320015, + "epoch": 0.5144746730798136, + "flos": 19681238430720.0, + "grad_norm": 1.790105253554859, + "language_loss": 0.85782719, + "learning_rate": 2.003310422780898e-06, + "loss": 0.87911922, + "num_input_tokens_seen": 183953250, + "step": 8557, + "time_per_iteration": 2.70686674118042 + }, + { + "auxiliary_loss_clip": 0.01106764, + "auxiliary_loss_mlp": 0.01043673, + "balance_loss_clip": 1.04357624, + "balance_loss_mlp": 1.0292908, + "epoch": 0.5145347963324816, + "flos": 23914711382400.0, + "grad_norm": 1.6124493392185149, + "language_loss": 0.88770819, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.90921259, + "num_input_tokens_seen": 183973865, + "step": 8558, + "time_per_iteration": 2.7256360054016113 + }, + { + "auxiliary_loss_clip": 0.01123218, + "auxiliary_loss_mlp": 0.00770892, + "balance_loss_clip": 1.04631722, + "balance_loss_mlp": 1.00014222, + "epoch": 0.5145949195851496, + "flos": 18260002022400.0, + "grad_norm": 2.0888380287595196, + "language_loss": 0.65300936, + "learning_rate": 2.002531500253602e-06, + "loss": 0.67195046, + "num_input_tokens_seen": 183992555, + "step": 8559, + "time_per_iteration": 2.64591646194458 + }, + { + "auxiliary_loss_clip": 0.01108519, + "auxiliary_loss_mlp": 0.00771269, + "balance_loss_clip": 1.04542136, + "balance_loss_mlp": 1.00025797, + "epoch": 0.5146550428378175, + "flos": 26213425136640.0, + "grad_norm": 1.9572467781311524, + "language_loss": 0.63094109, + "learning_rate": 2.002142038838577e-06, + "loss": 0.64973897, + "num_input_tokens_seen": 184010825, + "step": 8560, + "time_per_iteration": 4.225303888320923 + }, + { + "auxiliary_loss_clip": 0.0112394, + "auxiliary_loss_mlp": 0.01031949, + "balance_loss_clip": 1.04584384, + "balance_loss_mlp": 1.01820433, + "epoch": 0.5147151660904855, + "flos": 22674177319680.0, + "grad_norm": 1.85112269234195, + "language_loss": 0.70142567, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.72298455, + "num_input_tokens_seen": 184030155, + "step": 8561, + "time_per_iteration": 2.6462759971618652 + }, + { + "auxiliary_loss_clip": 0.01099376, + "auxiliary_loss_mlp": 0.01032154, + "balance_loss_clip": 1.04134226, + "balance_loss_mlp": 1.01888585, + "epoch": 0.5147752893431534, + "flos": 24972388283520.0, + "grad_norm": 1.6885707870282478, + "language_loss": 0.66502726, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.6863426, + "num_input_tokens_seen": 184051440, + "step": 8562, + "time_per_iteration": 2.6790151596069336 + }, + { + "auxiliary_loss_clip": 0.01118509, + "auxiliary_loss_mlp": 0.01035134, + "balance_loss_clip": 1.04731929, + "balance_loss_mlp": 1.02153838, + "epoch": 0.5148354125958214, + "flos": 22744669760640.0, + "grad_norm": 1.6641105551237323, + "language_loss": 0.77625287, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.79778934, + "num_input_tokens_seen": 184070205, + "step": 8563, + "time_per_iteration": 5.86843466758728 + }, + { + "auxiliary_loss_clip": 0.01117165, + "auxiliary_loss_mlp": 0.01035106, + "balance_loss_clip": 1.04520798, + "balance_loss_mlp": 1.01931095, + "epoch": 0.5148955358484893, + "flos": 23068763199360.0, + "grad_norm": 1.8668644890701778, + "language_loss": 0.82346904, + "learning_rate": 2.0005841925139e-06, + "loss": 0.84499174, + "num_input_tokens_seen": 184087345, + "step": 8564, + "time_per_iteration": 2.6531171798706055 + }, + { + "auxiliary_loss_clip": 0.01105481, + "auxiliary_loss_mlp": 0.01035772, + "balance_loss_clip": 1.04333782, + "balance_loss_mlp": 1.02130592, + "epoch": 0.5149556591011574, + "flos": 20340127560960.0, + "grad_norm": 1.6929228826937828, + "language_loss": 0.73255026, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.75396281, + "num_input_tokens_seen": 184107110, + "step": 8565, + "time_per_iteration": 2.8100740909576416 + }, + { + "auxiliary_loss_clip": 0.0111614, + "auxiliary_loss_mlp": 0.01036767, + "balance_loss_clip": 1.04448807, + "balance_loss_mlp": 1.02056694, + "epoch": 0.5150157823538254, + "flos": 22638230784000.0, + "grad_norm": 2.0356075529568596, + "language_loss": 0.68441874, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.70594788, + "num_input_tokens_seen": 184127105, + "step": 8566, + "time_per_iteration": 4.174206972122192 + }, + { + "auxiliary_loss_clip": 0.01126685, + "auxiliary_loss_mlp": 0.00772285, + "balance_loss_clip": 1.04328656, + "balance_loss_mlp": 1.00031221, + "epoch": 0.5150759056064933, + "flos": 26067627832320.0, + "grad_norm": 1.624621701105177, + "language_loss": 0.78153682, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.80052656, + "num_input_tokens_seen": 184148060, + "step": 8567, + "time_per_iteration": 2.6405906677246094 + }, + { + "auxiliary_loss_clip": 0.01115866, + "auxiliary_loss_mlp": 0.01034427, + "balance_loss_clip": 1.0444839, + "balance_loss_mlp": 1.01929939, + "epoch": 0.5151360288591613, + "flos": 25952641418880.0, + "grad_norm": 2.181301277452511, + "language_loss": 0.79243255, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.81393552, + "num_input_tokens_seen": 184166175, + "step": 8568, + "time_per_iteration": 2.6806869506835938 + }, + { + "auxiliary_loss_clip": 0.01100678, + "auxiliary_loss_mlp": 0.01033449, + "balance_loss_clip": 1.04264474, + "balance_loss_mlp": 1.02017546, + "epoch": 0.5151961521118292, + "flos": 18507246312960.0, + "grad_norm": 2.356580017264164, + "language_loss": 0.9131906, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.93453181, + "num_input_tokens_seen": 184182600, + "step": 8569, + "time_per_iteration": 2.6493630409240723 + }, + { + "auxiliary_loss_clip": 0.01128863, + "auxiliary_loss_mlp": 0.01034527, + "balance_loss_clip": 1.04688525, + "balance_loss_mlp": 1.0198164, + "epoch": 0.5152562753644973, + "flos": 22233696837120.0, + "grad_norm": 2.0115285980006967, + "language_loss": 0.76725376, + "learning_rate": 1.998247422657674e-06, + "loss": 0.78888762, + "num_input_tokens_seen": 184202020, + "step": 8570, + "time_per_iteration": 2.6327102184295654 + }, + { + "auxiliary_loss_clip": 0.01115897, + "auxiliary_loss_mlp": 0.01044719, + "balance_loss_clip": 1.04504037, + "balance_loss_mlp": 1.02880454, + "epoch": 0.5153163986171652, + "flos": 38436555047040.0, + "grad_norm": 1.735564613465363, + "language_loss": 0.73986542, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.76147163, + "num_input_tokens_seen": 184224850, + "step": 8571, + "time_per_iteration": 2.879904270172119 + }, + { + "auxiliary_loss_clip": 0.01031454, + "auxiliary_loss_mlp": 0.01001432, + "balance_loss_clip": 1.02375364, + "balance_loss_mlp": 1.00009048, + "epoch": 0.5153765218698332, + "flos": 66384503015040.0, + "grad_norm": 0.7786581254678329, + "language_loss": 0.52855021, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.54887909, + "num_input_tokens_seen": 184288520, + "step": 8572, + "time_per_iteration": 3.2987639904022217 + }, + { + "auxiliary_loss_clip": 0.01112833, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.04641247, + "balance_loss_mlp": 1.02542353, + "epoch": 0.5154366451225011, + "flos": 24024669891840.0, + "grad_norm": 1.82770535610101, + "language_loss": 0.76185274, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.78336841, + "num_input_tokens_seen": 184308565, + "step": 8573, + "time_per_iteration": 2.6767003536224365 + }, + { + "auxiliary_loss_clip": 0.01111651, + "auxiliary_loss_mlp": 0.01028763, + "balance_loss_clip": 1.04382682, + "balance_loss_mlp": 1.01498199, + "epoch": 0.5154967683751691, + "flos": 23468843859840.0, + "grad_norm": 2.7144169534848976, + "language_loss": 0.77198601, + "learning_rate": 1.996689577219102e-06, + "loss": 0.7933901, + "num_input_tokens_seen": 184326795, + "step": 8574, + "time_per_iteration": 2.6607704162597656 + }, + { + "auxiliary_loss_clip": 0.01099994, + "auxiliary_loss_mlp": 0.01033635, + "balance_loss_clip": 1.04476404, + "balance_loss_mlp": 1.02018237, + "epoch": 0.515556891627837, + "flos": 23805650712960.0, + "grad_norm": 3.244613949266341, + "language_loss": 0.8558231, + "learning_rate": 1.996300116136367e-06, + "loss": 0.87715936, + "num_input_tokens_seen": 184345990, + "step": 8575, + "time_per_iteration": 2.6699635982513428 + }, + { + "auxiliary_loss_clip": 0.01113561, + "auxiliary_loss_mlp": 0.0103516, + "balance_loss_clip": 1.04307377, + "balance_loss_mlp": 1.02077138, + "epoch": 0.515617014880505, + "flos": 19828544106240.0, + "grad_norm": 1.6301780240264319, + "language_loss": 0.76920515, + "learning_rate": 1.995910655193932e-06, + "loss": 0.79069233, + "num_input_tokens_seen": 184366300, + "step": 8576, + "time_per_iteration": 2.7603139877319336 + }, + { + "auxiliary_loss_clip": 0.01078348, + "auxiliary_loss_mlp": 0.00773356, + "balance_loss_clip": 1.04196084, + "balance_loss_mlp": 1.00032973, + "epoch": 0.515677138133173, + "flos": 14245907385600.0, + "grad_norm": 2.480047069773859, + "language_loss": 0.76414418, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.78266126, + "num_input_tokens_seen": 184383030, + "step": 8577, + "time_per_iteration": 2.694549083709717 + }, + { + "auxiliary_loss_clip": 0.01099471, + "auxiliary_loss_mlp": 0.01044811, + "balance_loss_clip": 1.04260516, + "balance_loss_mlp": 1.0279547, + "epoch": 0.515737261385841, + "flos": 28289707920000.0, + "grad_norm": 1.7162174586848327, + "language_loss": 0.80910254, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.83054537, + "num_input_tokens_seen": 184403410, + "step": 8578, + "time_per_iteration": 2.740527391433716 + }, + { + "auxiliary_loss_clip": 0.01121615, + "auxiliary_loss_mlp": 0.01032969, + "balance_loss_clip": 1.04364657, + "balance_loss_mlp": 1.01914644, + "epoch": 0.515797384638509, + "flos": 27891925729920.0, + "grad_norm": 1.8526777225789184, + "language_loss": 0.75880611, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.780352, + "num_input_tokens_seen": 184423830, + "step": 8579, + "time_per_iteration": 2.6643004417419434 + }, + { + "auxiliary_loss_clip": 0.01087857, + "auxiliary_loss_mlp": 0.01032352, + "balance_loss_clip": 1.04332745, + "balance_loss_mlp": 1.01849377, + "epoch": 0.5158575078911769, + "flos": 23040071210880.0, + "grad_norm": 3.647152473791378, + "language_loss": 0.7862978, + "learning_rate": 1.994352813122559e-06, + "loss": 0.80749989, + "num_input_tokens_seen": 184445050, + "step": 8580, + "time_per_iteration": 2.74796986579895 + }, + { + "auxiliary_loss_clip": 0.01086006, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_clip": 1.04050803, + "balance_loss_mlp": 1.03265989, + "epoch": 0.5159176311438449, + "flos": 12641346938880.0, + "grad_norm": 2.0718752995567966, + "language_loss": 0.73151392, + "learning_rate": 1.99396335310315e-06, + "loss": 0.75286567, + "num_input_tokens_seen": 184460775, + "step": 8581, + "time_per_iteration": 2.6738648414611816 + }, + { + "auxiliary_loss_clip": 0.01114558, + "auxiliary_loss_mlp": 0.01033417, + "balance_loss_clip": 1.0463438, + "balance_loss_mlp": 1.01976788, + "epoch": 0.5159777543965128, + "flos": 15558154951680.0, + "grad_norm": 2.080206363710033, + "language_loss": 0.74150515, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.76298487, + "num_input_tokens_seen": 184477365, + "step": 8582, + "time_per_iteration": 2.649186134338379 + }, + { + "auxiliary_loss_clip": 0.01085634, + "auxiliary_loss_mlp": 0.0103519, + "balance_loss_clip": 1.04351485, + "balance_loss_mlp": 1.02202952, + "epoch": 0.5160378776491809, + "flos": 23221671396480.0, + "grad_norm": 4.912834420865202, + "language_loss": 0.65803373, + "learning_rate": 1.99318443376583e-06, + "loss": 0.67924196, + "num_input_tokens_seen": 184497045, + "step": 8583, + "time_per_iteration": 2.7025017738342285 + }, + { + "auxiliary_loss_clip": 0.0111508, + "auxiliary_loss_mlp": 0.01037055, + "balance_loss_clip": 1.04503357, + "balance_loss_mlp": 1.02199888, + "epoch": 0.5160980009018488, + "flos": 21944616180480.0, + "grad_norm": 1.4135833939266678, + "language_loss": 0.76130998, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.78283131, + "num_input_tokens_seen": 184517675, + "step": 8584, + "time_per_iteration": 2.662471294403076 + }, + { + "auxiliary_loss_clip": 0.01093144, + "auxiliary_loss_mlp": 0.01043062, + "balance_loss_clip": 1.0425117, + "balance_loss_mlp": 1.02877474, + "epoch": 0.5161581241545168, + "flos": 22784064001920.0, + "grad_norm": 2.700643227023907, + "language_loss": 0.79112214, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.81248415, + "num_input_tokens_seen": 184537745, + "step": 8585, + "time_per_iteration": 2.727789878845215 + }, + { + "auxiliary_loss_clip": 0.01105983, + "auxiliary_loss_mlp": 0.01033747, + "balance_loss_clip": 1.0444293, + "balance_loss_mlp": 1.02064013, + "epoch": 0.5162182474071847, + "flos": 19675384513920.0, + "grad_norm": 2.398879690546405, + "language_loss": 0.81236124, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.83375853, + "num_input_tokens_seen": 184553630, + "step": 8586, + "time_per_iteration": 2.6371195316314697 + }, + { + "auxiliary_loss_clip": 0.01106215, + "auxiliary_loss_mlp": 0.0103541, + "balance_loss_clip": 1.04690671, + "balance_loss_mlp": 1.02083015, + "epoch": 0.5162783706598527, + "flos": 20046198568320.0, + "grad_norm": 1.819724898525227, + "language_loss": 0.71372288, + "learning_rate": 1.991626598310701e-06, + "loss": 0.73513913, + "num_input_tokens_seen": 184573530, + "step": 8587, + "time_per_iteration": 2.7760136127471924 + }, + { + "auxiliary_loss_clip": 0.01038098, + "auxiliary_loss_mlp": 0.01008101, + "balance_loss_clip": 1.02063632, + "balance_loss_mlp": 1.00669408, + "epoch": 0.5163384939125206, + "flos": 69959553713280.0, + "grad_norm": 0.7288340121404665, + "language_loss": 0.57740283, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.59786481, + "num_input_tokens_seen": 184637875, + "step": 8588, + "time_per_iteration": 3.183241844177246 + }, + { + "auxiliary_loss_clip": 0.01101129, + "auxiliary_loss_mlp": 0.01040283, + "balance_loss_clip": 1.04456651, + "balance_loss_mlp": 1.02572727, + "epoch": 0.5163986171651886, + "flos": 17417034668160.0, + "grad_norm": 1.7775907605960104, + "language_loss": 0.75007761, + "learning_rate": 1.990847682429185e-06, + "loss": 0.77149177, + "num_input_tokens_seen": 184656125, + "step": 8589, + "time_per_iteration": 2.8228790760040283 + }, + { + "auxiliary_loss_clip": 0.01117201, + "auxiliary_loss_mlp": 0.01029876, + "balance_loss_clip": 1.04574263, + "balance_loss_mlp": 1.01678646, + "epoch": 0.5164587404178566, + "flos": 21322679166720.0, + "grad_norm": 1.76753328713407, + "language_loss": 0.67530292, + "learning_rate": 1.990458225001627e-06, + "loss": 0.69677365, + "num_input_tokens_seen": 184675920, + "step": 8590, + "time_per_iteration": 2.6443076133728027 + }, + { + "auxiliary_loss_clip": 0.0104106, + "auxiliary_loss_mlp": 0.01004207, + "balance_loss_clip": 1.02416718, + "balance_loss_mlp": 1.00274086, + "epoch": 0.5165188636705246, + "flos": 68057149691520.0, + "grad_norm": 1.576071766619913, + "language_loss": 0.55832803, + "learning_rate": 1.990068767935895e-06, + "loss": 0.57878071, + "num_input_tokens_seen": 184730520, + "step": 8591, + "time_per_iteration": 3.062364101409912 + }, + { + "auxiliary_loss_clip": 0.01096175, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.04139185, + "balance_loss_mlp": 1.01549983, + "epoch": 0.5165789869231926, + "flos": 19385657412480.0, + "grad_norm": 1.5710435869577224, + "language_loss": 0.81707442, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.83831745, + "num_input_tokens_seen": 184748340, + "step": 8592, + "time_per_iteration": 2.6631641387939453 + }, + { + "auxiliary_loss_clip": 0.01108366, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.04346967, + "balance_loss_mlp": 1.01837873, + "epoch": 0.5166391101758605, + "flos": 20960197067520.0, + "grad_norm": 2.447309188835127, + "language_loss": 0.83472121, + "learning_rate": 1.989289854948979e-06, + "loss": 0.85612202, + "num_input_tokens_seen": 184766615, + "step": 8593, + "time_per_iteration": 2.6486148834228516 + }, + { + "auxiliary_loss_clip": 0.01097046, + "auxiliary_loss_mlp": 0.01044386, + "balance_loss_clip": 1.04197097, + "balance_loss_mlp": 1.02946699, + "epoch": 0.5166992334285285, + "flos": 29462407148160.0, + "grad_norm": 2.3092045349550374, + "language_loss": 0.69423366, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.71564794, + "num_input_tokens_seen": 184788075, + "step": 8594, + "time_per_iteration": 2.7182230949401855 + }, + { + "auxiliary_loss_clip": 0.01082123, + "auxiliary_loss_mlp": 0.01030642, + "balance_loss_clip": 1.04193354, + "balance_loss_mlp": 1.01663446, + "epoch": 0.5167593566811964, + "flos": 20304360593280.0, + "grad_norm": 1.4197237581629922, + "language_loss": 0.77434355, + "learning_rate": 1.988510943586582e-06, + "loss": 0.79547119, + "num_input_tokens_seen": 184808710, + "step": 8595, + "time_per_iteration": 2.7374019622802734 + }, + { + "auxiliary_loss_clip": 0.01123588, + "auxiliary_loss_mlp": 0.01039202, + "balance_loss_clip": 1.0457046, + "balance_loss_mlp": 1.02551079, + "epoch": 0.5168194799338645, + "flos": 14611370313600.0, + "grad_norm": 1.5026096017220443, + "language_loss": 0.650635, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.67226291, + "num_input_tokens_seen": 184826475, + "step": 8596, + "time_per_iteration": 2.581263542175293 + }, + { + "auxiliary_loss_clip": 0.01083842, + "auxiliary_loss_mlp": 0.01032453, + "balance_loss_clip": 1.0427258, + "balance_loss_mlp": 1.01740873, + "epoch": 0.5168796031865324, + "flos": 25007257411200.0, + "grad_norm": 1.5566562133380693, + "language_loss": 0.75481033, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.77597326, + "num_input_tokens_seen": 184845245, + "step": 8597, + "time_per_iteration": 2.741926670074463 + }, + { + "auxiliary_loss_clip": 0.01124007, + "auxiliary_loss_mlp": 0.01026784, + "balance_loss_clip": 1.04456997, + "balance_loss_mlp": 1.01349235, + "epoch": 0.5169397264392004, + "flos": 26939969533440.0, + "grad_norm": 1.5821649734534613, + "language_loss": 0.81177652, + "learning_rate": 1.987342579847403e-06, + "loss": 0.83328438, + "num_input_tokens_seen": 184866605, + "step": 8598, + "time_per_iteration": 2.690035343170166 + }, + { + "auxiliary_loss_clip": 0.01071801, + "auxiliary_loss_mlp": 0.01046328, + "balance_loss_clip": 1.03745472, + "balance_loss_mlp": 1.03122449, + "epoch": 0.5169998496918683, + "flos": 25407804948480.0, + "grad_norm": 1.4930779887062733, + "language_loss": 0.75179017, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.77297151, + "num_input_tokens_seen": 184886945, + "step": 8599, + "time_per_iteration": 2.8392081260681152 + }, + { + "auxiliary_loss_clip": 0.01105064, + "auxiliary_loss_mlp": 0.01033083, + "balance_loss_clip": 1.04534984, + "balance_loss_mlp": 1.02013683, + "epoch": 0.5170599729445363, + "flos": 24680793674880.0, + "grad_norm": 2.7626803107212825, + "language_loss": 0.72095126, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.7423327, + "num_input_tokens_seen": 184905590, + "step": 8600, + "time_per_iteration": 4.393568515777588 + }, + { + "auxiliary_loss_clip": 0.01085277, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.03932548, + "balance_loss_mlp": 1.02074337, + "epoch": 0.5171200961972042, + "flos": 20994455664000.0, + "grad_norm": 1.381905387614244, + "language_loss": 0.73886168, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.76006198, + "num_input_tokens_seen": 184925555, + "step": 8601, + "time_per_iteration": 2.7736306190490723 + }, + { + "auxiliary_loss_clip": 0.01114158, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.04510868, + "balance_loss_mlp": 1.02620816, + "epoch": 0.5171802194498722, + "flos": 22745639427840.0, + "grad_norm": 2.1013626788591817, + "language_loss": 0.83703583, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.85858572, + "num_input_tokens_seen": 184944490, + "step": 8602, + "time_per_iteration": 4.306191444396973 + }, + { + "auxiliary_loss_clip": 0.01124659, + "auxiliary_loss_mlp": 0.01033871, + "balance_loss_clip": 1.04496753, + "balance_loss_mlp": 1.01937509, + "epoch": 0.5172403427025402, + "flos": 28176732668160.0, + "grad_norm": 1.7451034136925476, + "language_loss": 0.74647379, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.76805902, + "num_input_tokens_seen": 184963190, + "step": 8603, + "time_per_iteration": 2.73425030708313 + }, + { + "auxiliary_loss_clip": 0.01101467, + "auxiliary_loss_mlp": 0.01037433, + "balance_loss_clip": 1.04518127, + "balance_loss_mlp": 1.02431369, + "epoch": 0.5173004659552082, + "flos": 20337829090560.0, + "grad_norm": 2.1792209860390503, + "language_loss": 0.72349811, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.74488711, + "num_input_tokens_seen": 184981220, + "step": 8604, + "time_per_iteration": 2.740248441696167 + }, + { + "auxiliary_loss_clip": 0.01107237, + "auxiliary_loss_mlp": 0.01042176, + "balance_loss_clip": 1.04422593, + "balance_loss_mlp": 1.02716208, + "epoch": 0.5173605892078762, + "flos": 19063323740160.0, + "grad_norm": 1.7719196350127329, + "language_loss": 0.85052991, + "learning_rate": 1.984616415277469e-06, + "loss": 0.87202406, + "num_input_tokens_seen": 184998810, + "step": 8605, + "time_per_iteration": 4.264687538146973 + }, + { + "auxiliary_loss_clip": 0.01107777, + "auxiliary_loss_mlp": 0.01027945, + "balance_loss_clip": 1.04396403, + "balance_loss_mlp": 1.01552308, + "epoch": 0.5174207124605441, + "flos": 27995168396160.0, + "grad_norm": 1.6794634480750013, + "language_loss": 0.64467752, + "learning_rate": 1.984226965411294e-06, + "loss": 0.6660347, + "num_input_tokens_seen": 185021185, + "step": 8606, + "time_per_iteration": 2.7390646934509277 + }, + { + "auxiliary_loss_clip": 0.01096289, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.04330635, + "balance_loss_mlp": 1.01885414, + "epoch": 0.5174808357132121, + "flos": 19496657416320.0, + "grad_norm": 1.503605725156866, + "language_loss": 0.77918422, + "learning_rate": 1.983837516143234e-06, + "loss": 0.80046678, + "num_input_tokens_seen": 185038465, + "step": 8607, + "time_per_iteration": 2.718864917755127 + }, + { + "auxiliary_loss_clip": 0.01114878, + "auxiliary_loss_mlp": 0.01036994, + "balance_loss_clip": 1.04531431, + "balance_loss_mlp": 1.0226177, + "epoch": 0.51754095896588, + "flos": 22784171742720.0, + "grad_norm": 2.7158797821524585, + "language_loss": 0.72334993, + "learning_rate": 1.983448067488057e-06, + "loss": 0.74486864, + "num_input_tokens_seen": 185057340, + "step": 8608, + "time_per_iteration": 2.767817258834839 + }, + { + "auxiliary_loss_clip": 0.01119837, + "auxiliary_loss_mlp": 0.01034295, + "balance_loss_clip": 1.04469681, + "balance_loss_mlp": 1.01979923, + "epoch": 0.5176010822185481, + "flos": 22669257156480.0, + "grad_norm": 1.8609844806921267, + "language_loss": 0.8623482, + "learning_rate": 1.983058619460531e-06, + "loss": 0.88388956, + "num_input_tokens_seen": 185074935, + "step": 8609, + "time_per_iteration": 2.8063855171203613 + }, + { + "auxiliary_loss_clip": 0.01111694, + "auxiliary_loss_mlp": 0.01037765, + "balance_loss_clip": 1.04306316, + "balance_loss_mlp": 1.02484906, + "epoch": 0.517661205471216, + "flos": 23951196622080.0, + "grad_norm": 2.050130502752804, + "language_loss": 0.73473549, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.75623012, + "num_input_tokens_seen": 185095050, + "step": 8610, + "time_per_iteration": 2.740083694458008 + }, + { + "auxiliary_loss_clip": 0.01129954, + "auxiliary_loss_mlp": 0.01038598, + "balance_loss_clip": 1.04616904, + "balance_loss_mlp": 1.02353036, + "epoch": 0.517721328723884, + "flos": 15596076735360.0, + "grad_norm": 2.3590336184711926, + "language_loss": 0.67205131, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.69373685, + "num_input_tokens_seen": 185112275, + "step": 8611, + "time_per_iteration": 2.648165464401245 + }, + { + "auxiliary_loss_clip": 0.01122336, + "auxiliary_loss_mlp": 0.01039403, + "balance_loss_clip": 1.0434556, + "balance_loss_mlp": 1.02535403, + "epoch": 0.5177814519765519, + "flos": 20960197067520.0, + "grad_norm": 2.3905761842565485, + "language_loss": 0.77420157, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.79581904, + "num_input_tokens_seen": 185132165, + "step": 8612, + "time_per_iteration": 2.663339376449585 + }, + { + "auxiliary_loss_clip": 0.01114318, + "auxiliary_loss_mlp": 0.01040798, + "balance_loss_clip": 1.04297137, + "balance_loss_mlp": 1.02688015, + "epoch": 0.5178415752292199, + "flos": 17967832796160.0, + "grad_norm": 2.1474229546439174, + "language_loss": 0.8168264, + "learning_rate": 1.981500833922294e-06, + "loss": 0.83837759, + "num_input_tokens_seen": 185151025, + "step": 8613, + "time_per_iteration": 2.6589057445526123 + }, + { + "auxiliary_loss_clip": 0.01128171, + "auxiliary_loss_mlp": 0.01042961, + "balance_loss_clip": 1.04804301, + "balance_loss_mlp": 1.02832222, + "epoch": 0.5179016984818878, + "flos": 17821496787840.0, + "grad_norm": 2.274335348251239, + "language_loss": 0.66216785, + "learning_rate": 1.981111389254541e-06, + "loss": 0.6838792, + "num_input_tokens_seen": 185168455, + "step": 8614, + "time_per_iteration": 2.692133903503418 + }, + { + "auxiliary_loss_clip": 0.01100612, + "auxiliary_loss_mlp": 0.01034486, + "balance_loss_clip": 1.04462051, + "balance_loss_mlp": 1.01982355, + "epoch": 0.5179618217345558, + "flos": 17820455293440.0, + "grad_norm": 2.0015033819610055, + "language_loss": 0.8693983, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.89074928, + "num_input_tokens_seen": 185184415, + "step": 8615, + "time_per_iteration": 2.690483808517456 + }, + { + "auxiliary_loss_clip": 0.01113112, + "auxiliary_loss_mlp": 0.01044655, + "balance_loss_clip": 1.04499412, + "balance_loss_mlp": 1.03147638, + "epoch": 0.5180219449872238, + "flos": 22522131048960.0, + "grad_norm": 1.8105595259457619, + "language_loss": 0.8084923, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.83007002, + "num_input_tokens_seen": 185202910, + "step": 8616, + "time_per_iteration": 2.6410508155822754 + }, + { + "auxiliary_loss_clip": 0.01120148, + "auxiliary_loss_mlp": 0.00772211, + "balance_loss_clip": 1.04987717, + "balance_loss_mlp": 1.00035763, + "epoch": 0.5180820682398918, + "flos": 23915465568000.0, + "grad_norm": 2.1203191332986675, + "language_loss": 0.75104189, + "learning_rate": 1.9799430596079e-06, + "loss": 0.76996547, + "num_input_tokens_seen": 185223085, + "step": 8617, + "time_per_iteration": 2.6979870796203613 + }, + { + "auxiliary_loss_clip": 0.01126304, + "auxiliary_loss_mlp": 0.01042481, + "balance_loss_clip": 1.04557788, + "balance_loss_mlp": 1.02717435, + "epoch": 0.5181421914925598, + "flos": 16979930064000.0, + "grad_norm": 1.6549706674723104, + "language_loss": 0.70240247, + "learning_rate": 1.979553617893785e-06, + "loss": 0.72409028, + "num_input_tokens_seen": 185241295, + "step": 8618, + "time_per_iteration": 2.6166911125183105 + }, + { + "auxiliary_loss_clip": 0.01038523, + "auxiliary_loss_mlp": 0.01004843, + "balance_loss_clip": 1.02117562, + "balance_loss_mlp": 1.00342429, + "epoch": 0.5182023147452277, + "flos": 66059870872320.0, + "grad_norm": 0.9503620431523022, + "language_loss": 0.67223799, + "learning_rate": 1.979164176954999e-06, + "loss": 0.69267166, + "num_input_tokens_seen": 185298295, + "step": 8619, + "time_per_iteration": 3.186922550201416 + }, + { + "auxiliary_loss_clip": 0.01079843, + "auxiliary_loss_mlp": 0.01035858, + "balance_loss_clip": 1.04400134, + "balance_loss_mlp": 1.02230954, + "epoch": 0.5182624379978957, + "flos": 18187749815040.0, + "grad_norm": 1.8983764009380637, + "language_loss": 0.79863739, + "learning_rate": 1.97877473680631e-06, + "loss": 0.8197943, + "num_input_tokens_seen": 185317000, + "step": 8620, + "time_per_iteration": 2.8446528911590576 + }, + { + "auxiliary_loss_clip": 0.01060893, + "auxiliary_loss_mlp": 0.00772403, + "balance_loss_clip": 1.04089034, + "balance_loss_mlp": 1.00029039, + "epoch": 0.5183225612505636, + "flos": 14026708638720.0, + "grad_norm": 2.0819192927399586, + "language_loss": 0.82402205, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.84235501, + "num_input_tokens_seen": 185331185, + "step": 8621, + "time_per_iteration": 2.753957509994507 + }, + { + "auxiliary_loss_clip": 0.01097265, + "auxiliary_loss_mlp": 0.010405, + "balance_loss_clip": 1.03958249, + "balance_loss_mlp": 1.02750611, + "epoch": 0.5183826845032317, + "flos": 23659781581440.0, + "grad_norm": 2.428940739700658, + "language_loss": 0.65491748, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.67629516, + "num_input_tokens_seen": 185348955, + "step": 8622, + "time_per_iteration": 2.7421741485595703 + }, + { + "auxiliary_loss_clip": 0.01106105, + "auxiliary_loss_mlp": 0.01044986, + "balance_loss_clip": 1.04371572, + "balance_loss_mlp": 1.03016257, + "epoch": 0.5184428077558996, + "flos": 15888605097600.0, + "grad_norm": 2.083884784089921, + "language_loss": 0.60552382, + "learning_rate": 1.977606421248497e-06, + "loss": 0.62703472, + "num_input_tokens_seen": 185367330, + "step": 8623, + "time_per_iteration": 2.690345048904419 + }, + { + "auxiliary_loss_clip": 0.0112578, + "auxiliary_loss_mlp": 0.01032047, + "balance_loss_clip": 1.04534173, + "balance_loss_mlp": 1.01890421, + "epoch": 0.5185029310085676, + "flos": 21030833162880.0, + "grad_norm": 1.609281256747452, + "language_loss": 0.76150465, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.78308284, + "num_input_tokens_seen": 185385060, + "step": 8624, + "time_per_iteration": 2.613788366317749 + }, + { + "auxiliary_loss_clip": 0.0107795, + "auxiliary_loss_mlp": 0.01043066, + "balance_loss_clip": 1.03900456, + "balance_loss_mlp": 1.02859426, + "epoch": 0.5185630542612355, + "flos": 26542690133760.0, + "grad_norm": 2.373822325498003, + "language_loss": 0.70952767, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.73073781, + "num_input_tokens_seen": 185403745, + "step": 8625, + "time_per_iteration": 2.7548205852508545 + }, + { + "auxiliary_loss_clip": 0.01100948, + "auxiliary_loss_mlp": 0.0103515, + "balance_loss_clip": 1.04119349, + "balance_loss_mlp": 1.02260327, + "epoch": 0.5186231775139035, + "flos": 20668422890880.0, + "grad_norm": 1.9009704883002407, + "language_loss": 0.67718256, + "learning_rate": 1.976438113333184e-06, + "loss": 0.69854349, + "num_input_tokens_seen": 185422620, + "step": 8626, + "time_per_iteration": 2.731328248977661 + }, + { + "auxiliary_loss_clip": 0.0111085, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.04271841, + "balance_loss_mlp": 1.02022982, + "epoch": 0.5186833007665714, + "flos": 20885502735360.0, + "grad_norm": 1.960489278080422, + "language_loss": 0.70780122, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.72924662, + "num_input_tokens_seen": 185439380, + "step": 8627, + "time_per_iteration": 2.6464414596557617 + }, + { + "auxiliary_loss_clip": 0.011279, + "auxiliary_loss_mlp": 0.00772067, + "balance_loss_clip": 1.04576206, + "balance_loss_mlp": 1.00029826, + "epoch": 0.5187434240192395, + "flos": 20886903365760.0, + "grad_norm": 2.0333805073835007, + "language_loss": 0.7303592, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.74935889, + "num_input_tokens_seen": 185458830, + "step": 8628, + "time_per_iteration": 2.7327346801757812 + }, + { + "auxiliary_loss_clip": 0.01102356, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.04561651, + "balance_loss_mlp": 1.01927686, + "epoch": 0.5188035472719074, + "flos": 19859929614720.0, + "grad_norm": 1.6190117042724865, + "language_loss": 0.77354944, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.79489267, + "num_input_tokens_seen": 185477270, + "step": 8629, + "time_per_iteration": 2.77992582321167 + }, + { + "auxiliary_loss_clip": 0.01115143, + "auxiliary_loss_mlp": 0.01034186, + "balance_loss_clip": 1.04428935, + "balance_loss_mlp": 1.01932621, + "epoch": 0.5188636705245754, + "flos": 21138313633920.0, + "grad_norm": 2.228815370750346, + "language_loss": 0.75078702, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.77228034, + "num_input_tokens_seen": 185495795, + "step": 8630, + "time_per_iteration": 2.6749987602233887 + }, + { + "auxiliary_loss_clip": 0.01112188, + "auxiliary_loss_mlp": 0.01038971, + "balance_loss_clip": 1.04358792, + "balance_loss_mlp": 1.02446306, + "epoch": 0.5189237937772434, + "flos": 22419786222720.0, + "grad_norm": 2.002083188679526, + "language_loss": 0.80665708, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.82816863, + "num_input_tokens_seen": 185514885, + "step": 8631, + "time_per_iteration": 2.7432682514190674 + }, + { + "auxiliary_loss_clip": 0.01114617, + "auxiliary_loss_mlp": 0.01034953, + "balance_loss_clip": 1.04478788, + "balance_loss_mlp": 1.02031374, + "epoch": 0.5189839170299113, + "flos": 25446696399360.0, + "grad_norm": 1.4933919289773454, + "language_loss": 0.74756616, + "learning_rate": 1.974101522024942e-06, + "loss": 0.76906186, + "num_input_tokens_seen": 185537155, + "step": 8632, + "time_per_iteration": 2.726018190383911 + }, + { + "auxiliary_loss_clip": 0.01093075, + "auxiliary_loss_mlp": 0.01033441, + "balance_loss_clip": 1.04612803, + "balance_loss_mlp": 1.01946926, + "epoch": 0.5190440402825793, + "flos": 18587722734720.0, + "grad_norm": 1.8814471450767234, + "language_loss": 0.78911304, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.81037819, + "num_input_tokens_seen": 185555520, + "step": 8633, + "time_per_iteration": 2.715510606765747 + }, + { + "auxiliary_loss_clip": 0.0111596, + "auxiliary_loss_mlp": 0.01028973, + "balance_loss_clip": 1.04581857, + "balance_loss_mlp": 1.01619983, + "epoch": 0.5191041635352472, + "flos": 21908633731200.0, + "grad_norm": 5.606824878452593, + "language_loss": 0.80551088, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.82696015, + "num_input_tokens_seen": 185573855, + "step": 8634, + "time_per_iteration": 2.6477181911468506 + }, + { + "auxiliary_loss_clip": 0.01122619, + "auxiliary_loss_mlp": 0.0103901, + "balance_loss_clip": 1.04603028, + "balance_loss_mlp": 1.02571273, + "epoch": 0.5191642867879153, + "flos": 27527971173120.0, + "grad_norm": 1.5734156514364543, + "language_loss": 0.69467652, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.71629286, + "num_input_tokens_seen": 185595145, + "step": 8635, + "time_per_iteration": 2.713585615158081 + }, + { + "auxiliary_loss_clip": 0.01102259, + "auxiliary_loss_mlp": 0.01035772, + "balance_loss_clip": 1.0431217, + "balance_loss_mlp": 1.02210498, + "epoch": 0.5192244100405832, + "flos": 15705999331200.0, + "grad_norm": 1.6343728145872918, + "language_loss": 0.77876496, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.80014527, + "num_input_tokens_seen": 185613320, + "step": 8636, + "time_per_iteration": 2.6876139640808105 + }, + { + "auxiliary_loss_clip": 0.01127572, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.04695189, + "balance_loss_mlp": 1.01938009, + "epoch": 0.5192845332932512, + "flos": 12057080313600.0, + "grad_norm": 2.1121159964360596, + "language_loss": 0.71433318, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.73593867, + "num_input_tokens_seen": 185630730, + "step": 8637, + "time_per_iteration": 2.6093368530273438 + }, + { + "auxiliary_loss_clip": 0.01088299, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.04357982, + "balance_loss_mlp": 1.01999319, + "epoch": 0.5193446565459191, + "flos": 18953185662720.0, + "grad_norm": 2.05486546466365, + "language_loss": 0.76026344, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.78147888, + "num_input_tokens_seen": 185648515, + "step": 8638, + "time_per_iteration": 2.680696725845337 + }, + { + "auxiliary_loss_clip": 0.0109108, + "auxiliary_loss_mlp": 0.01028733, + "balance_loss_clip": 1.04291189, + "balance_loss_mlp": 1.01578116, + "epoch": 0.5194047797985871, + "flos": 20374960775040.0, + "grad_norm": 13.373516582231533, + "language_loss": 0.74382144, + "learning_rate": 1.971375543740272e-06, + "loss": 0.7650196, + "num_input_tokens_seen": 185665220, + "step": 8639, + "time_per_iteration": 4.318557500839233 + }, + { + "auxiliary_loss_clip": 0.01123361, + "auxiliary_loss_mlp": 0.01032334, + "balance_loss_clip": 1.04529893, + "balance_loss_mlp": 1.01838636, + "epoch": 0.519464903051255, + "flos": 24353001135360.0, + "grad_norm": 1.5657899745454023, + "language_loss": 0.77311909, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.79467607, + "num_input_tokens_seen": 185683750, + "step": 8640, + "time_per_iteration": 2.5864639282226562 + }, + { + "auxiliary_loss_clip": 0.01082849, + "auxiliary_loss_mlp": 0.0103216, + "balance_loss_clip": 1.04260516, + "balance_loss_mlp": 1.01930904, + "epoch": 0.519525026303923, + "flos": 14061829161600.0, + "grad_norm": 2.0170540453425714, + "language_loss": 0.66183293, + "learning_rate": 1.97059670234927e-06, + "loss": 0.68298292, + "num_input_tokens_seen": 185700625, + "step": 8641, + "time_per_iteration": 2.692979574203491 + }, + { + "auxiliary_loss_clip": 0.01123177, + "auxiliary_loss_mlp": 0.01034193, + "balance_loss_clip": 1.04594493, + "balance_loss_mlp": 1.02172363, + "epoch": 0.519585149556591, + "flos": 28835873193600.0, + "grad_norm": 1.7554954360005686, + "language_loss": 0.76535702, + "learning_rate": 1.97020728331885e-06, + "loss": 0.78693068, + "num_input_tokens_seen": 185721155, + "step": 8642, + "time_per_iteration": 5.96128249168396 + }, + { + "auxiliary_loss_clip": 0.0112288, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.04584873, + "balance_loss_mlp": 1.01806307, + "epoch": 0.519645272809259, + "flos": 25373007648000.0, + "grad_norm": 2.255175934024536, + "language_loss": 0.83165199, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.85319304, + "num_input_tokens_seen": 185740990, + "step": 8643, + "time_per_iteration": 2.81384539604187 + }, + { + "auxiliary_loss_clip": 0.01126122, + "auxiliary_loss_mlp": 0.01041822, + "balance_loss_clip": 1.04520261, + "balance_loss_mlp": 1.02785623, + "epoch": 0.519705396061927, + "flos": 25372863993600.0, + "grad_norm": 2.2020503225508645, + "language_loss": 0.7044059, + "learning_rate": 1.969428448662004e-06, + "loss": 0.72608531, + "num_input_tokens_seen": 185762235, + "step": 8644, + "time_per_iteration": 2.7107033729553223 + }, + { + "auxiliary_loss_clip": 0.01111108, + "auxiliary_loss_mlp": 0.00770711, + "balance_loss_clip": 1.04354811, + "balance_loss_mlp": 1.00015676, + "epoch": 0.5197655193145949, + "flos": 28476228268800.0, + "grad_norm": 1.5309653957313616, + "language_loss": 0.80272603, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.82154423, + "num_input_tokens_seen": 185783415, + "step": 8645, + "time_per_iteration": 4.246826171875 + }, + { + "auxiliary_loss_clip": 0.01122573, + "auxiliary_loss_mlp": 0.01033869, + "balance_loss_clip": 1.04362488, + "balance_loss_mlp": 1.02058911, + "epoch": 0.5198256425672629, + "flos": 20009138711040.0, + "grad_norm": 1.7778396167930446, + "language_loss": 0.7800498, + "learning_rate": 1.968649618642264e-06, + "loss": 0.80161417, + "num_input_tokens_seen": 185801345, + "step": 8646, + "time_per_iteration": 2.630892276763916 + }, + { + "auxiliary_loss_clip": 0.01117401, + "auxiliary_loss_mlp": 0.01035003, + "balance_loss_clip": 1.04832959, + "balance_loss_mlp": 1.02218235, + "epoch": 0.5198857658199308, + "flos": 19828867328640.0, + "grad_norm": 1.6794769864367036, + "language_loss": 0.65647638, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.67800039, + "num_input_tokens_seen": 185820815, + "step": 8647, + "time_per_iteration": 2.6543033123016357 + }, + { + "auxiliary_loss_clip": 0.01127292, + "auxiliary_loss_mlp": 0.01036653, + "balance_loss_clip": 1.04618931, + "balance_loss_mlp": 1.02208591, + "epoch": 0.5199458890725989, + "flos": 24461918150400.0, + "grad_norm": 1.7193073170603235, + "language_loss": 0.71425897, + "learning_rate": 1.967870793377763e-06, + "loss": 0.73589844, + "num_input_tokens_seen": 185841450, + "step": 8648, + "time_per_iteration": 2.6632113456726074 + }, + { + "auxiliary_loss_clip": 0.0110717, + "auxiliary_loss_mlp": 0.01035133, + "balance_loss_clip": 1.0474503, + "balance_loss_mlp": 1.02016664, + "epoch": 0.5200060123252668, + "flos": 23404779953280.0, + "grad_norm": 2.0932120653926853, + "language_loss": 0.64383608, + "learning_rate": 1.967481382565642e-06, + "loss": 0.66525912, + "num_input_tokens_seen": 185859935, + "step": 8649, + "time_per_iteration": 2.708676815032959 + }, + { + "auxiliary_loss_clip": 0.01101881, + "auxiliary_loss_mlp": 0.01035641, + "balance_loss_clip": 1.04480278, + "balance_loss_mlp": 1.02039409, + "epoch": 0.5200661355779348, + "flos": 17201355454080.0, + "grad_norm": 2.0779038173518978, + "language_loss": 0.70331943, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.72469461, + "num_input_tokens_seen": 185876795, + "step": 8650, + "time_per_iteration": 2.650996446609497 + }, + { + "auxiliary_loss_clip": 0.01123307, + "auxiliary_loss_mlp": 0.01030812, + "balance_loss_clip": 1.04483724, + "balance_loss_mlp": 1.01754415, + "epoch": 0.5201262588306027, + "flos": 18515075477760.0, + "grad_norm": 1.793577075652819, + "language_loss": 0.77560079, + "learning_rate": 1.966702564655496e-06, + "loss": 0.79714197, + "num_input_tokens_seen": 185895570, + "step": 8651, + "time_per_iteration": 2.6181790828704834 + }, + { + "auxiliary_loss_clip": 0.01068752, + "auxiliary_loss_mlp": 0.01040289, + "balance_loss_clip": 1.04241145, + "balance_loss_mlp": 1.02557862, + "epoch": 0.5201863820832707, + "flos": 18619395552000.0, + "grad_norm": 1.579276828195563, + "language_loss": 0.78716815, + "learning_rate": 1.966313157587003e-06, + "loss": 0.80825853, + "num_input_tokens_seen": 185913700, + "step": 8652, + "time_per_iteration": 2.81169056892395 + }, + { + "auxiliary_loss_clip": 0.01087589, + "auxiliary_loss_mlp": 0.0103997, + "balance_loss_clip": 1.04238617, + "balance_loss_mlp": 1.02496183, + "epoch": 0.5202465053359386, + "flos": 22857142222080.0, + "grad_norm": 2.456126746607985, + "language_loss": 0.70069832, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.7219739, + "num_input_tokens_seen": 185932460, + "step": 8653, + "time_per_iteration": 2.8110082149505615 + }, + { + "auxiliary_loss_clip": 0.01094035, + "auxiliary_loss_mlp": 0.01042704, + "balance_loss_clip": 1.04702687, + "balance_loss_mlp": 1.02864337, + "epoch": 0.5203066285886067, + "flos": 21981532383360.0, + "grad_norm": 1.546190224311193, + "language_loss": 0.78555804, + "learning_rate": 1.965534347297008e-06, + "loss": 0.80692542, + "num_input_tokens_seen": 185952030, + "step": 8654, + "time_per_iteration": 2.8240180015563965 + }, + { + "auxiliary_loss_clip": 0.01115002, + "auxiliary_loss_mlp": 0.01046231, + "balance_loss_clip": 1.04417038, + "balance_loss_mlp": 1.03130579, + "epoch": 0.5203667518412746, + "flos": 20233329448320.0, + "grad_norm": 1.7757606906195533, + "language_loss": 0.84137118, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.86298347, + "num_input_tokens_seen": 185973130, + "step": 8655, + "time_per_iteration": 2.767338752746582 + }, + { + "auxiliary_loss_clip": 0.01113773, + "auxiliary_loss_mlp": 0.01038813, + "balance_loss_clip": 1.04705739, + "balance_loss_mlp": 1.02643943, + "epoch": 0.5204268750939426, + "flos": 15705460627200.0, + "grad_norm": 2.3853440972465227, + "language_loss": 0.66374946, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.68527532, + "num_input_tokens_seen": 185990200, + "step": 8656, + "time_per_iteration": 2.6653099060058594 + }, + { + "auxiliary_loss_clip": 0.01083984, + "auxiliary_loss_mlp": 0.01043204, + "balance_loss_clip": 1.04517853, + "balance_loss_mlp": 1.02981043, + "epoch": 0.5204869983466105, + "flos": 27449469999360.0, + "grad_norm": 1.9804929730339849, + "language_loss": 0.73262924, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.75390112, + "num_input_tokens_seen": 186009880, + "step": 8657, + "time_per_iteration": 2.8447728157043457 + }, + { + "auxiliary_loss_clip": 0.01091042, + "auxiliary_loss_mlp": 0.01039275, + "balance_loss_clip": 1.0432241, + "balance_loss_mlp": 1.02489877, + "epoch": 0.5205471215992785, + "flos": 20595452411520.0, + "grad_norm": 1.769785544944644, + "language_loss": 0.71705246, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.73835564, + "num_input_tokens_seen": 186026680, + "step": 8658, + "time_per_iteration": 2.8423781394958496 + }, + { + "auxiliary_loss_clip": 0.01123437, + "auxiliary_loss_mlp": 0.01039751, + "balance_loss_clip": 1.04425454, + "balance_loss_mlp": 1.02607751, + "epoch": 0.5206072448519465, + "flos": 22127904305280.0, + "grad_norm": 1.7936056694778655, + "language_loss": 0.83181685, + "learning_rate": 1.963587344701897e-06, + "loss": 0.85344875, + "num_input_tokens_seen": 186046920, + "step": 8659, + "time_per_iteration": 2.662799596786499 + }, + { + "auxiliary_loss_clip": 0.01103478, + "auxiliary_loss_mlp": 0.01045743, + "balance_loss_clip": 1.043998, + "balance_loss_mlp": 1.02959061, + "epoch": 0.5206673681046144, + "flos": 18330422636160.0, + "grad_norm": 1.9906097398392346, + "language_loss": 0.75777173, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.77926397, + "num_input_tokens_seen": 186062090, + "step": 8660, + "time_per_iteration": 2.6635682582855225 + }, + { + "auxiliary_loss_clip": 0.01123245, + "auxiliary_loss_mlp": 0.01039579, + "balance_loss_clip": 1.04523396, + "balance_loss_mlp": 1.02638865, + "epoch": 0.5207274913572825, + "flos": 20230240878720.0, + "grad_norm": 1.836365427627734, + "language_loss": 0.77897781, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.80060601, + "num_input_tokens_seen": 186081135, + "step": 8661, + "time_per_iteration": 2.6036980152130127 + }, + { + "auxiliary_loss_clip": 0.01101785, + "auxiliary_loss_mlp": 0.0103675, + "balance_loss_clip": 1.04206395, + "balance_loss_mlp": 1.02354193, + "epoch": 0.5207876146099504, + "flos": 22127042378880.0, + "grad_norm": 1.6821546298299666, + "language_loss": 0.70456815, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.72595346, + "num_input_tokens_seen": 186099700, + "step": 8662, + "time_per_iteration": 2.6941347122192383 + }, + { + "auxiliary_loss_clip": 0.01108537, + "auxiliary_loss_mlp": 0.01034478, + "balance_loss_clip": 1.04286838, + "balance_loss_mlp": 1.01910543, + "epoch": 0.5208477378626184, + "flos": 23878908501120.0, + "grad_norm": 1.571076572398917, + "language_loss": 0.69488823, + "learning_rate": 1.962029767391098e-06, + "loss": 0.71631837, + "num_input_tokens_seen": 186119740, + "step": 8663, + "time_per_iteration": 2.648148536682129 + }, + { + "auxiliary_loss_clip": 0.01096912, + "auxiliary_loss_mlp": 0.00772823, + "balance_loss_clip": 1.04340351, + "balance_loss_mlp": 1.00029683, + "epoch": 0.5209078611152863, + "flos": 20961525870720.0, + "grad_norm": 1.508064062466455, + "language_loss": 0.77011776, + "learning_rate": 1.961640376626072e-06, + "loss": 0.78881508, + "num_input_tokens_seen": 186140645, + "step": 8664, + "time_per_iteration": 2.713656187057495 + }, + { + "auxiliary_loss_clip": 0.01099911, + "auxiliary_loss_mlp": 0.01035751, + "balance_loss_clip": 1.04555953, + "balance_loss_mlp": 1.02207136, + "epoch": 0.5209679843679543, + "flos": 20667740532480.0, + "grad_norm": 2.174055653698437, + "language_loss": 0.76443201, + "learning_rate": 1.961250987315646e-06, + "loss": 0.78578866, + "num_input_tokens_seen": 186160130, + "step": 8665, + "time_per_iteration": 2.6254820823669434 + }, + { + "auxiliary_loss_clip": 0.0111827, + "auxiliary_loss_mlp": 0.0103845, + "balance_loss_clip": 1.04986227, + "balance_loss_mlp": 1.02577186, + "epoch": 0.5210281076206222, + "flos": 20227295963520.0, + "grad_norm": 1.6491776532454103, + "language_loss": 0.72156572, + "learning_rate": 1.960861599474586e-06, + "loss": 0.74313289, + "num_input_tokens_seen": 186179485, + "step": 8666, + "time_per_iteration": 2.680417060852051 + }, + { + "auxiliary_loss_clip": 0.01108853, + "auxiliary_loss_mlp": 0.01038135, + "balance_loss_clip": 1.04408336, + "balance_loss_mlp": 1.02222097, + "epoch": 0.5210882308732903, + "flos": 16069989801600.0, + "grad_norm": 2.5838170040517583, + "language_loss": 0.68477565, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.70624554, + "num_input_tokens_seen": 186197140, + "step": 8667, + "time_per_iteration": 2.665583372116089 + }, + { + "auxiliary_loss_clip": 0.01089337, + "auxiliary_loss_mlp": 0.01039011, + "balance_loss_clip": 1.05282402, + "balance_loss_mlp": 1.02584982, + "epoch": 0.5211483541259582, + "flos": 24825298089600.0, + "grad_norm": 1.3808961063616443, + "language_loss": 0.81199509, + "learning_rate": 1.960082828259629e-06, + "loss": 0.83327854, + "num_input_tokens_seen": 186216800, + "step": 8668, + "time_per_iteration": 2.802410125732422 + }, + { + "auxiliary_loss_clip": 0.01105597, + "auxiliary_loss_mlp": 0.01031995, + "balance_loss_clip": 1.04507339, + "balance_loss_mlp": 1.01803613, + "epoch": 0.5212084773786262, + "flos": 20370651143040.0, + "grad_norm": 2.086648647266329, + "language_loss": 0.63722765, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.65860361, + "num_input_tokens_seen": 186235320, + "step": 8669, + "time_per_iteration": 2.681579113006592 + }, + { + "auxiliary_loss_clip": 0.01102666, + "auxiliary_loss_mlp": 0.00771955, + "balance_loss_clip": 1.04595864, + "balance_loss_mlp": 1.00027704, + "epoch": 0.5212686006312941, + "flos": 23145468693120.0, + "grad_norm": 1.5766402887224458, + "language_loss": 0.66502392, + "learning_rate": 1.959304063099325e-06, + "loss": 0.68377018, + "num_input_tokens_seen": 186254460, + "step": 8670, + "time_per_iteration": 2.7425742149353027 + }, + { + "auxiliary_loss_clip": 0.01085453, + "auxiliary_loss_mlp": 0.01033651, + "balance_loss_clip": 1.04303861, + "balance_loss_mlp": 1.02063334, + "epoch": 0.5213287238839621, + "flos": 27774030314880.0, + "grad_norm": 2.122031398938641, + "language_loss": 0.76534224, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.78653324, + "num_input_tokens_seen": 186269465, + "step": 8671, + "time_per_iteration": 2.7530081272125244 + }, + { + "auxiliary_loss_clip": 0.01096106, + "auxiliary_loss_mlp": 0.01041463, + "balance_loss_clip": 1.04865241, + "balance_loss_mlp": 1.02665734, + "epoch": 0.5213888471366301, + "flos": 19937676602880.0, + "grad_norm": 2.569347871916013, + "language_loss": 0.78284293, + "learning_rate": 1.958525304111796e-06, + "loss": 0.80421865, + "num_input_tokens_seen": 186288660, + "step": 8672, + "time_per_iteration": 2.7782974243164062 + }, + { + "auxiliary_loss_clip": 0.01085385, + "auxiliary_loss_mlp": 0.01032995, + "balance_loss_clip": 1.04014993, + "balance_loss_mlp": 1.02035856, + "epoch": 0.521448970389298, + "flos": 16982731324800.0, + "grad_norm": 1.8835859039826313, + "language_loss": 0.72004962, + "learning_rate": 1.958135926969736e-06, + "loss": 0.74123341, + "num_input_tokens_seen": 186305760, + "step": 8673, + "time_per_iteration": 2.7094011306762695 + }, + { + "auxiliary_loss_clip": 0.01108751, + "auxiliary_loss_mlp": 0.01034613, + "balance_loss_clip": 1.04249072, + "balance_loss_mlp": 1.02049243, + "epoch": 0.5215090936419661, + "flos": 18989706816000.0, + "grad_norm": 1.4914552209414809, + "language_loss": 0.74901187, + "learning_rate": 1.957746551415166e-06, + "loss": 0.77044559, + "num_input_tokens_seen": 186324135, + "step": 8674, + "time_per_iteration": 2.6582236289978027 + }, + { + "auxiliary_loss_clip": 0.01097767, + "auxiliary_loss_mlp": 0.0103511, + "balance_loss_clip": 1.0421474, + "balance_loss_mlp": 1.02030408, + "epoch": 0.521569216894634, + "flos": 16143427157760.0, + "grad_norm": 2.0310628766426615, + "language_loss": 0.86121237, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.88254112, + "num_input_tokens_seen": 186340205, + "step": 8675, + "time_per_iteration": 2.659674882888794 + }, + { + "auxiliary_loss_clip": 0.01022959, + "auxiliary_loss_mlp": 0.01006796, + "balance_loss_clip": 1.01756668, + "balance_loss_mlp": 1.00524664, + "epoch": 0.521629340147302, + "flos": 57579493282560.0, + "grad_norm": 0.8681331347139113, + "language_loss": 0.63129932, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.65159684, + "num_input_tokens_seen": 186396940, + "step": 8676, + "time_per_iteration": 3.205299139022827 + }, + { + "auxiliary_loss_clip": 0.01111064, + "auxiliary_loss_mlp": 0.01030098, + "balance_loss_clip": 1.04485834, + "balance_loss_mlp": 1.0172416, + "epoch": 0.5216894633999699, + "flos": 26796901662720.0, + "grad_norm": 1.5700830686566873, + "language_loss": 0.68696839, + "learning_rate": 1.956578434424046e-06, + "loss": 0.70837998, + "num_input_tokens_seen": 186418680, + "step": 8677, + "time_per_iteration": 2.7582013607025146 + }, + { + "auxiliary_loss_clip": 0.0111011, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.04261422, + "balance_loss_mlp": 1.01857519, + "epoch": 0.5217495866526379, + "flos": 26358719650560.0, + "grad_norm": 1.8246312930355708, + "language_loss": 0.65474886, + "learning_rate": 1.956189065367086e-06, + "loss": 0.67617249, + "num_input_tokens_seen": 186438265, + "step": 8678, + "time_per_iteration": 4.216279029846191 + }, + { + "auxiliary_loss_clip": 0.01101119, + "auxiliary_loss_mlp": 0.01036814, + "balance_loss_clip": 1.03927827, + "balance_loss_mlp": 1.02188301, + "epoch": 0.5218097099053058, + "flos": 23584009841280.0, + "grad_norm": 2.0476762683914287, + "language_loss": 0.67981493, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.70119429, + "num_input_tokens_seen": 186456870, + "step": 8679, + "time_per_iteration": 2.7411186695098877 + }, + { + "auxiliary_loss_clip": 0.01125585, + "auxiliary_loss_mlp": 0.01038661, + "balance_loss_clip": 1.04630351, + "balance_loss_mlp": 1.02463043, + "epoch": 0.5218698331579739, + "flos": 18077396256000.0, + "grad_norm": 1.6988813784316565, + "language_loss": 0.66861475, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.69025725, + "num_input_tokens_seen": 186476425, + "step": 8680, + "time_per_iteration": 2.656953811645508 + }, + { + "auxiliary_loss_clip": 0.0112586, + "auxiliary_loss_mlp": 0.01039387, + "balance_loss_clip": 1.04645705, + "balance_loss_mlp": 1.02533197, + "epoch": 0.5219299564106418, + "flos": 19281121856640.0, + "grad_norm": 2.024829019659845, + "language_loss": 0.83280826, + "learning_rate": 1.955020968223156e-06, + "loss": 0.85446072, + "num_input_tokens_seen": 186492555, + "step": 8681, + "time_per_iteration": 4.351206541061401 + }, + { + "auxiliary_loss_clip": 0.01098299, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.0424881, + "balance_loss_mlp": 1.02001929, + "epoch": 0.5219900796633098, + "flos": 26651355753600.0, + "grad_norm": 2.0563808347758563, + "language_loss": 0.77594543, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.79726237, + "num_input_tokens_seen": 186513190, + "step": 8682, + "time_per_iteration": 2.836205005645752 + }, + { + "auxiliary_loss_clip": 0.01084257, + "auxiliary_loss_mlp": 0.01048472, + "balance_loss_clip": 1.03948176, + "balance_loss_mlp": 1.03558517, + "epoch": 0.5220502029159777, + "flos": 34312717382400.0, + "grad_norm": 1.4694894100116993, + "language_loss": 0.68905342, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.71038067, + "num_input_tokens_seen": 186534830, + "step": 8683, + "time_per_iteration": 2.8703176975250244 + }, + { + "auxiliary_loss_clip": 0.01091474, + "auxiliary_loss_mlp": 0.01042368, + "balance_loss_clip": 1.04399586, + "balance_loss_mlp": 1.02824771, + "epoch": 0.5221103261686457, + "flos": 22156488552960.0, + "grad_norm": 1.7170989726331638, + "language_loss": 0.76116288, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.78250128, + "num_input_tokens_seen": 186554390, + "step": 8684, + "time_per_iteration": 2.8443922996520996 + }, + { + "auxiliary_loss_clip": 0.0110091, + "auxiliary_loss_mlp": 0.00771126, + "balance_loss_clip": 1.0387888, + "balance_loss_mlp": 1.00024819, + "epoch": 0.5221704494213137, + "flos": 19208402772480.0, + "grad_norm": 1.8259321745961588, + "language_loss": 0.75595027, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.7746706, + "num_input_tokens_seen": 186572360, + "step": 8685, + "time_per_iteration": 4.343646049499512 + }, + { + "auxiliary_loss_clip": 0.01101598, + "auxiliary_loss_mlp": 0.01041734, + "balance_loss_clip": 1.04539514, + "balance_loss_mlp": 1.02856123, + "epoch": 0.5222305726739817, + "flos": 19354056422400.0, + "grad_norm": 1.8098495762940472, + "language_loss": 0.80820441, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.82963777, + "num_input_tokens_seen": 186590655, + "step": 8686, + "time_per_iteration": 2.9524481296539307 + }, + { + "auxiliary_loss_clip": 0.01102372, + "auxiliary_loss_mlp": 0.01034624, + "balance_loss_clip": 1.04477715, + "balance_loss_mlp": 1.02207708, + "epoch": 0.5222906959266497, + "flos": 27814789272960.0, + "grad_norm": 1.5584733304526452, + "language_loss": 0.69955659, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.72092646, + "num_input_tokens_seen": 186610345, + "step": 8687, + "time_per_iteration": 2.8442130088806152 + }, + { + "auxiliary_loss_clip": 0.01119347, + "auxiliary_loss_mlp": 0.01033746, + "balance_loss_clip": 1.04286504, + "balance_loss_mlp": 1.02110982, + "epoch": 0.5223508191793176, + "flos": 12712988615040.0, + "grad_norm": 2.218511460216324, + "language_loss": 0.83229095, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.85382187, + "num_input_tokens_seen": 186624360, + "step": 8688, + "time_per_iteration": 2.6338348388671875 + }, + { + "auxiliary_loss_clip": 0.01111374, + "auxiliary_loss_mlp": 0.00771369, + "balance_loss_clip": 1.04469848, + "balance_loss_mlp": 1.00028682, + "epoch": 0.5224109424319856, + "flos": 15632238752640.0, + "grad_norm": 2.3403806989505744, + "language_loss": 0.73484588, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.75367332, + "num_input_tokens_seen": 186638680, + "step": 8689, + "time_per_iteration": 2.7219626903533936 + }, + { + "auxiliary_loss_clip": 0.01098413, + "auxiliary_loss_mlp": 0.01039301, + "balance_loss_clip": 1.04080057, + "balance_loss_mlp": 1.02569962, + "epoch": 0.5224710656846535, + "flos": 15742233175680.0, + "grad_norm": 1.8348188856486891, + "language_loss": 0.83713108, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.85850823, + "num_input_tokens_seen": 186655840, + "step": 8690, + "time_per_iteration": 2.7358436584472656 + }, + { + "auxiliary_loss_clip": 0.01088108, + "auxiliary_loss_mlp": 0.01042101, + "balance_loss_clip": 1.04381537, + "balance_loss_mlp": 1.0276053, + "epoch": 0.5225311889373215, + "flos": 26030998938240.0, + "grad_norm": 2.015928049267595, + "language_loss": 0.79080188, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.81210393, + "num_input_tokens_seen": 186674150, + "step": 8691, + "time_per_iteration": 2.813861131668091 + }, + { + "auxiliary_loss_clip": 0.01120671, + "auxiliary_loss_mlp": 0.01040201, + "balance_loss_clip": 1.04700625, + "balance_loss_mlp": 1.02552676, + "epoch": 0.5225913121899894, + "flos": 18369278173440.0, + "grad_norm": 2.3023499072102194, + "language_loss": 0.76491982, + "learning_rate": 1.950738079725646e-06, + "loss": 0.78652847, + "num_input_tokens_seen": 186690675, + "step": 8692, + "time_per_iteration": 2.73480224609375 + }, + { + "auxiliary_loss_clip": 0.01108877, + "auxiliary_loss_mlp": 0.01039055, + "balance_loss_clip": 1.04479527, + "balance_loss_mlp": 1.02631116, + "epoch": 0.5226514354426575, + "flos": 29273516501760.0, + "grad_norm": 1.6247734368015925, + "language_loss": 0.72325015, + "learning_rate": 1.950348737138691e-06, + "loss": 0.7447294, + "num_input_tokens_seen": 186710380, + "step": 8693, + "time_per_iteration": 2.782871723175049 + }, + { + "auxiliary_loss_clip": 0.01126187, + "auxiliary_loss_mlp": 0.01042643, + "balance_loss_clip": 1.04384446, + "balance_loss_mlp": 1.02753901, + "epoch": 0.5227115586953254, + "flos": 22853299466880.0, + "grad_norm": 7.53216872329228, + "language_loss": 0.8220976, + "learning_rate": 1.949959396434517e-06, + "loss": 0.84378588, + "num_input_tokens_seen": 186729135, + "step": 8694, + "time_per_iteration": 2.6748385429382324 + }, + { + "auxiliary_loss_clip": 0.01013741, + "auxiliary_loss_mlp": 0.01003883, + "balance_loss_clip": 1.02031374, + "balance_loss_mlp": 1.00224972, + "epoch": 0.5227716819479934, + "flos": 57474419022720.0, + "grad_norm": 0.775564151874101, + "language_loss": 0.55647832, + "learning_rate": 1.949570057627888e-06, + "loss": 0.57665455, + "num_input_tokens_seen": 186791115, + "step": 8695, + "time_per_iteration": 3.345134973526001 + }, + { + "auxiliary_loss_clip": 0.01061261, + "auxiliary_loss_mlp": 0.01041707, + "balance_loss_clip": 1.04356098, + "balance_loss_mlp": 1.0283134, + "epoch": 0.5228318052006613, + "flos": 13808264077440.0, + "grad_norm": 1.8671615474987673, + "language_loss": 0.732638, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.75366765, + "num_input_tokens_seen": 186808660, + "step": 8696, + "time_per_iteration": 2.782350540161133 + }, + { + "auxiliary_loss_clip": 0.01099328, + "auxiliary_loss_mlp": 0.01039177, + "balance_loss_clip": 1.0429219, + "balance_loss_mlp": 1.02538478, + "epoch": 0.5228919284533293, + "flos": 15596184476160.0, + "grad_norm": 1.7190001113055795, + "language_loss": 0.71068561, + "learning_rate": 1.948791385766319e-06, + "loss": 0.73207062, + "num_input_tokens_seen": 186825900, + "step": 8697, + "time_per_iteration": 2.781651735305786 + }, + { + "auxiliary_loss_clip": 0.01092255, + "auxiliary_loss_mlp": 0.01037704, + "balance_loss_clip": 1.04413819, + "balance_loss_mlp": 1.02498996, + "epoch": 0.5229520517059973, + "flos": 22491499726080.0, + "grad_norm": 1.9475868659159346, + "language_loss": 0.80332339, + "learning_rate": 1.948402052740906e-06, + "loss": 0.82462299, + "num_input_tokens_seen": 186843735, + "step": 8698, + "time_per_iteration": 2.7078070640563965 + }, + { + "auxiliary_loss_clip": 0.01110911, + "auxiliary_loss_mlp": 0.01038923, + "balance_loss_clip": 1.04286766, + "balance_loss_mlp": 1.02576292, + "epoch": 0.5230121749586653, + "flos": 22090880361600.0, + "grad_norm": 1.6510046342053804, + "language_loss": 0.74265802, + "learning_rate": 1.948012721672093e-06, + "loss": 0.7641564, + "num_input_tokens_seen": 186862440, + "step": 8699, + "time_per_iteration": 2.667205333709717 + }, + { + "auxiliary_loss_clip": 0.01113513, + "auxiliary_loss_mlp": 0.00773315, + "balance_loss_clip": 1.04171407, + "balance_loss_mlp": 1.00029182, + "epoch": 0.5230722982113333, + "flos": 22127150119680.0, + "grad_norm": 1.8535119798273105, + "language_loss": 0.73102427, + "learning_rate": 1.947623392574642e-06, + "loss": 0.74989247, + "num_input_tokens_seen": 186880940, + "step": 8700, + "time_per_iteration": 2.7250688076019287 + }, + { + "auxiliary_loss_clip": 0.01100202, + "auxiliary_loss_mlp": 0.01039746, + "balance_loss_clip": 1.04480553, + "balance_loss_mlp": 1.02510738, + "epoch": 0.5231324214640012, + "flos": 25009268572800.0, + "grad_norm": 1.8378710861613805, + "language_loss": 0.67156309, + "learning_rate": 1.947234065463318e-06, + "loss": 0.69296253, + "num_input_tokens_seen": 186900785, + "step": 8701, + "time_per_iteration": 2.830300807952881 + }, + { + "auxiliary_loss_clip": 0.0110603, + "auxiliary_loss_mlp": 0.00771586, + "balance_loss_clip": 1.04569697, + "balance_loss_mlp": 1.0002594, + "epoch": 0.5231925447166692, + "flos": 25740517651200.0, + "grad_norm": 1.7245960424067608, + "language_loss": 0.66710031, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.68587643, + "num_input_tokens_seen": 186920895, + "step": 8702, + "time_per_iteration": 2.725583791732788 + }, + { + "auxiliary_loss_clip": 0.01100659, + "auxiliary_loss_mlp": 0.01039254, + "balance_loss_clip": 1.04362679, + "balance_loss_mlp": 1.02464485, + "epoch": 0.5232526679693371, + "flos": 21433930565760.0, + "grad_norm": 1.7906940342438376, + "language_loss": 0.76647937, + "learning_rate": 1.946455417258101e-06, + "loss": 0.78787845, + "num_input_tokens_seen": 186940605, + "step": 8703, + "time_per_iteration": 2.7585973739624023 + }, + { + "auxiliary_loss_clip": 0.01117607, + "auxiliary_loss_mlp": 0.01043637, + "balance_loss_clip": 1.04529738, + "balance_loss_mlp": 1.02807403, + "epoch": 0.5233127912220051, + "flos": 35298393471360.0, + "grad_norm": 2.3077994186551036, + "language_loss": 0.76945215, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.79106462, + "num_input_tokens_seen": 186960820, + "step": 8704, + "time_per_iteration": 2.8613169193267822 + }, + { + "auxiliary_loss_clip": 0.01102832, + "auxiliary_loss_mlp": 0.0104096, + "balance_loss_clip": 1.04692268, + "balance_loss_mlp": 1.02798438, + "epoch": 0.523372914474673, + "flos": 17051320344960.0, + "grad_norm": 1.8023730932949449, + "language_loss": 0.78725791, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.80869591, + "num_input_tokens_seen": 186976240, + "step": 8705, + "time_per_iteration": 2.741025924682617 + }, + { + "auxiliary_loss_clip": 0.01106252, + "auxiliary_loss_mlp": 0.01037077, + "balance_loss_clip": 1.04467177, + "balance_loss_mlp": 1.02273059, + "epoch": 0.5234330377273411, + "flos": 18406302117120.0, + "grad_norm": 2.80572723928073, + "language_loss": 0.69824338, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.71967667, + "num_input_tokens_seen": 186992855, + "step": 8706, + "time_per_iteration": 2.6872975826263428 + }, + { + "auxiliary_loss_clip": 0.01035877, + "auxiliary_loss_mlp": 0.01013693, + "balance_loss_clip": 1.01881003, + "balance_loss_mlp": 1.01213157, + "epoch": 0.523493160980009, + "flos": 65850296970240.0, + "grad_norm": 0.6808139995313122, + "language_loss": 0.52465838, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.54515409, + "num_input_tokens_seen": 187051205, + "step": 8707, + "time_per_iteration": 3.2341713905334473 + }, + { + "auxiliary_loss_clip": 0.01098509, + "auxiliary_loss_mlp": 0.0103739, + "balance_loss_clip": 1.04139447, + "balance_loss_mlp": 1.02380002, + "epoch": 0.523553284232677, + "flos": 21872076664320.0, + "grad_norm": 1.6877057679435725, + "language_loss": 0.74618769, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.76754665, + "num_input_tokens_seen": 187070540, + "step": 8708, + "time_per_iteration": 2.8342666625976562 + }, + { + "auxiliary_loss_clip": 0.0109528, + "auxiliary_loss_mlp": 0.01031158, + "balance_loss_clip": 1.04457259, + "balance_loss_mlp": 1.01772881, + "epoch": 0.5236134074853449, + "flos": 20848191482880.0, + "grad_norm": 1.566541485414049, + "language_loss": 0.7730183, + "learning_rate": 1.944119521844849e-06, + "loss": 0.79428267, + "num_input_tokens_seen": 187089975, + "step": 8709, + "time_per_iteration": 2.708807945251465 + }, + { + "auxiliary_loss_clip": 0.01074175, + "auxiliary_loss_mlp": 0.0103878, + "balance_loss_clip": 1.03733826, + "balance_loss_mlp": 1.02211428, + "epoch": 0.5236735307380129, + "flos": 25520421064320.0, + "grad_norm": 2.041376547108184, + "language_loss": 0.83508044, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.85620999, + "num_input_tokens_seen": 187108775, + "step": 8710, + "time_per_iteration": 2.7781410217285156 + }, + { + "auxiliary_loss_clip": 0.01093974, + "auxiliary_loss_mlp": 0.01031634, + "balance_loss_clip": 1.04229414, + "balance_loss_mlp": 1.01794267, + "epoch": 0.523733653990681, + "flos": 23583112001280.0, + "grad_norm": 2.2254848949827983, + "language_loss": 0.69715381, + "learning_rate": 1.943340906834908e-06, + "loss": 0.7184099, + "num_input_tokens_seen": 187128830, + "step": 8711, + "time_per_iteration": 2.7991995811462402 + }, + { + "auxiliary_loss_clip": 0.01114283, + "auxiliary_loss_mlp": 0.01039219, + "balance_loss_clip": 1.04482269, + "balance_loss_mlp": 1.02475893, + "epoch": 0.5237937772433489, + "flos": 21106245767040.0, + "grad_norm": 2.0479693285364764, + "language_loss": 0.8319692, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.85350424, + "num_input_tokens_seen": 187149570, + "step": 8712, + "time_per_iteration": 2.6913018226623535 + }, + { + "auxiliary_loss_clip": 0.01126488, + "auxiliary_loss_mlp": 0.01042299, + "balance_loss_clip": 1.04477775, + "balance_loss_mlp": 1.02704, + "epoch": 0.5238539004960169, + "flos": 19172887200000.0, + "grad_norm": 2.12392132979159, + "language_loss": 0.69795638, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.71964419, + "num_input_tokens_seen": 187170575, + "step": 8713, + "time_per_iteration": 2.6037533283233643 + }, + { + "auxiliary_loss_clip": 0.01087813, + "auxiliary_loss_mlp": 0.01040708, + "balance_loss_clip": 1.03908944, + "balance_loss_mlp": 1.02369666, + "epoch": 0.5239140237486848, + "flos": 17888218300800.0, + "grad_norm": 2.8914750795344233, + "language_loss": 0.76703346, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.78831869, + "num_input_tokens_seen": 187187190, + "step": 8714, + "time_per_iteration": 2.717984676361084 + }, + { + "auxiliary_loss_clip": 0.01086969, + "auxiliary_loss_mlp": 0.01044306, + "balance_loss_clip": 1.0413481, + "balance_loss_mlp": 1.02729511, + "epoch": 0.5239741470013528, + "flos": 17930413802880.0, + "grad_norm": 1.9287276707329408, + "language_loss": 0.7608462, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.78215897, + "num_input_tokens_seen": 187204350, + "step": 8715, + "time_per_iteration": 2.6999671459198 + }, + { + "auxiliary_loss_clip": 0.01099192, + "auxiliary_loss_mlp": 0.01035578, + "balance_loss_clip": 1.0417552, + "balance_loss_mlp": 1.02110636, + "epoch": 0.5240342702540207, + "flos": 30993386584320.0, + "grad_norm": 2.1294970054785622, + "language_loss": 0.71165496, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.73300266, + "num_input_tokens_seen": 187225605, + "step": 8716, + "time_per_iteration": 2.744347333908081 + }, + { + "auxiliary_loss_clip": 0.01121973, + "auxiliary_loss_mlp": 0.0103854, + "balance_loss_clip": 1.04380643, + "balance_loss_mlp": 1.02563596, + "epoch": 0.5240943935066887, + "flos": 25005066681600.0, + "grad_norm": 3.2118480553546087, + "language_loss": 0.87086689, + "learning_rate": 1.941005113841926e-06, + "loss": 0.89247203, + "num_input_tokens_seen": 187241335, + "step": 8717, + "time_per_iteration": 4.158156394958496 + }, + { + "auxiliary_loss_clip": 0.01109045, + "auxiliary_loss_mlp": 0.01035747, + "balance_loss_clip": 1.0454371, + "balance_loss_mlp": 1.02164412, + "epoch": 0.5241545167593566, + "flos": 23659099223040.0, + "grad_norm": 1.880090780763199, + "language_loss": 0.61121464, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.63266253, + "num_input_tokens_seen": 187259925, + "step": 8718, + "time_per_iteration": 2.671760320663452 + }, + { + "auxiliary_loss_clip": 0.01094217, + "auxiliary_loss_mlp": 0.010389, + "balance_loss_clip": 1.04272294, + "balance_loss_mlp": 1.02387953, + "epoch": 0.5242146400120247, + "flos": 23400398494080.0, + "grad_norm": 1.8098933087704439, + "language_loss": 0.72060192, + "learning_rate": 1.940226533916872e-06, + "loss": 0.74193311, + "num_input_tokens_seen": 187279035, + "step": 8719, + "time_per_iteration": 2.815864324569702 + }, + { + "auxiliary_loss_clip": 0.01109147, + "auxiliary_loss_mlp": 0.01029429, + "balance_loss_clip": 1.04305363, + "balance_loss_mlp": 1.01676893, + "epoch": 0.5242747632646926, + "flos": 17749065012480.0, + "grad_norm": 1.9600898858885738, + "language_loss": 0.73258477, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.7539705, + "num_input_tokens_seen": 187297555, + "step": 8720, + "time_per_iteration": 4.34027624130249 + }, + { + "auxiliary_loss_clip": 0.01110975, + "auxiliary_loss_mlp": 0.01037749, + "balance_loss_clip": 1.042588, + "balance_loss_mlp": 1.02323568, + "epoch": 0.5243348865173606, + "flos": 32597731549440.0, + "grad_norm": 1.7136870064395262, + "language_loss": 0.7059021, + "learning_rate": 1.939447963058281e-06, + "loss": 0.72738934, + "num_input_tokens_seen": 187320265, + "step": 8721, + "time_per_iteration": 4.457958698272705 + }, + { + "auxiliary_loss_clip": 0.01064422, + "auxiliary_loss_mlp": 0.0103891, + "balance_loss_clip": 1.03628516, + "balance_loss_mlp": 1.02399719, + "epoch": 0.5243950097700285, + "flos": 25484115392640.0, + "grad_norm": 1.8741175153878353, + "language_loss": 0.86506796, + "learning_rate": 1.939058681065813e-06, + "loss": 0.88610125, + "num_input_tokens_seen": 187338045, + "step": 8722, + "time_per_iteration": 2.851713180541992 + }, + { + "auxiliary_loss_clip": 0.01122948, + "auxiliary_loss_mlp": 0.01033308, + "balance_loss_clip": 1.0449574, + "balance_loss_mlp": 1.01830578, + "epoch": 0.5244551330226965, + "flos": 15268391936640.0, + "grad_norm": 1.8614764349338224, + "language_loss": 0.79853708, + "learning_rate": 1.938669401384247e-06, + "loss": 0.82009959, + "num_input_tokens_seen": 187356040, + "step": 8723, + "time_per_iteration": 2.567403554916382 + }, + { + "auxiliary_loss_clip": 0.01111191, + "auxiliary_loss_mlp": 0.0104214, + "balance_loss_clip": 1.04611158, + "balance_loss_mlp": 1.02747166, + "epoch": 0.5245152562753645, + "flos": 22237108629120.0, + "grad_norm": 2.070314434964904, + "language_loss": 0.75515735, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.77669066, + "num_input_tokens_seen": 187374185, + "step": 8724, + "time_per_iteration": 4.372815847396851 + }, + { + "auxiliary_loss_clip": 0.01128433, + "auxiliary_loss_mlp": 0.01038668, + "balance_loss_clip": 1.04391563, + "balance_loss_mlp": 1.02228856, + "epoch": 0.5245753795280325, + "flos": 29426460612480.0, + "grad_norm": 1.7393951886603523, + "language_loss": 0.70450562, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.72617668, + "num_input_tokens_seen": 187396640, + "step": 8725, + "time_per_iteration": 2.691462278366089 + }, + { + "auxiliary_loss_clip": 0.01014562, + "auxiliary_loss_mlp": 0.0100467, + "balance_loss_clip": 1.01748943, + "balance_loss_mlp": 1.0025723, + "epoch": 0.5246355027807005, + "flos": 58834392785280.0, + "grad_norm": 0.751972672191828, + "language_loss": 0.55635381, + "learning_rate": 1.937501576352568e-06, + "loss": 0.57654613, + "num_input_tokens_seen": 187455945, + "step": 8726, + "time_per_iteration": 3.2482144832611084 + }, + { + "auxiliary_loss_clip": 0.01023582, + "auxiliary_loss_mlp": 0.01000951, + "balance_loss_clip": 1.02279115, + "balance_loss_mlp": 0.9995268, + "epoch": 0.5246956260333684, + "flos": 64526592965760.0, + "grad_norm": 0.7878423938979384, + "language_loss": 0.58313322, + "learning_rate": 1.937112306062219e-06, + "loss": 0.60337853, + "num_input_tokens_seen": 187519975, + "step": 8727, + "time_per_iteration": 3.2606794834136963 + }, + { + "auxiliary_loss_clip": 0.01114413, + "auxiliary_loss_mlp": 0.01036047, + "balance_loss_clip": 1.0418663, + "balance_loss_mlp": 1.02111006, + "epoch": 0.5247557492860364, + "flos": 24533631653760.0, + "grad_norm": 1.3167349097133665, + "language_loss": 0.70678449, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.72828913, + "num_input_tokens_seen": 187541775, + "step": 8728, + "time_per_iteration": 2.6979823112487793 + }, + { + "auxiliary_loss_clip": 0.01110188, + "auxiliary_loss_mlp": 0.01029551, + "balance_loss_clip": 1.04107904, + "balance_loss_mlp": 1.01636648, + "epoch": 0.5248158725387043, + "flos": 18806131382400.0, + "grad_norm": 1.4052080718589413, + "language_loss": 0.69816244, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.71955991, + "num_input_tokens_seen": 187560425, + "step": 8729, + "time_per_iteration": 2.6898272037506104 + }, + { + "auxiliary_loss_clip": 0.01084395, + "auxiliary_loss_mlp": 0.01034673, + "balance_loss_clip": 1.04138565, + "balance_loss_mlp": 1.02001655, + "epoch": 0.5248759957913723, + "flos": 20955851521920.0, + "grad_norm": 1.9953537122640765, + "language_loss": 0.83565557, + "learning_rate": 1.935944509558464e-06, + "loss": 0.85684621, + "num_input_tokens_seen": 187579930, + "step": 8730, + "time_per_iteration": 2.719953775405884 + }, + { + "auxiliary_loss_clip": 0.01087481, + "auxiliary_loss_mlp": 0.01037052, + "balance_loss_clip": 1.04011822, + "balance_loss_mlp": 1.02177548, + "epoch": 0.5249361190440403, + "flos": 18660980522880.0, + "grad_norm": 2.0205964009231816, + "language_loss": 0.79403269, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.81527805, + "num_input_tokens_seen": 187595365, + "step": 8731, + "time_per_iteration": 2.741563081741333 + }, + { + "auxiliary_loss_clip": 0.01105082, + "auxiliary_loss_mlp": 0.01030893, + "balance_loss_clip": 1.03996611, + "balance_loss_mlp": 1.0172075, + "epoch": 0.5249962422967083, + "flos": 24863327614080.0, + "grad_norm": 1.917738069421625, + "language_loss": 0.83558822, + "learning_rate": 1.935165990676312e-06, + "loss": 0.85694802, + "num_input_tokens_seen": 187614715, + "step": 8732, + "time_per_iteration": 2.672537326812744 + }, + { + "auxiliary_loss_clip": 0.01109755, + "auxiliary_loss_mlp": 0.01037546, + "balance_loss_clip": 1.04267287, + "balance_loss_mlp": 1.0239923, + "epoch": 0.5250563655493762, + "flos": 15262681674240.0, + "grad_norm": 1.7357983281517446, + "language_loss": 0.77602309, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.79749608, + "num_input_tokens_seen": 187630745, + "step": 8733, + "time_per_iteration": 2.651329278945923 + }, + { + "auxiliary_loss_clip": 0.01126312, + "auxiliary_loss_mlp": 0.01036227, + "balance_loss_clip": 1.04450274, + "balance_loss_mlp": 1.02157617, + "epoch": 0.5251164888020442, + "flos": 18625177641600.0, + "grad_norm": 1.892740616554097, + "language_loss": 0.8202911, + "learning_rate": 1.934387481628208e-06, + "loss": 0.84191644, + "num_input_tokens_seen": 187648200, + "step": 8734, + "time_per_iteration": 2.608727216720581 + }, + { + "auxiliary_loss_clip": 0.01091339, + "auxiliary_loss_mlp": 0.01028225, + "balance_loss_clip": 1.04116642, + "balance_loss_mlp": 1.01467109, + "epoch": 0.5251766120547121, + "flos": 29710764760320.0, + "grad_norm": 1.3668287037138613, + "language_loss": 0.76932037, + "learning_rate": 1.933998230828826e-06, + "loss": 0.79051596, + "num_input_tokens_seen": 187669205, + "step": 8735, + "time_per_iteration": 2.703274965286255 + }, + { + "auxiliary_loss_clip": 0.01112983, + "auxiliary_loss_mlp": 0.01038692, + "balance_loss_clip": 1.04413259, + "balance_loss_mlp": 1.02544188, + "epoch": 0.5252367353073801, + "flos": 23440295525760.0, + "grad_norm": 1.7627870360178364, + "language_loss": 0.80808437, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.82960117, + "num_input_tokens_seen": 187690890, + "step": 8736, + "time_per_iteration": 2.6869864463806152 + }, + { + "auxiliary_loss_clip": 0.01124902, + "auxiliary_loss_mlp": 0.0103679, + "balance_loss_clip": 1.04460597, + "balance_loss_mlp": 1.02199018, + "epoch": 0.5252968585600482, + "flos": 30810708990720.0, + "grad_norm": 2.2019442049314626, + "language_loss": 0.69824821, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.71986508, + "num_input_tokens_seen": 187713045, + "step": 8737, + "time_per_iteration": 2.694178342819214 + }, + { + "auxiliary_loss_clip": 0.01101601, + "auxiliary_loss_mlp": 0.01038957, + "balance_loss_clip": 1.04274702, + "balance_loss_mlp": 1.02473521, + "epoch": 0.5253569818127161, + "flos": 20628274464000.0, + "grad_norm": 1.4444028137471083, + "language_loss": 0.77386785, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.79527342, + "num_input_tokens_seen": 187733640, + "step": 8738, + "time_per_iteration": 2.7655301094055176 + }, + { + "auxiliary_loss_clip": 0.01012696, + "auxiliary_loss_mlp": 0.00752303, + "balance_loss_clip": 1.01498532, + "balance_loss_mlp": 0.99995118, + "epoch": 0.5254171050653841, + "flos": 63428695810560.0, + "grad_norm": 0.7418872270660203, + "language_loss": 0.54437888, + "learning_rate": 1.932441252806837e-06, + "loss": 0.56202877, + "num_input_tokens_seen": 187792930, + "step": 8739, + "time_per_iteration": 3.183931350708008 + }, + { + "auxiliary_loss_clip": 0.01093164, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.03987527, + "balance_loss_mlp": 1.02572989, + "epoch": 0.525477228318052, + "flos": 34670782108800.0, + "grad_norm": 1.6115423077763054, + "language_loss": 0.84719479, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.8685174, + "num_input_tokens_seen": 187812495, + "step": 8740, + "time_per_iteration": 2.8701846599578857 + }, + { + "auxiliary_loss_clip": 0.01106251, + "auxiliary_loss_mlp": 0.00771888, + "balance_loss_clip": 1.03936994, + "balance_loss_mlp": 1.00030541, + "epoch": 0.52553735157072, + "flos": 17930844766080.0, + "grad_norm": 2.112576285349714, + "language_loss": 0.69466913, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.71345055, + "num_input_tokens_seen": 187829685, + "step": 8741, + "time_per_iteration": 2.721233606338501 + }, + { + "auxiliary_loss_clip": 0.01101687, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.04140949, + "balance_loss_mlp": 1.02171421, + "epoch": 0.5255974748233879, + "flos": 9940864584960.0, + "grad_norm": 1.8031333880336204, + "language_loss": 0.66328311, + "learning_rate": 1.931273546137947e-06, + "loss": 0.68465841, + "num_input_tokens_seen": 187846495, + "step": 8742, + "time_per_iteration": 2.695504903793335 + }, + { + "auxiliary_loss_clip": 0.01086092, + "auxiliary_loss_mlp": 0.01042238, + "balance_loss_clip": 1.03882444, + "balance_loss_mlp": 1.02666903, + "epoch": 0.5256575980760559, + "flos": 16868427269760.0, + "grad_norm": 1.9909144400242709, + "language_loss": 0.63219392, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.65347725, + "num_input_tokens_seen": 187862010, + "step": 8743, + "time_per_iteration": 2.712376832962036 + }, + { + "auxiliary_loss_clip": 0.0102969, + "auxiliary_loss_mlp": 0.01008337, + "balance_loss_clip": 1.01230693, + "balance_loss_mlp": 1.00641751, + "epoch": 0.5257177213287239, + "flos": 62386210362240.0, + "grad_norm": 0.7739828883360421, + "language_loss": 0.5410347, + "learning_rate": 1.930495088031323e-06, + "loss": 0.56141496, + "num_input_tokens_seen": 187922730, + "step": 8744, + "time_per_iteration": 3.281756639480591 + }, + { + "auxiliary_loss_clip": 0.01106094, + "auxiliary_loss_mlp": 0.01037818, + "balance_loss_clip": 1.04534447, + "balance_loss_mlp": 1.02202296, + "epoch": 0.5257778445813919, + "flos": 20776908942720.0, + "grad_norm": 2.5030900138953274, + "language_loss": 0.75859022, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.7800293, + "num_input_tokens_seen": 187940160, + "step": 8745, + "time_per_iteration": 2.642817258834839 + }, + { + "auxiliary_loss_clip": 0.01110515, + "auxiliary_loss_mlp": 0.0104281, + "balance_loss_clip": 1.04153466, + "balance_loss_mlp": 1.02948213, + "epoch": 0.5258379678340598, + "flos": 17018606033280.0, + "grad_norm": 1.7823830080970366, + "language_loss": 0.8089028, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.83043599, + "num_input_tokens_seen": 187958625, + "step": 8746, + "time_per_iteration": 2.5678205490112305 + }, + { + "auxiliary_loss_clip": 0.01108698, + "auxiliary_loss_mlp": 0.01036575, + "balance_loss_clip": 1.04006267, + "balance_loss_mlp": 1.02191806, + "epoch": 0.5258980910867278, + "flos": 21068754946560.0, + "grad_norm": 2.1394959039376475, + "language_loss": 0.75231433, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.77376711, + "num_input_tokens_seen": 187977575, + "step": 8747, + "time_per_iteration": -0.009610652923583984 + }, + { + "auxiliary_loss_clip": 0.0105854, + "auxiliary_loss_mlp": 0.01033909, + "balance_loss_clip": 1.03949201, + "balance_loss_mlp": 1.01987767, + "epoch": 0.5259582143393957, + "flos": 18004461690240.0, + "grad_norm": 2.0175880820051058, + "language_loss": 0.82632613, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.84725058, + "num_input_tokens_seen": 187996650, + "step": 8748, + "time_per_iteration": 2.7604665756225586 + }, + { + "auxiliary_loss_clip": 0.01099486, + "auxiliary_loss_mlp": 0.01033119, + "balance_loss_clip": 1.03856742, + "balance_loss_mlp": 1.01846862, + "epoch": 0.5260183375920637, + "flos": 22783848520320.0, + "grad_norm": 2.328081087853481, + "language_loss": 0.80873966, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.83006573, + "num_input_tokens_seen": 188013510, + "step": 8749, + "time_per_iteration": 2.6853184700012207 + }, + { + "auxiliary_loss_clip": 0.01109749, + "auxiliary_loss_mlp": 0.01040189, + "balance_loss_clip": 1.04381132, + "balance_loss_mlp": 1.02556193, + "epoch": 0.5260784608447318, + "flos": 27052406081280.0, + "grad_norm": 1.7699462129252088, + "language_loss": 0.72291499, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.74441439, + "num_input_tokens_seen": 188032085, + "step": 8750, + "time_per_iteration": 2.6771364212036133 + }, + { + "auxiliary_loss_clip": 0.01098374, + "auxiliary_loss_mlp": 0.01037371, + "balance_loss_clip": 1.03887165, + "balance_loss_mlp": 1.02362585, + "epoch": 0.5261385840973997, + "flos": 20662820369280.0, + "grad_norm": 1.3348346616556535, + "language_loss": 0.76186317, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.78322065, + "num_input_tokens_seen": 188050590, + "step": 8751, + "time_per_iteration": 2.7016804218292236 + }, + { + "auxiliary_loss_clip": 0.01119796, + "auxiliary_loss_mlp": 0.01039709, + "balance_loss_clip": 1.04339051, + "balance_loss_mlp": 1.02622056, + "epoch": 0.5261987073500677, + "flos": 23622649896960.0, + "grad_norm": 1.7424279065253616, + "language_loss": 0.75831163, + "learning_rate": 1.927381362210902e-06, + "loss": 0.77990663, + "num_input_tokens_seen": 188071620, + "step": 8752, + "time_per_iteration": 2.7128703594207764 + }, + { + "auxiliary_loss_clip": 0.01112565, + "auxiliary_loss_mlp": 0.01033514, + "balance_loss_clip": 1.04177046, + "balance_loss_mlp": 1.01780224, + "epoch": 0.5262588306027356, + "flos": 27636241743360.0, + "grad_norm": 2.1757268908288707, + "language_loss": 0.67754769, + "learning_rate": 1.926992158720058e-06, + "loss": 0.69900852, + "num_input_tokens_seen": 188091740, + "step": 8753, + "time_per_iteration": 2.678269147872925 + }, + { + "auxiliary_loss_clip": 0.01111599, + "auxiliary_loss_mlp": 0.01034275, + "balance_loss_clip": 1.04266751, + "balance_loss_mlp": 1.02072084, + "epoch": 0.5263189538554036, + "flos": 21759711943680.0, + "grad_norm": 1.6342208992061138, + "language_loss": 0.84114075, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.86259949, + "num_input_tokens_seen": 188111165, + "step": 8754, + "time_per_iteration": 2.6858248710632324 + }, + { + "auxiliary_loss_clip": 0.01109767, + "auxiliary_loss_mlp": 0.01035863, + "balance_loss_clip": 1.04159164, + "balance_loss_mlp": 1.02159333, + "epoch": 0.5263790771080715, + "flos": 14276359140480.0, + "grad_norm": 2.0064086672514323, + "language_loss": 0.87360156, + "learning_rate": 1.926213760058522e-06, + "loss": 0.89505792, + "num_input_tokens_seen": 188127825, + "step": 8755, + "time_per_iteration": 2.5783674716949463 + }, + { + "auxiliary_loss_clip": 0.01007681, + "auxiliary_loss_mlp": 0.01000927, + "balance_loss_clip": 1.01328659, + "balance_loss_mlp": 0.99918669, + "epoch": 0.5264392003607395, + "flos": 65806413528960.0, + "grad_norm": 0.7404552494369754, + "language_loss": 0.5880959, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.60818201, + "num_input_tokens_seen": 188194050, + "step": 8756, + "time_per_iteration": 3.308302402496338 + }, + { + "auxiliary_loss_clip": 0.01094156, + "auxiliary_loss_mlp": 0.01036712, + "balance_loss_clip": 1.0415833, + "balance_loss_mlp": 1.02182269, + "epoch": 0.5264993236134075, + "flos": 21032413361280.0, + "grad_norm": 1.6572717697992079, + "language_loss": 0.70703959, + "learning_rate": 1.925435372588913e-06, + "loss": 0.72834826, + "num_input_tokens_seen": 188212565, + "step": 8757, + "time_per_iteration": 4.195650100708008 + }, + { + "auxiliary_loss_clip": 0.0110952, + "auxiliary_loss_mlp": 0.01040036, + "balance_loss_clip": 1.04061294, + "balance_loss_mlp": 1.02590346, + "epoch": 0.5265594468660755, + "flos": 16618202150400.0, + "grad_norm": 2.0494500796269577, + "language_loss": 0.88039553, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.90189111, + "num_input_tokens_seen": 188229505, + "step": 8758, + "time_per_iteration": 2.63089656829834 + }, + { + "auxiliary_loss_clip": 0.01061465, + "auxiliary_loss_mlp": 0.01037569, + "balance_loss_clip": 1.03887105, + "balance_loss_mlp": 1.02301979, + "epoch": 0.5266195701187434, + "flos": 24134125610880.0, + "grad_norm": 1.4473751902891179, + "language_loss": 0.75895298, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.77994329, + "num_input_tokens_seen": 188250395, + "step": 8759, + "time_per_iteration": 4.702188968658447 + }, + { + "auxiliary_loss_clip": 0.01098136, + "auxiliary_loss_mlp": 0.01030934, + "balance_loss_clip": 1.04185557, + "balance_loss_mlp": 1.0181073, + "epoch": 0.5266796933714114, + "flos": 15844111125120.0, + "grad_norm": 1.7900777891811301, + "language_loss": 0.71485013, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.73614085, + "num_input_tokens_seen": 188266785, + "step": 8760, + "time_per_iteration": 4.256975412368774 + }, + { + "auxiliary_loss_clip": 0.01098696, + "auxiliary_loss_mlp": 0.01040967, + "balance_loss_clip": 1.04177952, + "balance_loss_mlp": 1.02593493, + "epoch": 0.5267398166240793, + "flos": 20951434149120.0, + "grad_norm": 2.6157951761776697, + "language_loss": 0.75801802, + "learning_rate": 1.923878631697736e-06, + "loss": 0.77941465, + "num_input_tokens_seen": 188282525, + "step": 8761, + "time_per_iteration": 2.685028553009033 + }, + { + "auxiliary_loss_clip": 0.01104735, + "auxiliary_loss_mlp": 0.00771727, + "balance_loss_clip": 1.03871739, + "balance_loss_mlp": 1.00023258, + "epoch": 0.5267999398767473, + "flos": 20996394998400.0, + "grad_norm": 1.8739254444127986, + "language_loss": 0.70466101, + "learning_rate": 1.923489453654373e-06, + "loss": 0.72342563, + "num_input_tokens_seen": 188301395, + "step": 8762, + "time_per_iteration": 2.727120876312256 + }, + { + "auxiliary_loss_clip": 0.01014324, + "auxiliary_loss_mlp": 0.00999661, + "balance_loss_clip": 1.00980198, + "balance_loss_mlp": 0.99816543, + "epoch": 0.5268600631294152, + "flos": 66849401767680.0, + "grad_norm": 0.9282030794038212, + "language_loss": 0.65443593, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.67457575, + "num_input_tokens_seen": 188357665, + "step": 8763, + "time_per_iteration": 3.109525203704834 + }, + { + "auxiliary_loss_clip": 0.01109455, + "auxiliary_loss_mlp": 0.01030406, + "balance_loss_clip": 1.04166603, + "balance_loss_mlp": 1.01676226, + "epoch": 0.5269201863820833, + "flos": 17165552572800.0, + "grad_norm": 1.6243900815433006, + "language_loss": 0.71050072, + "learning_rate": 1.922711106286265e-06, + "loss": 0.73189938, + "num_input_tokens_seen": 188376935, + "step": 8764, + "time_per_iteration": 4.168430328369141 + }, + { + "auxiliary_loss_clip": 0.01080487, + "auxiliary_loss_mlp": 0.01033821, + "balance_loss_clip": 1.03809977, + "balance_loss_mlp": 1.01832938, + "epoch": 0.5269803096347513, + "flos": 20522589672960.0, + "grad_norm": 1.5962933914095123, + "language_loss": 0.74318087, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.76432389, + "num_input_tokens_seen": 188394995, + "step": 8765, + "time_per_iteration": 2.7441658973693848 + }, + { + "auxiliary_loss_clip": 0.01098499, + "auxiliary_loss_mlp": 0.01037098, + "balance_loss_clip": 1.03631091, + "balance_loss_mlp": 1.02200055, + "epoch": 0.5270404328874192, + "flos": 27230989524480.0, + "grad_norm": 1.60818818085183, + "language_loss": 0.85403508, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.87539107, + "num_input_tokens_seen": 188415475, + "step": 8766, + "time_per_iteration": 2.7902116775512695 + }, + { + "auxiliary_loss_clip": 0.0112556, + "auxiliary_loss_mlp": 0.01039583, + "balance_loss_clip": 1.0449605, + "balance_loss_mlp": 1.02492046, + "epoch": 0.5271005561400872, + "flos": 23110491824640.0, + "grad_norm": 1.780636206979604, + "language_loss": 0.79070592, + "learning_rate": 1.921543607252017e-06, + "loss": 0.81235737, + "num_input_tokens_seen": 188435665, + "step": 8767, + "time_per_iteration": 2.6986846923828125 + }, + { + "auxiliary_loss_clip": 0.01114967, + "auxiliary_loss_mlp": 0.01039357, + "balance_loss_clip": 1.04406393, + "balance_loss_mlp": 1.02407432, + "epoch": 0.5271606793927551, + "flos": 22564793427840.0, + "grad_norm": 1.6576657234027676, + "language_loss": 0.73513746, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.75668073, + "num_input_tokens_seen": 188455405, + "step": 8768, + "time_per_iteration": 2.695497989654541 + }, + { + "auxiliary_loss_clip": 0.01092606, + "auxiliary_loss_mlp": 0.01048135, + "balance_loss_clip": 1.03795791, + "balance_loss_mlp": 1.03445613, + "epoch": 0.5272208026454231, + "flos": 18764259102720.0, + "grad_norm": 1.9012673693956994, + "language_loss": 0.7428031, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.76421046, + "num_input_tokens_seen": 188472940, + "step": 8769, + "time_per_iteration": 2.7763235569000244 + }, + { + "auxiliary_loss_clip": 0.01082308, + "auxiliary_loss_mlp": 0.0104049, + "balance_loss_clip": 1.03746688, + "balance_loss_mlp": 1.02675128, + "epoch": 0.5272809258980911, + "flos": 20412164286720.0, + "grad_norm": 1.8328085669464766, + "language_loss": 0.7360974, + "learning_rate": 1.920376134993436e-06, + "loss": 0.75732535, + "num_input_tokens_seen": 188493035, + "step": 8770, + "time_per_iteration": 2.7274930477142334 + }, + { + "auxiliary_loss_clip": 0.011224, + "auxiliary_loss_mlp": 0.01035685, + "balance_loss_clip": 1.04366255, + "balance_loss_mlp": 1.02199364, + "epoch": 0.5273410491507591, + "flos": 28256742213120.0, + "grad_norm": 1.7661010025178618, + "language_loss": 0.68258119, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.704162, + "num_input_tokens_seen": 188513860, + "step": 8771, + "time_per_iteration": 2.6751418113708496 + }, + { + "auxiliary_loss_clip": 0.01109367, + "auxiliary_loss_mlp": 0.01038799, + "balance_loss_clip": 1.0429647, + "balance_loss_mlp": 1.02500653, + "epoch": 0.527401172403427, + "flos": 22455158140800.0, + "grad_norm": 1.9220412670697933, + "language_loss": 0.76438117, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.78586286, + "num_input_tokens_seen": 188533345, + "step": 8772, + "time_per_iteration": 2.7865138053894043 + }, + { + "auxiliary_loss_clip": 0.01107055, + "auxiliary_loss_mlp": 0.01047604, + "balance_loss_clip": 1.04159784, + "balance_loss_mlp": 1.03290582, + "epoch": 0.527461295656095, + "flos": 21031084558080.0, + "grad_norm": 2.1683746410962472, + "language_loss": 0.65569091, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.67723751, + "num_input_tokens_seen": 188551550, + "step": 8773, + "time_per_iteration": 2.648556709289551 + }, + { + "auxiliary_loss_clip": 0.01089634, + "auxiliary_loss_mlp": 0.01040938, + "balance_loss_clip": 1.04127073, + "balance_loss_mlp": 1.02838576, + "epoch": 0.5275214189087629, + "flos": 26322018929280.0, + "grad_norm": 1.7479537399696432, + "language_loss": 0.85893595, + "learning_rate": 1.91881954765502e-06, + "loss": 0.88024169, + "num_input_tokens_seen": 188571615, + "step": 8774, + "time_per_iteration": 2.8036038875579834 + }, + { + "auxiliary_loss_clip": 0.01088366, + "auxiliary_loss_mlp": 0.01035546, + "balance_loss_clip": 1.03889024, + "balance_loss_mlp": 1.02204525, + "epoch": 0.5275815421614309, + "flos": 20047024581120.0, + "grad_norm": 1.657417688760408, + "language_loss": 0.80199802, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.82323706, + "num_input_tokens_seen": 188591965, + "step": 8775, + "time_per_iteration": 2.7011687755584717 + }, + { + "auxiliary_loss_clip": 0.01096581, + "auxiliary_loss_mlp": 0.01042615, + "balance_loss_clip": 1.03883219, + "balance_loss_mlp": 1.02843523, + "epoch": 0.5276416654140988, + "flos": 21432206712960.0, + "grad_norm": 1.7666023716485497, + "language_loss": 0.83578467, + "learning_rate": 1.918041272397012e-06, + "loss": 0.85717654, + "num_input_tokens_seen": 188610675, + "step": 8776, + "time_per_iteration": 2.6593801975250244 + }, + { + "auxiliary_loss_clip": 0.01093105, + "auxiliary_loss_mlp": 0.01036597, + "balance_loss_clip": 1.04135871, + "balance_loss_mlp": 1.0225482, + "epoch": 0.5277017886667669, + "flos": 17165085696000.0, + "grad_norm": 1.7073238735749807, + "language_loss": 0.67856812, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.6998651, + "num_input_tokens_seen": 188628235, + "step": 8777, + "time_per_iteration": 2.684119462966919 + }, + { + "auxiliary_loss_clip": 0.01098291, + "auxiliary_loss_mlp": 0.01042578, + "balance_loss_clip": 1.0435065, + "balance_loss_mlp": 1.02887487, + "epoch": 0.5277619119194349, + "flos": 20448146736000.0, + "grad_norm": 1.6906001817136074, + "language_loss": 0.8258512, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.84725994, + "num_input_tokens_seen": 188648925, + "step": 8778, + "time_per_iteration": 2.682415723800659 + }, + { + "auxiliary_loss_clip": 0.01111904, + "auxiliary_loss_mlp": 0.01042858, + "balance_loss_clip": 1.04339361, + "balance_loss_mlp": 1.02807617, + "epoch": 0.5278220351721028, + "flos": 24061083304320.0, + "grad_norm": 2.7851808389493913, + "language_loss": 0.79809994, + "learning_rate": 1.916873882856013e-06, + "loss": 0.81964755, + "num_input_tokens_seen": 188668125, + "step": 8779, + "time_per_iteration": 2.6585779190063477 + }, + { + "auxiliary_loss_clip": 0.01105817, + "auxiliary_loss_mlp": 0.01036255, + "balance_loss_clip": 1.04011083, + "balance_loss_mlp": 1.02326131, + "epoch": 0.5278821584247708, + "flos": 24642907804800.0, + "grad_norm": 2.3801118784221487, + "language_loss": 0.76782715, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.78924787, + "num_input_tokens_seen": 188684410, + "step": 8780, + "time_per_iteration": 2.64528489112854 + }, + { + "auxiliary_loss_clip": 0.01092369, + "auxiliary_loss_mlp": 0.01031311, + "balance_loss_clip": 1.04324102, + "balance_loss_mlp": 1.01723862, + "epoch": 0.5279422816774387, + "flos": 35408244240000.0, + "grad_norm": 1.6460087018057796, + "language_loss": 0.7001918, + "learning_rate": 1.916095638898174e-06, + "loss": 0.72142857, + "num_input_tokens_seen": 188706130, + "step": 8781, + "time_per_iteration": 2.8247299194335938 + }, + { + "auxiliary_loss_clip": 0.01107498, + "auxiliary_loss_mlp": 0.01040285, + "balance_loss_clip": 1.04195011, + "balance_loss_mlp": 1.02773809, + "epoch": 0.5280024049301068, + "flos": 22967028904320.0, + "grad_norm": 1.5355974889681627, + "language_loss": 0.72236538, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.7438432, + "num_input_tokens_seen": 188725030, + "step": 8782, + "time_per_iteration": 2.6150832176208496 + }, + { + "auxiliary_loss_clip": 0.01090709, + "auxiliary_loss_mlp": 0.01033496, + "balance_loss_clip": 1.03973758, + "balance_loss_mlp": 1.0204308, + "epoch": 0.5280625281827747, + "flos": 21507619317120.0, + "grad_norm": 1.8366229943518229, + "language_loss": 0.68489599, + "learning_rate": 1.915317407666982e-06, + "loss": 0.70613807, + "num_input_tokens_seen": 188744325, + "step": 8783, + "time_per_iteration": 2.7228338718414307 + }, + { + "auxiliary_loss_clip": 0.01120029, + "auxiliary_loss_mlp": 0.01042206, + "balance_loss_clip": 1.04475784, + "balance_loss_mlp": 1.02599382, + "epoch": 0.5281226514354427, + "flos": 31208167958400.0, + "grad_norm": 1.8621065563663965, + "language_loss": 0.69557488, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.71719718, + "num_input_tokens_seen": 188765100, + "step": 8784, + "time_per_iteration": 2.756030797958374 + }, + { + "auxiliary_loss_clip": 0.01124818, + "auxiliary_loss_mlp": 0.01034258, + "balance_loss_clip": 1.04128921, + "balance_loss_mlp": 1.01935077, + "epoch": 0.5281827746881106, + "flos": 25077821679360.0, + "grad_norm": 3.8002246773271238, + "language_loss": 0.7503646, + "learning_rate": 1.91453918928048e-06, + "loss": 0.77195537, + "num_input_tokens_seen": 188783995, + "step": 8785, + "time_per_iteration": 2.6486949920654297 + }, + { + "auxiliary_loss_clip": 0.01110957, + "auxiliary_loss_mlp": 0.01035187, + "balance_loss_clip": 1.04315662, + "balance_loss_mlp": 1.02070904, + "epoch": 0.5282428979407786, + "flos": 20631255292800.0, + "grad_norm": 1.5855662273934061, + "language_loss": 0.83260286, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.85406423, + "num_input_tokens_seen": 188803120, + "step": 8786, + "time_per_iteration": 2.6352970600128174 + }, + { + "auxiliary_loss_clip": 0.01083443, + "auxiliary_loss_mlp": 0.01025911, + "balance_loss_clip": 1.04014111, + "balance_loss_mlp": 1.0136745, + "epoch": 0.5283030211934465, + "flos": 22419391173120.0, + "grad_norm": 2.305341017618089, + "language_loss": 0.82486933, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.84596282, + "num_input_tokens_seen": 188820960, + "step": 8787, + "time_per_iteration": 2.712639570236206 + }, + { + "auxiliary_loss_clip": 0.01066097, + "auxiliary_loss_mlp": 0.01026546, + "balance_loss_clip": 1.03866088, + "balance_loss_mlp": 1.01387453, + "epoch": 0.5283631444461145, + "flos": 23615467176960.0, + "grad_norm": 1.663088771358256, + "language_loss": 0.83609009, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.85701656, + "num_input_tokens_seen": 188837165, + "step": 8788, + "time_per_iteration": 2.7158761024475098 + }, + { + "auxiliary_loss_clip": 0.01087908, + "auxiliary_loss_mlp": 0.01041692, + "balance_loss_clip": 1.04602289, + "balance_loss_mlp": 1.02696919, + "epoch": 0.5284232676987825, + "flos": 32671994918400.0, + "grad_norm": 1.8980499308542007, + "language_loss": 0.75046682, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.77176291, + "num_input_tokens_seen": 188858555, + "step": 8789, + "time_per_iteration": 2.806339979171753 + }, + { + "auxiliary_loss_clip": 0.01113755, + "auxiliary_loss_mlp": 0.01037056, + "balance_loss_clip": 1.04411733, + "balance_loss_mlp": 1.02322817, + "epoch": 0.5284833909514505, + "flos": 26760919213440.0, + "grad_norm": 1.5263217177178625, + "language_loss": 0.69562709, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.71713525, + "num_input_tokens_seen": 188879050, + "step": 8790, + "time_per_iteration": 2.701814651489258 + }, + { + "auxiliary_loss_clip": 0.01117978, + "auxiliary_loss_mlp": 0.01029171, + "balance_loss_clip": 1.04194212, + "balance_loss_mlp": 1.01685631, + "epoch": 0.5285435142041185, + "flos": 22090700793600.0, + "grad_norm": 1.472851859989372, + "language_loss": 0.79096156, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.812433, + "num_input_tokens_seen": 188898885, + "step": 8791, + "time_per_iteration": 2.609342575073242 + }, + { + "auxiliary_loss_clip": 0.01063984, + "auxiliary_loss_mlp": 0.01029869, + "balance_loss_clip": 1.04006243, + "balance_loss_mlp": 1.01632702, + "epoch": 0.5286036374567864, + "flos": 20375463565440.0, + "grad_norm": 2.747278304747908, + "language_loss": 0.66302419, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.6839627, + "num_input_tokens_seen": 188917225, + "step": 8792, + "time_per_iteration": 2.713622570037842 + }, + { + "auxiliary_loss_clip": 0.01090251, + "auxiliary_loss_mlp": 0.01040633, + "balance_loss_clip": 1.03743482, + "balance_loss_mlp": 1.02670956, + "epoch": 0.5286637607094544, + "flos": 24352175122560.0, + "grad_norm": 1.9116255636929125, + "language_loss": 0.79727674, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.81858563, + "num_input_tokens_seen": 188936120, + "step": 8793, + "time_per_iteration": 2.6645493507385254 + }, + { + "auxiliary_loss_clip": 0.01121499, + "auxiliary_loss_mlp": 0.01045468, + "balance_loss_clip": 1.04323554, + "balance_loss_mlp": 1.03118658, + "epoch": 0.5287238839621223, + "flos": 17271165536640.0, + "grad_norm": 2.655732529836172, + "language_loss": 0.84749115, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.86916077, + "num_input_tokens_seen": 188953405, + "step": 8794, + "time_per_iteration": 2.8306803703308105 + }, + { + "auxiliary_loss_clip": 0.01097868, + "auxiliary_loss_mlp": 0.0103516, + "balance_loss_clip": 1.03908813, + "balance_loss_mlp": 1.02062225, + "epoch": 0.5287840072147904, + "flos": 17566890209280.0, + "grad_norm": 2.1997369626435894, + "language_loss": 0.676875, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.69820529, + "num_input_tokens_seen": 188971150, + "step": 8795, + "time_per_iteration": 2.703134059906006 + }, + { + "auxiliary_loss_clip": 0.01098455, + "auxiliary_loss_mlp": 0.010334, + "balance_loss_clip": 1.04339266, + "balance_loss_mlp": 1.01989961, + "epoch": 0.5288441304674583, + "flos": 18552099421440.0, + "grad_norm": 2.036052201037856, + "language_loss": 0.80291003, + "learning_rate": 1.910259223028374e-06, + "loss": 0.82422858, + "num_input_tokens_seen": 188989550, + "step": 8796, + "time_per_iteration": 2.6733570098876953 + }, + { + "auxiliary_loss_clip": 0.01079591, + "auxiliary_loss_mlp": 0.01043571, + "balance_loss_clip": 1.03867388, + "balance_loss_mlp": 1.02758455, + "epoch": 0.5289042537201263, + "flos": 20814507504000.0, + "grad_norm": 1.5572831824692925, + "language_loss": 0.69010925, + "learning_rate": 1.909870155310071e-06, + "loss": 0.71134079, + "num_input_tokens_seen": 189008795, + "step": 8797, + "time_per_iteration": 4.254164934158325 + }, + { + "auxiliary_loss_clip": 0.01101135, + "auxiliary_loss_mlp": 0.01036632, + "balance_loss_clip": 1.04237545, + "balance_loss_mlp": 1.02374518, + "epoch": 0.5289643769727942, + "flos": 15735265937280.0, + "grad_norm": 1.6872492204324914, + "language_loss": 0.82684171, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.84821934, + "num_input_tokens_seen": 189025540, + "step": 8798, + "time_per_iteration": 2.7167000770568848 + }, + { + "auxiliary_loss_clip": 0.01096424, + "auxiliary_loss_mlp": 0.00774405, + "balance_loss_clip": 1.03896332, + "balance_loss_mlp": 1.00029516, + "epoch": 0.5290245002254622, + "flos": 19537308633600.0, + "grad_norm": 1.9585595365508919, + "language_loss": 0.70825863, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.72696698, + "num_input_tokens_seen": 189044885, + "step": 8799, + "time_per_iteration": 4.350652694702148 + }, + { + "auxiliary_loss_clip": 0.01111399, + "auxiliary_loss_mlp": 0.01038005, + "balance_loss_clip": 1.04659581, + "balance_loss_mlp": 1.02507114, + "epoch": 0.5290846234781301, + "flos": 15815131827840.0, + "grad_norm": 2.2031970702340704, + "language_loss": 0.69286144, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.71435547, + "num_input_tokens_seen": 189061280, + "step": 8800, + "time_per_iteration": 4.109759569168091 + }, + { + "auxiliary_loss_clip": 0.01017957, + "auxiliary_loss_mlp": 0.01037827, + "balance_loss_clip": 1.01865292, + "balance_loss_mlp": 1.03631306, + "epoch": 0.5291447467307981, + "flos": 70057624821120.0, + "grad_norm": 0.9935539305247675, + "language_loss": 0.56959099, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.59014881, + "num_input_tokens_seen": 189114775, + "step": 8801, + "time_per_iteration": 3.1419920921325684 + }, + { + "auxiliary_loss_clip": 0.01110756, + "auxiliary_loss_mlp": 0.01036206, + "balance_loss_clip": 1.04886377, + "balance_loss_mlp": 1.02271795, + "epoch": 0.529204869983466, + "flos": 28364186770560.0, + "grad_norm": 1.5688016044474997, + "language_loss": 0.6425091, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.6639787, + "num_input_tokens_seen": 189134700, + "step": 8802, + "time_per_iteration": 2.7467000484466553 + }, + { + "auxiliary_loss_clip": 0.01101463, + "auxiliary_loss_mlp": 0.01031341, + "balance_loss_clip": 1.04380429, + "balance_loss_mlp": 1.01772761, + "epoch": 0.5292649932361341, + "flos": 33758830684800.0, + "grad_norm": 3.351871019760029, + "language_loss": 0.69098222, + "learning_rate": 1.907535821289003e-06, + "loss": 0.71231019, + "num_input_tokens_seen": 189155365, + "step": 8803, + "time_per_iteration": 4.278867721557617 + }, + { + "auxiliary_loss_clip": 0.01106005, + "auxiliary_loss_mlp": 0.00770288, + "balance_loss_clip": 1.04076648, + "balance_loss_mlp": 1.00028872, + "epoch": 0.5293251164888021, + "flos": 20447679859200.0, + "grad_norm": 1.7989646267917587, + "language_loss": 0.76156348, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.78032649, + "num_input_tokens_seen": 189173885, + "step": 8804, + "time_per_iteration": 2.683661699295044 + }, + { + "auxiliary_loss_clip": 0.01032487, + "auxiliary_loss_mlp": 0.01019664, + "balance_loss_clip": 1.01553822, + "balance_loss_mlp": 1.01836514, + "epoch": 0.52938523974147, + "flos": 66545312204160.0, + "grad_norm": 0.7526453486337231, + "language_loss": 0.5290755, + "learning_rate": 1.906757737841291e-06, + "loss": 0.54959702, + "num_input_tokens_seen": 189236515, + "step": 8805, + "time_per_iteration": 3.243603467941284 + }, + { + "auxiliary_loss_clip": 0.0103203, + "auxiliary_loss_mlp": 0.01016047, + "balance_loss_clip": 1.01495409, + "balance_loss_mlp": 1.01418769, + "epoch": 0.529445362994138, + "flos": 67151734542720.0, + "grad_norm": 0.7522317031499139, + "language_loss": 0.6378004, + "learning_rate": 1.906368701413693e-06, + "loss": 0.65828121, + "num_input_tokens_seen": 189300500, + "step": 8806, + "time_per_iteration": 3.185899257659912 + }, + { + "auxiliary_loss_clip": 0.01112977, + "auxiliary_loss_mlp": 0.01034283, + "balance_loss_clip": 1.04236031, + "balance_loss_mlp": 1.02053213, + "epoch": 0.5295054862468059, + "flos": 17749316407680.0, + "grad_norm": 1.5696878511475738, + "language_loss": 0.72756052, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.74903309, + "num_input_tokens_seen": 189319745, + "step": 8807, + "time_per_iteration": 2.652667284011841 + }, + { + "auxiliary_loss_clip": 0.01079975, + "auxiliary_loss_mlp": 0.01030639, + "balance_loss_clip": 1.04053009, + "balance_loss_mlp": 1.01760888, + "epoch": 0.529565609499474, + "flos": 11397401084160.0, + "grad_norm": 2.191041401806776, + "language_loss": 0.69626606, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.71737224, + "num_input_tokens_seen": 189334550, + "step": 8808, + "time_per_iteration": 2.6991183757781982 + }, + { + "auxiliary_loss_clip": 0.01109251, + "auxiliary_loss_mlp": 0.01032489, + "balance_loss_clip": 1.041991, + "balance_loss_mlp": 1.01962066, + "epoch": 0.5296257327521419, + "flos": 17196363463680.0, + "grad_norm": 1.8261828078243632, + "language_loss": 0.8653447, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.88676214, + "num_input_tokens_seen": 189351735, + "step": 8809, + "time_per_iteration": 2.5995731353759766 + }, + { + "auxiliary_loss_clip": 0.0111469, + "auxiliary_loss_mlp": 0.01041403, + "balance_loss_clip": 1.04281509, + "balance_loss_mlp": 1.02607894, + "epoch": 0.5296858560048099, + "flos": 39964086777600.0, + "grad_norm": 1.9222242916722383, + "language_loss": 0.64388674, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.66544765, + "num_input_tokens_seen": 189373105, + "step": 8810, + "time_per_iteration": 2.776230573654175 + }, + { + "auxiliary_loss_clip": 0.01119011, + "auxiliary_loss_mlp": 0.01038636, + "balance_loss_clip": 1.04296374, + "balance_loss_mlp": 1.02509344, + "epoch": 0.5297459792574778, + "flos": 20961418129920.0, + "grad_norm": 1.8063937788931883, + "language_loss": 0.68213391, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.70371044, + "num_input_tokens_seen": 189394615, + "step": 8811, + "time_per_iteration": 2.684617757797241 + }, + { + "auxiliary_loss_clip": 0.01007367, + "auxiliary_loss_mlp": 0.0100546, + "balance_loss_clip": 1.01854634, + "balance_loss_mlp": 1.00402915, + "epoch": 0.5298061025101458, + "flos": 66523620389760.0, + "grad_norm": 0.689972629111167, + "language_loss": 0.53345251, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.55358076, + "num_input_tokens_seen": 189459750, + "step": 8812, + "time_per_iteration": 3.3905134201049805 + }, + { + "auxiliary_loss_clip": 0.01023218, + "auxiliary_loss_mlp": 0.01004548, + "balance_loss_clip": 1.01716316, + "balance_loss_mlp": 1.00321257, + "epoch": 0.5298662257628137, + "flos": 67662994775040.0, + "grad_norm": 0.7359658604916758, + "language_loss": 0.56288284, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.58316052, + "num_input_tokens_seen": 189527540, + "step": 8813, + "time_per_iteration": 3.2840702533721924 + }, + { + "auxiliary_loss_clip": 0.01064136, + "auxiliary_loss_mlp": 0.01033208, + "balance_loss_clip": 1.0387466, + "balance_loss_mlp": 1.01986289, + "epoch": 0.5299263490154817, + "flos": 19646405216640.0, + "grad_norm": 1.8723589062576662, + "language_loss": 0.81484783, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.83582127, + "num_input_tokens_seen": 189546900, + "step": 8814, + "time_per_iteration": 2.7889370918273926 + }, + { + "auxiliary_loss_clip": 0.01129463, + "auxiliary_loss_mlp": 0.01035706, + "balance_loss_clip": 1.04835963, + "balance_loss_mlp": 1.02225351, + "epoch": 0.5299864722681497, + "flos": 22055005653120.0, + "grad_norm": 1.8736963674991467, + "language_loss": 0.85159796, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.87324965, + "num_input_tokens_seen": 189566490, + "step": 8815, + "time_per_iteration": 2.588376998901367 + }, + { + "auxiliary_loss_clip": 0.01119356, + "auxiliary_loss_mlp": 0.01030819, + "balance_loss_clip": 1.04443836, + "balance_loss_mlp": 1.01802766, + "epoch": 0.5300465955208177, + "flos": 21763698353280.0, + "grad_norm": 2.360835312755498, + "language_loss": 0.66173548, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.6832372, + "num_input_tokens_seen": 189585580, + "step": 8816, + "time_per_iteration": 2.6367204189300537 + }, + { + "auxiliary_loss_clip": 0.01098885, + "auxiliary_loss_mlp": 0.01037316, + "balance_loss_clip": 1.04165578, + "balance_loss_mlp": 1.02370238, + "epoch": 0.5301067187734857, + "flos": 42996491735040.0, + "grad_norm": 1.8428826452353317, + "language_loss": 0.72204578, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.74340779, + "num_input_tokens_seen": 189608485, + "step": 8817, + "time_per_iteration": 2.8511815071105957 + }, + { + "auxiliary_loss_clip": 0.01093351, + "auxiliary_loss_mlp": 0.0103512, + "balance_loss_clip": 1.03981018, + "balance_loss_mlp": 1.01959896, + "epoch": 0.5301668420261536, + "flos": 20554298403840.0, + "grad_norm": 1.7783802077805728, + "language_loss": 0.65400332, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.67528808, + "num_input_tokens_seen": 189627815, + "step": 8818, + "time_per_iteration": 2.757228374481201 + }, + { + "auxiliary_loss_clip": 0.01075022, + "auxiliary_loss_mlp": 0.01033272, + "balance_loss_clip": 1.04101062, + "balance_loss_mlp": 1.01816273, + "epoch": 0.5302269652788216, + "flos": 17486665182720.0, + "grad_norm": 1.8529738404346974, + "language_loss": 0.75020683, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.77128971, + "num_input_tokens_seen": 189644850, + "step": 8819, + "time_per_iteration": 2.7458016872406006 + }, + { + "auxiliary_loss_clip": 0.01088004, + "auxiliary_loss_mlp": 0.01047287, + "balance_loss_clip": 1.04190588, + "balance_loss_mlp": 1.03143191, + "epoch": 0.5302870885314895, + "flos": 14574202715520.0, + "grad_norm": 2.236781046268797, + "language_loss": 0.81955135, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.84090424, + "num_input_tokens_seen": 189660945, + "step": 8820, + "time_per_iteration": 2.7917025089263916 + }, + { + "auxiliary_loss_clip": 0.01101102, + "auxiliary_loss_mlp": 0.01034433, + "balance_loss_clip": 1.04137421, + "balance_loss_mlp": 1.02192223, + "epoch": 0.5303472117841576, + "flos": 23438032968960.0, + "grad_norm": 1.5105877277652986, + "language_loss": 0.72733676, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.74869215, + "num_input_tokens_seen": 189680425, + "step": 8821, + "time_per_iteration": 2.664912462234497 + }, + { + "auxiliary_loss_clip": 0.01092575, + "auxiliary_loss_mlp": 0.01032249, + "balance_loss_clip": 1.04237318, + "balance_loss_mlp": 1.01958346, + "epoch": 0.5304073350368255, + "flos": 22709010533760.0, + "grad_norm": 1.4432589414019072, + "language_loss": 0.74112785, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.76237607, + "num_input_tokens_seen": 189700375, + "step": 8822, + "time_per_iteration": 2.7494471073150635 + }, + { + "auxiliary_loss_clip": 0.01087967, + "auxiliary_loss_mlp": 0.01034985, + "balance_loss_clip": 1.03945005, + "balance_loss_mlp": 1.02029884, + "epoch": 0.5304674582894935, + "flos": 27928554624000.0, + "grad_norm": 1.6561028390766985, + "language_loss": 0.67739707, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.69862658, + "num_input_tokens_seen": 189721225, + "step": 8823, + "time_per_iteration": 2.8298280239105225 + }, + { + "auxiliary_loss_clip": 0.01127487, + "auxiliary_loss_mlp": 0.01042695, + "balance_loss_clip": 1.0455004, + "balance_loss_mlp": 1.02722192, + "epoch": 0.5305275815421614, + "flos": 21250642440960.0, + "grad_norm": 1.7679489191905855, + "language_loss": 0.69459474, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.71629655, + "num_input_tokens_seen": 189740170, + "step": 8824, + "time_per_iteration": 2.7093706130981445 + }, + { + "auxiliary_loss_clip": 0.01098459, + "auxiliary_loss_mlp": 0.00770579, + "balance_loss_clip": 1.04351103, + "balance_loss_mlp": 1.00028551, + "epoch": 0.5305877047948294, + "flos": 17603088140160.0, + "grad_norm": 2.079936946962719, + "language_loss": 0.7578221, + "learning_rate": 1.898977700702689e-06, + "loss": 0.77651244, + "num_input_tokens_seen": 189757890, + "step": 8825, + "time_per_iteration": 2.7240397930145264 + }, + { + "auxiliary_loss_clip": 0.01042177, + "auxiliary_loss_mlp": 0.01041743, + "balance_loss_clip": 1.03510904, + "balance_loss_mlp": 1.02771175, + "epoch": 0.5306478280474973, + "flos": 15195493284480.0, + "grad_norm": 1.902170532497994, + "language_loss": 0.85671568, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.87755489, + "num_input_tokens_seen": 189775390, + "step": 8826, + "time_per_iteration": 2.786893367767334 + }, + { + "auxiliary_loss_clip": 0.0112111, + "auxiliary_loss_mlp": 0.0103337, + "balance_loss_clip": 1.04376101, + "balance_loss_mlp": 1.01967907, + "epoch": 0.5307079513001653, + "flos": 15341218761600.0, + "grad_norm": 1.3295158202050776, + "language_loss": 0.64655942, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.66810423, + "num_input_tokens_seen": 189793975, + "step": 8827, + "time_per_iteration": 2.650259017944336 + }, + { + "auxiliary_loss_clip": 0.01100521, + "auxiliary_loss_mlp": 0.01041689, + "balance_loss_clip": 1.04230511, + "balance_loss_mlp": 1.02720535, + "epoch": 0.5307680745528333, + "flos": 43544452688640.0, + "grad_norm": 1.5763280036459053, + "language_loss": 0.60055244, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.62197453, + "num_input_tokens_seen": 189817870, + "step": 8828, + "time_per_iteration": 2.9273712635040283 + }, + { + "auxiliary_loss_clip": 0.01115165, + "auxiliary_loss_mlp": 0.01032955, + "balance_loss_clip": 1.04400516, + "balance_loss_mlp": 1.01779199, + "epoch": 0.5308281978055013, + "flos": 20048928001920.0, + "grad_norm": 1.6623375431972864, + "language_loss": 0.81171465, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.83319587, + "num_input_tokens_seen": 189837905, + "step": 8829, + "time_per_iteration": 2.6640090942382812 + }, + { + "auxiliary_loss_clip": 0.01104846, + "auxiliary_loss_mlp": 0.01035043, + "balance_loss_clip": 1.043993, + "balance_loss_mlp": 1.02136946, + "epoch": 0.5308883210581693, + "flos": 20703938463360.0, + "grad_norm": 1.3895948203919835, + "language_loss": 0.78245443, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.80385327, + "num_input_tokens_seen": 189856970, + "step": 8830, + "time_per_iteration": 2.736316680908203 + }, + { + "auxiliary_loss_clip": 0.01111385, + "auxiliary_loss_mlp": 0.01033264, + "balance_loss_clip": 1.04335451, + "balance_loss_mlp": 1.02036524, + "epoch": 0.5309484443108372, + "flos": 14355506759040.0, + "grad_norm": 2.4391763831493165, + "language_loss": 0.8031435, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.82458997, + "num_input_tokens_seen": 189872830, + "step": 8831, + "time_per_iteration": 2.6151957511901855 + }, + { + "auxiliary_loss_clip": 0.01108777, + "auxiliary_loss_mlp": 0.01032917, + "balance_loss_clip": 1.0430057, + "balance_loss_mlp": 1.01951742, + "epoch": 0.5310085675635052, + "flos": 20010503427840.0, + "grad_norm": 4.592110703983282, + "language_loss": 0.73025942, + "learning_rate": 1.896255043672186e-06, + "loss": 0.75167632, + "num_input_tokens_seen": 189891635, + "step": 8832, + "time_per_iteration": 2.6464226245880127 + }, + { + "auxiliary_loss_clip": 0.01089691, + "auxiliary_loss_mlp": 0.01036866, + "balance_loss_clip": 1.04126275, + "balance_loss_mlp": 1.02198887, + "epoch": 0.5310686908161731, + "flos": 22127293774080.0, + "grad_norm": 2.4188792138763513, + "language_loss": 0.75694382, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.77820939, + "num_input_tokens_seen": 189909050, + "step": 8833, + "time_per_iteration": 2.757716178894043 + }, + { + "auxiliary_loss_clip": 0.01087272, + "auxiliary_loss_mlp": 0.01036493, + "balance_loss_clip": 1.03743505, + "balance_loss_mlp": 1.02260494, + "epoch": 0.5311288140688412, + "flos": 24717889445760.0, + "grad_norm": 1.6684529348681687, + "language_loss": 0.73618537, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.75742298, + "num_input_tokens_seen": 189927405, + "step": 8834, + "time_per_iteration": 2.7447376251220703 + }, + { + "auxiliary_loss_clip": 0.01127832, + "auxiliary_loss_mlp": 0.01042563, + "balance_loss_clip": 1.04435921, + "balance_loss_mlp": 1.02734029, + "epoch": 0.5311889373215091, + "flos": 24097712198400.0, + "grad_norm": 1.9940250251862053, + "language_loss": 0.77417272, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.79587668, + "num_input_tokens_seen": 189947740, + "step": 8835, + "time_per_iteration": 2.654860734939575 + }, + { + "auxiliary_loss_clip": 0.01097251, + "auxiliary_loss_mlp": 0.01046402, + "balance_loss_clip": 1.04259109, + "balance_loss_mlp": 1.03138208, + "epoch": 0.5312490605741771, + "flos": 22017012042240.0, + "grad_norm": 2.4706637723930505, + "language_loss": 0.72355223, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.7449888, + "num_input_tokens_seen": 189966495, + "step": 8836, + "time_per_iteration": 2.694772243499756 + }, + { + "auxiliary_loss_clip": 0.01104585, + "auxiliary_loss_mlp": 0.01040344, + "balance_loss_clip": 1.04374099, + "balance_loss_mlp": 1.02537167, + "epoch": 0.531309183826845, + "flos": 19390541662080.0, + "grad_norm": 1.705704926785557, + "language_loss": 0.81026083, + "learning_rate": 1.894310406375987e-06, + "loss": 0.8317101, + "num_input_tokens_seen": 189985325, + "step": 8837, + "time_per_iteration": 4.218893527984619 + }, + { + "auxiliary_loss_clip": 0.01107393, + "auxiliary_loss_mlp": 0.01036209, + "balance_loss_clip": 1.04489708, + "balance_loss_mlp": 1.02216005, + "epoch": 0.531369307079513, + "flos": 20190056538240.0, + "grad_norm": 1.8031911656804687, + "language_loss": 0.8618502, + "learning_rate": 1.893921490881035e-06, + "loss": 0.88328624, + "num_input_tokens_seen": 190003290, + "step": 8838, + "time_per_iteration": 4.327972888946533 + }, + { + "auxiliary_loss_clip": 0.01097617, + "auxiliary_loss_mlp": 0.01036447, + "balance_loss_clip": 1.04136765, + "balance_loss_mlp": 1.02366185, + "epoch": 0.5314294303321809, + "flos": 18880143356160.0, + "grad_norm": 1.7768925166398193, + "language_loss": 0.72961235, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.75095296, + "num_input_tokens_seen": 190023260, + "step": 8839, + "time_per_iteration": 4.2734081745147705 + }, + { + "auxiliary_loss_clip": 0.0110159, + "auxiliary_loss_mlp": 0.01042278, + "balance_loss_clip": 1.04086304, + "balance_loss_mlp": 1.02885473, + "epoch": 0.531489553584849, + "flos": 23040035297280.0, + "grad_norm": 1.7238696185302183, + "language_loss": 0.76902539, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.79046404, + "num_input_tokens_seen": 190042035, + "step": 8840, + "time_per_iteration": 2.708387613296509 + }, + { + "auxiliary_loss_clip": 0.01085488, + "auxiliary_loss_mlp": 0.01033907, + "balance_loss_clip": 1.04072022, + "balance_loss_mlp": 1.01934612, + "epoch": 0.5315496768375169, + "flos": 19790478668160.0, + "grad_norm": 2.0047240823259385, + "language_loss": 0.77301592, + "learning_rate": 1.892754768590216e-06, + "loss": 0.7942099, + "num_input_tokens_seen": 190057545, + "step": 8841, + "time_per_iteration": 2.6982758045196533 + }, + { + "auxiliary_loss_clip": 0.0102526, + "auxiliary_loss_mlp": 0.01022764, + "balance_loss_clip": 1.01826656, + "balance_loss_mlp": 1.02119017, + "epoch": 0.5316098000901849, + "flos": 71023228185600.0, + "grad_norm": 0.6981779601463162, + "language_loss": 0.56741858, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.58789885, + "num_input_tokens_seen": 190123800, + "step": 8842, + "time_per_iteration": 4.895024299621582 + }, + { + "auxiliary_loss_clip": 0.01102673, + "auxiliary_loss_mlp": 0.01041259, + "balance_loss_clip": 1.04331183, + "balance_loss_mlp": 1.02621484, + "epoch": 0.5316699233428529, + "flos": 16435560470400.0, + "grad_norm": 1.8735975877067965, + "language_loss": 0.73998511, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.76142448, + "num_input_tokens_seen": 190141625, + "step": 8843, + "time_per_iteration": 2.66169810295105 + }, + { + "auxiliary_loss_clip": 0.01023627, + "auxiliary_loss_mlp": 0.0100589, + "balance_loss_clip": 1.01690733, + "balance_loss_mlp": 1.00456095, + "epoch": 0.5317300465955208, + "flos": 67420814302080.0, + "grad_norm": 0.8814346849515853, + "language_loss": 0.61057651, + "learning_rate": 1.891588082900145e-06, + "loss": 0.63087165, + "num_input_tokens_seen": 190198110, + "step": 8844, + "time_per_iteration": 3.297545909881592 + }, + { + "auxiliary_loss_clip": 0.01032752, + "auxiliary_loss_mlp": 0.01005725, + "balance_loss_clip": 1.01528263, + "balance_loss_mlp": 1.00425863, + "epoch": 0.5317901698481888, + "flos": 59508075340800.0, + "grad_norm": 0.8422745451421196, + "language_loss": 0.62147105, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.64185584, + "num_input_tokens_seen": 190259950, + "step": 8845, + "time_per_iteration": 3.1747312545776367 + }, + { + "auxiliary_loss_clip": 0.01088974, + "auxiliary_loss_mlp": 0.01040872, + "balance_loss_clip": 1.04063165, + "balance_loss_mlp": 1.02521944, + "epoch": 0.5318502931008567, + "flos": 19129219240320.0, + "grad_norm": 1.8386701394288745, + "language_loss": 0.74980247, + "learning_rate": 1.890810312970474e-06, + "loss": 0.77110094, + "num_input_tokens_seen": 190278265, + "step": 8846, + "time_per_iteration": 2.734652519226074 + }, + { + "auxiliary_loss_clip": 0.01111858, + "auxiliary_loss_mlp": 0.01034985, + "balance_loss_clip": 1.04369533, + "balance_loss_mlp": 1.0226109, + "epoch": 0.5319104163535248, + "flos": 24681045070080.0, + "grad_norm": 1.562458752543025, + "language_loss": 0.75478411, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.77625251, + "num_input_tokens_seen": 190298400, + "step": 8847, + "time_per_iteration": 2.7175981998443604 + }, + { + "auxiliary_loss_clip": 0.0110005, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.04175198, + "balance_loss_mlp": 1.0193609, + "epoch": 0.5319705396061927, + "flos": 19385513758080.0, + "grad_norm": 1.5938668259379032, + "language_loss": 0.87875456, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.90007627, + "num_input_tokens_seen": 190316235, + "step": 8848, + "time_per_iteration": 2.777731418609619 + }, + { + "auxiliary_loss_clip": 0.01084561, + "auxiliary_loss_mlp": 0.01041363, + "balance_loss_clip": 1.04119325, + "balance_loss_mlp": 1.02549624, + "epoch": 0.5320306628588607, + "flos": 18259319664000.0, + "grad_norm": 2.1051582434291833, + "language_loss": 0.74326992, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.76452917, + "num_input_tokens_seen": 190335060, + "step": 8849, + "time_per_iteration": 2.7248313426971436 + }, + { + "auxiliary_loss_clip": 0.01107496, + "auxiliary_loss_mlp": 0.01030316, + "balance_loss_clip": 1.03895473, + "balance_loss_mlp": 1.0154624, + "epoch": 0.5320907861115286, + "flos": 23732321097600.0, + "grad_norm": 1.8915242874982603, + "language_loss": 0.79657137, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.81794947, + "num_input_tokens_seen": 190353265, + "step": 8850, + "time_per_iteration": 2.7357401847839355 + }, + { + "auxiliary_loss_clip": 0.01121659, + "auxiliary_loss_mlp": 0.01031858, + "balance_loss_clip": 1.04192996, + "balance_loss_mlp": 1.01804209, + "epoch": 0.5321509093641966, + "flos": 34495251321600.0, + "grad_norm": 1.633301633467878, + "language_loss": 0.55076206, + "learning_rate": 1.888865960862821e-06, + "loss": 0.57229722, + "num_input_tokens_seen": 190376575, + "step": 8851, + "time_per_iteration": 2.730081081390381 + }, + { + "auxiliary_loss_clip": 0.01110617, + "auxiliary_loss_mlp": 0.01036207, + "balance_loss_clip": 1.04243159, + "balance_loss_mlp": 1.0228914, + "epoch": 0.5322110326168645, + "flos": 20010934391040.0, + "grad_norm": 1.5393101812132837, + "language_loss": 0.68206942, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.70353764, + "num_input_tokens_seen": 190395185, + "step": 8852, + "time_per_iteration": 2.685267925262451 + }, + { + "auxiliary_loss_clip": 0.01020981, + "auxiliary_loss_mlp": 0.00752764, + "balance_loss_clip": 1.01425028, + "balance_loss_mlp": 0.99977398, + "epoch": 0.5322711558695326, + "flos": 64631164435200.0, + "grad_norm": 0.7921902417648442, + "language_loss": 0.62794167, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.64567912, + "num_input_tokens_seen": 190452595, + "step": 8853, + "time_per_iteration": 3.154197931289673 + }, + { + "auxiliary_loss_clip": 0.01113411, + "auxiliary_loss_mlp": 0.01027799, + "balance_loss_clip": 1.04064846, + "balance_loss_mlp": 1.01379788, + "epoch": 0.5323312791222005, + "flos": 14939342421120.0, + "grad_norm": 2.437651920606879, + "language_loss": 0.79789698, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.81930912, + "num_input_tokens_seen": 190469140, + "step": 8854, + "time_per_iteration": 2.6569535732269287 + }, + { + "auxiliary_loss_clip": 0.01092841, + "auxiliary_loss_mlp": 0.0102808, + "balance_loss_clip": 1.0418992, + "balance_loss_mlp": 1.01586115, + "epoch": 0.5323914023748685, + "flos": 23440834229760.0, + "grad_norm": 1.7182223658194644, + "language_loss": 0.73290253, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.75411177, + "num_input_tokens_seen": 190489015, + "step": 8855, + "time_per_iteration": 2.6984002590179443 + }, + { + "auxiliary_loss_clip": 0.01095667, + "auxiliary_loss_mlp": 0.0077104, + "balance_loss_clip": 1.03969502, + "balance_loss_mlp": 1.00030267, + "epoch": 0.5324515256275365, + "flos": 26286180134400.0, + "grad_norm": 1.9960339019119333, + "language_loss": 0.6505388, + "learning_rate": 1.886921714110507e-06, + "loss": 0.66920584, + "num_input_tokens_seen": 190508065, + "step": 8856, + "time_per_iteration": 2.7057278156280518 + }, + { + "auxiliary_loss_clip": 0.01100444, + "auxiliary_loss_mlp": 0.0103908, + "balance_loss_clip": 1.04079795, + "balance_loss_mlp": 1.02341616, + "epoch": 0.5325116488802044, + "flos": 26870913636480.0, + "grad_norm": 2.078757662178109, + "language_loss": 0.77651089, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.79790616, + "num_input_tokens_seen": 190527045, + "step": 8857, + "time_per_iteration": 2.764199733734131 + }, + { + "auxiliary_loss_clip": 0.01092407, + "auxiliary_loss_mlp": 0.01034608, + "balance_loss_clip": 1.04279578, + "balance_loss_mlp": 1.02039194, + "epoch": 0.5325717721328724, + "flos": 25884734757120.0, + "grad_norm": 2.3746118235231592, + "language_loss": 0.70823711, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.72950727, + "num_input_tokens_seen": 190544075, + "step": 8858, + "time_per_iteration": 2.735534191131592 + }, + { + "auxiliary_loss_clip": 0.01108427, + "auxiliary_loss_mlp": 0.01040186, + "balance_loss_clip": 1.0411067, + "balance_loss_mlp": 1.02518916, + "epoch": 0.5326318953855403, + "flos": 21799321666560.0, + "grad_norm": 1.83105211007431, + "language_loss": 0.69232476, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.71381092, + "num_input_tokens_seen": 190566030, + "step": 8859, + "time_per_iteration": 2.773764133453369 + }, + { + "auxiliary_loss_clip": 0.01109944, + "auxiliary_loss_mlp": 0.01028838, + "balance_loss_clip": 1.04517436, + "balance_loss_mlp": 1.01671481, + "epoch": 0.5326920186382084, + "flos": 20922921728640.0, + "grad_norm": 1.8423028887831514, + "language_loss": 0.69617528, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.71756315, + "num_input_tokens_seen": 190585605, + "step": 8860, + "time_per_iteration": 2.689471483230591 + }, + { + "auxiliary_loss_clip": 0.01102885, + "auxiliary_loss_mlp": 0.01035827, + "balance_loss_clip": 1.0451107, + "balance_loss_mlp": 1.02258921, + "epoch": 0.5327521418908763, + "flos": 21433427775360.0, + "grad_norm": 2.3281195979693297, + "language_loss": 0.78340018, + "learning_rate": 1.884977574556683e-06, + "loss": 0.80478734, + "num_input_tokens_seen": 190604625, + "step": 8861, + "time_per_iteration": 2.66679048538208 + }, + { + "auxiliary_loss_clip": 0.01077125, + "auxiliary_loss_mlp": 0.01040454, + "balance_loss_clip": 1.03987145, + "balance_loss_mlp": 1.02606571, + "epoch": 0.5328122651435443, + "flos": 21760250647680.0, + "grad_norm": 1.7664447291359346, + "language_loss": 0.85554659, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.87672234, + "num_input_tokens_seen": 190625060, + "step": 8862, + "time_per_iteration": 2.7928006649017334 + }, + { + "auxiliary_loss_clip": 0.0109879, + "auxiliary_loss_mlp": 0.01039782, + "balance_loss_clip": 1.03952289, + "balance_loss_mlp": 1.0237242, + "epoch": 0.5328723883962122, + "flos": 18296487262080.0, + "grad_norm": 2.2696975914187116, + "language_loss": 0.62147439, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.64286011, + "num_input_tokens_seen": 190643150, + "step": 8863, + "time_per_iteration": 2.685253381729126 + }, + { + "auxiliary_loss_clip": 0.01098767, + "auxiliary_loss_mlp": 0.01040661, + "balance_loss_clip": 1.04511809, + "balance_loss_mlp": 1.02661765, + "epoch": 0.5329325116488802, + "flos": 25374911068800.0, + "grad_norm": 1.8529881391436633, + "language_loss": 0.73310483, + "learning_rate": 1.883811143046377e-06, + "loss": 0.75449914, + "num_input_tokens_seen": 190662725, + "step": 8864, + "time_per_iteration": 2.703639030456543 + }, + { + "auxiliary_loss_clip": 0.01120661, + "auxiliary_loss_mlp": 0.01035736, + "balance_loss_clip": 1.04301071, + "balance_loss_mlp": 1.02275968, + "epoch": 0.5329926349015481, + "flos": 25592098654080.0, + "grad_norm": 1.6333657309737846, + "language_loss": 0.64201105, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.66357499, + "num_input_tokens_seen": 190683680, + "step": 8865, + "time_per_iteration": 2.691087245941162 + }, + { + "auxiliary_loss_clip": 0.01113033, + "auxiliary_loss_mlp": 0.01029706, + "balance_loss_clip": 1.0424211, + "balance_loss_mlp": 1.01641965, + "epoch": 0.5330527581542162, + "flos": 22889605138560.0, + "grad_norm": 3.0767575494694985, + "language_loss": 0.78091645, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.80234385, + "num_input_tokens_seen": 190703350, + "step": 8866, + "time_per_iteration": 2.674612283706665 + }, + { + "auxiliary_loss_clip": 0.01108068, + "auxiliary_loss_mlp": 0.0103023, + "balance_loss_clip": 1.04092908, + "balance_loss_mlp": 1.01696241, + "epoch": 0.5331128814068841, + "flos": 16026752805120.0, + "grad_norm": 1.842224927457961, + "language_loss": 0.73840493, + "learning_rate": 1.882644751189108e-06, + "loss": 0.75978798, + "num_input_tokens_seen": 190721170, + "step": 8867, + "time_per_iteration": 2.6963648796081543 + }, + { + "auxiliary_loss_clip": 0.01098718, + "auxiliary_loss_mlp": 0.01039247, + "balance_loss_clip": 1.04040504, + "balance_loss_mlp": 1.02402985, + "epoch": 0.5331730046595521, + "flos": 39344699629440.0, + "grad_norm": 1.5703549780422514, + "language_loss": 0.71881396, + "learning_rate": 1.88225596278394e-06, + "loss": 0.74019361, + "num_input_tokens_seen": 190743795, + "step": 8868, + "time_per_iteration": 2.830118417739868 + }, + { + "auxiliary_loss_clip": 0.01090763, + "auxiliary_loss_mlp": 0.01034625, + "balance_loss_clip": 1.04197335, + "balance_loss_mlp": 1.0212791, + "epoch": 0.5332331279122201, + "flos": 24024382583040.0, + "grad_norm": 5.550281122060094, + "language_loss": 0.78397369, + "learning_rate": 1.881867178843637e-06, + "loss": 0.80522758, + "num_input_tokens_seen": 190761560, + "step": 8869, + "time_per_iteration": 2.738565444946289 + }, + { + "auxiliary_loss_clip": 0.01114623, + "auxiliary_loss_mlp": 0.01037147, + "balance_loss_clip": 1.0432862, + "balance_loss_mlp": 1.02336633, + "epoch": 0.533293251164888, + "flos": 17129318728320.0, + "grad_norm": 1.7588400416982446, + "language_loss": 0.75840724, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.77992487, + "num_input_tokens_seen": 190778875, + "step": 8870, + "time_per_iteration": 2.598963499069214 + }, + { + "auxiliary_loss_clip": 0.01100618, + "auxiliary_loss_mlp": 0.01038316, + "balance_loss_clip": 1.04231286, + "balance_loss_mlp": 1.02373052, + "epoch": 0.533353374417556, + "flos": 22126360020480.0, + "grad_norm": 5.617051153369423, + "language_loss": 0.75663799, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.7780273, + "num_input_tokens_seen": 190799830, + "step": 8871, + "time_per_iteration": 2.7459628582000732 + }, + { + "auxiliary_loss_clip": 0.01099152, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.04201055, + "balance_loss_mlp": 1.0202924, + "epoch": 0.533413497670224, + "flos": 15011091838080.0, + "grad_norm": 1.8041252581471448, + "language_loss": 0.7247498, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.74607694, + "num_input_tokens_seen": 190817155, + "step": 8872, + "time_per_iteration": 2.6604373455047607 + }, + { + "auxiliary_loss_clip": 0.01100126, + "auxiliary_loss_mlp": 0.0104147, + "balance_loss_clip": 1.04733372, + "balance_loss_mlp": 1.02694392, + "epoch": 0.533473620922892, + "flos": 19609955890560.0, + "grad_norm": 1.7875555414889834, + "language_loss": 0.65306997, + "learning_rate": 1.880312088025936e-06, + "loss": 0.67448598, + "num_input_tokens_seen": 190835240, + "step": 8873, + "time_per_iteration": 2.6587424278259277 + }, + { + "auxiliary_loss_clip": 0.01098214, + "auxiliary_loss_mlp": 0.0104372, + "balance_loss_clip": 1.04254389, + "balance_loss_mlp": 1.03035116, + "epoch": 0.5335337441755599, + "flos": 14282644020480.0, + "grad_norm": 2.157272820213575, + "language_loss": 0.80225539, + "learning_rate": 1.879923326631099e-06, + "loss": 0.82367474, + "num_input_tokens_seen": 190851620, + "step": 8874, + "time_per_iteration": 2.723454475402832 + }, + { + "auxiliary_loss_clip": 0.01112328, + "auxiliary_loss_mlp": 0.01030163, + "balance_loss_clip": 1.04300058, + "balance_loss_mlp": 1.01653171, + "epoch": 0.5335938674282279, + "flos": 20814830726400.0, + "grad_norm": 1.8315602861194333, + "language_loss": 0.69789159, + "learning_rate": 1.879534569789582e-06, + "loss": 0.71931654, + "num_input_tokens_seen": 190870545, + "step": 8875, + "time_per_iteration": 2.6051578521728516 + }, + { + "auxiliary_loss_clip": 0.01045431, + "auxiliary_loss_mlp": 0.01001312, + "balance_loss_clip": 1.01922286, + "balance_loss_mlp": 0.99979252, + "epoch": 0.5336539906808958, + "flos": 71396448451200.0, + "grad_norm": 0.7211200965927701, + "language_loss": 0.59631079, + "learning_rate": 1.879145817516126e-06, + "loss": 0.61677825, + "num_input_tokens_seen": 190931995, + "step": 8876, + "time_per_iteration": 3.3114185333251953 + }, + { + "auxiliary_loss_clip": 0.01113481, + "auxiliary_loss_mlp": 0.01040016, + "balance_loss_clip": 1.04467189, + "balance_loss_mlp": 1.02705741, + "epoch": 0.5337141139335638, + "flos": 20152996680960.0, + "grad_norm": 1.6786856291224888, + "language_loss": 0.74847406, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.77000904, + "num_input_tokens_seen": 190949890, + "step": 8877, + "time_per_iteration": 4.474783182144165 + }, + { + "auxiliary_loss_clip": 0.01030394, + "auxiliary_loss_mlp": 0.01002162, + "balance_loss_clip": 1.01585436, + "balance_loss_mlp": 1.00046921, + "epoch": 0.5337742371862317, + "flos": 67728387484800.0, + "grad_norm": 0.7582021069840851, + "language_loss": 0.57155037, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.59187591, + "num_input_tokens_seen": 191008480, + "step": 8878, + "time_per_iteration": 4.623803615570068 + }, + { + "auxiliary_loss_clip": 0.0112711, + "auxiliary_loss_mlp": 0.0103613, + "balance_loss_clip": 1.04414368, + "balance_loss_mlp": 1.02169418, + "epoch": 0.5338343604388998, + "flos": 25008909436800.0, + "grad_norm": 1.4672061419232192, + "language_loss": 0.72301328, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.74464571, + "num_input_tokens_seen": 191028995, + "step": 8879, + "time_per_iteration": 2.646631956100464 + }, + { + "auxiliary_loss_clip": 0.01126385, + "auxiliary_loss_mlp": 0.01039416, + "balance_loss_clip": 1.04535294, + "balance_loss_mlp": 1.02487254, + "epoch": 0.5338944836915677, + "flos": 17601256546560.0, + "grad_norm": 2.878615745391383, + "language_loss": 0.83403212, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.85569012, + "num_input_tokens_seen": 191045285, + "step": 8880, + "time_per_iteration": 2.578953504562378 + }, + { + "auxiliary_loss_clip": 0.01053817, + "auxiliary_loss_mlp": 0.01036139, + "balance_loss_clip": 1.03627348, + "balance_loss_mlp": 1.02279377, + "epoch": 0.5339546069442357, + "flos": 21724124544000.0, + "grad_norm": 1.3711441541735603, + "language_loss": 0.79637486, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.81727445, + "num_input_tokens_seen": 191066105, + "step": 8881, + "time_per_iteration": 4.335238695144653 + }, + { + "auxiliary_loss_clip": 0.0102058, + "auxiliary_loss_mlp": 0.00999984, + "balance_loss_clip": 1.01616335, + "balance_loss_mlp": 0.99846381, + "epoch": 0.5340147301969036, + "flos": 69723583315200.0, + "grad_norm": 0.7924040124288975, + "language_loss": 0.59248376, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.61268938, + "num_input_tokens_seen": 191126315, + "step": 8882, + "time_per_iteration": 3.1252357959747314 + }, + { + "auxiliary_loss_clip": 0.01025577, + "auxiliary_loss_mlp": 0.01019116, + "balance_loss_clip": 1.01780772, + "balance_loss_mlp": 1.01768577, + "epoch": 0.5340748534495716, + "flos": 63880701580800.0, + "grad_norm": 0.8651438881324313, + "language_loss": 0.63574433, + "learning_rate": 1.876424680745913e-06, + "loss": 0.65619123, + "num_input_tokens_seen": 191174240, + "step": 8883, + "time_per_iteration": 3.0245001316070557 + }, + { + "auxiliary_loss_clip": 0.01079245, + "auxiliary_loss_mlp": 0.01040102, + "balance_loss_clip": 1.03873086, + "balance_loss_mlp": 1.02523685, + "epoch": 0.5341349767022396, + "flos": 28694313694080.0, + "grad_norm": 2.1049960022330385, + "language_loss": 0.8200773, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.8412708, + "num_input_tokens_seen": 191193335, + "step": 8884, + "time_per_iteration": 2.8096158504486084 + }, + { + "auxiliary_loss_clip": 0.01088886, + "auxiliary_loss_mlp": 0.01042992, + "balance_loss_clip": 1.0403688, + "balance_loss_mlp": 1.02865684, + "epoch": 0.5341950999549075, + "flos": 16289691338880.0, + "grad_norm": 1.6281705583461854, + "language_loss": 0.72372848, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.74504721, + "num_input_tokens_seen": 191210900, + "step": 8885, + "time_per_iteration": 2.6555016040802 + }, + { + "auxiliary_loss_clip": 0.01103878, + "auxiliary_loss_mlp": 0.01037971, + "balance_loss_clip": 1.04014146, + "balance_loss_mlp": 1.02301598, + "epoch": 0.5342552232075756, + "flos": 14355650413440.0, + "grad_norm": 2.9046192596208846, + "language_loss": 0.79004246, + "learning_rate": 1.87525854926798e-06, + "loss": 0.81146097, + "num_input_tokens_seen": 191226730, + "step": 8886, + "time_per_iteration": 2.6476478576660156 + }, + { + "auxiliary_loss_clip": 0.01083524, + "auxiliary_loss_mlp": 0.00772223, + "balance_loss_clip": 1.04013681, + "balance_loss_mlp": 1.00027037, + "epoch": 0.5343153464602435, + "flos": 30297976300800.0, + "grad_norm": 1.5332505330022492, + "language_loss": 0.750615, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.76917243, + "num_input_tokens_seen": 191250435, + "step": 8887, + "time_per_iteration": 2.7690041065216064 + }, + { + "auxiliary_loss_clip": 0.01095123, + "auxiliary_loss_mlp": 0.01034453, + "balance_loss_clip": 1.03800249, + "balance_loss_mlp": 1.02050543, + "epoch": 0.5343754697129115, + "flos": 15596292216960.0, + "grad_norm": 2.322348043408552, + "language_loss": 0.68717337, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.70846909, + "num_input_tokens_seen": 191268315, + "step": 8888, + "time_per_iteration": 2.631999969482422 + }, + { + "auxiliary_loss_clip": 0.01118819, + "auxiliary_loss_mlp": 0.01041785, + "balance_loss_clip": 1.04266095, + "balance_loss_mlp": 1.02738404, + "epoch": 0.5344355929655794, + "flos": 16909617191040.0, + "grad_norm": 2.080624189448151, + "language_loss": 0.77346873, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.79507482, + "num_input_tokens_seen": 191287000, + "step": 8889, + "time_per_iteration": 2.621675729751587 + }, + { + "auxiliary_loss_clip": 0.01122598, + "auxiliary_loss_mlp": 0.01042684, + "balance_loss_clip": 1.04449213, + "balance_loss_mlp": 1.02922511, + "epoch": 0.5344957162182474, + "flos": 16798186224000.0, + "grad_norm": 2.052201989860069, + "language_loss": 0.69323713, + "learning_rate": 1.873703773589102e-06, + "loss": 0.71489, + "num_input_tokens_seen": 191304565, + "step": 8890, + "time_per_iteration": 2.6052801609039307 + }, + { + "auxiliary_loss_clip": 0.01128191, + "auxiliary_loss_mlp": 0.01052498, + "balance_loss_clip": 1.04494905, + "balance_loss_mlp": 1.0359515, + "epoch": 0.5345558394709153, + "flos": 12705590413440.0, + "grad_norm": 2.21737658698942, + "language_loss": 0.77022809, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.79203498, + "num_input_tokens_seen": 191318300, + "step": 8891, + "time_per_iteration": 2.533200263977051 + }, + { + "auxiliary_loss_clip": 0.01103794, + "auxiliary_loss_mlp": 0.01042349, + "balance_loss_clip": 1.04030669, + "balance_loss_mlp": 1.02807951, + "epoch": 0.5346159627235834, + "flos": 22455050400000.0, + "grad_norm": 2.8109589169570857, + "language_loss": 0.74259919, + "learning_rate": 1.872926414425699e-06, + "loss": 0.76406056, + "num_input_tokens_seen": 191337925, + "step": 8892, + "time_per_iteration": 2.674466609954834 + }, + { + "auxiliary_loss_clip": 0.01107598, + "auxiliary_loss_mlp": 0.01038096, + "balance_loss_clip": 1.04592252, + "balance_loss_mlp": 1.02414215, + "epoch": 0.5346760859762513, + "flos": 22415763899520.0, + "grad_norm": 1.9745937936433648, + "language_loss": 0.87865257, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.90010953, + "num_input_tokens_seen": 191357120, + "step": 8893, + "time_per_iteration": 2.7012922763824463 + }, + { + "auxiliary_loss_clip": 0.0111971, + "auxiliary_loss_mlp": 0.01036459, + "balance_loss_clip": 1.04291701, + "balance_loss_mlp": 1.02377474, + "epoch": 0.5347362092289193, + "flos": 22816131868800.0, + "grad_norm": 1.9421223728327293, + "language_loss": 0.72379559, + "learning_rate": 1.872149074536869e-06, + "loss": 0.74535728, + "num_input_tokens_seen": 191375395, + "step": 8894, + "time_per_iteration": 2.590670108795166 + }, + { + "auxiliary_loss_clip": 0.01111441, + "auxiliary_loss_mlp": 0.01031552, + "balance_loss_clip": 1.04253268, + "balance_loss_mlp": 1.01799238, + "epoch": 0.5347963324815872, + "flos": 23219480666880.0, + "grad_norm": 1.965554622310178, + "language_loss": 0.74611443, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.76754439, + "num_input_tokens_seen": 191395595, + "step": 8895, + "time_per_iteration": 2.6462347507476807 + }, + { + "auxiliary_loss_clip": 0.01089565, + "auxiliary_loss_mlp": 0.01036535, + "balance_loss_clip": 1.04067063, + "balance_loss_mlp": 1.02246881, + "epoch": 0.5348564557342552, + "flos": 22601350494720.0, + "grad_norm": 1.8148089507657776, + "language_loss": 0.76860476, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.78986579, + "num_input_tokens_seen": 191413730, + "step": 8896, + "time_per_iteration": 2.6798579692840576 + }, + { + "auxiliary_loss_clip": 0.01093639, + "auxiliary_loss_mlp": 0.01027964, + "balance_loss_clip": 1.04279101, + "balance_loss_mlp": 1.01502943, + "epoch": 0.5349165789869232, + "flos": 18002378701440.0, + "grad_norm": 1.8518658883520687, + "language_loss": 0.78188956, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.80310559, + "num_input_tokens_seen": 191432400, + "step": 8897, + "time_per_iteration": 2.6509950160980225 + }, + { + "auxiliary_loss_clip": 0.01113143, + "auxiliary_loss_mlp": 0.01032016, + "balance_loss_clip": 1.04366183, + "balance_loss_mlp": 1.01799703, + "epoch": 0.5349767022395912, + "flos": 17159770483200.0, + "grad_norm": 1.7403204910626056, + "language_loss": 0.75393677, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.7753883, + "num_input_tokens_seen": 191448855, + "step": 8898, + "time_per_iteration": 2.682753086090088 + }, + { + "auxiliary_loss_clip": 0.01037971, + "auxiliary_loss_mlp": 0.01005108, + "balance_loss_clip": 1.0205543, + "balance_loss_mlp": 1.00373161, + "epoch": 0.5350368254922592, + "flos": 70992058158720.0, + "grad_norm": 0.9010106507685076, + "language_loss": 0.57955837, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.59998918, + "num_input_tokens_seen": 191519690, + "step": 8899, + "time_per_iteration": 3.3475701808929443 + }, + { + "auxiliary_loss_clip": 0.01101715, + "auxiliary_loss_mlp": 0.0103468, + "balance_loss_clip": 1.04445124, + "balance_loss_mlp": 1.02107263, + "epoch": 0.5350969487449271, + "flos": 27417833095680.0, + "grad_norm": 2.547752496503206, + "language_loss": 0.69974548, + "learning_rate": 1.869817171696868e-06, + "loss": 0.72110939, + "num_input_tokens_seen": 191539380, + "step": 8900, + "time_per_iteration": 2.7260618209838867 + }, + { + "auxiliary_loss_clip": 0.01099442, + "auxiliary_loss_mlp": 0.01035798, + "balance_loss_clip": 1.03943968, + "balance_loss_mlp": 1.02212465, + "epoch": 0.5351570719975951, + "flos": 19316134638720.0, + "grad_norm": 1.7903210344042488, + "language_loss": 0.71756148, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.73891389, + "num_input_tokens_seen": 191557400, + "step": 8901, + "time_per_iteration": 2.661510467529297 + }, + { + "auxiliary_loss_clip": 0.01087314, + "auxiliary_loss_mlp": 0.01036972, + "balance_loss_clip": 1.03631806, + "balance_loss_mlp": 1.02237511, + "epoch": 0.535217195250263, + "flos": 19828580019840.0, + "grad_norm": 1.7989041746924002, + "language_loss": 0.77021015, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.791453, + "num_input_tokens_seen": 191575860, + "step": 8902, + "time_per_iteration": 2.69665789604187 + }, + { + "auxiliary_loss_clip": 0.01087231, + "auxiliary_loss_mlp": 0.01041891, + "balance_loss_clip": 1.04053831, + "balance_loss_mlp": 1.0283792, + "epoch": 0.535277318502931, + "flos": 22127868391680.0, + "grad_norm": 1.509633063766185, + "language_loss": 0.70147592, + "learning_rate": 1.868651286721281e-06, + "loss": 0.72276717, + "num_input_tokens_seen": 191595775, + "step": 8903, + "time_per_iteration": 2.676028251647949 + }, + { + "auxiliary_loss_clip": 0.0111537, + "auxiliary_loss_mlp": 0.00772296, + "balance_loss_clip": 1.04395127, + "balance_loss_mlp": 1.00028765, + "epoch": 0.5353374417555989, + "flos": 25045897466880.0, + "grad_norm": 1.6001480056643833, + "language_loss": 0.72911739, + "learning_rate": 1.86826266833795e-06, + "loss": 0.74799401, + "num_input_tokens_seen": 191617785, + "step": 8904, + "time_per_iteration": 2.7466139793395996 + }, + { + "auxiliary_loss_clip": 0.01099985, + "auxiliary_loss_mlp": 0.01041546, + "balance_loss_clip": 1.04453778, + "balance_loss_mlp": 1.02705002, + "epoch": 0.535397565008267, + "flos": 19388710068480.0, + "grad_norm": 1.8242307652956307, + "language_loss": 0.73365581, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.7550711, + "num_input_tokens_seen": 191636900, + "step": 8905, + "time_per_iteration": 2.772406578063965 + }, + { + "auxiliary_loss_clip": 0.01105525, + "auxiliary_loss_mlp": 0.0103776, + "balance_loss_clip": 1.04141188, + "balance_loss_mlp": 1.02607787, + "epoch": 0.5354576882609349, + "flos": 21471205904640.0, + "grad_norm": 1.6628200467542797, + "language_loss": 0.83795619, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.85938901, + "num_input_tokens_seen": 191656720, + "step": 8906, + "time_per_iteration": 2.7151100635528564 + }, + { + "auxiliary_loss_clip": 0.01115256, + "auxiliary_loss_mlp": 0.00771962, + "balance_loss_clip": 1.04406035, + "balance_loss_mlp": 1.00027847, + "epoch": 0.5355178115136029, + "flos": 20777519473920.0, + "grad_norm": 1.884591574516044, + "language_loss": 0.74096596, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.75983804, + "num_input_tokens_seen": 191674445, + "step": 8907, + "time_per_iteration": 2.6978471279144287 + }, + { + "auxiliary_loss_clip": 0.01106969, + "auxiliary_loss_mlp": 0.01040191, + "balance_loss_clip": 1.04144001, + "balance_loss_mlp": 1.02508759, + "epoch": 0.5355779347662708, + "flos": 23514020190720.0, + "grad_norm": 2.160786888323469, + "language_loss": 0.76593792, + "learning_rate": 1.866708244906912e-06, + "loss": 0.7874096, + "num_input_tokens_seen": 191695000, + "step": 8908, + "time_per_iteration": 2.6536221504211426 + }, + { + "auxiliary_loss_clip": 0.01097449, + "auxiliary_loss_mlp": 0.00772377, + "balance_loss_clip": 1.04248428, + "balance_loss_mlp": 1.00030112, + "epoch": 0.5356380580189388, + "flos": 20303211358080.0, + "grad_norm": 3.03117864576072, + "language_loss": 0.740637, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.75933528, + "num_input_tokens_seen": 191713295, + "step": 8909, + "time_per_iteration": 2.665473461151123 + }, + { + "auxiliary_loss_clip": 0.01082798, + "auxiliary_loss_mlp": 0.01042054, + "balance_loss_clip": 1.0436362, + "balance_loss_mlp": 1.02891159, + "epoch": 0.5356981812716068, + "flos": 21361642444800.0, + "grad_norm": 2.1999922776778233, + "language_loss": 0.84319562, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.86444414, + "num_input_tokens_seen": 191732725, + "step": 8910, + "time_per_iteration": 2.715521812438965 + }, + { + "auxiliary_loss_clip": 0.01102329, + "auxiliary_loss_mlp": 0.0103318, + "balance_loss_clip": 1.04114723, + "balance_loss_mlp": 1.01928067, + "epoch": 0.5357583045242748, + "flos": 23111246010240.0, + "grad_norm": 1.6725390900013062, + "language_loss": 0.81822705, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.8395822, + "num_input_tokens_seen": 191753765, + "step": 8911, + "time_per_iteration": 2.715254068374634 + }, + { + "auxiliary_loss_clip": 0.0108401, + "auxiliary_loss_mlp": 0.01044047, + "balance_loss_clip": 1.04012454, + "balance_loss_mlp": 1.03019536, + "epoch": 0.5358184277769428, + "flos": 21141761339520.0, + "grad_norm": 5.639232337071921, + "language_loss": 0.69078076, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.71206129, + "num_input_tokens_seen": 191773560, + "step": 8912, + "time_per_iteration": 2.6743216514587402 + }, + { + "auxiliary_loss_clip": 0.01098459, + "auxiliary_loss_mlp": 0.01036297, + "balance_loss_clip": 1.04129279, + "balance_loss_mlp": 1.02273059, + "epoch": 0.5358785510296107, + "flos": 16282400878080.0, + "grad_norm": 2.041064157993178, + "language_loss": 0.71507263, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.73642015, + "num_input_tokens_seen": 191791255, + "step": 8913, + "time_per_iteration": 2.6959731578826904 + }, + { + "auxiliary_loss_clip": 0.01092724, + "auxiliary_loss_mlp": 0.01039004, + "balance_loss_clip": 1.04161441, + "balance_loss_mlp": 1.02512836, + "epoch": 0.5359386742822787, + "flos": 16976877408000.0, + "grad_norm": 1.9206134465038889, + "language_loss": 0.72290546, + "learning_rate": 1.864376761688156e-06, + "loss": 0.74422276, + "num_input_tokens_seen": 191809325, + "step": 8914, + "time_per_iteration": 2.678020477294922 + }, + { + "auxiliary_loss_clip": 0.01104699, + "auxiliary_loss_mlp": 0.01039806, + "balance_loss_clip": 1.04611683, + "balance_loss_mlp": 1.02468383, + "epoch": 0.5359987975349466, + "flos": 20812927305600.0, + "grad_norm": 1.8719693529557881, + "language_loss": 0.70668626, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.72813135, + "num_input_tokens_seen": 191829795, + "step": 8915, + "time_per_iteration": 2.653940200805664 + }, + { + "auxiliary_loss_clip": 0.01094002, + "auxiliary_loss_mlp": 0.01045487, + "balance_loss_clip": 1.04047489, + "balance_loss_mlp": 1.03118742, + "epoch": 0.5360589207876146, + "flos": 22199941031040.0, + "grad_norm": 1.5982896811499068, + "language_loss": 0.74664176, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.76803666, + "num_input_tokens_seen": 191850840, + "step": 8916, + "time_per_iteration": 4.3477959632873535 + }, + { + "auxiliary_loss_clip": 0.01081313, + "auxiliary_loss_mlp": 0.00772126, + "balance_loss_clip": 1.04081666, + "balance_loss_mlp": 1.00021815, + "epoch": 0.5361190440402825, + "flos": 31394365084800.0, + "grad_norm": 1.8553858112595492, + "language_loss": 0.72677946, + "learning_rate": 1.863211089308289e-06, + "loss": 0.74531382, + "num_input_tokens_seen": 191869520, + "step": 8917, + "time_per_iteration": 2.808074712753296 + }, + { + "auxiliary_loss_clip": 0.01102423, + "auxiliary_loss_mlp": 0.01041518, + "balance_loss_clip": 1.0441047, + "balance_loss_mlp": 1.02715325, + "epoch": 0.5361791672929506, + "flos": 16069882060800.0, + "grad_norm": 1.960367430660897, + "language_loss": 0.71014392, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.73158336, + "num_input_tokens_seen": 191887240, + "step": 8918, + "time_per_iteration": 4.185984134674072 + }, + { + "auxiliary_loss_clip": 0.01106012, + "auxiliary_loss_mlp": 0.01036881, + "balance_loss_clip": 1.0469594, + "balance_loss_mlp": 1.02306461, + "epoch": 0.5362392905456185, + "flos": 20740926493440.0, + "grad_norm": 1.4605213362212828, + "language_loss": 0.74976659, + "learning_rate": 1.862434000299067e-06, + "loss": 0.77119553, + "num_input_tokens_seen": 191905690, + "step": 8919, + "time_per_iteration": 2.694120407104492 + }, + { + "auxiliary_loss_clip": 0.01093376, + "auxiliary_loss_mlp": 0.01035953, + "balance_loss_clip": 1.04010797, + "balance_loss_mlp": 1.02207744, + "epoch": 0.5362994137982865, + "flos": 17340077779200.0, + "grad_norm": 1.9976483392210334, + "language_loss": 0.71690488, + "learning_rate": 1.862045463611864e-06, + "loss": 0.73819816, + "num_input_tokens_seen": 191920725, + "step": 8920, + "time_per_iteration": 2.6273410320281982 + }, + { + "auxiliary_loss_clip": 0.01105087, + "auxiliary_loss_mlp": 0.01040608, + "balance_loss_clip": 1.03961456, + "balance_loss_mlp": 1.02532554, + "epoch": 0.5363595370509544, + "flos": 42813957795840.0, + "grad_norm": 1.3877970230156793, + "language_loss": 0.68828928, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.70974618, + "num_input_tokens_seen": 191944645, + "step": 8921, + "time_per_iteration": 4.31537938117981 + }, + { + "auxiliary_loss_clip": 0.01114121, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.04631782, + "balance_loss_mlp": 1.0227288, + "epoch": 0.5364196603036224, + "flos": 19171953446400.0, + "grad_norm": 1.8336561717381605, + "language_loss": 0.81926084, + "learning_rate": 1.86126840594594e-06, + "loss": 0.84076393, + "num_input_tokens_seen": 191962265, + "step": 8922, + "time_per_iteration": 2.6045267581939697 + }, + { + "auxiliary_loss_clip": 0.01117037, + "auxiliary_loss_mlp": 0.01031036, + "balance_loss_clip": 1.04637003, + "balance_loss_mlp": 1.01782727, + "epoch": 0.5364797835562904, + "flos": 17931060247680.0, + "grad_norm": 2.029402038210475, + "language_loss": 0.76969302, + "learning_rate": 1.860879884996686e-06, + "loss": 0.79117376, + "num_input_tokens_seen": 191978850, + "step": 8923, + "time_per_iteration": 2.627131223678589 + }, + { + "auxiliary_loss_clip": 0.01097305, + "auxiliary_loss_mlp": 0.01035531, + "balance_loss_clip": 1.04099584, + "balance_loss_mlp": 1.02144074, + "epoch": 0.5365399068089584, + "flos": 30228058477440.0, + "grad_norm": 1.4696173336709724, + "language_loss": 0.70680726, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.72813559, + "num_input_tokens_seen": 192002000, + "step": 8924, + "time_per_iteration": 2.7947139739990234 + }, + { + "auxiliary_loss_clip": 0.01093943, + "auxiliary_loss_mlp": 0.01040337, + "balance_loss_clip": 1.0430336, + "balance_loss_mlp": 1.02501917, + "epoch": 0.5366000300616264, + "flos": 24891696380160.0, + "grad_norm": 2.0937693746484456, + "language_loss": 0.87335229, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.8946951, + "num_input_tokens_seen": 192019100, + "step": 8925, + "time_per_iteration": 2.768362045288086 + }, + { + "auxiliary_loss_clip": 0.01123484, + "auxiliary_loss_mlp": 0.01031699, + "balance_loss_clip": 1.04188776, + "balance_loss_mlp": 1.01764417, + "epoch": 0.5366601533142943, + "flos": 29826649013760.0, + "grad_norm": 1.5047259348419413, + "language_loss": 0.77962756, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.80117941, + "num_input_tokens_seen": 192041660, + "step": 8926, + "time_per_iteration": 2.715451955795288 + }, + { + "auxiliary_loss_clip": 0.01087054, + "auxiliary_loss_mlp": 0.01032082, + "balance_loss_clip": 1.04502523, + "balance_loss_mlp": 1.01944578, + "epoch": 0.5367202765669623, + "flos": 27199352620800.0, + "grad_norm": 1.5425961750104156, + "language_loss": 0.66906953, + "learning_rate": 1.85932585410148e-06, + "loss": 0.69026089, + "num_input_tokens_seen": 192063540, + "step": 8927, + "time_per_iteration": 2.7890443801879883 + }, + { + "auxiliary_loss_clip": 0.0111207, + "auxiliary_loss_mlp": 0.0103082, + "balance_loss_clip": 1.04044211, + "balance_loss_mlp": 1.01719475, + "epoch": 0.5367803998196302, + "flos": 20229953569920.0, + "grad_norm": 1.7627850836145547, + "language_loss": 0.73644257, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.75787145, + "num_input_tokens_seen": 192081760, + "step": 8928, + "time_per_iteration": 2.6679322719573975 + }, + { + "auxiliary_loss_clip": 0.01097621, + "auxiliary_loss_mlp": 0.01033753, + "balance_loss_clip": 1.03983617, + "balance_loss_mlp": 1.02038312, + "epoch": 0.5368405230722982, + "flos": 32154629374080.0, + "grad_norm": 1.8947277080350169, + "language_loss": 0.63138568, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.65269947, + "num_input_tokens_seen": 192101620, + "step": 8929, + "time_per_iteration": 2.77915620803833 + }, + { + "auxiliary_loss_clip": 0.01112721, + "auxiliary_loss_mlp": 0.01035038, + "balance_loss_clip": 1.04284871, + "balance_loss_mlp": 1.02102494, + "epoch": 0.5369006463249661, + "flos": 26247935128320.0, + "grad_norm": 1.6504217106645076, + "language_loss": 0.65814567, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.67962325, + "num_input_tokens_seen": 192121805, + "step": 8930, + "time_per_iteration": 2.671699285507202 + }, + { + "auxiliary_loss_clip": 0.01070837, + "auxiliary_loss_mlp": 0.01029378, + "balance_loss_clip": 1.03888655, + "balance_loss_mlp": 1.01519203, + "epoch": 0.5369607695776342, + "flos": 26211306234240.0, + "grad_norm": 1.4657060850123025, + "language_loss": 0.67106915, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.69207126, + "num_input_tokens_seen": 192141765, + "step": 8931, + "time_per_iteration": 2.791450023651123 + }, + { + "auxiliary_loss_clip": 0.0107183, + "auxiliary_loss_mlp": 0.01035308, + "balance_loss_clip": 1.03937209, + "balance_loss_mlp": 1.02028155, + "epoch": 0.5370208928303021, + "flos": 25009017177600.0, + "grad_norm": 1.6675319791175172, + "language_loss": 0.76147091, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.78254229, + "num_input_tokens_seen": 192161560, + "step": 8932, + "time_per_iteration": 2.817074775695801 + }, + { + "auxiliary_loss_clip": 0.0108812, + "auxiliary_loss_mlp": 0.01035166, + "balance_loss_clip": 1.04271507, + "balance_loss_mlp": 1.02086663, + "epoch": 0.5370810160829701, + "flos": 31792147274880.0, + "grad_norm": 1.7321457922490968, + "language_loss": 0.66103363, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.68226647, + "num_input_tokens_seen": 192180190, + "step": 8933, + "time_per_iteration": 2.7999963760375977 + }, + { + "auxiliary_loss_clip": 0.01106374, + "auxiliary_loss_mlp": 0.0077107, + "balance_loss_clip": 1.04321599, + "balance_loss_mlp": 1.00030327, + "epoch": 0.537141139335638, + "flos": 23842602829440.0, + "grad_norm": 1.7096623259043264, + "language_loss": 0.83137345, + "learning_rate": 1.856606505975565e-06, + "loss": 0.8501479, + "num_input_tokens_seen": 192198855, + "step": 8934, + "time_per_iteration": 2.77140474319458 + }, + { + "auxiliary_loss_clip": 0.01083657, + "auxiliary_loss_mlp": 0.01038537, + "balance_loss_clip": 1.03906775, + "balance_loss_mlp": 1.02371967, + "epoch": 0.537201262588306, + "flos": 18508826511360.0, + "grad_norm": 1.9684207217946548, + "language_loss": 0.79907835, + "learning_rate": 1.856218049303999e-06, + "loss": 0.82030034, + "num_input_tokens_seen": 192216555, + "step": 8935, + "time_per_iteration": 2.714343547821045 + }, + { + "auxiliary_loss_clip": 0.01111571, + "auxiliary_loss_mlp": 0.01041616, + "balance_loss_clip": 1.04217649, + "balance_loss_mlp": 1.02750206, + "epoch": 0.537261385840974, + "flos": 25662950231040.0, + "grad_norm": 2.937428754588345, + "language_loss": 0.84070867, + "learning_rate": 1.855829598084659e-06, + "loss": 0.86224055, + "num_input_tokens_seen": 192236910, + "step": 8936, + "time_per_iteration": 2.6816179752349854 + }, + { + "auxiliary_loss_clip": 0.01092497, + "auxiliary_loss_mlp": 0.01030736, + "balance_loss_clip": 1.04575956, + "balance_loss_mlp": 1.018255, + "epoch": 0.537321509093642, + "flos": 40735017406080.0, + "grad_norm": 1.2320449417851727, + "language_loss": 0.72774732, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.74897963, + "num_input_tokens_seen": 192260790, + "step": 8937, + "time_per_iteration": 2.9294662475585938 + }, + { + "auxiliary_loss_clip": 0.01097303, + "auxiliary_loss_mlp": 0.0103947, + "balance_loss_clip": 1.03866911, + "balance_loss_mlp": 1.02411556, + "epoch": 0.53738163234631, + "flos": 17238487138560.0, + "grad_norm": 2.4958463124017825, + "language_loss": 0.82070464, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.84207237, + "num_input_tokens_seen": 192277230, + "step": 8938, + "time_per_iteration": 2.7016329765319824 + }, + { + "auxiliary_loss_clip": 0.01128942, + "auxiliary_loss_mlp": 0.01037787, + "balance_loss_clip": 1.04445028, + "balance_loss_mlp": 1.02425027, + "epoch": 0.5374417555989779, + "flos": 12821977457280.0, + "grad_norm": 2.39037719214814, + "language_loss": 0.80410939, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.8257767, + "num_input_tokens_seen": 192292840, + "step": 8939, + "time_per_iteration": 2.588257312774658 + }, + { + "auxiliary_loss_clip": 0.01012372, + "auxiliary_loss_mlp": 0.01007323, + "balance_loss_clip": 1.01498079, + "balance_loss_mlp": 1.00561845, + "epoch": 0.5375018788516459, + "flos": 67256018703360.0, + "grad_norm": 0.706070728219951, + "language_loss": 0.52408826, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.5442853, + "num_input_tokens_seen": 192358240, + "step": 8940, + "time_per_iteration": 3.276360273361206 + }, + { + "auxiliary_loss_clip": 0.01083174, + "auxiliary_loss_mlp": 0.01033229, + "balance_loss_clip": 1.04148936, + "balance_loss_mlp": 1.01995516, + "epoch": 0.5375620021043138, + "flos": 18114168804480.0, + "grad_norm": 2.0987581231461725, + "language_loss": 0.71804386, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.73920786, + "num_input_tokens_seen": 192377370, + "step": 8941, + "time_per_iteration": 2.732537269592285 + }, + { + "auxiliary_loss_clip": 0.01092897, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.03881931, + "balance_loss_mlp": 1.01767242, + "epoch": 0.5376221253569818, + "flos": 23149383275520.0, + "grad_norm": 1.733585832372728, + "language_loss": 0.79825974, + "learning_rate": 1.853499006090237e-06, + "loss": 0.81949472, + "num_input_tokens_seen": 192396450, + "step": 8942, + "time_per_iteration": 2.723686695098877 + }, + { + "auxiliary_loss_clip": 0.01126783, + "auxiliary_loss_mlp": 0.01038334, + "balance_loss_clip": 1.04432559, + "balance_loss_mlp": 1.02416599, + "epoch": 0.5376822486096497, + "flos": 29972302663680.0, + "grad_norm": 1.8527940596038397, + "language_loss": 0.70161736, + "learning_rate": 1.853110593448911e-06, + "loss": 0.72326851, + "num_input_tokens_seen": 192417390, + "step": 8943, + "time_per_iteration": 2.683830499649048 + }, + { + "auxiliary_loss_clip": 0.01030181, + "auxiliary_loss_mlp": 0.01002794, + "balance_loss_clip": 1.01417148, + "balance_loss_mlp": 1.00145841, + "epoch": 0.5377423718623178, + "flos": 54168950874240.0, + "grad_norm": 0.8559023322108498, + "language_loss": 0.5964179, + "learning_rate": 1.852722186377645e-06, + "loss": 0.61674768, + "num_input_tokens_seen": 192478060, + "step": 8944, + "time_per_iteration": 3.195451498031616 + }, + { + "auxiliary_loss_clip": 0.01075816, + "auxiliary_loss_mlp": 0.01037224, + "balance_loss_clip": 1.04020023, + "balance_loss_mlp": 1.02198291, + "epoch": 0.5378024951149857, + "flos": 23257079228160.0, + "grad_norm": 2.0363151234070567, + "language_loss": 0.77896553, + "learning_rate": 1.852333784891169e-06, + "loss": 0.80009592, + "num_input_tokens_seen": 192495985, + "step": 8945, + "time_per_iteration": 2.7992632389068604 + }, + { + "auxiliary_loss_clip": 0.01114593, + "auxiliary_loss_mlp": 0.01035525, + "balance_loss_clip": 1.04309297, + "balance_loss_mlp": 1.02173805, + "epoch": 0.5378626183676537, + "flos": 24024095274240.0, + "grad_norm": 1.6722587357114949, + "language_loss": 0.68561995, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.70712113, + "num_input_tokens_seen": 192515445, + "step": 8946, + "time_per_iteration": 2.6522717475891113 + }, + { + "auxiliary_loss_clip": 0.01078154, + "auxiliary_loss_mlp": 0.0104253, + "balance_loss_clip": 1.04271758, + "balance_loss_mlp": 1.02895761, + "epoch": 0.5379227416203216, + "flos": 27161789973120.0, + "grad_norm": 1.8248631368800923, + "language_loss": 0.76991701, + "learning_rate": 1.851556998731498e-06, + "loss": 0.79112387, + "num_input_tokens_seen": 192536530, + "step": 8947, + "time_per_iteration": 2.796123743057251 + }, + { + "auxiliary_loss_clip": 0.0111442, + "auxiliary_loss_mlp": 0.01032597, + "balance_loss_clip": 1.04487777, + "balance_loss_mlp": 1.01940608, + "epoch": 0.5379828648729896, + "flos": 24681619687680.0, + "grad_norm": 1.55307874766799, + "language_loss": 0.60198331, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.6234535, + "num_input_tokens_seen": 192556075, + "step": 8948, + "time_per_iteration": 2.7054309844970703 + }, + { + "auxiliary_loss_clip": 0.01082153, + "auxiliary_loss_mlp": 0.01037517, + "balance_loss_clip": 1.03970575, + "balance_loss_mlp": 1.02415979, + "epoch": 0.5380429881256577, + "flos": 22523280284160.0, + "grad_norm": 1.6281037537893495, + "language_loss": 0.79697102, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.81816769, + "num_input_tokens_seen": 192575535, + "step": 8949, + "time_per_iteration": 2.8140738010406494 + }, + { + "auxiliary_loss_clip": 0.01078335, + "auxiliary_loss_mlp": 0.01042356, + "balance_loss_clip": 1.03704572, + "balance_loss_mlp": 1.02679944, + "epoch": 0.5381031113783256, + "flos": 26979543342720.0, + "grad_norm": 2.0888170828860444, + "language_loss": 0.77963328, + "learning_rate": 1.850391861746111e-06, + "loss": 0.80084026, + "num_input_tokens_seen": 192594490, + "step": 8950, + "time_per_iteration": 2.7498505115509033 + }, + { + "auxiliary_loss_clip": 0.01110071, + "auxiliary_loss_mlp": 0.01029664, + "balance_loss_clip": 1.05072141, + "balance_loss_mlp": 1.01671791, + "epoch": 0.5381632346309936, + "flos": 24754087376640.0, + "grad_norm": 1.5816580812213883, + "language_loss": 0.72668755, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.7480849, + "num_input_tokens_seen": 192615650, + "step": 8951, + "time_per_iteration": 2.7927658557891846 + }, + { + "auxiliary_loss_clip": 0.01122901, + "auxiliary_loss_mlp": 0.00772698, + "balance_loss_clip": 1.04232633, + "balance_loss_mlp": 1.00031877, + "epoch": 0.5382233578836615, + "flos": 15560058372480.0, + "grad_norm": 1.7038907930473366, + "language_loss": 0.74791837, + "learning_rate": 1.849615132097085e-06, + "loss": 0.76687431, + "num_input_tokens_seen": 192633840, + "step": 8952, + "time_per_iteration": 2.663555860519409 + }, + { + "auxiliary_loss_clip": 0.01103413, + "auxiliary_loss_mlp": 0.01034816, + "balance_loss_clip": 1.04635072, + "balance_loss_mlp": 1.02090442, + "epoch": 0.5382834811363295, + "flos": 25084501608960.0, + "grad_norm": 1.486507819644587, + "language_loss": 0.79733002, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.81871235, + "num_input_tokens_seen": 192655890, + "step": 8953, + "time_per_iteration": 2.7213597297668457 + }, + { + "auxiliary_loss_clip": 0.01092412, + "auxiliary_loss_mlp": 0.01036672, + "balance_loss_clip": 1.04632258, + "balance_loss_mlp": 1.02147865, + "epoch": 0.5383436043889974, + "flos": 13297901685120.0, + "grad_norm": 1.8841614793520622, + "language_loss": 0.80665779, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.82794857, + "num_input_tokens_seen": 192673025, + "step": 8954, + "time_per_iteration": 2.7119338512420654 + }, + { + "auxiliary_loss_clip": 0.01124989, + "auxiliary_loss_mlp": 0.0103348, + "balance_loss_clip": 1.04552889, + "balance_loss_mlp": 1.0192287, + "epoch": 0.5384037276416654, + "flos": 23039388852480.0, + "grad_norm": 2.080642260770838, + "language_loss": 0.76782274, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.78940743, + "num_input_tokens_seen": 192692190, + "step": 8955, + "time_per_iteration": 4.170248746871948 + }, + { + "auxiliary_loss_clip": 0.01100368, + "auxiliary_loss_mlp": 0.01043375, + "balance_loss_clip": 1.04422796, + "balance_loss_mlp": 1.02911186, + "epoch": 0.5384638508943334, + "flos": 20631147552000.0, + "grad_norm": 1.64526518725267, + "language_loss": 0.78446829, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.8059057, + "num_input_tokens_seen": 192710380, + "step": 8956, + "time_per_iteration": 4.346608638763428 + }, + { + "auxiliary_loss_clip": 0.01014882, + "auxiliary_loss_mlp": 0.01009567, + "balance_loss_clip": 1.01641572, + "balance_loss_mlp": 1.00802886, + "epoch": 0.5385239741470014, + "flos": 66737683491840.0, + "grad_norm": 0.8632221777835867, + "language_loss": 0.63366526, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.6539098, + "num_input_tokens_seen": 192768995, + "step": 8957, + "time_per_iteration": 4.689607381820679 + }, + { + "auxiliary_loss_clip": 0.01003314, + "auxiliary_loss_mlp": 0.00999601, + "balance_loss_clip": 1.01686144, + "balance_loss_mlp": 0.99808067, + "epoch": 0.5385840973996693, + "flos": 64716058229760.0, + "grad_norm": 0.7163688318545376, + "language_loss": 0.5155347, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.53556383, + "num_input_tokens_seen": 192825585, + "step": 8958, + "time_per_iteration": 3.263490676879883 + }, + { + "auxiliary_loss_clip": 0.01118278, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.04870462, + "balance_loss_mlp": 1.01945472, + "epoch": 0.5386442206523373, + "flos": 26141783460480.0, + "grad_norm": 1.5599827476789179, + "language_loss": 0.77335596, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.79488432, + "num_input_tokens_seen": 192847335, + "step": 8959, + "time_per_iteration": 2.6936423778533936 + }, + { + "auxiliary_loss_clip": 0.01078149, + "auxiliary_loss_mlp": 0.01035897, + "balance_loss_clip": 1.04148221, + "balance_loss_mlp": 1.02258778, + "epoch": 0.5387043439050052, + "flos": 18251849635200.0, + "grad_norm": 2.554990268603387, + "language_loss": 0.84077597, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.86191648, + "num_input_tokens_seen": 192862205, + "step": 8960, + "time_per_iteration": 4.281194686889648 + }, + { + "auxiliary_loss_clip": 0.01114712, + "auxiliary_loss_mlp": 0.01032916, + "balance_loss_clip": 1.0460726, + "balance_loss_mlp": 1.01955807, + "epoch": 0.5387644671576732, + "flos": 29788296266880.0, + "grad_norm": 1.4386251393877574, + "language_loss": 0.78275657, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.80423284, + "num_input_tokens_seen": 192883695, + "step": 8961, + "time_per_iteration": 2.7518913745880127 + }, + { + "auxiliary_loss_clip": 0.01089107, + "auxiliary_loss_mlp": 0.01035524, + "balance_loss_clip": 1.041345, + "balance_loss_mlp": 1.02189803, + "epoch": 0.5388245904103413, + "flos": 22374466237440.0, + "grad_norm": 11.100507002897315, + "language_loss": 0.84070158, + "learning_rate": 1.845731828364681e-06, + "loss": 0.86194789, + "num_input_tokens_seen": 192900190, + "step": 8962, + "time_per_iteration": 2.745964288711548 + }, + { + "auxiliary_loss_clip": 0.01020426, + "auxiliary_loss_mlp": 0.01002497, + "balance_loss_clip": 1.01872444, + "balance_loss_mlp": 1.00114429, + "epoch": 0.5388847136630092, + "flos": 69807794751360.0, + "grad_norm": 0.7287303599556714, + "language_loss": 0.5418579, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.56208712, + "num_input_tokens_seen": 192958675, + "step": 8963, + "time_per_iteration": 3.0952982902526855 + }, + { + "auxiliary_loss_clip": 0.01022568, + "auxiliary_loss_mlp": 0.01009564, + "balance_loss_clip": 1.01615238, + "balance_loss_mlp": 1.00817513, + "epoch": 0.5389448369156772, + "flos": 69822303845760.0, + "grad_norm": 1.4175775222807738, + "language_loss": 0.63305563, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.65337688, + "num_input_tokens_seen": 193033135, + "step": 8964, + "time_per_iteration": 3.2670536041259766 + }, + { + "auxiliary_loss_clip": 0.01065573, + "auxiliary_loss_mlp": 0.01035006, + "balance_loss_clip": 1.04052043, + "balance_loss_mlp": 1.02049828, + "epoch": 0.5390049601683451, + "flos": 31722444933120.0, + "grad_norm": 1.4839412969014603, + "language_loss": 0.69941193, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.72041768, + "num_input_tokens_seen": 193055570, + "step": 8965, + "time_per_iteration": 2.8793537616729736 + }, + { + "auxiliary_loss_clip": 0.01097921, + "auxiliary_loss_mlp": 0.00772841, + "balance_loss_clip": 1.04318738, + "balance_loss_mlp": 1.00031877, + "epoch": 0.5390650834210131, + "flos": 18113486446080.0, + "grad_norm": 3.9331383698311297, + "language_loss": 0.82359982, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.84230745, + "num_input_tokens_seen": 193073120, + "step": 8966, + "time_per_iteration": 2.7008259296417236 + }, + { + "auxiliary_loss_clip": 0.0112489, + "auxiliary_loss_mlp": 0.01032097, + "balance_loss_clip": 1.04688132, + "balance_loss_mlp": 1.01831603, + "epoch": 0.539125206673681, + "flos": 17416711445760.0, + "grad_norm": 1.8273360824105822, + "language_loss": 0.72234643, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.74391627, + "num_input_tokens_seen": 193090105, + "step": 8967, + "time_per_iteration": 2.536813974380493 + }, + { + "auxiliary_loss_clip": 0.01101272, + "auxiliary_loss_mlp": 0.01034325, + "balance_loss_clip": 1.04193211, + "balance_loss_mlp": 1.02177858, + "epoch": 0.539185329926349, + "flos": 22198935450240.0, + "grad_norm": 2.8461637045489394, + "language_loss": 0.81760883, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.83896482, + "num_input_tokens_seen": 193109325, + "step": 8968, + "time_per_iteration": 2.6812336444854736 + }, + { + "auxiliary_loss_clip": 0.01095464, + "auxiliary_loss_mlp": 0.01039812, + "balance_loss_clip": 1.04489422, + "balance_loss_mlp": 1.0244931, + "epoch": 0.539245453179017, + "flos": 21434397442560.0, + "grad_norm": 1.479768408322399, + "language_loss": 0.74093103, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.76228386, + "num_input_tokens_seen": 193130595, + "step": 8969, + "time_per_iteration": 2.775066614151001 + }, + { + "auxiliary_loss_clip": 0.01089398, + "auxiliary_loss_mlp": 0.00772297, + "balance_loss_clip": 1.03885353, + "balance_loss_mlp": 1.00019646, + "epoch": 0.539305576431685, + "flos": 20735000749440.0, + "grad_norm": 1.789523366494458, + "language_loss": 0.82301641, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.84163332, + "num_input_tokens_seen": 193148930, + "step": 8970, + "time_per_iteration": 2.709660053253174 + }, + { + "auxiliary_loss_clip": 0.0109962, + "auxiliary_loss_mlp": 0.01036868, + "balance_loss_clip": 1.04434752, + "balance_loss_mlp": 1.02398705, + "epoch": 0.5393656996843529, + "flos": 30920452018560.0, + "grad_norm": 1.3749735874734272, + "language_loss": 0.75481087, + "learning_rate": 1.842237354749146e-06, + "loss": 0.77617574, + "num_input_tokens_seen": 193170140, + "step": 8971, + "time_per_iteration": 2.759859800338745 + }, + { + "auxiliary_loss_clip": 0.01031428, + "auxiliary_loss_mlp": 0.01020808, + "balance_loss_clip": 1.01404476, + "balance_loss_mlp": 1.01906729, + "epoch": 0.5394258229370209, + "flos": 50317781351040.0, + "grad_norm": 0.8852076637627846, + "language_loss": 0.60268009, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.62320244, + "num_input_tokens_seen": 193227235, + "step": 8972, + "time_per_iteration": 3.1906497478485107 + }, + { + "auxiliary_loss_clip": 0.01113524, + "auxiliary_loss_mlp": 0.01042903, + "balance_loss_clip": 1.0430851, + "balance_loss_mlp": 1.02806175, + "epoch": 0.5394859461896888, + "flos": 25411935012480.0, + "grad_norm": 1.3798913966673876, + "language_loss": 0.78418267, + "learning_rate": 1.841460870485045e-06, + "loss": 0.80574697, + "num_input_tokens_seen": 193248435, + "step": 8973, + "time_per_iteration": 2.67616868019104 + }, + { + "auxiliary_loss_clip": 0.01119952, + "auxiliary_loss_mlp": 0.01038926, + "balance_loss_clip": 1.04402721, + "balance_loss_mlp": 1.0234288, + "epoch": 0.5395460694423568, + "flos": 25478476957440.0, + "grad_norm": 1.97267381364002, + "language_loss": 0.73745018, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.75903904, + "num_input_tokens_seen": 193267490, + "step": 8974, + "time_per_iteration": 2.6896610260009766 + }, + { + "auxiliary_loss_clip": 0.01038786, + "auxiliary_loss_mlp": 0.01002204, + "balance_loss_clip": 1.01252413, + "balance_loss_mlp": 1.00089288, + "epoch": 0.5396061926950249, + "flos": 53249493507840.0, + "grad_norm": 0.7368178577125409, + "language_loss": 0.51070768, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.53111756, + "num_input_tokens_seen": 193326050, + "step": 8975, + "time_per_iteration": 3.1316938400268555 + }, + { + "auxiliary_loss_clip": 0.01110433, + "auxiliary_loss_mlp": 0.01042663, + "balance_loss_clip": 1.04242885, + "balance_loss_mlp": 1.02821445, + "epoch": 0.5396663159476928, + "flos": 26725080418560.0, + "grad_norm": 2.630341512403146, + "language_loss": 0.72291577, + "learning_rate": 1.840296189214344e-06, + "loss": 0.74444675, + "num_input_tokens_seen": 193348785, + "step": 8976, + "time_per_iteration": 2.722482681274414 + }, + { + "auxiliary_loss_clip": 0.01107068, + "auxiliary_loss_mlp": 0.00771891, + "balance_loss_clip": 1.0392096, + "balance_loss_mlp": 1.00027895, + "epoch": 0.5397264392003608, + "flos": 23253380127360.0, + "grad_norm": 1.6269165395400453, + "language_loss": 0.69827849, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.71706808, + "num_input_tokens_seen": 193367080, + "step": 8977, + "time_per_iteration": 2.661503553390503 + }, + { + "auxiliary_loss_clip": 0.0105269, + "auxiliary_loss_mlp": 0.01038261, + "balance_loss_clip": 1.03996563, + "balance_loss_mlp": 1.02390814, + "epoch": 0.5397865624530287, + "flos": 18294188791680.0, + "grad_norm": 1.662156020825611, + "language_loss": 0.7259683, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.74687779, + "num_input_tokens_seen": 193383715, + "step": 8978, + "time_per_iteration": 2.7381365299224854 + }, + { + "auxiliary_loss_clip": 0.01087228, + "auxiliary_loss_mlp": 0.010397, + "balance_loss_clip": 1.04297757, + "balance_loss_mlp": 1.02373815, + "epoch": 0.5398466857056967, + "flos": 15297514888320.0, + "grad_norm": 1.853626793115837, + "language_loss": 0.74536407, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.76663339, + "num_input_tokens_seen": 193400560, + "step": 8979, + "time_per_iteration": 2.694063425064087 + }, + { + "auxiliary_loss_clip": 0.01072362, + "auxiliary_loss_mlp": 0.01049968, + "balance_loss_clip": 1.04104912, + "balance_loss_mlp": 1.03438091, + "epoch": 0.5399068089583646, + "flos": 17821748183040.0, + "grad_norm": 1.8942057962212562, + "language_loss": 0.76699525, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.78821856, + "num_input_tokens_seen": 193418680, + "step": 8980, + "time_per_iteration": 2.820065498352051 + }, + { + "auxiliary_loss_clip": 0.01123296, + "auxiliary_loss_mlp": 0.01035485, + "balance_loss_clip": 1.04266453, + "balance_loss_mlp": 1.02159715, + "epoch": 0.5399669322110326, + "flos": 27381635164800.0, + "grad_norm": 1.799033275645953, + "language_loss": 0.82047689, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.84206468, + "num_input_tokens_seen": 193439310, + "step": 8981, + "time_per_iteration": 2.6362786293029785 + }, + { + "auxiliary_loss_clip": 0.01114328, + "auxiliary_loss_mlp": 0.01033767, + "balance_loss_clip": 1.0414052, + "balance_loss_mlp": 1.01837707, + "epoch": 0.5400270554637006, + "flos": 20449116403200.0, + "grad_norm": 1.8414706821019682, + "language_loss": 0.66744691, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.68892789, + "num_input_tokens_seen": 193458115, + "step": 8982, + "time_per_iteration": 2.621446371078491 + }, + { + "auxiliary_loss_clip": 0.01087174, + "auxiliary_loss_mlp": 0.00771772, + "balance_loss_clip": 1.04236412, + "balance_loss_mlp": 1.00037217, + "epoch": 0.5400871787163686, + "flos": 21689578638720.0, + "grad_norm": 1.585959219226275, + "language_loss": 0.82838899, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.84697849, + "num_input_tokens_seen": 193477365, + "step": 8983, + "time_per_iteration": 2.725118637084961 + }, + { + "auxiliary_loss_clip": 0.0107373, + "auxiliary_loss_mlp": 0.01037262, + "balance_loss_clip": 1.03868723, + "balance_loss_mlp": 1.0226711, + "epoch": 0.5401473019690365, + "flos": 19204739585280.0, + "grad_norm": 1.7940455633993566, + "language_loss": 0.71052921, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.73163915, + "num_input_tokens_seen": 193495595, + "step": 8984, + "time_per_iteration": 2.7552812099456787 + }, + { + "auxiliary_loss_clip": 0.01129583, + "auxiliary_loss_mlp": 0.01039978, + "balance_loss_clip": 1.04673409, + "balance_loss_mlp": 1.02464724, + "epoch": 0.5402074252217045, + "flos": 20627376624000.0, + "grad_norm": 1.7153215255445333, + "language_loss": 0.80088288, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.82257855, + "num_input_tokens_seen": 193514035, + "step": 8985, + "time_per_iteration": 2.6251611709594727 + }, + { + "auxiliary_loss_clip": 0.01076326, + "auxiliary_loss_mlp": 0.01030482, + "balance_loss_clip": 1.03776312, + "balance_loss_mlp": 1.01603341, + "epoch": 0.5402675484743724, + "flos": 24973465691520.0, + "grad_norm": 1.6597478268739005, + "language_loss": 0.79092562, + "learning_rate": 1.83641431418363e-06, + "loss": 0.81199366, + "num_input_tokens_seen": 193535445, + "step": 8986, + "time_per_iteration": 2.7512738704681396 + }, + { + "auxiliary_loss_clip": 0.01105948, + "auxiliary_loss_mlp": 0.01041249, + "balance_loss_clip": 1.0403738, + "balance_loss_mlp": 1.02647913, + "epoch": 0.5403276717270404, + "flos": 19459022941440.0, + "grad_norm": 1.5813568652048575, + "language_loss": 0.77027225, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.79174423, + "num_input_tokens_seen": 193554780, + "step": 8987, + "time_per_iteration": 2.678215265274048 + }, + { + "auxiliary_loss_clip": 0.01094562, + "auxiliary_loss_mlp": 0.01035835, + "balance_loss_clip": 1.04025865, + "balance_loss_mlp": 1.021613, + "epoch": 0.5403877949797083, + "flos": 18442140912000.0, + "grad_norm": 3.169719409567684, + "language_loss": 0.71186262, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.73316658, + "num_input_tokens_seen": 193573580, + "step": 8988, + "time_per_iteration": 2.779327869415283 + }, + { + "auxiliary_loss_clip": 0.01073421, + "auxiliary_loss_mlp": 0.01040131, + "balance_loss_clip": 1.03765535, + "balance_loss_mlp": 1.02508116, + "epoch": 0.5404479182323764, + "flos": 28292868316800.0, + "grad_norm": 2.25930737507901, + "language_loss": 0.67611122, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.69724679, + "num_input_tokens_seen": 193590490, + "step": 8989, + "time_per_iteration": 2.7891674041748047 + }, + { + "auxiliary_loss_clip": 0.01111206, + "auxiliary_loss_mlp": 0.01041114, + "balance_loss_clip": 1.04164839, + "balance_loss_mlp": 1.02667785, + "epoch": 0.5405080414850444, + "flos": 23367325046400.0, + "grad_norm": 1.5585472280182338, + "language_loss": 0.77394271, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.79546589, + "num_input_tokens_seen": 193609900, + "step": 8990, + "time_per_iteration": 2.6976635456085205 + }, + { + "auxiliary_loss_clip": 0.01106061, + "auxiliary_loss_mlp": 0.01026872, + "balance_loss_clip": 1.03980994, + "balance_loss_mlp": 1.01427758, + "epoch": 0.5405681647377123, + "flos": 21106425335040.0, + "grad_norm": 1.9802166321118257, + "language_loss": 0.69258702, + "learning_rate": 1.834473608367745e-06, + "loss": 0.71391636, + "num_input_tokens_seen": 193629775, + "step": 8991, + "time_per_iteration": 2.6734046936035156 + }, + { + "auxiliary_loss_clip": 0.01061373, + "auxiliary_loss_mlp": 0.01034138, + "balance_loss_clip": 1.03470838, + "balance_loss_mlp": 1.01864719, + "epoch": 0.5406282879903803, + "flos": 20449188230400.0, + "grad_norm": 1.8615919781627641, + "language_loss": 0.75722122, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.77817637, + "num_input_tokens_seen": 193648070, + "step": 8992, + "time_per_iteration": 2.7986576557159424 + }, + { + "auxiliary_loss_clip": 0.01094937, + "auxiliary_loss_mlp": 0.01042345, + "balance_loss_clip": 1.03807545, + "balance_loss_mlp": 1.02672255, + "epoch": 0.5406884112430482, + "flos": 14209493973120.0, + "grad_norm": 2.5485108966117704, + "language_loss": 0.76453286, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.78590572, + "num_input_tokens_seen": 193665060, + "step": 8993, + "time_per_iteration": 2.7208335399627686 + }, + { + "auxiliary_loss_clip": 0.01106981, + "auxiliary_loss_mlp": 0.01031441, + "balance_loss_clip": 1.03966081, + "balance_loss_mlp": 1.01783895, + "epoch": 0.5407485344957162, + "flos": 23875568536320.0, + "grad_norm": 1.7082267966393392, + "language_loss": 0.70645487, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.72783911, + "num_input_tokens_seen": 193683620, + "step": 8994, + "time_per_iteration": 2.724794626235962 + }, + { + "auxiliary_loss_clip": 0.01107598, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_clip": 1.03957391, + "balance_loss_mlp": 1.01583362, + "epoch": 0.5408086577483842, + "flos": 23148485435520.0, + "grad_norm": 3.058822592256831, + "language_loss": 0.75407541, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.77546465, + "num_input_tokens_seen": 193702990, + "step": 8995, + "time_per_iteration": 5.971833229064941 + }, + { + "auxiliary_loss_clip": 0.0110732, + "auxiliary_loss_mlp": 0.01036119, + "balance_loss_clip": 1.04115582, + "balance_loss_mlp": 1.02335215, + "epoch": 0.5408687810010522, + "flos": 18771046773120.0, + "grad_norm": 1.7630879917097735, + "language_loss": 0.73701608, + "learning_rate": 1.832533059471282e-06, + "loss": 0.75845045, + "num_input_tokens_seen": 193721785, + "step": 8996, + "time_per_iteration": 4.209546327590942 + }, + { + "auxiliary_loss_clip": 0.0107249, + "auxiliary_loss_mlp": 0.0103344, + "balance_loss_clip": 1.03679025, + "balance_loss_mlp": 1.02018428, + "epoch": 0.5409289042537201, + "flos": 13881557779200.0, + "grad_norm": 2.7958611639566557, + "language_loss": 0.73200142, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.75306082, + "num_input_tokens_seen": 193740315, + "step": 8997, + "time_per_iteration": 2.6815428733825684 + }, + { + "auxiliary_loss_clip": 0.0112099, + "auxiliary_loss_mlp": 0.01036623, + "balance_loss_clip": 1.04214144, + "balance_loss_mlp": 1.02241874, + "epoch": 0.5409890275063881, + "flos": 14465357527680.0, + "grad_norm": 2.1382567541010706, + "language_loss": 0.71990108, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.74147719, + "num_input_tokens_seen": 193757580, + "step": 8998, + "time_per_iteration": 2.516322374343872 + }, + { + "auxiliary_loss_clip": 0.01084198, + "auxiliary_loss_mlp": 0.01038336, + "balance_loss_clip": 1.03824925, + "balance_loss_mlp": 1.02481771, + "epoch": 0.541049150759056, + "flos": 48977449349760.0, + "grad_norm": 1.4737906597538892, + "language_loss": 0.7077291, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.72895443, + "num_input_tokens_seen": 193780965, + "step": 8999, + "time_per_iteration": 4.582181215286255 + }, + { + "auxiliary_loss_clip": 0.01092675, + "auxiliary_loss_mlp": 0.01037736, + "balance_loss_clip": 1.04016924, + "balance_loss_mlp": 1.02372253, + "epoch": 0.541109274011724, + "flos": 18147601388160.0, + "grad_norm": 2.7892757576067972, + "language_loss": 0.80210066, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.82340479, + "num_input_tokens_seen": 193797855, + "step": 9000, + "time_per_iteration": 2.6335151195526123 + }, + { + "auxiliary_loss_clip": 0.01069713, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.03577805, + "balance_loss_mlp": 1.01837265, + "epoch": 0.541169397264392, + "flos": 20522553759360.0, + "grad_norm": 1.6231589706551275, + "language_loss": 0.73037231, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.75140095, + "num_input_tokens_seen": 193817375, + "step": 9001, + "time_per_iteration": 2.854574680328369 + }, + { + "auxiliary_loss_clip": 0.01088976, + "auxiliary_loss_mlp": 0.01037285, + "balance_loss_clip": 1.03875196, + "balance_loss_mlp": 1.0225749, + "epoch": 0.54122952051706, + "flos": 20044043752320.0, + "grad_norm": 2.3946252475459704, + "language_loss": 0.85775471, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.87901723, + "num_input_tokens_seen": 193832205, + "step": 9002, + "time_per_iteration": 2.83799409866333 + }, + { + "auxiliary_loss_clip": 0.01071827, + "auxiliary_loss_mlp": 0.01036651, + "balance_loss_clip": 1.03876507, + "balance_loss_mlp": 1.0244441, + "epoch": 0.541289643769728, + "flos": 19062246332160.0, + "grad_norm": 1.9022782971983632, + "language_loss": 0.78010678, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.80119157, + "num_input_tokens_seen": 193849830, + "step": 9003, + "time_per_iteration": 2.8771512508392334 + }, + { + "auxiliary_loss_clip": 0.01105804, + "auxiliary_loss_mlp": 0.01031034, + "balance_loss_clip": 1.03998888, + "balance_loss_mlp": 1.01659191, + "epoch": 0.5413497670223959, + "flos": 22382295402240.0, + "grad_norm": 2.4815464780266905, + "language_loss": 0.69489288, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.71626127, + "num_input_tokens_seen": 193869945, + "step": 9004, + "time_per_iteration": 2.7296600341796875 + }, + { + "auxiliary_loss_clip": 0.01027886, + "auxiliary_loss_mlp": 0.01000864, + "balance_loss_clip": 1.01221299, + "balance_loss_mlp": 0.99943334, + "epoch": 0.5414098902750639, + "flos": 70031734093440.0, + "grad_norm": 0.9691738453098017, + "language_loss": 0.59067202, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.61095953, + "num_input_tokens_seen": 193930860, + "step": 9005, + "time_per_iteration": 3.2482104301452637 + }, + { + "auxiliary_loss_clip": 0.011229, + "auxiliary_loss_mlp": 0.00771475, + "balance_loss_clip": 1.04402518, + "balance_loss_mlp": 1.00026715, + "epoch": 0.5414700135277318, + "flos": 21798962530560.0, + "grad_norm": 3.1081571461352357, + "language_loss": 0.78251934, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.80146307, + "num_input_tokens_seen": 193949075, + "step": 9006, + "time_per_iteration": 2.646697521209717 + }, + { + "auxiliary_loss_clip": 0.01099607, + "auxiliary_loss_mlp": 0.01035785, + "balance_loss_clip": 1.04162097, + "balance_loss_mlp": 1.02380466, + "epoch": 0.5415301367803999, + "flos": 16907929251840.0, + "grad_norm": 2.04905315291525, + "language_loss": 0.82968152, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.85103542, + "num_input_tokens_seen": 193967630, + "step": 9007, + "time_per_iteration": 2.6906566619873047 + }, + { + "auxiliary_loss_clip": 0.01105367, + "auxiliary_loss_mlp": 0.01035166, + "balance_loss_clip": 1.04186976, + "balance_loss_mlp": 1.02084875, + "epoch": 0.5415902600330678, + "flos": 25704176065920.0, + "grad_norm": 2.002533361325265, + "language_loss": 0.67188275, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.69328809, + "num_input_tokens_seen": 193988730, + "step": 9008, + "time_per_iteration": 2.6538190841674805 + }, + { + "auxiliary_loss_clip": 0.01126211, + "auxiliary_loss_mlp": 0.01033213, + "balance_loss_clip": 1.04396832, + "balance_loss_mlp": 1.01836514, + "epoch": 0.5416503832857358, + "flos": 19208151377280.0, + "grad_norm": 1.9615897276879948, + "language_loss": 0.73713046, + "learning_rate": 1.827488379924234e-06, + "loss": 0.75872469, + "num_input_tokens_seen": 194005160, + "step": 9009, + "time_per_iteration": 2.5716910362243652 + }, + { + "auxiliary_loss_clip": 0.01072637, + "auxiliary_loss_mlp": 0.01036076, + "balance_loss_clip": 1.04184818, + "balance_loss_mlp": 1.02171135, + "epoch": 0.5417105065384037, + "flos": 12713706887040.0, + "grad_norm": 2.1963503735452417, + "language_loss": 0.87984347, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.90093064, + "num_input_tokens_seen": 194021700, + "step": 9010, + "time_per_iteration": 2.725271701812744 + }, + { + "auxiliary_loss_clip": 0.01120446, + "auxiliary_loss_mlp": 0.01037388, + "balance_loss_clip": 1.04260874, + "balance_loss_mlp": 1.02448332, + "epoch": 0.5417706297910717, + "flos": 30335933998080.0, + "grad_norm": 1.8667479755469423, + "language_loss": 0.65187848, + "learning_rate": 1.826712372694122e-06, + "loss": 0.67345679, + "num_input_tokens_seen": 194042620, + "step": 9011, + "time_per_iteration": 2.6546692848205566 + }, + { + "auxiliary_loss_clip": 0.01111756, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.04458547, + "balance_loss_mlp": 1.02324426, + "epoch": 0.5418307530437396, + "flos": 29020992912000.0, + "grad_norm": 2.8570982701345797, + "language_loss": 0.79252279, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.81400692, + "num_input_tokens_seen": 194061800, + "step": 9012, + "time_per_iteration": 2.6907572746276855 + }, + { + "auxiliary_loss_clip": 0.01119813, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.04184949, + "balance_loss_mlp": 1.01965845, + "epoch": 0.5418908762964076, + "flos": 16873455173760.0, + "grad_norm": 2.191987247231765, + "language_loss": 0.74450612, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.76603615, + "num_input_tokens_seen": 194079890, + "step": 9013, + "time_per_iteration": 2.6294262409210205 + }, + { + "auxiliary_loss_clip": 0.01085863, + "auxiliary_loss_mlp": 0.01030984, + "balance_loss_clip": 1.04200959, + "balance_loss_mlp": 1.01776361, + "epoch": 0.5419509995490756, + "flos": 18949702043520.0, + "grad_norm": 2.094538198423721, + "language_loss": 0.72306025, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.74422872, + "num_input_tokens_seen": 194097625, + "step": 9014, + "time_per_iteration": 2.653125524520874 + }, + { + "auxiliary_loss_clip": 0.01099897, + "auxiliary_loss_mlp": 0.01032361, + "balance_loss_clip": 1.04301429, + "balance_loss_mlp": 1.01888418, + "epoch": 0.5420111228017436, + "flos": 18077719478400.0, + "grad_norm": 1.5497382301526352, + "language_loss": 0.807073, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.82839555, + "num_input_tokens_seen": 194116055, + "step": 9015, + "time_per_iteration": 2.6197831630706787 + }, + { + "auxiliary_loss_clip": 0.01117394, + "auxiliary_loss_mlp": 0.01039918, + "balance_loss_clip": 1.04648256, + "balance_loss_mlp": 1.02635145, + "epoch": 0.5420712460544116, + "flos": 19061779455360.0, + "grad_norm": 2.4637362060141053, + "language_loss": 0.81252277, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.83409584, + "num_input_tokens_seen": 194130365, + "step": 9016, + "time_per_iteration": 2.617722988128662 + }, + { + "auxiliary_loss_clip": 0.01121755, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.04375124, + "balance_loss_mlp": 1.01745152, + "epoch": 0.5421313693070795, + "flos": 18187103370240.0, + "grad_norm": 1.6999373176246328, + "language_loss": 0.81182349, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.83334422, + "num_input_tokens_seen": 194148975, + "step": 9017, + "time_per_iteration": 2.629488706588745 + }, + { + "auxiliary_loss_clip": 0.01119384, + "auxiliary_loss_mlp": 0.01035787, + "balance_loss_clip": 1.04308951, + "balance_loss_mlp": 1.02270925, + "epoch": 0.5421914925597475, + "flos": 13005947940480.0, + "grad_norm": 1.767329743248484, + "language_loss": 0.77847707, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.80002874, + "num_input_tokens_seen": 194167185, + "step": 9018, + "time_per_iteration": 2.595520257949829 + }, + { + "auxiliary_loss_clip": 0.01121333, + "auxiliary_loss_mlp": 0.010389, + "balance_loss_clip": 1.04014397, + "balance_loss_mlp": 1.02462447, + "epoch": 0.5422516158124154, + "flos": 46758457831680.0, + "grad_norm": 1.6302803515957, + "language_loss": 0.66417134, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.68577361, + "num_input_tokens_seen": 194192840, + "step": 9019, + "time_per_iteration": 2.8572912216186523 + }, + { + "auxiliary_loss_clip": 0.01101197, + "auxiliary_loss_mlp": 0.01036576, + "balance_loss_clip": 1.03910589, + "balance_loss_mlp": 1.02315295, + "epoch": 0.5423117390650835, + "flos": 31758642864000.0, + "grad_norm": 1.5350920710342792, + "language_loss": 0.69515598, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.71653378, + "num_input_tokens_seen": 194213150, + "step": 9020, + "time_per_iteration": 2.710081100463867 + }, + { + "auxiliary_loss_clip": 0.01082322, + "auxiliary_loss_mlp": 0.01037191, + "balance_loss_clip": 1.03962088, + "balance_loss_mlp": 1.02462053, + "epoch": 0.5423718623177514, + "flos": 27201974313600.0, + "grad_norm": 1.5706670053852172, + "language_loss": 0.80494618, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.82614136, + "num_input_tokens_seen": 194234665, + "step": 9021, + "time_per_iteration": 2.760133743286133 + }, + { + "auxiliary_loss_clip": 0.01069543, + "auxiliary_loss_mlp": 0.01037659, + "balance_loss_clip": 1.03820395, + "balance_loss_mlp": 1.02405667, + "epoch": 0.5424319855704194, + "flos": 23546447193600.0, + "grad_norm": 2.2946341773433496, + "language_loss": 0.78887641, + "learning_rate": 1.822444805916788e-06, + "loss": 0.80994844, + "num_input_tokens_seen": 194253790, + "step": 9022, + "time_per_iteration": 2.8245437145233154 + }, + { + "auxiliary_loss_clip": 0.01085662, + "auxiliary_loss_mlp": 0.00771451, + "balance_loss_clip": 1.03742123, + "balance_loss_mlp": 1.00025558, + "epoch": 0.5424921088230873, + "flos": 26615624699520.0, + "grad_norm": 1.6811700220554942, + "language_loss": 0.8234387, + "learning_rate": 1.822056885403915e-06, + "loss": 0.84200984, + "num_input_tokens_seen": 194274950, + "step": 9023, + "time_per_iteration": 2.722637891769409 + }, + { + "auxiliary_loss_clip": 0.01105066, + "auxiliary_loss_mlp": 0.01031053, + "balance_loss_clip": 1.04266286, + "balance_loss_mlp": 1.01798785, + "epoch": 0.5425522320757553, + "flos": 23586811102080.0, + "grad_norm": 1.7285453701222258, + "language_loss": 0.71582222, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.73718333, + "num_input_tokens_seen": 194296155, + "step": 9024, + "time_per_iteration": 2.6643166542053223 + }, + { + "auxiliary_loss_clip": 0.01109023, + "auxiliary_loss_mlp": 0.01034832, + "balance_loss_clip": 1.03978658, + "balance_loss_mlp": 1.02231479, + "epoch": 0.5426123553284232, + "flos": 30592264429440.0, + "grad_norm": 1.7605396052132907, + "language_loss": 0.65074313, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.67218173, + "num_input_tokens_seen": 194318025, + "step": 9025, + "time_per_iteration": 2.6963577270507812 + }, + { + "auxiliary_loss_clip": 0.0109579, + "auxiliary_loss_mlp": 0.00769932, + "balance_loss_clip": 1.04664063, + "balance_loss_mlp": 1.00038803, + "epoch": 0.5426724785810912, + "flos": 12495118671360.0, + "grad_norm": 2.055737651503127, + "language_loss": 0.73914909, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.7578063, + "num_input_tokens_seen": 194336150, + "step": 9026, + "time_per_iteration": 2.6317172050476074 + }, + { + "auxiliary_loss_clip": 0.01095155, + "auxiliary_loss_mlp": 0.01040442, + "balance_loss_clip": 1.03804421, + "balance_loss_mlp": 1.02545154, + "epoch": 0.5427326018337592, + "flos": 26064611089920.0, + "grad_norm": 2.1949475938623224, + "language_loss": 0.7840718, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.80542773, + "num_input_tokens_seen": 194355980, + "step": 9027, + "time_per_iteration": 2.6076927185058594 + }, + { + "auxiliary_loss_clip": 0.01004652, + "auxiliary_loss_mlp": 0.01011362, + "balance_loss_clip": 1.00918782, + "balance_loss_mlp": 1.00950241, + "epoch": 0.5427927250864272, + "flos": 65984745576960.0, + "grad_norm": 0.759944437260396, + "language_loss": 0.56566465, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.58582479, + "num_input_tokens_seen": 194422660, + "step": 9028, + "time_per_iteration": 3.173718214035034 + }, + { + "auxiliary_loss_clip": 0.01078653, + "auxiliary_loss_mlp": 0.01029607, + "balance_loss_clip": 1.0437665, + "balance_loss_mlp": 1.01519442, + "epoch": 0.5428528483390952, + "flos": 19975382904960.0, + "grad_norm": 2.1789279213341857, + "language_loss": 0.7763471, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.79742968, + "num_input_tokens_seen": 194438545, + "step": 9029, + "time_per_iteration": 2.6010968685150146 + }, + { + "auxiliary_loss_clip": 0.01080602, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.0426538, + "balance_loss_mlp": 1.01611137, + "epoch": 0.5429129715917631, + "flos": 21832323287040.0, + "grad_norm": 1.5227150839007966, + "language_loss": 0.8289423, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.85004783, + "num_input_tokens_seen": 194458060, + "step": 9030, + "time_per_iteration": 2.673872232437134 + }, + { + "auxiliary_loss_clip": 0.01119103, + "auxiliary_loss_mlp": 0.0103115, + "balance_loss_clip": 1.04308653, + "balance_loss_mlp": 1.01801896, + "epoch": 0.5429730948444311, + "flos": 27782685492480.0, + "grad_norm": 1.5242093045096456, + "language_loss": 0.74554878, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.76705134, + "num_input_tokens_seen": 194477405, + "step": 9031, + "time_per_iteration": 2.6361796855926514 + }, + { + "auxiliary_loss_clip": 0.01099875, + "auxiliary_loss_mlp": 0.01039492, + "balance_loss_clip": 1.03957534, + "balance_loss_mlp": 1.02664721, + "epoch": 0.543033218097099, + "flos": 26760452336640.0, + "grad_norm": 1.8557133497087115, + "language_loss": 0.85526693, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.87666059, + "num_input_tokens_seen": 194497085, + "step": 9032, + "time_per_iteration": 2.633051872253418 + }, + { + "auxiliary_loss_clip": 0.01101785, + "auxiliary_loss_mlp": 0.01037126, + "balance_loss_clip": 1.04154074, + "balance_loss_mlp": 1.02405477, + "epoch": 0.5430933413497671, + "flos": 22675254727680.0, + "grad_norm": 1.789713495487195, + "language_loss": 0.74318242, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.76457155, + "num_input_tokens_seen": 194516785, + "step": 9033, + "time_per_iteration": 2.654573917388916 + }, + { + "auxiliary_loss_clip": 0.01080113, + "auxiliary_loss_mlp": 0.01040958, + "balance_loss_clip": 1.03826129, + "balance_loss_mlp": 1.0267365, + "epoch": 0.543153464602435, + "flos": 24607499973120.0, + "grad_norm": 1.5302152204895145, + "language_loss": 0.75507742, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.77628815, + "num_input_tokens_seen": 194536475, + "step": 9034, + "time_per_iteration": 6.07684326171875 + }, + { + "auxiliary_loss_clip": 0.01080457, + "auxiliary_loss_mlp": 0.01035889, + "balance_loss_clip": 1.04235947, + "balance_loss_mlp": 1.02360463, + "epoch": 0.543213587855103, + "flos": 19025725178880.0, + "grad_norm": 1.697596865274133, + "language_loss": 0.84559906, + "learning_rate": 1.817402369770655e-06, + "loss": 0.86676252, + "num_input_tokens_seen": 194554495, + "step": 9035, + "time_per_iteration": 4.246930122375488 + }, + { + "auxiliary_loss_clip": 0.01010369, + "auxiliary_loss_mlp": 0.01004655, + "balance_loss_clip": 1.01446867, + "balance_loss_mlp": 1.00328398, + "epoch": 0.5432737111077709, + "flos": 65686435125120.0, + "grad_norm": 0.7105133860132232, + "language_loss": 0.55900681, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.57915699, + "num_input_tokens_seen": 194617620, + "step": 9036, + "time_per_iteration": 3.214927911758423 + }, + { + "auxiliary_loss_clip": 0.0106374, + "auxiliary_loss_mlp": 0.01035958, + "balance_loss_clip": 1.04064369, + "balance_loss_mlp": 1.02285123, + "epoch": 0.5433338343604389, + "flos": 22091670460800.0, + "grad_norm": 1.4967561616212492, + "language_loss": 0.75198317, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.77298009, + "num_input_tokens_seen": 194637690, + "step": 9037, + "time_per_iteration": 2.815127372741699 + }, + { + "auxiliary_loss_clip": 0.01089499, + "auxiliary_loss_mlp": 0.01036314, + "balance_loss_clip": 1.039361, + "balance_loss_mlp": 1.02274799, + "epoch": 0.5433939576131068, + "flos": 34672649616000.0, + "grad_norm": 1.6562121389813547, + "language_loss": 0.66519392, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.68645203, + "num_input_tokens_seen": 194659520, + "step": 9038, + "time_per_iteration": 2.788142681121826 + }, + { + "auxiliary_loss_clip": 0.01105433, + "auxiliary_loss_mlp": 0.01036988, + "balance_loss_clip": 1.03904057, + "balance_loss_mlp": 1.02456677, + "epoch": 0.5434540808657748, + "flos": 20303355012480.0, + "grad_norm": 1.9500381910938636, + "language_loss": 0.7809025, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.80232668, + "num_input_tokens_seen": 194677645, + "step": 9039, + "time_per_iteration": 4.200030326843262 + }, + { + "auxiliary_loss_clip": 0.01076379, + "auxiliary_loss_mlp": 0.01038367, + "balance_loss_clip": 1.03707099, + "balance_loss_mlp": 1.02523017, + "epoch": 0.5435142041184428, + "flos": 23112790295040.0, + "grad_norm": 1.9066978344822971, + "language_loss": 0.76675421, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.7879017, + "num_input_tokens_seen": 194697400, + "step": 9040, + "time_per_iteration": 2.752359628677368 + }, + { + "auxiliary_loss_clip": 0.01021921, + "auxiliary_loss_mlp": 0.0101024, + "balance_loss_clip": 1.01599014, + "balance_loss_mlp": 1.00891709, + "epoch": 0.5435743273711108, + "flos": 64012746954240.0, + "grad_norm": 0.6657326543890927, + "language_loss": 0.52456856, + "learning_rate": 1.815075484268074e-06, + "loss": 0.54489017, + "num_input_tokens_seen": 194761205, + "step": 9041, + "time_per_iteration": 3.19743275642395 + }, + { + "auxiliary_loss_clip": 0.01092893, + "auxiliary_loss_mlp": 0.01043232, + "balance_loss_clip": 1.04014623, + "balance_loss_mlp": 1.0300709, + "epoch": 0.5436344506237788, + "flos": 25118903859840.0, + "grad_norm": 1.6935261425615555, + "language_loss": 0.76397556, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.78533685, + "num_input_tokens_seen": 194782445, + "step": 9042, + "time_per_iteration": 2.7176172733306885 + }, + { + "auxiliary_loss_clip": 0.01082719, + "auxiliary_loss_mlp": 0.01030979, + "balance_loss_clip": 1.04040313, + "balance_loss_mlp": 1.01886773, + "epoch": 0.5436945738764467, + "flos": 19572967860480.0, + "grad_norm": 1.7014237411229687, + "language_loss": 0.67346215, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.69459915, + "num_input_tokens_seen": 194800325, + "step": 9043, + "time_per_iteration": 2.7166213989257812 + }, + { + "auxiliary_loss_clip": 0.0107861, + "auxiliary_loss_mlp": 0.01032764, + "balance_loss_clip": 1.03779316, + "balance_loss_mlp": 1.01971054, + "epoch": 0.5437546971291147, + "flos": 21142515525120.0, + "grad_norm": 1.5921714365650326, + "language_loss": 0.84577447, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.86688828, + "num_input_tokens_seen": 194818675, + "step": 9044, + "time_per_iteration": 2.758593797683716 + }, + { + "auxiliary_loss_clip": 0.01123207, + "auxiliary_loss_mlp": 0.01031023, + "balance_loss_clip": 1.04196227, + "balance_loss_mlp": 1.01723039, + "epoch": 0.5438148203817826, + "flos": 25118688378240.0, + "grad_norm": 1.5431059852471993, + "language_loss": 0.62074721, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.64228952, + "num_input_tokens_seen": 194836595, + "step": 9045, + "time_per_iteration": 2.6207923889160156 + }, + { + "auxiliary_loss_clip": 0.0112166, + "auxiliary_loss_mlp": 0.01035257, + "balance_loss_clip": 1.04318917, + "balance_loss_mlp": 1.02179205, + "epoch": 0.5438749436344507, + "flos": 23002939526400.0, + "grad_norm": 1.4293832885602564, + "language_loss": 0.70140386, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.72297299, + "num_input_tokens_seen": 194857520, + "step": 9046, + "time_per_iteration": 2.6262285709381104 + }, + { + "auxiliary_loss_clip": 0.01117279, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.04171467, + "balance_loss_mlp": 1.01709127, + "epoch": 0.5439350668871186, + "flos": 15487016065920.0, + "grad_norm": 1.95554521575616, + "language_loss": 0.7724129, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.79387808, + "num_input_tokens_seen": 194876020, + "step": 9047, + "time_per_iteration": 2.592041492462158 + }, + { + "auxiliary_loss_clip": 0.01094716, + "auxiliary_loss_mlp": 0.01047772, + "balance_loss_clip": 1.04039311, + "balance_loss_mlp": 1.03321636, + "epoch": 0.5439951901397866, + "flos": 17238415311360.0, + "grad_norm": 1.5854248061222735, + "language_loss": 0.7262761, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.74770093, + "num_input_tokens_seen": 194894650, + "step": 9048, + "time_per_iteration": 2.667393684387207 + }, + { + "auxiliary_loss_clip": 0.01069346, + "auxiliary_loss_mlp": 0.01045305, + "balance_loss_clip": 1.03664947, + "balance_loss_mlp": 1.0298202, + "epoch": 0.5440553133924545, + "flos": 18661016436480.0, + "grad_norm": 1.9805900660696516, + "language_loss": 0.93650311, + "learning_rate": 1.8119733635055076e-06, + "loss": 0.95764971, + "num_input_tokens_seen": 194911935, + "step": 9049, + "time_per_iteration": 2.7119088172912598 + }, + { + "auxiliary_loss_clip": 0.0110651, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.03992295, + "balance_loss_mlp": 1.02054429, + "epoch": 0.5441154366451225, + "flos": 27122934435840.0, + "grad_norm": 1.7800719649484351, + "language_loss": 0.73936987, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.76076329, + "num_input_tokens_seen": 194931620, + "step": 9050, + "time_per_iteration": 2.631661891937256 + }, + { + "auxiliary_loss_clip": 0.01111441, + "auxiliary_loss_mlp": 0.01030882, + "balance_loss_clip": 1.0437777, + "balance_loss_mlp": 1.01755428, + "epoch": 0.5441755598977904, + "flos": 25993867253760.0, + "grad_norm": 1.737903905046117, + "language_loss": 0.66990525, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.69132841, + "num_input_tokens_seen": 194952560, + "step": 9051, + "time_per_iteration": 2.648484230041504 + }, + { + "auxiliary_loss_clip": 0.01080337, + "auxiliary_loss_mlp": 0.01033354, + "balance_loss_clip": 1.03722811, + "balance_loss_mlp": 1.02039015, + "epoch": 0.5442356831504584, + "flos": 32380041173760.0, + "grad_norm": 2.245844605247971, + "language_loss": 0.67334735, + "learning_rate": 1.810810185460011e-06, + "loss": 0.69448429, + "num_input_tokens_seen": 194973915, + "step": 9052, + "time_per_iteration": 2.778211832046509 + }, + { + "auxiliary_loss_clip": 0.01121064, + "auxiliary_loss_mlp": 0.01033478, + "balance_loss_clip": 1.04266417, + "balance_loss_mlp": 1.02010286, + "epoch": 0.5442958064031264, + "flos": 24164290056960.0, + "grad_norm": 1.8200748140762566, + "language_loss": 0.92835879, + "learning_rate": 1.810422473773436e-06, + "loss": 0.9499042, + "num_input_tokens_seen": 194990170, + "step": 9053, + "time_per_iteration": 2.6110095977783203 + }, + { + "auxiliary_loss_clip": 0.01093907, + "auxiliary_loss_mlp": 0.01034949, + "balance_loss_clip": 1.04024363, + "balance_loss_mlp": 1.02203834, + "epoch": 0.5443559296557944, + "flos": 18764690065920.0, + "grad_norm": 2.3950140888374687, + "language_loss": 0.83948398, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.86077261, + "num_input_tokens_seen": 195006395, + "step": 9054, + "time_per_iteration": 2.6261367797851562 + }, + { + "auxiliary_loss_clip": 0.01090647, + "auxiliary_loss_mlp": 0.01034581, + "balance_loss_clip": 1.03965771, + "balance_loss_mlp": 1.02094352, + "epoch": 0.5444160529084624, + "flos": 22632556435200.0, + "grad_norm": 2.6065175825707327, + "language_loss": 0.68213475, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.70338708, + "num_input_tokens_seen": 195025080, + "step": 9055, + "time_per_iteration": 2.623518705368042 + }, + { + "auxiliary_loss_clip": 0.01000083, + "auxiliary_loss_mlp": 0.00999074, + "balance_loss_clip": 1.01110244, + "balance_loss_mlp": 0.99770337, + "epoch": 0.5444761761611303, + "flos": 69671909600640.0, + "grad_norm": 0.7426728731430834, + "language_loss": 0.57650024, + "learning_rate": 1.80925938190531e-06, + "loss": 0.59649181, + "num_input_tokens_seen": 195085725, + "step": 9056, + "time_per_iteration": 3.2228453159332275 + }, + { + "auxiliary_loss_clip": 0.01087409, + "auxiliary_loss_mlp": 0.01036027, + "balance_loss_clip": 1.04208684, + "balance_loss_mlp": 1.02234185, + "epoch": 0.5445362994137983, + "flos": 14278442129280.0, + "grad_norm": 1.75653480561415, + "language_loss": 0.69749284, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.71872711, + "num_input_tokens_seen": 195102585, + "step": 9057, + "time_per_iteration": 2.7110843658447266 + }, + { + "auxiliary_loss_clip": 0.01106044, + "auxiliary_loss_mlp": 0.01038419, + "balance_loss_clip": 1.04014075, + "balance_loss_mlp": 1.02472818, + "epoch": 0.5445964226664662, + "flos": 28986195611520.0, + "grad_norm": 2.0738816921888366, + "language_loss": 0.75373238, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.775177, + "num_input_tokens_seen": 195120055, + "step": 9058, + "time_per_iteration": 2.7001023292541504 + }, + { + "auxiliary_loss_clip": 0.01003793, + "auxiliary_loss_mlp": 0.01003874, + "balance_loss_clip": 1.01181531, + "balance_loss_mlp": 1.00230026, + "epoch": 0.5446565459191343, + "flos": 68620230270720.0, + "grad_norm": 0.7925901763726337, + "language_loss": 0.6261481, + "learning_rate": 1.808096355133312e-06, + "loss": 0.6462248, + "num_input_tokens_seen": 195181045, + "step": 9059, + "time_per_iteration": 3.355748414993286 + }, + { + "auxiliary_loss_clip": 0.01107073, + "auxiliary_loss_mlp": 0.0103287, + "balance_loss_clip": 1.0414511, + "balance_loss_mlp": 1.01922059, + "epoch": 0.5447166691718022, + "flos": 16216469464320.0, + "grad_norm": 1.790354282478879, + "language_loss": 0.79365647, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.81505585, + "num_input_tokens_seen": 195198840, + "step": 9060, + "time_per_iteration": 2.6523141860961914 + }, + { + "auxiliary_loss_clip": 0.01111799, + "auxiliary_loss_mlp": 0.01033132, + "balance_loss_clip": 1.04219317, + "balance_loss_mlp": 1.01976824, + "epoch": 0.5447767924244702, + "flos": 25849039616640.0, + "grad_norm": 1.7487339019361072, + "language_loss": 0.8006283, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.82207763, + "num_input_tokens_seen": 195218720, + "step": 9061, + "time_per_iteration": 2.660477876663208 + }, + { + "auxiliary_loss_clip": 0.01107514, + "auxiliary_loss_mlp": 0.01028575, + "balance_loss_clip": 1.04152489, + "balance_loss_mlp": 1.01595628, + "epoch": 0.5448369156771381, + "flos": 19677718897920.0, + "grad_norm": 1.667542325640746, + "language_loss": 0.8699556, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.89131653, + "num_input_tokens_seen": 195235770, + "step": 9062, + "time_per_iteration": 2.6527698040008545 + }, + { + "auxiliary_loss_clip": 0.0109274, + "auxiliary_loss_mlp": 0.01037371, + "balance_loss_clip": 1.03916395, + "balance_loss_mlp": 1.02188551, + "epoch": 0.5448970389298061, + "flos": 19281804215040.0, + "grad_norm": 1.6766222611874342, + "language_loss": 0.82069784, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.84199893, + "num_input_tokens_seen": 195254870, + "step": 9063, + "time_per_iteration": 2.651977062225342 + }, + { + "auxiliary_loss_clip": 0.01118028, + "auxiliary_loss_mlp": 0.01032916, + "balance_loss_clip": 1.0406127, + "balance_loss_mlp": 1.01958823, + "epoch": 0.544957162182474, + "flos": 20991690316800.0, + "grad_norm": 1.769153488212037, + "language_loss": 0.63484013, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.65634954, + "num_input_tokens_seen": 195273390, + "step": 9064, + "time_per_iteration": 2.595914602279663 + }, + { + "auxiliary_loss_clip": 0.0112242, + "auxiliary_loss_mlp": 0.01037085, + "balance_loss_clip": 1.04264021, + "balance_loss_mlp": 1.02337003, + "epoch": 0.545017285435142, + "flos": 25374587846400.0, + "grad_norm": 1.6143269954810184, + "language_loss": 0.79795569, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.81955075, + "num_input_tokens_seen": 195295635, + "step": 9065, + "time_per_iteration": 2.647632360458374 + }, + { + "auxiliary_loss_clip": 0.01082455, + "auxiliary_loss_mlp": 0.01032837, + "balance_loss_clip": 1.04022825, + "balance_loss_mlp": 1.0211482, + "epoch": 0.54507740868781, + "flos": 19134749934720.0, + "grad_norm": 2.1024584454927626, + "language_loss": 0.77589709, + "learning_rate": 1.805382881379827e-06, + "loss": 0.79705, + "num_input_tokens_seen": 195312545, + "step": 9066, + "time_per_iteration": 2.750904083251953 + }, + { + "auxiliary_loss_clip": 0.01106868, + "auxiliary_loss_mlp": 0.0103139, + "balance_loss_clip": 1.04005289, + "balance_loss_mlp": 1.01794958, + "epoch": 0.545137531940478, + "flos": 26249802635520.0, + "grad_norm": 2.0527073359497665, + "language_loss": 0.75859725, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.77997983, + "num_input_tokens_seen": 195332955, + "step": 9067, + "time_per_iteration": 2.68332839012146 + }, + { + "auxiliary_loss_clip": 0.0108798, + "auxiliary_loss_mlp": 0.01038091, + "balance_loss_clip": 1.04256892, + "balance_loss_mlp": 1.02183652, + "epoch": 0.545197655193146, + "flos": 37555629995520.0, + "grad_norm": 6.876378009840058, + "language_loss": 0.63596183, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.65722257, + "num_input_tokens_seen": 195355930, + "step": 9068, + "time_per_iteration": 2.893052816390991 + }, + { + "auxiliary_loss_clip": 0.01080095, + "auxiliary_loss_mlp": 0.01041608, + "balance_loss_clip": 1.0446372, + "balance_loss_mlp": 1.02935874, + "epoch": 0.5452577784458139, + "flos": 26031250333440.0, + "grad_norm": 1.5002235169223528, + "language_loss": 0.7186054, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.73982239, + "num_input_tokens_seen": 195376445, + "step": 9069, + "time_per_iteration": 2.7437844276428223 + }, + { + "auxiliary_loss_clip": 0.01118098, + "auxiliary_loss_mlp": 0.01028881, + "balance_loss_clip": 1.04397726, + "balance_loss_mlp": 1.0169543, + "epoch": 0.5453179016984819, + "flos": 17639034675840.0, + "grad_norm": 1.9248359915141238, + "language_loss": 0.73836279, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.75983256, + "num_input_tokens_seen": 195393725, + "step": 9070, + "time_per_iteration": 2.629026174545288 + }, + { + "auxiliary_loss_clip": 0.01104842, + "auxiliary_loss_mlp": 0.01038375, + "balance_loss_clip": 1.0405302, + "balance_loss_mlp": 1.02508879, + "epoch": 0.5453780249511498, + "flos": 23216679406080.0, + "grad_norm": 2.895965777257026, + "language_loss": 0.60386193, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.62529415, + "num_input_tokens_seen": 195411380, + "step": 9071, + "time_per_iteration": 2.787898540496826 + }, + { + "auxiliary_loss_clip": 0.0103628, + "auxiliary_loss_mlp": 0.01019994, + "balance_loss_clip": 1.01031959, + "balance_loss_mlp": 1.01858091, + "epoch": 0.5454381482038179, + "flos": 68696504801280.0, + "grad_norm": 0.701915733274622, + "language_loss": 0.57096583, + "learning_rate": 1.80305733435899e-06, + "loss": 0.59152853, + "num_input_tokens_seen": 195482015, + "step": 9072, + "time_per_iteration": 3.3096070289611816 + }, + { + "auxiliary_loss_clip": 0.01088718, + "auxiliary_loss_mlp": 0.0104092, + "balance_loss_clip": 1.03829658, + "balance_loss_mlp": 1.02696621, + "epoch": 0.5454982714564858, + "flos": 13260626346240.0, + "grad_norm": 1.6985686628313852, + "language_loss": 0.6941787, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.71547508, + "num_input_tokens_seen": 195500440, + "step": 9073, + "time_per_iteration": 5.942334413528442 + }, + { + "auxiliary_loss_clip": 0.01094077, + "auxiliary_loss_mlp": 0.01042156, + "balance_loss_clip": 1.03799677, + "balance_loss_mlp": 1.02981734, + "epoch": 0.5455583947091538, + "flos": 21835878733440.0, + "grad_norm": 1.7477774368009211, + "language_loss": 0.7124452, + "learning_rate": 1.802282211606627e-06, + "loss": 0.73380756, + "num_input_tokens_seen": 195520860, + "step": 9074, + "time_per_iteration": 2.6760778427124023 + }, + { + "auxiliary_loss_clip": 0.0110625, + "auxiliary_loss_mlp": 0.01038683, + "balance_loss_clip": 1.04050887, + "balance_loss_mlp": 1.02611828, + "epoch": 0.5456185179618217, + "flos": 17817438551040.0, + "grad_norm": 1.854490114521215, + "language_loss": 0.68543398, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.70688331, + "num_input_tokens_seen": 195538615, + "step": 9075, + "time_per_iteration": 4.19740891456604 + }, + { + "auxiliary_loss_clip": 0.01109026, + "auxiliary_loss_mlp": 0.01034737, + "balance_loss_clip": 1.04411292, + "balance_loss_mlp": 1.02303696, + "epoch": 0.5456786412144897, + "flos": 21069401391360.0, + "grad_norm": 1.8542702472429493, + "language_loss": 0.80530715, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.82674479, + "num_input_tokens_seen": 195557460, + "step": 9076, + "time_per_iteration": 2.6821329593658447 + }, + { + "auxiliary_loss_clip": 0.01109363, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.04109383, + "balance_loss_mlp": 1.01970196, + "epoch": 0.5457387644671576, + "flos": 23294965098240.0, + "grad_norm": 1.6176910715306643, + "language_loss": 0.80137533, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.82279032, + "num_input_tokens_seen": 195577985, + "step": 9077, + "time_per_iteration": 2.6378607749938965 + }, + { + "auxiliary_loss_clip": 0.01103737, + "auxiliary_loss_mlp": 0.01035436, + "balance_loss_clip": 1.04032636, + "balance_loss_mlp": 1.02293682, + "epoch": 0.5457988877198257, + "flos": 21617039122560.0, + "grad_norm": 2.2183628478116346, + "language_loss": 0.67997038, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.70136213, + "num_input_tokens_seen": 195597620, + "step": 9078, + "time_per_iteration": 4.261017560958862 + }, + { + "auxiliary_loss_clip": 0.01114465, + "auxiliary_loss_mlp": 0.01039359, + "balance_loss_clip": 1.04379976, + "balance_loss_mlp": 1.02579284, + "epoch": 0.5458590109724936, + "flos": 23762485543680.0, + "grad_norm": 1.8448340723101526, + "language_loss": 0.80507636, + "learning_rate": 1.800344536188764e-06, + "loss": 0.82661462, + "num_input_tokens_seen": 195615910, + "step": 9079, + "time_per_iteration": 2.6384685039520264 + }, + { + "auxiliary_loss_clip": 0.01124513, + "auxiliary_loss_mlp": 0.01034947, + "balance_loss_clip": 1.04221058, + "balance_loss_mlp": 1.02032018, + "epoch": 0.5459191342251616, + "flos": 24424283675520.0, + "grad_norm": 1.6928746227882223, + "language_loss": 0.75848919, + "learning_rate": 1.799957023759277e-06, + "loss": 0.78008378, + "num_input_tokens_seen": 195635620, + "step": 9080, + "time_per_iteration": 2.6506381034851074 + }, + { + "auxiliary_loss_clip": 0.01080273, + "auxiliary_loss_mlp": 0.01037485, + "balance_loss_clip": 1.03795743, + "balance_loss_mlp": 1.0230484, + "epoch": 0.5459792574778296, + "flos": 23623009032960.0, + "grad_norm": 2.0769433103494737, + "language_loss": 0.83164978, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.85282731, + "num_input_tokens_seen": 195652495, + "step": 9081, + "time_per_iteration": 2.705381393432617 + }, + { + "auxiliary_loss_clip": 0.0112596, + "auxiliary_loss_mlp": 0.0103236, + "balance_loss_clip": 1.04470921, + "balance_loss_mlp": 1.01884151, + "epoch": 0.5460393807304975, + "flos": 19135540033920.0, + "grad_norm": 1.688461125774873, + "language_loss": 0.70063365, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.72221684, + "num_input_tokens_seen": 195671965, + "step": 9082, + "time_per_iteration": 2.6176023483276367 + }, + { + "auxiliary_loss_clip": 0.01115168, + "auxiliary_loss_mlp": 0.01030163, + "balance_loss_clip": 1.03972983, + "balance_loss_mlp": 1.01709151, + "epoch": 0.5460995039831655, + "flos": 35918534805120.0, + "grad_norm": 1.559424348169526, + "language_loss": 0.66653717, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.68799043, + "num_input_tokens_seen": 195694725, + "step": 9083, + "time_per_iteration": 2.710636854171753 + }, + { + "auxiliary_loss_clip": 0.01091037, + "auxiliary_loss_mlp": 0.01033037, + "balance_loss_clip": 1.03879106, + "balance_loss_mlp": 1.01998401, + "epoch": 0.5461596272358334, + "flos": 26759231274240.0, + "grad_norm": 1.7271294710436846, + "language_loss": 0.78584135, + "learning_rate": 1.798407050044766e-06, + "loss": 0.80708218, + "num_input_tokens_seen": 195714090, + "step": 9084, + "time_per_iteration": 2.6876227855682373 + }, + { + "auxiliary_loss_clip": 0.01111571, + "auxiliary_loss_mlp": 0.01037411, + "balance_loss_clip": 1.042117, + "balance_loss_mlp": 1.02412558, + "epoch": 0.5462197504885015, + "flos": 20886580143360.0, + "grad_norm": 2.0534049917888852, + "language_loss": 0.75331509, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.77480489, + "num_input_tokens_seen": 195733585, + "step": 9085, + "time_per_iteration": 2.710315704345703 + }, + { + "auxiliary_loss_clip": 0.01098293, + "auxiliary_loss_mlp": 0.01035765, + "balance_loss_clip": 1.0397166, + "balance_loss_mlp": 1.02216959, + "epoch": 0.5462798737411694, + "flos": 25804976607360.0, + "grad_norm": 2.0038443585531174, + "language_loss": 0.75082123, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.7721619, + "num_input_tokens_seen": 195752820, + "step": 9086, + "time_per_iteration": 2.7101428508758545 + }, + { + "auxiliary_loss_clip": 0.01102837, + "auxiliary_loss_mlp": 0.01035671, + "balance_loss_clip": 1.03951812, + "balance_loss_mlp": 1.02227759, + "epoch": 0.5463399969938374, + "flos": 25775027642880.0, + "grad_norm": 1.6829711206227542, + "language_loss": 0.77097058, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.79235566, + "num_input_tokens_seen": 195773740, + "step": 9087, + "time_per_iteration": 2.6439003944396973 + }, + { + "auxiliary_loss_clip": 0.01114018, + "auxiliary_loss_mlp": 0.01042361, + "balance_loss_clip": 1.04376245, + "balance_loss_mlp": 1.02707863, + "epoch": 0.5464001202465053, + "flos": 18843298980480.0, + "grad_norm": 1.9617582228039958, + "language_loss": 0.77464199, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.79620576, + "num_input_tokens_seen": 195792125, + "step": 9088, + "time_per_iteration": 2.62850022315979 + }, + { + "auxiliary_loss_clip": 0.00993547, + "auxiliary_loss_mlp": 0.00999929, + "balance_loss_clip": 1.02517176, + "balance_loss_mlp": 0.99852258, + "epoch": 0.5464602434991733, + "flos": 69049541623680.0, + "grad_norm": 0.7268281858475805, + "language_loss": 0.57717931, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.59711409, + "num_input_tokens_seen": 195854935, + "step": 9089, + "time_per_iteration": 3.532050371170044 + }, + { + "auxiliary_loss_clip": 0.01085451, + "auxiliary_loss_mlp": 0.01038489, + "balance_loss_clip": 1.03805399, + "balance_loss_mlp": 1.02422571, + "epoch": 0.5465203667518412, + "flos": 27560039040000.0, + "grad_norm": 1.7593878993297172, + "language_loss": 0.76682436, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.78806376, + "num_input_tokens_seen": 195874715, + "step": 9090, + "time_per_iteration": 3.0779287815093994 + }, + { + "auxiliary_loss_clip": 0.01106384, + "auxiliary_loss_mlp": 0.01039408, + "balance_loss_clip": 1.03928399, + "balance_loss_mlp": 1.0233984, + "epoch": 0.5465804900045093, + "flos": 21210206705280.0, + "grad_norm": 1.8843979676244431, + "language_loss": 0.74037111, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.76182902, + "num_input_tokens_seen": 195892610, + "step": 9091, + "time_per_iteration": 2.6843886375427246 + }, + { + "auxiliary_loss_clip": 0.01103772, + "auxiliary_loss_mlp": 0.01037785, + "balance_loss_clip": 1.04514658, + "balance_loss_mlp": 1.02397454, + "epoch": 0.5466406132571772, + "flos": 22488949860480.0, + "grad_norm": 1.8168674877061988, + "language_loss": 0.78466463, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.80608022, + "num_input_tokens_seen": 195911085, + "step": 9092, + "time_per_iteration": 2.6951024532318115 + }, + { + "auxiliary_loss_clip": 0.01125215, + "auxiliary_loss_mlp": 0.01034303, + "balance_loss_clip": 1.04363537, + "balance_loss_mlp": 1.01997435, + "epoch": 0.5467007365098452, + "flos": 17675843137920.0, + "grad_norm": 2.188123152779193, + "language_loss": 0.74691254, + "learning_rate": 1.794920057818476e-06, + "loss": 0.76850772, + "num_input_tokens_seen": 195929845, + "step": 9093, + "time_per_iteration": 2.596165657043457 + }, + { + "auxiliary_loss_clip": 0.01112494, + "auxiliary_loss_mlp": 0.01040653, + "balance_loss_clip": 1.04044032, + "balance_loss_mlp": 1.02444029, + "epoch": 0.5467608597625132, + "flos": 15698852524800.0, + "grad_norm": 2.4498750676664414, + "language_loss": 0.6874221, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.70895356, + "num_input_tokens_seen": 195946350, + "step": 9094, + "time_per_iteration": 2.617203712463379 + }, + { + "auxiliary_loss_clip": 0.01100239, + "auxiliary_loss_mlp": 0.0103544, + "balance_loss_clip": 1.04255402, + "balance_loss_mlp": 1.02238083, + "epoch": 0.5468209830151811, + "flos": 24312816794880.0, + "grad_norm": 3.189829826251606, + "language_loss": 0.67888498, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.70024174, + "num_input_tokens_seen": 195959840, + "step": 9095, + "time_per_iteration": 2.709214687347412 + }, + { + "auxiliary_loss_clip": 0.01085979, + "auxiliary_loss_mlp": 0.01036228, + "balance_loss_clip": 1.0412364, + "balance_loss_mlp": 1.0228703, + "epoch": 0.5468811062678491, + "flos": 29166323339520.0, + "grad_norm": 1.772487886139895, + "language_loss": 0.66687673, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.68809879, + "num_input_tokens_seen": 195981125, + "step": 9096, + "time_per_iteration": 2.768289804458618 + }, + { + "auxiliary_loss_clip": 0.01013718, + "auxiliary_loss_mlp": 0.01003083, + "balance_loss_clip": 1.01639581, + "balance_loss_mlp": 1.00179529, + "epoch": 0.546941229520517, + "flos": 67867037982720.0, + "grad_norm": 0.7380745619271847, + "language_loss": 0.57528484, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.59545285, + "num_input_tokens_seen": 196038880, + "step": 9097, + "time_per_iteration": 3.353034496307373 + }, + { + "auxiliary_loss_clip": 0.01023908, + "auxiliary_loss_mlp": 0.00999165, + "balance_loss_clip": 1.01245689, + "balance_loss_mlp": 0.99754351, + "epoch": 0.5470013527731851, + "flos": 58270306625280.0, + "grad_norm": 0.9199423088856966, + "language_loss": 0.64710629, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.66733694, + "num_input_tokens_seen": 196099215, + "step": 9098, + "time_per_iteration": 3.1356828212738037 + }, + { + "auxiliary_loss_clip": 0.01114825, + "auxiliary_loss_mlp": 0.01037808, + "balance_loss_clip": 1.04415989, + "balance_loss_mlp": 1.02378869, + "epoch": 0.547061476025853, + "flos": 22965915582720.0, + "grad_norm": 2.132166365058938, + "language_loss": 0.73123235, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.75275862, + "num_input_tokens_seen": 196120370, + "step": 9099, + "time_per_iteration": 2.662252426147461 + }, + { + "auxiliary_loss_clip": 0.01097751, + "auxiliary_loss_mlp": 0.01035706, + "balance_loss_clip": 1.04278708, + "balance_loss_mlp": 1.02327275, + "epoch": 0.547121599278521, + "flos": 29968244426880.0, + "grad_norm": 1.880355780986747, + "language_loss": 0.72515011, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.74648476, + "num_input_tokens_seen": 196139075, + "step": 9100, + "time_per_iteration": 2.859636068344116 + }, + { + "auxiliary_loss_clip": 0.01106059, + "auxiliary_loss_mlp": 0.00770753, + "balance_loss_clip": 1.04162157, + "balance_loss_mlp": 1.00017691, + "epoch": 0.5471817225311889, + "flos": 36535443914880.0, + "grad_norm": 1.8314110929237357, + "language_loss": 0.68211091, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.70087898, + "num_input_tokens_seen": 196159990, + "step": 9101, + "time_per_iteration": 2.747811794281006 + }, + { + "auxiliary_loss_clip": 0.01123228, + "auxiliary_loss_mlp": 0.01034884, + "balance_loss_clip": 1.04393971, + "balance_loss_mlp": 1.02121687, + "epoch": 0.5472418457838569, + "flos": 25775243124480.0, + "grad_norm": 1.907951209204745, + "language_loss": 0.77796781, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.79954892, + "num_input_tokens_seen": 196180570, + "step": 9102, + "time_per_iteration": 2.6425788402557373 + }, + { + "auxiliary_loss_clip": 0.01087581, + "auxiliary_loss_mlp": 0.01039397, + "balance_loss_clip": 1.04114008, + "balance_loss_mlp": 1.02447212, + "epoch": 0.5473019690365248, + "flos": 27887687925120.0, + "grad_norm": 1.553646996990172, + "language_loss": 0.72080058, + "learning_rate": 1.791046361258413e-06, + "loss": 0.74207032, + "num_input_tokens_seen": 196200300, + "step": 9103, + "time_per_iteration": 2.7307486534118652 + }, + { + "auxiliary_loss_clip": 0.01088884, + "auxiliary_loss_mlp": 0.01031551, + "balance_loss_clip": 1.0425241, + "balance_loss_mlp": 1.01806211, + "epoch": 0.5473620922891929, + "flos": 57631490219520.0, + "grad_norm": 1.4283303897304696, + "language_loss": 0.65195155, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.67315584, + "num_input_tokens_seen": 196228525, + "step": 9104, + "time_per_iteration": 3.0792930126190186 + }, + { + "auxiliary_loss_clip": 0.01109949, + "auxiliary_loss_mlp": 0.0103298, + "balance_loss_clip": 1.04480743, + "balance_loss_mlp": 1.01883578, + "epoch": 0.5474222155418608, + "flos": 19354056422400.0, + "grad_norm": 1.90483998435302, + "language_loss": 0.82428771, + "learning_rate": 1.790271716558888e-06, + "loss": 0.84571701, + "num_input_tokens_seen": 196247690, + "step": 9105, + "time_per_iteration": 3.3235061168670654 + }, + { + "auxiliary_loss_clip": 0.01119165, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.04210079, + "balance_loss_mlp": 1.01735604, + "epoch": 0.5474823387945288, + "flos": 25120448144640.0, + "grad_norm": 1.6592382133296117, + "language_loss": 0.80052161, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.82201409, + "num_input_tokens_seen": 196268555, + "step": 9106, + "time_per_iteration": 2.7082676887512207 + }, + { + "auxiliary_loss_clip": 0.01115376, + "auxiliary_loss_mlp": 0.01036861, + "balance_loss_clip": 1.04689944, + "balance_loss_mlp": 1.02419519, + "epoch": 0.5475424620471967, + "flos": 18004174381440.0, + "grad_norm": 1.7933883779040884, + "language_loss": 0.69402343, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.71554577, + "num_input_tokens_seen": 196285585, + "step": 9107, + "time_per_iteration": 2.626214027404785 + }, + { + "auxiliary_loss_clip": 0.01115289, + "auxiliary_loss_mlp": 0.01035057, + "balance_loss_clip": 1.04319263, + "balance_loss_mlp": 1.02166939, + "epoch": 0.5476025852998647, + "flos": 22309324922880.0, + "grad_norm": 2.6929722220667824, + "language_loss": 0.63537276, + "learning_rate": 1.789109809193197e-06, + "loss": 0.65687621, + "num_input_tokens_seen": 196305085, + "step": 9108, + "time_per_iteration": 2.6056766510009766 + }, + { + "auxiliary_loss_clip": 0.01122102, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.0446291, + "balance_loss_mlp": 1.01750922, + "epoch": 0.5476627085525327, + "flos": 20120497850880.0, + "grad_norm": 1.7311986454715018, + "language_loss": 0.75234431, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.77386445, + "num_input_tokens_seen": 196323945, + "step": 9109, + "time_per_iteration": 2.562833786010742 + }, + { + "auxiliary_loss_clip": 0.01093609, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.04307365, + "balance_loss_mlp": 1.02143562, + "epoch": 0.5477228318052006, + "flos": 17712579772800.0, + "grad_norm": 1.7887859684809904, + "language_loss": 0.77939326, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.800686, + "num_input_tokens_seen": 196342200, + "step": 9110, + "time_per_iteration": 2.62839674949646 + }, + { + "auxiliary_loss_clip": 0.01106302, + "auxiliary_loss_mlp": 0.01032426, + "balance_loss_clip": 1.04262304, + "balance_loss_mlp": 1.01997423, + "epoch": 0.5477829550578687, + "flos": 25848895962240.0, + "grad_norm": 1.525983194059855, + "language_loss": 0.71175343, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.73314071, + "num_input_tokens_seen": 196362940, + "step": 9111, + "time_per_iteration": 2.664486885070801 + }, + { + "auxiliary_loss_clip": 0.01111586, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.0436976, + "balance_loss_mlp": 1.0317409, + "epoch": 0.5478430783105366, + "flos": 23039676161280.0, + "grad_norm": 1.5197619181850293, + "language_loss": 0.71096945, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.73253489, + "num_input_tokens_seen": 196383070, + "step": 9112, + "time_per_iteration": 2.7334086894989014 + }, + { + "auxiliary_loss_clip": 0.01067523, + "auxiliary_loss_mlp": 0.01034936, + "balance_loss_clip": 1.03873658, + "balance_loss_mlp": 1.02179968, + "epoch": 0.5479032015632046, + "flos": 16071210864000.0, + "grad_norm": 2.172543516099556, + "language_loss": 0.87877554, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.89980012, + "num_input_tokens_seen": 196398485, + "step": 9113, + "time_per_iteration": 5.9666571617126465 + }, + { + "auxiliary_loss_clip": 0.01070074, + "auxiliary_loss_mlp": 0.01032166, + "balance_loss_clip": 1.04229951, + "balance_loss_mlp": 1.01853991, + "epoch": 0.5479633248158725, + "flos": 24278701852800.0, + "grad_norm": 1.4694487805740626, + "language_loss": 0.73041236, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.7514348, + "num_input_tokens_seen": 196417725, + "step": 9114, + "time_per_iteration": 4.333765745162964 + }, + { + "auxiliary_loss_clip": 0.01093195, + "auxiliary_loss_mlp": 0.00770887, + "balance_loss_clip": 1.03821266, + "balance_loss_mlp": 1.00027823, + "epoch": 0.5480234480685405, + "flos": 26358216860160.0, + "grad_norm": 1.6145561495164014, + "language_loss": 0.72155976, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.74020058, + "num_input_tokens_seen": 196437840, + "step": 9115, + "time_per_iteration": 2.6793766021728516 + }, + { + "auxiliary_loss_clip": 0.01084634, + "auxiliary_loss_mlp": 0.00774539, + "balance_loss_clip": 1.03983831, + "balance_loss_mlp": 1.00038791, + "epoch": 0.5480835713212084, + "flos": 22055077480320.0, + "grad_norm": 1.7266092862770852, + "language_loss": 0.72229278, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.74088448, + "num_input_tokens_seen": 196457300, + "step": 9116, + "time_per_iteration": 2.738142490386963 + }, + { + "auxiliary_loss_clip": 0.01095127, + "auxiliary_loss_mlp": 0.01039685, + "balance_loss_clip": 1.04102373, + "balance_loss_mlp": 1.0259639, + "epoch": 0.5481436945738765, + "flos": 25301042749440.0, + "grad_norm": 4.413930764564679, + "language_loss": 0.76158273, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.78293079, + "num_input_tokens_seen": 196476720, + "step": 9117, + "time_per_iteration": 2.693359613418579 + }, + { + "auxiliary_loss_clip": 0.01070482, + "auxiliary_loss_mlp": 0.01035701, + "balance_loss_clip": 1.03514457, + "balance_loss_mlp": 1.02292752, + "epoch": 0.5482038178265444, + "flos": 33580857772800.0, + "grad_norm": 1.575829874902699, + "language_loss": 0.62537289, + "learning_rate": 1.785237306671674e-06, + "loss": 0.64643478, + "num_input_tokens_seen": 196496765, + "step": 9118, + "time_per_iteration": 4.42430305480957 + }, + { + "auxiliary_loss_clip": 0.01124628, + "auxiliary_loss_mlp": 0.01036624, + "balance_loss_clip": 1.04479444, + "balance_loss_mlp": 1.02259278, + "epoch": 0.5482639410792124, + "flos": 19026192055680.0, + "grad_norm": 2.694246810130355, + "language_loss": 0.79018009, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.81179261, + "num_input_tokens_seen": 196516220, + "step": 9119, + "time_per_iteration": 2.606593608856201 + }, + { + "auxiliary_loss_clip": 0.01092726, + "auxiliary_loss_mlp": 0.00769453, + "balance_loss_clip": 1.04150975, + "balance_loss_mlp": 1.00022948, + "epoch": 0.5483240643318803, + "flos": 25410318900480.0, + "grad_norm": 1.8682271604905119, + "language_loss": 0.82534289, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.8439647, + "num_input_tokens_seen": 196533860, + "step": 9120, + "time_per_iteration": 2.694546699523926 + }, + { + "auxiliary_loss_clip": 0.01089359, + "auxiliary_loss_mlp": 0.01039031, + "balance_loss_clip": 1.04395008, + "balance_loss_mlp": 1.02531016, + "epoch": 0.5483841875845483, + "flos": 21466896272640.0, + "grad_norm": 1.8000226938726367, + "language_loss": 0.80031526, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.82159919, + "num_input_tokens_seen": 196551305, + "step": 9121, + "time_per_iteration": 2.7422945499420166 + }, + { + "auxiliary_loss_clip": 0.01076146, + "auxiliary_loss_mlp": 0.01038978, + "balance_loss_clip": 1.03803313, + "balance_loss_mlp": 1.02408934, + "epoch": 0.5484443108372163, + "flos": 24747263792640.0, + "grad_norm": 1.9827939120507885, + "language_loss": 0.60996848, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.63111973, + "num_input_tokens_seen": 196569420, + "step": 9122, + "time_per_iteration": 2.782677412033081 + }, + { + "auxiliary_loss_clip": 0.01106377, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.04853153, + "balance_loss_mlp": 1.0268079, + "epoch": 0.5485044340898843, + "flos": 25375377945600.0, + "grad_norm": 1.5587852273808862, + "language_loss": 0.71594763, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.73739696, + "num_input_tokens_seen": 196590610, + "step": 9123, + "time_per_iteration": 2.756350517272949 + }, + { + "auxiliary_loss_clip": 0.01121133, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.04210067, + "balance_loss_mlp": 1.0208813, + "epoch": 0.5485645573425523, + "flos": 12641167370880.0, + "grad_norm": 2.3735658261361845, + "language_loss": 0.83559448, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.85714072, + "num_input_tokens_seen": 196606495, + "step": 9124, + "time_per_iteration": 2.61197829246521 + }, + { + "auxiliary_loss_clip": 0.01094486, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.04321349, + "balance_loss_mlp": 1.01992834, + "epoch": 0.5486246805952202, + "flos": 28329425383680.0, + "grad_norm": 1.5486509111854319, + "language_loss": 0.80518043, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.82645559, + "num_input_tokens_seen": 196626365, + "step": 9125, + "time_per_iteration": 2.773972749710083 + }, + { + "auxiliary_loss_clip": 0.01111849, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.04336679, + "balance_loss_mlp": 1.01903141, + "epoch": 0.5486848038478882, + "flos": 16800017817600.0, + "grad_norm": 4.333134351852335, + "language_loss": 0.74312758, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.76456887, + "num_input_tokens_seen": 196644465, + "step": 9126, + "time_per_iteration": 2.654529333114624 + }, + { + "auxiliary_loss_clip": 0.01107646, + "auxiliary_loss_mlp": 0.01037249, + "balance_loss_clip": 1.03968537, + "balance_loss_mlp": 1.02193701, + "epoch": 0.5487449271005561, + "flos": 17236224581760.0, + "grad_norm": 2.710645426319007, + "language_loss": 0.66802239, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.6894713, + "num_input_tokens_seen": 196659160, + "step": 9127, + "time_per_iteration": 2.615807294845581 + }, + { + "auxiliary_loss_clip": 0.01078683, + "auxiliary_loss_mlp": 0.01039383, + "balance_loss_clip": 1.03928149, + "balance_loss_mlp": 1.0249052, + "epoch": 0.5488050503532241, + "flos": 17340867878400.0, + "grad_norm": 2.0894273864631225, + "language_loss": 0.82909453, + "learning_rate": 1.781365618532181e-06, + "loss": 0.85027516, + "num_input_tokens_seen": 196677410, + "step": 9128, + "time_per_iteration": 2.681060791015625 + }, + { + "auxiliary_loss_clip": 0.01074302, + "auxiliary_loss_mlp": 0.01037438, + "balance_loss_clip": 1.03565645, + "balance_loss_mlp": 1.02254319, + "epoch": 0.548865173605892, + "flos": 17239169496960.0, + "grad_norm": 1.9025486027385248, + "language_loss": 0.74247289, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.76359022, + "num_input_tokens_seen": 196696765, + "step": 9129, + "time_per_iteration": 2.681459426879883 + }, + { + "auxiliary_loss_clip": 0.01077104, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.03771412, + "balance_loss_mlp": 1.02210581, + "epoch": 0.5489252968585601, + "flos": 17456716218240.0, + "grad_norm": 3.0707794644461854, + "language_loss": 0.63489515, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.65603966, + "num_input_tokens_seen": 196714895, + "step": 9130, + "time_per_iteration": 2.743734359741211 + }, + { + "auxiliary_loss_clip": 0.01124543, + "auxiliary_loss_mlp": 0.00771634, + "balance_loss_clip": 1.04329586, + "balance_loss_mlp": 1.00023222, + "epoch": 0.548985420111228, + "flos": 26323383646080.0, + "grad_norm": 1.7961020275949398, + "language_loss": 0.62998879, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.64895058, + "num_input_tokens_seen": 196735510, + "step": 9131, + "time_per_iteration": 2.7136600017547607 + }, + { + "auxiliary_loss_clip": 0.01109321, + "auxiliary_loss_mlp": 0.01039388, + "balance_loss_clip": 1.04004657, + "balance_loss_mlp": 1.02416492, + "epoch": 0.549045543363896, + "flos": 18693730748160.0, + "grad_norm": 1.6718560353245449, + "language_loss": 0.7504952, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.77198231, + "num_input_tokens_seen": 196752855, + "step": 9132, + "time_per_iteration": 2.686460494995117 + }, + { + "auxiliary_loss_clip": 0.01107553, + "auxiliary_loss_mlp": 0.01033276, + "balance_loss_clip": 1.03815818, + "balance_loss_mlp": 1.02046108, + "epoch": 0.5491056666165639, + "flos": 24717386655360.0, + "grad_norm": 1.5443073078358045, + "language_loss": 0.81107825, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.83248657, + "num_input_tokens_seen": 196772230, + "step": 9133, + "time_per_iteration": 2.607304811477661 + }, + { + "auxiliary_loss_clip": 0.0109676, + "auxiliary_loss_mlp": 0.00770878, + "balance_loss_clip": 1.04211152, + "balance_loss_mlp": 1.00023055, + "epoch": 0.5491657898692319, + "flos": 21576926609280.0, + "grad_norm": 2.2143971437865275, + "language_loss": 0.69978988, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.71846628, + "num_input_tokens_seen": 196790405, + "step": 9134, + "time_per_iteration": 2.655400037765503 + }, + { + "auxiliary_loss_clip": 0.01085592, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.03952289, + "balance_loss_mlp": 1.0263567, + "epoch": 0.5492259131219, + "flos": 50476432746240.0, + "grad_norm": 2.156005038881863, + "language_loss": 0.61240542, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.63366163, + "num_input_tokens_seen": 196813785, + "step": 9135, + "time_per_iteration": 2.911567449569702 + }, + { + "auxiliary_loss_clip": 0.01112825, + "auxiliary_loss_mlp": 0.01036574, + "balance_loss_clip": 1.042696, + "balance_loss_mlp": 1.02169049, + "epoch": 0.5492860363745679, + "flos": 25119262995840.0, + "grad_norm": 1.746391133416305, + "language_loss": 0.72368252, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.74517649, + "num_input_tokens_seen": 196834390, + "step": 9136, + "time_per_iteration": 2.6732101440429688 + }, + { + "auxiliary_loss_clip": 0.01060281, + "auxiliary_loss_mlp": 0.0104408, + "balance_loss_clip": 1.03961897, + "balance_loss_mlp": 1.02839267, + "epoch": 0.5493461596272359, + "flos": 22633777497600.0, + "grad_norm": 2.424259272269788, + "language_loss": 0.68256485, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.70360851, + "num_input_tokens_seen": 196853290, + "step": 9137, + "time_per_iteration": 2.7947540283203125 + }, + { + "auxiliary_loss_clip": 0.01030828, + "auxiliary_loss_mlp": 0.01011299, + "balance_loss_clip": 1.01489806, + "balance_loss_mlp": 1.00992203, + "epoch": 0.5494062828799038, + "flos": 66151800754560.0, + "grad_norm": 0.7420439748923869, + "language_loss": 0.65270352, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.67312479, + "num_input_tokens_seen": 196913120, + "step": 9138, + "time_per_iteration": 3.2256250381469727 + }, + { + "auxiliary_loss_clip": 0.0111256, + "auxiliary_loss_mlp": 0.01032689, + "balance_loss_clip": 1.04488194, + "balance_loss_mlp": 1.01902211, + "epoch": 0.5494664061325718, + "flos": 21105958458240.0, + "grad_norm": 1.8659950166851553, + "language_loss": 0.75243253, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.77388501, + "num_input_tokens_seen": 196931530, + "step": 9139, + "time_per_iteration": 2.7239251136779785 + }, + { + "auxiliary_loss_clip": 0.01110681, + "auxiliary_loss_mlp": 0.01033207, + "balance_loss_clip": 1.04175556, + "balance_loss_mlp": 1.01932561, + "epoch": 0.5495265293852397, + "flos": 14392566616320.0, + "grad_norm": 1.6260992267363037, + "language_loss": 0.70765269, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.72909158, + "num_input_tokens_seen": 196949430, + "step": 9140, + "time_per_iteration": 2.647174119949341 + }, + { + "auxiliary_loss_clip": 0.01090583, + "auxiliary_loss_mlp": 0.01036785, + "balance_loss_clip": 1.03731537, + "balance_loss_mlp": 1.02207434, + "epoch": 0.5495866526379077, + "flos": 25549148966400.0, + "grad_norm": 1.8985191424105816, + "language_loss": 0.7687242, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.78999794, + "num_input_tokens_seen": 196968265, + "step": 9141, + "time_per_iteration": 2.65411639213562 + }, + { + "auxiliary_loss_clip": 0.01084812, + "auxiliary_loss_mlp": 0.01036963, + "balance_loss_clip": 1.0427072, + "balance_loss_mlp": 1.02342081, + "epoch": 0.5496467758905756, + "flos": 21317256213120.0, + "grad_norm": 2.1277262842794697, + "language_loss": 0.7463578, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.7675755, + "num_input_tokens_seen": 196984930, + "step": 9142, + "time_per_iteration": 2.7200329303741455 + }, + { + "auxiliary_loss_clip": 0.01098795, + "auxiliary_loss_mlp": 0.01036884, + "balance_loss_clip": 1.04416585, + "balance_loss_mlp": 1.02186954, + "epoch": 0.5497068991432437, + "flos": 22233086305920.0, + "grad_norm": 5.155975597587774, + "language_loss": 0.7661894, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.78754616, + "num_input_tokens_seen": 197002320, + "step": 9143, + "time_per_iteration": 2.6951520442962646 + }, + { + "auxiliary_loss_clip": 0.01091779, + "auxiliary_loss_mlp": 0.01037521, + "balance_loss_clip": 1.03912258, + "balance_loss_mlp": 1.02332926, + "epoch": 0.5497670223959116, + "flos": 18479093028480.0, + "grad_norm": 2.8186227807908466, + "language_loss": 0.79572552, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.81701857, + "num_input_tokens_seen": 197020825, + "step": 9144, + "time_per_iteration": 2.661098003387451 + }, + { + "auxiliary_loss_clip": 0.01112844, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.04339552, + "balance_loss_mlp": 1.02086163, + "epoch": 0.5498271456485796, + "flos": 29205107049600.0, + "grad_norm": 1.6865855857111283, + "language_loss": 0.70998669, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.731462, + "num_input_tokens_seen": 197040450, + "step": 9145, + "time_per_iteration": 2.6857175827026367 + }, + { + "auxiliary_loss_clip": 0.01109884, + "auxiliary_loss_mlp": 0.01033378, + "balance_loss_clip": 1.04158354, + "balance_loss_mlp": 1.02063489, + "epoch": 0.5498872689012475, + "flos": 34824372664320.0, + "grad_norm": 1.7292068512536125, + "language_loss": 0.70875257, + "learning_rate": 1.774398678985076e-06, + "loss": 0.73018515, + "num_input_tokens_seen": 197063930, + "step": 9146, + "time_per_iteration": 2.7719805240631104 + }, + { + "auxiliary_loss_clip": 0.01096176, + "auxiliary_loss_mlp": 0.01029792, + "balance_loss_clip": 1.04054928, + "balance_loss_mlp": 1.01708448, + "epoch": 0.5499473921539155, + "flos": 25921938268800.0, + "grad_norm": 1.7336366982972622, + "language_loss": 0.63770372, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.65896338, + "num_input_tokens_seen": 197082660, + "step": 9147, + "time_per_iteration": 2.6603379249572754 + }, + { + "auxiliary_loss_clip": 0.01125139, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.04582083, + "balance_loss_mlp": 1.01920164, + "epoch": 0.5500075154065835, + "flos": 22273701609600.0, + "grad_norm": 2.1607061922348088, + "language_loss": 0.81009579, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.8316772, + "num_input_tokens_seen": 197100675, + "step": 9148, + "time_per_iteration": 2.620183229446411 + }, + { + "auxiliary_loss_clip": 0.01101315, + "auxiliary_loss_mlp": 0.01039357, + "balance_loss_clip": 1.04367983, + "balance_loss_mlp": 1.02550507, + "epoch": 0.5500676386592515, + "flos": 28037507552640.0, + "grad_norm": 1.7340881050910257, + "language_loss": 0.79154336, + "learning_rate": 1.773237789559453e-06, + "loss": 0.81295007, + "num_input_tokens_seen": 197121320, + "step": 9149, + "time_per_iteration": 2.734495162963867 + }, + { + "auxiliary_loss_clip": 0.01082615, + "auxiliary_loss_mlp": 0.0102795, + "balance_loss_clip": 1.0412097, + "balance_loss_mlp": 1.01476002, + "epoch": 0.5501277619119195, + "flos": 23914819123200.0, + "grad_norm": 4.0693062888880185, + "language_loss": 0.72006851, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.74117416, + "num_input_tokens_seen": 197138965, + "step": 9150, + "time_per_iteration": 2.66481876373291 + }, + { + "auxiliary_loss_clip": 0.01099742, + "auxiliary_loss_mlp": 0.01033518, + "balance_loss_clip": 1.03804266, + "balance_loss_mlp": 1.0189085, + "epoch": 0.5501878851645874, + "flos": 20923783655040.0, + "grad_norm": 3.1249847499070014, + "language_loss": 0.75043446, + "learning_rate": 1.772463906245477e-06, + "loss": 0.77176708, + "num_input_tokens_seen": 197156460, + "step": 9151, + "time_per_iteration": 2.704946517944336 + }, + { + "auxiliary_loss_clip": 0.0109205, + "auxiliary_loss_mlp": 0.01033656, + "balance_loss_clip": 1.03899741, + "balance_loss_mlp": 1.01981556, + "epoch": 0.5502480084172554, + "flos": 20665298407680.0, + "grad_norm": 2.3903222148465035, + "language_loss": 0.76302028, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.78427732, + "num_input_tokens_seen": 197175140, + "step": 9152, + "time_per_iteration": 5.871058464050293 + }, + { + "auxiliary_loss_clip": 0.01098821, + "auxiliary_loss_mlp": 0.01033709, + "balance_loss_clip": 1.04291546, + "balance_loss_mlp": 1.02058983, + "epoch": 0.5503081316699233, + "flos": 26432552056320.0, + "grad_norm": 1.865148989318078, + "language_loss": 0.82033801, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.84166336, + "num_input_tokens_seen": 197194345, + "step": 9153, + "time_per_iteration": 2.741382598876953 + }, + { + "auxiliary_loss_clip": 0.01110131, + "auxiliary_loss_mlp": 0.01029037, + "balance_loss_clip": 1.04423809, + "balance_loss_mlp": 1.01572764, + "epoch": 0.5503682549225913, + "flos": 30629144718720.0, + "grad_norm": 1.7497509025726563, + "language_loss": 0.74392802, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.76531971, + "num_input_tokens_seen": 197215535, + "step": 9154, + "time_per_iteration": 4.345115900039673 + }, + { + "auxiliary_loss_clip": 0.01104154, + "auxiliary_loss_mlp": 0.01039546, + "balance_loss_clip": 1.04041803, + "balance_loss_mlp": 1.02451348, + "epoch": 0.5504283781752592, + "flos": 22565439872640.0, + "grad_norm": 1.5994441828682415, + "language_loss": 0.73138744, + "learning_rate": 1.770916243273199e-06, + "loss": 0.75282443, + "num_input_tokens_seen": 197234945, + "step": 9155, + "time_per_iteration": 2.6851611137390137 + }, + { + "auxiliary_loss_clip": 0.01021957, + "auxiliary_loss_mlp": 0.01001594, + "balance_loss_clip": 1.01543474, + "balance_loss_mlp": 1.00016963, + "epoch": 0.5504885014279273, + "flos": 67901009270400.0, + "grad_norm": 0.7575867212346565, + "language_loss": 0.55399221, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.57422775, + "num_input_tokens_seen": 197302285, + "step": 9156, + "time_per_iteration": 3.300373077392578 + }, + { + "auxiliary_loss_clip": 0.0110824, + "auxiliary_loss_mlp": 0.01037205, + "balance_loss_clip": 1.03954601, + "balance_loss_mlp": 1.02354383, + "epoch": 0.5505486246805952, + "flos": 22450058409600.0, + "grad_norm": 1.7338338818713679, + "language_loss": 0.82676858, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.84822297, + "num_input_tokens_seen": 197321575, + "step": 9157, + "time_per_iteration": 4.260001182556152 + }, + { + "auxiliary_loss_clip": 0.01128779, + "auxiliary_loss_mlp": 0.01036261, + "balance_loss_clip": 1.04512608, + "balance_loss_mlp": 1.02101421, + "epoch": 0.5506087479332632, + "flos": 26906896085760.0, + "grad_norm": 2.1665568405651916, + "language_loss": 0.7574966, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.77914703, + "num_input_tokens_seen": 197340255, + "step": 9158, + "time_per_iteration": 2.634035587310791 + }, + { + "auxiliary_loss_clip": 0.01079995, + "auxiliary_loss_mlp": 0.01032346, + "balance_loss_clip": 1.04036868, + "balance_loss_mlp": 1.01919723, + "epoch": 0.5506688711859311, + "flos": 22930256355840.0, + "grad_norm": 1.7765349720842452, + "language_loss": 0.7011236, + "learning_rate": 1.769368719290979e-06, + "loss": 0.72224694, + "num_input_tokens_seen": 197360360, + "step": 9159, + "time_per_iteration": 2.765982151031494 + }, + { + "auxiliary_loss_clip": 0.01074937, + "auxiliary_loss_mlp": 0.00772606, + "balance_loss_clip": 1.03859997, + "balance_loss_mlp": 1.00024915, + "epoch": 0.5507289944385991, + "flos": 29606408772480.0, + "grad_norm": 1.5184177470515237, + "language_loss": 0.6844312, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.70290661, + "num_input_tokens_seen": 197381905, + "step": 9160, + "time_per_iteration": 2.7715611457824707 + }, + { + "auxiliary_loss_clip": 0.01121201, + "auxiliary_loss_mlp": 0.01036642, + "balance_loss_clip": 1.04361653, + "balance_loss_mlp": 1.02297473, + "epoch": 0.5507891176912671, + "flos": 15334431091200.0, + "grad_norm": 2.346039254378587, + "language_loss": 0.71789527, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.7394737, + "num_input_tokens_seen": 197398555, + "step": 9161, + "time_per_iteration": 2.641042470932007 + }, + { + "auxiliary_loss_clip": 0.01112875, + "auxiliary_loss_mlp": 0.01042589, + "balance_loss_clip": 1.04357731, + "balance_loss_mlp": 1.02879643, + "epoch": 0.5508492409439351, + "flos": 26578313447040.0, + "grad_norm": 1.6233913779896265, + "language_loss": 0.69443804, + "learning_rate": 1.768208168081359e-06, + "loss": 0.71599269, + "num_input_tokens_seen": 197419630, + "step": 9162, + "time_per_iteration": 2.693645715713501 + }, + { + "auxiliary_loss_clip": 0.01122811, + "auxiliary_loss_mlp": 0.01038789, + "balance_loss_clip": 1.04462349, + "balance_loss_mlp": 1.02506185, + "epoch": 0.5509093641966031, + "flos": 25443428261760.0, + "grad_norm": 1.863003505887403, + "language_loss": 0.85338551, + "learning_rate": 1.767821335237733e-06, + "loss": 0.87500155, + "num_input_tokens_seen": 197438480, + "step": 9163, + "time_per_iteration": 2.6538877487182617 + }, + { + "auxiliary_loss_clip": 0.01088872, + "auxiliary_loss_mlp": 0.01032282, + "balance_loss_clip": 1.04132617, + "balance_loss_mlp": 1.01908576, + "epoch": 0.550969487449271, + "flos": 18698543170560.0, + "grad_norm": 1.8611061255519936, + "language_loss": 0.80892253, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.83013415, + "num_input_tokens_seen": 197456755, + "step": 9164, + "time_per_iteration": 2.813016891479492 + }, + { + "auxiliary_loss_clip": 0.0110727, + "auxiliary_loss_mlp": 0.01033027, + "balance_loss_clip": 1.04617882, + "balance_loss_mlp": 1.01856649, + "epoch": 0.551029610701939, + "flos": 22708723224960.0, + "grad_norm": 1.8149479270660511, + "language_loss": 0.73350954, + "learning_rate": 1.767047695977863e-06, + "loss": 0.75491256, + "num_input_tokens_seen": 197475530, + "step": 9165, + "time_per_iteration": 2.6487855911254883 + }, + { + "auxiliary_loss_clip": 0.01103747, + "auxiliary_loss_mlp": 0.01041326, + "balance_loss_clip": 1.04083133, + "balance_loss_mlp": 1.02677011, + "epoch": 0.5510897339546069, + "flos": 12420496166400.0, + "grad_norm": 1.9553906281347788, + "language_loss": 0.78998721, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.8114379, + "num_input_tokens_seen": 197490835, + "step": 9166, + "time_per_iteration": 2.578125 + }, + { + "auxiliary_loss_clip": 0.01089384, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.03881669, + "balance_loss_mlp": 1.01822138, + "epoch": 0.5511498572072749, + "flos": 18770579896320.0, + "grad_norm": 2.156469581369372, + "language_loss": 0.76529676, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.78651255, + "num_input_tokens_seen": 197508770, + "step": 9167, + "time_per_iteration": 2.7045888900756836 + }, + { + "auxiliary_loss_clip": 0.01112145, + "auxiliary_loss_mlp": 0.01032029, + "balance_loss_clip": 1.04281187, + "balance_loss_mlp": 1.01811707, + "epoch": 0.5512099804599428, + "flos": 19573326996480.0, + "grad_norm": 2.0156954118398227, + "language_loss": 0.79765004, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.81909174, + "num_input_tokens_seen": 197527340, + "step": 9168, + "time_per_iteration": 2.669908046722412 + }, + { + "auxiliary_loss_clip": 0.0111534, + "auxiliary_loss_mlp": 0.0104239, + "balance_loss_clip": 1.04542589, + "balance_loss_mlp": 1.02830565, + "epoch": 0.5512701037126109, + "flos": 26245600744320.0, + "grad_norm": 1.6113858397633185, + "language_loss": 0.69293267, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.71450996, + "num_input_tokens_seen": 197547280, + "step": 9169, + "time_per_iteration": 2.70609450340271 + }, + { + "auxiliary_loss_clip": 0.01106964, + "auxiliary_loss_mlp": 0.01029287, + "balance_loss_clip": 1.04113257, + "balance_loss_mlp": 1.01710367, + "epoch": 0.5513302269652788, + "flos": 21945406279680.0, + "grad_norm": 1.9890616519308366, + "language_loss": 0.85510826, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.87647074, + "num_input_tokens_seen": 197565045, + "step": 9170, + "time_per_iteration": 2.670785427093506 + }, + { + "auxiliary_loss_clip": 0.01022762, + "auxiliary_loss_mlp": 0.01003909, + "balance_loss_clip": 1.02287233, + "balance_loss_mlp": 1.00240731, + "epoch": 0.5513903502179468, + "flos": 68235948616320.0, + "grad_norm": 0.7781167580815929, + "language_loss": 0.59840322, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.61866993, + "num_input_tokens_seen": 197625005, + "step": 9171, + "time_per_iteration": 3.2524025440216064 + }, + { + "auxiliary_loss_clip": 0.01085077, + "auxiliary_loss_mlp": 0.01041997, + "balance_loss_clip": 1.03855562, + "balance_loss_mlp": 1.02763844, + "epoch": 0.5514504734706147, + "flos": 18734238311040.0, + "grad_norm": 1.556060427891405, + "language_loss": 0.70670319, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.72797394, + "num_input_tokens_seen": 197645050, + "step": 9172, + "time_per_iteration": 2.708811044692993 + }, + { + "auxiliary_loss_clip": 0.01120195, + "auxiliary_loss_mlp": 0.01038202, + "balance_loss_clip": 1.04229403, + "balance_loss_mlp": 1.02470756, + "epoch": 0.5515105967232827, + "flos": 22270972176000.0, + "grad_norm": 1.7490660409709138, + "language_loss": 0.75727642, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.77886033, + "num_input_tokens_seen": 197663910, + "step": 9173, + "time_per_iteration": 2.6022469997406006 + }, + { + "auxiliary_loss_clip": 0.01083041, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.04071558, + "balance_loss_mlp": 1.02043712, + "epoch": 0.5515707199759508, + "flos": 22557682535040.0, + "grad_norm": 1.9060639151270278, + "language_loss": 0.75156957, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.77273941, + "num_input_tokens_seen": 197681580, + "step": 9174, + "time_per_iteration": 2.758668899536133 + }, + { + "auxiliary_loss_clip": 0.01102936, + "auxiliary_loss_mlp": 0.01034738, + "balance_loss_clip": 1.0414834, + "balance_loss_mlp": 1.02056456, + "epoch": 0.5516308432286187, + "flos": 28291072636800.0, + "grad_norm": 2.209520073538634, + "language_loss": 0.72830188, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.74967873, + "num_input_tokens_seen": 197702095, + "step": 9175, + "time_per_iteration": 2.6674885749816895 + }, + { + "auxiliary_loss_clip": 0.01112767, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.04439914, + "balance_loss_mlp": 1.02324057, + "epoch": 0.5516909664812867, + "flos": 18764474584320.0, + "grad_norm": 1.7828415192194789, + "language_loss": 0.69321132, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.71470201, + "num_input_tokens_seen": 197720720, + "step": 9176, + "time_per_iteration": 2.721855878829956 + }, + { + "auxiliary_loss_clip": 0.01112205, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.04404604, + "balance_loss_mlp": 1.02004051, + "epoch": 0.5517510897339546, + "flos": 27740346336000.0, + "grad_norm": 1.6320384621008008, + "language_loss": 0.70890021, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.73034984, + "num_input_tokens_seen": 197741820, + "step": 9177, + "time_per_iteration": 2.6951122283935547 + }, + { + "auxiliary_loss_clip": 0.01111799, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.0442878, + "balance_loss_mlp": 1.01811981, + "epoch": 0.5518112129866226, + "flos": 18404470523520.0, + "grad_norm": 1.5626252071778102, + "language_loss": 0.80647016, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.82790309, + "num_input_tokens_seen": 197759160, + "step": 9178, + "time_per_iteration": 2.6048829555511475 + }, + { + "auxiliary_loss_clip": 0.01063405, + "auxiliary_loss_mlp": 0.01046955, + "balance_loss_clip": 1.04167509, + "balance_loss_mlp": 1.03129053, + "epoch": 0.5518713362392905, + "flos": 25082670015360.0, + "grad_norm": 2.211793529411812, + "language_loss": 0.7505163, + "learning_rate": 1.761633217089826e-06, + "loss": 0.77161986, + "num_input_tokens_seen": 197779760, + "step": 9179, + "time_per_iteration": 2.808234453201294 + }, + { + "auxiliary_loss_clip": 0.01114825, + "auxiliary_loss_mlp": 0.0104374, + "balance_loss_clip": 1.04556203, + "balance_loss_mlp": 1.02984655, + "epoch": 0.5519314594919585, + "flos": 36538999361280.0, + "grad_norm": 1.9934221112233521, + "language_loss": 0.7009306, + "learning_rate": 1.761246535912924e-06, + "loss": 0.7225163, + "num_input_tokens_seen": 197801545, + "step": 9180, + "time_per_iteration": 2.788222551345825 + }, + { + "auxiliary_loss_clip": 0.01106377, + "auxiliary_loss_mlp": 0.01041353, + "balance_loss_clip": 1.0398531, + "balance_loss_mlp": 1.02672613, + "epoch": 0.5519915827446265, + "flos": 20448613612800.0, + "grad_norm": 1.9005454733047327, + "language_loss": 0.67093515, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.69241244, + "num_input_tokens_seen": 197820760, + "step": 9181, + "time_per_iteration": 2.7013533115386963 + }, + { + "auxiliary_loss_clip": 0.01126813, + "auxiliary_loss_mlp": 0.0103403, + "balance_loss_clip": 1.0449146, + "balance_loss_mlp": 1.02041602, + "epoch": 0.5520517059972945, + "flos": 23768052151680.0, + "grad_norm": 2.0355295280850347, + "language_loss": 0.79382825, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.8154366, + "num_input_tokens_seen": 197840195, + "step": 9182, + "time_per_iteration": 2.6580309867858887 + }, + { + "auxiliary_loss_clip": 0.0108505, + "auxiliary_loss_mlp": 0.01029722, + "balance_loss_clip": 1.0405935, + "balance_loss_mlp": 1.01576233, + "epoch": 0.5521118292499624, + "flos": 22196457411840.0, + "grad_norm": 2.3123904881057524, + "language_loss": 0.83006704, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.85121477, + "num_input_tokens_seen": 197859475, + "step": 9183, + "time_per_iteration": 2.744466543197632 + }, + { + "auxiliary_loss_clip": 0.01100335, + "auxiliary_loss_mlp": 0.01028792, + "balance_loss_clip": 1.0419153, + "balance_loss_mlp": 1.01544046, + "epoch": 0.5521719525026304, + "flos": 23583291569280.0, + "grad_norm": 1.2881660479793424, + "language_loss": 0.67605364, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.6973449, + "num_input_tokens_seen": 197879395, + "step": 9184, + "time_per_iteration": 2.6846580505371094 + }, + { + "auxiliary_loss_clip": 0.01110729, + "auxiliary_loss_mlp": 0.01028759, + "balance_loss_clip": 1.04261684, + "balance_loss_mlp": 1.01442409, + "epoch": 0.5522320757552983, + "flos": 26137617482880.0, + "grad_norm": 1.486667996359971, + "language_loss": 0.76359147, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.78498632, + "num_input_tokens_seen": 197900815, + "step": 9185, + "time_per_iteration": 2.6278598308563232 + }, + { + "auxiliary_loss_clip": 0.01084681, + "auxiliary_loss_mlp": 0.01041899, + "balance_loss_clip": 1.04073203, + "balance_loss_mlp": 1.02742732, + "epoch": 0.5522921990079663, + "flos": 24676160820480.0, + "grad_norm": 1.6270174778631188, + "language_loss": 0.74294305, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.76420891, + "num_input_tokens_seen": 197918985, + "step": 9186, + "time_per_iteration": 2.7178421020507812 + }, + { + "auxiliary_loss_clip": 0.01094897, + "auxiliary_loss_mlp": 0.0103984, + "balance_loss_clip": 1.04445529, + "balance_loss_mlp": 1.02626204, + "epoch": 0.5523523222606344, + "flos": 22748153379840.0, + "grad_norm": 2.1270117067296725, + "language_loss": 0.66701925, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.68836665, + "num_input_tokens_seen": 197937725, + "step": 9187, + "time_per_iteration": 2.7278029918670654 + }, + { + "auxiliary_loss_clip": 0.01101824, + "auxiliary_loss_mlp": 0.01034427, + "balance_loss_clip": 1.04459238, + "balance_loss_mlp": 1.02054477, + "epoch": 0.5524124455133023, + "flos": 19755825022080.0, + "grad_norm": 1.575939713951601, + "language_loss": 0.7774123, + "learning_rate": 1.758153413657318e-06, + "loss": 0.79877484, + "num_input_tokens_seen": 197955635, + "step": 9188, + "time_per_iteration": 2.753506660461426 + }, + { + "auxiliary_loss_clip": 0.01095705, + "auxiliary_loss_mlp": 0.01031864, + "balance_loss_clip": 1.04053175, + "balance_loss_mlp": 1.01806509, + "epoch": 0.5524725687659703, + "flos": 23294821443840.0, + "grad_norm": 1.82344252580878, + "language_loss": 0.81139189, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.83266759, + "num_input_tokens_seen": 197974490, + "step": 9189, + "time_per_iteration": 2.7089128494262695 + }, + { + "auxiliary_loss_clip": 0.01104025, + "auxiliary_loss_mlp": 0.00770543, + "balance_loss_clip": 1.04259682, + "balance_loss_mlp": 1.00024211, + "epoch": 0.5525326920186382, + "flos": 24862178378880.0, + "grad_norm": 1.4850448399521246, + "language_loss": 0.76478475, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.78353041, + "num_input_tokens_seen": 197995735, + "step": 9190, + "time_per_iteration": 2.611971855163574 + }, + { + "auxiliary_loss_clip": 0.01125599, + "auxiliary_loss_mlp": 0.01041501, + "balance_loss_clip": 1.04273391, + "balance_loss_mlp": 1.02648067, + "epoch": 0.5525928152713062, + "flos": 13735580906880.0, + "grad_norm": 2.4141637541410508, + "language_loss": 0.78987861, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.81154966, + "num_input_tokens_seen": 198009685, + "step": 9191, + "time_per_iteration": 2.545794725418091 + }, + { + "auxiliary_loss_clip": 0.01050104, + "auxiliary_loss_mlp": 0.01035439, + "balance_loss_clip": 1.03439641, + "balance_loss_mlp": 1.02133703, + "epoch": 0.5526529385239741, + "flos": 13071592045440.0, + "grad_norm": 2.484462687188894, + "language_loss": 0.68966973, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.71052521, + "num_input_tokens_seen": 198026845, + "step": 9192, + "time_per_iteration": 6.08718204498291 + }, + { + "auxiliary_loss_clip": 0.01110861, + "auxiliary_loss_mlp": 0.01035933, + "balance_loss_clip": 1.04424548, + "balance_loss_mlp": 1.02356553, + "epoch": 0.5527130617766421, + "flos": 23148377694720.0, + "grad_norm": 1.4810056841060688, + "language_loss": 0.77680272, + "learning_rate": 1.756220509823588e-06, + "loss": 0.7982707, + "num_input_tokens_seen": 198045275, + "step": 9193, + "time_per_iteration": 4.1960039138793945 + }, + { + "auxiliary_loss_clip": 0.01083568, + "auxiliary_loss_mlp": 0.01034731, + "balance_loss_clip": 1.03722787, + "balance_loss_mlp": 1.02139795, + "epoch": 0.55277318502931, + "flos": 21285547482240.0, + "grad_norm": 1.4323494490195217, + "language_loss": 0.78473246, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.80591547, + "num_input_tokens_seen": 198065760, + "step": 9194, + "time_per_iteration": 2.730219841003418 + }, + { + "auxiliary_loss_clip": 0.01089289, + "auxiliary_loss_mlp": 0.01036551, + "balance_loss_clip": 1.04286909, + "balance_loss_mlp": 1.02309823, + "epoch": 0.5528333082819781, + "flos": 38324549462400.0, + "grad_norm": 2.5114324389353224, + "language_loss": 0.69563878, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.71689719, + "num_input_tokens_seen": 198087595, + "step": 9195, + "time_per_iteration": 2.898447275161743 + }, + { + "auxiliary_loss_clip": 0.01107137, + "auxiliary_loss_mlp": 0.01036404, + "balance_loss_clip": 1.04293728, + "balance_loss_mlp": 1.02215791, + "epoch": 0.552893431534646, + "flos": 13553621585280.0, + "grad_norm": 1.952206040801574, + "language_loss": 0.74276292, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.76419842, + "num_input_tokens_seen": 198104620, + "step": 9196, + "time_per_iteration": 2.775261878967285 + }, + { + "auxiliary_loss_clip": 0.01105394, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.04212689, + "balance_loss_mlp": 1.02461457, + "epoch": 0.552953554787314, + "flos": 21939408708480.0, + "grad_norm": 2.1600616911977384, + "language_loss": 0.76948142, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.79092181, + "num_input_tokens_seen": 198123565, + "step": 9197, + "time_per_iteration": 4.16440224647522 + }, + { + "auxiliary_loss_clip": 0.01097995, + "auxiliary_loss_mlp": 0.01032629, + "balance_loss_clip": 1.03984201, + "balance_loss_mlp": 1.01995707, + "epoch": 0.5530136780399819, + "flos": 43658002558080.0, + "grad_norm": 1.6850679441105894, + "language_loss": 0.76054031, + "learning_rate": 1.754287837093407e-06, + "loss": 0.78184652, + "num_input_tokens_seen": 198148270, + "step": 9198, + "time_per_iteration": 2.950439453125 + }, + { + "auxiliary_loss_clip": 0.01119177, + "auxiliary_loss_mlp": 0.01029802, + "balance_loss_clip": 1.04138994, + "balance_loss_mlp": 1.01700497, + "epoch": 0.5530738012926499, + "flos": 25045502417280.0, + "grad_norm": 1.499755291272354, + "language_loss": 0.79495585, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.81644565, + "num_input_tokens_seen": 198168810, + "step": 9199, + "time_per_iteration": 2.619361162185669 + }, + { + "auxiliary_loss_clip": 0.01078304, + "auxiliary_loss_mlp": 0.01039784, + "balance_loss_clip": 1.03867352, + "balance_loss_mlp": 1.02552032, + "epoch": 0.553133924545318, + "flos": 16472081623680.0, + "grad_norm": 1.9832278976810611, + "language_loss": 0.63797927, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.65916014, + "num_input_tokens_seen": 198186200, + "step": 9200, + "time_per_iteration": 2.6335854530334473 + }, + { + "auxiliary_loss_clip": 0.01102034, + "auxiliary_loss_mlp": 0.01033407, + "balance_loss_clip": 1.04273176, + "balance_loss_mlp": 1.01869619, + "epoch": 0.5531940477979859, + "flos": 24606207083520.0, + "grad_norm": 1.4982382349332672, + "language_loss": 0.66065866, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.68201303, + "num_input_tokens_seen": 198207050, + "step": 9201, + "time_per_iteration": 2.7522671222686768 + }, + { + "auxiliary_loss_clip": 0.01108187, + "auxiliary_loss_mlp": 0.01034798, + "balance_loss_clip": 1.04183888, + "balance_loss_mlp": 1.02056432, + "epoch": 0.5532541710506539, + "flos": 22159577122560.0, + "grad_norm": 1.9333851468305103, + "language_loss": 0.61028016, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.63171005, + "num_input_tokens_seen": 198224565, + "step": 9202, + "time_per_iteration": 2.6281580924987793 + }, + { + "auxiliary_loss_clip": 0.0110847, + "auxiliary_loss_mlp": 0.00770781, + "balance_loss_clip": 1.0422498, + "balance_loss_mlp": 1.00029778, + "epoch": 0.5533142943033218, + "flos": 21397265758080.0, + "grad_norm": 1.7184873612817428, + "language_loss": 0.64222115, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.66101366, + "num_input_tokens_seen": 198244790, + "step": 9203, + "time_per_iteration": 2.6509506702423096 + }, + { + "auxiliary_loss_clip": 0.01108951, + "auxiliary_loss_mlp": 0.0103371, + "balance_loss_clip": 1.04175293, + "balance_loss_mlp": 1.02028739, + "epoch": 0.5533744175559898, + "flos": 23550541344000.0, + "grad_norm": 1.4819756399271273, + "language_loss": 0.63615203, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.65757859, + "num_input_tokens_seen": 198264375, + "step": 9204, + "time_per_iteration": 2.7008473873138428 + }, + { + "auxiliary_loss_clip": 0.01106611, + "auxiliary_loss_mlp": 0.01030715, + "balance_loss_clip": 1.04070532, + "balance_loss_mlp": 1.0184958, + "epoch": 0.5534345408086577, + "flos": 24061514267520.0, + "grad_norm": 1.5985992235632864, + "language_loss": 0.77158082, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.79295409, + "num_input_tokens_seen": 198283895, + "step": 9205, + "time_per_iteration": 2.6544225215911865 + }, + { + "auxiliary_loss_clip": 0.01059768, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.03511405, + "balance_loss_mlp": 1.02576268, + "epoch": 0.5534946640613257, + "flos": 33771831408000.0, + "grad_norm": 1.4391383519913163, + "language_loss": 0.72826385, + "learning_rate": 1.751196045993537e-06, + "loss": 0.74925232, + "num_input_tokens_seen": 198310035, + "step": 9206, + "time_per_iteration": 2.832268476486206 + }, + { + "auxiliary_loss_clip": 0.01073531, + "auxiliary_loss_mlp": 0.01034139, + "balance_loss_clip": 1.03840923, + "balance_loss_mlp": 1.0208354, + "epoch": 0.5535547873139937, + "flos": 15159223526400.0, + "grad_norm": 2.230271879861814, + "language_loss": 0.75639313, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.77746987, + "num_input_tokens_seen": 198327810, + "step": 9207, + "time_per_iteration": 2.7088775634765625 + }, + { + "auxiliary_loss_clip": 0.01088202, + "auxiliary_loss_mlp": 0.01033804, + "balance_loss_clip": 1.0419991, + "balance_loss_mlp": 1.02010703, + "epoch": 0.5536149105666617, + "flos": 16980863817600.0, + "grad_norm": 71.24671792095333, + "language_loss": 0.61898887, + "learning_rate": 1.750423192272189e-06, + "loss": 0.6402089, + "num_input_tokens_seen": 198343150, + "step": 9208, + "time_per_iteration": 2.749739646911621 + }, + { + "auxiliary_loss_clip": 0.01123136, + "auxiliary_loss_mlp": 0.0103585, + "balance_loss_clip": 1.04367232, + "balance_loss_mlp": 1.02285004, + "epoch": 0.5536750338193296, + "flos": 18149935772160.0, + "grad_norm": 2.006267106077657, + "language_loss": 0.64258868, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.66417855, + "num_input_tokens_seen": 198360925, + "step": 9209, + "time_per_iteration": 2.6854724884033203 + }, + { + "auxiliary_loss_clip": 0.01084442, + "auxiliary_loss_mlp": 0.0104196, + "balance_loss_clip": 1.03969955, + "balance_loss_mlp": 1.02729774, + "epoch": 0.5537351570719976, + "flos": 22747794243840.0, + "grad_norm": 1.8841222831412607, + "language_loss": 0.82470959, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.84597361, + "num_input_tokens_seen": 198379265, + "step": 9210, + "time_per_iteration": 2.746532917022705 + }, + { + "auxiliary_loss_clip": 0.01098481, + "auxiliary_loss_mlp": 0.01029278, + "balance_loss_clip": 1.04068804, + "balance_loss_mlp": 1.016523, + "epoch": 0.5537952803246655, + "flos": 26356026130560.0, + "grad_norm": 1.6369703268884894, + "language_loss": 0.72731483, + "learning_rate": 1.74926398270663e-06, + "loss": 0.74859238, + "num_input_tokens_seen": 198399490, + "step": 9211, + "time_per_iteration": 2.767152786254883 + }, + { + "auxiliary_loss_clip": 0.01089972, + "auxiliary_loss_mlp": 0.01037477, + "balance_loss_clip": 1.03941226, + "balance_loss_mlp": 1.02259946, + "epoch": 0.5538554035773335, + "flos": 18037427397120.0, + "grad_norm": 1.965979716525238, + "language_loss": 0.6684767, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.68975115, + "num_input_tokens_seen": 198419110, + "step": 9212, + "time_per_iteration": 2.6946139335632324 + }, + { + "auxiliary_loss_clip": 0.01092654, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.04305434, + "balance_loss_mlp": 1.01557696, + "epoch": 0.5539155268300014, + "flos": 31686247002240.0, + "grad_norm": 1.403594998374367, + "language_loss": 0.51636183, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.53758979, + "num_input_tokens_seen": 198441360, + "step": 9213, + "time_per_iteration": 2.7821476459503174 + }, + { + "auxiliary_loss_clip": 0.01092111, + "auxiliary_loss_mlp": 0.01030874, + "balance_loss_clip": 1.04350758, + "balance_loss_mlp": 1.01752245, + "epoch": 0.5539756500826695, + "flos": 15193769431680.0, + "grad_norm": 3.6308307245288214, + "language_loss": 0.86044586, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.88167566, + "num_input_tokens_seen": 198459835, + "step": 9214, + "time_per_iteration": 2.7264554500579834 + }, + { + "auxiliary_loss_clip": 0.01110148, + "auxiliary_loss_mlp": 0.01032811, + "balance_loss_clip": 1.04324055, + "balance_loss_mlp": 1.02003813, + "epoch": 0.5540357733353375, + "flos": 26353117128960.0, + "grad_norm": 2.235553679927881, + "language_loss": 0.70002753, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.72145712, + "num_input_tokens_seen": 198478955, + "step": 9215, + "time_per_iteration": 2.684901714324951 + }, + { + "auxiliary_loss_clip": 0.01093255, + "auxiliary_loss_mlp": 0.0103064, + "balance_loss_clip": 1.03972387, + "balance_loss_mlp": 1.01641822, + "epoch": 0.5540958965880054, + "flos": 21323684747520.0, + "grad_norm": 1.5213166138329088, + "language_loss": 0.73443544, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.75567436, + "num_input_tokens_seen": 198499030, + "step": 9216, + "time_per_iteration": 2.6930174827575684 + }, + { + "auxiliary_loss_clip": 0.01095704, + "auxiliary_loss_mlp": 0.0103884, + "balance_loss_clip": 1.04206526, + "balance_loss_mlp": 1.02541757, + "epoch": 0.5541560198406734, + "flos": 25666828899840.0, + "grad_norm": 1.8909182551573178, + "language_loss": 0.71728694, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.73863238, + "num_input_tokens_seen": 198520265, + "step": 9217, + "time_per_iteration": 2.705566644668579 + }, + { + "auxiliary_loss_clip": 0.01102416, + "auxiliary_loss_mlp": 0.01027899, + "balance_loss_clip": 1.04219627, + "balance_loss_mlp": 1.01496446, + "epoch": 0.5542161430933413, + "flos": 21939624190080.0, + "grad_norm": 1.8150794810366015, + "language_loss": 0.78261054, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.80391365, + "num_input_tokens_seen": 198539645, + "step": 9218, + "time_per_iteration": 2.6569690704345703 + }, + { + "auxiliary_loss_clip": 0.01077956, + "auxiliary_loss_mlp": 0.01037085, + "balance_loss_clip": 1.03790164, + "balance_loss_mlp": 1.02161169, + "epoch": 0.5542762663460093, + "flos": 19571459489280.0, + "grad_norm": 1.6224660724744044, + "language_loss": 0.72173905, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.74288952, + "num_input_tokens_seen": 198558710, + "step": 9219, + "time_per_iteration": 2.685511350631714 + }, + { + "auxiliary_loss_clip": 0.01108862, + "auxiliary_loss_mlp": 0.01039965, + "balance_loss_clip": 1.04482341, + "balance_loss_mlp": 1.0262028, + "epoch": 0.5543363895986773, + "flos": 19499063627520.0, + "grad_norm": 1.5105706382424104, + "language_loss": 0.71297967, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.73446798, + "num_input_tokens_seen": 198577050, + "step": 9220, + "time_per_iteration": 2.6306073665618896 + }, + { + "auxiliary_loss_clip": 0.01120811, + "auxiliary_loss_mlp": 0.01026848, + "balance_loss_clip": 1.04381871, + "balance_loss_mlp": 1.01429546, + "epoch": 0.5543965128513453, + "flos": 22635609091200.0, + "grad_norm": 1.6307293256223026, + "language_loss": 0.79449409, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.81597066, + "num_input_tokens_seen": 198595290, + "step": 9221, + "time_per_iteration": 2.664358139038086 + }, + { + "auxiliary_loss_clip": 0.01090389, + "auxiliary_loss_mlp": 0.01034475, + "balance_loss_clip": 1.04653525, + "balance_loss_mlp": 1.02108812, + "epoch": 0.5544566361040132, + "flos": 25989952671360.0, + "grad_norm": 1.9685503329730023, + "language_loss": 0.83722961, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.85847831, + "num_input_tokens_seen": 198614110, + "step": 9222, + "time_per_iteration": 2.770050048828125 + }, + { + "auxiliary_loss_clip": 0.01100221, + "auxiliary_loss_mlp": 0.00771629, + "balance_loss_clip": 1.04789209, + "balance_loss_mlp": 1.00036037, + "epoch": 0.5545167593566812, + "flos": 28257568225920.0, + "grad_norm": 1.9185335813275248, + "language_loss": 0.75431746, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.773036, + "num_input_tokens_seen": 198633880, + "step": 9223, + "time_per_iteration": 2.794182062149048 + }, + { + "auxiliary_loss_clip": 0.01091289, + "auxiliary_loss_mlp": 0.0103417, + "balance_loss_clip": 1.04017019, + "balance_loss_mlp": 1.0201149, + "epoch": 0.5545768826093491, + "flos": 28476551491200.0, + "grad_norm": 1.614917501509061, + "language_loss": 0.82090491, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.84215945, + "num_input_tokens_seen": 198653505, + "step": 9224, + "time_per_iteration": 2.7137935161590576 + }, + { + "auxiliary_loss_clip": 0.01108448, + "auxiliary_loss_mlp": 0.01043106, + "balance_loss_clip": 1.04417324, + "balance_loss_mlp": 1.02924204, + "epoch": 0.5546370058620171, + "flos": 18478051534080.0, + "grad_norm": 1.7607532408743478, + "language_loss": 0.57043874, + "learning_rate": 1.743855475904141e-06, + "loss": 0.59195429, + "num_input_tokens_seen": 198671890, + "step": 9225, + "time_per_iteration": 2.616447687149048 + }, + { + "auxiliary_loss_clip": 0.01112997, + "auxiliary_loss_mlp": 0.01038411, + "balance_loss_clip": 1.04317498, + "balance_loss_mlp": 1.02444005, + "epoch": 0.554697129114685, + "flos": 22930507751040.0, + "grad_norm": 1.6222452828903178, + "language_loss": 0.67458808, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.69610214, + "num_input_tokens_seen": 198691995, + "step": 9226, + "time_per_iteration": 2.663339138031006 + }, + { + "auxiliary_loss_clip": 0.0108551, + "auxiliary_loss_mlp": 0.0103467, + "balance_loss_clip": 1.03901601, + "balance_loss_mlp": 1.02121162, + "epoch": 0.5547572523673531, + "flos": 21797166850560.0, + "grad_norm": 1.6061917148762987, + "language_loss": 0.74387592, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.76507771, + "num_input_tokens_seen": 198712440, + "step": 9227, + "time_per_iteration": 2.762258529663086 + }, + { + "auxiliary_loss_clip": 0.01087938, + "auxiliary_loss_mlp": 0.01034743, + "balance_loss_clip": 1.04223549, + "balance_loss_mlp": 1.02071249, + "epoch": 0.5548173756200211, + "flos": 22342829333760.0, + "grad_norm": 1.8589261758591291, + "language_loss": 0.73263627, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.7538631, + "num_input_tokens_seen": 198731515, + "step": 9228, + "time_per_iteration": 2.762092113494873 + }, + { + "auxiliary_loss_clip": 0.01122414, + "auxiliary_loss_mlp": 0.01031991, + "balance_loss_clip": 1.04351175, + "balance_loss_mlp": 1.01886559, + "epoch": 0.554877498872689, + "flos": 17858736213120.0, + "grad_norm": 1.672332446894358, + "language_loss": 0.75519872, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.77674282, + "num_input_tokens_seen": 198749750, + "step": 9229, + "time_per_iteration": 2.6003267765045166 + }, + { + "auxiliary_loss_clip": 0.01110807, + "auxiliary_loss_mlp": 0.00772253, + "balance_loss_clip": 1.0439682, + "balance_loss_mlp": 1.00026536, + "epoch": 0.554937622125357, + "flos": 17238343484160.0, + "grad_norm": 1.7587828151966396, + "language_loss": 0.68663722, + "learning_rate": 1.741924325613172e-06, + "loss": 0.70546782, + "num_input_tokens_seen": 198768320, + "step": 9230, + "time_per_iteration": 2.6502435207366943 + }, + { + "auxiliary_loss_clip": 0.01078746, + "auxiliary_loss_mlp": 0.01039366, + "balance_loss_clip": 1.04407859, + "balance_loss_mlp": 1.02506709, + "epoch": 0.5549977453780249, + "flos": 25368087484800.0, + "grad_norm": 2.162588573655947, + "language_loss": 0.6800701, + "learning_rate": 1.741538124855163e-06, + "loss": 0.70125121, + "num_input_tokens_seen": 198787230, + "step": 9231, + "time_per_iteration": 4.46450400352478 + }, + { + "auxiliary_loss_clip": 0.01125233, + "auxiliary_loss_mlp": 0.01040313, + "balance_loss_clip": 1.04339528, + "balance_loss_mlp": 1.02537608, + "epoch": 0.555057868630693, + "flos": 25079114568960.0, + "grad_norm": 1.7058695185820383, + "language_loss": 0.78623915, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.80789459, + "num_input_tokens_seen": 198806720, + "step": 9232, + "time_per_iteration": 4.17819356918335 + }, + { + "auxiliary_loss_clip": 0.01077674, + "auxiliary_loss_mlp": 0.01038155, + "balance_loss_clip": 1.03794336, + "balance_loss_mlp": 1.02523899, + "epoch": 0.5551179918833609, + "flos": 26104220812800.0, + "grad_norm": 1.530027860156435, + "language_loss": 0.82512534, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.84628367, + "num_input_tokens_seen": 198826235, + "step": 9233, + "time_per_iteration": 2.7746078968048096 + }, + { + "auxiliary_loss_clip": 0.01108881, + "auxiliary_loss_mlp": 0.01040385, + "balance_loss_clip": 1.04062366, + "balance_loss_mlp": 1.02632475, + "epoch": 0.5551781151360289, + "flos": 19384759572480.0, + "grad_norm": 8.113354085779601, + "language_loss": 0.74638891, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.76788163, + "num_input_tokens_seen": 198842655, + "step": 9234, + "time_per_iteration": 2.6174590587615967 + }, + { + "auxiliary_loss_clip": 0.01094953, + "auxiliary_loss_mlp": 0.01029345, + "balance_loss_clip": 1.03896558, + "balance_loss_mlp": 1.01647031, + "epoch": 0.5552382383886968, + "flos": 21725956137600.0, + "grad_norm": 4.639125305136136, + "language_loss": 0.64988184, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.67112482, + "num_input_tokens_seen": 198861210, + "step": 9235, + "time_per_iteration": 2.6820857524871826 + }, + { + "auxiliary_loss_clip": 0.0106692, + "auxiliary_loss_mlp": 0.01042767, + "balance_loss_clip": 1.03562975, + "balance_loss_mlp": 1.02793705, + "epoch": 0.5552983616413648, + "flos": 14356189117440.0, + "grad_norm": 1.66240052317675, + "language_loss": 0.67842531, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.69952214, + "num_input_tokens_seen": 198880045, + "step": 9236, + "time_per_iteration": 4.265462160110474 + }, + { + "auxiliary_loss_clip": 0.01116825, + "auxiliary_loss_mlp": 0.01028362, + "balance_loss_clip": 1.04261172, + "balance_loss_mlp": 1.01549888, + "epoch": 0.5553584848940327, + "flos": 25478548784640.0, + "grad_norm": 1.8489707966449562, + "language_loss": 0.86189765, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.88334954, + "num_input_tokens_seen": 198900210, + "step": 9237, + "time_per_iteration": 2.662736654281616 + }, + { + "auxiliary_loss_clip": 0.01108193, + "auxiliary_loss_mlp": 0.01037757, + "balance_loss_clip": 1.04178131, + "balance_loss_mlp": 1.02388716, + "epoch": 0.5554186081467007, + "flos": 22163850840960.0, + "grad_norm": 2.008755703666539, + "language_loss": 0.73663169, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.75809121, + "num_input_tokens_seen": 198919055, + "step": 9238, + "time_per_iteration": 2.6842122077941895 + }, + { + "auxiliary_loss_clip": 0.01105716, + "auxiliary_loss_mlp": 0.01031608, + "balance_loss_clip": 1.0387727, + "balance_loss_mlp": 1.01777411, + "epoch": 0.5554787313993687, + "flos": 49746656125440.0, + "grad_norm": 1.8187915692087442, + "language_loss": 0.78551757, + "learning_rate": 1.73844887285358e-06, + "loss": 0.80689085, + "num_input_tokens_seen": 198943505, + "step": 9239, + "time_per_iteration": 2.887911558151245 + }, + { + "auxiliary_loss_clip": 0.01106485, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.04819751, + "balance_loss_mlp": 1.01699483, + "epoch": 0.5555388546520367, + "flos": 22127365601280.0, + "grad_norm": 1.7617963791060023, + "language_loss": 0.8016845, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.82305664, + "num_input_tokens_seen": 198963590, + "step": 9240, + "time_per_iteration": 2.759277582168579 + }, + { + "auxiliary_loss_clip": 0.0109666, + "auxiliary_loss_mlp": 0.01034491, + "balance_loss_clip": 1.04089236, + "balance_loss_mlp": 1.02099013, + "epoch": 0.5555989779047047, + "flos": 24682122478080.0, + "grad_norm": 2.168471057936508, + "language_loss": 0.65255535, + "learning_rate": 1.737676658740786e-06, + "loss": 0.67386687, + "num_input_tokens_seen": 198982680, + "step": 9241, + "time_per_iteration": 2.7321317195892334 + }, + { + "auxiliary_loss_clip": 0.01110689, + "auxiliary_loss_mlp": 0.0077113, + "balance_loss_clip": 1.04320502, + "balance_loss_mlp": 1.00029731, + "epoch": 0.5556591011573726, + "flos": 16106510954880.0, + "grad_norm": 1.885035131778914, + "language_loss": 0.72406638, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.74288458, + "num_input_tokens_seen": 199000185, + "step": 9242, + "time_per_iteration": 2.6891591548919678 + }, + { + "auxiliary_loss_clip": 0.01106836, + "auxiliary_loss_mlp": 0.01034566, + "balance_loss_clip": 1.04584861, + "balance_loss_mlp": 1.02024293, + "epoch": 0.5557192244100406, + "flos": 12933695733120.0, + "grad_norm": 1.6675932055368092, + "language_loss": 0.64065903, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.66207308, + "num_input_tokens_seen": 199018380, + "step": 9243, + "time_per_iteration": 3.1710290908813477 + }, + { + "auxiliary_loss_clip": 0.01094198, + "auxiliary_loss_mlp": 0.00771105, + "balance_loss_clip": 1.04436445, + "balance_loss_mlp": 1.00027966, + "epoch": 0.5557793476627085, + "flos": 23111712887040.0, + "grad_norm": 2.6865994829235333, + "language_loss": 0.75548631, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.77413929, + "num_input_tokens_seen": 199037115, + "step": 9244, + "time_per_iteration": 2.686121940612793 + }, + { + "auxiliary_loss_clip": 0.01091692, + "auxiliary_loss_mlp": 0.01036173, + "balance_loss_clip": 1.03900838, + "balance_loss_mlp": 1.02352512, + "epoch": 0.5558394709153766, + "flos": 21428040735360.0, + "grad_norm": 2.0505810415857506, + "language_loss": 0.75051856, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.77179724, + "num_input_tokens_seen": 199053375, + "step": 9245, + "time_per_iteration": 2.6561057567596436 + }, + { + "auxiliary_loss_clip": 0.01099057, + "auxiliary_loss_mlp": 0.01034743, + "balance_loss_clip": 1.04262114, + "balance_loss_mlp": 1.02087283, + "epoch": 0.5558995941680445, + "flos": 25078324469760.0, + "grad_norm": 2.0581034442408055, + "language_loss": 0.79967058, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.82100856, + "num_input_tokens_seen": 199070930, + "step": 9246, + "time_per_iteration": 2.6968653202056885 + }, + { + "auxiliary_loss_clip": 0.01120892, + "auxiliary_loss_mlp": 0.01037931, + "balance_loss_clip": 1.0435034, + "balance_loss_mlp": 1.0241977, + "epoch": 0.5559597174207125, + "flos": 20011149872640.0, + "grad_norm": 1.8340386723611697, + "language_loss": 0.73825908, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.75984728, + "num_input_tokens_seen": 199088675, + "step": 9247, + "time_per_iteration": 2.5861082077026367 + }, + { + "auxiliary_loss_clip": 0.01091731, + "auxiliary_loss_mlp": 0.01035279, + "balance_loss_clip": 1.04089963, + "balance_loss_mlp": 1.0214448, + "epoch": 0.5560198406733804, + "flos": 16835677044480.0, + "grad_norm": 2.6765383510534324, + "language_loss": 0.74975288, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.77102304, + "num_input_tokens_seen": 199103075, + "step": 9248, + "time_per_iteration": 2.634092092514038 + }, + { + "auxiliary_loss_clip": 0.00999886, + "auxiliary_loss_mlp": 0.01011469, + "balance_loss_clip": 1.01177704, + "balance_loss_mlp": 1.00989556, + "epoch": 0.5560799639260484, + "flos": 70697051758080.0, + "grad_norm": 0.8462101410465201, + "language_loss": 0.59490269, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.61501622, + "num_input_tokens_seen": 199160325, + "step": 9249, + "time_per_iteration": 3.389267683029175 + }, + { + "auxiliary_loss_clip": 0.01118078, + "auxiliary_loss_mlp": 0.01029785, + "balance_loss_clip": 1.04007614, + "balance_loss_mlp": 1.01592088, + "epoch": 0.5561400871787163, + "flos": 23148593176320.0, + "grad_norm": 2.8767161081984427, + "language_loss": 0.79950154, + "learning_rate": 1.734202189316832e-06, + "loss": 0.82098025, + "num_input_tokens_seen": 199179760, + "step": 9250, + "time_per_iteration": 2.578690528869629 + }, + { + "auxiliary_loss_clip": 0.01098469, + "auxiliary_loss_mlp": 0.01034798, + "balance_loss_clip": 1.04169929, + "balance_loss_mlp": 1.02075529, + "epoch": 0.5562002104313843, + "flos": 17566423332480.0, + "grad_norm": 3.104352444179477, + "language_loss": 0.68685251, + "learning_rate": 1.733816187358836e-06, + "loss": 0.7081852, + "num_input_tokens_seen": 199196695, + "step": 9251, + "time_per_iteration": 2.7810349464416504 + }, + { + "auxiliary_loss_clip": 0.01109089, + "auxiliary_loss_mlp": 0.01033405, + "balance_loss_clip": 1.04200792, + "balance_loss_mlp": 1.02018476, + "epoch": 0.5562603336840523, + "flos": 25045430590080.0, + "grad_norm": 1.5038625186154766, + "language_loss": 0.75750792, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.77893281, + "num_input_tokens_seen": 199217845, + "step": 9252, + "time_per_iteration": 2.663238286972046 + }, + { + "auxiliary_loss_clip": 0.01107916, + "auxiliary_loss_mlp": 0.01039535, + "balance_loss_clip": 1.04108679, + "balance_loss_mlp": 1.02441943, + "epoch": 0.5563204569367203, + "flos": 29059022436480.0, + "grad_norm": 1.5228616100256118, + "language_loss": 0.72854966, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.7500242, + "num_input_tokens_seen": 199239250, + "step": 9253, + "time_per_iteration": 2.6020450592041016 + }, + { + "auxiliary_loss_clip": 0.01093689, + "auxiliary_loss_mlp": 0.01032948, + "balance_loss_clip": 1.04451489, + "balance_loss_mlp": 1.02043748, + "epoch": 0.5563805801893883, + "flos": 22090449398400.0, + "grad_norm": 1.6703038143704756, + "language_loss": 0.83143723, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.85270357, + "num_input_tokens_seen": 199258320, + "step": 9254, + "time_per_iteration": 2.701199531555176 + }, + { + "auxiliary_loss_clip": 0.01012318, + "auxiliary_loss_mlp": 0.01004464, + "balance_loss_clip": 1.01460981, + "balance_loss_mlp": 1.0030154, + "epoch": 0.5564407034420562, + "flos": 58636128689280.0, + "grad_norm": 0.8693463823650434, + "language_loss": 0.64875168, + "learning_rate": 1.732272280610387e-06, + "loss": 0.6689195, + "num_input_tokens_seen": 199314840, + "step": 9255, + "time_per_iteration": 3.1222445964813232 + }, + { + "auxiliary_loss_clip": 0.01111592, + "auxiliary_loss_mlp": 0.01033344, + "balance_loss_clip": 1.04527521, + "balance_loss_mlp": 1.02035666, + "epoch": 0.5565008266947242, + "flos": 23112323418240.0, + "grad_norm": 2.147539486852423, + "language_loss": 0.69487607, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.7163254, + "num_input_tokens_seen": 199335405, + "step": 9256, + "time_per_iteration": 2.642542600631714 + }, + { + "auxiliary_loss_clip": 0.01085774, + "auxiliary_loss_mlp": 0.01031728, + "balance_loss_clip": 1.04269767, + "balance_loss_mlp": 1.01939559, + "epoch": 0.5565609499473921, + "flos": 21578399066880.0, + "grad_norm": 1.6171582584602333, + "language_loss": 0.75981283, + "learning_rate": 1.73150038809119e-06, + "loss": 0.78098786, + "num_input_tokens_seen": 199354345, + "step": 9257, + "time_per_iteration": 2.712520122528076 + }, + { + "auxiliary_loss_clip": 0.01074562, + "auxiliary_loss_mlp": 0.01036038, + "balance_loss_clip": 1.04019046, + "balance_loss_mlp": 1.0233897, + "epoch": 0.5566210732000602, + "flos": 18369637309440.0, + "grad_norm": 3.6499733263034746, + "language_loss": 0.60697454, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.62808049, + "num_input_tokens_seen": 199372250, + "step": 9258, + "time_per_iteration": 2.751559257507324 + }, + { + "auxiliary_loss_clip": 0.01084702, + "auxiliary_loss_mlp": 0.01035032, + "balance_loss_clip": 1.03922486, + "balance_loss_mlp": 1.02042937, + "epoch": 0.5566811964527281, + "flos": 25703350053120.0, + "grad_norm": 1.5966024354647115, + "language_loss": 0.79111505, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.81231236, + "num_input_tokens_seen": 199392815, + "step": 9259, + "time_per_iteration": 2.7664895057678223 + }, + { + "auxiliary_loss_clip": 0.01088989, + "auxiliary_loss_mlp": 0.0103733, + "balance_loss_clip": 1.04242945, + "balance_loss_mlp": 1.02328086, + "epoch": 0.5567413197053961, + "flos": 26943991856640.0, + "grad_norm": 1.7833081696281723, + "language_loss": 0.81253225, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.83379543, + "num_input_tokens_seen": 199412375, + "step": 9260, + "time_per_iteration": 2.79059100151062 + }, + { + "auxiliary_loss_clip": 0.01120889, + "auxiliary_loss_mlp": 0.01039805, + "balance_loss_clip": 1.04265976, + "balance_loss_mlp": 1.02585721, + "epoch": 0.556801442958064, + "flos": 20850597694080.0, + "grad_norm": 1.513133023380305, + "language_loss": 0.69277883, + "learning_rate": 1.729956725348256e-06, + "loss": 0.71438575, + "num_input_tokens_seen": 199431490, + "step": 9261, + "time_per_iteration": 2.5942957401275635 + }, + { + "auxiliary_loss_clip": 0.01009344, + "auxiliary_loss_mlp": 0.01005985, + "balance_loss_clip": 1.01376081, + "balance_loss_mlp": 1.00455499, + "epoch": 0.556861566210732, + "flos": 70498213044480.0, + "grad_norm": 0.7654306967564637, + "language_loss": 0.61116695, + "learning_rate": 1.729570835226108e-06, + "loss": 0.63132024, + "num_input_tokens_seen": 199495855, + "step": 9262, + "time_per_iteration": 3.2477405071258545 + }, + { + "auxiliary_loss_clip": 0.01109024, + "auxiliary_loss_mlp": 0.0103923, + "balance_loss_clip": 1.03991163, + "balance_loss_mlp": 1.02594411, + "epoch": 0.5569216894633999, + "flos": 25337276593920.0, + "grad_norm": 1.6344264149627976, + "language_loss": 0.64423072, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.66571325, + "num_input_tokens_seen": 199515870, + "step": 9263, + "time_per_iteration": 2.658576488494873 + }, + { + "auxiliary_loss_clip": 0.01095378, + "auxiliary_loss_mlp": 0.010346, + "balance_loss_clip": 1.03873014, + "balance_loss_mlp": 1.02134418, + "epoch": 0.556981812716068, + "flos": 22638733574400.0, + "grad_norm": 1.867976542015905, + "language_loss": 0.73368537, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.75498509, + "num_input_tokens_seen": 199535745, + "step": 9264, + "time_per_iteration": 2.7003254890441895 + }, + { + "auxiliary_loss_clip": 0.01095238, + "auxiliary_loss_mlp": 0.01029532, + "balance_loss_clip": 1.04636014, + "balance_loss_mlp": 1.01672268, + "epoch": 0.5570419359687359, + "flos": 11035852738560.0, + "grad_norm": 2.2771016341265526, + "language_loss": 0.76178783, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.78303552, + "num_input_tokens_seen": 199554035, + "step": 9265, + "time_per_iteration": 2.7386014461517334 + }, + { + "auxiliary_loss_clip": 0.01090389, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.04179025, + "balance_loss_mlp": 1.02249825, + "epoch": 0.5571020592214039, + "flos": 22823135020800.0, + "grad_norm": 1.339030652191656, + "language_loss": 0.70789158, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.72914135, + "num_input_tokens_seen": 199576120, + "step": 9266, + "time_per_iteration": 2.741800546646118 + }, + { + "auxiliary_loss_clip": 0.01094155, + "auxiliary_loss_mlp": 0.0103873, + "balance_loss_clip": 1.03911209, + "balance_loss_mlp": 1.0255034, + "epoch": 0.5571621824740719, + "flos": 22927778317440.0, + "grad_norm": 2.0031056980063506, + "language_loss": 0.68157613, + "learning_rate": 1.727641538728533e-06, + "loss": 0.70290494, + "num_input_tokens_seen": 199593780, + "step": 9267, + "time_per_iteration": 2.7874062061309814 + }, + { + "auxiliary_loss_clip": 0.01104037, + "auxiliary_loss_mlp": 0.01038856, + "balance_loss_clip": 1.03991306, + "balance_loss_mlp": 1.02653575, + "epoch": 0.5572223057267398, + "flos": 22966705681920.0, + "grad_norm": 1.918660534651482, + "language_loss": 0.74570519, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.76713407, + "num_input_tokens_seen": 199613220, + "step": 9268, + "time_per_iteration": 2.7008538246154785 + }, + { + "auxiliary_loss_clip": 0.01103292, + "auxiliary_loss_mlp": 0.00770403, + "balance_loss_clip": 1.04299617, + "balance_loss_mlp": 1.00017905, + "epoch": 0.5572824289794078, + "flos": 20960053413120.0, + "grad_norm": 1.8745085493520866, + "language_loss": 0.75087655, + "learning_rate": 1.726869892322104e-06, + "loss": 0.76961344, + "num_input_tokens_seen": 199632085, + "step": 9269, + "time_per_iteration": 2.653756856918335 + }, + { + "auxiliary_loss_clip": 0.01081519, + "auxiliary_loss_mlp": 0.01046232, + "balance_loss_clip": 1.03722787, + "balance_loss_mlp": 1.03201032, + "epoch": 0.5573425522320757, + "flos": 25042413847680.0, + "grad_norm": 1.688879717720704, + "language_loss": 0.82588089, + "learning_rate": 1.726484084647256e-06, + "loss": 0.84715831, + "num_input_tokens_seen": 199649295, + "step": 9270, + "time_per_iteration": 4.278396844863892 + }, + { + "auxiliary_loss_clip": 0.01079257, + "auxiliary_loss_mlp": 0.01039234, + "balance_loss_clip": 1.04120445, + "balance_loss_mlp": 1.02594197, + "epoch": 0.5574026754847438, + "flos": 23659637927040.0, + "grad_norm": 2.0078243728297167, + "language_loss": 0.79825968, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.81944454, + "num_input_tokens_seen": 199668870, + "step": 9271, + "time_per_iteration": 6.1330788135528564 + }, + { + "auxiliary_loss_clip": 0.01099668, + "auxiliary_loss_mlp": 0.01031508, + "balance_loss_clip": 1.04303491, + "balance_loss_mlp": 1.01848447, + "epoch": 0.5574627987374117, + "flos": 24782240661120.0, + "grad_norm": 2.2903855544483394, + "language_loss": 0.90515852, + "learning_rate": 1.725712500427442e-06, + "loss": 0.92647034, + "num_input_tokens_seen": 199684870, + "step": 9272, + "time_per_iteration": 2.6802456378936768 + }, + { + "auxiliary_loss_clip": 0.01086004, + "auxiliary_loss_mlp": 0.01032973, + "balance_loss_clip": 1.04199028, + "balance_loss_mlp": 1.02049148, + "epoch": 0.5575229219900797, + "flos": 21834944979840.0, + "grad_norm": 2.009692341926254, + "language_loss": 0.83817393, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.85936373, + "num_input_tokens_seen": 199701975, + "step": 9273, + "time_per_iteration": 2.714702606201172 + }, + { + "auxiliary_loss_clip": 0.01111871, + "auxiliary_loss_mlp": 0.01043586, + "balance_loss_clip": 1.0435437, + "balance_loss_mlp": 1.0286727, + "epoch": 0.5575830452427476, + "flos": 27815148408960.0, + "grad_norm": 2.029727061879287, + "language_loss": 0.74000418, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.76155877, + "num_input_tokens_seen": 199721865, + "step": 9274, + "time_per_iteration": 2.6897573471069336 + }, + { + "auxiliary_loss_clip": 0.01102598, + "auxiliary_loss_mlp": 0.01036471, + "balance_loss_clip": 1.04597545, + "balance_loss_mlp": 1.02202296, + "epoch": 0.5576431684954156, + "flos": 17812805696640.0, + "grad_norm": 2.7929550344218885, + "language_loss": 0.7749905, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.79638124, + "num_input_tokens_seen": 199736455, + "step": 9275, + "time_per_iteration": 2.6423583030700684 + }, + { + "auxiliary_loss_clip": 0.01093646, + "auxiliary_loss_mlp": 0.01035097, + "balance_loss_clip": 1.04310751, + "balance_loss_mlp": 1.02178109, + "epoch": 0.5577032917480835, + "flos": 15486872411520.0, + "grad_norm": 1.5365384810156146, + "language_loss": 0.75059974, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.77188718, + "num_input_tokens_seen": 199753125, + "step": 9276, + "time_per_iteration": 4.227986812591553 + }, + { + "auxiliary_loss_clip": 0.01098066, + "auxiliary_loss_mlp": 0.01035646, + "balance_loss_clip": 1.04026711, + "balance_loss_mlp": 1.02219296, + "epoch": 0.5577634150007516, + "flos": 21579763783680.0, + "grad_norm": 1.8156811956405543, + "language_loss": 0.75730252, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.77863955, + "num_input_tokens_seen": 199771365, + "step": 9277, + "time_per_iteration": 2.651348114013672 + }, + { + "auxiliary_loss_clip": 0.01117192, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.04269838, + "balance_loss_mlp": 1.02087963, + "epoch": 0.5578235382534195, + "flos": 21139750177920.0, + "grad_norm": 1.871466977383403, + "language_loss": 0.71828836, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.73979771, + "num_input_tokens_seen": 199790035, + "step": 9278, + "time_per_iteration": 2.657386302947998 + }, + { + "auxiliary_loss_clip": 0.0108587, + "auxiliary_loss_mlp": 0.01037056, + "balance_loss_clip": 1.04430723, + "balance_loss_mlp": 1.02232218, + "epoch": 0.5578836615060875, + "flos": 26505199313280.0, + "grad_norm": 1.586228481919935, + "language_loss": 0.75729156, + "learning_rate": 1.723012284057868e-06, + "loss": 0.77852082, + "num_input_tokens_seen": 199811125, + "step": 9279, + "time_per_iteration": 2.751840353012085 + }, + { + "auxiliary_loss_clip": 0.01093934, + "auxiliary_loss_mlp": 0.01037128, + "balance_loss_clip": 1.03794658, + "balance_loss_mlp": 1.02376509, + "epoch": 0.5579437847587555, + "flos": 20153786780160.0, + "grad_norm": 1.6097529730476008, + "language_loss": 0.67559254, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.69690311, + "num_input_tokens_seen": 199829915, + "step": 9280, + "time_per_iteration": 2.6563684940338135 + }, + { + "auxiliary_loss_clip": 0.01106752, + "auxiliary_loss_mlp": 0.01041709, + "balance_loss_clip": 1.0392946, + "balance_loss_mlp": 1.02810693, + "epoch": 0.5580039080114234, + "flos": 26102281478400.0, + "grad_norm": 1.6056594505621422, + "language_loss": 0.73215401, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.75363857, + "num_input_tokens_seen": 199850670, + "step": 9281, + "time_per_iteration": 2.6871986389160156 + }, + { + "auxiliary_loss_clip": 0.01086628, + "auxiliary_loss_mlp": 0.00770991, + "balance_loss_clip": 1.04039741, + "balance_loss_mlp": 1.0002861, + "epoch": 0.5580640312640914, + "flos": 13771671096960.0, + "grad_norm": 3.0582981113882317, + "language_loss": 0.75378543, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.77236158, + "num_input_tokens_seen": 199867645, + "step": 9282, + "time_per_iteration": 2.680744171142578 + }, + { + "auxiliary_loss_clip": 0.01055422, + "auxiliary_loss_mlp": 0.01036854, + "balance_loss_clip": 1.03532624, + "balance_loss_mlp": 1.02328229, + "epoch": 0.5581241545167593, + "flos": 17675986792320.0, + "grad_norm": 2.212590462669887, + "language_loss": 0.6592958, + "learning_rate": 1.721469534028297e-06, + "loss": 0.68021852, + "num_input_tokens_seen": 199886320, + "step": 9283, + "time_per_iteration": 2.7523255348205566 + }, + { + "auxiliary_loss_clip": 0.01087506, + "auxiliary_loss_mlp": 0.01030166, + "balance_loss_clip": 1.04440904, + "balance_loss_mlp": 1.01841235, + "epoch": 0.5581842777694274, + "flos": 19569161018880.0, + "grad_norm": 1.7248818916670352, + "language_loss": 0.82969356, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.85087025, + "num_input_tokens_seen": 199904895, + "step": 9284, + "time_per_iteration": 2.6912968158721924 + }, + { + "auxiliary_loss_clip": 0.01097795, + "auxiliary_loss_mlp": 0.0103561, + "balance_loss_clip": 1.04244661, + "balance_loss_mlp": 1.02261066, + "epoch": 0.5582444010220953, + "flos": 20595165102720.0, + "grad_norm": 2.3068151709488736, + "language_loss": 0.85949606, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.88083011, + "num_input_tokens_seen": 199921090, + "step": 9285, + "time_per_iteration": 2.6835310459136963 + }, + { + "auxiliary_loss_clip": 0.01095995, + "auxiliary_loss_mlp": 0.01037997, + "balance_loss_clip": 1.0437417, + "balance_loss_mlp": 1.02543855, + "epoch": 0.5583045242747633, + "flos": 19135504120320.0, + "grad_norm": 2.6758058324476024, + "language_loss": 0.73497176, + "learning_rate": 1.720312582354912e-06, + "loss": 0.75631171, + "num_input_tokens_seen": 199939925, + "step": 9286, + "time_per_iteration": 2.7510128021240234 + }, + { + "auxiliary_loss_clip": 0.01119969, + "auxiliary_loss_mlp": 0.01032279, + "balance_loss_clip": 1.04193521, + "balance_loss_mlp": 1.01924896, + "epoch": 0.5583646475274312, + "flos": 27454569730560.0, + "grad_norm": 2.5542622351497104, + "language_loss": 0.7366401, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.7581625, + "num_input_tokens_seen": 199960015, + "step": 9287, + "time_per_iteration": 2.7764368057250977 + }, + { + "auxiliary_loss_clip": 0.01087822, + "auxiliary_loss_mlp": 0.01038543, + "balance_loss_clip": 1.04215682, + "balance_loss_mlp": 1.0240171, + "epoch": 0.5584247707800992, + "flos": 23653784010240.0, + "grad_norm": 1.5995445525462566, + "language_loss": 0.75250727, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.77377093, + "num_input_tokens_seen": 199980505, + "step": 9288, + "time_per_iteration": 2.711667060852051 + }, + { + "auxiliary_loss_clip": 0.01101347, + "auxiliary_loss_mlp": 0.01045442, + "balance_loss_clip": 1.04461765, + "balance_loss_mlp": 1.03062999, + "epoch": 0.5584848940327671, + "flos": 13698880185600.0, + "grad_norm": 2.3847574468541075, + "language_loss": 0.77486145, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.79632932, + "num_input_tokens_seen": 199999020, + "step": 9289, + "time_per_iteration": 2.726365566253662 + }, + { + "auxiliary_loss_clip": 0.01092544, + "auxiliary_loss_mlp": 0.01034807, + "balance_loss_clip": 1.04270971, + "balance_loss_mlp": 1.02084172, + "epoch": 0.5585450172854352, + "flos": 27016208150400.0, + "grad_norm": 1.8546991944448898, + "language_loss": 0.61392409, + "learning_rate": 1.718770128672817e-06, + "loss": 0.63519758, + "num_input_tokens_seen": 200019020, + "step": 9290, + "time_per_iteration": 2.7546441555023193 + }, + { + "auxiliary_loss_clip": 0.01071377, + "auxiliary_loss_mlp": 0.01032544, + "balance_loss_clip": 1.03871763, + "balance_loss_mlp": 1.01945531, + "epoch": 0.5586051405381031, + "flos": 23185653033600.0, + "grad_norm": 2.64639974160875, + "language_loss": 0.68249333, + "learning_rate": 1.7183845418764e-06, + "loss": 0.70353258, + "num_input_tokens_seen": 200038110, + "step": 9291, + "time_per_iteration": 3.030916452407837 + }, + { + "auxiliary_loss_clip": 0.01091279, + "auxiliary_loss_mlp": 0.01045913, + "balance_loss_clip": 1.04114079, + "balance_loss_mlp": 1.03218007, + "epoch": 0.5586652637907711, + "flos": 20775544225920.0, + "grad_norm": 1.7635760067758424, + "language_loss": 0.84269536, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.86406732, + "num_input_tokens_seen": 200056210, + "step": 9292, + "time_per_iteration": 2.6990363597869873 + }, + { + "auxiliary_loss_clip": 0.01090195, + "auxiliary_loss_mlp": 0.01046206, + "balance_loss_clip": 1.03904271, + "balance_loss_mlp": 1.03265166, + "epoch": 0.5587253870434391, + "flos": 28219897837440.0, + "grad_norm": 2.3637237833932687, + "language_loss": 0.73976684, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.76113087, + "num_input_tokens_seen": 200075620, + "step": 9293, + "time_per_iteration": 2.7066195011138916 + }, + { + "auxiliary_loss_clip": 0.0108672, + "auxiliary_loss_mlp": 0.01044291, + "balance_loss_clip": 1.04188502, + "balance_loss_mlp": 1.03185785, + "epoch": 0.558785510296107, + "flos": 26615732440320.0, + "grad_norm": 1.7291294273759894, + "language_loss": 0.72083485, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.74214494, + "num_input_tokens_seen": 200095945, + "step": 9294, + "time_per_iteration": 2.7188310623168945 + }, + { + "auxiliary_loss_clip": 0.01098814, + "auxiliary_loss_mlp": 0.00770939, + "balance_loss_clip": 1.04345989, + "balance_loss_mlp": 1.0002197, + "epoch": 0.558845633548775, + "flos": 20156767608960.0, + "grad_norm": 2.0034844848738995, + "language_loss": 0.68573147, + "learning_rate": 1.716842301625806e-06, + "loss": 0.70442897, + "num_input_tokens_seen": 200114185, + "step": 9295, + "time_per_iteration": 2.645157814025879 + }, + { + "auxiliary_loss_clip": 0.01120796, + "auxiliary_loss_mlp": 0.01037699, + "balance_loss_clip": 1.04437232, + "balance_loss_mlp": 1.02404976, + "epoch": 0.5589057568014429, + "flos": 24350774492160.0, + "grad_norm": 1.451861251832641, + "language_loss": 0.81153715, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.83312207, + "num_input_tokens_seen": 200135030, + "step": 9296, + "time_per_iteration": 2.638831853866577 + }, + { + "auxiliary_loss_clip": 0.01109007, + "auxiliary_loss_mlp": 0.01036287, + "balance_loss_clip": 1.0433023, + "balance_loss_mlp": 1.02302504, + "epoch": 0.558965880054111, + "flos": 21105168359040.0, + "grad_norm": 2.39482931377815, + "language_loss": 0.65407717, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.67553014, + "num_input_tokens_seen": 200154290, + "step": 9297, + "time_per_iteration": 2.6714565753936768 + }, + { + "auxiliary_loss_clip": 0.01088452, + "auxiliary_loss_mlp": 0.01039165, + "balance_loss_clip": 1.04224098, + "balance_loss_mlp": 1.024997, + "epoch": 0.5590260033067789, + "flos": 18436071513600.0, + "grad_norm": 1.768502931317098, + "language_loss": 0.75242859, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.77370477, + "num_input_tokens_seen": 200171555, + "step": 9298, + "time_per_iteration": 2.7061312198638916 + }, + { + "auxiliary_loss_clip": 0.01019627, + "auxiliary_loss_mlp": 0.01016507, + "balance_loss_clip": 1.01274395, + "balance_loss_mlp": 1.01488543, + "epoch": 0.5590861265594469, + "flos": 70577432490240.0, + "grad_norm": 0.6867151105979278, + "language_loss": 0.52393436, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.54429573, + "num_input_tokens_seen": 200237010, + "step": 9299, + "time_per_iteration": 3.2783946990966797 + }, + { + "auxiliary_loss_clip": 0.01104521, + "auxiliary_loss_mlp": 0.01037017, + "balance_loss_clip": 1.04119837, + "balance_loss_mlp": 1.02390957, + "epoch": 0.5591462498121148, + "flos": 30664408896000.0, + "grad_norm": 1.9265460961114051, + "language_loss": 0.69143355, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.7128489, + "num_input_tokens_seen": 200260820, + "step": 9300, + "time_per_iteration": 2.716351270675659 + }, + { + "auxiliary_loss_clip": 0.01065458, + "auxiliary_loss_mlp": 0.01057284, + "balance_loss_clip": 1.03432143, + "balance_loss_mlp": 1.04067802, + "epoch": 0.5592063730647828, + "flos": 18150438562560.0, + "grad_norm": 2.0948179426753164, + "language_loss": 0.81994128, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.84116876, + "num_input_tokens_seen": 200278035, + "step": 9301, + "time_per_iteration": 2.6983389854431152 + }, + { + "auxiliary_loss_clip": 0.01117535, + "auxiliary_loss_mlp": 0.01032183, + "balance_loss_clip": 1.04067254, + "balance_loss_mlp": 1.0186348, + "epoch": 0.5592664963174507, + "flos": 24060400945920.0, + "grad_norm": 3.1722185850775553, + "language_loss": 0.68140459, + "learning_rate": 1.714143795138756e-06, + "loss": 0.70290172, + "num_input_tokens_seen": 200297255, + "step": 9302, + "time_per_iteration": 2.5997016429901123 + }, + { + "auxiliary_loss_clip": 0.01088292, + "auxiliary_loss_mlp": 0.01028765, + "balance_loss_clip": 1.04123783, + "balance_loss_mlp": 1.01426911, + "epoch": 0.5593266195701188, + "flos": 19827897661440.0, + "grad_norm": 1.7171276141981482, + "language_loss": 0.70894414, + "learning_rate": 1.713758337453878e-06, + "loss": 0.7301147, + "num_input_tokens_seen": 200317505, + "step": 9303, + "time_per_iteration": 2.720726728439331 + }, + { + "auxiliary_loss_clip": 0.01045978, + "auxiliary_loss_mlp": 0.01043666, + "balance_loss_clip": 1.03466618, + "balance_loss_mlp": 1.02934885, + "epoch": 0.5593867428227867, + "flos": 25300755440640.0, + "grad_norm": 3.8871936508431606, + "language_loss": 0.72614998, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.74704641, + "num_input_tokens_seen": 200338350, + "step": 9304, + "time_per_iteration": 2.7727861404418945 + }, + { + "auxiliary_loss_clip": 0.01107464, + "auxiliary_loss_mlp": 0.01030237, + "balance_loss_clip": 1.04120493, + "balance_loss_mlp": 1.0174104, + "epoch": 0.5594468660754547, + "flos": 12933013374720.0, + "grad_norm": 2.306388303475261, + "language_loss": 0.77981883, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.80119586, + "num_input_tokens_seen": 200353965, + "step": 9305, + "time_per_iteration": 2.5945067405700684 + }, + { + "auxiliary_loss_clip": 0.01069392, + "auxiliary_loss_mlp": 0.01030263, + "balance_loss_clip": 1.04184294, + "balance_loss_mlp": 1.01778793, + "epoch": 0.5595069893281227, + "flos": 19062713208960.0, + "grad_norm": 1.7491845938042618, + "language_loss": 0.69805098, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.71904755, + "num_input_tokens_seen": 200373595, + "step": 9306, + "time_per_iteration": 2.8083784580230713 + }, + { + "auxiliary_loss_clip": 0.01018297, + "auxiliary_loss_mlp": 0.01002442, + "balance_loss_clip": 1.015836, + "balance_loss_mlp": 1.00099397, + "epoch": 0.5595671125807906, + "flos": 70273375862400.0, + "grad_norm": 0.9194279331367995, + "language_loss": 0.60304606, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.62325346, + "num_input_tokens_seen": 200429155, + "step": 9307, + "time_per_iteration": 3.301408052444458 + }, + { + "auxiliary_loss_clip": 0.01104522, + "auxiliary_loss_mlp": 0.01035417, + "balance_loss_clip": 1.0423522, + "balance_loss_mlp": 1.02234626, + "epoch": 0.5596272358334586, + "flos": 20665513889280.0, + "grad_norm": 1.8556565203900444, + "language_loss": 0.73943615, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.76083553, + "num_input_tokens_seen": 200448290, + "step": 9308, + "time_per_iteration": 2.6449387073516846 + }, + { + "auxiliary_loss_clip": 0.01051886, + "auxiliary_loss_mlp": 0.01038908, + "balance_loss_clip": 1.03424501, + "balance_loss_mlp": 1.02397084, + "epoch": 0.5596873590861265, + "flos": 25041013217280.0, + "grad_norm": 2.1877402567653808, + "language_loss": 0.69691569, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.71782362, + "num_input_tokens_seen": 200466555, + "step": 9309, + "time_per_iteration": 4.464626312255859 + }, + { + "auxiliary_loss_clip": 0.01093684, + "auxiliary_loss_mlp": 0.01037862, + "balance_loss_clip": 1.04161119, + "balance_loss_mlp": 1.02288949, + "epoch": 0.5597474823387946, + "flos": 25958387594880.0, + "grad_norm": 1.9102617963629012, + "language_loss": 0.75523353, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.77654898, + "num_input_tokens_seen": 200485980, + "step": 9310, + "time_per_iteration": 4.4445412158966064 + }, + { + "auxiliary_loss_clip": 0.01112006, + "auxiliary_loss_mlp": 0.01037268, + "balance_loss_clip": 1.04378152, + "balance_loss_mlp": 1.02286768, + "epoch": 0.5598076055914625, + "flos": 26177442687360.0, + "grad_norm": 2.0703892527912813, + "language_loss": 0.69657761, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.71807039, + "num_input_tokens_seen": 200504555, + "step": 9311, + "time_per_iteration": 4.303341865539551 + }, + { + "auxiliary_loss_clip": 0.01105172, + "auxiliary_loss_mlp": 0.01034066, + "balance_loss_clip": 1.04042637, + "balance_loss_mlp": 1.02103674, + "epoch": 0.5598677288441305, + "flos": 11655778590720.0, + "grad_norm": 1.8932120118757645, + "language_loss": 0.71856189, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.73995423, + "num_input_tokens_seen": 200522700, + "step": 9312, + "time_per_iteration": 2.610438823699951 + }, + { + "auxiliary_loss_clip": 0.01080705, + "auxiliary_loss_mlp": 0.01033643, + "balance_loss_clip": 1.04290187, + "balance_loss_mlp": 1.02023816, + "epoch": 0.5599278520967984, + "flos": 22966597941120.0, + "grad_norm": 2.1557841469459746, + "language_loss": 0.89152771, + "learning_rate": 1.709904360003822e-06, + "loss": 0.91267115, + "num_input_tokens_seen": 200541910, + "step": 9313, + "time_per_iteration": 2.6854610443115234 + }, + { + "auxiliary_loss_clip": 0.01081962, + "auxiliary_loss_mlp": 0.01044977, + "balance_loss_clip": 1.0415206, + "balance_loss_mlp": 1.03109467, + "epoch": 0.5599879753494664, + "flos": 21215557831680.0, + "grad_norm": 1.521477055933408, + "language_loss": 0.77815449, + "learning_rate": 1.709519022520204e-06, + "loss": 0.79942387, + "num_input_tokens_seen": 200562600, + "step": 9314, + "time_per_iteration": 4.262527942657471 + }, + { + "auxiliary_loss_clip": 0.01082652, + "auxiliary_loss_mlp": 0.01031612, + "balance_loss_clip": 1.0416466, + "balance_loss_mlp": 1.01851654, + "epoch": 0.5600480986021343, + "flos": 31903219105920.0, + "grad_norm": 1.6753660628338782, + "language_loss": 0.70509619, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.72623885, + "num_input_tokens_seen": 200584795, + "step": 9315, + "time_per_iteration": 2.7611892223358154 + }, + { + "auxiliary_loss_clip": 0.0110321, + "auxiliary_loss_mlp": 0.01041043, + "balance_loss_clip": 1.04375148, + "balance_loss_mlp": 1.02726793, + "epoch": 0.5601082218548024, + "flos": 28476048700800.0, + "grad_norm": 1.7587170023253702, + "language_loss": 0.66601861, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.68746114, + "num_input_tokens_seen": 200606945, + "step": 9316, + "time_per_iteration": 2.675050973892212 + }, + { + "auxiliary_loss_clip": 0.0108131, + "auxiliary_loss_mlp": 0.01037022, + "balance_loss_clip": 1.037871, + "balance_loss_mlp": 1.0214529, + "epoch": 0.5601683451074703, + "flos": 24097173494400.0, + "grad_norm": 2.414777902457845, + "language_loss": 0.87209964, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.89328289, + "num_input_tokens_seen": 200626340, + "step": 9317, + "time_per_iteration": 2.7405858039855957 + }, + { + "auxiliary_loss_clip": 0.01115616, + "auxiliary_loss_mlp": 0.01038233, + "balance_loss_clip": 1.04544759, + "balance_loss_mlp": 1.02290869, + "epoch": 0.5602284683601383, + "flos": 26356205698560.0, + "grad_norm": 1.8555836482261492, + "language_loss": 0.76961493, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.79115343, + "num_input_tokens_seen": 200644520, + "step": 9318, + "time_per_iteration": 2.683375597000122 + }, + { + "auxiliary_loss_clip": 0.0110569, + "auxiliary_loss_mlp": 0.01040718, + "balance_loss_clip": 1.04080641, + "balance_loss_mlp": 1.02822459, + "epoch": 0.5602885916128063, + "flos": 24496392228480.0, + "grad_norm": 1.6342768124534643, + "language_loss": 0.76235765, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.7838217, + "num_input_tokens_seen": 200664845, + "step": 9319, + "time_per_iteration": 2.6256465911865234 + }, + { + "auxiliary_loss_clip": 0.01107325, + "auxiliary_loss_mlp": 0.01036412, + "balance_loss_clip": 1.04242063, + "balance_loss_mlp": 1.02393723, + "epoch": 0.5603487148654742, + "flos": 27345006270720.0, + "grad_norm": 1.4761895802927258, + "language_loss": 0.85648036, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.87791771, + "num_input_tokens_seen": 200686535, + "step": 9320, + "time_per_iteration": 2.7295455932617188 + }, + { + "auxiliary_loss_clip": 0.0103543, + "auxiliary_loss_mlp": 0.01003142, + "balance_loss_clip": 1.01980209, + "balance_loss_mlp": 1.00181246, + "epoch": 0.5604088381181422, + "flos": 54087756180480.0, + "grad_norm": 0.7528149861495326, + "language_loss": 0.52530909, + "learning_rate": 1.706821969374996e-06, + "loss": 0.54569471, + "num_input_tokens_seen": 200736965, + "step": 9321, + "time_per_iteration": 3.0199856758117676 + }, + { + "auxiliary_loss_clip": 0.01097468, + "auxiliary_loss_mlp": 0.01035636, + "balance_loss_clip": 1.04187417, + "balance_loss_mlp": 1.02274311, + "epoch": 0.5604689613708101, + "flos": 22236390357120.0, + "grad_norm": 1.366292846882571, + "language_loss": 0.74232858, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.7636596, + "num_input_tokens_seen": 200757420, + "step": 9322, + "time_per_iteration": 2.7239301204681396 + }, + { + "auxiliary_loss_clip": 0.01120105, + "auxiliary_loss_mlp": 0.01033893, + "balance_loss_clip": 1.04226124, + "balance_loss_mlp": 1.01925397, + "epoch": 0.5605290846234782, + "flos": 35297782940160.0, + "grad_norm": 1.6268223998146492, + "language_loss": 0.74119061, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.7627306, + "num_input_tokens_seen": 200779520, + "step": 9323, + "time_per_iteration": 2.7277660369873047 + }, + { + "auxiliary_loss_clip": 0.01097354, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.04408789, + "balance_loss_mlp": 1.01961303, + "epoch": 0.5605892078761461, + "flos": 20263314326400.0, + "grad_norm": 2.353968750169446, + "language_loss": 0.61679977, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.63811433, + "num_input_tokens_seen": 200799485, + "step": 9324, + "time_per_iteration": 2.681330442428589 + }, + { + "auxiliary_loss_clip": 0.01068442, + "auxiliary_loss_mlp": 0.0103778, + "balance_loss_clip": 1.03685164, + "balance_loss_mlp": 1.02353454, + "epoch": 0.5606493311288141, + "flos": 17308333134720.0, + "grad_norm": 1.7599111661375368, + "language_loss": 0.87798876, + "learning_rate": 1.705281040409226e-06, + "loss": 0.89905095, + "num_input_tokens_seen": 200817540, + "step": 9325, + "time_per_iteration": 2.73244571685791 + }, + { + "auxiliary_loss_clip": 0.01098073, + "auxiliary_loss_mlp": 0.01034138, + "balance_loss_clip": 1.04064608, + "balance_loss_mlp": 1.01970756, + "epoch": 0.560709454381482, + "flos": 21652985658240.0, + "grad_norm": 1.5582793995716135, + "language_loss": 0.7359941, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.75731623, + "num_input_tokens_seen": 200838380, + "step": 9326, + "time_per_iteration": 2.685098886489868 + }, + { + "auxiliary_loss_clip": 0.01099795, + "auxiliary_loss_mlp": 0.01027968, + "balance_loss_clip": 1.04008412, + "balance_loss_mlp": 1.01316798, + "epoch": 0.56076957763415, + "flos": 20303355012480.0, + "grad_norm": 1.8644433543241015, + "language_loss": 0.78216934, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.80344701, + "num_input_tokens_seen": 200855640, + "step": 9327, + "time_per_iteration": 2.7206430435180664 + }, + { + "auxiliary_loss_clip": 0.01106989, + "auxiliary_loss_mlp": 0.01034784, + "balance_loss_clip": 1.04609513, + "balance_loss_mlp": 1.02029371, + "epoch": 0.5608297008868179, + "flos": 25045897466880.0, + "grad_norm": 1.6309153070460434, + "language_loss": 0.78084052, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.80225813, + "num_input_tokens_seen": 200876585, + "step": 9328, + "time_per_iteration": 2.6724750995635986 + }, + { + "auxiliary_loss_clip": 0.01119639, + "auxiliary_loss_mlp": 0.01031594, + "balance_loss_clip": 1.04266322, + "balance_loss_mlp": 1.01832008, + "epoch": 0.560889824139486, + "flos": 19866825025920.0, + "grad_norm": 1.4710158195252034, + "language_loss": 0.73393631, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.75544858, + "num_input_tokens_seen": 200898175, + "step": 9329, + "time_per_iteration": 2.610711097717285 + }, + { + "auxiliary_loss_clip": 0.01100007, + "auxiliary_loss_mlp": 0.00773419, + "balance_loss_clip": 1.04148126, + "balance_loss_mlp": 1.00026274, + "epoch": 0.5609499473921539, + "flos": 22929394429440.0, + "grad_norm": 1.5539142345159989, + "language_loss": 0.83609939, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.85483366, + "num_input_tokens_seen": 200917515, + "step": 9330, + "time_per_iteration": 2.7287333011627197 + }, + { + "auxiliary_loss_clip": 0.01042257, + "auxiliary_loss_mlp": 0.01001028, + "balance_loss_clip": 1.01692343, + "balance_loss_mlp": 0.99974674, + "epoch": 0.5610100706448219, + "flos": 53035825455360.0, + "grad_norm": 0.7095685041475404, + "language_loss": 0.57797414, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.59840697, + "num_input_tokens_seen": 200978615, + "step": 9331, + "time_per_iteration": 3.197101354598999 + }, + { + "auxiliary_loss_clip": 0.01082146, + "auxiliary_loss_mlp": 0.01038466, + "balance_loss_clip": 1.0445832, + "balance_loss_mlp": 1.02405381, + "epoch": 0.5610701938974898, + "flos": 21834944979840.0, + "grad_norm": 1.957386899067858, + "language_loss": 0.82066166, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.8418678, + "num_input_tokens_seen": 200997745, + "step": 9332, + "time_per_iteration": 2.706125497817993 + }, + { + "auxiliary_loss_clip": 0.01106958, + "auxiliary_loss_mlp": 0.01043073, + "balance_loss_clip": 1.04060066, + "balance_loss_mlp": 1.02800488, + "epoch": 0.5611303171501578, + "flos": 17457183095040.0, + "grad_norm": 1.7807099110593088, + "language_loss": 0.81912845, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.8406288, + "num_input_tokens_seen": 201016370, + "step": 9333, + "time_per_iteration": 2.6288132667541504 + }, + { + "auxiliary_loss_clip": 0.01119893, + "auxiliary_loss_mlp": 0.01030061, + "balance_loss_clip": 1.04119062, + "balance_loss_mlp": 1.01676321, + "epoch": 0.5611904404028258, + "flos": 22637799820800.0, + "grad_norm": 1.6112092331225492, + "language_loss": 0.72989404, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.75139362, + "num_input_tokens_seen": 201034310, + "step": 9334, + "time_per_iteration": 2.6088995933532715 + }, + { + "auxiliary_loss_clip": 0.01098453, + "auxiliary_loss_mlp": 0.01040678, + "balance_loss_clip": 1.0453335, + "balance_loss_mlp": 1.02690315, + "epoch": 0.5612505636554938, + "flos": 14316327999360.0, + "grad_norm": 2.5253764454191416, + "language_loss": 0.71248639, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.73387766, + "num_input_tokens_seen": 201052030, + "step": 9335, + "time_per_iteration": 2.633389949798584 + }, + { + "auxiliary_loss_clip": 0.0109857, + "auxiliary_loss_mlp": 0.01034463, + "balance_loss_clip": 1.03983665, + "balance_loss_mlp": 1.02066469, + "epoch": 0.5613106869081618, + "flos": 16508279554560.0, + "grad_norm": 1.8386426637696407, + "language_loss": 0.77176088, + "learning_rate": 1.701044410566205e-06, + "loss": 0.79309118, + "num_input_tokens_seen": 201068445, + "step": 9336, + "time_per_iteration": 2.681753158569336 + }, + { + "auxiliary_loss_clip": 0.01108773, + "auxiliary_loss_mlp": 0.01033965, + "balance_loss_clip": 1.0423466, + "balance_loss_mlp": 1.02086353, + "epoch": 0.5613708101608297, + "flos": 24058569352320.0, + "grad_norm": 2.6196694346701817, + "language_loss": 0.64508319, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.66651058, + "num_input_tokens_seen": 201082140, + "step": 9337, + "time_per_iteration": 2.629194498062134 + }, + { + "auxiliary_loss_clip": 0.01025154, + "auxiliary_loss_mlp": 0.01003147, + "balance_loss_clip": 1.01963842, + "balance_loss_mlp": 1.00190687, + "epoch": 0.5614309334134977, + "flos": 64905735997440.0, + "grad_norm": 0.8917713489246797, + "language_loss": 0.62551695, + "learning_rate": 1.700274261035102e-06, + "loss": 0.64579999, + "num_input_tokens_seen": 201137245, + "step": 9338, + "time_per_iteration": 3.1740610599517822 + }, + { + "auxiliary_loss_clip": 0.01091363, + "auxiliary_loss_mlp": 0.01035931, + "balance_loss_clip": 1.04291368, + "balance_loss_mlp": 1.02275264, + "epoch": 0.5614910566661656, + "flos": 32919849740160.0, + "grad_norm": 1.9155240319962232, + "language_loss": 0.65588379, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.67715669, + "num_input_tokens_seen": 201157270, + "step": 9339, + "time_per_iteration": 2.795539617538452 + }, + { + "auxiliary_loss_clip": 0.0110324, + "auxiliary_loss_mlp": 0.01043787, + "balance_loss_clip": 1.04000616, + "balance_loss_mlp": 1.0283432, + "epoch": 0.5615511799188336, + "flos": 18588871969920.0, + "grad_norm": 1.9415000376687095, + "language_loss": 0.69498181, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.716452, + "num_input_tokens_seen": 201174530, + "step": 9340, + "time_per_iteration": 2.6073222160339355 + }, + { + "auxiliary_loss_clip": 0.01076412, + "auxiliary_loss_mlp": 0.0103814, + "balance_loss_clip": 1.04082394, + "balance_loss_mlp": 1.02536726, + "epoch": 0.5616113031715015, + "flos": 22820010537600.0, + "grad_norm": 1.461608284307224, + "language_loss": 0.77235413, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.79349971, + "num_input_tokens_seen": 201194905, + "step": 9341, + "time_per_iteration": 2.712812662124634 + }, + { + "auxiliary_loss_clip": 0.01069621, + "auxiliary_loss_mlp": 0.01037705, + "balance_loss_clip": 1.03758025, + "balance_loss_mlp": 1.0230068, + "epoch": 0.5616714264241696, + "flos": 22345702421760.0, + "grad_norm": 1.556156421929591, + "language_loss": 0.79645002, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.81752324, + "num_input_tokens_seen": 201213715, + "step": 9342, + "time_per_iteration": 2.774918556213379 + }, + { + "auxiliary_loss_clip": 0.01091015, + "auxiliary_loss_mlp": 0.01035282, + "balance_loss_clip": 1.03911448, + "balance_loss_mlp": 1.02109551, + "epoch": 0.5617315496768375, + "flos": 18807783408000.0, + "grad_norm": 2.3711889370259907, + "language_loss": 0.76042008, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.78168309, + "num_input_tokens_seen": 201231415, + "step": 9343, + "time_per_iteration": 2.7124969959259033 + }, + { + "auxiliary_loss_clip": 0.01080837, + "auxiliary_loss_mlp": 0.01044577, + "balance_loss_clip": 1.04475522, + "balance_loss_mlp": 1.03011727, + "epoch": 0.5617916729295055, + "flos": 18369314087040.0, + "grad_norm": 2.196794276196035, + "language_loss": 0.69644189, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.71769607, + "num_input_tokens_seen": 201249625, + "step": 9344, + "time_per_iteration": 2.7265472412109375 + }, + { + "auxiliary_loss_clip": 0.01121229, + "auxiliary_loss_mlp": 0.01038625, + "balance_loss_clip": 1.04347157, + "balance_loss_mlp": 1.02447486, + "epoch": 0.5618517961821734, + "flos": 28179964892160.0, + "grad_norm": 3.2350770637683106, + "language_loss": 0.6636014, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.68519998, + "num_input_tokens_seen": 201271205, + "step": 9345, + "time_per_iteration": 2.686527729034424 + }, + { + "auxiliary_loss_clip": 0.01098571, + "auxiliary_loss_mlp": 0.01032052, + "balance_loss_clip": 1.04279995, + "balance_loss_mlp": 1.0190227, + "epoch": 0.5619119194348414, + "flos": 15486872411520.0, + "grad_norm": 1.9772946469645978, + "language_loss": 0.87311339, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.89441955, + "num_input_tokens_seen": 201287700, + "step": 9346, + "time_per_iteration": 2.6551971435546875 + }, + { + "auxiliary_loss_clip": 0.0109764, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.04373372, + "balance_loss_mlp": 1.02243173, + "epoch": 0.5619720426875094, + "flos": 29128652951040.0, + "grad_norm": 2.320939151148892, + "language_loss": 0.59135818, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.61270428, + "num_input_tokens_seen": 201307530, + "step": 9347, + "time_per_iteration": 2.701704263687134 + }, + { + "auxiliary_loss_clip": 0.01113798, + "auxiliary_loss_mlp": 0.01039809, + "balance_loss_clip": 1.0449301, + "balance_loss_mlp": 1.02461553, + "epoch": 0.5620321659401774, + "flos": 18003743418240.0, + "grad_norm": 3.390094180858037, + "language_loss": 0.69345069, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.7149868, + "num_input_tokens_seen": 201326210, + "step": 9348, + "time_per_iteration": 2.6152281761169434 + }, + { + "auxiliary_loss_clip": 0.01072866, + "auxiliary_loss_mlp": 0.01035332, + "balance_loss_clip": 1.03694952, + "balance_loss_mlp": 1.02000761, + "epoch": 0.5620922891928454, + "flos": 20594518657920.0, + "grad_norm": 12.292181580280033, + "language_loss": 0.79008943, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.81117141, + "num_input_tokens_seen": 201346120, + "step": 9349, + "time_per_iteration": 5.937277793884277 + }, + { + "auxiliary_loss_clip": 0.01068645, + "auxiliary_loss_mlp": 0.0103743, + "balance_loss_clip": 1.04074883, + "balance_loss_mlp": 1.02314854, + "epoch": 0.5621524124455133, + "flos": 26287006147200.0, + "grad_norm": 2.217082199318971, + "language_loss": 0.67245173, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.6935125, + "num_input_tokens_seen": 201365700, + "step": 9350, + "time_per_iteration": 4.385211229324341 + }, + { + "auxiliary_loss_clip": 0.01069908, + "auxiliary_loss_mlp": 0.01039418, + "balance_loss_clip": 1.03964508, + "balance_loss_mlp": 1.02451682, + "epoch": 0.5622125356981813, + "flos": 12750299867520.0, + "grad_norm": 2.668539433171336, + "language_loss": 0.78305924, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.80415249, + "num_input_tokens_seen": 201382795, + "step": 9351, + "time_per_iteration": 2.6691691875457764 + }, + { + "auxiliary_loss_clip": 0.01099605, + "auxiliary_loss_mlp": 0.00772893, + "balance_loss_clip": 1.03920138, + "balance_loss_mlp": 1.00020838, + "epoch": 0.5622726589508492, + "flos": 23805327490560.0, + "grad_norm": 1.4861648044093183, + "language_loss": 0.59128547, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.61001039, + "num_input_tokens_seen": 201402780, + "step": 9352, + "time_per_iteration": 2.753941297531128 + }, + { + "auxiliary_loss_clip": 0.01105703, + "auxiliary_loss_mlp": 0.0103746, + "balance_loss_clip": 1.0406158, + "balance_loss_mlp": 1.02476466, + "epoch": 0.5623327822035172, + "flos": 24718212668160.0, + "grad_norm": 1.334754568183942, + "language_loss": 0.71630079, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.73773241, + "num_input_tokens_seen": 201424140, + "step": 9353, + "time_per_iteration": 4.249570369720459 + }, + { + "auxiliary_loss_clip": 0.01098184, + "auxiliary_loss_mlp": 0.01032581, + "balance_loss_clip": 1.04213238, + "balance_loss_mlp": 1.01918769, + "epoch": 0.5623929054561851, + "flos": 14019274523520.0, + "grad_norm": 2.376274628807619, + "language_loss": 0.7593621, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.78066975, + "num_input_tokens_seen": 201439645, + "step": 9354, + "time_per_iteration": 2.689899206161499 + }, + { + "auxiliary_loss_clip": 0.01089457, + "auxiliary_loss_mlp": 0.01035605, + "balance_loss_clip": 1.04167855, + "balance_loss_mlp": 1.02183056, + "epoch": 0.5624530287088532, + "flos": 20704405340160.0, + "grad_norm": 1.8223711210662343, + "language_loss": 0.72909653, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.75034714, + "num_input_tokens_seen": 201459970, + "step": 9355, + "time_per_iteration": 2.755100965499878 + }, + { + "auxiliary_loss_clip": 0.01104288, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.04146492, + "balance_loss_mlp": 1.01845825, + "epoch": 0.5625131519615211, + "flos": 21470918595840.0, + "grad_norm": 1.4719507883232867, + "language_loss": 0.7346037, + "learning_rate": 1.693344975084274e-06, + "loss": 0.75596744, + "num_input_tokens_seen": 201480055, + "step": 9356, + "time_per_iteration": 2.641638994216919 + }, + { + "auxiliary_loss_clip": 0.01119375, + "auxiliary_loss_mlp": 0.0103593, + "balance_loss_clip": 1.04301476, + "balance_loss_mlp": 1.02204823, + "epoch": 0.5625732752141891, + "flos": 18698004466560.0, + "grad_norm": 2.3002614331876687, + "language_loss": 0.83191347, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.85346651, + "num_input_tokens_seen": 201497645, + "step": 9357, + "time_per_iteration": 2.6374433040618896 + }, + { + "auxiliary_loss_clip": 0.01108702, + "auxiliary_loss_mlp": 0.01033262, + "balance_loss_clip": 1.04158151, + "balance_loss_mlp": 1.02019668, + "epoch": 0.562633398466857, + "flos": 16216900427520.0, + "grad_norm": 2.42238754199954, + "language_loss": 0.72483993, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.74625957, + "num_input_tokens_seen": 201515455, + "step": 9358, + "time_per_iteration": 2.6288702487945557 + }, + { + "auxiliary_loss_clip": 0.01118085, + "auxiliary_loss_mlp": 0.01042212, + "balance_loss_clip": 1.04183221, + "balance_loss_mlp": 1.02859807, + "epoch": 0.562693521719525, + "flos": 22491930689280.0, + "grad_norm": 2.2438292834488838, + "language_loss": 0.7763263, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.79792929, + "num_input_tokens_seen": 201534500, + "step": 9359, + "time_per_iteration": 2.6272196769714355 + }, + { + "auxiliary_loss_clip": 0.0109706, + "auxiliary_loss_mlp": 0.01033721, + "balance_loss_clip": 1.04087317, + "balance_loss_mlp": 1.0212934, + "epoch": 0.562753644972193, + "flos": 25331171281920.0, + "grad_norm": 1.8703344042445116, + "language_loss": 0.70466304, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.72597086, + "num_input_tokens_seen": 201553280, + "step": 9360, + "time_per_iteration": 2.6694719791412354 + }, + { + "auxiliary_loss_clip": 0.00993761, + "auxiliary_loss_mlp": 0.00999248, + "balance_loss_clip": 1.01494741, + "balance_loss_mlp": 0.99799061, + "epoch": 0.562813768224861, + "flos": 67392622126080.0, + "grad_norm": 0.7735600550199924, + "language_loss": 0.5555625, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.57549262, + "num_input_tokens_seen": 201610030, + "step": 9361, + "time_per_iteration": 3.2061593532562256 + }, + { + "auxiliary_loss_clip": 0.01093709, + "auxiliary_loss_mlp": 0.01035172, + "balance_loss_clip": 1.04106104, + "balance_loss_mlp": 1.02236927, + "epoch": 0.562873891477529, + "flos": 23331163029120.0, + "grad_norm": 1.4272041180912485, + "language_loss": 0.8169086, + "learning_rate": 1.691036046141018e-06, + "loss": 0.83819747, + "num_input_tokens_seen": 201628370, + "step": 9362, + "time_per_iteration": 2.648585319519043 + }, + { + "auxiliary_loss_clip": 0.01084349, + "auxiliary_loss_mlp": 0.00771085, + "balance_loss_clip": 1.03982627, + "balance_loss_mlp": 1.00021708, + "epoch": 0.5629340147301969, + "flos": 38472824805120.0, + "grad_norm": 1.5810217639510977, + "language_loss": 0.7460767, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.76463103, + "num_input_tokens_seen": 201649790, + "step": 9363, + "time_per_iteration": 2.8376948833465576 + }, + { + "auxiliary_loss_clip": 0.01114455, + "auxiliary_loss_mlp": 0.01034672, + "balance_loss_clip": 1.04345608, + "balance_loss_mlp": 1.02068281, + "epoch": 0.5629941379828649, + "flos": 29242023252480.0, + "grad_norm": 1.625625465741998, + "language_loss": 0.82640725, + "learning_rate": 1.690266496731839e-06, + "loss": 0.84789848, + "num_input_tokens_seen": 201669175, + "step": 9364, + "time_per_iteration": 2.6790480613708496 + }, + { + "auxiliary_loss_clip": 0.0107898, + "auxiliary_loss_mlp": 0.0103866, + "balance_loss_clip": 1.03860497, + "balance_loss_mlp": 1.02573752, + "epoch": 0.5630542612355328, + "flos": 19420885676160.0, + "grad_norm": 2.0942443962927513, + "language_loss": 0.65238589, + "learning_rate": 1.689881739637642e-06, + "loss": 0.67356229, + "num_input_tokens_seen": 201687000, + "step": 9365, + "time_per_iteration": 2.6504223346710205 + }, + { + "auxiliary_loss_clip": 0.01099908, + "auxiliary_loss_mlp": 0.01040371, + "balance_loss_clip": 1.0423665, + "balance_loss_mlp": 1.0259583, + "epoch": 0.5631143844882008, + "flos": 22266303408000.0, + "grad_norm": 5.761173374312871, + "language_loss": 0.8185727, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.83997548, + "num_input_tokens_seen": 201703335, + "step": 9366, + "time_per_iteration": 2.6865267753601074 + }, + { + "auxiliary_loss_clip": 0.01118809, + "auxiliary_loss_mlp": 0.01033751, + "balance_loss_clip": 1.04305601, + "balance_loss_mlp": 1.02106667, + "epoch": 0.5631745077408687, + "flos": 22965305051520.0, + "grad_norm": 1.4687745386206819, + "language_loss": 0.73388821, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.75541377, + "num_input_tokens_seen": 201723495, + "step": 9367, + "time_per_iteration": 2.6309821605682373 + }, + { + "auxiliary_loss_clip": 0.01020057, + "auxiliary_loss_mlp": 0.01004541, + "balance_loss_clip": 1.01475585, + "balance_loss_mlp": 1.0032177, + "epoch": 0.5632346309935368, + "flos": 65080515576960.0, + "grad_norm": 0.6203732228424765, + "language_loss": 0.53471267, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.5549587, + "num_input_tokens_seen": 201792615, + "step": 9368, + "time_per_iteration": 3.285132884979248 + }, + { + "auxiliary_loss_clip": 0.01119712, + "auxiliary_loss_mlp": 0.01038636, + "balance_loss_clip": 1.04367208, + "balance_loss_mlp": 1.02514756, + "epoch": 0.5632947542462047, + "flos": 23002903612800.0, + "grad_norm": 1.6032046035258145, + "language_loss": 0.69323123, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.71481466, + "num_input_tokens_seen": 201812520, + "step": 9369, + "time_per_iteration": 2.5828912258148193 + }, + { + "auxiliary_loss_clip": 0.01081861, + "auxiliary_loss_mlp": 0.01036769, + "balance_loss_clip": 1.03560948, + "balance_loss_mlp": 1.02258897, + "epoch": 0.5633548774988727, + "flos": 30482593228800.0, + "grad_norm": 1.8644770946275213, + "language_loss": 0.75840139, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.77958775, + "num_input_tokens_seen": 201834185, + "step": 9370, + "time_per_iteration": 2.7649481296539307 + }, + { + "auxiliary_loss_clip": 0.01095504, + "auxiliary_loss_mlp": 0.01038896, + "balance_loss_clip": 1.04126322, + "balance_loss_mlp": 1.02440023, + "epoch": 0.5634150007515406, + "flos": 18515039564160.0, + "grad_norm": 2.2895815027179864, + "language_loss": 0.755108, + "learning_rate": 1.687573444537108e-06, + "loss": 0.776452, + "num_input_tokens_seen": 201851305, + "step": 9371, + "time_per_iteration": 2.591031312942505 + }, + { + "auxiliary_loss_clip": 0.01106226, + "auxiliary_loss_mlp": 0.01040784, + "balance_loss_clip": 1.04110384, + "balance_loss_mlp": 1.02787304, + "epoch": 0.5634751240042086, + "flos": 19244672530560.0, + "grad_norm": 1.7615457998604214, + "language_loss": 0.76489764, + "learning_rate": 1.687188770067285e-06, + "loss": 0.78636777, + "num_input_tokens_seen": 201870350, + "step": 9372, + "time_per_iteration": 2.619053840637207 + }, + { + "auxiliary_loss_clip": 0.01090528, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.03906, + "balance_loss_mlp": 1.01705718, + "epoch": 0.5635352472568766, + "flos": 12020630987520.0, + "grad_norm": 2.266062441891877, + "language_loss": 0.71336401, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.73457533, + "num_input_tokens_seen": 201886800, + "step": 9373, + "time_per_iteration": 2.624600887298584 + }, + { + "auxiliary_loss_clip": 0.01090554, + "auxiliary_loss_mlp": 0.01031384, + "balance_loss_clip": 1.04418933, + "balance_loss_mlp": 1.0168401, + "epoch": 0.5635953705095446, + "flos": 21871645701120.0, + "grad_norm": 2.1043627154333797, + "language_loss": 0.82543874, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.84665811, + "num_input_tokens_seen": 201904730, + "step": 9374, + "time_per_iteration": 2.644887924194336 + }, + { + "auxiliary_loss_clip": 0.01104117, + "auxiliary_loss_mlp": 0.01030499, + "balance_loss_clip": 1.03739262, + "balance_loss_mlp": 1.01734459, + "epoch": 0.5636554937622126, + "flos": 27126166659840.0, + "grad_norm": 1.7268514389800265, + "language_loss": 0.66357785, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.68492401, + "num_input_tokens_seen": 201924850, + "step": 9375, + "time_per_iteration": 2.65166974067688 + }, + { + "auxiliary_loss_clip": 0.01084894, + "auxiliary_loss_mlp": 0.00770652, + "balance_loss_clip": 1.04238153, + "balance_loss_mlp": 1.00019169, + "epoch": 0.5637156170148805, + "flos": 12926405272320.0, + "grad_norm": 2.3049359861127696, + "language_loss": 0.81049269, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.82904816, + "num_input_tokens_seen": 201939500, + "step": 9376, + "time_per_iteration": 2.766364336013794 + }, + { + "auxiliary_loss_clip": 0.01101359, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.04133999, + "balance_loss_mlp": 1.02115881, + "epoch": 0.5637757402675485, + "flos": 45551033130240.0, + "grad_norm": 1.6449694311006493, + "language_loss": 0.6926713, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.71403265, + "num_input_tokens_seen": 201963000, + "step": 9377, + "time_per_iteration": 2.870060443878174 + }, + { + "auxiliary_loss_clip": 0.01074381, + "auxiliary_loss_mlp": 0.01032969, + "balance_loss_clip": 1.03875685, + "balance_loss_mlp": 1.01979017, + "epoch": 0.5638358635202164, + "flos": 20886041439360.0, + "grad_norm": 1.3919625147372467, + "language_loss": 0.74771237, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.76878589, + "num_input_tokens_seen": 201983145, + "step": 9378, + "time_per_iteration": 2.749613046646118 + }, + { + "auxiliary_loss_clip": 0.01122728, + "auxiliary_loss_mlp": 0.01035934, + "balance_loss_clip": 1.04050553, + "balance_loss_mlp": 1.02154493, + "epoch": 0.5638959867728844, + "flos": 18806562345600.0, + "grad_norm": 2.63873718495401, + "language_loss": 0.81853002, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.84011662, + "num_input_tokens_seen": 202000335, + "step": 9379, + "time_per_iteration": 2.625277280807495 + }, + { + "auxiliary_loss_clip": 0.01093031, + "auxiliary_loss_mlp": 0.01036774, + "balance_loss_clip": 1.03674948, + "balance_loss_mlp": 1.02336287, + "epoch": 0.5639561100255523, + "flos": 27490336698240.0, + "grad_norm": 2.218934810530396, + "language_loss": 0.7167027, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.73800081, + "num_input_tokens_seen": 202018275, + "step": 9380, + "time_per_iteration": 2.715791940689087 + }, + { + "auxiliary_loss_clip": 0.01086194, + "auxiliary_loss_mlp": 0.01039984, + "balance_loss_clip": 1.0455358, + "balance_loss_mlp": 1.02485633, + "epoch": 0.5640162332782204, + "flos": 18076570243200.0, + "grad_norm": 2.0069687855649234, + "language_loss": 0.74178547, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.76304728, + "num_input_tokens_seen": 202034330, + "step": 9381, + "time_per_iteration": 2.8091652393341064 + }, + { + "auxiliary_loss_clip": 0.01068257, + "auxiliary_loss_mlp": 0.01041209, + "balance_loss_clip": 1.03590226, + "balance_loss_mlp": 1.02741027, + "epoch": 0.5640763565308883, + "flos": 20884856290560.0, + "grad_norm": 2.008212488841835, + "language_loss": 0.72358, + "learning_rate": 1.683342680176499e-06, + "loss": 0.74467456, + "num_input_tokens_seen": 202053100, + "step": 9382, + "time_per_iteration": 2.750049114227295 + }, + { + "auxiliary_loss_clip": 0.0103983, + "auxiliary_loss_mlp": 0.01012073, + "balance_loss_clip": 1.01468074, + "balance_loss_mlp": 1.01088643, + "epoch": 0.5641364797835563, + "flos": 64447912224000.0, + "grad_norm": 0.7132903418918451, + "language_loss": 0.54439944, + "learning_rate": 1.682958136989022e-06, + "loss": 0.56491846, + "num_input_tokens_seen": 202120125, + "step": 9383, + "time_per_iteration": 3.308600425720215 + }, + { + "auxiliary_loss_clip": 0.01106116, + "auxiliary_loss_mlp": 0.01030643, + "balance_loss_clip": 1.04080617, + "balance_loss_mlp": 1.01664162, + "epoch": 0.5641966030362242, + "flos": 18660944609280.0, + "grad_norm": 1.7587549687902173, + "language_loss": 0.71036148, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.73172909, + "num_input_tokens_seen": 202138030, + "step": 9384, + "time_per_iteration": 2.705378532409668 + }, + { + "auxiliary_loss_clip": 0.01098378, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.04193604, + "balance_loss_mlp": 1.02113533, + "epoch": 0.5642567262888922, + "flos": 22492325738880.0, + "grad_norm": 2.5060474723218724, + "language_loss": 0.75891483, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.78025091, + "num_input_tokens_seen": 202155580, + "step": 9385, + "time_per_iteration": 2.648486375808716 + }, + { + "auxiliary_loss_clip": 0.01102679, + "auxiliary_loss_mlp": 0.01034721, + "balance_loss_clip": 1.03705001, + "balance_loss_mlp": 1.02121449, + "epoch": 0.5643168495415603, + "flos": 13003972692480.0, + "grad_norm": 1.9370694733196534, + "language_loss": 0.82360542, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.84497941, + "num_input_tokens_seen": 202170365, + "step": 9386, + "time_per_iteration": 2.6014211177825928 + }, + { + "auxiliary_loss_clip": 0.01108433, + "auxiliary_loss_mlp": 0.01035233, + "balance_loss_clip": 1.04246962, + "balance_loss_mlp": 1.02117205, + "epoch": 0.5643769727942282, + "flos": 18588297352320.0, + "grad_norm": 2.256739627854675, + "language_loss": 0.69928676, + "learning_rate": 1.681420084607516e-06, + "loss": 0.72072339, + "num_input_tokens_seen": 202189095, + "step": 9387, + "time_per_iteration": 2.6225178241729736 + }, + { + "auxiliary_loss_clip": 0.01110032, + "auxiliary_loss_mlp": 0.01036058, + "balance_loss_clip": 1.04169261, + "balance_loss_mlp": 1.02292085, + "epoch": 0.5644370960468962, + "flos": 33806269572480.0, + "grad_norm": 1.4294069994917775, + "language_loss": 0.74616826, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.76762915, + "num_input_tokens_seen": 202213500, + "step": 9388, + "time_per_iteration": 4.3489909172058105 + }, + { + "auxiliary_loss_clip": 0.01103005, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.04041004, + "balance_loss_mlp": 1.02143562, + "epoch": 0.5644972192995641, + "flos": 21214911386880.0, + "grad_norm": 1.5515532198665989, + "language_loss": 0.81965339, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.84101695, + "num_input_tokens_seen": 202231920, + "step": 9389, + "time_per_iteration": 5.713036060333252 + }, + { + "auxiliary_loss_clip": 0.01083726, + "auxiliary_loss_mlp": 0.01035772, + "balance_loss_clip": 1.03770804, + "balance_loss_mlp": 1.02090037, + "epoch": 0.5645573425522321, + "flos": 18587722734720.0, + "grad_norm": 2.017294292301613, + "language_loss": 0.63844502, + "learning_rate": 1.680266672116467e-06, + "loss": 0.65964001, + "num_input_tokens_seen": 202247600, + "step": 9390, + "time_per_iteration": 2.718738079071045 + }, + { + "auxiliary_loss_clip": 0.01096947, + "auxiliary_loss_mlp": 0.01030588, + "balance_loss_clip": 1.04229331, + "balance_loss_mlp": 1.01875103, + "epoch": 0.5646174658049, + "flos": 18113809668480.0, + "grad_norm": 1.8385345725956297, + "language_loss": 0.92190915, + "learning_rate": 1.6798822255153192e-06, + "loss": 0.94318449, + "num_input_tokens_seen": 202265350, + "step": 9391, + "time_per_iteration": 2.6871705055236816 + }, + { + "auxiliary_loss_clip": 0.01118295, + "auxiliary_loss_mlp": 0.01037212, + "balance_loss_clip": 1.04650784, + "balance_loss_mlp": 1.02288282, + "epoch": 0.564677589057568, + "flos": 28329964087680.0, + "grad_norm": 2.30014312113224, + "language_loss": 0.60238105, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.62393618, + "num_input_tokens_seen": 202284285, + "step": 9392, + "time_per_iteration": 2.6965878009796143 + }, + { + "auxiliary_loss_clip": 0.01068376, + "auxiliary_loss_mlp": 0.01027584, + "balance_loss_clip": 1.03531122, + "balance_loss_mlp": 1.01367223, + "epoch": 0.564737712310236, + "flos": 22163743100160.0, + "grad_norm": 2.2381091213593924, + "language_loss": 0.81505215, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.83601177, + "num_input_tokens_seen": 202303450, + "step": 9393, + "time_per_iteration": 4.253687620162964 + }, + { + "auxiliary_loss_clip": 0.01095131, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.03995085, + "balance_loss_mlp": 1.01804614, + "epoch": 0.564797835562904, + "flos": 20959011918720.0, + "grad_norm": 1.6857006339700658, + "language_loss": 0.87381589, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.89507914, + "num_input_tokens_seen": 202322315, + "step": 9394, + "time_per_iteration": 2.6733334064483643 + }, + { + "auxiliary_loss_clip": 0.0110875, + "auxiliary_loss_mlp": 0.01033757, + "balance_loss_clip": 1.04296374, + "balance_loss_mlp": 1.02065587, + "epoch": 0.5648579588155719, + "flos": 17420302805760.0, + "grad_norm": 1.9505278392416294, + "language_loss": 0.84685338, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.86827838, + "num_input_tokens_seen": 202339905, + "step": 9395, + "time_per_iteration": 2.6754062175750732 + }, + { + "auxiliary_loss_clip": 0.0102964, + "auxiliary_loss_mlp": 0.01000117, + "balance_loss_clip": 1.01416993, + "balance_loss_mlp": 0.99888915, + "epoch": 0.5649180820682399, + "flos": 69929568835200.0, + "grad_norm": 0.7966393150311729, + "language_loss": 0.58260763, + "learning_rate": 1.677960174884597e-06, + "loss": 0.60290521, + "num_input_tokens_seen": 202397320, + "step": 9396, + "time_per_iteration": 3.176486015319824 + }, + { + "auxiliary_loss_clip": 0.01099184, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.04099381, + "balance_loss_mlp": 1.01762295, + "epoch": 0.5649782053209078, + "flos": 24973070641920.0, + "grad_norm": 1.8659420980935195, + "language_loss": 0.70408708, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.72538739, + "num_input_tokens_seen": 202416865, + "step": 9397, + "time_per_iteration": 2.76436710357666 + }, + { + "auxiliary_loss_clip": 0.01087737, + "auxiliary_loss_mlp": 0.01036875, + "balance_loss_clip": 1.0412184, + "balance_loss_mlp": 1.02305877, + "epoch": 0.5650383285735758, + "flos": 21726602582400.0, + "grad_norm": 1.7242630837852022, + "language_loss": 0.66510224, + "learning_rate": 1.67719144001275e-06, + "loss": 0.68634838, + "num_input_tokens_seen": 202436210, + "step": 9398, + "time_per_iteration": 2.8452060222625732 + }, + { + "auxiliary_loss_clip": 0.0102199, + "auxiliary_loss_mlp": 0.01002651, + "balance_loss_clip": 1.01533413, + "balance_loss_mlp": 1.00157201, + "epoch": 0.5650984518262439, + "flos": 65904484636800.0, + "grad_norm": 0.7636877487193632, + "language_loss": 0.58165693, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.60190332, + "num_input_tokens_seen": 202492925, + "step": 9399, + "time_per_iteration": 3.1523597240448 + }, + { + "auxiliary_loss_clip": 0.01076045, + "auxiliary_loss_mlp": 0.01036845, + "balance_loss_clip": 1.03608418, + "balance_loss_mlp": 1.02109778, + "epoch": 0.5651585750789118, + "flos": 21032592929280.0, + "grad_norm": 2.707299355352823, + "language_loss": 0.7311101, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.75223899, + "num_input_tokens_seen": 202511905, + "step": 9400, + "time_per_iteration": 2.778313636779785 + }, + { + "auxiliary_loss_clip": 0.01093566, + "auxiliary_loss_mlp": 0.01038541, + "balance_loss_clip": 1.04261565, + "balance_loss_mlp": 1.02349663, + "epoch": 0.5652186983315798, + "flos": 18551919853440.0, + "grad_norm": 1.7896331589473868, + "language_loss": 0.6111843, + "learning_rate": 1.676038429548412e-06, + "loss": 0.63250542, + "num_input_tokens_seen": 202529815, + "step": 9401, + "time_per_iteration": 2.7110683917999268 + }, + { + "auxiliary_loss_clip": 0.01077473, + "auxiliary_loss_mlp": 0.01030698, + "balance_loss_clip": 1.03607464, + "balance_loss_mlp": 1.01735282, + "epoch": 0.5652788215842477, + "flos": 18478662065280.0, + "grad_norm": 3.6521869515488405, + "language_loss": 0.81323993, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.83432162, + "num_input_tokens_seen": 202547710, + "step": 9402, + "time_per_iteration": 2.8134961128234863 + }, + { + "auxiliary_loss_clip": 0.0106172, + "auxiliary_loss_mlp": 0.01043189, + "balance_loss_clip": 1.03186333, + "balance_loss_mlp": 1.02785897, + "epoch": 0.5653389448369157, + "flos": 30044052080640.0, + "grad_norm": 1.434807389128129, + "language_loss": 0.77711642, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.79816544, + "num_input_tokens_seen": 202568835, + "step": 9403, + "time_per_iteration": 2.9176833629608154 + }, + { + "auxiliary_loss_clip": 0.01064861, + "auxiliary_loss_mlp": 0.01036876, + "balance_loss_clip": 1.03543758, + "balance_loss_mlp": 1.02137828, + "epoch": 0.5653990680895836, + "flos": 16727550128640.0, + "grad_norm": 1.6891349615397695, + "language_loss": 0.69381618, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.71483362, + "num_input_tokens_seen": 202587385, + "step": 9404, + "time_per_iteration": 2.8122291564941406 + }, + { + "auxiliary_loss_clip": 0.01081972, + "auxiliary_loss_mlp": 0.01035091, + "balance_loss_clip": 1.03926969, + "balance_loss_mlp": 1.02245533, + "epoch": 0.5654591913422516, + "flos": 14538256179840.0, + "grad_norm": 1.8707097320787585, + "language_loss": 0.66802347, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.68919408, + "num_input_tokens_seen": 202604815, + "step": 9405, + "time_per_iteration": 2.6256675720214844 + }, + { + "auxiliary_loss_clip": 0.01087827, + "auxiliary_loss_mlp": 0.01038368, + "balance_loss_clip": 1.03976154, + "balance_loss_mlp": 1.02543378, + "epoch": 0.5655193145949196, + "flos": 26209905603840.0, + "grad_norm": 1.7731068900459501, + "language_loss": 0.74520212, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.76646411, + "num_input_tokens_seen": 202623775, + "step": 9406, + "time_per_iteration": 2.7220685482025146 + }, + { + "auxiliary_loss_clip": 0.01061139, + "auxiliary_loss_mlp": 0.01043351, + "balance_loss_clip": 1.03829598, + "balance_loss_mlp": 1.02655435, + "epoch": 0.5655794378475876, + "flos": 25046579825280.0, + "grad_norm": 1.7152353741974506, + "language_loss": 0.7952764, + "learning_rate": 1.673732740698882e-06, + "loss": 0.81632137, + "num_input_tokens_seen": 202643375, + "step": 9407, + "time_per_iteration": 2.785325765609741 + }, + { + "auxiliary_loss_clip": 0.01077703, + "auxiliary_loss_mlp": 0.01039246, + "balance_loss_clip": 1.03728688, + "balance_loss_mlp": 1.02510178, + "epoch": 0.5656395611002555, + "flos": 31032852652800.0, + "grad_norm": 1.3619251628826352, + "language_loss": 0.71023029, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.73139971, + "num_input_tokens_seen": 202668400, + "step": 9408, + "time_per_iteration": 2.8171489238739014 + }, + { + "auxiliary_loss_clip": 0.01061658, + "auxiliary_loss_mlp": 0.01035867, + "balance_loss_clip": 1.03865576, + "balance_loss_mlp": 1.02198541, + "epoch": 0.5656996843529235, + "flos": 20229522606720.0, + "grad_norm": 1.9952093590252573, + "language_loss": 0.81203496, + "learning_rate": 1.672964276570308e-06, + "loss": 0.8330102, + "num_input_tokens_seen": 202685125, + "step": 9409, + "time_per_iteration": 2.770899772644043 + }, + { + "auxiliary_loss_clip": 0.01076156, + "auxiliary_loss_mlp": 0.01030595, + "balance_loss_clip": 1.03786421, + "balance_loss_mlp": 1.01730919, + "epoch": 0.5657598076055914, + "flos": 20996251344000.0, + "grad_norm": 1.8859201816541107, + "language_loss": 0.78039193, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.80145949, + "num_input_tokens_seen": 202703830, + "step": 9410, + "time_per_iteration": 2.6944680213928223 + }, + { + "auxiliary_loss_clip": 0.01121778, + "auxiliary_loss_mlp": 0.01042462, + "balance_loss_clip": 1.04339719, + "balance_loss_mlp": 1.02865767, + "epoch": 0.5658199308582594, + "flos": 11545999649280.0, + "grad_norm": 2.199230863577756, + "language_loss": 0.83460367, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.85624611, + "num_input_tokens_seen": 202719835, + "step": 9411, + "time_per_iteration": 2.576122760772705 + }, + { + "auxiliary_loss_clip": 0.01112938, + "auxiliary_loss_mlp": 0.01033542, + "balance_loss_clip": 1.04195237, + "balance_loss_mlp": 1.01830769, + "epoch": 0.5658800541109275, + "flos": 14172146807040.0, + "grad_norm": 3.221148840875553, + "language_loss": 0.67855954, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.70002437, + "num_input_tokens_seen": 202736795, + "step": 9412, + "time_per_iteration": 2.6416120529174805 + }, + { + "auxiliary_loss_clip": 0.01104164, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.04040003, + "balance_loss_mlp": 1.02026224, + "epoch": 0.5659401773635954, + "flos": 27305073325440.0, + "grad_norm": 1.6585263288332466, + "language_loss": 0.58582389, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.60718977, + "num_input_tokens_seen": 202756900, + "step": 9413, + "time_per_iteration": 2.678048610687256 + }, + { + "auxiliary_loss_clip": 0.01039217, + "auxiliary_loss_mlp": 0.01044241, + "balance_loss_clip": 1.03433728, + "balance_loss_mlp": 1.02943516, + "epoch": 0.5660003006162634, + "flos": 16728196573440.0, + "grad_norm": 1.5449777270978375, + "language_loss": 0.69369984, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.71453446, + "num_input_tokens_seen": 202775145, + "step": 9414, + "time_per_iteration": 2.7721176147460938 + }, + { + "auxiliary_loss_clip": 0.01048825, + "auxiliary_loss_mlp": 0.01033596, + "balance_loss_clip": 1.04257154, + "balance_loss_mlp": 1.02139449, + "epoch": 0.5660604238689313, + "flos": 21653452535040.0, + "grad_norm": 1.812121190686056, + "language_loss": 0.78028589, + "learning_rate": 1.670659182280247e-06, + "loss": 0.80111009, + "num_input_tokens_seen": 202794505, + "step": 9415, + "time_per_iteration": 3.0027029514312744 + }, + { + "auxiliary_loss_clip": 0.01020707, + "auxiliary_loss_mlp": 0.01005189, + "balance_loss_clip": 1.01482093, + "balance_loss_mlp": 1.00411057, + "epoch": 0.5661205471215993, + "flos": 68824022083200.0, + "grad_norm": 0.6894107195855314, + "language_loss": 0.4917945, + "learning_rate": 1.670275043523822e-06, + "loss": 0.51205349, + "num_input_tokens_seen": 202858580, + "step": 9416, + "time_per_iteration": 3.564145565032959 + }, + { + "auxiliary_loss_clip": 0.01107627, + "auxiliary_loss_mlp": 0.00770936, + "balance_loss_clip": 1.04195189, + "balance_loss_mlp": 1.00020862, + "epoch": 0.5661806703742672, + "flos": 28621774177920.0, + "grad_norm": 1.657672708695628, + "language_loss": 0.62541103, + "learning_rate": 1.6698909172706e-06, + "loss": 0.64419663, + "num_input_tokens_seen": 202878565, + "step": 9417, + "time_per_iteration": 2.6624128818511963 + }, + { + "auxiliary_loss_clip": 0.01098355, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.03992152, + "balance_loss_mlp": 1.02003968, + "epoch": 0.5662407936269352, + "flos": 21397948116480.0, + "grad_norm": 1.9219049023075434, + "language_loss": 0.68760461, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.7089265, + "num_input_tokens_seen": 202897350, + "step": 9418, + "time_per_iteration": 2.686701774597168 + }, + { + "auxiliary_loss_clip": 0.0110608, + "auxiliary_loss_mlp": 0.01034957, + "balance_loss_clip": 1.03848708, + "balance_loss_mlp": 1.01997232, + "epoch": 0.5663009168796032, + "flos": 25660005315840.0, + "grad_norm": 1.8426385136450754, + "language_loss": 0.65225303, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.67366338, + "num_input_tokens_seen": 202916745, + "step": 9419, + "time_per_iteration": 2.7483572959899902 + }, + { + "auxiliary_loss_clip": 0.00978175, + "auxiliary_loss_mlp": 0.01018666, + "balance_loss_clip": 1.01932096, + "balance_loss_mlp": 1.01722336, + "epoch": 0.5663610401322712, + "flos": 67930458422400.0, + "grad_norm": 0.7448874820638522, + "language_loss": 0.59677726, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.61674571, + "num_input_tokens_seen": 202982375, + "step": 9420, + "time_per_iteration": 3.422990083694458 + }, + { + "auxiliary_loss_clip": 0.01098663, + "auxiliary_loss_mlp": 0.00770427, + "balance_loss_clip": 1.0412631, + "balance_loss_mlp": 1.00017929, + "epoch": 0.5664211633849391, + "flos": 24609367480320.0, + "grad_norm": 1.5681535851968893, + "language_loss": 0.74130625, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.75999713, + "num_input_tokens_seen": 203002430, + "step": 9421, + "time_per_iteration": 2.8006680011749268 + }, + { + "auxiliary_loss_clip": 0.0108426, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.03777134, + "balance_loss_mlp": 1.02212119, + "epoch": 0.5664812866376071, + "flos": 11648811352320.0, + "grad_norm": 2.1577016458252567, + "language_loss": 0.72988069, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.75108308, + "num_input_tokens_seen": 203019425, + "step": 9422, + "time_per_iteration": 2.6400234699249268 + }, + { + "auxiliary_loss_clip": 0.01105093, + "auxiliary_loss_mlp": 0.01037861, + "balance_loss_clip": 1.04141676, + "balance_loss_mlp": 1.02530825, + "epoch": 0.566541409890275, + "flos": 24643985212800.0, + "grad_norm": 1.7654112494568213, + "language_loss": 0.81893075, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.84036028, + "num_input_tokens_seen": 203039035, + "step": 9423, + "time_per_iteration": 2.673105239868164 + }, + { + "auxiliary_loss_clip": 0.01090689, + "auxiliary_loss_mlp": 0.01037493, + "balance_loss_clip": 1.03944159, + "balance_loss_mlp": 1.02356339, + "epoch": 0.566601533142943, + "flos": 22270577126400.0, + "grad_norm": 1.4934148877619189, + "language_loss": 0.8075555, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.82883728, + "num_input_tokens_seen": 203059320, + "step": 9424, + "time_per_iteration": 2.6597039699554443 + }, + { + "auxiliary_loss_clip": 0.01124321, + "auxiliary_loss_mlp": 0.00771519, + "balance_loss_clip": 1.04382432, + "balance_loss_mlp": 1.00018978, + "epoch": 0.5666616563956111, + "flos": 29971656218880.0, + "grad_norm": 2.0092362269175297, + "language_loss": 0.78882873, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.80778712, + "num_input_tokens_seen": 203078490, + "step": 9425, + "time_per_iteration": 2.6688153743743896 + }, + { + "auxiliary_loss_clip": 0.01090837, + "auxiliary_loss_mlp": 0.01034858, + "balance_loss_clip": 1.0417583, + "balance_loss_mlp": 1.02170324, + "epoch": 0.566721779648279, + "flos": 17781456101760.0, + "grad_norm": 1.976091068193849, + "language_loss": 0.5920769, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.61333382, + "num_input_tokens_seen": 203096065, + "step": 9426, + "time_per_iteration": 2.6646664142608643 + }, + { + "auxiliary_loss_clip": 0.01110034, + "auxiliary_loss_mlp": 0.01032331, + "balance_loss_clip": 1.04102027, + "balance_loss_mlp": 1.01922381, + "epoch": 0.566781902900947, + "flos": 21033490769280.0, + "grad_norm": 2.110311025280775, + "language_loss": 0.81678975, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.83821344, + "num_input_tokens_seen": 203115270, + "step": 9427, + "time_per_iteration": 5.8222620487213135 + }, + { + "auxiliary_loss_clip": 0.01117064, + "auxiliary_loss_mlp": 0.01038278, + "balance_loss_clip": 1.04323864, + "balance_loss_mlp": 1.02506411, + "epoch": 0.5668420261536149, + "flos": 23148593176320.0, + "grad_norm": 1.814267468057716, + "language_loss": 0.86105633, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.88260972, + "num_input_tokens_seen": 203134290, + "step": 9428, + "time_per_iteration": 4.0940985679626465 + }, + { + "auxiliary_loss_clip": 0.01102233, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.04397511, + "balance_loss_mlp": 1.01996648, + "epoch": 0.5669021494062829, + "flos": 22601601889920.0, + "grad_norm": 2.604927880391597, + "language_loss": 0.73541754, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.75678086, + "num_input_tokens_seen": 203152935, + "step": 9429, + "time_per_iteration": 2.711982011795044 + }, + { + "auxiliary_loss_clip": 0.01100688, + "auxiliary_loss_mlp": 0.00772268, + "balance_loss_clip": 1.04164147, + "balance_loss_mlp": 1.00020123, + "epoch": 0.5669622726589508, + "flos": 17381231786880.0, + "grad_norm": 3.499205688936759, + "language_loss": 0.75380534, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.77253491, + "num_input_tokens_seen": 203170110, + "step": 9430, + "time_per_iteration": 2.775536060333252 + }, + { + "auxiliary_loss_clip": 0.0111876, + "auxiliary_loss_mlp": 0.01036284, + "balance_loss_clip": 1.04125142, + "balance_loss_mlp": 1.02263451, + "epoch": 0.5670223959116188, + "flos": 18763253521920.0, + "grad_norm": 1.7932678929965582, + "language_loss": 0.72862244, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.75017291, + "num_input_tokens_seen": 203188825, + "step": 9431, + "time_per_iteration": 2.7299160957336426 + }, + { + "auxiliary_loss_clip": 0.01068382, + "auxiliary_loss_mlp": 0.01037407, + "balance_loss_clip": 1.03856969, + "balance_loss_mlp": 1.02459264, + "epoch": 0.5670825191642868, + "flos": 13553334276480.0, + "grad_norm": 1.899230938499918, + "language_loss": 0.73544705, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.75650489, + "num_input_tokens_seen": 203206860, + "step": 9432, + "time_per_iteration": 2.68713641166687 + }, + { + "auxiliary_loss_clip": 0.01066627, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.03716183, + "balance_loss_mlp": 1.01875782, + "epoch": 0.5671426424169548, + "flos": 22054035985920.0, + "grad_norm": 1.4657818599236931, + "language_loss": 0.78099382, + "learning_rate": 1.663746609539197e-06, + "loss": 0.80198044, + "num_input_tokens_seen": 203225625, + "step": 9433, + "time_per_iteration": 4.3982954025268555 + }, + { + "auxiliary_loss_clip": 0.01123451, + "auxiliary_loss_mlp": 0.01038623, + "balance_loss_clip": 1.04226542, + "balance_loss_mlp": 1.02239299, + "epoch": 0.5672027656696227, + "flos": 21323972056320.0, + "grad_norm": 1.9415050552486373, + "language_loss": 0.6311425, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.65276325, + "num_input_tokens_seen": 203242920, + "step": 9434, + "time_per_iteration": 2.6829726696014404 + }, + { + "auxiliary_loss_clip": 0.01106985, + "auxiliary_loss_mlp": 0.01029655, + "balance_loss_clip": 1.04066229, + "balance_loss_mlp": 1.01654196, + "epoch": 0.5672628889222907, + "flos": 23514056104320.0, + "grad_norm": 2.0456781967901025, + "language_loss": 0.66337132, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.68473774, + "num_input_tokens_seen": 203261995, + "step": 9435, + "time_per_iteration": 2.7055511474609375 + }, + { + "auxiliary_loss_clip": 0.01092568, + "auxiliary_loss_mlp": 0.00770808, + "balance_loss_clip": 1.03747869, + "balance_loss_mlp": 1.00008333, + "epoch": 0.5673230121749586, + "flos": 27121928855040.0, + "grad_norm": 1.9714061310868114, + "language_loss": 0.71574509, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.73437893, + "num_input_tokens_seen": 203280670, + "step": 9436, + "time_per_iteration": 2.7314302921295166 + }, + { + "auxiliary_loss_clip": 0.01119804, + "auxiliary_loss_mlp": 0.01034867, + "balance_loss_clip": 1.041466, + "balance_loss_mlp": 1.02099752, + "epoch": 0.5673831354276266, + "flos": 31141985149440.0, + "grad_norm": 1.474374193730658, + "language_loss": 0.7411499, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.76269662, + "num_input_tokens_seen": 203304800, + "step": 9437, + "time_per_iteration": 2.6829545497894287 + }, + { + "auxiliary_loss_clip": 0.01115825, + "auxiliary_loss_mlp": 0.01036618, + "balance_loss_clip": 1.04766893, + "balance_loss_mlp": 1.02231264, + "epoch": 0.5674432586802945, + "flos": 27673193859840.0, + "grad_norm": 2.0226289672132096, + "language_loss": 0.6118415, + "learning_rate": 1.661827179985277e-06, + "loss": 0.63336593, + "num_input_tokens_seen": 203324060, + "step": 9438, + "time_per_iteration": 2.6840946674346924 + }, + { + "auxiliary_loss_clip": 0.01097885, + "auxiliary_loss_mlp": 0.01032312, + "balance_loss_clip": 1.03924835, + "balance_loss_mlp": 1.0185318, + "epoch": 0.5675033819329626, + "flos": 26615157822720.0, + "grad_norm": 1.5530482991602657, + "language_loss": 0.75020033, + "learning_rate": 1.661443332486909e-06, + "loss": 0.77150226, + "num_input_tokens_seen": 203344360, + "step": 9439, + "time_per_iteration": 2.6898789405822754 + }, + { + "auxiliary_loss_clip": 0.01092055, + "auxiliary_loss_mlp": 0.01036149, + "balance_loss_clip": 1.04008341, + "balance_loss_mlp": 1.02168322, + "epoch": 0.5675635051856306, + "flos": 19098372435840.0, + "grad_norm": 1.924986803502997, + "language_loss": 0.83848387, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.85976589, + "num_input_tokens_seen": 203362115, + "step": 9440, + "time_per_iteration": 2.7438228130340576 + }, + { + "auxiliary_loss_clip": 0.01087383, + "auxiliary_loss_mlp": 0.01036961, + "balance_loss_clip": 1.0389899, + "balance_loss_mlp": 1.02264404, + "epoch": 0.5676236284382985, + "flos": 17566315591680.0, + "grad_norm": 3.3538120018942843, + "language_loss": 0.75190175, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.7731452, + "num_input_tokens_seen": 203380550, + "step": 9441, + "time_per_iteration": 2.6487948894500732 + }, + { + "auxiliary_loss_clip": 0.01066366, + "auxiliary_loss_mlp": 0.01037451, + "balance_loss_clip": 1.03523147, + "balance_loss_mlp": 1.02376556, + "epoch": 0.5676837516909665, + "flos": 15954069634560.0, + "grad_norm": 1.8078445069287523, + "language_loss": 0.83109975, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.85213792, + "num_input_tokens_seen": 203396590, + "step": 9442, + "time_per_iteration": 2.692474842071533 + }, + { + "auxiliary_loss_clip": 0.01083606, + "auxiliary_loss_mlp": 0.0103585, + "balance_loss_clip": 1.04210138, + "balance_loss_mlp": 1.02311242, + "epoch": 0.5677438749436344, + "flos": 18295912644480.0, + "grad_norm": 2.0214699890453414, + "language_loss": 0.74567246, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.76686704, + "num_input_tokens_seen": 203414280, + "step": 9443, + "time_per_iteration": 2.742173433303833 + }, + { + "auxiliary_loss_clip": 0.01093942, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.04245853, + "balance_loss_mlp": 1.02310669, + "epoch": 0.5678039981963025, + "flos": 17931311642880.0, + "grad_norm": 2.2236359492875817, + "language_loss": 0.77068752, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.79199237, + "num_input_tokens_seen": 203433280, + "step": 9444, + "time_per_iteration": 2.65165376663208 + }, + { + "auxiliary_loss_clip": 0.01083168, + "auxiliary_loss_mlp": 0.01042977, + "balance_loss_clip": 1.04132152, + "balance_loss_mlp": 1.02910697, + "epoch": 0.5678641214489704, + "flos": 19316350120320.0, + "grad_norm": 1.9769562357276376, + "language_loss": 0.80988097, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.83114243, + "num_input_tokens_seen": 203449935, + "step": 9445, + "time_per_iteration": 2.692103147506714 + }, + { + "auxiliary_loss_clip": 0.01115981, + "auxiliary_loss_mlp": 0.01030041, + "balance_loss_clip": 1.03910589, + "balance_loss_mlp": 1.01635599, + "epoch": 0.5679242447016384, + "flos": 27751084502400.0, + "grad_norm": 1.8145653139656197, + "language_loss": 0.71126974, + "learning_rate": 1.658756760280259e-06, + "loss": 0.73272997, + "num_input_tokens_seen": 203473025, + "step": 9446, + "time_per_iteration": 2.6656479835510254 + }, + { + "auxiliary_loss_clip": 0.01084809, + "auxiliary_loss_mlp": 0.01029841, + "balance_loss_clip": 1.03896558, + "balance_loss_mlp": 1.01640046, + "epoch": 0.5679843679543063, + "flos": 23769093646080.0, + "grad_norm": 1.9173533022587075, + "language_loss": 0.73434311, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.75548959, + "num_input_tokens_seen": 203492895, + "step": 9447, + "time_per_iteration": 2.7948012351989746 + }, + { + "auxiliary_loss_clip": 0.01099661, + "auxiliary_loss_mlp": 0.01034648, + "balance_loss_clip": 1.04186499, + "balance_loss_mlp": 1.02139819, + "epoch": 0.5680444912069743, + "flos": 25591883172480.0, + "grad_norm": 3.5475375147623294, + "language_loss": 0.7504915, + "learning_rate": 1.657989284462725e-06, + "loss": 0.77183461, + "num_input_tokens_seen": 203513710, + "step": 9448, + "time_per_iteration": 2.700333595275879 + }, + { + "auxiliary_loss_clip": 0.01079167, + "auxiliary_loss_mlp": 0.01049109, + "balance_loss_clip": 1.04264426, + "balance_loss_mlp": 1.0336951, + "epoch": 0.5681046144596422, + "flos": 23695799944320.0, + "grad_norm": 2.3399913967333865, + "language_loss": 0.76352537, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.78480804, + "num_input_tokens_seen": 203531630, + "step": 9449, + "time_per_iteration": 2.7736854553222656 + }, + { + "auxiliary_loss_clip": 0.01096359, + "auxiliary_loss_mlp": 0.01042326, + "balance_loss_clip": 1.04059768, + "balance_loss_mlp": 1.02729404, + "epoch": 0.5681647377123102, + "flos": 28000770917760.0, + "grad_norm": 1.7507923980752478, + "language_loss": 0.74660265, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.76798952, + "num_input_tokens_seen": 203551885, + "step": 9450, + "time_per_iteration": 2.749420642852783 + }, + { + "auxiliary_loss_clip": 0.01102012, + "auxiliary_loss_mlp": 0.01039617, + "balance_loss_clip": 1.04193068, + "balance_loss_mlp": 1.02674794, + "epoch": 0.5682248609649782, + "flos": 22747758330240.0, + "grad_norm": 2.689223250754005, + "language_loss": 0.66906244, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.69047868, + "num_input_tokens_seen": 203572250, + "step": 9451, + "time_per_iteration": 2.753199338912964 + }, + { + "auxiliary_loss_clip": 0.01096067, + "auxiliary_loss_mlp": 0.01038718, + "balance_loss_clip": 1.03942561, + "balance_loss_mlp": 1.02265501, + "epoch": 0.5682849842176462, + "flos": 21288600138240.0, + "grad_norm": 3.0838986562683557, + "language_loss": 0.71882987, + "learning_rate": 1.656454488573026e-06, + "loss": 0.74017769, + "num_input_tokens_seen": 203590605, + "step": 9452, + "time_per_iteration": 2.6950924396514893 + }, + { + "auxiliary_loss_clip": 0.01076417, + "auxiliary_loss_mlp": 0.01030065, + "balance_loss_clip": 1.03938448, + "balance_loss_mlp": 1.01734543, + "epoch": 0.5683451074703142, + "flos": 21141689512320.0, + "grad_norm": 1.8642874843773423, + "language_loss": 0.70013601, + "learning_rate": 1.656070822132428e-06, + "loss": 0.72120082, + "num_input_tokens_seen": 203610080, + "step": 9453, + "time_per_iteration": 2.7006165981292725 + }, + { + "auxiliary_loss_clip": 0.01076829, + "auxiliary_loss_mlp": 0.00769854, + "balance_loss_clip": 1.04066825, + "balance_loss_mlp": 1.00014949, + "epoch": 0.5684052307229821, + "flos": 22344481359360.0, + "grad_norm": 2.037972918051024, + "language_loss": 0.70139372, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.71986055, + "num_input_tokens_seen": 203630060, + "step": 9454, + "time_per_iteration": 2.759376287460327 + }, + { + "auxiliary_loss_clip": 0.01095428, + "auxiliary_loss_mlp": 0.01031911, + "balance_loss_clip": 1.03987896, + "balance_loss_mlp": 1.01938248, + "epoch": 0.5684653539756501, + "flos": 21798639308160.0, + "grad_norm": 1.989743078970872, + "language_loss": 0.6078186, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.62909198, + "num_input_tokens_seen": 203649065, + "step": 9455, + "time_per_iteration": 2.678152322769165 + }, + { + "auxiliary_loss_clip": 0.01082741, + "auxiliary_loss_mlp": 0.0103652, + "balance_loss_clip": 1.0447154, + "balance_loss_mlp": 1.02253652, + "epoch": 0.568525477228318, + "flos": 22999635475200.0, + "grad_norm": 4.296474832454859, + "language_loss": 0.73108375, + "learning_rate": 1.6549199011198e-06, + "loss": 0.75227636, + "num_input_tokens_seen": 203667545, + "step": 9456, + "time_per_iteration": 2.7307004928588867 + }, + { + "auxiliary_loss_clip": 0.01099598, + "auxiliary_loss_mlp": 0.01031688, + "balance_loss_clip": 1.04188192, + "balance_loss_mlp": 1.01902199, + "epoch": 0.568585600480986, + "flos": 21392489249280.0, + "grad_norm": 1.662795047431792, + "language_loss": 0.77013254, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.79144537, + "num_input_tokens_seen": 203686025, + "step": 9457, + "time_per_iteration": 2.665708303451538 + }, + { + "auxiliary_loss_clip": 0.01111194, + "auxiliary_loss_mlp": 0.01036842, + "balance_loss_clip": 1.0429163, + "balance_loss_mlp": 1.02280521, + "epoch": 0.568645723733654, + "flos": 30007351359360.0, + "grad_norm": 2.0672888051412817, + "language_loss": 0.66191971, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.68340003, + "num_input_tokens_seen": 203705540, + "step": 9458, + "time_per_iteration": 2.780771017074585 + }, + { + "auxiliary_loss_clip": 0.01110997, + "auxiliary_loss_mlp": 0.01031454, + "balance_loss_clip": 1.04201722, + "balance_loss_mlp": 1.01742291, + "epoch": 0.568705846986322, + "flos": 20412667077120.0, + "grad_norm": 2.504426538314312, + "language_loss": 0.68920743, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.71063197, + "num_input_tokens_seen": 203723670, + "step": 9459, + "time_per_iteration": 2.637176513671875 + }, + { + "auxiliary_loss_clip": 0.01095236, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.0442152, + "balance_loss_mlp": 1.01929152, + "epoch": 0.5687659702389899, + "flos": 17456752131840.0, + "grad_norm": 2.127788828908428, + "language_loss": 0.76758575, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.788867, + "num_input_tokens_seen": 203739705, + "step": 9460, + "time_per_iteration": 2.7338075637817383 + }, + { + "auxiliary_loss_clip": 0.01066336, + "auxiliary_loss_mlp": 0.0103936, + "balance_loss_clip": 1.04204893, + "balance_loss_mlp": 1.02563262, + "epoch": 0.5688260934916579, + "flos": 25406081095680.0, + "grad_norm": 1.8378075196350074, + "language_loss": 0.71994978, + "learning_rate": 1.65300196133547e-06, + "loss": 0.74100673, + "num_input_tokens_seen": 203759000, + "step": 9461, + "time_per_iteration": 2.9295692443847656 + }, + { + "auxiliary_loss_clip": 0.01110974, + "auxiliary_loss_mlp": 0.01036974, + "balance_loss_clip": 1.04267561, + "balance_loss_mlp": 1.02314544, + "epoch": 0.5688862167443258, + "flos": 21608024808960.0, + "grad_norm": 2.3363777583338794, + "language_loss": 0.73092425, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.75240374, + "num_input_tokens_seen": 203774295, + "step": 9462, + "time_per_iteration": 2.639132022857666 + }, + { + "auxiliary_loss_clip": 0.01105415, + "auxiliary_loss_mlp": 0.01026496, + "balance_loss_clip": 1.03986573, + "balance_loss_mlp": 1.01507592, + "epoch": 0.5689463399969938, + "flos": 22418996123520.0, + "grad_norm": 1.9966058203681178, + "language_loss": 0.72878397, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.75010306, + "num_input_tokens_seen": 203792710, + "step": 9463, + "time_per_iteration": 2.687623977661133 + }, + { + "auxiliary_loss_clip": 0.01108157, + "auxiliary_loss_mlp": 0.01032686, + "balance_loss_clip": 1.04214895, + "balance_loss_mlp": 1.02012718, + "epoch": 0.5690064632496618, + "flos": 18296810484480.0, + "grad_norm": 2.136514167684146, + "language_loss": 0.73800778, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.75941622, + "num_input_tokens_seen": 203811645, + "step": 9464, + "time_per_iteration": 2.6446449756622314 + }, + { + "auxiliary_loss_clip": 0.01110623, + "auxiliary_loss_mlp": 0.0077176, + "balance_loss_clip": 1.04163098, + "balance_loss_mlp": 1.00012827, + "epoch": 0.5690665865023298, + "flos": 21579260993280.0, + "grad_norm": 2.0135063282733108, + "language_loss": 0.84068149, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.85950536, + "num_input_tokens_seen": 203830040, + "step": 9465, + "time_per_iteration": 2.6243364810943604 + }, + { + "auxiliary_loss_clip": 0.01092541, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.03678536, + "balance_loss_mlp": 1.0195086, + "epoch": 0.5691267097549978, + "flos": 24421446501120.0, + "grad_norm": 1.6434295280058835, + "language_loss": 0.72125626, + "learning_rate": 1.651084350506125e-06, + "loss": 0.74250448, + "num_input_tokens_seen": 203851245, + "step": 9466, + "time_per_iteration": 5.837533712387085 + }, + { + "auxiliary_loss_clip": 0.01016007, + "auxiliary_loss_mlp": 0.01001581, + "balance_loss_clip": 1.01873374, + "balance_loss_mlp": 1.00037718, + "epoch": 0.5691868330076657, + "flos": 61657906199040.0, + "grad_norm": 0.7155703714304625, + "language_loss": 0.55334294, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.57351875, + "num_input_tokens_seen": 203916400, + "step": 9467, + "time_per_iteration": 4.8396992683410645 + }, + { + "auxiliary_loss_clip": 0.01107605, + "auxiliary_loss_mlp": 0.0103869, + "balance_loss_clip": 1.04263473, + "balance_loss_mlp": 1.02364564, + "epoch": 0.5692469562603337, + "flos": 21325193118720.0, + "grad_norm": 16.186384536861027, + "language_loss": 0.6343258, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.65578872, + "num_input_tokens_seen": 203935870, + "step": 9468, + "time_per_iteration": 2.6212332248687744 + }, + { + "auxiliary_loss_clip": 0.01066902, + "auxiliary_loss_mlp": 0.01038069, + "balance_loss_clip": 1.03614831, + "balance_loss_mlp": 1.02334094, + "epoch": 0.5693070795130016, + "flos": 23367899664000.0, + "grad_norm": 2.927691999708818, + "language_loss": 0.78902012, + "learning_rate": 1.64993394266317e-06, + "loss": 0.81006986, + "num_input_tokens_seen": 203954950, + "step": 9469, + "time_per_iteration": 2.745016098022461 + }, + { + "auxiliary_loss_clip": 0.01085393, + "auxiliary_loss_mlp": 0.01053274, + "balance_loss_clip": 1.04159784, + "balance_loss_mlp": 1.03830147, + "epoch": 0.5693672027656697, + "flos": 18697250280960.0, + "grad_norm": 2.217720738619104, + "language_loss": 0.69655335, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.71793997, + "num_input_tokens_seen": 203972715, + "step": 9470, + "time_per_iteration": 2.6895534992218018 + }, + { + "auxiliary_loss_clip": 0.01097198, + "auxiliary_loss_mlp": 0.01036868, + "balance_loss_clip": 1.04529762, + "balance_loss_mlp": 1.02352285, + "epoch": 0.5694273260183376, + "flos": 20449188230400.0, + "grad_norm": 1.6026966116267123, + "language_loss": 0.74473977, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.76608038, + "num_input_tokens_seen": 203990775, + "step": 9471, + "time_per_iteration": 2.6734213829040527 + }, + { + "auxiliary_loss_clip": 0.01077759, + "auxiliary_loss_mlp": 0.01040388, + "balance_loss_clip": 1.0421195, + "balance_loss_mlp": 1.02579701, + "epoch": 0.5694874492710056, + "flos": 17603195880960.0, + "grad_norm": 1.75714793559233, + "language_loss": 0.57588744, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.59706891, + "num_input_tokens_seen": 204008845, + "step": 9472, + "time_per_iteration": 4.335491180419922 + }, + { + "auxiliary_loss_clip": 0.01082559, + "auxiliary_loss_mlp": 0.01032344, + "balance_loss_clip": 1.03902221, + "balance_loss_mlp": 1.01946378, + "epoch": 0.5695475725236735, + "flos": 13370836250880.0, + "grad_norm": 1.9281443896441626, + "language_loss": 0.73845899, + "learning_rate": 1.648400251450638e-06, + "loss": 0.75960797, + "num_input_tokens_seen": 204023755, + "step": 9473, + "time_per_iteration": 2.706148147583008 + }, + { + "auxiliary_loss_clip": 0.01017729, + "auxiliary_loss_mlp": 0.01007582, + "balance_loss_clip": 1.02078795, + "balance_loss_mlp": 1.00631857, + "epoch": 0.5696076957763415, + "flos": 68174398661760.0, + "grad_norm": 0.6469732305814715, + "language_loss": 0.57547617, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.59572935, + "num_input_tokens_seen": 204091255, + "step": 9474, + "time_per_iteration": 3.2811825275421143 + }, + { + "auxiliary_loss_clip": 0.01106855, + "auxiliary_loss_mlp": 0.01038889, + "balance_loss_clip": 1.04254341, + "balance_loss_mlp": 1.02532923, + "epoch": 0.5696678190290094, + "flos": 33838301525760.0, + "grad_norm": 2.207374996280549, + "language_loss": 0.53488398, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.55634141, + "num_input_tokens_seen": 204113285, + "step": 9475, + "time_per_iteration": 2.701791524887085 + }, + { + "auxiliary_loss_clip": 0.01122912, + "auxiliary_loss_mlp": 0.01039618, + "balance_loss_clip": 1.04524517, + "balance_loss_mlp": 1.0262965, + "epoch": 0.5697279422816774, + "flos": 26356600748160.0, + "grad_norm": 1.6070261580589493, + "language_loss": 0.79622197, + "learning_rate": 1.647250122983675e-06, + "loss": 0.81784725, + "num_input_tokens_seen": 204133045, + "step": 9476, + "time_per_iteration": 2.695966958999634 + }, + { + "auxiliary_loss_clip": 0.01101607, + "auxiliary_loss_mlp": 0.01038712, + "balance_loss_clip": 1.04603529, + "balance_loss_mlp": 1.0258019, + "epoch": 0.5697880655343454, + "flos": 22930507751040.0, + "grad_norm": 1.9576279407758228, + "language_loss": 0.66811013, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.68951333, + "num_input_tokens_seen": 204152590, + "step": 9477, + "time_per_iteration": 2.6981940269470215 + }, + { + "auxiliary_loss_clip": 0.0108821, + "auxiliary_loss_mlp": 0.01037709, + "balance_loss_clip": 1.04286826, + "balance_loss_mlp": 1.02403569, + "epoch": 0.5698481887870134, + "flos": 26761314263040.0, + "grad_norm": 1.587062911340377, + "language_loss": 0.70738614, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.72864532, + "num_input_tokens_seen": 204171815, + "step": 9478, + "time_per_iteration": 2.779813766479492 + }, + { + "auxiliary_loss_clip": 0.01084042, + "auxiliary_loss_mlp": 0.01031832, + "balance_loss_clip": 1.03916287, + "balance_loss_mlp": 1.0200479, + "epoch": 0.5699083120396814, + "flos": 15742269089280.0, + "grad_norm": 4.484039953055517, + "language_loss": 0.6938777, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.71503651, + "num_input_tokens_seen": 204188535, + "step": 9479, + "time_per_iteration": 2.712655782699585 + }, + { + "auxiliary_loss_clip": 0.01078443, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.04121661, + "balance_loss_mlp": 1.02166843, + "epoch": 0.5699684352923493, + "flos": 19537272720000.0, + "grad_norm": 2.2062311419155205, + "language_loss": 0.71329868, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.73441678, + "num_input_tokens_seen": 204208365, + "step": 9480, + "time_per_iteration": 2.727628469467163 + }, + { + "auxiliary_loss_clip": 0.01089043, + "auxiliary_loss_mlp": 0.00769268, + "balance_loss_clip": 1.04188204, + "balance_loss_mlp": 1.00015211, + "epoch": 0.5700285585450173, + "flos": 16253349753600.0, + "grad_norm": 2.49302312393396, + "language_loss": 0.7201618, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.73874491, + "num_input_tokens_seen": 204226560, + "step": 9481, + "time_per_iteration": 2.6870779991149902 + }, + { + "auxiliary_loss_clip": 0.01111632, + "auxiliary_loss_mlp": 0.01037308, + "balance_loss_clip": 1.04494166, + "balance_loss_mlp": 1.02441525, + "epoch": 0.5700886817976852, + "flos": 19864993432320.0, + "grad_norm": 2.3265371075794046, + "language_loss": 0.78086042, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.80234993, + "num_input_tokens_seen": 204245410, + "step": 9482, + "time_per_iteration": 2.648545742034912 + }, + { + "auxiliary_loss_clip": 0.01099058, + "auxiliary_loss_mlp": 0.01031061, + "balance_loss_clip": 1.04446602, + "balance_loss_mlp": 1.01918221, + "epoch": 0.5701488050503533, + "flos": 23841704989440.0, + "grad_norm": 1.4982420423731841, + "language_loss": 0.77999502, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.80129617, + "num_input_tokens_seen": 204264840, + "step": 9483, + "time_per_iteration": 2.716085910797119 + }, + { + "auxiliary_loss_clip": 0.01098634, + "auxiliary_loss_mlp": 0.01043773, + "balance_loss_clip": 1.04435062, + "balance_loss_mlp": 1.03189337, + "epoch": 0.5702089283030212, + "flos": 23659673840640.0, + "grad_norm": 1.773078274148673, + "language_loss": 0.81291378, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.83433783, + "num_input_tokens_seen": 204284335, + "step": 9484, + "time_per_iteration": 2.7283802032470703 + }, + { + "auxiliary_loss_clip": 0.01120809, + "auxiliary_loss_mlp": 0.0077026, + "balance_loss_clip": 1.04378128, + "balance_loss_mlp": 1.00009394, + "epoch": 0.5702690515556892, + "flos": 27891171544320.0, + "grad_norm": 294.9687469035841, + "language_loss": 0.60670495, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.6256156, + "num_input_tokens_seen": 204302590, + "step": 9485, + "time_per_iteration": 2.7182137966156006 + }, + { + "auxiliary_loss_clip": 0.01107456, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.04291701, + "balance_loss_mlp": 1.02048063, + "epoch": 0.5703291748083571, + "flos": 24023951619840.0, + "grad_norm": 2.0199937842049676, + "language_loss": 0.65740418, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.67881644, + "num_input_tokens_seen": 204323055, + "step": 9486, + "time_per_iteration": 2.7076590061187744 + }, + { + "auxiliary_loss_clip": 0.01026531, + "auxiliary_loss_mlp": 0.01001416, + "balance_loss_clip": 1.02014589, + "balance_loss_mlp": 1.00029588, + "epoch": 0.5703892980610251, + "flos": 57023382919680.0, + "grad_norm": 0.6682653451732087, + "language_loss": 0.47990364, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.50018317, + "num_input_tokens_seen": 204386160, + "step": 9487, + "time_per_iteration": 3.3227086067199707 + }, + { + "auxiliary_loss_clip": 0.0108502, + "auxiliary_loss_mlp": 0.00770885, + "balance_loss_clip": 1.04171848, + "balance_loss_mlp": 1.00012314, + "epoch": 0.570449421313693, + "flos": 24351025887360.0, + "grad_norm": 1.5998860502141972, + "language_loss": 0.85676056, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.87531954, + "num_input_tokens_seen": 204406315, + "step": 9488, + "time_per_iteration": 2.7443041801452637 + }, + { + "auxiliary_loss_clip": 0.01084932, + "auxiliary_loss_mlp": 0.01036169, + "balance_loss_clip": 1.04087424, + "balance_loss_mlp": 1.02245951, + "epoch": 0.570509544566361, + "flos": 24828566227200.0, + "grad_norm": 1.4382001019160457, + "language_loss": 0.78847331, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.80968434, + "num_input_tokens_seen": 204427645, + "step": 9489, + "time_per_iteration": 2.7456719875335693 + }, + { + "auxiliary_loss_clip": 0.01099206, + "auxiliary_loss_mlp": 0.01028445, + "balance_loss_clip": 1.04345155, + "balance_loss_mlp": 1.01655364, + "epoch": 0.570569667819029, + "flos": 21397301671680.0, + "grad_norm": 1.7750907148912565, + "language_loss": 0.70044166, + "learning_rate": 1.641884454927604e-06, + "loss": 0.72171819, + "num_input_tokens_seen": 204445910, + "step": 9490, + "time_per_iteration": 2.646172046661377 + }, + { + "auxiliary_loss_clip": 0.01085076, + "auxiliary_loss_mlp": 0.0103304, + "balance_loss_clip": 1.04102945, + "balance_loss_mlp": 1.02055264, + "epoch": 0.570629791071697, + "flos": 23216751233280.0, + "grad_norm": 1.5662629922292932, + "language_loss": 0.76374, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.78492117, + "num_input_tokens_seen": 204464680, + "step": 9491, + "time_per_iteration": 2.686228036880493 + }, + { + "auxiliary_loss_clip": 0.01010704, + "auxiliary_loss_mlp": 0.0075136, + "balance_loss_clip": 1.01657176, + "balance_loss_mlp": 0.99964297, + "epoch": 0.570689914324365, + "flos": 65284666525440.0, + "grad_norm": 0.7940313966382696, + "language_loss": 0.57365447, + "learning_rate": 1.641118147266011e-06, + "loss": 0.5912751, + "num_input_tokens_seen": 204525580, + "step": 9492, + "time_per_iteration": 3.275951623916626 + }, + { + "auxiliary_loss_clip": 0.01091927, + "auxiliary_loss_mlp": 0.00770164, + "balance_loss_clip": 1.0425539, + "balance_loss_mlp": 1.00009966, + "epoch": 0.5707500375770329, + "flos": 21141904993920.0, + "grad_norm": 1.811585397599456, + "language_loss": 0.71563506, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.73425597, + "num_input_tokens_seen": 204541320, + "step": 9493, + "time_per_iteration": 2.6741974353790283 + }, + { + "auxiliary_loss_clip": 0.0112282, + "auxiliary_loss_mlp": 0.01032391, + "balance_loss_clip": 1.0450213, + "balance_loss_mlp": 1.01957655, + "epoch": 0.5708101608297009, + "flos": 20812747737600.0, + "grad_norm": 1.647557383472974, + "language_loss": 0.7782768, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.79982895, + "num_input_tokens_seen": 204560275, + "step": 9494, + "time_per_iteration": 2.6302967071533203 + }, + { + "auxiliary_loss_clip": 0.01124725, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_clip": 1.04463601, + "balance_loss_mlp": 1.01786041, + "epoch": 0.5708702840823688, + "flos": 25812338895360.0, + "grad_norm": 2.0991801198395166, + "language_loss": 0.80634642, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.82790697, + "num_input_tokens_seen": 204579430, + "step": 9495, + "time_per_iteration": 2.628124237060547 + }, + { + "auxiliary_loss_clip": 0.01077213, + "auxiliary_loss_mlp": 0.01041189, + "balance_loss_clip": 1.03985035, + "balance_loss_mlp": 1.02567959, + "epoch": 0.5709304073350369, + "flos": 23651916503040.0, + "grad_norm": 2.1559343585674067, + "language_loss": 0.66669941, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.68788344, + "num_input_tokens_seen": 204597710, + "step": 9496, + "time_per_iteration": 2.7877724170684814 + }, + { + "auxiliary_loss_clip": 0.01125369, + "auxiliary_loss_mlp": 0.01038193, + "balance_loss_clip": 1.04586279, + "balance_loss_mlp": 1.02479351, + "epoch": 0.5709905305877048, + "flos": 16107552449280.0, + "grad_norm": 2.6392695697640387, + "language_loss": 0.69406897, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.71570456, + "num_input_tokens_seen": 204616140, + "step": 9497, + "time_per_iteration": 2.5715434551239014 + }, + { + "auxiliary_loss_clip": 0.01107343, + "auxiliary_loss_mlp": 0.00770833, + "balance_loss_clip": 1.04470205, + "balance_loss_mlp": 1.00010204, + "epoch": 0.5710506538403728, + "flos": 24750819239040.0, + "grad_norm": 2.381002532737965, + "language_loss": 0.81296104, + "learning_rate": 1.638819551358182e-06, + "loss": 0.83174282, + "num_input_tokens_seen": 204636470, + "step": 9498, + "time_per_iteration": 2.7146875858306885 + }, + { + "auxiliary_loss_clip": 0.01122241, + "auxiliary_loss_mlp": 0.01039082, + "balance_loss_clip": 1.04371977, + "balance_loss_mlp": 1.02453244, + "epoch": 0.5711107770930407, + "flos": 21982250655360.0, + "grad_norm": 1.8640767096069095, + "language_loss": 0.66366005, + "learning_rate": 1.638436499891469e-06, + "loss": 0.68527335, + "num_input_tokens_seen": 204656640, + "step": 9499, + "time_per_iteration": 2.59460711479187 + }, + { + "auxiliary_loss_clip": 0.01090983, + "auxiliary_loss_mlp": 0.01034376, + "balance_loss_clip": 1.04218864, + "balance_loss_mlp": 1.02126861, + "epoch": 0.5711709003457087, + "flos": 19574009354880.0, + "grad_norm": 1.5439081268362653, + "language_loss": 0.71755552, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.73880911, + "num_input_tokens_seen": 204675475, + "step": 9500, + "time_per_iteration": 2.6723949909210205 + }, + { + "auxiliary_loss_clip": 0.01092856, + "auxiliary_loss_mlp": 0.01032614, + "balance_loss_clip": 1.04149878, + "balance_loss_mlp": 1.01973963, + "epoch": 0.5712310235983766, + "flos": 24242683489920.0, + "grad_norm": 1.9336466751975971, + "language_loss": 0.76224887, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.78350353, + "num_input_tokens_seen": 204695385, + "step": 9501, + "time_per_iteration": 2.7653119564056396 + }, + { + "auxiliary_loss_clip": 0.01101056, + "auxiliary_loss_mlp": 0.01035695, + "balance_loss_clip": 1.04289281, + "balance_loss_mlp": 1.02269506, + "epoch": 0.5712911468510447, + "flos": 20996143603200.0, + "grad_norm": 1.6146609274124086, + "language_loss": 0.75141633, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.77278382, + "num_input_tokens_seen": 204714730, + "step": 9502, + "time_per_iteration": 2.6820828914642334 + }, + { + "auxiliary_loss_clip": 0.01088314, + "auxiliary_loss_mlp": 0.01027948, + "balance_loss_clip": 1.04387522, + "balance_loss_mlp": 1.01492405, + "epoch": 0.5713512701037126, + "flos": 18916987731840.0, + "grad_norm": 1.5621825440350152, + "language_loss": 0.82271576, + "learning_rate": 1.636904431275105e-06, + "loss": 0.84387839, + "num_input_tokens_seen": 204735025, + "step": 9503, + "time_per_iteration": 2.663109302520752 + }, + { + "auxiliary_loss_clip": 0.01085944, + "auxiliary_loss_mlp": 0.01033945, + "balance_loss_clip": 1.04204583, + "balance_loss_mlp": 1.02192843, + "epoch": 0.5714113933563806, + "flos": 17413443308160.0, + "grad_norm": 2.684901451113001, + "language_loss": 0.86263931, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.88383818, + "num_input_tokens_seen": 204751365, + "step": 9504, + "time_per_iteration": 2.763122320175171 + }, + { + "auxiliary_loss_clip": 0.01075538, + "auxiliary_loss_mlp": 0.01028568, + "balance_loss_clip": 1.04011607, + "balance_loss_mlp": 1.01565766, + "epoch": 0.5714715166090486, + "flos": 20193360589440.0, + "grad_norm": 1.7486163539852246, + "language_loss": 0.75459665, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.77563769, + "num_input_tokens_seen": 204768980, + "step": 9505, + "time_per_iteration": 4.519685506820679 + }, + { + "auxiliary_loss_clip": 0.0111822, + "auxiliary_loss_mlp": 0.0103209, + "balance_loss_clip": 1.04235733, + "balance_loss_mlp": 1.02002621, + "epoch": 0.5715316398617165, + "flos": 18551668458240.0, + "grad_norm": 1.4826686830874622, + "language_loss": 0.81888402, + "learning_rate": 1.635755524332509e-06, + "loss": 0.84038711, + "num_input_tokens_seen": 204788110, + "step": 9506, + "time_per_iteration": 5.6948935985565186 + }, + { + "auxiliary_loss_clip": 0.01080083, + "auxiliary_loss_mlp": 0.00770857, + "balance_loss_clip": 1.03905082, + "balance_loss_mlp": 1.00010204, + "epoch": 0.5715917631143845, + "flos": 18478195188480.0, + "grad_norm": 1.7330193393772828, + "language_loss": 0.77595812, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.79446745, + "num_input_tokens_seen": 204807240, + "step": 9507, + "time_per_iteration": 2.7299420833587646 + }, + { + "auxiliary_loss_clip": 0.01098783, + "auxiliary_loss_mlp": 0.01037694, + "balance_loss_clip": 1.04040074, + "balance_loss_mlp": 1.02350807, + "epoch": 0.5716518863670524, + "flos": 24020037037440.0, + "grad_norm": 1.9478835056583133, + "language_loss": 0.6852861, + "learning_rate": 1.63498965540751e-06, + "loss": 0.70665085, + "num_input_tokens_seen": 204826415, + "step": 9508, + "time_per_iteration": 2.7023262977600098 + }, + { + "auxiliary_loss_clip": 0.01121987, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.04333735, + "balance_loss_mlp": 1.01777184, + "epoch": 0.5717120096197205, + "flos": 17819485626240.0, + "grad_norm": 2.087333212498838, + "language_loss": 0.80104595, + "learning_rate": 1.634606741699593e-06, + "loss": 0.82257771, + "num_input_tokens_seen": 204844305, + "step": 9509, + "time_per_iteration": 2.6331591606140137 + }, + { + "auxiliary_loss_clip": 0.01104906, + "auxiliary_loss_mlp": 0.01033683, + "balance_loss_clip": 1.04276729, + "balance_loss_mlp": 1.02071953, + "epoch": 0.5717721328723884, + "flos": 21866043179520.0, + "grad_norm": 1.9468766397229225, + "language_loss": 0.71857727, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.73996317, + "num_input_tokens_seen": 204861765, + "step": 9510, + "time_per_iteration": 2.6763837337493896 + }, + { + "auxiliary_loss_clip": 0.01096671, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.04109251, + "balance_loss_mlp": 1.01920152, + "epoch": 0.5718322561250564, + "flos": 28437624126720.0, + "grad_norm": 1.5755083758344817, + "language_loss": 0.69395983, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.71524119, + "num_input_tokens_seen": 204882505, + "step": 9511, + "time_per_iteration": 4.320638418197632 + }, + { + "auxiliary_loss_clip": 0.01097735, + "auxiliary_loss_mlp": 0.01035503, + "balance_loss_clip": 1.04172611, + "balance_loss_mlp": 1.02338552, + "epoch": 0.5718923793777243, + "flos": 13551825905280.0, + "grad_norm": 2.0067389560068047, + "language_loss": 0.6147874, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.63611984, + "num_input_tokens_seen": 204899830, + "step": 9512, + "time_per_iteration": 2.759669065475464 + }, + { + "auxiliary_loss_clip": 0.01095927, + "auxiliary_loss_mlp": 0.01029716, + "balance_loss_clip": 1.04188657, + "balance_loss_mlp": 1.01700234, + "epoch": 0.5719525026303923, + "flos": 17822035491840.0, + "grad_norm": 2.401258082797128, + "language_loss": 0.76018667, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.78144312, + "num_input_tokens_seen": 204918100, + "step": 9513, + "time_per_iteration": 2.7117698192596436 + }, + { + "auxiliary_loss_clip": 0.01030995, + "auxiliary_loss_mlp": 0.00999994, + "balance_loss_clip": 1.01519012, + "balance_loss_mlp": 0.99881381, + "epoch": 0.5720126258830602, + "flos": 61298042814720.0, + "grad_norm": 0.8987559536316853, + "language_loss": 0.66807652, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.68838638, + "num_input_tokens_seen": 204972925, + "step": 9514, + "time_per_iteration": 3.1701343059539795 + }, + { + "auxiliary_loss_clip": 0.01114643, + "auxiliary_loss_mlp": 0.01042868, + "balance_loss_clip": 1.0446943, + "balance_loss_mlp": 1.02930832, + "epoch": 0.5720727491357283, + "flos": 23988040997760.0, + "grad_norm": 2.0869347470902704, + "language_loss": 0.81355566, + "learning_rate": 1.63230955093099e-06, + "loss": 0.83513075, + "num_input_tokens_seen": 204990910, + "step": 9515, + "time_per_iteration": 2.668982744216919 + }, + { + "auxiliary_loss_clip": 0.01098965, + "auxiliary_loss_mlp": 0.01032673, + "balance_loss_clip": 1.04036427, + "balance_loss_mlp": 1.01993597, + "epoch": 0.5721328723883962, + "flos": 23405426398080.0, + "grad_norm": 3.1746972716468664, + "language_loss": 0.85928082, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.88059723, + "num_input_tokens_seen": 205010500, + "step": 9516, + "time_per_iteration": 2.6741178035736084 + }, + { + "auxiliary_loss_clip": 0.01083742, + "auxiliary_loss_mlp": 0.01031013, + "balance_loss_clip": 1.04019785, + "balance_loss_mlp": 1.01784658, + "epoch": 0.5721929956410642, + "flos": 18804910320000.0, + "grad_norm": 1.8608727945257042, + "language_loss": 0.87884629, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.8999939, + "num_input_tokens_seen": 205028560, + "step": 9517, + "time_per_iteration": 2.699981451034546 + }, + { + "auxiliary_loss_clip": 0.01066403, + "auxiliary_loss_mlp": 0.01031636, + "balance_loss_clip": 1.03665698, + "balance_loss_mlp": 1.01866579, + "epoch": 0.5722531188937322, + "flos": 27196659100800.0, + "grad_norm": 1.632945668541975, + "language_loss": 0.85146403, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.87244439, + "num_input_tokens_seen": 205048650, + "step": 9518, + "time_per_iteration": 2.8667659759521484 + }, + { + "auxiliary_loss_clip": 0.01104733, + "auxiliary_loss_mlp": 0.01033736, + "balance_loss_clip": 1.04255366, + "balance_loss_mlp": 1.02131414, + "epoch": 0.5723132421464001, + "flos": 15195672852480.0, + "grad_norm": 1.838676422571758, + "language_loss": 0.7901606, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.81154531, + "num_input_tokens_seen": 205066480, + "step": 9519, + "time_per_iteration": 2.593822479248047 + }, + { + "auxiliary_loss_clip": 0.01117664, + "auxiliary_loss_mlp": 0.01029991, + "balance_loss_clip": 1.04276991, + "balance_loss_mlp": 1.01755762, + "epoch": 0.5723733653990681, + "flos": 27599433281280.0, + "grad_norm": 1.4978137038182386, + "language_loss": 0.83191645, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.85339302, + "num_input_tokens_seen": 205087475, + "step": 9520, + "time_per_iteration": 2.664851427078247 + }, + { + "auxiliary_loss_clip": 0.01098568, + "auxiliary_loss_mlp": 0.01044625, + "balance_loss_clip": 1.04248536, + "balance_loss_mlp": 1.03115487, + "epoch": 0.572433488651736, + "flos": 18222870337920.0, + "grad_norm": 2.1952309591015267, + "language_loss": 0.72542965, + "learning_rate": 1.630012862105243e-06, + "loss": 0.74686158, + "num_input_tokens_seen": 205106495, + "step": 9521, + "time_per_iteration": 2.7253611087799072 + }, + { + "auxiliary_loss_clip": 0.011175, + "auxiliary_loss_mlp": 0.00769564, + "balance_loss_clip": 1.04164016, + "balance_loss_mlp": 1.00010264, + "epoch": 0.5724936119044041, + "flos": 31249106484480.0, + "grad_norm": 2.153094973040902, + "language_loss": 0.78315163, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.80202222, + "num_input_tokens_seen": 205128285, + "step": 9522, + "time_per_iteration": 2.6890037059783936 + }, + { + "auxiliary_loss_clip": 0.01088616, + "auxiliary_loss_mlp": 0.01034098, + "balance_loss_clip": 1.04117084, + "balance_loss_mlp": 1.02251649, + "epoch": 0.572553735157072, + "flos": 19202189719680.0, + "grad_norm": 1.511112661891623, + "language_loss": 0.71476662, + "learning_rate": 1.629247411248102e-06, + "loss": 0.73599374, + "num_input_tokens_seen": 205146595, + "step": 9523, + "time_per_iteration": 2.6567182540893555 + }, + { + "auxiliary_loss_clip": 0.01092274, + "auxiliary_loss_mlp": 0.01033734, + "balance_loss_clip": 1.03921247, + "balance_loss_mlp": 1.02187228, + "epoch": 0.57261385840974, + "flos": 21214911386880.0, + "grad_norm": 2.2130630300856207, + "language_loss": 0.70017171, + "learning_rate": 1.628864706900738e-06, + "loss": 0.72143173, + "num_input_tokens_seen": 205164295, + "step": 9524, + "time_per_iteration": 2.700518846511841 + }, + { + "auxiliary_loss_clip": 0.01107505, + "auxiliary_loss_mlp": 0.01031795, + "balance_loss_clip": 1.04225564, + "balance_loss_mlp": 1.01971316, + "epoch": 0.5726739816624079, + "flos": 33984529793280.0, + "grad_norm": 1.461112152817653, + "language_loss": 0.65126455, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.67265761, + "num_input_tokens_seen": 205185380, + "step": 9525, + "time_per_iteration": 2.7389535903930664 + }, + { + "auxiliary_loss_clip": 0.01091158, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.03928351, + "balance_loss_mlp": 1.023139, + "epoch": 0.5727341049150759, + "flos": 24275972419200.0, + "grad_norm": 1.930578654391071, + "language_loss": 0.72484279, + "learning_rate": 1.628099340440984e-06, + "loss": 0.7461046, + "num_input_tokens_seen": 205204895, + "step": 9526, + "time_per_iteration": 2.702472448348999 + }, + { + "auxiliary_loss_clip": 0.01103623, + "auxiliary_loss_mlp": 0.01038123, + "balance_loss_clip": 1.03998101, + "balance_loss_mlp": 1.02604759, + "epoch": 0.5727942281677438, + "flos": 28400564269440.0, + "grad_norm": 2.0565235980515206, + "language_loss": 0.8007257, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.8221432, + "num_input_tokens_seen": 205223440, + "step": 9527, + "time_per_iteration": 2.7238149642944336 + }, + { + "auxiliary_loss_clip": 0.01101882, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.03860235, + "balance_loss_mlp": 1.02360809, + "epoch": 0.5728543514204119, + "flos": 19536769929600.0, + "grad_norm": 1.770832454252008, + "language_loss": 0.72136271, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.74274695, + "num_input_tokens_seen": 205242800, + "step": 9528, + "time_per_iteration": 2.593954086303711 + }, + { + "auxiliary_loss_clip": 0.01117957, + "auxiliary_loss_mlp": 0.01036459, + "balance_loss_clip": 1.04303622, + "balance_loss_mlp": 1.02363753, + "epoch": 0.5729144746730798, + "flos": 21506757390720.0, + "grad_norm": 2.0200513223103846, + "language_loss": 0.86137569, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.88291985, + "num_input_tokens_seen": 205259465, + "step": 9529, + "time_per_iteration": 2.6399447917938232 + }, + { + "auxiliary_loss_clip": 0.01022279, + "auxiliary_loss_mlp": 0.00999796, + "balance_loss_clip": 1.01659954, + "balance_loss_mlp": 0.99862826, + "epoch": 0.5729745979257478, + "flos": 58681628242560.0, + "grad_norm": 0.7634342678167043, + "language_loss": 0.56170225, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.58192301, + "num_input_tokens_seen": 205314100, + "step": 9530, + "time_per_iteration": 3.081955671310425 + }, + { + "auxiliary_loss_clip": 0.01096881, + "auxiliary_loss_mlp": 0.01030649, + "balance_loss_clip": 1.04126835, + "balance_loss_mlp": 1.01860929, + "epoch": 0.5730347211784158, + "flos": 18552099421440.0, + "grad_norm": 1.8014631294656338, + "language_loss": 0.66785836, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.6891337, + "num_input_tokens_seen": 205333420, + "step": 9531, + "time_per_iteration": 2.650801658630371 + }, + { + "auxiliary_loss_clip": 0.01102348, + "auxiliary_loss_mlp": 0.01042246, + "balance_loss_clip": 1.03970659, + "balance_loss_mlp": 1.02834046, + "epoch": 0.5730948444310837, + "flos": 38031482396160.0, + "grad_norm": 2.1479743871986314, + "language_loss": 0.75923574, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.78068173, + "num_input_tokens_seen": 205350995, + "step": 9532, + "time_per_iteration": 2.7268972396850586 + }, + { + "auxiliary_loss_clip": 0.01117449, + "auxiliary_loss_mlp": 0.01031067, + "balance_loss_clip": 1.0426352, + "balance_loss_mlp": 1.01828206, + "epoch": 0.5731549676837517, + "flos": 25227066689280.0, + "grad_norm": 1.3324145118640112, + "language_loss": 0.78908527, + "learning_rate": 1.625421002822686e-06, + "loss": 0.81057048, + "num_input_tokens_seen": 205372675, + "step": 9533, + "time_per_iteration": 2.6636223793029785 + }, + { + "auxiliary_loss_clip": 0.01105019, + "auxiliary_loss_mlp": 0.01029773, + "balance_loss_clip": 1.04237115, + "balance_loss_mlp": 1.01806676, + "epoch": 0.5732150909364196, + "flos": 23368222886400.0, + "grad_norm": 1.7921135162563215, + "language_loss": 0.85584033, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.87718827, + "num_input_tokens_seen": 205392590, + "step": 9534, + "time_per_iteration": 2.6173202991485596 + }, + { + "auxiliary_loss_clip": 0.01098044, + "auxiliary_loss_mlp": 0.01038668, + "balance_loss_clip": 1.0421629, + "balance_loss_mlp": 1.02537584, + "epoch": 0.5732752141890877, + "flos": 23079357711360.0, + "grad_norm": 1.8285457434330181, + "language_loss": 0.7536543, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.77502143, + "num_input_tokens_seen": 205414885, + "step": 9535, + "time_per_iteration": 2.6797807216644287 + }, + { + "auxiliary_loss_clip": 0.0110163, + "auxiliary_loss_mlp": 0.01032829, + "balance_loss_clip": 1.04250264, + "balance_loss_mlp": 1.01984715, + "epoch": 0.5733353374417556, + "flos": 24352282863360.0, + "grad_norm": 1.4660219442049842, + "language_loss": 0.71041429, + "learning_rate": 1.624273356614346e-06, + "loss": 0.73175883, + "num_input_tokens_seen": 205434440, + "step": 9536, + "time_per_iteration": 2.6927666664123535 + }, + { + "auxiliary_loss_clip": 0.0107587, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.03728056, + "balance_loss_mlp": 1.02034533, + "epoch": 0.5733954606944236, + "flos": 27198849830400.0, + "grad_norm": 1.9779932456354445, + "language_loss": 0.69794559, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.71905118, + "num_input_tokens_seen": 205454225, + "step": 9537, + "time_per_iteration": 2.758420944213867 + }, + { + "auxiliary_loss_clip": 0.01119262, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.04359508, + "balance_loss_mlp": 1.02364206, + "epoch": 0.5734555839470915, + "flos": 28765129357440.0, + "grad_norm": 1.8277858348507134, + "language_loss": 0.62517941, + "learning_rate": 1.623508330355902e-06, + "loss": 0.64673591, + "num_input_tokens_seen": 205474750, + "step": 9538, + "time_per_iteration": 2.6978628635406494 + }, + { + "auxiliary_loss_clip": 0.01105121, + "auxiliary_loss_mlp": 0.0103457, + "balance_loss_clip": 1.04219174, + "balance_loss_mlp": 1.02135563, + "epoch": 0.5735157071997595, + "flos": 22966813422720.0, + "grad_norm": 1.6582870130678489, + "language_loss": 0.83564949, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.85704643, + "num_input_tokens_seen": 205495495, + "step": 9539, + "time_per_iteration": 2.7695393562316895 + }, + { + "auxiliary_loss_clip": 0.01086088, + "auxiliary_loss_mlp": 0.01038955, + "balance_loss_clip": 1.04798675, + "balance_loss_mlp": 1.02566326, + "epoch": 0.5735758304524274, + "flos": 18989455420800.0, + "grad_norm": 2.207302017109072, + "language_loss": 0.73048598, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.75173634, + "num_input_tokens_seen": 205510070, + "step": 9540, + "time_per_iteration": 2.760653018951416 + }, + { + "auxiliary_loss_clip": 0.01101303, + "auxiliary_loss_mlp": 0.00769854, + "balance_loss_clip": 1.03920221, + "balance_loss_mlp": 1.00004601, + "epoch": 0.5736359537050955, + "flos": 28397942576640.0, + "grad_norm": 2.4125489920069074, + "language_loss": 0.79765099, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.81636256, + "num_input_tokens_seen": 205530190, + "step": 9541, + "time_per_iteration": 2.789978504180908 + }, + { + "auxiliary_loss_clip": 0.01096764, + "auxiliary_loss_mlp": 0.01033683, + "balance_loss_clip": 1.040447, + "balance_loss_mlp": 1.02054572, + "epoch": 0.5736960769577634, + "flos": 15627210848640.0, + "grad_norm": 2.579963788523863, + "language_loss": 0.6497947, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.67109919, + "num_input_tokens_seen": 205547380, + "step": 9542, + "time_per_iteration": 2.684465169906616 + }, + { + "auxiliary_loss_clip": 0.01094703, + "auxiliary_loss_mlp": 0.01032355, + "balance_loss_clip": 1.03985989, + "balance_loss_mlp": 1.01992106, + "epoch": 0.5737562002104314, + "flos": 18003994813440.0, + "grad_norm": 2.1591412151518625, + "language_loss": 0.82844281, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.84971344, + "num_input_tokens_seen": 205566540, + "step": 9543, + "time_per_iteration": 2.724700450897217 + }, + { + "auxiliary_loss_clip": 0.01078135, + "auxiliary_loss_mlp": 0.01034179, + "balance_loss_clip": 1.03842759, + "balance_loss_mlp": 1.01990938, + "epoch": 0.5738163234630994, + "flos": 20698192287360.0, + "grad_norm": 2.0892075264702616, + "language_loss": 0.73500836, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.75613153, + "num_input_tokens_seen": 205584200, + "step": 9544, + "time_per_iteration": 2.7072341442108154 + }, + { + "auxiliary_loss_clip": 0.01063343, + "auxiliary_loss_mlp": 0.01034841, + "balance_loss_clip": 1.03527069, + "balance_loss_mlp": 1.02173972, + "epoch": 0.5738764467157673, + "flos": 23149311448320.0, + "grad_norm": 1.791719724630014, + "language_loss": 0.76021409, + "learning_rate": 1.620831188925733e-06, + "loss": 0.78119594, + "num_input_tokens_seen": 205604675, + "step": 9545, + "time_per_iteration": 4.402756690979004 + }, + { + "auxiliary_loss_clip": 0.0109842, + "auxiliary_loss_mlp": 0.0103679, + "balance_loss_clip": 1.04495752, + "balance_loss_mlp": 1.02345061, + "epoch": 0.5739365699684353, + "flos": 29492930730240.0, + "grad_norm": 1.94712066693327, + "language_loss": 0.56656086, + "learning_rate": 1.620448797546459e-06, + "loss": 0.58791304, + "num_input_tokens_seen": 205624680, + "step": 9546, + "time_per_iteration": 6.025787115097046 + }, + { + "auxiliary_loss_clip": 0.01091236, + "auxiliary_loss_mlp": 0.01033391, + "balance_loss_clip": 1.03923881, + "balance_loss_mlp": 1.02023625, + "epoch": 0.5739966932211032, + "flos": 14027247342720.0, + "grad_norm": 2.369322585416499, + "language_loss": 0.7595309, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.78077716, + "num_input_tokens_seen": 205641950, + "step": 9547, + "time_per_iteration": 2.71240496635437 + }, + { + "auxiliary_loss_clip": 0.01104111, + "auxiliary_loss_mlp": 0.01030548, + "balance_loss_clip": 1.03877449, + "balance_loss_mlp": 1.01709485, + "epoch": 0.5740568164737713, + "flos": 19062030850560.0, + "grad_norm": 5.307379698295213, + "language_loss": 0.74525601, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.76660264, + "num_input_tokens_seen": 205660130, + "step": 9548, + "time_per_iteration": 2.651829957962036 + }, + { + "auxiliary_loss_clip": 0.01085909, + "auxiliary_loss_mlp": 0.0103587, + "balance_loss_clip": 1.03760338, + "balance_loss_mlp": 1.02268577, + "epoch": 0.5741169397264392, + "flos": 22127832478080.0, + "grad_norm": 4.02154100378115, + "language_loss": 0.69476151, + "learning_rate": 1.619301709822355e-06, + "loss": 0.71597928, + "num_input_tokens_seen": 205678895, + "step": 9549, + "time_per_iteration": 2.7304623126983643 + }, + { + "auxiliary_loss_clip": 0.01068231, + "auxiliary_loss_mlp": 0.01031011, + "balance_loss_clip": 1.04319942, + "balance_loss_mlp": 1.01907182, + "epoch": 0.5741770629791072, + "flos": 24936836797440.0, + "grad_norm": 1.4366767261825364, + "language_loss": 0.79742229, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.81841469, + "num_input_tokens_seen": 205698450, + "step": 9550, + "time_per_iteration": 2.759152889251709 + }, + { + "auxiliary_loss_clip": 0.01091678, + "auxiliary_loss_mlp": 0.01036065, + "balance_loss_clip": 1.04081261, + "balance_loss_mlp": 1.0213902, + "epoch": 0.5742371862317751, + "flos": 18801462614400.0, + "grad_norm": 1.889418417446442, + "language_loss": 0.67791235, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.69918978, + "num_input_tokens_seen": 205714870, + "step": 9551, + "time_per_iteration": 4.226199150085449 + }, + { + "auxiliary_loss_clip": 0.01082087, + "auxiliary_loss_mlp": 0.0103572, + "balance_loss_clip": 1.04173434, + "balance_loss_mlp": 1.02287543, + "epoch": 0.5742973094844431, + "flos": 24460661174400.0, + "grad_norm": 2.3194402923297157, + "language_loss": 0.7223655, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.74354362, + "num_input_tokens_seen": 205736045, + "step": 9552, + "time_per_iteration": 2.736600160598755 + }, + { + "auxiliary_loss_clip": 0.01103832, + "auxiliary_loss_mlp": 0.01033342, + "balance_loss_clip": 1.04454732, + "balance_loss_mlp": 1.02040219, + "epoch": 0.574357432737111, + "flos": 21652770176640.0, + "grad_norm": 2.128940953023755, + "language_loss": 0.79823256, + "learning_rate": 1.617772461696843e-06, + "loss": 0.81960428, + "num_input_tokens_seen": 205754445, + "step": 9553, + "time_per_iteration": 2.6895127296447754 + }, + { + "auxiliary_loss_clip": 0.01111471, + "auxiliary_loss_mlp": 0.01032858, + "balance_loss_clip": 1.04313147, + "balance_loss_mlp": 1.02050185, + "epoch": 0.5744175559897791, + "flos": 16544728880640.0, + "grad_norm": 1.880148698667659, + "language_loss": 0.8353495, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.85679281, + "num_input_tokens_seen": 205770595, + "step": 9554, + "time_per_iteration": 2.615577220916748 + }, + { + "auxiliary_loss_clip": 0.01115074, + "auxiliary_loss_mlp": 0.0077091, + "balance_loss_clip": 1.04545319, + "balance_loss_mlp": 1.0001241, + "epoch": 0.574477679242447, + "flos": 24207598880640.0, + "grad_norm": 1.4793540146055872, + "language_loss": 0.71076667, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.72962654, + "num_input_tokens_seen": 205791935, + "step": 9555, + "time_per_iteration": 2.7411417961120605 + }, + { + "auxiliary_loss_clip": 0.01093974, + "auxiliary_loss_mlp": 0.0103121, + "balance_loss_clip": 1.04077876, + "balance_loss_mlp": 1.01763785, + "epoch": 0.574537802495115, + "flos": 14903000835840.0, + "grad_norm": 2.2805548015379755, + "language_loss": 0.72663784, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.7478897, + "num_input_tokens_seen": 205807260, + "step": 9556, + "time_per_iteration": 2.6720690727233887 + }, + { + "auxiliary_loss_clip": 0.01111378, + "auxiliary_loss_mlp": 0.01033136, + "balance_loss_clip": 1.04576373, + "balance_loss_mlp": 1.02015448, + "epoch": 0.5745979257477829, + "flos": 24934969290240.0, + "grad_norm": 1.744837604754053, + "language_loss": 0.74087226, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.76231742, + "num_input_tokens_seen": 205826885, + "step": 9557, + "time_per_iteration": 2.7899231910705566 + }, + { + "auxiliary_loss_clip": 0.01108542, + "auxiliary_loss_mlp": 0.01034037, + "balance_loss_clip": 1.04274464, + "balance_loss_mlp": 1.02157927, + "epoch": 0.5746580490004509, + "flos": 17235757704960.0, + "grad_norm": 1.5016834383596844, + "language_loss": 0.67902005, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.70044577, + "num_input_tokens_seen": 205844630, + "step": 9558, + "time_per_iteration": 2.762430429458618 + }, + { + "auxiliary_loss_clip": 0.01094279, + "auxiliary_loss_mlp": 0.01052047, + "balance_loss_clip": 1.04277229, + "balance_loss_mlp": 1.03471398, + "epoch": 0.5747181722531189, + "flos": 13187871348480.0, + "grad_norm": 2.4192829019987148, + "language_loss": 0.72013688, + "learning_rate": 1.615479024621659e-06, + "loss": 0.74160016, + "num_input_tokens_seen": 205860960, + "step": 9559, + "time_per_iteration": 2.757319688796997 + }, + { + "auxiliary_loss_clip": 0.01097547, + "auxiliary_loss_mlp": 0.00769026, + "balance_loss_clip": 1.04342794, + "balance_loss_mlp": 1.00012159, + "epoch": 0.5747782955057869, + "flos": 22963006581120.0, + "grad_norm": 1.6274858947785595, + "language_loss": 0.78883743, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.8075031, + "num_input_tokens_seen": 205880675, + "step": 9560, + "time_per_iteration": 2.746260166168213 + }, + { + "auxiliary_loss_clip": 0.01052934, + "auxiliary_loss_mlp": 0.01029841, + "balance_loss_clip": 1.03918111, + "balance_loss_mlp": 1.0164957, + "epoch": 0.5748384187584549, + "flos": 23403235668480.0, + "grad_norm": 2.1977539095196903, + "language_loss": 0.64321613, + "learning_rate": 1.614714662090588e-06, + "loss": 0.6640439, + "num_input_tokens_seen": 205900050, + "step": 9561, + "time_per_iteration": 2.8124732971191406 + }, + { + "auxiliary_loss_clip": 0.01116845, + "auxiliary_loss_mlp": 0.01039625, + "balance_loss_clip": 1.04539895, + "balance_loss_mlp": 1.02567124, + "epoch": 0.5748985420111228, + "flos": 17785514338560.0, + "grad_norm": 2.0210299953328414, + "language_loss": 0.7193495, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.74091417, + "num_input_tokens_seen": 205918855, + "step": 9562, + "time_per_iteration": 2.7868704795837402 + }, + { + "auxiliary_loss_clip": 0.01067199, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.03979492, + "balance_loss_mlp": 1.02039778, + "epoch": 0.5749586652637908, + "flos": 19866250408320.0, + "grad_norm": 1.4806264841650407, + "language_loss": 0.84100068, + "learning_rate": 1.613950357999751e-06, + "loss": 0.86199754, + "num_input_tokens_seen": 205936970, + "step": 9563, + "time_per_iteration": 2.7772703170776367 + }, + { + "auxiliary_loss_clip": 0.01073481, + "auxiliary_loss_mlp": 0.01039774, + "balance_loss_clip": 1.0434773, + "balance_loss_mlp": 1.02635074, + "epoch": 0.5750187885164587, + "flos": 21287235421440.0, + "grad_norm": 2.0689431633426802, + "language_loss": 0.5717746, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.59290713, + "num_input_tokens_seen": 205954630, + "step": 9564, + "time_per_iteration": 2.8411808013916016 + }, + { + "auxiliary_loss_clip": 0.01092301, + "auxiliary_loss_mlp": 0.01036175, + "balance_loss_clip": 1.04144359, + "balance_loss_mlp": 1.0226326, + "epoch": 0.5750789117691267, + "flos": 18804658924800.0, + "grad_norm": 1.7191674250507119, + "language_loss": 0.76114881, + "learning_rate": 1.613186112465078e-06, + "loss": 0.78243363, + "num_input_tokens_seen": 205971510, + "step": 9565, + "time_per_iteration": 2.822044610977173 + }, + { + "auxiliary_loss_clip": 0.01002918, + "auxiliary_loss_mlp": 0.01012299, + "balance_loss_clip": 1.01532471, + "balance_loss_mlp": 1.01098824, + "epoch": 0.5751390350217946, + "flos": 70663224124800.0, + "grad_norm": 0.74248986424084, + "language_loss": 0.60725588, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.62740809, + "num_input_tokens_seen": 206035125, + "step": 9566, + "time_per_iteration": 3.427154064178467 + }, + { + "auxiliary_loss_clip": 0.01093716, + "auxiliary_loss_mlp": 0.0103477, + "balance_loss_clip": 1.04347396, + "balance_loss_mlp": 1.02224672, + "epoch": 0.5751991582744627, + "flos": 14246338348800.0, + "grad_norm": 2.3384715191144214, + "language_loss": 0.75378191, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.77506685, + "num_input_tokens_seen": 206052075, + "step": 9567, + "time_per_iteration": 2.8895022869110107 + }, + { + "auxiliary_loss_clip": 0.0110852, + "auxiliary_loss_mlp": 0.0103381, + "balance_loss_clip": 1.04461062, + "balance_loss_mlp": 1.02136469, + "epoch": 0.5752592815271306, + "flos": 18328160079360.0, + "grad_norm": 1.398692478003959, + "language_loss": 0.74487442, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.7662977, + "num_input_tokens_seen": 206069970, + "step": 9568, + "time_per_iteration": 2.745008945465088 + }, + { + "auxiliary_loss_clip": 0.01122376, + "auxiliary_loss_mlp": 0.01031079, + "balance_loss_clip": 1.04557085, + "balance_loss_mlp": 1.01852036, + "epoch": 0.5753194047797986, + "flos": 20922742160640.0, + "grad_norm": 1.8288224744161317, + "language_loss": 0.71572077, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.73725533, + "num_input_tokens_seen": 206088950, + "step": 9569, + "time_per_iteration": 2.9613218307495117 + }, + { + "auxiliary_loss_clip": 0.01113684, + "auxiliary_loss_mlp": 0.01037553, + "balance_loss_clip": 1.04693925, + "balance_loss_mlp": 1.02399325, + "epoch": 0.5753795280324665, + "flos": 19281804215040.0, + "grad_norm": 2.1991270484780916, + "language_loss": 0.55975366, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.58126599, + "num_input_tokens_seen": 206107780, + "step": 9570, + "time_per_iteration": 2.6928811073303223 + }, + { + "auxiliary_loss_clip": 0.01118829, + "auxiliary_loss_mlp": 0.01034724, + "balance_loss_clip": 1.04458117, + "balance_loss_mlp": 1.02252328, + "epoch": 0.5754396512851345, + "flos": 21652877917440.0, + "grad_norm": 1.4030574698632734, + "language_loss": 0.64338309, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.66491854, + "num_input_tokens_seen": 206127445, + "step": 9571, + "time_per_iteration": 2.635603427886963 + }, + { + "auxiliary_loss_clip": 0.01111717, + "auxiliary_loss_mlp": 0.01031618, + "balance_loss_clip": 1.04484558, + "balance_loss_mlp": 1.01890385, + "epoch": 0.5754997745378025, + "flos": 51021700179840.0, + "grad_norm": 1.5230879857727748, + "language_loss": 0.67137802, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.69281137, + "num_input_tokens_seen": 206152005, + "step": 9572, + "time_per_iteration": 2.9080519676208496 + }, + { + "auxiliary_loss_clip": 0.01101219, + "auxiliary_loss_mlp": 0.01032315, + "balance_loss_clip": 1.04746473, + "balance_loss_mlp": 1.01870155, + "epoch": 0.5755598977904705, + "flos": 22856890826880.0, + "grad_norm": 1.7883651828614429, + "language_loss": 0.72390687, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.74524224, + "num_input_tokens_seen": 206169875, + "step": 9573, + "time_per_iteration": 2.815703868865967 + }, + { + "auxiliary_loss_clip": 0.01118198, + "auxiliary_loss_mlp": 0.01031966, + "balance_loss_clip": 1.04730046, + "balance_loss_mlp": 1.0202539, + "epoch": 0.5756200210431385, + "flos": 38472824805120.0, + "grad_norm": 1.8637575754568128, + "language_loss": 0.76394922, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.78545088, + "num_input_tokens_seen": 206192635, + "step": 9574, + "time_per_iteration": 2.778196096420288 + }, + { + "auxiliary_loss_clip": 0.01068081, + "auxiliary_loss_mlp": 0.01036908, + "balance_loss_clip": 1.03836775, + "balance_loss_mlp": 1.02201867, + "epoch": 0.5756801442958064, + "flos": 23910006700800.0, + "grad_norm": 2.572143968399992, + "language_loss": 0.66373074, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.68478066, + "num_input_tokens_seen": 206211485, + "step": 9575, + "time_per_iteration": 2.780195951461792 + }, + { + "auxiliary_loss_clip": 0.01097887, + "auxiliary_loss_mlp": 0.01031317, + "balance_loss_clip": 1.04497039, + "balance_loss_mlp": 1.01917517, + "epoch": 0.5757402675484744, + "flos": 21105276099840.0, + "grad_norm": 1.5189421087528554, + "language_loss": 0.79787755, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.81916952, + "num_input_tokens_seen": 206231740, + "step": 9576, + "time_per_iteration": 2.7809135913848877 + }, + { + "auxiliary_loss_clip": 0.01096091, + "auxiliary_loss_mlp": 0.0102674, + "balance_loss_clip": 1.0435828, + "balance_loss_mlp": 1.01512265, + "epoch": 0.5758003908011423, + "flos": 20559110826240.0, + "grad_norm": 1.7619408585744085, + "language_loss": 0.69726396, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.71849227, + "num_input_tokens_seen": 206250975, + "step": 9577, + "time_per_iteration": 2.732150077819824 + }, + { + "auxiliary_loss_clip": 0.01111358, + "auxiliary_loss_mlp": 0.01035186, + "balance_loss_clip": 1.04446626, + "balance_loss_mlp": 1.02306843, + "epoch": 0.5758605140538103, + "flos": 16473015377280.0, + "grad_norm": 2.216832845639703, + "language_loss": 0.66558278, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.6870482, + "num_input_tokens_seen": 206268800, + "step": 9578, + "time_per_iteration": 2.639571189880371 + }, + { + "auxiliary_loss_clip": 0.01091288, + "auxiliary_loss_mlp": 0.01032209, + "balance_loss_clip": 1.04414392, + "balance_loss_mlp": 1.01997256, + "epoch": 0.5759206373064782, + "flos": 21287558643840.0, + "grad_norm": 1.7735647320590846, + "language_loss": 0.72313404, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.74436903, + "num_input_tokens_seen": 206287190, + "step": 9579, + "time_per_iteration": 2.6910343170166016 + }, + { + "auxiliary_loss_clip": 0.0110168, + "auxiliary_loss_mlp": 0.01035785, + "balance_loss_clip": 1.04436874, + "balance_loss_mlp": 1.02170634, + "epoch": 0.5759807605591463, + "flos": 26067879227520.0, + "grad_norm": 4.803146579630836, + "language_loss": 0.65395081, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.67532551, + "num_input_tokens_seen": 206307020, + "step": 9580, + "time_per_iteration": 2.7227509021759033 + }, + { + "auxiliary_loss_clip": 0.01092842, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.04106581, + "balance_loss_mlp": 1.0212729, + "epoch": 0.5760408838118142, + "flos": 18873068376960.0, + "grad_norm": 1.9154940218320493, + "language_loss": 0.85214174, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.87341785, + "num_input_tokens_seen": 206324095, + "step": 9581, + "time_per_iteration": 2.699432849884033 + }, + { + "auxiliary_loss_clip": 0.0113104, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.05060983, + "balance_loss_mlp": 1.02360821, + "epoch": 0.5761010070644822, + "flos": 15378134964480.0, + "grad_norm": 2.109676550381332, + "language_loss": 0.67354548, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.69522369, + "num_input_tokens_seen": 206343210, + "step": 9582, + "time_per_iteration": 2.6381587982177734 + }, + { + "auxiliary_loss_clip": 0.01026383, + "auxiliary_loss_mlp": 0.01001724, + "balance_loss_clip": 1.01951599, + "balance_loss_mlp": 1.00040722, + "epoch": 0.5761611303171501, + "flos": 71471932882560.0, + "grad_norm": 0.6463341323488921, + "language_loss": 0.57134479, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.59162581, + "num_input_tokens_seen": 206415935, + "step": 9583, + "time_per_iteration": 3.52109694480896 + }, + { + "auxiliary_loss_clip": 0.01090801, + "auxiliary_loss_mlp": 0.01030991, + "balance_loss_clip": 1.04208195, + "balance_loss_mlp": 1.01818216, + "epoch": 0.5762212535698181, + "flos": 16246167033600.0, + "grad_norm": 1.791358766979404, + "language_loss": 0.82729411, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.84851205, + "num_input_tokens_seen": 206431900, + "step": 9584, + "time_per_iteration": 2.7258176803588867 + }, + { + "auxiliary_loss_clip": 0.0104221, + "auxiliary_loss_mlp": 0.01002028, + "balance_loss_clip": 1.0174526, + "balance_loss_mlp": 1.00080013, + "epoch": 0.5762813768224861, + "flos": 70185504216960.0, + "grad_norm": 0.623568426409687, + "language_loss": 0.49559212, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.51603448, + "num_input_tokens_seen": 206501200, + "step": 9585, + "time_per_iteration": 7.823396682739258 + }, + { + "auxiliary_loss_clip": 0.01092491, + "auxiliary_loss_mlp": 0.01027016, + "balance_loss_clip": 1.04217815, + "balance_loss_mlp": 1.01516044, + "epoch": 0.5763415000751541, + "flos": 20518028645760.0, + "grad_norm": 1.574762209284147, + "language_loss": 0.85150623, + "learning_rate": 1.605165098835465e-06, + "loss": 0.87270141, + "num_input_tokens_seen": 206520575, + "step": 9586, + "time_per_iteration": 2.6869027614593506 + }, + { + "auxiliary_loss_clip": 0.0110803, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.04531455, + "balance_loss_mlp": 1.02091956, + "epoch": 0.5764016233278221, + "flos": 15815526877440.0, + "grad_norm": 2.1680790738732796, + "language_loss": 0.80101568, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.8224445, + "num_input_tokens_seen": 206538060, + "step": 9587, + "time_per_iteration": 2.664121627807617 + }, + { + "auxiliary_loss_clip": 0.01091421, + "auxiliary_loss_mlp": 0.01037732, + "balance_loss_clip": 1.04280019, + "balance_loss_mlp": 1.02400517, + "epoch": 0.57646174658049, + "flos": 20772312001920.0, + "grad_norm": 1.6197519148440016, + "language_loss": 0.66023791, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.68152946, + "num_input_tokens_seen": 206557320, + "step": 9588, + "time_per_iteration": 2.6596546173095703 + }, + { + "auxiliary_loss_clip": 0.01095166, + "auxiliary_loss_mlp": 0.01039726, + "balance_loss_clip": 1.04326534, + "balance_loss_mlp": 1.02583766, + "epoch": 0.576521869833158, + "flos": 23549930812800.0, + "grad_norm": 2.4954533064787383, + "language_loss": 0.78688884, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.80823773, + "num_input_tokens_seen": 206575780, + "step": 9589, + "time_per_iteration": 2.799503803253174 + }, + { + "auxiliary_loss_clip": 0.01114482, + "auxiliary_loss_mlp": 0.01025254, + "balance_loss_clip": 1.041682, + "balance_loss_mlp": 1.01353538, + "epoch": 0.5765819930858259, + "flos": 20266582464000.0, + "grad_norm": 2.2193599120856304, + "language_loss": 0.79450285, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.81590021, + "num_input_tokens_seen": 206594100, + "step": 9590, + "time_per_iteration": 4.355879545211792 + }, + { + "auxiliary_loss_clip": 0.01052935, + "auxiliary_loss_mlp": 0.00769289, + "balance_loss_clip": 1.03650951, + "balance_loss_mlp": 1.00013447, + "epoch": 0.5766421163384939, + "flos": 23148772744320.0, + "grad_norm": 1.8083193654510727, + "language_loss": 0.63346255, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.65168482, + "num_input_tokens_seen": 206613325, + "step": 9591, + "time_per_iteration": 2.8449039459228516 + }, + { + "auxiliary_loss_clip": 0.01122211, + "auxiliary_loss_mlp": 0.00769941, + "balance_loss_clip": 1.04640627, + "balance_loss_mlp": 1.00013709, + "epoch": 0.5767022395911618, + "flos": 25848895962240.0, + "grad_norm": 2.331025746602298, + "language_loss": 0.78112143, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.80004299, + "num_input_tokens_seen": 206634265, + "step": 9592, + "time_per_iteration": 2.7304346561431885 + }, + { + "auxiliary_loss_clip": 0.01004052, + "auxiliary_loss_mlp": 0.01021446, + "balance_loss_clip": 1.02547979, + "balance_loss_mlp": 1.01965749, + "epoch": 0.5767623628438299, + "flos": 68293299657600.0, + "grad_norm": 0.7436002967471621, + "language_loss": 0.59609032, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.61634529, + "num_input_tokens_seen": 206696990, + "step": 9593, + "time_per_iteration": 3.461658477783203 + }, + { + "auxiliary_loss_clip": 0.01110844, + "auxiliary_loss_mlp": 0.01041399, + "balance_loss_clip": 1.042449, + "balance_loss_mlp": 1.02810097, + "epoch": 0.5768224860964978, + "flos": 30188448754560.0, + "grad_norm": 1.9449888897854992, + "language_loss": 0.71144432, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.73296678, + "num_input_tokens_seen": 206717815, + "step": 9594, + "time_per_iteration": 2.8465657234191895 + }, + { + "auxiliary_loss_clip": 0.01085879, + "auxiliary_loss_mlp": 0.01033309, + "balance_loss_clip": 1.04293251, + "balance_loss_mlp": 1.02196598, + "epoch": 0.5768826093491658, + "flos": 17895041884800.0, + "grad_norm": 2.485745999068748, + "language_loss": 0.70693135, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.72812331, + "num_input_tokens_seen": 206735985, + "step": 9595, + "time_per_iteration": 2.724013566970825 + }, + { + "auxiliary_loss_clip": 0.01120342, + "auxiliary_loss_mlp": 0.01030885, + "balance_loss_clip": 1.04522467, + "balance_loss_mlp": 1.01821947, + "epoch": 0.5769427326018337, + "flos": 17457183095040.0, + "grad_norm": 2.28937358102888, + "language_loss": 0.69969249, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.72120476, + "num_input_tokens_seen": 206753370, + "step": 9596, + "time_per_iteration": 2.602410316467285 + }, + { + "auxiliary_loss_clip": 0.01097835, + "auxiliary_loss_mlp": 0.01033862, + "balance_loss_clip": 1.04560232, + "balance_loss_mlp": 1.01944959, + "epoch": 0.5770028558545017, + "flos": 39421728345600.0, + "grad_norm": 1.7463690567151626, + "language_loss": 0.67612261, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.69743955, + "num_input_tokens_seen": 206777645, + "step": 9597, + "time_per_iteration": 2.9427249431610107 + }, + { + "auxiliary_loss_clip": 0.01096299, + "auxiliary_loss_mlp": 0.01033961, + "balance_loss_clip": 1.04274464, + "balance_loss_mlp": 1.02137804, + "epoch": 0.5770629791071697, + "flos": 21536383132800.0, + "grad_norm": 1.8692422611288704, + "language_loss": 0.81584179, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.83714437, + "num_input_tokens_seen": 206794865, + "step": 9598, + "time_per_iteration": 2.748018503189087 + }, + { + "auxiliary_loss_clip": 0.01073806, + "auxiliary_loss_mlp": 0.01042323, + "balance_loss_clip": 1.03563309, + "balance_loss_mlp": 1.0270164, + "epoch": 0.5771231023598377, + "flos": 20886795624960.0, + "grad_norm": 1.6175391320992503, + "language_loss": 0.7306143, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.7517755, + "num_input_tokens_seen": 206814095, + "step": 9599, + "time_per_iteration": 2.7712650299072266 + }, + { + "auxiliary_loss_clip": 0.01115679, + "auxiliary_loss_mlp": 0.01033218, + "balance_loss_clip": 1.04342914, + "balance_loss_mlp": 1.0211184, + "epoch": 0.5771832256125057, + "flos": 18077216688000.0, + "grad_norm": 3.919070780783451, + "language_loss": 0.78193593, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.80342484, + "num_input_tokens_seen": 206832245, + "step": 9600, + "time_per_iteration": 2.6597604751586914 + }, + { + "auxiliary_loss_clip": 0.01113425, + "auxiliary_loss_mlp": 0.0077084, + "balance_loss_clip": 1.04604816, + "balance_loss_mlp": 1.00016284, + "epoch": 0.5772433488651736, + "flos": 26359078786560.0, + "grad_norm": 1.665079650983798, + "language_loss": 0.72689855, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.74574125, + "num_input_tokens_seen": 206851535, + "step": 9601, + "time_per_iteration": 2.7263121604919434 + }, + { + "auxiliary_loss_clip": 0.01064473, + "auxiliary_loss_mlp": 0.01036032, + "balance_loss_clip": 1.04480124, + "balance_loss_mlp": 1.02311611, + "epoch": 0.5773034721178416, + "flos": 19680987035520.0, + "grad_norm": 2.0948856363437534, + "language_loss": 0.68606448, + "learning_rate": 1.599058274973348e-06, + "loss": 0.70706952, + "num_input_tokens_seen": 206870595, + "step": 9602, + "time_per_iteration": 2.8572375774383545 + }, + { + "auxiliary_loss_clip": 0.01088049, + "auxiliary_loss_mlp": 0.01035522, + "balance_loss_clip": 1.03997481, + "balance_loss_mlp": 1.02274275, + "epoch": 0.5773635953705095, + "flos": 25082885496960.0, + "grad_norm": 1.4139424352201144, + "language_loss": 0.73376763, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.75500333, + "num_input_tokens_seen": 206892320, + "step": 9603, + "time_per_iteration": 2.816098928451538 + }, + { + "auxiliary_loss_clip": 0.01108536, + "auxiliary_loss_mlp": 0.01029532, + "balance_loss_clip": 1.0450983, + "balance_loss_mlp": 1.01732492, + "epoch": 0.5774237186231775, + "flos": 21032987978880.0, + "grad_norm": 1.7349679186761677, + "language_loss": 0.76407522, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.78545588, + "num_input_tokens_seen": 206912485, + "step": 9604, + "time_per_iteration": 2.718163013458252 + }, + { + "auxiliary_loss_clip": 0.01086662, + "auxiliary_loss_mlp": 0.0103562, + "balance_loss_clip": 1.04304457, + "balance_loss_mlp": 1.02200651, + "epoch": 0.5774838418758454, + "flos": 15231727128960.0, + "grad_norm": 2.5247859182247026, + "language_loss": 0.83387136, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.85509419, + "num_input_tokens_seen": 206929100, + "step": 9605, + "time_per_iteration": 2.8076066970825195 + }, + { + "auxiliary_loss_clip": 0.01096142, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.04626584, + "balance_loss_mlp": 1.01667333, + "epoch": 0.5775439651285135, + "flos": 23582609210880.0, + "grad_norm": 1.8595500746131972, + "language_loss": 0.77926147, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.80054009, + "num_input_tokens_seen": 206947020, + "step": 9606, + "time_per_iteration": 2.7766621112823486 + }, + { + "auxiliary_loss_clip": 0.01117345, + "auxiliary_loss_mlp": 0.01035757, + "balance_loss_clip": 1.04331446, + "balance_loss_mlp": 1.02362752, + "epoch": 0.5776040883811814, + "flos": 18040515966720.0, + "grad_norm": 1.672602422897938, + "language_loss": 0.73896575, + "learning_rate": 1.597150687927619e-06, + "loss": 0.76049674, + "num_input_tokens_seen": 206964065, + "step": 9607, + "time_per_iteration": 2.6057968139648438 + }, + { + "auxiliary_loss_clip": 0.01076534, + "auxiliary_loss_mlp": 0.01034666, + "balance_loss_clip": 1.04220486, + "balance_loss_mlp": 1.02155876, + "epoch": 0.5776642116338494, + "flos": 18624638937600.0, + "grad_norm": 1.6326461875987317, + "language_loss": 0.69385672, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.71496868, + "num_input_tokens_seen": 206981940, + "step": 9608, + "time_per_iteration": 2.784708023071289 + }, + { + "auxiliary_loss_clip": 0.01084539, + "auxiliary_loss_mlp": 0.01033053, + "balance_loss_clip": 1.03977787, + "balance_loss_mlp": 1.01979709, + "epoch": 0.5777243348865173, + "flos": 28402539517440.0, + "grad_norm": 1.6850838728782904, + "language_loss": 0.76766187, + "learning_rate": 1.596387759940665e-06, + "loss": 0.78883779, + "num_input_tokens_seen": 207002365, + "step": 9609, + "time_per_iteration": 2.7439122200012207 + }, + { + "auxiliary_loss_clip": 0.01090565, + "auxiliary_loss_mlp": 0.01033653, + "balance_loss_clip": 1.04297495, + "balance_loss_mlp": 1.02154744, + "epoch": 0.5777844581391853, + "flos": 24024705805440.0, + "grad_norm": 1.7626877282975804, + "language_loss": 0.76948774, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.79072988, + "num_input_tokens_seen": 207021195, + "step": 9610, + "time_per_iteration": 2.748898506164551 + }, + { + "auxiliary_loss_clip": 0.0108266, + "auxiliary_loss_mlp": 0.01029905, + "balance_loss_clip": 1.03885353, + "balance_loss_mlp": 1.01625562, + "epoch": 0.5778445813918534, + "flos": 17777361951360.0, + "grad_norm": 2.997373910278609, + "language_loss": 0.68867594, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.70980155, + "num_input_tokens_seen": 207037465, + "step": 9611, + "time_per_iteration": 2.7037806510925293 + }, + { + "auxiliary_loss_clip": 0.01103482, + "auxiliary_loss_mlp": 0.01028915, + "balance_loss_clip": 1.03957248, + "balance_loss_mlp": 1.01607609, + "epoch": 0.5779047046445213, + "flos": 22233194046720.0, + "grad_norm": 1.7435127822918648, + "language_loss": 0.83207917, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.85340309, + "num_input_tokens_seen": 207054230, + "step": 9612, + "time_per_iteration": 2.6507790088653564 + }, + { + "auxiliary_loss_clip": 0.01119736, + "auxiliary_loss_mlp": 0.01030573, + "balance_loss_clip": 1.04522681, + "balance_loss_mlp": 1.01779914, + "epoch": 0.5779648278971893, + "flos": 21434361528960.0, + "grad_norm": 1.6430153650030166, + "language_loss": 0.79567391, + "learning_rate": 1.594862087742667e-06, + "loss": 0.81717706, + "num_input_tokens_seen": 207073150, + "step": 9613, + "time_per_iteration": 2.679202079772949 + }, + { + "auxiliary_loss_clip": 0.01107, + "auxiliary_loss_mlp": 0.01032074, + "balance_loss_clip": 1.04167032, + "balance_loss_mlp": 1.02013552, + "epoch": 0.5780249511498572, + "flos": 19026120228480.0, + "grad_norm": 1.7764623177151277, + "language_loss": 0.77572, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.7971108, + "num_input_tokens_seen": 207090375, + "step": 9614, + "time_per_iteration": 2.6978790760040283 + }, + { + "auxiliary_loss_clip": 0.01086413, + "auxiliary_loss_mlp": 0.01033429, + "balance_loss_clip": 1.04169321, + "balance_loss_mlp": 1.020715, + "epoch": 0.5780850744025252, + "flos": 12124663752960.0, + "grad_norm": 2.2008207091737093, + "language_loss": 0.81598818, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.83718669, + "num_input_tokens_seen": 207106030, + "step": 9615, + "time_per_iteration": 2.7248473167419434 + }, + { + "auxiliary_loss_clip": 0.01104516, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.03926682, + "balance_loss_mlp": 1.01978278, + "epoch": 0.5781451976551931, + "flos": 25044425009280.0, + "grad_norm": 1.4596798757523364, + "language_loss": 0.67086244, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.69223398, + "num_input_tokens_seen": 207125435, + "step": 9616, + "time_per_iteration": 2.7597362995147705 + }, + { + "auxiliary_loss_clip": 0.01106834, + "auxiliary_loss_mlp": 0.01032261, + "balance_loss_clip": 1.04345763, + "balance_loss_mlp": 1.01935697, + "epoch": 0.5782053209078611, + "flos": 19245606284160.0, + "grad_norm": 1.6175721800228267, + "language_loss": 0.77521074, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.79660165, + "num_input_tokens_seen": 207145095, + "step": 9617, + "time_per_iteration": 2.8377323150634766 + }, + { + "auxiliary_loss_clip": 0.01094943, + "auxiliary_loss_mlp": 0.01035216, + "balance_loss_clip": 1.04236281, + "balance_loss_mlp": 1.02231812, + "epoch": 0.578265444160529, + "flos": 25993831340160.0, + "grad_norm": 1.5155731004031996, + "language_loss": 0.75113726, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.77243888, + "num_input_tokens_seen": 207166045, + "step": 9618, + "time_per_iteration": 2.665472984313965 + }, + { + "auxiliary_loss_clip": 0.0111694, + "auxiliary_loss_mlp": 0.01028064, + "balance_loss_clip": 1.04336691, + "balance_loss_mlp": 1.01594067, + "epoch": 0.5783255674131971, + "flos": 21798603394560.0, + "grad_norm": 2.8083861615500445, + "language_loss": 0.81775922, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.83920932, + "num_input_tokens_seen": 207185290, + "step": 9619, + "time_per_iteration": 2.6156482696533203 + }, + { + "auxiliary_loss_clip": 0.01099184, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.04264188, + "balance_loss_mlp": 1.0207603, + "epoch": 0.578385690665865, + "flos": 24789746603520.0, + "grad_norm": 1.7083869874707343, + "language_loss": 0.72963226, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.75094968, + "num_input_tokens_seen": 207205505, + "step": 9620, + "time_per_iteration": 2.7066376209259033 + }, + { + "auxiliary_loss_clip": 0.01096891, + "auxiliary_loss_mlp": 0.01030675, + "balance_loss_clip": 1.04079533, + "balance_loss_mlp": 1.01831877, + "epoch": 0.578445813918533, + "flos": 21212864311680.0, + "grad_norm": 8.221069459540734, + "language_loss": 0.76836628, + "learning_rate": 1.591811481689916e-06, + "loss": 0.78964192, + "num_input_tokens_seen": 207225315, + "step": 9621, + "time_per_iteration": 2.746229887008667 + }, + { + "auxiliary_loss_clip": 0.01054178, + "auxiliary_loss_mlp": 0.0104303, + "balance_loss_clip": 1.03465438, + "balance_loss_mlp": 1.02871835, + "epoch": 0.5785059371712009, + "flos": 25046795306880.0, + "grad_norm": 1.8397649270084009, + "language_loss": 0.70646143, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.72743344, + "num_input_tokens_seen": 207247690, + "step": 9622, + "time_per_iteration": 2.7708969116210938 + }, + { + "auxiliary_loss_clip": 0.01024027, + "auxiliary_loss_mlp": 0.01003845, + "balance_loss_clip": 1.01965523, + "balance_loss_mlp": 1.00259304, + "epoch": 0.5785660604238689, + "flos": 70843172284800.0, + "grad_norm": 0.7693139889423115, + "language_loss": 0.55946988, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.57974857, + "num_input_tokens_seen": 207301735, + "step": 9623, + "time_per_iteration": 3.2743892669677734 + }, + { + "auxiliary_loss_clip": 0.01084844, + "auxiliary_loss_mlp": 0.01037987, + "balance_loss_clip": 1.04244125, + "balance_loss_mlp": 1.02392614, + "epoch": 0.578626183676537, + "flos": 31649977244160.0, + "grad_norm": 2.0494784145389677, + "language_loss": 0.71381462, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.73504293, + "num_input_tokens_seen": 207321240, + "step": 9624, + "time_per_iteration": 2.761348247528076 + }, + { + "auxiliary_loss_clip": 0.01084192, + "auxiliary_loss_mlp": 0.01039308, + "balance_loss_clip": 1.03928137, + "balance_loss_mlp": 1.02572453, + "epoch": 0.5786863069292049, + "flos": 21865181253120.0, + "grad_norm": 2.0143803075104687, + "language_loss": 0.82421607, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.845451, + "num_input_tokens_seen": 207339540, + "step": 9625, + "time_per_iteration": 4.566919326782227 + }, + { + "auxiliary_loss_clip": 0.01116336, + "auxiliary_loss_mlp": 0.01033709, + "balance_loss_clip": 1.042328, + "balance_loss_mlp": 1.02037549, + "epoch": 0.5787464301818729, + "flos": 23364954748800.0, + "grad_norm": 1.438878240234706, + "language_loss": 0.70356315, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.72506356, + "num_input_tokens_seen": 207360470, + "step": 9626, + "time_per_iteration": 2.6495361328125 + }, + { + "auxiliary_loss_clip": 0.01095761, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.0427779, + "balance_loss_mlp": 1.02442503, + "epoch": 0.5788065534345408, + "flos": 30004011394560.0, + "grad_norm": 1.470476031522724, + "language_loss": 0.72111934, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.74244475, + "num_input_tokens_seen": 207383080, + "step": 9627, + "time_per_iteration": 2.8884880542755127 + }, + { + "auxiliary_loss_clip": 0.01104923, + "auxiliary_loss_mlp": 0.0103066, + "balance_loss_clip": 1.04045546, + "balance_loss_mlp": 1.01872754, + "epoch": 0.5788666766872088, + "flos": 24527849564160.0, + "grad_norm": 5.936898137308074, + "language_loss": 0.83902895, + "learning_rate": 1.589143013764458e-06, + "loss": 0.8603847, + "num_input_tokens_seen": 207401000, + "step": 9628, + "time_per_iteration": 2.746950626373291 + }, + { + "auxiliary_loss_clip": 0.01093971, + "auxiliary_loss_mlp": 0.01031789, + "balance_loss_clip": 1.03782499, + "balance_loss_mlp": 1.01856256, + "epoch": 0.5789267999398767, + "flos": 23732823888000.0, + "grad_norm": 1.5735702827765405, + "language_loss": 0.72260225, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.74385989, + "num_input_tokens_seen": 207419230, + "step": 9629, + "time_per_iteration": 4.194722652435303 + }, + { + "auxiliary_loss_clip": 0.01096902, + "auxiliary_loss_mlp": 0.0103477, + "balance_loss_clip": 1.043715, + "balance_loss_mlp": 1.02154994, + "epoch": 0.5789869231925447, + "flos": 21135045496320.0, + "grad_norm": 2.2622526010485062, + "language_loss": 0.74250948, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.76382619, + "num_input_tokens_seen": 207437615, + "step": 9630, + "time_per_iteration": 2.754213571548462 + }, + { + "auxiliary_loss_clip": 0.01083141, + "auxiliary_loss_mlp": 0.00770695, + "balance_loss_clip": 1.0400362, + "balance_loss_mlp": 1.00009274, + "epoch": 0.5790470464452127, + "flos": 21209632087680.0, + "grad_norm": 1.6843723839781237, + "language_loss": 0.78927267, + "learning_rate": 1.587999618060523e-06, + "loss": 0.8078109, + "num_input_tokens_seen": 207457270, + "step": 9631, + "time_per_iteration": 2.757955551147461 + }, + { + "auxiliary_loss_clip": 0.01116603, + "auxiliary_loss_mlp": 0.01029207, + "balance_loss_clip": 1.04169166, + "balance_loss_mlp": 1.01674962, + "epoch": 0.5791071696978807, + "flos": 23404384903680.0, + "grad_norm": 1.5220400196762927, + "language_loss": 0.75543463, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.77689266, + "num_input_tokens_seen": 207477890, + "step": 9632, + "time_per_iteration": 2.5955679416656494 + }, + { + "auxiliary_loss_clip": 0.01090291, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.04132521, + "balance_loss_mlp": 1.01704419, + "epoch": 0.5791672929505486, + "flos": 24206521472640.0, + "grad_norm": 2.166079097569446, + "language_loss": 0.79483461, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.81604362, + "num_input_tokens_seen": 207497670, + "step": 9633, + "time_per_iteration": 3.0309832096099854 + }, + { + "auxiliary_loss_clip": 0.01090489, + "auxiliary_loss_mlp": 0.0104029, + "balance_loss_clip": 1.04247785, + "balance_loss_mlp": 1.02621174, + "epoch": 0.5792274162032166, + "flos": 24348871071360.0, + "grad_norm": 1.6628345099755575, + "language_loss": 0.77489352, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.79620135, + "num_input_tokens_seen": 207516105, + "step": 9634, + "time_per_iteration": 2.742804765701294 + }, + { + "auxiliary_loss_clip": 0.01103303, + "auxiliary_loss_mlp": 0.01039877, + "balance_loss_clip": 1.04325557, + "balance_loss_mlp": 1.0265131, + "epoch": 0.5792875394558845, + "flos": 20449403712000.0, + "grad_norm": 2.0206641079359695, + "language_loss": 0.63376474, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.65519655, + "num_input_tokens_seen": 207533685, + "step": 9635, + "time_per_iteration": 2.758554220199585 + }, + { + "auxiliary_loss_clip": 0.01090702, + "auxiliary_loss_mlp": 0.01040857, + "balance_loss_clip": 1.0402782, + "balance_loss_mlp": 1.02797055, + "epoch": 0.5793476627085525, + "flos": 24060329118720.0, + "grad_norm": 1.4022803042470642, + "language_loss": 0.77229643, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.793612, + "num_input_tokens_seen": 207552840, + "step": 9636, + "time_per_iteration": 2.6893904209136963 + }, + { + "auxiliary_loss_clip": 0.01087778, + "auxiliary_loss_mlp": 0.01033423, + "balance_loss_clip": 1.03770018, + "balance_loss_mlp": 1.02124608, + "epoch": 0.5794077859612206, + "flos": 22054287381120.0, + "grad_norm": 1.6516741793622702, + "language_loss": 0.68164212, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.70285416, + "num_input_tokens_seen": 207572095, + "step": 9637, + "time_per_iteration": 2.7232043743133545 + }, + { + "auxiliary_loss_clip": 0.01076767, + "auxiliary_loss_mlp": 0.01035713, + "balance_loss_clip": 1.04049063, + "balance_loss_mlp": 1.02214098, + "epoch": 0.5794679092138885, + "flos": 11434855991040.0, + "grad_norm": 2.739438707467598, + "language_loss": 0.72531378, + "learning_rate": 1.585332242234043e-06, + "loss": 0.74643862, + "num_input_tokens_seen": 207587495, + "step": 9638, + "time_per_iteration": 2.819202423095703 + }, + { + "auxiliary_loss_clip": 0.01107966, + "auxiliary_loss_mlp": 0.0103288, + "balance_loss_clip": 1.04470587, + "balance_loss_mlp": 1.02056587, + "epoch": 0.5795280324665565, + "flos": 18880215183360.0, + "grad_norm": 1.716063507275685, + "language_loss": 0.72309893, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.74450737, + "num_input_tokens_seen": 207606795, + "step": 9639, + "time_per_iteration": 2.683488130569458 + }, + { + "auxiliary_loss_clip": 0.01094721, + "auxiliary_loss_mlp": 0.01039725, + "balance_loss_clip": 1.0399698, + "balance_loss_mlp": 1.02682686, + "epoch": 0.5795881557192244, + "flos": 13005947940480.0, + "grad_norm": 1.8567608995858262, + "language_loss": 0.70044529, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.72178972, + "num_input_tokens_seen": 207623620, + "step": 9640, + "time_per_iteration": 2.672945737838745 + }, + { + "auxiliary_loss_clip": 0.01096614, + "auxiliary_loss_mlp": 0.01042841, + "balance_loss_clip": 1.0413754, + "balance_loss_mlp": 1.02858996, + "epoch": 0.5796482789718924, + "flos": 19932397303680.0, + "grad_norm": 2.4123450370287958, + "language_loss": 0.7753675, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.79676205, + "num_input_tokens_seen": 207639380, + "step": 9641, + "time_per_iteration": 2.688164472579956 + }, + { + "auxiliary_loss_clip": 0.01119399, + "auxiliary_loss_mlp": 0.01036698, + "balance_loss_clip": 1.04407382, + "balance_loss_mlp": 1.02385926, + "epoch": 0.5797084022245603, + "flos": 21650794928640.0, + "grad_norm": 1.8311937480298248, + "language_loss": 0.73798597, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.75954694, + "num_input_tokens_seen": 207657915, + "step": 9642, + "time_per_iteration": 2.624521017074585 + }, + { + "auxiliary_loss_clip": 0.01102536, + "auxiliary_loss_mlp": 0.01038535, + "balance_loss_clip": 1.04526544, + "balance_loss_mlp": 1.02577972, + "epoch": 0.5797685254772283, + "flos": 26031573555840.0, + "grad_norm": 5.942363913556237, + "language_loss": 0.73259425, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.75400496, + "num_input_tokens_seen": 207678620, + "step": 9643, + "time_per_iteration": 2.715672254562378 + }, + { + "auxiliary_loss_clip": 0.01121691, + "auxiliary_loss_mlp": 0.01033759, + "balance_loss_clip": 1.04416251, + "balance_loss_mlp": 1.02062845, + "epoch": 0.5798286487298963, + "flos": 22705167778560.0, + "grad_norm": 1.8659489070776951, + "language_loss": 0.67181957, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.69337404, + "num_input_tokens_seen": 207696980, + "step": 9644, + "time_per_iteration": 2.6038551330566406 + }, + { + "auxiliary_loss_clip": 0.01116177, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.04553771, + "balance_loss_mlp": 1.02103531, + "epoch": 0.5798887719825643, + "flos": 23148988225920.0, + "grad_norm": 2.1679759651263044, + "language_loss": 0.85346615, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.8749733, + "num_input_tokens_seen": 207714065, + "step": 9645, + "time_per_iteration": 2.667259931564331 + }, + { + "auxiliary_loss_clip": 0.01122251, + "auxiliary_loss_mlp": 0.01030168, + "balance_loss_clip": 1.04620934, + "balance_loss_mlp": 1.01735902, + "epoch": 0.5799488952352322, + "flos": 24426043441920.0, + "grad_norm": 2.1123906469300935, + "language_loss": 0.75605559, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.77757978, + "num_input_tokens_seen": 207734720, + "step": 9646, + "time_per_iteration": 2.559659481048584 + }, + { + "auxiliary_loss_clip": 0.01099999, + "auxiliary_loss_mlp": 0.01037708, + "balance_loss_clip": 1.04342473, + "balance_loss_mlp": 1.02371335, + "epoch": 0.5800090184879002, + "flos": 38395903829760.0, + "grad_norm": 1.698650252646941, + "language_loss": 0.59495735, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.61633444, + "num_input_tokens_seen": 207755435, + "step": 9647, + "time_per_iteration": 2.7939651012420654 + }, + { + "auxiliary_loss_clip": 0.0107788, + "auxiliary_loss_mlp": 0.0105249, + "balance_loss_clip": 1.04142165, + "balance_loss_mlp": 1.03642702, + "epoch": 0.5800691417405681, + "flos": 19784840232960.0, + "grad_norm": 1.6988187353884752, + "language_loss": 0.84499681, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.86630046, + "num_input_tokens_seen": 207773570, + "step": 9648, + "time_per_iteration": 2.7750449180603027 + }, + { + "auxiliary_loss_clip": 0.01032269, + "auxiliary_loss_mlp": 0.01003411, + "balance_loss_clip": 1.01776171, + "balance_loss_mlp": 1.0021714, + "epoch": 0.5801292649932361, + "flos": 70314565783680.0, + "grad_norm": 0.8432525659417933, + "language_loss": 0.62929457, + "learning_rate": 1.581142210256242e-06, + "loss": 0.64965135, + "num_input_tokens_seen": 207830095, + "step": 9649, + "time_per_iteration": 3.21219801902771 + }, + { + "auxiliary_loss_clip": 0.01078275, + "auxiliary_loss_mlp": 0.0103905, + "balance_loss_clip": 1.03673697, + "balance_loss_mlp": 1.02525127, + "epoch": 0.5801893882459042, + "flos": 18734812928640.0, + "grad_norm": 1.587591091557097, + "language_loss": 0.82462633, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.84579957, + "num_input_tokens_seen": 207848555, + "step": 9650, + "time_per_iteration": 2.8374016284942627 + }, + { + "auxiliary_loss_clip": 0.0108491, + "auxiliary_loss_mlp": 0.01036427, + "balance_loss_clip": 1.03912425, + "balance_loss_mlp": 1.0230515, + "epoch": 0.5802495114985721, + "flos": 15596507698560.0, + "grad_norm": 3.679017793776146, + "language_loss": 0.7786057, + "learning_rate": 1.580380592177698e-06, + "loss": 0.79981905, + "num_input_tokens_seen": 207867060, + "step": 9651, + "time_per_iteration": 2.728508949279785 + }, + { + "auxiliary_loss_clip": 0.01103104, + "auxiliary_loss_mlp": 0.01039294, + "balance_loss_clip": 1.04429924, + "balance_loss_mlp": 1.02555537, + "epoch": 0.5803096347512401, + "flos": 18255405081600.0, + "grad_norm": 1.8929228958840072, + "language_loss": 0.74471784, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.76614177, + "num_input_tokens_seen": 207884520, + "step": 9652, + "time_per_iteration": 2.6977131366729736 + }, + { + "auxiliary_loss_clip": 0.01092621, + "auxiliary_loss_mlp": 0.01028533, + "balance_loss_clip": 1.04145324, + "balance_loss_mlp": 1.0150857, + "epoch": 0.580369758003908, + "flos": 22893160584960.0, + "grad_norm": 2.031010770866024, + "language_loss": 0.7703613, + "learning_rate": 1.579619037747193e-06, + "loss": 0.79157287, + "num_input_tokens_seen": 207905370, + "step": 9653, + "time_per_iteration": 2.7233431339263916 + }, + { + "auxiliary_loss_clip": 0.01121993, + "auxiliary_loss_mlp": 0.01034522, + "balance_loss_clip": 1.04465187, + "balance_loss_mlp": 1.02035964, + "epoch": 0.580429881256576, + "flos": 18697681244160.0, + "grad_norm": 1.9204408515131524, + "language_loss": 0.74248046, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.76404566, + "num_input_tokens_seen": 207923790, + "step": 9654, + "time_per_iteration": 2.595330238342285 + }, + { + "auxiliary_loss_clip": 0.01054131, + "auxiliary_loss_mlp": 0.01037747, + "balance_loss_clip": 1.04102838, + "balance_loss_mlp": 1.02466965, + "epoch": 0.5804900045092439, + "flos": 24681978823680.0, + "grad_norm": 1.627345886244452, + "language_loss": 0.70138443, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.72230321, + "num_input_tokens_seen": 207942335, + "step": 9655, + "time_per_iteration": 2.8097565174102783 + }, + { + "auxiliary_loss_clip": 0.01125048, + "auxiliary_loss_mlp": 0.01038459, + "balance_loss_clip": 1.04366922, + "balance_loss_mlp": 1.02495217, + "epoch": 0.580550127761912, + "flos": 23112790295040.0, + "grad_norm": 1.8908787804935243, + "language_loss": 0.69673449, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.71836954, + "num_input_tokens_seen": 207961975, + "step": 9656, + "time_per_iteration": 2.6233110427856445 + }, + { + "auxiliary_loss_clip": 0.01107455, + "auxiliary_loss_mlp": 0.01034723, + "balance_loss_clip": 1.04619503, + "balance_loss_mlp": 1.02208686, + "epoch": 0.5806102510145799, + "flos": 18475681236480.0, + "grad_norm": 1.5577317145380594, + "language_loss": 0.71972537, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.7411471, + "num_input_tokens_seen": 207979520, + "step": 9657, + "time_per_iteration": 2.616337537765503 + }, + { + "auxiliary_loss_clip": 0.01111294, + "auxiliary_loss_mlp": 0.01037621, + "balance_loss_clip": 1.04370785, + "balance_loss_mlp": 1.0237869, + "epoch": 0.5806703742672479, + "flos": 23915645136000.0, + "grad_norm": 1.9819747060784367, + "language_loss": 0.70975304, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.73124212, + "num_input_tokens_seen": 207998375, + "step": 9658, + "time_per_iteration": 2.6383109092712402 + }, + { + "auxiliary_loss_clip": 0.01031383, + "auxiliary_loss_mlp": 0.01001283, + "balance_loss_clip": 1.01641989, + "balance_loss_mlp": 1.00009727, + "epoch": 0.5807304975199158, + "flos": 66311999412480.0, + "grad_norm": 0.7167527277810166, + "language_loss": 0.5357672, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.55609381, + "num_input_tokens_seen": 208060605, + "step": 9659, + "time_per_iteration": 3.1848106384277344 + }, + { + "auxiliary_loss_clip": 0.0111162, + "auxiliary_loss_mlp": 0.01040087, + "balance_loss_clip": 1.04272866, + "balance_loss_mlp": 1.02638984, + "epoch": 0.5807906207725838, + "flos": 31722444933120.0, + "grad_norm": 1.8377682291636406, + "language_loss": 0.61835778, + "learning_rate": 1.576954100136366e-06, + "loss": 0.63987488, + "num_input_tokens_seen": 208080320, + "step": 9660, + "time_per_iteration": 2.7875893115997314 + }, + { + "auxiliary_loss_clip": 0.01108259, + "auxiliary_loss_mlp": 0.01035512, + "balance_loss_clip": 1.03933334, + "balance_loss_mlp": 1.02131391, + "epoch": 0.5808507440252517, + "flos": 23801161512960.0, + "grad_norm": 1.4582842247400174, + "language_loss": 0.65268373, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.6741215, + "num_input_tokens_seen": 208099305, + "step": 9661, + "time_per_iteration": 2.640033721923828 + }, + { + "auxiliary_loss_clip": 0.01060469, + "auxiliary_loss_mlp": 0.01027812, + "balance_loss_clip": 1.03416336, + "balance_loss_mlp": 1.01562285, + "epoch": 0.5809108672779197, + "flos": 13698449222400.0, + "grad_norm": 13.818010552074016, + "language_loss": 0.74664855, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.76753139, + "num_input_tokens_seen": 208116960, + "step": 9662, + "time_per_iteration": 2.78912091255188 + }, + { + "auxiliary_loss_clip": 0.0103935, + "auxiliary_loss_mlp": 0.01000149, + "balance_loss_clip": 1.01472378, + "balance_loss_mlp": 0.99883789, + "epoch": 0.5809709905305876, + "flos": 69134866381440.0, + "grad_norm": 0.8720581464390529, + "language_loss": 0.58341724, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.60381216, + "num_input_tokens_seen": 208182190, + "step": 9663, + "time_per_iteration": 3.2206766605377197 + }, + { + "auxiliary_loss_clip": 0.01099545, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.04324317, + "balance_loss_mlp": 1.02127123, + "epoch": 0.5810311137832557, + "flos": 19827538525440.0, + "grad_norm": 2.2012699158511073, + "language_loss": 0.82044816, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.84178805, + "num_input_tokens_seen": 208197015, + "step": 9664, + "time_per_iteration": 5.9192726612091064 + }, + { + "auxiliary_loss_clip": 0.01089768, + "auxiliary_loss_mlp": 0.00771212, + "balance_loss_clip": 1.03780138, + "balance_loss_mlp": 1.0000962, + "epoch": 0.5810912370359237, + "flos": 29238503719680.0, + "grad_norm": 4.331316838714664, + "language_loss": 0.81583905, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.83444887, + "num_input_tokens_seen": 208215795, + "step": 9665, + "time_per_iteration": 2.7813103199005127 + }, + { + "auxiliary_loss_clip": 0.01104588, + "auxiliary_loss_mlp": 0.01035909, + "balance_loss_clip": 1.0461179, + "balance_loss_mlp": 1.02123475, + "epoch": 0.5811513602885916, + "flos": 22785572373120.0, + "grad_norm": 1.7229241789226792, + "language_loss": 0.81392443, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.83532941, + "num_input_tokens_seen": 208234655, + "step": 9666, + "time_per_iteration": 2.7249464988708496 + }, + { + "auxiliary_loss_clip": 0.01101961, + "auxiliary_loss_mlp": 0.01035898, + "balance_loss_clip": 1.04181623, + "balance_loss_mlp": 1.02339292, + "epoch": 0.5812114835412596, + "flos": 18734346051840.0, + "grad_norm": 1.7975787773576042, + "language_loss": 0.80100554, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.82238424, + "num_input_tokens_seen": 208251300, + "step": 9667, + "time_per_iteration": 2.600576639175415 + }, + { + "auxiliary_loss_clip": 0.01117108, + "auxiliary_loss_mlp": 0.01037273, + "balance_loss_clip": 1.04451418, + "balance_loss_mlp": 1.02237177, + "epoch": 0.5812716067939275, + "flos": 26431295080320.0, + "grad_norm": 1.4400303722288619, + "language_loss": 0.78809667, + "learning_rate": 1.573909419957653e-06, + "loss": 0.80964047, + "num_input_tokens_seen": 208272685, + "step": 9668, + "time_per_iteration": 4.22690486907959 + }, + { + "auxiliary_loss_clip": 0.01098312, + "auxiliary_loss_mlp": 0.01033665, + "balance_loss_clip": 1.04209864, + "balance_loss_mlp": 1.02148795, + "epoch": 0.5813317300465956, + "flos": 43397865285120.0, + "grad_norm": 1.8465293320084986, + "language_loss": 0.64245093, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.66377068, + "num_input_tokens_seen": 208294315, + "step": 9669, + "time_per_iteration": 2.8652687072753906 + }, + { + "auxiliary_loss_clip": 0.01069091, + "auxiliary_loss_mlp": 0.01041038, + "balance_loss_clip": 1.03997946, + "balance_loss_mlp": 1.02672672, + "epoch": 0.5813918532992635, + "flos": 24785472885120.0, + "grad_norm": 1.4411692985545548, + "language_loss": 0.7307651, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.75186646, + "num_input_tokens_seen": 208315610, + "step": 9670, + "time_per_iteration": 2.829456329345703 + }, + { + "auxiliary_loss_clip": 0.01086705, + "auxiliary_loss_mlp": 0.01034661, + "balance_loss_clip": 1.03999424, + "balance_loss_mlp": 1.02194691, + "epoch": 0.5814519765519315, + "flos": 22857357703680.0, + "grad_norm": 2.0479138475359844, + "language_loss": 0.7874738, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.80868745, + "num_input_tokens_seen": 208334725, + "step": 9671, + "time_per_iteration": 2.7991318702697754 + }, + { + "auxiliary_loss_clip": 0.0107985, + "auxiliary_loss_mlp": 0.01044541, + "balance_loss_clip": 1.0416975, + "balance_loss_mlp": 1.02910936, + "epoch": 0.5815120998045994, + "flos": 24060831909120.0, + "grad_norm": 1.9838213735263186, + "language_loss": 0.61369407, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.634938, + "num_input_tokens_seen": 208353825, + "step": 9672, + "time_per_iteration": 2.8498592376708984 + }, + { + "auxiliary_loss_clip": 0.01065855, + "auxiliary_loss_mlp": 0.01038617, + "balance_loss_clip": 1.04000103, + "balance_loss_mlp": 1.02496195, + "epoch": 0.5815722230572674, + "flos": 24279491952000.0, + "grad_norm": 2.0691966635939365, + "language_loss": 0.81397313, + "learning_rate": 1.572007019492342e-06, + "loss": 0.83501786, + "num_input_tokens_seen": 208374160, + "step": 9673, + "time_per_iteration": 2.8208439350128174 + }, + { + "auxiliary_loss_clip": 0.0108779, + "auxiliary_loss_mlp": 0.01038429, + "balance_loss_clip": 1.04342866, + "balance_loss_mlp": 1.0242784, + "epoch": 0.5816323463099353, + "flos": 22200371994240.0, + "grad_norm": 1.86389400550988, + "language_loss": 0.88404083, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.905303, + "num_input_tokens_seen": 208392105, + "step": 9674, + "time_per_iteration": 2.7522170543670654 + }, + { + "auxiliary_loss_clip": 0.01120808, + "auxiliary_loss_mlp": 0.00770234, + "balance_loss_clip": 1.04347241, + "balance_loss_mlp": 1.00026846, + "epoch": 0.5816924695626033, + "flos": 24134448833280.0, + "grad_norm": 1.4106486697266074, + "language_loss": 0.78974068, + "learning_rate": 1.571246172811984e-06, + "loss": 0.80865109, + "num_input_tokens_seen": 208411755, + "step": 9675, + "time_per_iteration": 2.6588079929351807 + }, + { + "auxiliary_loss_clip": 0.01106314, + "auxiliary_loss_mlp": 0.01035578, + "balance_loss_clip": 1.04066849, + "balance_loss_mlp": 1.02178526, + "epoch": 0.5817525928152713, + "flos": 21324223451520.0, + "grad_norm": 1.863415006013356, + "language_loss": 0.70507479, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.72649372, + "num_input_tokens_seen": 208429995, + "step": 9676, + "time_per_iteration": 2.64201283454895 + }, + { + "auxiliary_loss_clip": 0.01058756, + "auxiliary_loss_mlp": 0.01033649, + "balance_loss_clip": 1.0396111, + "balance_loss_mlp": 1.02030993, + "epoch": 0.5818127160679393, + "flos": 26934510666240.0, + "grad_norm": 2.6670948708651636, + "language_loss": 0.63821483, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.65913892, + "num_input_tokens_seen": 208443655, + "step": 9677, + "time_per_iteration": 2.818047523498535 + }, + { + "auxiliary_loss_clip": 0.01020823, + "auxiliary_loss_mlp": 0.01010612, + "balance_loss_clip": 1.02114296, + "balance_loss_mlp": 1.00937831, + "epoch": 0.5818728393206073, + "flos": 63918626342400.0, + "grad_norm": 0.8047469836092298, + "language_loss": 0.54188442, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.56219876, + "num_input_tokens_seen": 208498405, + "step": 9678, + "time_per_iteration": 3.2669215202331543 + }, + { + "auxiliary_loss_clip": 0.01019281, + "auxiliary_loss_mlp": 0.0100911, + "balance_loss_clip": 1.01330447, + "balance_loss_mlp": 1.00782299, + "epoch": 0.5819329625732752, + "flos": 64954108638720.0, + "grad_norm": 0.7377482843760589, + "language_loss": 0.56218177, + "learning_rate": 1.569724674667319e-06, + "loss": 0.58246571, + "num_input_tokens_seen": 208559075, + "step": 9679, + "time_per_iteration": 3.130009174346924 + }, + { + "auxiliary_loss_clip": 0.01118656, + "auxiliary_loss_mlp": 0.01031808, + "balance_loss_clip": 1.04236495, + "balance_loss_mlp": 1.01982164, + "epoch": 0.5819930858259432, + "flos": 21215270522880.0, + "grad_norm": 1.65967573029577, + "language_loss": 0.65638047, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.67788512, + "num_input_tokens_seen": 208577770, + "step": 9680, + "time_per_iteration": 2.63765811920166 + }, + { + "auxiliary_loss_clip": 0.01095966, + "auxiliary_loss_mlp": 0.01030097, + "balance_loss_clip": 1.04104781, + "balance_loss_mlp": 1.01797342, + "epoch": 0.5820532090786111, + "flos": 19458520151040.0, + "grad_norm": 1.9145859585775957, + "language_loss": 0.83394265, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.85520327, + "num_input_tokens_seen": 208595110, + "step": 9681, + "time_per_iteration": 2.6886913776397705 + }, + { + "auxiliary_loss_clip": 0.0112012, + "auxiliary_loss_mlp": 0.01033373, + "balance_loss_clip": 1.04263687, + "balance_loss_mlp": 1.02064157, + "epoch": 0.5821133323312792, + "flos": 17712615686400.0, + "grad_norm": 1.6180763493056738, + "language_loss": 0.76095504, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.78248996, + "num_input_tokens_seen": 208612080, + "step": 9682, + "time_per_iteration": 2.616946220397949 + }, + { + "auxiliary_loss_clip": 0.01054825, + "auxiliary_loss_mlp": 0.01035748, + "balance_loss_clip": 1.03545356, + "balance_loss_mlp": 1.0205251, + "epoch": 0.5821734555839471, + "flos": 24571804832640.0, + "grad_norm": 1.897202579717977, + "language_loss": 0.7534517, + "learning_rate": 1.568203437579977e-06, + "loss": 0.77435744, + "num_input_tokens_seen": 208630235, + "step": 9683, + "time_per_iteration": 2.7519571781158447 + }, + { + "auxiliary_loss_clip": 0.01098515, + "auxiliary_loss_mlp": 0.01032751, + "balance_loss_clip": 1.04482961, + "balance_loss_mlp": 1.0191133, + "epoch": 0.5822335788366151, + "flos": 22382259488640.0, + "grad_norm": 1.7304603651050097, + "language_loss": 0.73967683, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.76098949, + "num_input_tokens_seen": 208647925, + "step": 9684, + "time_per_iteration": 2.585839033126831 + }, + { + "auxiliary_loss_clip": 0.01095398, + "auxiliary_loss_mlp": 0.01040225, + "balance_loss_clip": 1.04306865, + "balance_loss_mlp": 1.02714145, + "epoch": 0.582293702089283, + "flos": 26722494639360.0, + "grad_norm": 1.9911340281622987, + "language_loss": 0.78017914, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.80153537, + "num_input_tokens_seen": 208666180, + "step": 9685, + "time_per_iteration": 2.6262004375457764 + }, + { + "auxiliary_loss_clip": 0.01119541, + "auxiliary_loss_mlp": 0.01037721, + "balance_loss_clip": 1.04301238, + "balance_loss_mlp": 1.02463138, + "epoch": 0.582353825341951, + "flos": 17348661129600.0, + "grad_norm": 1.534499166945951, + "language_loss": 0.75514185, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.7767145, + "num_input_tokens_seen": 208684240, + "step": 9686, + "time_per_iteration": 2.4799644947052 + }, + { + "auxiliary_loss_clip": 0.01029752, + "auxiliary_loss_mlp": 0.00999968, + "balance_loss_clip": 1.01506877, + "balance_loss_mlp": 0.99893057, + "epoch": 0.5824139485946189, + "flos": 55473261534720.0, + "grad_norm": 0.8130045203422185, + "language_loss": 0.57394326, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.59424043, + "num_input_tokens_seen": 208736090, + "step": 9687, + "time_per_iteration": 2.9722440242767334 + }, + { + "auxiliary_loss_clip": 0.01079028, + "auxiliary_loss_mlp": 0.01038321, + "balance_loss_clip": 1.03950655, + "balance_loss_mlp": 1.02262747, + "epoch": 0.582474071847287, + "flos": 20303031790080.0, + "grad_norm": 1.7516030258378996, + "language_loss": 0.70063931, + "learning_rate": 1.566302259738727e-06, + "loss": 0.72181278, + "num_input_tokens_seen": 208754600, + "step": 9688, + "time_per_iteration": 2.802976369857788 + }, + { + "auxiliary_loss_clip": 0.01110989, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.04311526, + "balance_loss_mlp": 1.02075768, + "epoch": 0.5825341950999549, + "flos": 23878010661120.0, + "grad_norm": 2.126323858989827, + "language_loss": 0.65013343, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.67157751, + "num_input_tokens_seen": 208773140, + "step": 9689, + "time_per_iteration": 2.6299288272857666 + }, + { + "auxiliary_loss_clip": 0.01095981, + "auxiliary_loss_mlp": 0.00770437, + "balance_loss_clip": 1.04142618, + "balance_loss_mlp": 1.00009274, + "epoch": 0.5825943183526229, + "flos": 23113041690240.0, + "grad_norm": 1.599269729220552, + "language_loss": 0.7352339, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.75389808, + "num_input_tokens_seen": 208793410, + "step": 9690, + "time_per_iteration": 2.6903798580169678 + }, + { + "auxiliary_loss_clip": 0.01096107, + "auxiliary_loss_mlp": 0.01038726, + "balance_loss_clip": 1.03903055, + "balance_loss_mlp": 1.02372348, + "epoch": 0.5826544416052909, + "flos": 22857429530880.0, + "grad_norm": 1.61399606195473, + "language_loss": 0.75654376, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.77789205, + "num_input_tokens_seen": 208811920, + "step": 9691, + "time_per_iteration": 2.7056210041046143 + }, + { + "auxiliary_loss_clip": 0.01109061, + "auxiliary_loss_mlp": 0.01032641, + "balance_loss_clip": 1.04082966, + "balance_loss_mlp": 1.01972461, + "epoch": 0.5827145648579588, + "flos": 31501845555840.0, + "grad_norm": 2.2562223304416755, + "language_loss": 0.80682158, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.82823855, + "num_input_tokens_seen": 208834720, + "step": 9692, + "time_per_iteration": 2.7577641010284424 + }, + { + "auxiliary_loss_clip": 0.01028968, + "auxiliary_loss_mlp": 0.01002786, + "balance_loss_clip": 1.01420581, + "balance_loss_mlp": 1.00161159, + "epoch": 0.5827746881106268, + "flos": 69811817074560.0, + "grad_norm": 0.7560919402716259, + "language_loss": 0.5693723, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.58968985, + "num_input_tokens_seen": 208898415, + "step": 9693, + "time_per_iteration": 3.145176887512207 + }, + { + "auxiliary_loss_clip": 0.01105496, + "auxiliary_loss_mlp": 0.0076985, + "balance_loss_clip": 1.04020321, + "balance_loss_mlp": 1.00010538, + "epoch": 0.5828348113632947, + "flos": 23112395245440.0, + "grad_norm": 2.61225629767126, + "language_loss": 0.79375291, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.81250644, + "num_input_tokens_seen": 208919045, + "step": 9694, + "time_per_iteration": 2.7443995475769043 + }, + { + "auxiliary_loss_clip": 0.01083069, + "auxiliary_loss_mlp": 0.01042673, + "balance_loss_clip": 1.03822398, + "balance_loss_mlp": 1.02909541, + "epoch": 0.5828949346159628, + "flos": 21873082245120.0, + "grad_norm": 1.4254101237523094, + "language_loss": 0.76205015, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.78330755, + "num_input_tokens_seen": 208939375, + "step": 9695, + "time_per_iteration": 2.688107490539551 + }, + { + "auxiliary_loss_clip": 0.01027446, + "auxiliary_loss_mlp": 0.01003052, + "balance_loss_clip": 1.01271224, + "balance_loss_mlp": 1.00191391, + "epoch": 0.5829550578686307, + "flos": 65962553950080.0, + "grad_norm": 0.7742487055111029, + "language_loss": 0.54982823, + "learning_rate": 1.563261231127095e-06, + "loss": 0.57013327, + "num_input_tokens_seen": 209004760, + "step": 9696, + "time_per_iteration": 3.239593029022217 + }, + { + "auxiliary_loss_clip": 0.0108245, + "auxiliary_loss_mlp": 0.01030212, + "balance_loss_clip": 1.04170382, + "balance_loss_mlp": 1.01751041, + "epoch": 0.5830151811212987, + "flos": 16289799079680.0, + "grad_norm": 2.124266497676036, + "language_loss": 0.76664579, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.78777242, + "num_input_tokens_seen": 209022930, + "step": 9697, + "time_per_iteration": 2.6790308952331543 + }, + { + "auxiliary_loss_clip": 0.01121339, + "auxiliary_loss_mlp": 0.01035657, + "balance_loss_clip": 1.04233479, + "balance_loss_mlp": 1.02154899, + "epoch": 0.5830753043739666, + "flos": 24168851084160.0, + "grad_norm": 1.5579611092820027, + "language_loss": 0.77714729, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.79871726, + "num_input_tokens_seen": 209043740, + "step": 9698, + "time_per_iteration": 2.635885715484619 + }, + { + "auxiliary_loss_clip": 0.01079274, + "auxiliary_loss_mlp": 0.01038337, + "balance_loss_clip": 1.0413661, + "balance_loss_mlp": 1.02498519, + "epoch": 0.5831354276266346, + "flos": 27059050097280.0, + "grad_norm": 1.5784163010462595, + "language_loss": 0.84167337, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.86284947, + "num_input_tokens_seen": 209068885, + "step": 9699, + "time_per_iteration": 2.8312487602233887 + }, + { + "auxiliary_loss_clip": 0.01095092, + "auxiliary_loss_mlp": 0.010366, + "balance_loss_clip": 1.03954756, + "balance_loss_mlp": 1.02281952, + "epoch": 0.5831955508793025, + "flos": 23623475909760.0, + "grad_norm": 2.065302984121428, + "language_loss": 0.65489984, + "learning_rate": 1.561741113828305e-06, + "loss": 0.67621672, + "num_input_tokens_seen": 209087340, + "step": 9700, + "time_per_iteration": 2.784442901611328 + }, + { + "auxiliary_loss_clip": 0.01108875, + "auxiliary_loss_mlp": 0.01034575, + "balance_loss_clip": 1.04089403, + "balance_loss_mlp": 1.02150953, + "epoch": 0.5832556741319705, + "flos": 24973250209920.0, + "grad_norm": 1.5991522353668115, + "language_loss": 0.71547067, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.73690522, + "num_input_tokens_seen": 209108840, + "step": 9701, + "time_per_iteration": 2.6895313262939453 + }, + { + "auxiliary_loss_clip": 0.01096283, + "auxiliary_loss_mlp": 0.01041435, + "balance_loss_clip": 1.04180253, + "balance_loss_mlp": 1.02841139, + "epoch": 0.5833157973846385, + "flos": 23221563655680.0, + "grad_norm": 1.6635802287235106, + "language_loss": 0.85541105, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.87678826, + "num_input_tokens_seen": 209127985, + "step": 9702, + "time_per_iteration": 2.6746225357055664 + }, + { + "auxiliary_loss_clip": 0.01102319, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.04071856, + "balance_loss_mlp": 1.02131367, + "epoch": 0.5833759206373065, + "flos": 21977941023360.0, + "grad_norm": 1.4183987857502756, + "language_loss": 0.77847046, + "learning_rate": 1.560601200301392e-06, + "loss": 0.79983002, + "num_input_tokens_seen": 209146885, + "step": 9703, + "time_per_iteration": 4.3035502433776855 + }, + { + "auxiliary_loss_clip": 0.01122779, + "auxiliary_loss_mlp": 0.01034804, + "balance_loss_clip": 1.04359257, + "balance_loss_mlp": 1.0208385, + "epoch": 0.5834360438899745, + "flos": 21762405463680.0, + "grad_norm": 1.8064531110729998, + "language_loss": 0.71067387, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.73224974, + "num_input_tokens_seen": 209166130, + "step": 9704, + "time_per_iteration": 4.107022762298584 + }, + { + "auxiliary_loss_clip": 0.01094563, + "auxiliary_loss_mlp": 0.01038062, + "balance_loss_clip": 1.04187346, + "balance_loss_mlp": 1.02561641, + "epoch": 0.5834961671426424, + "flos": 15992566035840.0, + "grad_norm": 1.6675564380890735, + "language_loss": 0.81363106, + "learning_rate": 1.559841341236335e-06, + "loss": 0.8349573, + "num_input_tokens_seen": 209183350, + "step": 9705, + "time_per_iteration": 2.7058465480804443 + }, + { + "auxiliary_loss_clip": 0.010702, + "auxiliary_loss_mlp": 0.01034129, + "balance_loss_clip": 1.03672004, + "balance_loss_mlp": 1.02125466, + "epoch": 0.5835562903953104, + "flos": 22818322598400.0, + "grad_norm": 1.7137147806220967, + "language_loss": 0.80614948, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.82719278, + "num_input_tokens_seen": 209203945, + "step": 9706, + "time_per_iteration": 2.776280164718628 + }, + { + "auxiliary_loss_clip": 0.01105997, + "auxiliary_loss_mlp": 0.0103669, + "balance_loss_clip": 1.04129124, + "balance_loss_mlp": 1.02315402, + "epoch": 0.5836164136479783, + "flos": 48468056624640.0, + "grad_norm": 2.0771057832537414, + "language_loss": 0.74647468, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.76790154, + "num_input_tokens_seen": 209227080, + "step": 9707, + "time_per_iteration": 2.857609272003174 + }, + { + "auxiliary_loss_clip": 0.01081909, + "auxiliary_loss_mlp": 0.01031553, + "balance_loss_clip": 1.03649998, + "balance_loss_mlp": 1.01878548, + "epoch": 0.5836765369006464, + "flos": 26905998245760.0, + "grad_norm": 2.7159127892637067, + "language_loss": 0.81819087, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.83932543, + "num_input_tokens_seen": 209248170, + "step": 9708, + "time_per_iteration": 4.28432822227478 + }, + { + "auxiliary_loss_clip": 0.01102304, + "auxiliary_loss_mlp": 0.01032201, + "balance_loss_clip": 1.0439347, + "balance_loss_mlp": 1.01914191, + "epoch": 0.5837366601533143, + "flos": 20084048524800.0, + "grad_norm": 1.4146539482815383, + "language_loss": 0.78367102, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.80501604, + "num_input_tokens_seen": 209267730, + "step": 9709, + "time_per_iteration": 2.6337647438049316 + }, + { + "auxiliary_loss_clip": 0.01017869, + "auxiliary_loss_mlp": 0.00999553, + "balance_loss_clip": 1.01163578, + "balance_loss_mlp": 0.99844998, + "epoch": 0.5837967834059823, + "flos": 65363885971200.0, + "grad_norm": 0.7723563596720286, + "language_loss": 0.5654794, + "learning_rate": 1.557941985915844e-06, + "loss": 0.58565366, + "num_input_tokens_seen": 209332510, + "step": 9710, + "time_per_iteration": 3.255643844604492 + }, + { + "auxiliary_loss_clip": 0.01084064, + "auxiliary_loss_mlp": 0.01035883, + "balance_loss_clip": 1.03939962, + "balance_loss_mlp": 1.02429581, + "epoch": 0.5838569066586502, + "flos": 25338641310720.0, + "grad_norm": 1.5220841159249796, + "language_loss": 0.6560964, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.67729586, + "num_input_tokens_seen": 209353355, + "step": 9711, + "time_per_iteration": 2.7771286964416504 + }, + { + "auxiliary_loss_clip": 0.01124372, + "auxiliary_loss_mlp": 0.01037032, + "balance_loss_clip": 1.04342008, + "balance_loss_mlp": 1.02279854, + "epoch": 0.5839170299113182, + "flos": 22229243550720.0, + "grad_norm": 1.6457925309868888, + "language_loss": 0.78601259, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.80762661, + "num_input_tokens_seen": 209370960, + "step": 9712, + "time_per_iteration": 2.6130564212799072 + }, + { + "auxiliary_loss_clip": 0.01079932, + "auxiliary_loss_mlp": 0.0077171, + "balance_loss_clip": 1.03610897, + "balance_loss_mlp": 1.00007439, + "epoch": 0.5839771531639861, + "flos": 22200012858240.0, + "grad_norm": 1.6123088749448828, + "language_loss": 0.73624194, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.75475836, + "num_input_tokens_seen": 209390955, + "step": 9713, + "time_per_iteration": 2.752688407897949 + }, + { + "auxiliary_loss_clip": 0.01098855, + "auxiliary_loss_mlp": 0.0103448, + "balance_loss_clip": 1.03949571, + "balance_loss_mlp": 1.02000761, + "epoch": 0.5840372764166541, + "flos": 22419355259520.0, + "grad_norm": 2.057640389539287, + "language_loss": 0.69393289, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.71526623, + "num_input_tokens_seen": 209410260, + "step": 9714, + "time_per_iteration": 2.697676181793213 + }, + { + "auxiliary_loss_clip": 0.01118564, + "auxiliary_loss_mlp": 0.01037008, + "balance_loss_clip": 1.04040492, + "balance_loss_mlp": 1.02368677, + "epoch": 0.5840973996693221, + "flos": 19828256797440.0, + "grad_norm": 1.733937894535342, + "language_loss": 0.80418617, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.82574189, + "num_input_tokens_seen": 209429920, + "step": 9715, + "time_per_iteration": 2.5865848064422607 + }, + { + "auxiliary_loss_clip": 0.01094879, + "auxiliary_loss_mlp": 0.0103561, + "balance_loss_clip": 1.03690863, + "balance_loss_mlp": 1.02182388, + "epoch": 0.5841575229219901, + "flos": 21142982401920.0, + "grad_norm": 2.4772648960449586, + "language_loss": 0.72541732, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.74672222, + "num_input_tokens_seen": 209449470, + "step": 9716, + "time_per_iteration": 2.760240077972412 + }, + { + "auxiliary_loss_clip": 0.01088946, + "auxiliary_loss_mlp": 0.01033627, + "balance_loss_clip": 1.03793585, + "balance_loss_mlp": 1.02124131, + "epoch": 0.5842176461746581, + "flos": 24640322025600.0, + "grad_norm": 1.7815945401286815, + "language_loss": 0.75058079, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.7718066, + "num_input_tokens_seen": 209467695, + "step": 9717, + "time_per_iteration": 2.7470862865448 + }, + { + "auxiliary_loss_clip": 0.01109202, + "auxiliary_loss_mlp": 0.01038785, + "balance_loss_clip": 1.04155052, + "balance_loss_mlp": 1.02575004, + "epoch": 0.584277769427326, + "flos": 19131158574720.0, + "grad_norm": 3.2108802254609827, + "language_loss": 0.79614913, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.81762898, + "num_input_tokens_seen": 209484250, + "step": 9718, + "time_per_iteration": 2.6843111515045166 + }, + { + "auxiliary_loss_clip": 0.01094695, + "auxiliary_loss_mlp": 0.01032977, + "balance_loss_clip": 1.03992128, + "balance_loss_mlp": 1.01880288, + "epoch": 0.584337892679994, + "flos": 22675111073280.0, + "grad_norm": 1.6948464280827684, + "language_loss": 0.67670137, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.69797808, + "num_input_tokens_seen": 209502830, + "step": 9719, + "time_per_iteration": 2.658722400665283 + }, + { + "auxiliary_loss_clip": 0.01119777, + "auxiliary_loss_mlp": 0.01038004, + "balance_loss_clip": 1.04168653, + "balance_loss_mlp": 1.02466464, + "epoch": 0.5843980159326619, + "flos": 31284083352960.0, + "grad_norm": 1.997670996956063, + "language_loss": 0.75795102, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.77952886, + "num_input_tokens_seen": 209525995, + "step": 9720, + "time_per_iteration": 2.6901891231536865 + }, + { + "auxiliary_loss_clip": 0.01082891, + "auxiliary_loss_mlp": 0.01039482, + "balance_loss_clip": 1.04280281, + "balance_loss_mlp": 1.02657783, + "epoch": 0.58445813918533, + "flos": 22748117466240.0, + "grad_norm": 1.7155190503214905, + "language_loss": 0.83123529, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.85245907, + "num_input_tokens_seen": 209545895, + "step": 9721, + "time_per_iteration": 2.71907639503479 + }, + { + "auxiliary_loss_clip": 0.01037273, + "auxiliary_loss_mlp": 0.01006637, + "balance_loss_clip": 1.01290512, + "balance_loss_mlp": 1.00543344, + "epoch": 0.5845182624379979, + "flos": 60686556658560.0, + "grad_norm": 0.9400176499911559, + "language_loss": 0.7134223, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.73386145, + "num_input_tokens_seen": 209602315, + "step": 9722, + "time_per_iteration": 3.1959645748138428 + }, + { + "auxiliary_loss_clip": 0.01099534, + "auxiliary_loss_mlp": 0.01040774, + "balance_loss_clip": 1.03890538, + "balance_loss_mlp": 1.02751184, + "epoch": 0.5845783856906659, + "flos": 16362446336640.0, + "grad_norm": 1.9834511811038693, + "language_loss": 0.89731622, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.91871929, + "num_input_tokens_seen": 209617615, + "step": 9723, + "time_per_iteration": 2.592627763748169 + }, + { + "auxiliary_loss_clip": 0.01094383, + "auxiliary_loss_mlp": 0.01038255, + "balance_loss_clip": 1.04275918, + "balance_loss_mlp": 1.02554142, + "epoch": 0.5846385089433338, + "flos": 20083402080000.0, + "grad_norm": 1.398468813522248, + "language_loss": 0.68486446, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.70619082, + "num_input_tokens_seen": 209637005, + "step": 9724, + "time_per_iteration": 2.655640125274658 + }, + { + "auxiliary_loss_clip": 0.01110347, + "auxiliary_loss_mlp": 0.01036604, + "balance_loss_clip": 1.04291487, + "balance_loss_mlp": 1.02341366, + "epoch": 0.5846986321960018, + "flos": 17311062568320.0, + "grad_norm": 1.717409456716096, + "language_loss": 0.86049938, + "learning_rate": 1.552246441587197e-06, + "loss": 0.88196886, + "num_input_tokens_seen": 209653170, + "step": 9725, + "time_per_iteration": 2.6035261154174805 + }, + { + "auxiliary_loss_clip": 0.01095255, + "auxiliary_loss_mlp": 0.010422, + "balance_loss_clip": 1.04249406, + "balance_loss_mlp": 1.02926588, + "epoch": 0.5847587554486697, + "flos": 17197907748480.0, + "grad_norm": 1.6193535846243259, + "language_loss": 0.82923484, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.85060942, + "num_input_tokens_seen": 209671275, + "step": 9726, + "time_per_iteration": 2.655017137527466 + }, + { + "auxiliary_loss_clip": 0.01055108, + "auxiliary_loss_mlp": 0.00770936, + "balance_loss_clip": 1.03983736, + "balance_loss_mlp": 1.00008965, + "epoch": 0.5848188787013378, + "flos": 24529106540160.0, + "grad_norm": 1.736262693329601, + "language_loss": 0.66609311, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.68435353, + "num_input_tokens_seen": 209690380, + "step": 9727, + "time_per_iteration": 2.820906639099121 + }, + { + "auxiliary_loss_clip": 0.01083507, + "auxiliary_loss_mlp": 0.01045274, + "balance_loss_clip": 1.03799105, + "balance_loss_mlp": 1.03056347, + "epoch": 0.5848790019540057, + "flos": 20628382204800.0, + "grad_norm": 1.7999573427153348, + "language_loss": 0.81628853, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.83757633, + "num_input_tokens_seen": 209708845, + "step": 9728, + "time_per_iteration": 2.7597923278808594 + }, + { + "auxiliary_loss_clip": 0.01103874, + "auxiliary_loss_mlp": 0.01042076, + "balance_loss_clip": 1.03965843, + "balance_loss_mlp": 1.0296309, + "epoch": 0.5849391252066737, + "flos": 22418852469120.0, + "grad_norm": 2.078641796720901, + "language_loss": 0.77696002, + "learning_rate": 1.550728272957027e-06, + "loss": 0.79841954, + "num_input_tokens_seen": 209729000, + "step": 9729, + "time_per_iteration": 2.663864850997925 + }, + { + "auxiliary_loss_clip": 0.01102359, + "auxiliary_loss_mlp": 0.0103712, + "balance_loss_clip": 1.03954148, + "balance_loss_mlp": 1.022475, + "epoch": 0.5849992484593417, + "flos": 25410929431680.0, + "grad_norm": 1.8450519403802392, + "language_loss": 0.70192915, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.72332394, + "num_input_tokens_seen": 209747435, + "step": 9730, + "time_per_iteration": 2.6668407917022705 + }, + { + "auxiliary_loss_clip": 0.01124849, + "auxiliary_loss_mlp": 0.01036601, + "balance_loss_clip": 1.04504502, + "balance_loss_mlp": 1.02224803, + "epoch": 0.5850593717120096, + "flos": 21065163586560.0, + "grad_norm": 1.6923527463370078, + "language_loss": 0.78973091, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.81134546, + "num_input_tokens_seen": 209764910, + "step": 9731, + "time_per_iteration": 2.6093108654022217 + }, + { + "auxiliary_loss_clip": 0.01103256, + "auxiliary_loss_mlp": 0.01046113, + "balance_loss_clip": 1.04004776, + "balance_loss_mlp": 1.03114593, + "epoch": 0.5851194949646776, + "flos": 25301545539840.0, + "grad_norm": 2.322897025480009, + "language_loss": 0.70276213, + "learning_rate": 1.549589825316528e-06, + "loss": 0.7242558, + "num_input_tokens_seen": 209786115, + "step": 9732, + "time_per_iteration": 2.6483914852142334 + }, + { + "auxiliary_loss_clip": 0.01068434, + "auxiliary_loss_mlp": 0.01041994, + "balance_loss_clip": 1.03862739, + "balance_loss_mlp": 1.02584136, + "epoch": 0.5851796182173455, + "flos": 23587242065280.0, + "grad_norm": 1.8361177860467572, + "language_loss": 0.53096974, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.55207402, + "num_input_tokens_seen": 209806095, + "step": 9733, + "time_per_iteration": 2.7837493419647217 + }, + { + "auxiliary_loss_clip": 0.0110623, + "auxiliary_loss_mlp": 0.01037809, + "balance_loss_clip": 1.04327631, + "balance_loss_mlp": 1.023206, + "epoch": 0.5852397414700136, + "flos": 24822712310400.0, + "grad_norm": 2.1555850580582945, + "language_loss": 0.87172639, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.89316678, + "num_input_tokens_seen": 209823650, + "step": 9734, + "time_per_iteration": 2.647822618484497 + }, + { + "auxiliary_loss_clip": 0.0109023, + "auxiliary_loss_mlp": 0.01035437, + "balance_loss_clip": 1.03915906, + "balance_loss_mlp": 1.02305174, + "epoch": 0.5852998647226815, + "flos": 19937784343680.0, + "grad_norm": 1.6523754491187739, + "language_loss": 0.72117126, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.74242795, + "num_input_tokens_seen": 209843220, + "step": 9735, + "time_per_iteration": 2.6707499027252197 + }, + { + "auxiliary_loss_clip": 0.01111823, + "auxiliary_loss_mlp": 0.01038537, + "balance_loss_clip": 1.04385519, + "balance_loss_mlp": 1.02448797, + "epoch": 0.5853599879753495, + "flos": 16720367408640.0, + "grad_norm": 5.660280505854459, + "language_loss": 0.74303764, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.76454127, + "num_input_tokens_seen": 209854880, + "step": 9736, + "time_per_iteration": 2.6474769115448 + }, + { + "auxiliary_loss_clip": 0.01084732, + "auxiliary_loss_mlp": 0.01038896, + "balance_loss_clip": 1.03950977, + "balance_loss_mlp": 1.0241437, + "epoch": 0.5854201112280174, + "flos": 44456583680640.0, + "grad_norm": 1.705724680337342, + "language_loss": 0.7066859, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.72792208, + "num_input_tokens_seen": 209877870, + "step": 9737, + "time_per_iteration": 2.8703529834747314 + }, + { + "auxiliary_loss_clip": 0.01079098, + "auxiliary_loss_mlp": 0.01042352, + "balance_loss_clip": 1.03875983, + "balance_loss_mlp": 1.02830887, + "epoch": 0.5854802344806854, + "flos": 20339193807360.0, + "grad_norm": 1.7465210824086157, + "language_loss": 0.82571793, + "learning_rate": 1.547313391573169e-06, + "loss": 0.84693247, + "num_input_tokens_seen": 209896690, + "step": 9738, + "time_per_iteration": 2.6930525302886963 + }, + { + "auxiliary_loss_clip": 0.01123353, + "auxiliary_loss_mlp": 0.00771973, + "balance_loss_clip": 1.04294574, + "balance_loss_mlp": 1.00014758, + "epoch": 0.5855403577333533, + "flos": 20921054221440.0, + "grad_norm": 1.6403149295747592, + "language_loss": 0.68084544, + "learning_rate": 1.546934045946082e-06, + "loss": 0.6997987, + "num_input_tokens_seen": 209914640, + "step": 9739, + "time_per_iteration": 2.6120223999023438 + }, + { + "auxiliary_loss_clip": 0.01122823, + "auxiliary_loss_mlp": 0.01028069, + "balance_loss_clip": 1.04343581, + "balance_loss_mlp": 1.01383555, + "epoch": 0.5856004809860214, + "flos": 20448649526400.0, + "grad_norm": 2.346965983276941, + "language_loss": 0.5878849, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.60939384, + "num_input_tokens_seen": 209933375, + "step": 9740, + "time_per_iteration": 2.6393442153930664 + }, + { + "auxiliary_loss_clip": 0.01091861, + "auxiliary_loss_mlp": 0.01034284, + "balance_loss_clip": 1.03964174, + "balance_loss_mlp": 1.01996112, + "epoch": 0.5856606042386893, + "flos": 19640766781440.0, + "grad_norm": 1.8171598434150709, + "language_loss": 0.75508714, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.77634859, + "num_input_tokens_seen": 209952055, + "step": 9741, + "time_per_iteration": 2.6550915241241455 + }, + { + "auxiliary_loss_clip": 0.01085436, + "auxiliary_loss_mlp": 0.01034709, + "balance_loss_clip": 1.03900838, + "balance_loss_mlp": 1.02109587, + "epoch": 0.5857207274913573, + "flos": 21686166846720.0, + "grad_norm": 1.6487285096737663, + "language_loss": 0.75935274, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.78055418, + "num_input_tokens_seen": 209971190, + "step": 9742, + "time_per_iteration": 4.381955146789551 + }, + { + "auxiliary_loss_clip": 0.01098042, + "auxiliary_loss_mlp": 0.01033792, + "balance_loss_clip": 1.04340363, + "balance_loss_mlp": 1.0209775, + "epoch": 0.5857808507440253, + "flos": 23182708118400.0, + "grad_norm": 1.6035533638401356, + "language_loss": 0.74864548, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.76996386, + "num_input_tokens_seen": 209990695, + "step": 9743, + "time_per_iteration": 5.72803258895874 + }, + { + "auxiliary_loss_clip": 0.01098389, + "auxiliary_loss_mlp": 0.01032176, + "balance_loss_clip": 1.04424453, + "balance_loss_mlp": 1.01949835, + "epoch": 0.5858409739966932, + "flos": 27235299156480.0, + "grad_norm": 1.98808093933083, + "language_loss": 0.81046313, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.83176875, + "num_input_tokens_seen": 210010210, + "step": 9744, + "time_per_iteration": 2.7265267372131348 + }, + { + "auxiliary_loss_clip": 0.01094798, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.04087067, + "balance_loss_mlp": 1.01669562, + "epoch": 0.5859010972493612, + "flos": 27855512317440.0, + "grad_norm": 1.7065591540492446, + "language_loss": 0.71426034, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.73551434, + "num_input_tokens_seen": 210030030, + "step": 9745, + "time_per_iteration": 2.737842082977295 + }, + { + "auxiliary_loss_clip": 0.01023206, + "auxiliary_loss_mlp": 0.01004158, + "balance_loss_clip": 1.01973987, + "balance_loss_mlp": 1.00301957, + "epoch": 0.5859612205020291, + "flos": 70007064428160.0, + "grad_norm": 0.7272764484566879, + "language_loss": 0.53267932, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.552953, + "num_input_tokens_seen": 210094840, + "step": 9746, + "time_per_iteration": 3.3027215003967285 + }, + { + "auxiliary_loss_clip": 0.01094571, + "auxiliary_loss_mlp": 0.01035687, + "balance_loss_clip": 1.04237437, + "balance_loss_mlp": 1.02163196, + "epoch": 0.5860213437546972, + "flos": 24056019486720.0, + "grad_norm": 2.0261235602549466, + "language_loss": 0.73138428, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.75268686, + "num_input_tokens_seen": 210114660, + "step": 9747, + "time_per_iteration": 4.224852085113525 + }, + { + "auxiliary_loss_clip": 0.01092652, + "auxiliary_loss_mlp": 0.01046673, + "balance_loss_clip": 1.03909874, + "balance_loss_mlp": 1.03123569, + "epoch": 0.5860814670073651, + "flos": 18947583141120.0, + "grad_norm": 1.867050340664373, + "language_loss": 0.81183696, + "learning_rate": 1.543520710142051e-06, + "loss": 0.83323026, + "num_input_tokens_seen": 210132770, + "step": 9748, + "time_per_iteration": 2.6568126678466797 + }, + { + "auxiliary_loss_clip": 0.01111974, + "auxiliary_loss_mlp": 0.01038317, + "balance_loss_clip": 1.04387689, + "balance_loss_mlp": 1.0241785, + "epoch": 0.5861415902600331, + "flos": 22561848512640.0, + "grad_norm": 1.7272716772059427, + "language_loss": 0.72221619, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.7437191, + "num_input_tokens_seen": 210151895, + "step": 9749, + "time_per_iteration": 2.6895384788513184 + }, + { + "auxiliary_loss_clip": 0.01101508, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.04664361, + "balance_loss_mlp": 1.02205098, + "epoch": 0.586201713512701, + "flos": 14392027912320.0, + "grad_norm": 2.592210537631562, + "language_loss": 0.75040287, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.77177632, + "num_input_tokens_seen": 210168040, + "step": 9750, + "time_per_iteration": 2.737083911895752 + }, + { + "auxiliary_loss_clip": 0.0108729, + "auxiliary_loss_mlp": 0.01036704, + "balance_loss_clip": 1.04378581, + "balance_loss_mlp": 1.02202928, + "epoch": 0.586261836765369, + "flos": 19498560837120.0, + "grad_norm": 1.8612157402372733, + "language_loss": 0.70927167, + "learning_rate": 1.542383242598344e-06, + "loss": 0.73051161, + "num_input_tokens_seen": 210187720, + "step": 9751, + "time_per_iteration": 2.7111241817474365 + }, + { + "auxiliary_loss_clip": 0.01125805, + "auxiliary_loss_mlp": 0.01043313, + "balance_loss_clip": 1.04531717, + "balance_loss_mlp": 1.02769637, + "epoch": 0.5863219600180369, + "flos": 20701819560960.0, + "grad_norm": 1.7129799601344229, + "language_loss": 0.74548101, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.76717222, + "num_input_tokens_seen": 210206080, + "step": 9752, + "time_per_iteration": 2.626716136932373 + }, + { + "auxiliary_loss_clip": 0.01108046, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.04339004, + "balance_loss_mlp": 1.01842308, + "epoch": 0.586382083270705, + "flos": 19792130693760.0, + "grad_norm": 1.767262069370236, + "language_loss": 0.77331054, + "learning_rate": 1.541625017642943e-06, + "loss": 0.79471087, + "num_input_tokens_seen": 210225660, + "step": 9753, + "time_per_iteration": 2.6093239784240723 + }, + { + "auxiliary_loss_clip": 0.01116295, + "auxiliary_loss_mlp": 0.01029138, + "balance_loss_clip": 1.04288065, + "balance_loss_mlp": 1.01651943, + "epoch": 0.5864422065233729, + "flos": 16500558130560.0, + "grad_norm": 1.6790243104766265, + "language_loss": 0.70988512, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.73133945, + "num_input_tokens_seen": 210242725, + "step": 9754, + "time_per_iteration": 2.5604028701782227 + }, + { + "auxiliary_loss_clip": 0.01095441, + "auxiliary_loss_mlp": 0.01034082, + "balance_loss_clip": 1.0401392, + "balance_loss_mlp": 1.0194732, + "epoch": 0.5865023297760409, + "flos": 20413277608320.0, + "grad_norm": 2.0857561604768065, + "language_loss": 0.72379315, + "learning_rate": 1.540866862214043e-06, + "loss": 0.7450884, + "num_input_tokens_seen": 210263225, + "step": 9755, + "time_per_iteration": 2.656785011291504 + }, + { + "auxiliary_loss_clip": 0.01012678, + "auxiliary_loss_mlp": 0.01004177, + "balance_loss_clip": 1.01731849, + "balance_loss_mlp": 1.00294328, + "epoch": 0.5865624530287089, + "flos": 63350769254400.0, + "grad_norm": 0.7450356800362308, + "language_loss": 0.56920898, + "learning_rate": 1.540487810607967e-06, + "loss": 0.58937752, + "num_input_tokens_seen": 210322310, + "step": 9756, + "time_per_iteration": 3.2905054092407227 + }, + { + "auxiliary_loss_clip": 0.01115752, + "auxiliary_loss_mlp": 0.01031709, + "balance_loss_clip": 1.04039788, + "balance_loss_mlp": 1.01922202, + "epoch": 0.5866225762813768, + "flos": 27016279977600.0, + "grad_norm": 11.015446509800649, + "language_loss": 0.76104087, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.78251553, + "num_input_tokens_seen": 210340845, + "step": 9757, + "time_per_iteration": 2.6325418949127197 + }, + { + "auxiliary_loss_clip": 0.01021435, + "auxiliary_loss_mlp": 0.01009977, + "balance_loss_clip": 1.01624918, + "balance_loss_mlp": 1.00884426, + "epoch": 0.5866826995340448, + "flos": 72987038507520.0, + "grad_norm": 0.8546616305193999, + "language_loss": 0.60420328, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.62451738, + "num_input_tokens_seen": 210397815, + "step": 9758, + "time_per_iteration": 3.227780342102051 + }, + { + "auxiliary_loss_clip": 0.0112535, + "auxiliary_loss_mlp": 0.01036264, + "balance_loss_clip": 1.0447619, + "balance_loss_mlp": 1.02245307, + "epoch": 0.5867428227867127, + "flos": 21285727050240.0, + "grad_norm": 2.191365428773927, + "language_loss": 0.71787071, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.73948681, + "num_input_tokens_seen": 210413900, + "step": 9759, + "time_per_iteration": 2.593574047088623 + }, + { + "auxiliary_loss_clip": 0.01096792, + "auxiliary_loss_mlp": 0.0103787, + "balance_loss_clip": 1.04106188, + "balance_loss_mlp": 1.02525759, + "epoch": 0.5868029460393808, + "flos": 33468852188160.0, + "grad_norm": 1.6194048366561686, + "language_loss": 0.72730052, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.74864709, + "num_input_tokens_seen": 210434110, + "step": 9760, + "time_per_iteration": 2.7872965335845947 + }, + { + "auxiliary_loss_clip": 0.01107006, + "auxiliary_loss_mlp": 0.01032523, + "balance_loss_clip": 1.04269731, + "balance_loss_mlp": 1.01910627, + "epoch": 0.5868630692920487, + "flos": 17889475276800.0, + "grad_norm": 1.9662195987833622, + "language_loss": 0.72611898, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.74751425, + "num_input_tokens_seen": 210451685, + "step": 9761, + "time_per_iteration": 2.701533317565918 + }, + { + "auxiliary_loss_clip": 0.01106159, + "auxiliary_loss_mlp": 0.01036709, + "balance_loss_clip": 1.04491735, + "balance_loss_mlp": 1.02211809, + "epoch": 0.5869231925447167, + "flos": 21035035054080.0, + "grad_norm": 1.7395731063260564, + "language_loss": 0.75217378, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.77360249, + "num_input_tokens_seen": 210470825, + "step": 9762, + "time_per_iteration": 2.721714496612549 + }, + { + "auxiliary_loss_clip": 0.01082216, + "auxiliary_loss_mlp": 0.01036155, + "balance_loss_clip": 1.03985929, + "balance_loss_mlp": 1.02164149, + "epoch": 0.5869833157973846, + "flos": 74738219293440.0, + "grad_norm": 4.660992958273475, + "language_loss": 0.72322762, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.74441129, + "num_input_tokens_seen": 210500075, + "step": 9763, + "time_per_iteration": 3.1116628646850586 + }, + { + "auxiliary_loss_clip": 0.01101878, + "auxiliary_loss_mlp": 0.01034613, + "balance_loss_clip": 1.03773355, + "balance_loss_mlp": 1.02203679, + "epoch": 0.5870434390500526, + "flos": 17638998762240.0, + "grad_norm": 1.815727939349207, + "language_loss": 0.80352604, + "learning_rate": 1.53745602625755e-06, + "loss": 0.82489097, + "num_input_tokens_seen": 210518150, + "step": 9764, + "time_per_iteration": 2.682579278945923 + }, + { + "auxiliary_loss_clip": 0.01091583, + "auxiliary_loss_mlp": 0.01034941, + "balance_loss_clip": 1.04217017, + "balance_loss_mlp": 1.02132726, + "epoch": 0.5871035623027205, + "flos": 21506146859520.0, + "grad_norm": 1.83004906571999, + "language_loss": 0.79265928, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.81392443, + "num_input_tokens_seen": 210537760, + "step": 9765, + "time_per_iteration": 2.6972546577453613 + }, + { + "auxiliary_loss_clip": 0.01088979, + "auxiliary_loss_mlp": 0.01039927, + "balance_loss_clip": 1.04256606, + "balance_loss_mlp": 1.02595556, + "epoch": 0.5871636855553886, + "flos": 13551861818880.0, + "grad_norm": 1.76294195099967, + "language_loss": 0.83693898, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.85822797, + "num_input_tokens_seen": 210555515, + "step": 9766, + "time_per_iteration": 2.7466630935668945 + }, + { + "auxiliary_loss_clip": 0.01111118, + "auxiliary_loss_mlp": 0.01037087, + "balance_loss_clip": 1.04195547, + "balance_loss_mlp": 1.02393794, + "epoch": 0.5872238088080565, + "flos": 26212922346240.0, + "grad_norm": 1.5937380342892973, + "language_loss": 0.6981988, + "learning_rate": 1.536319396136257e-06, + "loss": 0.71968091, + "num_input_tokens_seen": 210575000, + "step": 9767, + "time_per_iteration": 2.6740965843200684 + }, + { + "auxiliary_loss_clip": 0.0110439, + "auxiliary_loss_mlp": 0.0077267, + "balance_loss_clip": 1.04049277, + "balance_loss_mlp": 1.00008368, + "epoch": 0.5872839320607245, + "flos": 30665198995200.0, + "grad_norm": 2.1136221747138095, + "language_loss": 0.6360091, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.65477967, + "num_input_tokens_seen": 210595185, + "step": 9768, + "time_per_iteration": 2.7575178146362305 + }, + { + "auxiliary_loss_clip": 0.01037412, + "auxiliary_loss_mlp": 0.00751529, + "balance_loss_clip": 1.01318574, + "balance_loss_mlp": 0.99987358, + "epoch": 0.5873440553133924, + "flos": 60303570871680.0, + "grad_norm": 0.7223687744232398, + "language_loss": 0.53866827, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.55655766, + "num_input_tokens_seen": 210653210, + "step": 9769, + "time_per_iteration": 3.1609816551208496 + }, + { + "auxiliary_loss_clip": 0.01084812, + "auxiliary_loss_mlp": 0.01042021, + "balance_loss_clip": 1.03922772, + "balance_loss_mlp": 1.02880073, + "epoch": 0.5874041785660604, + "flos": 21539292134400.0, + "grad_norm": 1.4066762666706196, + "language_loss": 0.70984697, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.73111528, + "num_input_tokens_seen": 210673750, + "step": 9770, + "time_per_iteration": 2.7312963008880615 + }, + { + "auxiliary_loss_clip": 0.01073411, + "auxiliary_loss_mlp": 0.01035898, + "balance_loss_clip": 1.0386194, + "balance_loss_mlp": 1.02226591, + "epoch": 0.5874643018187284, + "flos": 24388947671040.0, + "grad_norm": 1.7359405395861034, + "language_loss": 0.681171, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.70226407, + "num_input_tokens_seen": 210692960, + "step": 9771, + "time_per_iteration": 2.7748193740844727 + }, + { + "auxiliary_loss_clip": 0.01072231, + "auxiliary_loss_mlp": 0.01041976, + "balance_loss_clip": 1.03671551, + "balance_loss_mlp": 1.02564466, + "epoch": 0.5875244250713964, + "flos": 28147717457280.0, + "grad_norm": 1.5217173137024316, + "language_loss": 0.661672, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.68281412, + "num_input_tokens_seen": 210714040, + "step": 9772, + "time_per_iteration": 2.841942071914673 + }, + { + "auxiliary_loss_clip": 0.01124952, + "auxiliary_loss_mlp": 0.01044932, + "balance_loss_clip": 1.045434, + "balance_loss_mlp": 1.03047216, + "epoch": 0.5875845483240644, + "flos": 25812410722560.0, + "grad_norm": 1.4922365157265927, + "language_loss": 0.74535245, + "learning_rate": 1.534046611017519e-06, + "loss": 0.76705128, + "num_input_tokens_seen": 210733710, + "step": 9773, + "time_per_iteration": 2.6284871101379395 + }, + { + "auxiliary_loss_clip": 0.01087977, + "auxiliary_loss_mlp": 0.0104147, + "balance_loss_clip": 1.04292727, + "balance_loss_mlp": 1.02706945, + "epoch": 0.5876446715767323, + "flos": 26906572863360.0, + "grad_norm": 1.947316209295704, + "language_loss": 0.52915788, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.55045235, + "num_input_tokens_seen": 210753580, + "step": 9774, + "time_per_iteration": 2.7891509532928467 + }, + { + "auxiliary_loss_clip": 0.01113387, + "auxiliary_loss_mlp": 0.01039669, + "balance_loss_clip": 1.04437912, + "balance_loss_mlp": 1.02526808, + "epoch": 0.5877047948294003, + "flos": 36684832579200.0, + "grad_norm": 2.3607783176851824, + "language_loss": 0.64713901, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.66866958, + "num_input_tokens_seen": 210773495, + "step": 9775, + "time_per_iteration": 2.771148920059204 + }, + { + "auxiliary_loss_clip": 0.01105141, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.04033184, + "balance_loss_mlp": 1.02344131, + "epoch": 0.5877649180820682, + "flos": 26724721282560.0, + "grad_norm": 1.636403069820384, + "language_loss": 0.73844278, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.75986409, + "num_input_tokens_seen": 210793645, + "step": 9776, + "time_per_iteration": 2.690695285797119 + }, + { + "auxiliary_loss_clip": 0.01119488, + "auxiliary_loss_mlp": 0.01039689, + "balance_loss_clip": 1.0420121, + "balance_loss_mlp": 1.0264504, + "epoch": 0.5878250413347362, + "flos": 21032197879680.0, + "grad_norm": 1.5421458331894318, + "language_loss": 0.73914766, + "learning_rate": 1.532531774126821e-06, + "loss": 0.76073945, + "num_input_tokens_seen": 210813415, + "step": 9777, + "time_per_iteration": 2.6284945011138916 + }, + { + "auxiliary_loss_clip": 0.01083567, + "auxiliary_loss_mlp": 0.01038914, + "balance_loss_clip": 1.04067087, + "balance_loss_mlp": 1.02573574, + "epoch": 0.5878851645874041, + "flos": 25484259047040.0, + "grad_norm": 1.8412101918270336, + "language_loss": 0.74325955, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.76448435, + "num_input_tokens_seen": 210833850, + "step": 9778, + "time_per_iteration": 2.7255308628082275 + }, + { + "auxiliary_loss_clip": 0.01072977, + "auxiliary_loss_mlp": 0.01040231, + "balance_loss_clip": 1.03567362, + "balance_loss_mlp": 1.0246737, + "epoch": 0.5879452878400722, + "flos": 23769129559680.0, + "grad_norm": 1.8337946976743424, + "language_loss": 0.70162809, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.72276014, + "num_input_tokens_seen": 210853115, + "step": 9779, + "time_per_iteration": 2.715529680252075 + }, + { + "auxiliary_loss_clip": 0.01121839, + "auxiliary_loss_mlp": 0.00771635, + "balance_loss_clip": 1.04201186, + "balance_loss_mlp": 1.00009024, + "epoch": 0.5880054110927401, + "flos": 17824513530240.0, + "grad_norm": 2.202026224542238, + "language_loss": 0.66388619, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.68282098, + "num_input_tokens_seen": 210872090, + "step": 9780, + "time_per_iteration": 2.628286361694336 + }, + { + "auxiliary_loss_clip": 0.01091434, + "auxiliary_loss_mlp": 0.01038369, + "balance_loss_clip": 1.04466867, + "balance_loss_mlp": 1.02406991, + "epoch": 0.5880655343454081, + "flos": 19463404400640.0, + "grad_norm": 1.8753551233884636, + "language_loss": 0.72474289, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.74604088, + "num_input_tokens_seen": 210888490, + "step": 9781, + "time_per_iteration": 4.2804930210113525 + }, + { + "auxiliary_loss_clip": 0.01092565, + "auxiliary_loss_mlp": 0.00771373, + "balance_loss_clip": 1.04225159, + "balance_loss_mlp": 1.00005984, + "epoch": 0.588125657598076, + "flos": 21397588980480.0, + "grad_norm": 1.5003005055277707, + "language_loss": 0.70744377, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.72608316, + "num_input_tokens_seen": 210908220, + "step": 9782, + "time_per_iteration": 4.278367519378662 + }, + { + "auxiliary_loss_clip": 0.01105689, + "auxiliary_loss_mlp": 0.01041859, + "balance_loss_clip": 1.03929675, + "balance_loss_mlp": 1.02716005, + "epoch": 0.588185780850744, + "flos": 16034653797120.0, + "grad_norm": 2.093864455539888, + "language_loss": 0.70450729, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.72598279, + "num_input_tokens_seen": 210923945, + "step": 9783, + "time_per_iteration": 4.194809436798096 + }, + { + "auxiliary_loss_clip": 0.01085302, + "auxiliary_loss_mlp": 0.01036158, + "balance_loss_clip": 1.04440248, + "balance_loss_mlp": 1.02117932, + "epoch": 0.588245904103412, + "flos": 23728226947200.0, + "grad_norm": 2.1947417455944653, + "language_loss": 0.69071788, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.71193242, + "num_input_tokens_seen": 210941955, + "step": 9784, + "time_per_iteration": 2.7187066078186035 + }, + { + "auxiliary_loss_clip": 0.01072816, + "auxiliary_loss_mlp": 0.01034537, + "balance_loss_clip": 1.03863633, + "balance_loss_mlp": 1.02094078, + "epoch": 0.58830602735608, + "flos": 33802534558080.0, + "grad_norm": 24.973572945721454, + "language_loss": 0.69460654, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.71568, + "num_input_tokens_seen": 210963105, + "step": 9785, + "time_per_iteration": 2.878143548965454 + }, + { + "auxiliary_loss_clip": 0.01107899, + "auxiliary_loss_mlp": 0.01029541, + "balance_loss_clip": 1.04268789, + "balance_loss_mlp": 1.01706553, + "epoch": 0.588366150608748, + "flos": 17090714586240.0, + "grad_norm": 1.9508012380203874, + "language_loss": 0.77078086, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.79215527, + "num_input_tokens_seen": 210978720, + "step": 9786, + "time_per_iteration": 2.6095898151397705 + }, + { + "auxiliary_loss_clip": 0.01101968, + "auxiliary_loss_mlp": 0.01029534, + "balance_loss_clip": 1.04132032, + "balance_loss_mlp": 1.01609302, + "epoch": 0.5884262738614159, + "flos": 22127186033280.0, + "grad_norm": 1.4529797212559594, + "language_loss": 0.79197991, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.81329501, + "num_input_tokens_seen": 210998750, + "step": 9787, + "time_per_iteration": 4.223788261413574 + }, + { + "auxiliary_loss_clip": 0.01081001, + "auxiliary_loss_mlp": 0.01036004, + "balance_loss_clip": 1.04142892, + "balance_loss_mlp": 1.02382052, + "epoch": 0.5884863971140839, + "flos": 21031838743680.0, + "grad_norm": 2.5032495709629186, + "language_loss": 0.6604932, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.68166327, + "num_input_tokens_seen": 211017550, + "step": 9788, + "time_per_iteration": 2.770289659500122 + }, + { + "auxiliary_loss_clip": 0.01089935, + "auxiliary_loss_mlp": 0.01038801, + "balance_loss_clip": 1.04031539, + "balance_loss_mlp": 1.02356613, + "epoch": 0.5885465203667518, + "flos": 23805112008960.0, + "grad_norm": 2.4491161231159495, + "language_loss": 0.80353689, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.82482433, + "num_input_tokens_seen": 211034135, + "step": 9789, + "time_per_iteration": 2.7129344940185547 + }, + { + "auxiliary_loss_clip": 0.01088956, + "auxiliary_loss_mlp": 0.00771498, + "balance_loss_clip": 1.04013371, + "balance_loss_mlp": 0.99999416, + "epoch": 0.5886066436194198, + "flos": 18880574319360.0, + "grad_norm": 1.8752240370073765, + "language_loss": 0.7074194, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.72602391, + "num_input_tokens_seen": 211053850, + "step": 9790, + "time_per_iteration": 2.7234628200531006 + }, + { + "auxiliary_loss_clip": 0.01082257, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.04143536, + "balance_loss_mlp": 1.02096915, + "epoch": 0.5886667668720877, + "flos": 24790141653120.0, + "grad_norm": 1.7147674530197825, + "language_loss": 0.83315635, + "learning_rate": 1.527232084570895e-06, + "loss": 0.85432208, + "num_input_tokens_seen": 211072165, + "step": 9791, + "time_per_iteration": 2.711566686630249 + }, + { + "auxiliary_loss_clip": 0.0110606, + "auxiliary_loss_mlp": 0.01044469, + "balance_loss_clip": 1.04232645, + "balance_loss_mlp": 1.0296278, + "epoch": 0.5887268901247558, + "flos": 21614381516160.0, + "grad_norm": 1.5737373299770356, + "language_loss": 0.7653091, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.78681433, + "num_input_tokens_seen": 211089630, + "step": 9792, + "time_per_iteration": 2.634300947189331 + }, + { + "auxiliary_loss_clip": 0.0105802, + "auxiliary_loss_mlp": 0.01047083, + "balance_loss_clip": 1.03111851, + "balance_loss_mlp": 1.03123975, + "epoch": 0.5887870133774237, + "flos": 20481722974080.0, + "grad_norm": 2.6665803472381935, + "language_loss": 0.68956935, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.7106204, + "num_input_tokens_seen": 211106120, + "step": 9793, + "time_per_iteration": 2.7154650688171387 + }, + { + "auxiliary_loss_clip": 0.01116924, + "auxiliary_loss_mlp": 0.01033012, + "balance_loss_clip": 1.04252207, + "balance_loss_mlp": 1.01923692, + "epoch": 0.5888471366300917, + "flos": 19206283870080.0, + "grad_norm": 1.9062241907170245, + "language_loss": 0.60218275, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.62368208, + "num_input_tokens_seen": 211122450, + "step": 9794, + "time_per_iteration": 2.584721088409424 + }, + { + "auxiliary_loss_clip": 0.01087928, + "auxiliary_loss_mlp": 0.01038963, + "balance_loss_clip": 1.04045248, + "balance_loss_mlp": 1.0248251, + "epoch": 0.5889072598827596, + "flos": 19972904866560.0, + "grad_norm": 1.5367259931320274, + "language_loss": 0.65087652, + "learning_rate": 1.525718531219257e-06, + "loss": 0.67214543, + "num_input_tokens_seen": 211141765, + "step": 9795, + "time_per_iteration": 2.6578221321105957 + }, + { + "auxiliary_loss_clip": 0.01080946, + "auxiliary_loss_mlp": 0.01041808, + "balance_loss_clip": 1.03947282, + "balance_loss_mlp": 1.02942848, + "epoch": 0.5889673831354276, + "flos": 20741249715840.0, + "grad_norm": 1.5439612087123358, + "language_loss": 0.74185097, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.76307845, + "num_input_tokens_seen": 211160475, + "step": 9796, + "time_per_iteration": 2.7106168270111084 + }, + { + "auxiliary_loss_clip": 0.01094109, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.03922713, + "balance_loss_mlp": 1.02194858, + "epoch": 0.5890275063880956, + "flos": 25300935008640.0, + "grad_norm": 1.398085740010997, + "language_loss": 0.82796204, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.84925568, + "num_input_tokens_seen": 211180480, + "step": 9797, + "time_per_iteration": 2.7226924896240234 + }, + { + "auxiliary_loss_clip": 0.01089451, + "auxiliary_loss_mlp": 0.01032137, + "balance_loss_clip": 1.03643203, + "balance_loss_mlp": 1.01857686, + "epoch": 0.5890876296407636, + "flos": 11765377964160.0, + "grad_norm": 2.441249596431382, + "language_loss": 0.792216, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.81343186, + "num_input_tokens_seen": 211198000, + "step": 9798, + "time_per_iteration": 2.661177396774292 + }, + { + "auxiliary_loss_clip": 0.01116784, + "auxiliary_loss_mlp": 0.01033567, + "balance_loss_clip": 1.04251814, + "balance_loss_mlp": 1.02085924, + "epoch": 0.5891477528934316, + "flos": 13589460380160.0, + "grad_norm": 4.031600606780585, + "language_loss": 0.74594498, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.76744843, + "num_input_tokens_seen": 211214765, + "step": 9799, + "time_per_iteration": 2.597598075866699 + }, + { + "auxiliary_loss_clip": 0.0108372, + "auxiliary_loss_mlp": 0.01033117, + "balance_loss_clip": 1.03822446, + "balance_loss_mlp": 1.01848447, + "epoch": 0.5892078761460995, + "flos": 15049193189760.0, + "grad_norm": 1.9844034954522878, + "language_loss": 0.7639305, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.78509891, + "num_input_tokens_seen": 211232335, + "step": 9800, + "time_per_iteration": 2.6959407329559326 + }, + { + "auxiliary_loss_clip": 0.01068975, + "auxiliary_loss_mlp": 0.01043567, + "balance_loss_clip": 1.03649104, + "balance_loss_mlp": 1.02863002, + "epoch": 0.5892679993987675, + "flos": 15778215624960.0, + "grad_norm": 2.091540130493628, + "language_loss": 0.78984964, + "learning_rate": 1.523448741022722e-06, + "loss": 0.81097507, + "num_input_tokens_seen": 211249985, + "step": 9801, + "time_per_iteration": 2.7329885959625244 + }, + { + "auxiliary_loss_clip": 0.01084752, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.04138374, + "balance_loss_mlp": 1.01958394, + "epoch": 0.5893281226514354, + "flos": 25265203954560.0, + "grad_norm": 1.6724920210450809, + "language_loss": 0.66076094, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.68194282, + "num_input_tokens_seen": 211268425, + "step": 9802, + "time_per_iteration": 2.9191880226135254 + }, + { + "auxiliary_loss_clip": 0.01106682, + "auxiliary_loss_mlp": 0.01030935, + "balance_loss_clip": 1.04172468, + "balance_loss_mlp": 1.01782823, + "epoch": 0.5893882459041034, + "flos": 19458232842240.0, + "grad_norm": 1.576394450599596, + "language_loss": 0.78281248, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.80418861, + "num_input_tokens_seen": 211286680, + "step": 9803, + "time_per_iteration": 2.6395671367645264 + }, + { + "auxiliary_loss_clip": 0.01110111, + "auxiliary_loss_mlp": 0.01036458, + "balance_loss_clip": 1.04354095, + "balance_loss_mlp": 1.0227071, + "epoch": 0.5894483691567713, + "flos": 20634056553600.0, + "grad_norm": 1.421228889325947, + "language_loss": 0.73083454, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.75230026, + "num_input_tokens_seen": 211307700, + "step": 9804, + "time_per_iteration": 2.7451324462890625 + }, + { + "auxiliary_loss_clip": 0.01091882, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.04156733, + "balance_loss_mlp": 1.01769745, + "epoch": 0.5895084924094394, + "flos": 17778223877760.0, + "grad_norm": 1.6374324136970364, + "language_loss": 0.74669635, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.76792479, + "num_input_tokens_seen": 211324835, + "step": 9805, + "time_per_iteration": 2.6853296756744385 + }, + { + "auxiliary_loss_clip": 0.01113863, + "auxiliary_loss_mlp": 0.00772031, + "balance_loss_clip": 1.04102564, + "balance_loss_mlp": 1.00008976, + "epoch": 0.5895686156621073, + "flos": 20121072468480.0, + "grad_norm": 1.677515475610003, + "language_loss": 0.77973545, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.79859436, + "num_input_tokens_seen": 211344130, + "step": 9806, + "time_per_iteration": 2.6450774669647217 + }, + { + "auxiliary_loss_clip": 0.01117555, + "auxiliary_loss_mlp": 0.01031595, + "balance_loss_clip": 1.0410825, + "balance_loss_mlp": 1.01801682, + "epoch": 0.5896287389147753, + "flos": 20850058990080.0, + "grad_norm": 1.7162663032269994, + "language_loss": 0.76973009, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.79122162, + "num_input_tokens_seen": 211362915, + "step": 9807, + "time_per_iteration": 2.5557191371917725 + }, + { + "auxiliary_loss_clip": 0.01115136, + "auxiliary_loss_mlp": 0.01029659, + "balance_loss_clip": 1.04593015, + "balance_loss_mlp": 1.01563966, + "epoch": 0.5896888621674432, + "flos": 14537897043840.0, + "grad_norm": 1.9630689597763404, + "language_loss": 0.74407029, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.76551819, + "num_input_tokens_seen": 211380700, + "step": 9808, + "time_per_iteration": 2.649773359298706 + }, + { + "auxiliary_loss_clip": 0.01069554, + "auxiliary_loss_mlp": 0.01030007, + "balance_loss_clip": 1.03687882, + "balance_loss_mlp": 1.01540375, + "epoch": 0.5897489854201112, + "flos": 20886759711360.0, + "grad_norm": 2.8224307817464194, + "language_loss": 0.72173887, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.74273449, + "num_input_tokens_seen": 211400095, + "step": 9809, + "time_per_iteration": 2.8795154094696045 + }, + { + "auxiliary_loss_clip": 0.01097105, + "auxiliary_loss_mlp": 0.01035609, + "balance_loss_clip": 1.03962231, + "balance_loss_mlp": 1.02155459, + "epoch": 0.5898091086727792, + "flos": 20011149872640.0, + "grad_norm": 1.9654509433248524, + "language_loss": 0.82251418, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.84384131, + "num_input_tokens_seen": 211417810, + "step": 9810, + "time_per_iteration": 2.7300972938537598 + }, + { + "auxiliary_loss_clip": 0.01108515, + "auxiliary_loss_mlp": 0.01035386, + "balance_loss_clip": 1.04266787, + "balance_loss_mlp": 1.02213001, + "epoch": 0.5898692319254472, + "flos": 16253242012800.0, + "grad_norm": 2.8325616643541043, + "language_loss": 0.80945516, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.83089411, + "num_input_tokens_seen": 211436020, + "step": 9811, + "time_per_iteration": 2.6033973693847656 + }, + { + "auxiliary_loss_clip": 0.01114433, + "auxiliary_loss_mlp": 0.01031742, + "balance_loss_clip": 1.04528522, + "balance_loss_mlp": 1.01666236, + "epoch": 0.5899293551781152, + "flos": 20448541785600.0, + "grad_norm": 2.9067717634400174, + "language_loss": 0.77026772, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.79172945, + "num_input_tokens_seen": 211454335, + "step": 9812, + "time_per_iteration": 2.6283788681030273 + }, + { + "auxiliary_loss_clip": 0.01085179, + "auxiliary_loss_mlp": 0.01030145, + "balance_loss_clip": 1.04126084, + "balance_loss_mlp": 1.01832533, + "epoch": 0.5899894784307831, + "flos": 13881701433600.0, + "grad_norm": 2.0160065726104426, + "language_loss": 0.70596051, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.72711378, + "num_input_tokens_seen": 211472775, + "step": 9813, + "time_per_iteration": 2.7235190868377686 + }, + { + "auxiliary_loss_clip": 0.01094818, + "auxiliary_loss_mlp": 0.01038761, + "balance_loss_clip": 1.04338694, + "balance_loss_mlp": 1.02489126, + "epoch": 0.5900496016834511, + "flos": 20083797129600.0, + "grad_norm": 10.075807478503481, + "language_loss": 0.72172022, + "learning_rate": 1.518533098148494e-06, + "loss": 0.74305606, + "num_input_tokens_seen": 211492195, + "step": 9814, + "time_per_iteration": 2.7245450019836426 + }, + { + "auxiliary_loss_clip": 0.01093647, + "auxiliary_loss_mlp": 0.01037117, + "balance_loss_clip": 1.04272461, + "balance_loss_mlp": 1.02348518, + "epoch": 0.590109724936119, + "flos": 20259148348800.0, + "grad_norm": 1.7959189057174523, + "language_loss": 0.78608483, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.80739248, + "num_input_tokens_seen": 211510220, + "step": 9815, + "time_per_iteration": 2.624587297439575 + }, + { + "auxiliary_loss_clip": 0.0109595, + "auxiliary_loss_mlp": 0.0077231, + "balance_loss_clip": 1.04222631, + "balance_loss_mlp": 1.00011277, + "epoch": 0.590169848188787, + "flos": 24235069806720.0, + "grad_norm": 1.934955250523914, + "language_loss": 0.75605524, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.77473778, + "num_input_tokens_seen": 211526260, + "step": 9816, + "time_per_iteration": 2.805889844894409 + }, + { + "auxiliary_loss_clip": 0.01120987, + "auxiliary_loss_mlp": 0.01039982, + "balance_loss_clip": 1.04457593, + "balance_loss_mlp": 1.02636814, + "epoch": 0.590229971441455, + "flos": 17784724239360.0, + "grad_norm": 1.761702620923252, + "language_loss": 0.81330854, + "learning_rate": 1.517399156051309e-06, + "loss": 0.8349182, + "num_input_tokens_seen": 211542890, + "step": 9817, + "time_per_iteration": 2.5694470405578613 + }, + { + "auxiliary_loss_clip": 0.0106411, + "auxiliary_loss_mlp": 0.01046757, + "balance_loss_clip": 1.03651428, + "balance_loss_mlp": 1.03204691, + "epoch": 0.590290094694123, + "flos": 22236893147520.0, + "grad_norm": 1.6227389463072333, + "language_loss": 0.7634322, + "learning_rate": 1.517021211933682e-06, + "loss": 0.78454089, + "num_input_tokens_seen": 211562685, + "step": 9818, + "time_per_iteration": 2.7369279861450195 + }, + { + "auxiliary_loss_clip": 0.0108334, + "auxiliary_loss_mlp": 0.01037737, + "balance_loss_clip": 1.04248178, + "balance_loss_mlp": 1.02451682, + "epoch": 0.5903502179467909, + "flos": 19098623831040.0, + "grad_norm": 2.2508579930127333, + "language_loss": 0.66751575, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.68872648, + "num_input_tokens_seen": 211579960, + "step": 9819, + "time_per_iteration": 2.683518648147583 + }, + { + "auxiliary_loss_clip": 0.01121974, + "auxiliary_loss_mlp": 0.01034215, + "balance_loss_clip": 1.04451931, + "balance_loss_mlp": 1.02100039, + "epoch": 0.5904103411994589, + "flos": 24235500769920.0, + "grad_norm": 1.5861802995785013, + "language_loss": 0.78221858, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.80378044, + "num_input_tokens_seen": 211599310, + "step": 9820, + "time_per_iteration": 2.67228627204895 + }, + { + "auxiliary_loss_clip": 0.01010393, + "auxiliary_loss_mlp": 0.01023264, + "balance_loss_clip": 1.01880515, + "balance_loss_mlp": 1.02123773, + "epoch": 0.5904704644521268, + "flos": 64876613045760.0, + "grad_norm": 0.9671648573222682, + "language_loss": 0.65189892, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.67223543, + "num_input_tokens_seen": 211658790, + "step": 9821, + "time_per_iteration": 4.79486083984375 + }, + { + "auxiliary_loss_clip": 0.01079974, + "auxiliary_loss_mlp": 0.01038386, + "balance_loss_clip": 1.04072082, + "balance_loss_mlp": 1.0247488, + "epoch": 0.5905305877047948, + "flos": 19609991804160.0, + "grad_norm": 1.8549459171527238, + "language_loss": 0.61307114, + "learning_rate": 1.515509618752521e-06, + "loss": 0.63425475, + "num_input_tokens_seen": 211677240, + "step": 9822, + "time_per_iteration": 5.756153345108032 + }, + { + "auxiliary_loss_clip": 0.01122858, + "auxiliary_loss_mlp": 0.01041517, + "balance_loss_clip": 1.04382062, + "balance_loss_mlp": 1.02788556, + "epoch": 0.5905907109574628, + "flos": 18989634988800.0, + "grad_norm": 2.151764899841445, + "language_loss": 0.82442653, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.84607029, + "num_input_tokens_seen": 211695485, + "step": 9823, + "time_per_iteration": 2.6660759449005127 + }, + { + "auxiliary_loss_clip": 0.01098497, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.04229546, + "balance_loss_mlp": 1.0183413, + "epoch": 0.5906508342101308, + "flos": 22200407907840.0, + "grad_norm": 1.984006151976339, + "language_loss": 0.72755098, + "learning_rate": 1.514753932336165e-06, + "loss": 0.74885976, + "num_input_tokens_seen": 211713090, + "step": 9824, + "time_per_iteration": 2.679081439971924 + }, + { + "auxiliary_loss_clip": 0.01095276, + "auxiliary_loss_mlp": 0.00772718, + "balance_loss_clip": 1.04067087, + "balance_loss_mlp": 1.00008035, + "epoch": 0.5907109574627988, + "flos": 20886687884160.0, + "grad_norm": 2.158910240340413, + "language_loss": 0.82870126, + "learning_rate": 1.514376116721693e-06, + "loss": 0.84738123, + "num_input_tokens_seen": 211732510, + "step": 9825, + "time_per_iteration": 2.719106674194336 + }, + { + "auxiliary_loss_clip": 0.0110445, + "auxiliary_loss_mlp": 0.01034274, + "balance_loss_clip": 1.04120886, + "balance_loss_mlp": 1.02252591, + "epoch": 0.5907710807154667, + "flos": 21506649649920.0, + "grad_norm": 1.7542204465206233, + "language_loss": 0.76779485, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.78918207, + "num_input_tokens_seen": 211748695, + "step": 9826, + "time_per_iteration": 4.231219291687012 + }, + { + "auxiliary_loss_clip": 0.01094981, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.04213846, + "balance_loss_mlp": 1.01828933, + "epoch": 0.5908312039681347, + "flos": 22018376759040.0, + "grad_norm": 1.9593281360323977, + "language_loss": 0.72049826, + "learning_rate": 1.513620540751793e-06, + "loss": 0.74176061, + "num_input_tokens_seen": 211768545, + "step": 9827, + "time_per_iteration": 2.654449462890625 + }, + { + "auxiliary_loss_clip": 0.01073518, + "auxiliary_loss_mlp": 0.010335, + "balance_loss_clip": 1.03849053, + "balance_loss_mlp": 1.02111387, + "epoch": 0.5908913272208026, + "flos": 18479523991680.0, + "grad_norm": 1.6640399072146284, + "language_loss": 0.79552126, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.8165915, + "num_input_tokens_seen": 211786665, + "step": 9828, + "time_per_iteration": 2.8060965538024902 + }, + { + "auxiliary_loss_clip": 0.01065495, + "auxiliary_loss_mlp": 0.01038324, + "balance_loss_clip": 1.04091191, + "balance_loss_mlp": 1.02510321, + "epoch": 0.5909514504734706, + "flos": 12312189682560.0, + "grad_norm": 1.8739746775685384, + "language_loss": 0.88231647, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.90335464, + "num_input_tokens_seen": 211801215, + "step": 9829, + "time_per_iteration": 2.819425106048584 + }, + { + "auxiliary_loss_clip": 0.01023107, + "auxiliary_loss_mlp": 0.01007549, + "balance_loss_clip": 1.01821566, + "balance_loss_mlp": 1.00632119, + "epoch": 0.5910115737261386, + "flos": 70213262451840.0, + "grad_norm": 0.7698473487867592, + "language_loss": 0.57849222, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.59879881, + "num_input_tokens_seen": 211857005, + "step": 9830, + "time_per_iteration": 3.1567955017089844 + }, + { + "auxiliary_loss_clip": 0.01114755, + "auxiliary_loss_mlp": 0.00772402, + "balance_loss_clip": 1.04340577, + "balance_loss_mlp": 1.0002377, + "epoch": 0.5910716969788066, + "flos": 22017766227840.0, + "grad_norm": 2.1363303387386723, + "language_loss": 0.75768107, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.77655268, + "num_input_tokens_seen": 211876675, + "step": 9831, + "time_per_iteration": 2.7048380374908447 + }, + { + "auxiliary_loss_clip": 0.01089263, + "auxiliary_loss_mlp": 0.01034604, + "balance_loss_clip": 1.04322839, + "balance_loss_mlp": 1.02142549, + "epoch": 0.5911318202314745, + "flos": 21251648021760.0, + "grad_norm": 1.6552693507472749, + "language_loss": 0.77847427, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.79971302, + "num_input_tokens_seen": 211895725, + "step": 9832, + "time_per_iteration": 2.716529369354248 + }, + { + "auxiliary_loss_clip": 0.01105775, + "auxiliary_loss_mlp": 0.01031029, + "balance_loss_clip": 1.04159164, + "balance_loss_mlp": 1.01780295, + "epoch": 0.5911919434841425, + "flos": 17821604528640.0, + "grad_norm": 1.9563179904860062, + "language_loss": 0.83245647, + "learning_rate": 1.511354255945847e-06, + "loss": 0.8538245, + "num_input_tokens_seen": 211913860, + "step": 9833, + "time_per_iteration": 2.641958236694336 + }, + { + "auxiliary_loss_clip": 0.0110771, + "auxiliary_loss_mlp": 0.01038041, + "balance_loss_clip": 1.04046118, + "balance_loss_mlp": 1.02435589, + "epoch": 0.5912520667368104, + "flos": 20374781207040.0, + "grad_norm": 1.5336556134798032, + "language_loss": 0.74267918, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.76413667, + "num_input_tokens_seen": 211932880, + "step": 9834, + "time_per_iteration": 2.628453016281128 + }, + { + "auxiliary_loss_clip": 0.01119479, + "auxiliary_loss_mlp": 0.01034016, + "balance_loss_clip": 1.04244208, + "balance_loss_mlp": 1.02121329, + "epoch": 0.5913121899894784, + "flos": 17930557457280.0, + "grad_norm": 2.771797648904754, + "language_loss": 0.78298235, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.80451727, + "num_input_tokens_seen": 211948625, + "step": 9835, + "time_per_iteration": 2.5689404010772705 + }, + { + "auxiliary_loss_clip": 0.01095655, + "auxiliary_loss_mlp": 0.01036768, + "balance_loss_clip": 1.03806067, + "balance_loss_mlp": 1.0237323, + "epoch": 0.5913723132421465, + "flos": 22126934638080.0, + "grad_norm": 1.8733256786117318, + "language_loss": 0.73799431, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.75931853, + "num_input_tokens_seen": 211965355, + "step": 9836, + "time_per_iteration": 2.695117712020874 + }, + { + "auxiliary_loss_clip": 0.01083057, + "auxiliary_loss_mlp": 0.01035766, + "balance_loss_clip": 1.03943884, + "balance_loss_mlp": 1.02149653, + "epoch": 0.5914324364948144, + "flos": 15697918771200.0, + "grad_norm": 1.9392468028622023, + "language_loss": 0.82138634, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.84257448, + "num_input_tokens_seen": 211982245, + "step": 9837, + "time_per_iteration": 2.6912343502044678 + }, + { + "auxiliary_loss_clip": 0.01078463, + "auxiliary_loss_mlp": 0.01035071, + "balance_loss_clip": 1.03632522, + "balance_loss_mlp": 1.02026534, + "epoch": 0.5914925597474824, + "flos": 22747327367040.0, + "grad_norm": 2.27741138864597, + "language_loss": 0.79637218, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.81750751, + "num_input_tokens_seen": 212000250, + "step": 9838, + "time_per_iteration": 2.6449244022369385 + }, + { + "auxiliary_loss_clip": 0.010718, + "auxiliary_loss_mlp": 0.01039396, + "balance_loss_clip": 1.04010475, + "balance_loss_mlp": 1.02605057, + "epoch": 0.5915526830001503, + "flos": 18292788161280.0, + "grad_norm": 1.9685283368258655, + "language_loss": 0.69672906, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.71784103, + "num_input_tokens_seen": 212017505, + "step": 9839, + "time_per_iteration": 2.76196026802063 + }, + { + "auxiliary_loss_clip": 0.01093291, + "auxiliary_loss_mlp": 0.01043789, + "balance_loss_clip": 1.04008913, + "balance_loss_mlp": 1.0308131, + "epoch": 0.5916128062528183, + "flos": 17019072910080.0, + "grad_norm": 2.7566603972322943, + "language_loss": 0.65802211, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.67939293, + "num_input_tokens_seen": 212034595, + "step": 9840, + "time_per_iteration": 2.647179365158081 + }, + { + "auxiliary_loss_clip": 0.01095524, + "auxiliary_loss_mlp": 0.01030956, + "balance_loss_clip": 1.04105091, + "balance_loss_mlp": 1.0170027, + "epoch": 0.5916729295054862, + "flos": 24754231031040.0, + "grad_norm": 1.7835451737672352, + "language_loss": 0.81441593, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.83568072, + "num_input_tokens_seen": 212055775, + "step": 9841, + "time_per_iteration": 2.693742036819458 + }, + { + "auxiliary_loss_clip": 0.01090733, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.04020691, + "balance_loss_mlp": 1.01772296, + "epoch": 0.5917330527581542, + "flos": 15958199698560.0, + "grad_norm": 1.7111294758223268, + "language_loss": 0.69152761, + "learning_rate": 1.507956080444291e-06, + "loss": 0.71273863, + "num_input_tokens_seen": 212074000, + "step": 9842, + "time_per_iteration": 2.6797986030578613 + }, + { + "auxiliary_loss_clip": 0.01093141, + "auxiliary_loss_mlp": 0.0103715, + "balance_loss_clip": 1.03811431, + "balance_loss_mlp": 1.02367949, + "epoch": 0.5917931760108222, + "flos": 23800730549760.0, + "grad_norm": 3.159007391867861, + "language_loss": 0.83409858, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.85540152, + "num_input_tokens_seen": 212091415, + "step": 9843, + "time_per_iteration": 2.7194371223449707 + }, + { + "auxiliary_loss_clip": 0.01090728, + "auxiliary_loss_mlp": 0.01031987, + "balance_loss_clip": 1.03646731, + "balance_loss_mlp": 1.01762211, + "epoch": 0.5918532992634902, + "flos": 23249609199360.0, + "grad_norm": 5.395713728013965, + "language_loss": 0.81329596, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.83452308, + "num_input_tokens_seen": 212105255, + "step": 9844, + "time_per_iteration": 2.7136270999908447 + }, + { + "auxiliary_loss_clip": 0.01068008, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.03874016, + "balance_loss_mlp": 1.01633775, + "epoch": 0.5919134225161581, + "flos": 19499853726720.0, + "grad_norm": 1.8542895008446525, + "language_loss": 0.74591327, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.7668947, + "num_input_tokens_seen": 212122765, + "step": 9845, + "time_per_iteration": 2.781914710998535 + }, + { + "auxiliary_loss_clip": 0.01077949, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.03821266, + "balance_loss_mlp": 1.01837575, + "epoch": 0.5919735457688261, + "flos": 38800940567040.0, + "grad_norm": 1.69458434045341, + "language_loss": 0.63799906, + "learning_rate": 1.506446264718213e-06, + "loss": 0.65912199, + "num_input_tokens_seen": 212143960, + "step": 9846, + "time_per_iteration": 2.8427982330322266 + }, + { + "auxiliary_loss_clip": 0.01076538, + "auxiliary_loss_mlp": 0.00769552, + "balance_loss_clip": 1.03801441, + "balance_loss_mlp": 1.00004482, + "epoch": 0.592033669021494, + "flos": 22163994495360.0, + "grad_norm": 1.809865828874733, + "language_loss": 0.76013452, + "learning_rate": 1.506068857539931e-06, + "loss": 0.77859539, + "num_input_tokens_seen": 212162005, + "step": 9847, + "time_per_iteration": 2.737806797027588 + }, + { + "auxiliary_loss_clip": 0.01092495, + "auxiliary_loss_mlp": 0.01031315, + "balance_loss_clip": 1.03829622, + "balance_loss_mlp": 1.01720047, + "epoch": 0.592093792274162, + "flos": 22710985781760.0, + "grad_norm": 1.7217593328479819, + "language_loss": 0.62444723, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.64568532, + "num_input_tokens_seen": 212181635, + "step": 9848, + "time_per_iteration": 2.768158197402954 + }, + { + "auxiliary_loss_clip": 0.01108627, + "auxiliary_loss_mlp": 0.01039243, + "balance_loss_clip": 1.04256344, + "balance_loss_mlp": 1.02609384, + "epoch": 0.59215391552683, + "flos": 22528954632960.0, + "grad_norm": 1.7269094299177161, + "language_loss": 0.75832105, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.7797997, + "num_input_tokens_seen": 212201615, + "step": 9849, + "time_per_iteration": 2.6506807804107666 + }, + { + "auxiliary_loss_clip": 0.01095576, + "auxiliary_loss_mlp": 0.01036342, + "balance_loss_clip": 1.04088306, + "balance_loss_mlp": 1.02223945, + "epoch": 0.592214038779498, + "flos": 24499013921280.0, + "grad_norm": 2.077646783474588, + "language_loss": 0.75440395, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.7757231, + "num_input_tokens_seen": 212219355, + "step": 9850, + "time_per_iteration": 2.738163471221924 + }, + { + "auxiliary_loss_clip": 0.01079223, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.0389607, + "balance_loss_mlp": 1.02532411, + "epoch": 0.592274162032166, + "flos": 21831353619840.0, + "grad_norm": 2.0657919494048094, + "language_loss": 0.75485742, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.77603519, + "num_input_tokens_seen": 212236710, + "step": 9851, + "time_per_iteration": 2.7006642818450928 + }, + { + "auxiliary_loss_clip": 0.0109594, + "auxiliary_loss_mlp": 0.0103171, + "balance_loss_clip": 1.04149699, + "balance_loss_mlp": 1.01846635, + "epoch": 0.5923342852848339, + "flos": 24608146417920.0, + "grad_norm": 1.9468749498411155, + "language_loss": 0.7089386, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.73021513, + "num_input_tokens_seen": 212256195, + "step": 9852, + "time_per_iteration": 2.706106424331665 + }, + { + "auxiliary_loss_clip": 0.01104361, + "auxiliary_loss_mlp": 0.0077249, + "balance_loss_clip": 1.04451549, + "balance_loss_mlp": 1.00008225, + "epoch": 0.5923944085375019, + "flos": 19938143479680.0, + "grad_norm": 1.600717143056076, + "language_loss": 0.80555183, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.82432032, + "num_input_tokens_seen": 212274085, + "step": 9853, + "time_per_iteration": 2.7119646072387695 + }, + { + "auxiliary_loss_clip": 0.01088586, + "auxiliary_loss_mlp": 0.01028953, + "balance_loss_clip": 1.03719842, + "balance_loss_mlp": 1.01654959, + "epoch": 0.5924545317901698, + "flos": 28658510812800.0, + "grad_norm": 1.9598293021275044, + "language_loss": 0.67597294, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.69714832, + "num_input_tokens_seen": 212295530, + "step": 9854, + "time_per_iteration": 2.7060039043426514 + }, + { + "auxiliary_loss_clip": 0.01081304, + "auxiliary_loss_mlp": 0.01029538, + "balance_loss_clip": 1.03990042, + "balance_loss_mlp": 1.01680636, + "epoch": 0.5925146550428378, + "flos": 19864885691520.0, + "grad_norm": 1.7821900938554989, + "language_loss": 0.88811159, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.90921998, + "num_input_tokens_seen": 212313770, + "step": 9855, + "time_per_iteration": 2.749842882156372 + }, + { + "auxiliary_loss_clip": 0.01097397, + "auxiliary_loss_mlp": 0.01031382, + "balance_loss_clip": 1.04023433, + "balance_loss_mlp": 1.01912177, + "epoch": 0.5925747782955058, + "flos": 15122989681920.0, + "grad_norm": 1.7553886735756365, + "language_loss": 0.86097872, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.8822664, + "num_input_tokens_seen": 212331525, + "step": 9856, + "time_per_iteration": 2.8213181495666504 + }, + { + "auxiliary_loss_clip": 0.0110594, + "auxiliary_loss_mlp": 0.01036211, + "balance_loss_clip": 1.03984308, + "balance_loss_mlp": 1.02389097, + "epoch": 0.5926349015481738, + "flos": 18405440190720.0, + "grad_norm": 3.6746631679389536, + "language_loss": 0.77349007, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.79491156, + "num_input_tokens_seen": 212347295, + "step": 9857, + "time_per_iteration": 2.6580264568328857 + }, + { + "auxiliary_loss_clip": 0.0107388, + "auxiliary_loss_mlp": 0.01051977, + "balance_loss_clip": 1.03587079, + "balance_loss_mlp": 1.03689682, + "epoch": 0.5926950248008417, + "flos": 23111138269440.0, + "grad_norm": 2.383524132494838, + "language_loss": 0.64598405, + "learning_rate": 1.501918617901419e-06, + "loss": 0.66724265, + "num_input_tokens_seen": 212365750, + "step": 9858, + "time_per_iteration": 2.7002615928649902 + }, + { + "auxiliary_loss_clip": 0.01103608, + "auxiliary_loss_mlp": 0.01033595, + "balance_loss_clip": 1.04055738, + "balance_loss_mlp": 1.02088773, + "epoch": 0.5927551480535097, + "flos": 28033916192640.0, + "grad_norm": 1.88700094462338, + "language_loss": 0.77598989, + "learning_rate": 1.501541436426501e-06, + "loss": 0.79736185, + "num_input_tokens_seen": 212385300, + "step": 9859, + "time_per_iteration": 4.434144496917725 + }, + { + "auxiliary_loss_clip": 0.01078779, + "auxiliary_loss_mlp": 0.00771508, + "balance_loss_clip": 1.04448819, + "balance_loss_mlp": 1.00007796, + "epoch": 0.5928152713061776, + "flos": 21798675221760.0, + "grad_norm": 4.274702781757113, + "language_loss": 0.74740881, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.7659117, + "num_input_tokens_seen": 212402140, + "step": 9860, + "time_per_iteration": 2.8576431274414062 + }, + { + "auxiliary_loss_clip": 0.01080315, + "auxiliary_loss_mlp": 0.01034538, + "balance_loss_clip": 1.04223692, + "balance_loss_mlp": 1.02289104, + "epoch": 0.5928753945588456, + "flos": 24316839118080.0, + "grad_norm": 1.6207851458155365, + "language_loss": 0.7622723, + "learning_rate": 1.500787130195763e-06, + "loss": 0.7834208, + "num_input_tokens_seen": 212421790, + "step": 9861, + "time_per_iteration": 5.779749393463135 + }, + { + "auxiliary_loss_clip": 0.01079641, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.03737628, + "balance_loss_mlp": 1.0201298, + "epoch": 0.5929355178115137, + "flos": 26464619923200.0, + "grad_norm": 2.31911103307255, + "language_loss": 0.70733476, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.72845423, + "num_input_tokens_seen": 212442115, + "step": 9862, + "time_per_iteration": 2.7879045009613037 + }, + { + "auxiliary_loss_clip": 0.01057596, + "auxiliary_loss_mlp": 0.01034108, + "balance_loss_clip": 1.03278732, + "balance_loss_mlp": 1.02148342, + "epoch": 0.5929956410641816, + "flos": 24965995662720.0, + "grad_norm": 1.7884457502004503, + "language_loss": 0.78123254, + "learning_rate": 1.500032899685832e-06, + "loss": 0.80214959, + "num_input_tokens_seen": 212459535, + "step": 9863, + "time_per_iteration": 2.7296791076660156 + }, + { + "auxiliary_loss_clip": 0.01089944, + "auxiliary_loss_mlp": 0.01040962, + "balance_loss_clip": 1.03986549, + "balance_loss_mlp": 1.02770567, + "epoch": 0.5930557643168496, + "flos": 26208325405440.0, + "grad_norm": 2.4622472815237506, + "language_loss": 0.70487082, + "learning_rate": 1.499655812861921e-06, + "loss": 0.72617984, + "num_input_tokens_seen": 212479385, + "step": 9864, + "time_per_iteration": 2.6773011684417725 + }, + { + "auxiliary_loss_clip": 0.01089195, + "auxiliary_loss_mlp": 0.01036172, + "balance_loss_clip": 1.03835356, + "balance_loss_mlp": 1.0226891, + "epoch": 0.5931158875695175, + "flos": 27854937699840.0, + "grad_norm": 1.4468399758370936, + "language_loss": 0.67205417, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.69330788, + "num_input_tokens_seen": 212500060, + "step": 9865, + "time_per_iteration": 4.260905981063843 + }, + { + "auxiliary_loss_clip": 0.01098878, + "auxiliary_loss_mlp": 0.0103771, + "balance_loss_clip": 1.04014802, + "balance_loss_mlp": 1.02411962, + "epoch": 0.5931760108221855, + "flos": 15413650536960.0, + "grad_norm": 1.9702875461989908, + "language_loss": 0.77913535, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.80050123, + "num_input_tokens_seen": 212518590, + "step": 9866, + "time_per_iteration": 2.6692967414855957 + }, + { + "auxiliary_loss_clip": 0.01090663, + "auxiliary_loss_mlp": 0.01031022, + "balance_loss_clip": 1.04043937, + "balance_loss_mlp": 1.01891649, + "epoch": 0.5932361340748534, + "flos": 30188520581760.0, + "grad_norm": 2.3223854732809364, + "language_loss": 0.71955562, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.74077249, + "num_input_tokens_seen": 212538190, + "step": 9867, + "time_per_iteration": 2.73850679397583 + }, + { + "auxiliary_loss_clip": 0.01094459, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.04182947, + "balance_loss_mlp": 1.01644969, + "epoch": 0.5932962573275214, + "flos": 20157557708160.0, + "grad_norm": 1.577108097655746, + "language_loss": 0.66789985, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.68914956, + "num_input_tokens_seen": 212557820, + "step": 9868, + "time_per_iteration": 2.776890277862549 + }, + { + "auxiliary_loss_clip": 0.01060162, + "auxiliary_loss_mlp": 0.00771363, + "balance_loss_clip": 1.03597963, + "balance_loss_mlp": 1.00004768, + "epoch": 0.5933563805801894, + "flos": 25445906300160.0, + "grad_norm": 1.613226423561444, + "language_loss": 0.75353992, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.77185524, + "num_input_tokens_seen": 212577645, + "step": 9869, + "time_per_iteration": 2.8630988597869873 + }, + { + "auxiliary_loss_clip": 0.010636, + "auxiliary_loss_mlp": 0.01038006, + "balance_loss_clip": 1.03897762, + "balance_loss_mlp": 1.02469635, + "epoch": 0.5934165038328574, + "flos": 59995740337920.0, + "grad_norm": 1.8583969258808255, + "language_loss": 0.74005115, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.76106727, + "num_input_tokens_seen": 212603430, + "step": 9870, + "time_per_iteration": 3.0915732383728027 + }, + { + "auxiliary_loss_clip": 0.01071863, + "auxiliary_loss_mlp": 0.01030945, + "balance_loss_clip": 1.03705025, + "balance_loss_mlp": 1.01810646, + "epoch": 0.5934766270855253, + "flos": 24420548661120.0, + "grad_norm": 2.145127507644007, + "language_loss": 0.7232281, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.7442562, + "num_input_tokens_seen": 212620730, + "step": 9871, + "time_per_iteration": 2.7629406452178955 + }, + { + "auxiliary_loss_clip": 0.01086004, + "auxiliary_loss_mlp": 0.01031261, + "balance_loss_clip": 1.04104018, + "balance_loss_mlp": 1.01764774, + "epoch": 0.5935367503381933, + "flos": 23513158264320.0, + "grad_norm": 2.0164353140130835, + "language_loss": 0.74587923, + "learning_rate": 1.496639802503271e-06, + "loss": 0.76705188, + "num_input_tokens_seen": 212639745, + "step": 9872, + "time_per_iteration": 2.74772310256958 + }, + { + "auxiliary_loss_clip": 0.01111382, + "auxiliary_loss_mlp": 0.01038485, + "balance_loss_clip": 1.04180598, + "balance_loss_mlp": 1.02414966, + "epoch": 0.5935968735908612, + "flos": 18948337326720.0, + "grad_norm": 2.3277369002939388, + "language_loss": 0.79620034, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.81769902, + "num_input_tokens_seen": 212655915, + "step": 9873, + "time_per_iteration": 2.663547992706299 + }, + { + "auxiliary_loss_clip": 0.01108216, + "auxiliary_loss_mlp": 0.01034928, + "balance_loss_clip": 1.04269648, + "balance_loss_mlp": 1.021523, + "epoch": 0.5936569968435292, + "flos": 25483433034240.0, + "grad_norm": 1.6892324145577737, + "language_loss": 0.8490203, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.87045169, + "num_input_tokens_seen": 212676115, + "step": 9874, + "time_per_iteration": 2.654606580734253 + }, + { + "auxiliary_loss_clip": 0.01019729, + "auxiliary_loss_mlp": 0.01001192, + "balance_loss_clip": 1.01379979, + "balance_loss_mlp": 0.99991626, + "epoch": 0.5937171200961973, + "flos": 66378361789440.0, + "grad_norm": 0.7079839888277836, + "language_loss": 0.59980857, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.62001777, + "num_input_tokens_seen": 212737560, + "step": 9875, + "time_per_iteration": 3.3108227252960205 + }, + { + "auxiliary_loss_clip": 0.01094208, + "auxiliary_loss_mlp": 0.01033507, + "balance_loss_clip": 1.03624558, + "balance_loss_mlp": 1.01859426, + "epoch": 0.5937772433488652, + "flos": 14903467712640.0, + "grad_norm": 5.919714877847386, + "language_loss": 0.7768054, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.79808253, + "num_input_tokens_seen": 212755365, + "step": 9876, + "time_per_iteration": 2.6835005283355713 + }, + { + "auxiliary_loss_clip": 0.01097876, + "auxiliary_loss_mlp": 0.01028372, + "balance_loss_clip": 1.03590453, + "balance_loss_mlp": 1.01630843, + "epoch": 0.5938373666015332, + "flos": 22561489376640.0, + "grad_norm": 1.528829961767438, + "language_loss": 0.75805295, + "learning_rate": 1.494755415907243e-06, + "loss": 0.77931547, + "num_input_tokens_seen": 212773875, + "step": 9877, + "time_per_iteration": 2.703756332397461 + }, + { + "auxiliary_loss_clip": 0.0110632, + "auxiliary_loss_mlp": 0.01028449, + "balance_loss_clip": 1.03964424, + "balance_loss_mlp": 1.01493096, + "epoch": 0.5938974898542011, + "flos": 18440883936000.0, + "grad_norm": 2.6694319382348666, + "language_loss": 0.81408948, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.83543718, + "num_input_tokens_seen": 212790590, + "step": 9878, + "time_per_iteration": 2.6299495697021484 + }, + { + "auxiliary_loss_clip": 0.01090649, + "auxiliary_loss_mlp": 0.00772164, + "balance_loss_clip": 1.03885496, + "balance_loss_mlp": 1.00006008, + "epoch": 0.5939576131068691, + "flos": 45586728270720.0, + "grad_norm": 1.7408999007224344, + "language_loss": 0.71310401, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.73173207, + "num_input_tokens_seen": 212812265, + "step": 9879, + "time_per_iteration": 2.9403438568115234 + }, + { + "auxiliary_loss_clip": 0.01107517, + "auxiliary_loss_mlp": 0.01037191, + "balance_loss_clip": 1.04333889, + "balance_loss_mlp": 1.02471602, + "epoch": 0.594017736359537, + "flos": 23587708942080.0, + "grad_norm": 1.6220417937962182, + "language_loss": 0.5754692, + "learning_rate": 1.493625013742401e-06, + "loss": 0.59691632, + "num_input_tokens_seen": 212831915, + "step": 9880, + "time_per_iteration": 2.722222089767456 + }, + { + "auxiliary_loss_clip": 0.01108825, + "auxiliary_loss_mlp": 0.01034905, + "balance_loss_clip": 1.04171181, + "balance_loss_mlp": 1.02144003, + "epoch": 0.594077859612205, + "flos": 29457235589760.0, + "grad_norm": 1.8505883622927, + "language_loss": 0.77141905, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.79285634, + "num_input_tokens_seen": 212851350, + "step": 9881, + "time_per_iteration": 2.7424824237823486 + }, + { + "auxiliary_loss_clip": 0.01104617, + "auxiliary_loss_mlp": 0.01027481, + "balance_loss_clip": 1.0387702, + "balance_loss_mlp": 1.01456428, + "epoch": 0.594137982864873, + "flos": 16800089644800.0, + "grad_norm": 2.611625845648677, + "language_loss": 0.82625538, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.84757638, + "num_input_tokens_seen": 212867995, + "step": 9882, + "time_per_iteration": 2.6125638484954834 + }, + { + "auxiliary_loss_clip": 0.01108328, + "auxiliary_loss_mlp": 0.01036545, + "balance_loss_clip": 1.04283643, + "balance_loss_mlp": 1.02419496, + "epoch": 0.594198106117541, + "flos": 12750263953920.0, + "grad_norm": 2.4545417723722434, + "language_loss": 0.79556072, + "learning_rate": 1.492494784393667e-06, + "loss": 0.81700939, + "num_input_tokens_seen": 212885220, + "step": 9883, + "time_per_iteration": 2.6739277839660645 + }, + { + "auxiliary_loss_clip": 0.01090609, + "auxiliary_loss_mlp": 0.00770805, + "balance_loss_clip": 1.04405499, + "balance_loss_mlp": 1.00010085, + "epoch": 0.5942582293702089, + "flos": 20996538652800.0, + "grad_norm": 2.530798381383893, + "language_loss": 0.7459439, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.76455808, + "num_input_tokens_seen": 212903195, + "step": 9884, + "time_per_iteration": 2.720139503479004 + }, + { + "auxiliary_loss_clip": 0.01118755, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.04366493, + "balance_loss_mlp": 1.01757646, + "epoch": 0.5943183526228769, + "flos": 28291431772800.0, + "grad_norm": 2.040352336443274, + "language_loss": 0.66608262, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.68757325, + "num_input_tokens_seen": 212923340, + "step": 9885, + "time_per_iteration": 2.6618847846984863 + }, + { + "auxiliary_loss_clip": 0.01093907, + "auxiliary_loss_mlp": 0.01041351, + "balance_loss_clip": 1.04138327, + "balance_loss_mlp": 1.02835155, + "epoch": 0.5943784758755448, + "flos": 26614619118720.0, + "grad_norm": 2.630158617128694, + "language_loss": 0.77534634, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.79669893, + "num_input_tokens_seen": 212942755, + "step": 9886, + "time_per_iteration": 2.7532429695129395 + }, + { + "auxiliary_loss_clip": 0.01025813, + "auxiliary_loss_mlp": 0.01001276, + "balance_loss_clip": 1.01382208, + "balance_loss_mlp": 0.99997658, + "epoch": 0.5944385991282128, + "flos": 64190935347840.0, + "grad_norm": 0.9149518659336237, + "language_loss": 0.64530778, + "learning_rate": 1.490988081420423e-06, + "loss": 0.66557866, + "num_input_tokens_seen": 212999355, + "step": 9887, + "time_per_iteration": 3.060612440109253 + }, + { + "auxiliary_loss_clip": 0.01097622, + "auxiliary_loss_mlp": 0.01032109, + "balance_loss_clip": 1.03770781, + "balance_loss_mlp": 1.01940084, + "epoch": 0.5944987223808808, + "flos": 19571998193280.0, + "grad_norm": 1.6915419105373903, + "language_loss": 0.69181025, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.71310759, + "num_input_tokens_seen": 213018570, + "step": 9888, + "time_per_iteration": 2.617629051208496 + }, + { + "auxiliary_loss_clip": 0.01088883, + "auxiliary_loss_mlp": 0.01034911, + "balance_loss_clip": 1.03844309, + "balance_loss_mlp": 1.02113008, + "epoch": 0.5945588456335488, + "flos": 26177586341760.0, + "grad_norm": 2.5005305893435685, + "language_loss": 0.79495192, + "learning_rate": 1.490234845687366e-06, + "loss": 0.81618989, + "num_input_tokens_seen": 213037735, + "step": 9889, + "time_per_iteration": 2.685150146484375 + }, + { + "auxiliary_loss_clip": 0.01080162, + "auxiliary_loss_mlp": 0.01026954, + "balance_loss_clip": 1.03793621, + "balance_loss_mlp": 1.01496744, + "epoch": 0.5946189688862168, + "flos": 20446494710400.0, + "grad_norm": 1.6110540672551508, + "language_loss": 0.70713383, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.72820497, + "num_input_tokens_seen": 213057160, + "step": 9890, + "time_per_iteration": 2.7299606800079346 + }, + { + "auxiliary_loss_clip": 0.01088716, + "auxiliary_loss_mlp": 0.01032845, + "balance_loss_clip": 1.04451787, + "balance_loss_mlp": 1.01896358, + "epoch": 0.5946790921388847, + "flos": 13437521850240.0, + "grad_norm": 1.9451498476517268, + "language_loss": 0.69461864, + "learning_rate": 1.489481687275691e-06, + "loss": 0.71583426, + "num_input_tokens_seen": 213073630, + "step": 9891, + "time_per_iteration": 2.7253577709198 + }, + { + "auxiliary_loss_clip": 0.01104108, + "auxiliary_loss_mlp": 0.01040464, + "balance_loss_clip": 1.04076028, + "balance_loss_mlp": 1.02784514, + "epoch": 0.5947392153915527, + "flos": 20412272027520.0, + "grad_norm": 1.8738043279095635, + "language_loss": 0.53252602, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.55397171, + "num_input_tokens_seen": 213092450, + "step": 9892, + "time_per_iteration": 2.630176067352295 + }, + { + "auxiliary_loss_clip": 0.01007775, + "auxiliary_loss_mlp": 0.01004642, + "balance_loss_clip": 1.01469183, + "balance_loss_mlp": 1.00331867, + "epoch": 0.5947993386442206, + "flos": 65619138994560.0, + "grad_norm": 0.662438980473289, + "language_loss": 0.54533142, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.56545562, + "num_input_tokens_seen": 213155465, + "step": 9893, + "time_per_iteration": 3.3319764137268066 + }, + { + "auxiliary_loss_clip": 0.01079474, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.04197478, + "balance_loss_mlp": 1.02362406, + "epoch": 0.5948594618968887, + "flos": 23183103168000.0, + "grad_norm": 1.5803116085974762, + "language_loss": 0.74965519, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.77080828, + "num_input_tokens_seen": 213174875, + "step": 9894, + "time_per_iteration": 2.708012104034424 + }, + { + "auxiliary_loss_clip": 0.01084394, + "auxiliary_loss_mlp": 0.01031071, + "balance_loss_clip": 1.04066491, + "balance_loss_mlp": 1.01860142, + "epoch": 0.5949195851495566, + "flos": 13626771632640.0, + "grad_norm": 1.7370359553625463, + "language_loss": 0.77732074, + "learning_rate": 1.487975602873434e-06, + "loss": 0.79847538, + "num_input_tokens_seen": 213192695, + "step": 9895, + "time_per_iteration": 2.6831347942352295 + }, + { + "auxiliary_loss_clip": 0.01067508, + "auxiliary_loss_mlp": 0.01037328, + "balance_loss_clip": 1.03781974, + "balance_loss_mlp": 1.0233922, + "epoch": 0.5949797084022246, + "flos": 19751012599680.0, + "grad_norm": 1.6095460497638086, + "language_loss": 0.79347014, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.81451851, + "num_input_tokens_seen": 213211195, + "step": 9896, + "time_per_iteration": 2.7621328830718994 + }, + { + "auxiliary_loss_clip": 0.01106477, + "auxiliary_loss_mlp": 0.01035793, + "balance_loss_clip": 1.04062951, + "balance_loss_mlp": 1.02315736, + "epoch": 0.5950398316548925, + "flos": 25773878407680.0, + "grad_norm": 1.5421424712505716, + "language_loss": 0.83955193, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.86097461, + "num_input_tokens_seen": 213231975, + "step": 9897, + "time_per_iteration": 2.7152647972106934 + }, + { + "auxiliary_loss_clip": 0.01092695, + "auxiliary_loss_mlp": 0.01037147, + "balance_loss_clip": 1.04191113, + "balance_loss_mlp": 1.02490425, + "epoch": 0.5950999549075605, + "flos": 23039029716480.0, + "grad_norm": 1.9245000057416703, + "language_loss": 0.70950294, + "learning_rate": 1.486846243389939e-06, + "loss": 0.73080134, + "num_input_tokens_seen": 213249760, + "step": 9898, + "time_per_iteration": 4.332275867462158 + }, + { + "auxiliary_loss_clip": 0.01105674, + "auxiliary_loss_mlp": 0.01044981, + "balance_loss_clip": 1.03863168, + "balance_loss_mlp": 1.02892375, + "epoch": 0.5951600781602284, + "flos": 32446367637120.0, + "grad_norm": 2.443382879492767, + "language_loss": 0.64050412, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.66201067, + "num_input_tokens_seen": 213269890, + "step": 9899, + "time_per_iteration": 2.747209072113037 + }, + { + "auxiliary_loss_clip": 0.01117539, + "auxiliary_loss_mlp": 0.01028742, + "balance_loss_clip": 1.04378319, + "balance_loss_mlp": 1.01740563, + "epoch": 0.5952202014128964, + "flos": 23800874204160.0, + "grad_norm": 1.865552618204713, + "language_loss": 0.71956146, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.74102432, + "num_input_tokens_seen": 213289400, + "step": 9900, + "time_per_iteration": 5.790768146514893 + }, + { + "auxiliary_loss_clip": 0.01114892, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.04192626, + "balance_loss_mlp": 1.01949835, + "epoch": 0.5952803246655644, + "flos": 22492182084480.0, + "grad_norm": 1.7457638078039162, + "language_loss": 0.84428406, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.86575621, + "num_input_tokens_seen": 213308040, + "step": 9901, + "time_per_iteration": 2.7782936096191406 + }, + { + "auxiliary_loss_clip": 0.00993307, + "auxiliary_loss_mlp": 0.01008976, + "balance_loss_clip": 1.01768923, + "balance_loss_mlp": 1.00779581, + "epoch": 0.5953404479182324, + "flos": 51234688851840.0, + "grad_norm": 0.8002603783256921, + "language_loss": 0.58178693, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.60180974, + "num_input_tokens_seen": 213358585, + "step": 9902, + "time_per_iteration": 3.245389699935913 + }, + { + "auxiliary_loss_clip": 0.01059574, + "auxiliary_loss_mlp": 0.01029206, + "balance_loss_clip": 1.03823233, + "balance_loss_mlp": 1.01596808, + "epoch": 0.5954005711709004, + "flos": 23112682554240.0, + "grad_norm": 2.326170730098328, + "language_loss": 0.77513373, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.79602152, + "num_input_tokens_seen": 213379585, + "step": 9903, + "time_per_iteration": 2.938472032546997 + }, + { + "auxiliary_loss_clip": 0.01080471, + "auxiliary_loss_mlp": 0.01035506, + "balance_loss_clip": 1.04236233, + "balance_loss_mlp": 1.02275109, + "epoch": 0.5954606944235683, + "flos": 35954732736000.0, + "grad_norm": 3.664262182530453, + "language_loss": 0.7767508, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.79791057, + "num_input_tokens_seen": 213401465, + "step": 9904, + "time_per_iteration": 4.397410869598389 + }, + { + "auxiliary_loss_clip": 0.01102001, + "auxiliary_loss_mlp": 0.01038114, + "balance_loss_clip": 1.04016399, + "balance_loss_mlp": 1.02507877, + "epoch": 0.5955208176762363, + "flos": 30443665864320.0, + "grad_norm": 1.9431813333035064, + "language_loss": 0.72943354, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.7508347, + "num_input_tokens_seen": 213422720, + "step": 9905, + "time_per_iteration": 2.7936177253723145 + }, + { + "auxiliary_loss_clip": 0.01109363, + "auxiliary_loss_mlp": 0.01030507, + "balance_loss_clip": 1.04223228, + "balance_loss_mlp": 1.01717925, + "epoch": 0.5955809409289042, + "flos": 17640112083840.0, + "grad_norm": 1.9824269605474862, + "language_loss": 0.70172507, + "learning_rate": 1.483835475336295e-06, + "loss": 0.72312379, + "num_input_tokens_seen": 213439480, + "step": 9906, + "time_per_iteration": 2.6985738277435303 + }, + { + "auxiliary_loss_clip": 0.01106299, + "auxiliary_loss_mlp": 0.01032912, + "balance_loss_clip": 1.04149914, + "balance_loss_mlp": 1.01987052, + "epoch": 0.5956410641815723, + "flos": 24279887001600.0, + "grad_norm": 1.8692952809001842, + "language_loss": 0.75197554, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.77336764, + "num_input_tokens_seen": 213458895, + "step": 9907, + "time_per_iteration": 2.6980481147766113 + }, + { + "auxiliary_loss_clip": 0.01088924, + "auxiliary_loss_mlp": 0.01032034, + "balance_loss_clip": 1.03741193, + "balance_loss_mlp": 1.0194813, + "epoch": 0.5957011874342402, + "flos": 35734277013120.0, + "grad_norm": 1.635771489703633, + "language_loss": 0.67245162, + "learning_rate": 1.483082978767595e-06, + "loss": 0.69366121, + "num_input_tokens_seen": 213481730, + "step": 9908, + "time_per_iteration": 2.7698655128479004 + }, + { + "auxiliary_loss_clip": 0.01040116, + "auxiliary_loss_mlp": 0.01031975, + "balance_loss_clip": 1.03187275, + "balance_loss_mlp": 1.0195055, + "epoch": 0.5957613106869082, + "flos": 21245004005760.0, + "grad_norm": 1.9181869047737456, + "language_loss": 0.76516539, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.78588629, + "num_input_tokens_seen": 213497225, + "step": 9909, + "time_per_iteration": 2.8098058700561523 + }, + { + "auxiliary_loss_clip": 0.0103764, + "auxiliary_loss_mlp": 0.01004774, + "balance_loss_clip": 1.01340699, + "balance_loss_mlp": 1.00373673, + "epoch": 0.5958214339395761, + "flos": 65940969876480.0, + "grad_norm": 0.9280508663350204, + "language_loss": 0.73383075, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.75425494, + "num_input_tokens_seen": 213556890, + "step": 9910, + "time_per_iteration": 3.228283166885376 + }, + { + "auxiliary_loss_clip": 0.0109102, + "auxiliary_loss_mlp": 0.01035168, + "balance_loss_clip": 1.03882253, + "balance_loss_mlp": 1.02188206, + "epoch": 0.5958815571922441, + "flos": 23218690567680.0, + "grad_norm": 2.4798653486938544, + "language_loss": 0.69676727, + "learning_rate": 1.481954380961799e-06, + "loss": 0.71802914, + "num_input_tokens_seen": 213575800, + "step": 9911, + "time_per_iteration": 2.6699378490448 + }, + { + "auxiliary_loss_clip": 0.01116036, + "auxiliary_loss_mlp": 0.01033392, + "balance_loss_clip": 1.04485154, + "balance_loss_mlp": 1.01942098, + "epoch": 0.595941680444912, + "flos": 16538623568640.0, + "grad_norm": 1.9669774674890577, + "language_loss": 0.65873277, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.68022704, + "num_input_tokens_seen": 213592740, + "step": 9912, + "time_per_iteration": 2.642876386642456 + }, + { + "auxiliary_loss_clip": 0.0108881, + "auxiliary_loss_mlp": 0.01037146, + "balance_loss_clip": 1.04177618, + "balance_loss_mlp": 1.02360988, + "epoch": 0.59600180369758, + "flos": 27818883423360.0, + "grad_norm": 1.9028573243158677, + "language_loss": 0.73863906, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.7598986, + "num_input_tokens_seen": 213611970, + "step": 9913, + "time_per_iteration": 2.860369920730591 + }, + { + "auxiliary_loss_clip": 0.01083137, + "auxiliary_loss_mlp": 0.00770309, + "balance_loss_clip": 1.03919995, + "balance_loss_mlp": 1.0000217, + "epoch": 0.596061926950248, + "flos": 29491566013440.0, + "grad_norm": 2.1966155200103907, + "language_loss": 0.79778421, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.81631863, + "num_input_tokens_seen": 213632230, + "step": 9914, + "time_per_iteration": 2.790907382965088 + }, + { + "auxiliary_loss_clip": 0.01079867, + "auxiliary_loss_mlp": 0.01029281, + "balance_loss_clip": 1.03796613, + "balance_loss_mlp": 1.01610804, + "epoch": 0.596122050202916, + "flos": 16836790366080.0, + "grad_norm": 1.724717360749454, + "language_loss": 0.67540228, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.69649374, + "num_input_tokens_seen": 213649645, + "step": 9915, + "time_per_iteration": 2.701197385787964 + }, + { + "auxiliary_loss_clip": 0.01088406, + "auxiliary_loss_mlp": 0.01030943, + "balance_loss_clip": 1.03837395, + "balance_loss_mlp": 1.0187242, + "epoch": 0.596182173455584, + "flos": 20996646393600.0, + "grad_norm": 1.462048268018942, + "language_loss": 0.78788066, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.8090741, + "num_input_tokens_seen": 213668850, + "step": 9916, + "time_per_iteration": 2.7466511726379395 + }, + { + "auxiliary_loss_clip": 0.01093274, + "auxiliary_loss_mlp": 0.01031597, + "balance_loss_clip": 1.03742838, + "balance_loss_mlp": 1.01851332, + "epoch": 0.5962422967082519, + "flos": 16065680169600.0, + "grad_norm": 1.8319257164110343, + "language_loss": 0.8272475, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.84849626, + "num_input_tokens_seen": 213685695, + "step": 9917, + "time_per_iteration": 2.6762564182281494 + }, + { + "auxiliary_loss_clip": 0.01090404, + "auxiliary_loss_mlp": 0.01034476, + "balance_loss_clip": 1.04083288, + "balance_loss_mlp": 1.02195954, + "epoch": 0.5963024199609199, + "flos": 12166966995840.0, + "grad_norm": 1.8036319058685593, + "language_loss": 0.76979315, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.79104197, + "num_input_tokens_seen": 213703515, + "step": 9918, + "time_per_iteration": 2.707718849182129 + }, + { + "auxiliary_loss_clip": 0.01108865, + "auxiliary_loss_mlp": 0.01038575, + "balance_loss_clip": 1.04414129, + "balance_loss_mlp": 1.02538431, + "epoch": 0.5963625432135878, + "flos": 28074280101120.0, + "grad_norm": 2.6956936924639012, + "language_loss": 0.78955698, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.8110314, + "num_input_tokens_seen": 213724170, + "step": 9919, + "time_per_iteration": 2.732933759689331 + }, + { + "auxiliary_loss_clip": 0.01091105, + "auxiliary_loss_mlp": 0.01037217, + "balance_loss_clip": 1.04111147, + "balance_loss_mlp": 1.02323401, + "epoch": 0.5964226664662559, + "flos": 19860324664320.0, + "grad_norm": 1.8773735409019414, + "language_loss": 0.77863061, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.79991376, + "num_input_tokens_seen": 213740620, + "step": 9920, + "time_per_iteration": 2.6758365631103516 + }, + { + "auxiliary_loss_clip": 0.01105504, + "auxiliary_loss_mlp": 0.01037014, + "balance_loss_clip": 1.04226005, + "balance_loss_mlp": 1.02303123, + "epoch": 0.5964827897189238, + "flos": 12932618325120.0, + "grad_norm": 2.199993791667526, + "language_loss": 0.82559252, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.84701777, + "num_input_tokens_seen": 213755390, + "step": 9921, + "time_per_iteration": 2.631972312927246 + }, + { + "auxiliary_loss_clip": 0.0110339, + "auxiliary_loss_mlp": 0.01032591, + "balance_loss_clip": 1.0396421, + "balance_loss_mlp": 1.01888192, + "epoch": 0.5965429129715918, + "flos": 18150797698560.0, + "grad_norm": 3.5044992121063103, + "language_loss": 0.80699342, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.82835329, + "num_input_tokens_seen": 213773225, + "step": 9922, + "time_per_iteration": 2.646479606628418 + }, + { + "auxiliary_loss_clip": 0.01107944, + "auxiliary_loss_mlp": 0.00770214, + "balance_loss_clip": 1.04096532, + "balance_loss_mlp": 1.000036, + "epoch": 0.5966030362242597, + "flos": 21763231476480.0, + "grad_norm": 1.7423002236659255, + "language_loss": 0.77125442, + "learning_rate": 1.477441761580111e-06, + "loss": 0.79003608, + "num_input_tokens_seen": 213791860, + "step": 9923, + "time_per_iteration": 2.646597385406494 + }, + { + "auxiliary_loss_clip": 0.01105997, + "auxiliary_loss_mlp": 0.01038842, + "balance_loss_clip": 1.04343677, + "balance_loss_mlp": 1.02382815, + "epoch": 0.5966631594769277, + "flos": 18807208790400.0, + "grad_norm": 1.7872252192325138, + "language_loss": 0.76111019, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.78255856, + "num_input_tokens_seen": 213809455, + "step": 9924, + "time_per_iteration": 2.784302234649658 + }, + { + "auxiliary_loss_clip": 0.01098024, + "auxiliary_loss_mlp": 0.0103727, + "balance_loss_clip": 1.03841281, + "balance_loss_mlp": 1.02270854, + "epoch": 0.5967232827295956, + "flos": 14064163545600.0, + "grad_norm": 2.5918588496554222, + "language_loss": 0.66627729, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.6876303, + "num_input_tokens_seen": 213826615, + "step": 9925, + "time_per_iteration": 2.6964471340179443 + }, + { + "auxiliary_loss_clip": 0.01088743, + "auxiliary_loss_mlp": 0.01035345, + "balance_loss_clip": 1.04202008, + "balance_loss_mlp": 1.0219934, + "epoch": 0.5967834059822636, + "flos": 17238235743360.0, + "grad_norm": 2.607968523577736, + "language_loss": 0.71629661, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.7375375, + "num_input_tokens_seen": 213844495, + "step": 9926, + "time_per_iteration": 2.739656448364258 + }, + { + "auxiliary_loss_clip": 0.01076071, + "auxiliary_loss_mlp": 0.00771823, + "balance_loss_clip": 1.04067254, + "balance_loss_mlp": 1.0001483, + "epoch": 0.5968435292349316, + "flos": 42520244284800.0, + "grad_norm": 1.798681806501109, + "language_loss": 0.70456839, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.72304738, + "num_input_tokens_seen": 213869125, + "step": 9927, + "time_per_iteration": 2.9877870082855225 + }, + { + "auxiliary_loss_clip": 0.01071922, + "auxiliary_loss_mlp": 0.01029018, + "balance_loss_clip": 1.03775859, + "balance_loss_mlp": 1.01547647, + "epoch": 0.5969036524875996, + "flos": 37630898945280.0, + "grad_norm": 1.7276883821850428, + "language_loss": 0.63847625, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.6594857, + "num_input_tokens_seen": 213891115, + "step": 9928, + "time_per_iteration": 2.889533042907715 + }, + { + "auxiliary_loss_clip": 0.01115406, + "auxiliary_loss_mlp": 0.0103325, + "balance_loss_clip": 1.04134023, + "balance_loss_mlp": 1.02103138, + "epoch": 0.5969637757402676, + "flos": 23148377694720.0, + "grad_norm": 1.6663701476220254, + "language_loss": 0.69803309, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.71951973, + "num_input_tokens_seen": 213911925, + "step": 9929, + "time_per_iteration": 2.6571357250213623 + }, + { + "auxiliary_loss_clip": 0.01073832, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.04385591, + "balance_loss_mlp": 1.02244925, + "epoch": 0.5970238989929355, + "flos": 24020934877440.0, + "grad_norm": 1.7972325287685906, + "language_loss": 0.76839757, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.78948194, + "num_input_tokens_seen": 213930715, + "step": 9930, + "time_per_iteration": 2.7475857734680176 + }, + { + "auxiliary_loss_clip": 0.0109514, + "auxiliary_loss_mlp": 0.01034752, + "balance_loss_clip": 1.04357862, + "balance_loss_mlp": 1.02017856, + "epoch": 0.5970840222456035, + "flos": 19426883247360.0, + "grad_norm": 1.7574249474808616, + "language_loss": 0.68748617, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.70878506, + "num_input_tokens_seen": 213950015, + "step": 9931, + "time_per_iteration": 2.713695526123047 + }, + { + "auxiliary_loss_clip": 0.01025314, + "auxiliary_loss_mlp": 0.01000381, + "balance_loss_clip": 1.01468325, + "balance_loss_mlp": 0.99920666, + "epoch": 0.5971441454982714, + "flos": 62976615235200.0, + "grad_norm": 0.8553027537300191, + "language_loss": 0.64182514, + "learning_rate": 1.474059168257065e-06, + "loss": 0.66208208, + "num_input_tokens_seen": 214003330, + "step": 9932, + "time_per_iteration": 3.106821060180664 + }, + { + "auxiliary_loss_clip": 0.01084112, + "auxiliary_loss_mlp": 0.01032121, + "balance_loss_clip": 1.03818321, + "balance_loss_mlp": 1.01869833, + "epoch": 0.5972042687509395, + "flos": 20266223328000.0, + "grad_norm": 2.9993889514324463, + "language_loss": 0.73966062, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.76082295, + "num_input_tokens_seen": 214021680, + "step": 9933, + "time_per_iteration": 2.718324899673462 + }, + { + "auxiliary_loss_clip": 0.01028586, + "auxiliary_loss_mlp": 0.01004687, + "balance_loss_clip": 1.02009809, + "balance_loss_mlp": 1.00334597, + "epoch": 0.5972643920036074, + "flos": 71652383832960.0, + "grad_norm": 0.6592973095113355, + "language_loss": 0.52000248, + "learning_rate": 1.473307699867203e-06, + "loss": 0.54033524, + "num_input_tokens_seen": 214090265, + "step": 9934, + "time_per_iteration": 3.265408515930176 + }, + { + "auxiliary_loss_clip": 0.01038691, + "auxiliary_loss_mlp": 0.01008472, + "balance_loss_clip": 1.01466894, + "balance_loss_mlp": 1.00733399, + "epoch": 0.5973245152562754, + "flos": 56892702263040.0, + "grad_norm": 0.8334850866606021, + "language_loss": 0.54153717, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.5620088, + "num_input_tokens_seen": 214146375, + "step": 9935, + "time_per_iteration": 3.07120680809021 + }, + { + "auxiliary_loss_clip": 0.01095451, + "auxiliary_loss_mlp": 0.01033243, + "balance_loss_clip": 1.04008901, + "balance_loss_mlp": 1.01956391, + "epoch": 0.5973846385089433, + "flos": 24164361884160.0, + "grad_norm": 1.5706852760220016, + "language_loss": 0.66061485, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.68190181, + "num_input_tokens_seen": 214165340, + "step": 9936, + "time_per_iteration": 2.724457263946533 + }, + { + "auxiliary_loss_clip": 0.01060903, + "auxiliary_loss_mlp": 0.01035654, + "balance_loss_clip": 1.03609622, + "balance_loss_mlp": 1.02246332, + "epoch": 0.5974447617616113, + "flos": 17670599752320.0, + "grad_norm": 1.9876387260879245, + "language_loss": 0.6771605, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.69812608, + "num_input_tokens_seen": 214181360, + "step": 9937, + "time_per_iteration": 2.75978422164917 + }, + { + "auxiliary_loss_clip": 0.0111018, + "auxiliary_loss_mlp": 0.01032313, + "balance_loss_clip": 1.04208851, + "balance_loss_mlp": 1.01846123, + "epoch": 0.5975048850142792, + "flos": 22892514140160.0, + "grad_norm": 2.408863368051578, + "language_loss": 0.77660179, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.79802668, + "num_input_tokens_seen": 214198525, + "step": 9938, + "time_per_iteration": 4.499311447143555 + }, + { + "auxiliary_loss_clip": 0.01105785, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.03925014, + "balance_loss_mlp": 1.02100301, + "epoch": 0.5975650082669473, + "flos": 24353108876160.0, + "grad_norm": 1.4410606641316148, + "language_loss": 0.75726342, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.77866983, + "num_input_tokens_seen": 214218710, + "step": 9939, + "time_per_iteration": 4.202291011810303 + }, + { + "auxiliary_loss_clip": 0.01073866, + "auxiliary_loss_mlp": 0.01032947, + "balance_loss_clip": 1.04116249, + "balance_loss_mlp": 1.01718175, + "epoch": 0.5976251315196152, + "flos": 20923352691840.0, + "grad_norm": 4.812638761028828, + "language_loss": 0.68618965, + "learning_rate": 1.471053774486878e-06, + "loss": 0.70725775, + "num_input_tokens_seen": 214237800, + "step": 9940, + "time_per_iteration": 4.418368339538574 + }, + { + "auxiliary_loss_clip": 0.01090139, + "auxiliary_loss_mlp": 0.01036739, + "balance_loss_clip": 1.04158998, + "balance_loss_mlp": 1.02415049, + "epoch": 0.5976852547722832, + "flos": 35844594658560.0, + "grad_norm": 1.3494600203677949, + "language_loss": 0.70370513, + "learning_rate": 1.470678190375664e-06, + "loss": 0.72497392, + "num_input_tokens_seen": 214260355, + "step": 9941, + "time_per_iteration": 2.7807397842407227 + }, + { + "auxiliary_loss_clip": 0.01092498, + "auxiliary_loss_mlp": 0.01034522, + "balance_loss_clip": 1.03824401, + "balance_loss_mlp": 1.02123034, + "epoch": 0.5977453780249512, + "flos": 12855948744960.0, + "grad_norm": 1.9808022638780955, + "language_loss": 0.77407408, + "learning_rate": 1.470302626336386e-06, + "loss": 0.79534429, + "num_input_tokens_seen": 214277120, + "step": 9942, + "time_per_iteration": 2.6881802082061768 + }, + { + "auxiliary_loss_clip": 0.01071168, + "auxiliary_loss_mlp": 0.01037338, + "balance_loss_clip": 1.03963232, + "balance_loss_mlp": 1.02418923, + "epoch": 0.5978055012776191, + "flos": 20959155573120.0, + "grad_norm": 1.9541019064521015, + "language_loss": 0.76172185, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.78280699, + "num_input_tokens_seen": 214295300, + "step": 9943, + "time_per_iteration": 4.4215734004974365 + }, + { + "auxiliary_loss_clip": 0.0105205, + "auxiliary_loss_mlp": 0.01034121, + "balance_loss_clip": 1.03876281, + "balance_loss_mlp": 1.02149105, + "epoch": 0.5978656245302871, + "flos": 34058003063040.0, + "grad_norm": 1.735048648764757, + "language_loss": 0.62473679, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.64559853, + "num_input_tokens_seen": 214317050, + "step": 9944, + "time_per_iteration": 2.8701138496398926 + }, + { + "auxiliary_loss_clip": 0.0109987, + "auxiliary_loss_mlp": 0.0103879, + "balance_loss_clip": 1.04420114, + "balance_loss_mlp": 1.02530718, + "epoch": 0.597925747782955, + "flos": 37373275624320.0, + "grad_norm": 1.7121148929704375, + "language_loss": 0.72442955, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.74581611, + "num_input_tokens_seen": 214337470, + "step": 9945, + "time_per_iteration": 2.7868094444274902 + }, + { + "auxiliary_loss_clip": 0.01063078, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.03817308, + "balance_loss_mlp": 1.02280903, + "epoch": 0.5979858710356231, + "flos": 25374803328000.0, + "grad_norm": 2.215747344961558, + "language_loss": 0.66905904, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.6900481, + "num_input_tokens_seen": 214357975, + "step": 9946, + "time_per_iteration": 2.83195161819458 + }, + { + "auxiliary_loss_clip": 0.01104512, + "auxiliary_loss_mlp": 0.01042624, + "balance_loss_clip": 1.03969336, + "balance_loss_mlp": 1.0275619, + "epoch": 0.598045994288291, + "flos": 13698413308800.0, + "grad_norm": 1.928704516420183, + "language_loss": 0.88898396, + "learning_rate": 1.468425107717461e-06, + "loss": 0.91045535, + "num_input_tokens_seen": 214374125, + "step": 9947, + "time_per_iteration": 2.5993123054504395 + }, + { + "auxiliary_loss_clip": 0.01112155, + "auxiliary_loss_mlp": 0.01032443, + "balance_loss_clip": 1.04039431, + "balance_loss_mlp": 1.02080822, + "epoch": 0.598106117540959, + "flos": 21981352815360.0, + "grad_norm": 1.8699586676771087, + "language_loss": 0.72236538, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.74381137, + "num_input_tokens_seen": 214393395, + "step": 9948, + "time_per_iteration": 2.6766860485076904 + }, + { + "auxiliary_loss_clip": 0.01093809, + "auxiliary_loss_mlp": 0.01035478, + "balance_loss_clip": 1.03969812, + "balance_loss_mlp": 1.02129257, + "epoch": 0.5981662407936269, + "flos": 20559362221440.0, + "grad_norm": 1.8848269321833362, + "language_loss": 0.89223683, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.91352975, + "num_input_tokens_seen": 214411550, + "step": 9949, + "time_per_iteration": 2.731804370880127 + }, + { + "auxiliary_loss_clip": 0.01105698, + "auxiliary_loss_mlp": 0.01030419, + "balance_loss_clip": 1.0420059, + "balance_loss_mlp": 1.01814604, + "epoch": 0.5982263640462949, + "flos": 14063840323200.0, + "grad_norm": 2.0634992965968917, + "language_loss": 0.70250058, + "learning_rate": 1.467298838320673e-06, + "loss": 0.72386169, + "num_input_tokens_seen": 214429780, + "step": 9950, + "time_per_iteration": 2.666879415512085 + }, + { + "auxiliary_loss_clip": 0.01103442, + "auxiliary_loss_mlp": 0.01031809, + "balance_loss_clip": 1.0392406, + "balance_loss_mlp": 1.01904809, + "epoch": 0.5982864872989628, + "flos": 17707228646400.0, + "grad_norm": 1.610292824709656, + "language_loss": 0.78345191, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.80480444, + "num_input_tokens_seen": 214447775, + "step": 9951, + "time_per_iteration": 2.624361753463745 + }, + { + "auxiliary_loss_clip": 0.01096152, + "auxiliary_loss_mlp": 0.01038536, + "balance_loss_clip": 1.0411104, + "balance_loss_mlp": 1.02471995, + "epoch": 0.5983466105516309, + "flos": 16764789553920.0, + "grad_norm": 1.4677439185999286, + "language_loss": 0.73951542, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.76086229, + "num_input_tokens_seen": 214467245, + "step": 9952, + "time_per_iteration": 2.780212640762329 + }, + { + "auxiliary_loss_clip": 0.01097597, + "auxiliary_loss_mlp": 0.00771764, + "balance_loss_clip": 1.04058945, + "balance_loss_mlp": 1.0000577, + "epoch": 0.5984067338042988, + "flos": 20042714949120.0, + "grad_norm": 2.0876696722134493, + "language_loss": 0.79496032, + "learning_rate": 1.466172750724613e-06, + "loss": 0.81365395, + "num_input_tokens_seen": 214484385, + "step": 9953, + "time_per_iteration": 2.6629557609558105 + }, + { + "auxiliary_loss_clip": 0.01088175, + "auxiliary_loss_mlp": 0.01034264, + "balance_loss_clip": 1.04368794, + "balance_loss_mlp": 1.02172363, + "epoch": 0.5984668570569668, + "flos": 26319900026880.0, + "grad_norm": 1.571611875852805, + "language_loss": 0.69577867, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.71700311, + "num_input_tokens_seen": 214503465, + "step": 9954, + "time_per_iteration": 2.772745132446289 + }, + { + "auxiliary_loss_clip": 0.01092663, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.03927422, + "balance_loss_mlp": 1.02299023, + "epoch": 0.5985269803096348, + "flos": 20593728558720.0, + "grad_norm": 1.8709505635033254, + "language_loss": 0.73055756, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.75183785, + "num_input_tokens_seen": 214520725, + "step": 9955, + "time_per_iteration": 2.6827971935272217 + }, + { + "auxiliary_loss_clip": 0.01118308, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.04205883, + "balance_loss_mlp": 1.0209837, + "epoch": 0.5985871035623027, + "flos": 26865382942080.0, + "grad_norm": 1.5476020192092728, + "language_loss": 0.68627518, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.70780075, + "num_input_tokens_seen": 214540675, + "step": 9956, + "time_per_iteration": 2.6055126190185547 + }, + { + "auxiliary_loss_clip": 0.01120333, + "auxiliary_loss_mlp": 0.01033532, + "balance_loss_clip": 1.0435667, + "balance_loss_mlp": 1.02041864, + "epoch": 0.5986472268149707, + "flos": 19609704495360.0, + "grad_norm": 5.767015828905461, + "language_loss": 0.74026513, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.76180387, + "num_input_tokens_seen": 214559910, + "step": 9957, + "time_per_iteration": 2.670759677886963 + }, + { + "auxiliary_loss_clip": 0.01082315, + "auxiliary_loss_mlp": 0.01029692, + "balance_loss_clip": 1.04125023, + "balance_loss_mlp": 1.01696002, + "epoch": 0.5987073500676386, + "flos": 21794616984960.0, + "grad_norm": 2.0517993540808157, + "language_loss": 0.84612942, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.86724949, + "num_input_tokens_seen": 214575960, + "step": 9958, + "time_per_iteration": 2.710693597793579 + }, + { + "auxiliary_loss_clip": 0.01088695, + "auxiliary_loss_mlp": 0.00771117, + "balance_loss_clip": 1.04130435, + "balance_loss_mlp": 1.00005364, + "epoch": 0.5987674733203067, + "flos": 24314361079680.0, + "grad_norm": 1.9589439151063424, + "language_loss": 0.6649909, + "learning_rate": 1.463921122471864e-06, + "loss": 0.68358904, + "num_input_tokens_seen": 214594230, + "step": 9959, + "time_per_iteration": 2.7052528858184814 + }, + { + "auxiliary_loss_clip": 0.0110604, + "auxiliary_loss_mlp": 0.01031714, + "balance_loss_clip": 1.04048181, + "balance_loss_mlp": 1.01915514, + "epoch": 0.5988275965729746, + "flos": 21320201128320.0, + "grad_norm": 1.6803724665796522, + "language_loss": 0.83453488, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.85591239, + "num_input_tokens_seen": 214613130, + "step": 9960, + "time_per_iteration": 2.698373317718506 + }, + { + "auxiliary_loss_clip": 0.0110105, + "auxiliary_loss_mlp": 0.01026917, + "balance_loss_clip": 1.04384398, + "balance_loss_mlp": 1.01451361, + "epoch": 0.5988877198256426, + "flos": 25118041933440.0, + "grad_norm": 1.4637618649833892, + "language_loss": 0.79449862, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.81577832, + "num_input_tokens_seen": 214634470, + "step": 9961, + "time_per_iteration": 2.763143539428711 + }, + { + "auxiliary_loss_clip": 0.01115923, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.04150534, + "balance_loss_mlp": 1.02107751, + "epoch": 0.5989478430783105, + "flos": 26429104350720.0, + "grad_norm": 1.7720947984672266, + "language_loss": 0.66938126, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.69088268, + "num_input_tokens_seen": 214654030, + "step": 9962, + "time_per_iteration": 2.6398210525512695 + }, + { + "auxiliary_loss_clip": 0.01100963, + "auxiliary_loss_mlp": 0.01040353, + "balance_loss_clip": 1.03867447, + "balance_loss_mlp": 1.02583313, + "epoch": 0.5990079663309785, + "flos": 25778439434880.0, + "grad_norm": 1.3371562951805418, + "language_loss": 0.74043596, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.76184916, + "num_input_tokens_seen": 214676985, + "step": 9963, + "time_per_iteration": 2.716456651687622 + }, + { + "auxiliary_loss_clip": 0.01105789, + "auxiliary_loss_mlp": 0.010335, + "balance_loss_clip": 1.04120398, + "balance_loss_mlp": 1.02003562, + "epoch": 0.5990680895836464, + "flos": 36831779118720.0, + "grad_norm": 1.8119605341465645, + "language_loss": 0.68010569, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.70149863, + "num_input_tokens_seen": 214700105, + "step": 9964, + "time_per_iteration": 2.764112710952759 + }, + { + "auxiliary_loss_clip": 0.01082495, + "auxiliary_loss_mlp": 0.01029274, + "balance_loss_clip": 1.04189765, + "balance_loss_mlp": 1.0158987, + "epoch": 0.5991282128363145, + "flos": 24133550993280.0, + "grad_norm": 1.838028773427246, + "language_loss": 0.76536453, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.78648221, + "num_input_tokens_seen": 214717885, + "step": 9965, + "time_per_iteration": 2.6872916221618652 + }, + { + "auxiliary_loss_clip": 0.01100107, + "auxiliary_loss_mlp": 0.01029644, + "balance_loss_clip": 1.03997707, + "balance_loss_mlp": 1.01664448, + "epoch": 0.5991883360889824, + "flos": 10304064956160.0, + "grad_norm": 1.881941118756219, + "language_loss": 0.77352554, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.79482305, + "num_input_tokens_seen": 214733680, + "step": 9966, + "time_per_iteration": 2.645473003387451 + }, + { + "auxiliary_loss_clip": 0.01080024, + "auxiliary_loss_mlp": 0.01029432, + "balance_loss_clip": 1.04003799, + "balance_loss_mlp": 1.01742721, + "epoch": 0.5992484593416504, + "flos": 23951196622080.0, + "grad_norm": 1.4675663731632993, + "language_loss": 0.73089266, + "learning_rate": 1.460920090376422e-06, + "loss": 0.75198722, + "num_input_tokens_seen": 214753285, + "step": 9967, + "time_per_iteration": 2.7043392658233643 + }, + { + "auxiliary_loss_clip": 0.0111042, + "auxiliary_loss_mlp": 0.01035802, + "balance_loss_clip": 1.04168642, + "balance_loss_mlp": 1.02200305, + "epoch": 0.5993085825943184, + "flos": 11944105061760.0, + "grad_norm": 2.0432757361111724, + "language_loss": 0.68492925, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.70639145, + "num_input_tokens_seen": 214767810, + "step": 9968, + "time_per_iteration": 2.618802070617676 + }, + { + "auxiliary_loss_clip": 0.01104497, + "auxiliary_loss_mlp": 0.01037187, + "balance_loss_clip": 1.03805614, + "balance_loss_mlp": 1.02315605, + "epoch": 0.5993687058469863, + "flos": 19026838500480.0, + "grad_norm": 1.5933947258371375, + "language_loss": 0.79251635, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.81393319, + "num_input_tokens_seen": 214786040, + "step": 9969, + "time_per_iteration": 2.6758008003234863 + }, + { + "auxiliary_loss_clip": 0.01100647, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.03998137, + "balance_loss_mlp": 1.0176506, + "epoch": 0.5994288290996543, + "flos": 14282967242880.0, + "grad_norm": 1.6601112189929519, + "language_loss": 0.80936122, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.83067989, + "num_input_tokens_seen": 214803110, + "step": 9970, + "time_per_iteration": 2.7434401512145996 + }, + { + "auxiliary_loss_clip": 0.01064445, + "auxiliary_loss_mlp": 0.01044271, + "balance_loss_clip": 1.0378437, + "balance_loss_mlp": 1.02879751, + "epoch": 0.5994889523523222, + "flos": 19206643006080.0, + "grad_norm": 2.015109530583561, + "language_loss": 0.61666113, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.6377483, + "num_input_tokens_seen": 214819945, + "step": 9971, + "time_per_iteration": 2.6593470573425293 + }, + { + "auxiliary_loss_clip": 0.01112816, + "auxiliary_loss_mlp": 0.01033245, + "balance_loss_clip": 1.04096997, + "balance_loss_mlp": 1.02065659, + "epoch": 0.5995490756049903, + "flos": 28037040675840.0, + "grad_norm": 1.7466561522631148, + "language_loss": 0.79054534, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.81200594, + "num_input_tokens_seen": 214838810, + "step": 9972, + "time_per_iteration": 2.657733917236328 + }, + { + "auxiliary_loss_clip": 0.01077287, + "auxiliary_loss_mlp": 0.0103561, + "balance_loss_clip": 1.03948355, + "balance_loss_mlp": 1.0220139, + "epoch": 0.5996091988576582, + "flos": 29052953038080.0, + "grad_norm": 2.7295276371688657, + "language_loss": 0.76414442, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.78527337, + "num_input_tokens_seen": 214857040, + "step": 9973, + "time_per_iteration": 2.804370880126953 + }, + { + "auxiliary_loss_clip": 0.01080222, + "auxiliary_loss_mlp": 0.01031483, + "balance_loss_clip": 1.03798461, + "balance_loss_mlp": 1.01820862, + "epoch": 0.5996693221103262, + "flos": 20813968800000.0, + "grad_norm": 2.687412315258338, + "language_loss": 0.65429473, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.6754117, + "num_input_tokens_seen": 214873375, + "step": 9974, + "time_per_iteration": 2.7193095684051514 + }, + { + "auxiliary_loss_clip": 0.01106109, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_clip": 1.0399034, + "balance_loss_mlp": 1.01984227, + "epoch": 0.5997294453629941, + "flos": 23768914078080.0, + "grad_norm": 1.3699302504221633, + "language_loss": 0.74378854, + "learning_rate": 1.457920366566428e-06, + "loss": 0.76517522, + "num_input_tokens_seen": 214893900, + "step": 9975, + "time_per_iteration": 2.6727962493896484 + }, + { + "auxiliary_loss_clip": 0.01117306, + "auxiliary_loss_mlp": 0.01031631, + "balance_loss_clip": 1.04184341, + "balance_loss_mlp": 1.01771951, + "epoch": 0.5997895686156621, + "flos": 20960017499520.0, + "grad_norm": 1.8689128111534072, + "language_loss": 0.77081978, + "learning_rate": 1.457545493441611e-06, + "loss": 0.79230917, + "num_input_tokens_seen": 214912110, + "step": 9976, + "time_per_iteration": 2.5855295658111572 + }, + { + "auxiliary_loss_clip": 0.01101132, + "auxiliary_loss_mlp": 0.01036343, + "balance_loss_clip": 1.04325271, + "balance_loss_mlp": 1.0225029, + "epoch": 0.59984969186833, + "flos": 28365443746560.0, + "grad_norm": 2.489782776024688, + "language_loss": 0.74998355, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.77135837, + "num_input_tokens_seen": 214930140, + "step": 9977, + "time_per_iteration": 4.355423212051392 + }, + { + "auxiliary_loss_clip": 0.01081083, + "auxiliary_loss_mlp": 0.01029688, + "balance_loss_clip": 1.03771675, + "balance_loss_mlp": 1.01639032, + "epoch": 0.5999098151209981, + "flos": 22565906749440.0, + "grad_norm": 1.7961745328309484, + "language_loss": 0.69053113, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.71163881, + "num_input_tokens_seen": 214949200, + "step": 9978, + "time_per_iteration": 2.687735080718994 + }, + { + "auxiliary_loss_clip": 0.01124045, + "auxiliary_loss_mlp": 0.01035056, + "balance_loss_clip": 1.04541636, + "balance_loss_mlp": 1.02081037, + "epoch": 0.599969938373666, + "flos": 18768712389120.0, + "grad_norm": 1.9378111201967976, + "language_loss": 0.81427479, + "learning_rate": 1.456420997543594e-06, + "loss": 0.8358658, + "num_input_tokens_seen": 214965775, + "step": 9979, + "time_per_iteration": 5.60455322265625 + }, + { + "auxiliary_loss_clip": 0.01113469, + "auxiliary_loss_mlp": 0.0103294, + "balance_loss_clip": 1.04139137, + "balance_loss_mlp": 1.02011895, + "epoch": 0.600030061626334, + "flos": 11327231865600.0, + "grad_norm": 2.0199004568827577, + "language_loss": 0.70054936, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.72201335, + "num_input_tokens_seen": 214982480, + "step": 9980, + "time_per_iteration": 2.5815303325653076 + }, + { + "auxiliary_loss_clip": 0.01105293, + "auxiliary_loss_mlp": 0.01032543, + "balance_loss_clip": 1.03971553, + "balance_loss_mlp": 1.01786244, + "epoch": 0.600090184879002, + "flos": 16578664254720.0, + "grad_norm": 2.2746227330860327, + "language_loss": 0.686566, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.70794439, + "num_input_tokens_seen": 214998110, + "step": 9981, + "time_per_iteration": 2.635133743286133 + }, + { + "auxiliary_loss_clip": 0.01106547, + "auxiliary_loss_mlp": 0.01036545, + "balance_loss_clip": 1.04316497, + "balance_loss_mlp": 1.02458215, + "epoch": 0.6001503081316699, + "flos": 23618627573760.0, + "grad_norm": 1.8281310539755133, + "language_loss": 0.78525096, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.80668187, + "num_input_tokens_seen": 215017995, + "step": 9982, + "time_per_iteration": 4.6227052211761475 + }, + { + "auxiliary_loss_clip": 0.01066865, + "auxiliary_loss_mlp": 0.01043371, + "balance_loss_clip": 1.03895831, + "balance_loss_mlp": 1.02730179, + "epoch": 0.6002104313843379, + "flos": 20667668705280.0, + "grad_norm": 1.558592797835216, + "language_loss": 0.73127562, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.75237799, + "num_input_tokens_seen": 215038285, + "step": 9983, + "time_per_iteration": 2.851017951965332 + }, + { + "auxiliary_loss_clip": 0.01075266, + "auxiliary_loss_mlp": 0.01033335, + "balance_loss_clip": 1.03699243, + "balance_loss_mlp": 1.01962018, + "epoch": 0.6002705546370058, + "flos": 22455229968000.0, + "grad_norm": 4.7484025968689325, + "language_loss": 0.78227878, + "learning_rate": 1.454547250154447e-06, + "loss": 0.80336481, + "num_input_tokens_seen": 215057825, + "step": 9984, + "time_per_iteration": 2.6935315132141113 + }, + { + "auxiliary_loss_clip": 0.01109117, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.04397178, + "balance_loss_mlp": 1.02215743, + "epoch": 0.6003306778896739, + "flos": 25191982080000.0, + "grad_norm": 1.729800567101094, + "language_loss": 0.83458543, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.85603082, + "num_input_tokens_seen": 215077790, + "step": 9985, + "time_per_iteration": 2.7772903442382812 + }, + { + "auxiliary_loss_clip": 0.01106318, + "auxiliary_loss_mlp": 0.01039651, + "balance_loss_clip": 1.04176068, + "balance_loss_mlp": 1.02729487, + "epoch": 0.6003908011423418, + "flos": 26687733252480.0, + "grad_norm": 2.2153021552569956, + "language_loss": 0.71093589, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.73239559, + "num_input_tokens_seen": 215097650, + "step": 9986, + "time_per_iteration": 2.794067859649658 + }, + { + "auxiliary_loss_clip": 0.0112089, + "auxiliary_loss_mlp": 0.00771497, + "balance_loss_clip": 1.04465151, + "balance_loss_mlp": 1.00010371, + "epoch": 0.6004509243950098, + "flos": 22565080736640.0, + "grad_norm": 1.3997582574427474, + "language_loss": 0.71425599, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.73317981, + "num_input_tokens_seen": 215118235, + "step": 9987, + "time_per_iteration": 2.689911365509033 + }, + { + "auxiliary_loss_clip": 0.01096945, + "auxiliary_loss_mlp": 0.01039105, + "balance_loss_clip": 1.04330432, + "balance_loss_mlp": 1.02589071, + "epoch": 0.6005110476476777, + "flos": 19719303868800.0, + "grad_norm": 1.7371829608849618, + "language_loss": 0.84939432, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.87075484, + "num_input_tokens_seen": 215136755, + "step": 9988, + "time_per_iteration": 2.7220449447631836 + }, + { + "auxiliary_loss_clip": 0.01108518, + "auxiliary_loss_mlp": 0.01035377, + "balance_loss_clip": 1.04211533, + "balance_loss_mlp": 1.02187085, + "epoch": 0.6005711709003457, + "flos": 17712543859200.0, + "grad_norm": 1.6453818743399957, + "language_loss": 0.65595025, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.6773892, + "num_input_tokens_seen": 215155225, + "step": 9989, + "time_per_iteration": 2.708707809448242 + }, + { + "auxiliary_loss_clip": 0.0110487, + "auxiliary_loss_mlp": 0.01034775, + "balance_loss_clip": 1.04078543, + "balance_loss_mlp": 1.02239513, + "epoch": 0.6006312941530136, + "flos": 18514464946560.0, + "grad_norm": 1.5276583445435046, + "language_loss": 0.8036738, + "learning_rate": 1.452299436003257e-06, + "loss": 0.82507026, + "num_input_tokens_seen": 215174815, + "step": 9990, + "time_per_iteration": 2.6760056018829346 + }, + { + "auxiliary_loss_clip": 0.0107479, + "auxiliary_loss_mlp": 0.01031552, + "balance_loss_clip": 1.03909266, + "balance_loss_mlp": 1.01817632, + "epoch": 0.6006914174056817, + "flos": 21390837223680.0, + "grad_norm": 2.0016484487093833, + "language_loss": 0.8290872, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.85015059, + "num_input_tokens_seen": 215192045, + "step": 9991, + "time_per_iteration": 2.6902015209198 + }, + { + "auxiliary_loss_clip": 0.01062355, + "auxiliary_loss_mlp": 0.01042556, + "balance_loss_clip": 1.03686535, + "balance_loss_mlp": 1.02867436, + "epoch": 0.6007515406583496, + "flos": 12750515349120.0, + "grad_norm": 1.9693626562875086, + "language_loss": 0.82834661, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.84939575, + "num_input_tokens_seen": 215209885, + "step": 9992, + "time_per_iteration": 2.750401496887207 + }, + { + "auxiliary_loss_clip": 0.01095422, + "auxiliary_loss_mlp": 0.00771119, + "balance_loss_clip": 1.04209352, + "balance_loss_mlp": 1.0001328, + "epoch": 0.6008116639110176, + "flos": 19206894401280.0, + "grad_norm": 1.855753675619843, + "language_loss": 0.66424763, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.68291306, + "num_input_tokens_seen": 215228150, + "step": 9993, + "time_per_iteration": 2.664606809616089 + }, + { + "auxiliary_loss_clip": 0.01080718, + "auxiliary_loss_mlp": 0.01034631, + "balance_loss_clip": 1.03863966, + "balance_loss_mlp": 1.02051032, + "epoch": 0.6008717871636855, + "flos": 17055342668160.0, + "grad_norm": 2.4957386160129182, + "language_loss": 0.80870563, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.82985908, + "num_input_tokens_seen": 215243755, + "step": 9994, + "time_per_iteration": 2.640841007232666 + }, + { + "auxiliary_loss_clip": 0.01071985, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.03745914, + "balance_loss_mlp": 1.01653457, + "epoch": 0.6009319104163535, + "flos": 20298686244480.0, + "grad_norm": 1.874968253383489, + "language_loss": 0.72665036, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.7476638, + "num_input_tokens_seen": 215262130, + "step": 9995, + "time_per_iteration": 2.694720506668091 + }, + { + "auxiliary_loss_clip": 0.01094635, + "auxiliary_loss_mlp": 0.01038473, + "balance_loss_clip": 1.03786469, + "balance_loss_mlp": 1.02479935, + "epoch": 0.6009920336690215, + "flos": 21836776573440.0, + "grad_norm": 1.6925252532660184, + "language_loss": 0.80807674, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.82940787, + "num_input_tokens_seen": 215281785, + "step": 9996, + "time_per_iteration": 2.6821236610412598 + }, + { + "auxiliary_loss_clip": 0.01056059, + "auxiliary_loss_mlp": 0.01045573, + "balance_loss_clip": 1.0363729, + "balance_loss_mlp": 1.03094554, + "epoch": 0.6010521569216895, + "flos": 22596107109120.0, + "grad_norm": 2.5847377804090548, + "language_loss": 0.78435457, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.80537087, + "num_input_tokens_seen": 215297550, + "step": 9997, + "time_per_iteration": 2.763819694519043 + }, + { + "auxiliary_loss_clip": 0.01106886, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.0403868, + "balance_loss_mlp": 1.01968443, + "epoch": 0.6011122801743575, + "flos": 19171702051200.0, + "grad_norm": 1.6202780199081332, + "language_loss": 0.73208427, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.75348961, + "num_input_tokens_seen": 215316360, + "step": 9998, + "time_per_iteration": 2.642061471939087 + }, + { + "auxiliary_loss_clip": 0.01085494, + "auxiliary_loss_mlp": 0.01033235, + "balance_loss_clip": 1.03910601, + "balance_loss_mlp": 1.01992536, + "epoch": 0.6011724034270254, + "flos": 25010022758400.0, + "grad_norm": 1.482726748062067, + "language_loss": 0.72144544, + "learning_rate": 1.448929117633027e-06, + "loss": 0.74263275, + "num_input_tokens_seen": 215336405, + "step": 9999, + "time_per_iteration": 2.726409673690796 + }, + { + "auxiliary_loss_clip": 0.01067323, + "auxiliary_loss_mlp": 0.0103545, + "balance_loss_clip": 1.03762555, + "balance_loss_mlp": 1.02221787, + "epoch": 0.6012325266796934, + "flos": 21797669640960.0, + "grad_norm": 1.6696696942731026, + "language_loss": 0.78647506, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.80750275, + "num_input_tokens_seen": 215356590, + "step": 10000, + "time_per_iteration": 2.8357326984405518 + }, + { + "auxiliary_loss_clip": 0.01121882, + "auxiliary_loss_mlp": 0.0103934, + "balance_loss_clip": 1.04357147, + "balance_loss_mlp": 1.02509475, + "epoch": 0.6012926499323613, + "flos": 19573003774080.0, + "grad_norm": 1.8951447876838274, + "language_loss": 0.7747916, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.79640388, + "num_input_tokens_seen": 215374295, + "step": 10001, + "time_per_iteration": 2.623619556427002 + }, + { + "auxiliary_loss_clip": 0.01110485, + "auxiliary_loss_mlp": 0.01029908, + "balance_loss_clip": 1.04319382, + "balance_loss_mlp": 1.01584136, + "epoch": 0.6013527731850293, + "flos": 34860786076800.0, + "grad_norm": 1.8091026033907125, + "language_loss": 0.5879162, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.60932016, + "num_input_tokens_seen": 215394535, + "step": 10002, + "time_per_iteration": 2.7854535579681396 + }, + { + "auxiliary_loss_clip": 0.01101715, + "auxiliary_loss_mlp": 0.01040915, + "balance_loss_clip": 1.04363275, + "balance_loss_mlp": 1.02503633, + "epoch": 0.6014128964376972, + "flos": 23291948355840.0, + "grad_norm": 1.7477200306776974, + "language_loss": 0.7803607, + "learning_rate": 1.447431741055314e-06, + "loss": 0.80178702, + "num_input_tokens_seen": 215414355, + "step": 10003, + "time_per_iteration": 2.717195987701416 + }, + { + "auxiliary_loss_clip": 0.01119246, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.04236484, + "balance_loss_mlp": 1.02104282, + "epoch": 0.6014730196903653, + "flos": 24820916630400.0, + "grad_norm": 2.556614535664238, + "language_loss": 0.77315271, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.79469103, + "num_input_tokens_seen": 215428280, + "step": 10004, + "time_per_iteration": 2.7323880195617676 + }, + { + "auxiliary_loss_clip": 0.01103784, + "auxiliary_loss_mlp": 0.01030419, + "balance_loss_clip": 1.04035211, + "balance_loss_mlp": 1.01653695, + "epoch": 0.6015331429430332, + "flos": 23112359331840.0, + "grad_norm": 1.5669492652896668, + "language_loss": 0.72698373, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.74832577, + "num_input_tokens_seen": 215448970, + "step": 10005, + "time_per_iteration": 2.6966609954833984 + }, + { + "auxiliary_loss_clip": 0.0111171, + "auxiliary_loss_mlp": 0.01028792, + "balance_loss_clip": 1.03977418, + "balance_loss_mlp": 1.01611972, + "epoch": 0.6015932661957012, + "flos": 19201363706880.0, + "grad_norm": 2.133433515954987, + "language_loss": 0.7512781, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.77268308, + "num_input_tokens_seen": 215465260, + "step": 10006, + "time_per_iteration": 2.5414936542510986 + }, + { + "auxiliary_loss_clip": 0.01089042, + "auxiliary_loss_mlp": 0.01034372, + "balance_loss_clip": 1.03682578, + "balance_loss_mlp": 1.0206567, + "epoch": 0.6016533894483691, + "flos": 18113630100480.0, + "grad_norm": 2.222329085457676, + "language_loss": 0.73606133, + "learning_rate": 1.445934699732685e-06, + "loss": 0.75729549, + "num_input_tokens_seen": 215482725, + "step": 10007, + "time_per_iteration": 2.7956955432891846 + }, + { + "auxiliary_loss_clip": 0.0109466, + "auxiliary_loss_mlp": 0.01027479, + "balance_loss_clip": 1.04082942, + "balance_loss_mlp": 1.0153439, + "epoch": 0.6017135127010371, + "flos": 16216900427520.0, + "grad_norm": 1.6405140373840412, + "language_loss": 0.69996077, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.72118211, + "num_input_tokens_seen": 215500420, + "step": 10008, + "time_per_iteration": 2.740049362182617 + }, + { + "auxiliary_loss_clip": 0.01104877, + "auxiliary_loss_mlp": 0.01024718, + "balance_loss_clip": 1.03994632, + "balance_loss_mlp": 1.01252937, + "epoch": 0.6017736359537051, + "flos": 23444246021760.0, + "grad_norm": 1.594791938839471, + "language_loss": 0.76377881, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.78507471, + "num_input_tokens_seen": 215522260, + "step": 10009, + "time_per_iteration": 2.6797382831573486 + }, + { + "auxiliary_loss_clip": 0.0109029, + "auxiliary_loss_mlp": 0.00770516, + "balance_loss_clip": 1.03898764, + "balance_loss_mlp": 1.00010157, + "epoch": 0.601833759206373, + "flos": 23514056104320.0, + "grad_norm": 1.9797273750165876, + "language_loss": 0.74202949, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.76063752, + "num_input_tokens_seen": 215541715, + "step": 10010, + "time_per_iteration": 2.7184016704559326 + }, + { + "auxiliary_loss_clip": 0.01028511, + "auxiliary_loss_mlp": 0.00998357, + "balance_loss_clip": 1.01324391, + "balance_loss_mlp": 0.99717093, + "epoch": 0.6018938824590411, + "flos": 63991668648960.0, + "grad_norm": 0.8045921055289736, + "language_loss": 0.55051792, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.57078665, + "num_input_tokens_seen": 215603020, + "step": 10011, + "time_per_iteration": 3.2024238109588623 + }, + { + "auxiliary_loss_clip": 0.01107806, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.04110157, + "balance_loss_mlp": 1.02186561, + "epoch": 0.601954005711709, + "flos": 34640007131520.0, + "grad_norm": 1.3534958886387711, + "language_loss": 0.62085426, + "learning_rate": 1.44406387091556e-06, + "loss": 0.64227581, + "num_input_tokens_seen": 215625115, + "step": 10012, + "time_per_iteration": 2.756197452545166 + }, + { + "auxiliary_loss_clip": 0.01074106, + "auxiliary_loss_mlp": 0.01028149, + "balance_loss_clip": 1.03729844, + "balance_loss_mlp": 1.01547122, + "epoch": 0.602014128964377, + "flos": 19427062815360.0, + "grad_norm": 2.02443112791839, + "language_loss": 0.74996275, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.77098525, + "num_input_tokens_seen": 215643730, + "step": 10013, + "time_per_iteration": 2.718114137649536 + }, + { + "auxiliary_loss_clip": 0.01109921, + "auxiliary_loss_mlp": 0.01028766, + "balance_loss_clip": 1.03983474, + "balance_loss_mlp": 1.01669037, + "epoch": 0.6020742522170449, + "flos": 28329389470080.0, + "grad_norm": 1.6563944160673858, + "language_loss": 0.81454921, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.83593607, + "num_input_tokens_seen": 215664425, + "step": 10014, + "time_per_iteration": 2.6359105110168457 + }, + { + "auxiliary_loss_clip": 0.01089157, + "auxiliary_loss_mlp": 0.01030481, + "balance_loss_clip": 1.037884, + "balance_loss_mlp": 1.01777339, + "epoch": 0.6021343754697129, + "flos": 22747040058240.0, + "grad_norm": 1.540720048754759, + "language_loss": 0.72213233, + "learning_rate": 1.442941626485624e-06, + "loss": 0.74332869, + "num_input_tokens_seen": 215684280, + "step": 10015, + "time_per_iteration": 2.7502388954162598 + }, + { + "auxiliary_loss_clip": 0.01020446, + "auxiliary_loss_mlp": 0.01001943, + "balance_loss_clip": 1.01667976, + "balance_loss_mlp": 1.00080478, + "epoch": 0.6021944987223808, + "flos": 65752007402880.0, + "grad_norm": 0.8150448703202539, + "language_loss": 0.5473066, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.56753051, + "num_input_tokens_seen": 215739780, + "step": 10016, + "time_per_iteration": 4.701697826385498 + }, + { + "auxiliary_loss_clip": 0.01094661, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.04152966, + "balance_loss_mlp": 1.01637208, + "epoch": 0.6022546219750489, + "flos": 16105182151680.0, + "grad_norm": 1.5504792190081969, + "language_loss": 0.82899499, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.85023677, + "num_input_tokens_seen": 215757885, + "step": 10017, + "time_per_iteration": 2.636793851852417 + }, + { + "auxiliary_loss_clip": 0.01091797, + "auxiliary_loss_mlp": 0.01031972, + "balance_loss_clip": 1.0407809, + "balance_loss_mlp": 1.01946664, + "epoch": 0.6023147452277168, + "flos": 25512555985920.0, + "grad_norm": 1.7715837391634046, + "language_loss": 0.83621204, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.85744977, + "num_input_tokens_seen": 215776415, + "step": 10018, + "time_per_iteration": 6.060548543930054 + }, + { + "auxiliary_loss_clip": 0.01093456, + "auxiliary_loss_mlp": 0.01038236, + "balance_loss_clip": 1.03801382, + "balance_loss_mlp": 1.02431202, + "epoch": 0.6023748684803848, + "flos": 22636075968000.0, + "grad_norm": 1.733441285539822, + "language_loss": 0.78400528, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.80532229, + "num_input_tokens_seen": 215794865, + "step": 10019, + "time_per_iteration": 2.6781299114227295 + }, + { + "auxiliary_loss_clip": 0.01075209, + "auxiliary_loss_mlp": 0.00770827, + "balance_loss_clip": 1.03914475, + "balance_loss_mlp": 1.00017881, + "epoch": 0.6024349917330527, + "flos": 26210444307840.0, + "grad_norm": 2.381543125857722, + "language_loss": 0.73964417, + "learning_rate": 1.441071641765681e-06, + "loss": 0.7581045, + "num_input_tokens_seen": 215816840, + "step": 10020, + "time_per_iteration": 2.7956390380859375 + }, + { + "auxiliary_loss_clip": 0.01095191, + "auxiliary_loss_mlp": 0.01033486, + "balance_loss_clip": 1.04020286, + "balance_loss_mlp": 1.0205282, + "epoch": 0.6024951149857207, + "flos": 21251755762560.0, + "grad_norm": 2.1093873668761765, + "language_loss": 0.64171422, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.663001, + "num_input_tokens_seen": 215836100, + "step": 10021, + "time_per_iteration": 4.23021388053894 + }, + { + "auxiliary_loss_clip": 0.01102751, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.03910637, + "balance_loss_mlp": 1.01996362, + "epoch": 0.6025552382383887, + "flos": 26943453152640.0, + "grad_norm": 1.41151849166176, + "language_loss": 0.80664903, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.82801056, + "num_input_tokens_seen": 215858480, + "step": 10022, + "time_per_iteration": 2.6966497898101807 + }, + { + "auxiliary_loss_clip": 0.0110378, + "auxiliary_loss_mlp": 0.01030649, + "balance_loss_clip": 1.04190755, + "balance_loss_mlp": 1.0179832, + "epoch": 0.6026153614910567, + "flos": 31684379495040.0, + "grad_norm": 1.480979872703277, + "language_loss": 0.66483712, + "learning_rate": 1.439949905155693e-06, + "loss": 0.68618143, + "num_input_tokens_seen": 215879950, + "step": 10023, + "time_per_iteration": 2.691399574279785 + }, + { + "auxiliary_loss_clip": 0.01104501, + "auxiliary_loss_mlp": 0.01032886, + "balance_loss_clip": 1.03789723, + "balance_loss_mlp": 1.02022552, + "epoch": 0.6026754847437247, + "flos": 29312731175040.0, + "grad_norm": 2.162444553659901, + "language_loss": 0.74503481, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.76640868, + "num_input_tokens_seen": 215899830, + "step": 10024, + "time_per_iteration": 2.7364046573638916 + }, + { + "auxiliary_loss_clip": 0.01104535, + "auxiliary_loss_mlp": 0.01036059, + "balance_loss_clip": 1.04094052, + "balance_loss_mlp": 1.02257693, + "epoch": 0.6027356079963926, + "flos": 23586775188480.0, + "grad_norm": 1.6406938647308078, + "language_loss": 0.72738647, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.74879241, + "num_input_tokens_seen": 215920440, + "step": 10025, + "time_per_iteration": 2.6431972980499268 + }, + { + "auxiliary_loss_clip": 0.01119748, + "auxiliary_loss_mlp": 0.01037727, + "balance_loss_clip": 1.04081619, + "balance_loss_mlp": 1.02376747, + "epoch": 0.6027957312490606, + "flos": 20813753318400.0, + "grad_norm": 2.306954455043105, + "language_loss": 0.6677472, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.68932194, + "num_input_tokens_seen": 215940535, + "step": 10026, + "time_per_iteration": 2.6187641620635986 + }, + { + "auxiliary_loss_clip": 0.0110922, + "auxiliary_loss_mlp": 0.01036818, + "balance_loss_clip": 1.03789234, + "balance_loss_mlp": 1.02471268, + "epoch": 0.6028558545017285, + "flos": 19935773182080.0, + "grad_norm": 1.830391575126131, + "language_loss": 0.80050242, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.82196277, + "num_input_tokens_seen": 215958045, + "step": 10027, + "time_per_iteration": 2.576110601425171 + }, + { + "auxiliary_loss_clip": 0.01081954, + "auxiliary_loss_mlp": 0.01036412, + "balance_loss_clip": 1.03722823, + "balance_loss_mlp": 1.02301311, + "epoch": 0.6029159777543965, + "flos": 22820836550400.0, + "grad_norm": 2.0223053255723236, + "language_loss": 0.70934105, + "learning_rate": 1.438080769071171e-06, + "loss": 0.73052478, + "num_input_tokens_seen": 215977330, + "step": 10028, + "time_per_iteration": 2.7288432121276855 + }, + { + "auxiliary_loss_clip": 0.01084702, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.04540849, + "balance_loss_mlp": 1.02254987, + "epoch": 0.6029761010070644, + "flos": 23587242065280.0, + "grad_norm": 2.1142238314038595, + "language_loss": 0.84057522, + "learning_rate": 1.437707005721669e-06, + "loss": 0.86177796, + "num_input_tokens_seen": 215997865, + "step": 10029, + "time_per_iteration": 2.7901382446289062 + }, + { + "auxiliary_loss_clip": 0.0109278, + "auxiliary_loss_mlp": 0.01032236, + "balance_loss_clip": 1.0393126, + "balance_loss_mlp": 1.0201664, + "epoch": 0.6030362242597325, + "flos": 13662430859520.0, + "grad_norm": 2.2431865033670744, + "language_loss": 0.79994917, + "learning_rate": 1.437333263694373e-06, + "loss": 0.82119942, + "num_input_tokens_seen": 216016230, + "step": 10030, + "time_per_iteration": 2.780527114868164 + }, + { + "auxiliary_loss_clip": 0.01048723, + "auxiliary_loss_mlp": 0.01042121, + "balance_loss_clip": 1.03655624, + "balance_loss_mlp": 1.02806032, + "epoch": 0.6030963475124004, + "flos": 24422883045120.0, + "grad_norm": 1.9455803489075072, + "language_loss": 0.71241331, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.73332179, + "num_input_tokens_seen": 216035785, + "step": 10031, + "time_per_iteration": 2.8193559646606445 + }, + { + "auxiliary_loss_clip": 0.0107322, + "auxiliary_loss_mlp": 0.01037048, + "balance_loss_clip": 1.0378077, + "balance_loss_mlp": 1.02281427, + "epoch": 0.6031564707650684, + "flos": 29644043247360.0, + "grad_norm": 2.2622695973651834, + "language_loss": 0.72744608, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.74854881, + "num_input_tokens_seen": 216059555, + "step": 10032, + "time_per_iteration": 2.8426249027252197 + }, + { + "auxiliary_loss_clip": 0.0110112, + "auxiliary_loss_mlp": 0.01034912, + "balance_loss_clip": 1.04412532, + "balance_loss_mlp": 1.02163815, + "epoch": 0.6032165940177363, + "flos": 16618776768000.0, + "grad_norm": 1.8175959049184216, + "language_loss": 0.68774295, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.70910323, + "num_input_tokens_seen": 216077235, + "step": 10033, + "time_per_iteration": 2.700209379196167 + }, + { + "auxiliary_loss_clip": 0.01089272, + "auxiliary_loss_mlp": 0.01037723, + "balance_loss_clip": 1.04015613, + "balance_loss_mlp": 1.02396595, + "epoch": 0.6032767172704043, + "flos": 17488173553920.0, + "grad_norm": 2.115327938975923, + "language_loss": 0.7568332, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.77810311, + "num_input_tokens_seen": 216094985, + "step": 10034, + "time_per_iteration": 2.6627981662750244 + }, + { + "auxiliary_loss_clip": 0.01095189, + "auxiliary_loss_mlp": 0.01030387, + "balance_loss_clip": 1.04141998, + "balance_loss_mlp": 1.01710701, + "epoch": 0.6033368405230723, + "flos": 26832955939200.0, + "grad_norm": 3.2723425599009026, + "language_loss": 0.74862671, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.7698825, + "num_input_tokens_seen": 216115905, + "step": 10035, + "time_per_iteration": 2.8429391384124756 + }, + { + "auxiliary_loss_clip": 0.01082466, + "auxiliary_loss_mlp": 0.01027098, + "balance_loss_clip": 1.03635907, + "balance_loss_mlp": 1.0147779, + "epoch": 0.6033969637757403, + "flos": 16909904499840.0, + "grad_norm": 1.7778569727517832, + "language_loss": 0.8656829, + "learning_rate": 1.435091260090536e-06, + "loss": 0.88677853, + "num_input_tokens_seen": 216132420, + "step": 10036, + "time_per_iteration": 2.7539496421813965 + }, + { + "auxiliary_loss_clip": 0.0107738, + "auxiliary_loss_mlp": 0.01034344, + "balance_loss_clip": 1.03851438, + "balance_loss_mlp": 1.02084994, + "epoch": 0.6034570870284083, + "flos": 22930076787840.0, + "grad_norm": 1.8216360444833892, + "language_loss": 0.70128858, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.72240573, + "num_input_tokens_seen": 216149800, + "step": 10037, + "time_per_iteration": 2.6496496200561523 + }, + { + "auxiliary_loss_clip": 0.0109976, + "auxiliary_loss_mlp": 0.01037189, + "balance_loss_clip": 1.03967977, + "balance_loss_mlp": 1.02270496, + "epoch": 0.6035172102810762, + "flos": 23366319465600.0, + "grad_norm": 1.570748886934951, + "language_loss": 0.8512125, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.87258202, + "num_input_tokens_seen": 216168200, + "step": 10038, + "time_per_iteration": 2.6828958988189697 + }, + { + "auxiliary_loss_clip": 0.01098827, + "auxiliary_loss_mlp": 0.01034673, + "balance_loss_clip": 1.04050255, + "balance_loss_mlp": 1.02128005, + "epoch": 0.6035773335337442, + "flos": 20887082933760.0, + "grad_norm": 2.3203593434406242, + "language_loss": 0.76504898, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.78638399, + "num_input_tokens_seen": 216187105, + "step": 10039, + "time_per_iteration": 2.6590511798858643 + }, + { + "auxiliary_loss_clip": 0.01102907, + "auxiliary_loss_mlp": 0.01031911, + "balance_loss_clip": 1.03922081, + "balance_loss_mlp": 1.019382, + "epoch": 0.6036374567864121, + "flos": 24936298093440.0, + "grad_norm": 1.8339871345285923, + "language_loss": 0.71111763, + "learning_rate": 1.433597019260301e-06, + "loss": 0.73246586, + "num_input_tokens_seen": 216205440, + "step": 10040, + "time_per_iteration": 2.6712801456451416 + }, + { + "auxiliary_loss_clip": 0.01109688, + "auxiliary_loss_mlp": 0.01031148, + "balance_loss_clip": 1.04312241, + "balance_loss_mlp": 1.01598454, + "epoch": 0.6036975800390801, + "flos": 23148269953920.0, + "grad_norm": 2.0137364812654166, + "language_loss": 0.78602934, + "learning_rate": 1.433223512712475e-06, + "loss": 0.80743772, + "num_input_tokens_seen": 216223130, + "step": 10041, + "time_per_iteration": 2.670166015625 + }, + { + "auxiliary_loss_clip": 0.01096185, + "auxiliary_loss_mlp": 0.01029552, + "balance_loss_clip": 1.04166305, + "balance_loss_mlp": 1.01649821, + "epoch": 0.603757703291748, + "flos": 18660729127680.0, + "grad_norm": 1.7274066455029002, + "language_loss": 0.75525141, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.77650881, + "num_input_tokens_seen": 216240260, + "step": 10042, + "time_per_iteration": 2.6106081008911133 + }, + { + "auxiliary_loss_clip": 0.0106962, + "auxiliary_loss_mlp": 0.01029029, + "balance_loss_clip": 1.03727007, + "balance_loss_mlp": 1.01701236, + "epoch": 0.6038178265444161, + "flos": 19682603147520.0, + "grad_norm": 1.9258503206144206, + "language_loss": 0.84721899, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.86820555, + "num_input_tokens_seen": 216258510, + "step": 10043, + "time_per_iteration": 2.71673846244812 + }, + { + "auxiliary_loss_clip": 0.01081507, + "auxiliary_loss_mlp": 0.01040859, + "balance_loss_clip": 1.03832972, + "balance_loss_mlp": 1.02579701, + "epoch": 0.603877949797084, + "flos": 22638230784000.0, + "grad_norm": 1.8215258720973655, + "language_loss": 0.70104671, + "learning_rate": 1.432103122078974e-06, + "loss": 0.72227025, + "num_input_tokens_seen": 216277550, + "step": 10044, + "time_per_iteration": 2.7252089977264404 + }, + { + "auxiliary_loss_clip": 0.01106435, + "auxiliary_loss_mlp": 0.01032617, + "balance_loss_clip": 1.04218245, + "balance_loss_mlp": 1.01826382, + "epoch": 0.603938073049752, + "flos": 25447881548160.0, + "grad_norm": 1.9233339181851183, + "language_loss": 0.78067368, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.80206418, + "num_input_tokens_seen": 216296690, + "step": 10045, + "time_per_iteration": 2.6885697841644287 + }, + { + "auxiliary_loss_clip": 0.01071663, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.04522324, + "balance_loss_mlp": 1.02084029, + "epoch": 0.6039981963024199, + "flos": 22340135813760.0, + "grad_norm": 1.7431481861658145, + "language_loss": 0.77048129, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.79154372, + "num_input_tokens_seen": 216316110, + "step": 10046, + "time_per_iteration": 2.762124538421631 + }, + { + "auxiliary_loss_clip": 0.01061952, + "auxiliary_loss_mlp": 0.01040938, + "balance_loss_clip": 1.03495252, + "balance_loss_mlp": 1.02685905, + "epoch": 0.6040583195550879, + "flos": 20703148364160.0, + "grad_norm": 1.791420221750128, + "language_loss": 0.87246406, + "learning_rate": 1.430982925257827e-06, + "loss": 0.893493, + "num_input_tokens_seen": 216333855, + "step": 10047, + "time_per_iteration": 2.7445127964019775 + }, + { + "auxiliary_loss_clip": 0.01104302, + "auxiliary_loss_mlp": 0.01030965, + "balance_loss_clip": 1.04149449, + "balance_loss_mlp": 1.01879954, + "epoch": 0.604118442807756, + "flos": 27163118776320.0, + "grad_norm": 1.4945345993269403, + "language_loss": 0.75776327, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.77911592, + "num_input_tokens_seen": 216354890, + "step": 10048, + "time_per_iteration": 2.730748414993286 + }, + { + "auxiliary_loss_clip": 0.01108329, + "auxiliary_loss_mlp": 0.01044251, + "balance_loss_clip": 1.04174399, + "balance_loss_mlp": 1.02869403, + "epoch": 0.6041785660604239, + "flos": 30881524654080.0, + "grad_norm": 2.2243998349441183, + "language_loss": 0.66556633, + "learning_rate": 1.430236235239386e-06, + "loss": 0.68709219, + "num_input_tokens_seen": 216376055, + "step": 10049, + "time_per_iteration": 2.6866142749786377 + }, + { + "auxiliary_loss_clip": 0.01089915, + "auxiliary_loss_mlp": 0.01042714, + "balance_loss_clip": 1.03830862, + "balance_loss_mlp": 1.02849865, + "epoch": 0.6042386893130919, + "flos": 19938215306880.0, + "grad_norm": 1.639569270992707, + "language_loss": 0.66928005, + "learning_rate": 1.429862922631336e-06, + "loss": 0.69060636, + "num_input_tokens_seen": 216396295, + "step": 10050, + "time_per_iteration": 2.744527816772461 + }, + { + "auxiliary_loss_clip": 0.01083354, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.03962123, + "balance_loss_mlp": 1.02269185, + "epoch": 0.6042988125657598, + "flos": 32415915882240.0, + "grad_norm": 1.7210161547813447, + "language_loss": 0.6963383, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.71753216, + "num_input_tokens_seen": 216416605, + "step": 10051, + "time_per_iteration": 2.820204734802246 + }, + { + "auxiliary_loss_clip": 0.01100825, + "auxiliary_loss_mlp": 0.01032129, + "balance_loss_clip": 1.03741777, + "balance_loss_mlp": 1.01908135, + "epoch": 0.6043589358184278, + "flos": 17420805596160.0, + "grad_norm": 2.3541607325849987, + "language_loss": 0.64901161, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.67034107, + "num_input_tokens_seen": 216435130, + "step": 10052, + "time_per_iteration": 2.682201385498047 + }, + { + "auxiliary_loss_clip": 0.01094174, + "auxiliary_loss_mlp": 0.01034325, + "balance_loss_clip": 1.0397222, + "balance_loss_mlp": 1.0204432, + "epoch": 0.6044190590710957, + "flos": 27672834723840.0, + "grad_norm": 1.5756389367941481, + "language_loss": 0.69104528, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.71233022, + "num_input_tokens_seen": 216455640, + "step": 10053, + "time_per_iteration": 2.8296010494232178 + }, + { + "auxiliary_loss_clip": 0.01018297, + "auxiliary_loss_mlp": 0.01003475, + "balance_loss_clip": 1.01298642, + "balance_loss_mlp": 1.0022707, + "epoch": 0.6044791823237637, + "flos": 65316267515520.0, + "grad_norm": 0.7275681454160189, + "language_loss": 0.60339212, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.62360984, + "num_input_tokens_seen": 216518130, + "step": 10054, + "time_per_iteration": 3.3135299682617188 + }, + { + "auxiliary_loss_clip": 0.01055185, + "auxiliary_loss_mlp": 0.01033215, + "balance_loss_clip": 1.03634906, + "balance_loss_mlp": 1.019768, + "epoch": 0.6045393055764317, + "flos": 24492369905280.0, + "grad_norm": 1.5749604097549974, + "language_loss": 0.8565892, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.87747318, + "num_input_tokens_seen": 216536845, + "step": 10055, + "time_per_iteration": 2.803851842880249 + }, + { + "auxiliary_loss_clip": 0.0109594, + "auxiliary_loss_mlp": 0.01048723, + "balance_loss_clip": 1.04159987, + "balance_loss_mlp": 1.03321385, + "epoch": 0.6045994288290997, + "flos": 19054345340160.0, + "grad_norm": 2.24817202299257, + "language_loss": 0.74068117, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.76212776, + "num_input_tokens_seen": 216551860, + "step": 10056, + "time_per_iteration": 4.305849313735962 + }, + { + "auxiliary_loss_clip": 0.01073635, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.03811693, + "balance_loss_mlp": 1.02309358, + "epoch": 0.6046595520817676, + "flos": 26576697335040.0, + "grad_norm": 1.7955377697616153, + "language_loss": 0.80028808, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.82138407, + "num_input_tokens_seen": 216574775, + "step": 10057, + "time_per_iteration": 5.891208648681641 + }, + { + "auxiliary_loss_clip": 0.0111396, + "auxiliary_loss_mlp": 0.00770338, + "balance_loss_clip": 1.04094028, + "balance_loss_mlp": 1.00008702, + "epoch": 0.6047196753344356, + "flos": 13582277660160.0, + "grad_norm": 2.047185386836812, + "language_loss": 0.75578213, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.77462518, + "num_input_tokens_seen": 216590100, + "step": 10058, + "time_per_iteration": 2.6869444847106934 + }, + { + "auxiliary_loss_clip": 0.0110179, + "auxiliary_loss_mlp": 0.01030868, + "balance_loss_clip": 1.03934133, + "balance_loss_mlp": 1.01811314, + "epoch": 0.6047797985871035, + "flos": 25520456977920.0, + "grad_norm": 1.9889135378311975, + "language_loss": 0.70937455, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.73070109, + "num_input_tokens_seen": 216610145, + "step": 10059, + "time_per_iteration": 2.7275924682617188 + }, + { + "auxiliary_loss_clip": 0.01092569, + "auxiliary_loss_mlp": 0.0103084, + "balance_loss_clip": 1.03944898, + "balance_loss_mlp": 1.01753664, + "epoch": 0.6048399218397715, + "flos": 20520147548160.0, + "grad_norm": 1.7704655084920065, + "language_loss": 0.76338398, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.78461802, + "num_input_tokens_seen": 216630625, + "step": 10060, + "time_per_iteration": 4.274925470352173 + }, + { + "auxiliary_loss_clip": 0.01104515, + "auxiliary_loss_mlp": 0.01034248, + "balance_loss_clip": 1.03981733, + "balance_loss_mlp": 1.02105165, + "epoch": 0.6049000450924396, + "flos": 20408788408320.0, + "grad_norm": 1.9626551853189032, + "language_loss": 0.73588789, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.75727558, + "num_input_tokens_seen": 216649255, + "step": 10061, + "time_per_iteration": 2.6950912475585938 + }, + { + "auxiliary_loss_clip": 0.01076727, + "auxiliary_loss_mlp": 0.00771397, + "balance_loss_clip": 1.04075074, + "balance_loss_mlp": 1.00014019, + "epoch": 0.6049601683451075, + "flos": 20741357456640.0, + "grad_norm": 2.92695225177956, + "language_loss": 0.67823231, + "learning_rate": 1.425384861715639e-06, + "loss": 0.69671357, + "num_input_tokens_seen": 216668100, + "step": 10062, + "time_per_iteration": 2.7427420616149902 + }, + { + "auxiliary_loss_clip": 0.01099001, + "auxiliary_loss_mlp": 0.010396, + "balance_loss_clip": 1.03907073, + "balance_loss_mlp": 1.02500868, + "epoch": 0.6050202915977755, + "flos": 20083114771200.0, + "grad_norm": 2.0011992400768173, + "language_loss": 0.71559471, + "learning_rate": 1.425011831266978e-06, + "loss": 0.73698068, + "num_input_tokens_seen": 216686125, + "step": 10063, + "time_per_iteration": 2.652628183364868 + }, + { + "auxiliary_loss_clip": 0.01111808, + "auxiliary_loss_mlp": 0.01037973, + "balance_loss_clip": 1.03926516, + "balance_loss_mlp": 1.02516413, + "epoch": 0.6050804148504434, + "flos": 15960821391360.0, + "grad_norm": 1.8208827458989, + "language_loss": 0.84698188, + "learning_rate": 1.424638822621926e-06, + "loss": 0.86847973, + "num_input_tokens_seen": 216704265, + "step": 10064, + "time_per_iteration": 2.6407761573791504 + }, + { + "auxiliary_loss_clip": 0.01105098, + "auxiliary_loss_mlp": 0.01032994, + "balance_loss_clip": 1.04044116, + "balance_loss_mlp": 1.01974392, + "epoch": 0.6051405381031114, + "flos": 17456644391040.0, + "grad_norm": 2.095191883416591, + "language_loss": 0.79596299, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.81734389, + "num_input_tokens_seen": 216721765, + "step": 10065, + "time_per_iteration": 2.633913040161133 + }, + { + "auxiliary_loss_clip": 0.01067386, + "auxiliary_loss_mlp": 0.01033127, + "balance_loss_clip": 1.03866124, + "balance_loss_mlp": 1.0181725, + "epoch": 0.6052006613557793, + "flos": 11400130517760.0, + "grad_norm": 2.398871193370657, + "language_loss": 0.78276229, + "learning_rate": 1.423892870799226e-06, + "loss": 0.80376744, + "num_input_tokens_seen": 216738295, + "step": 10066, + "time_per_iteration": 2.729074001312256 + }, + { + "auxiliary_loss_clip": 0.01059487, + "auxiliary_loss_mlp": 0.01033515, + "balance_loss_clip": 1.03963447, + "balance_loss_mlp": 1.01981831, + "epoch": 0.6052607846084473, + "flos": 24750998807040.0, + "grad_norm": 1.7528217462877862, + "language_loss": 0.7308799, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.75180995, + "num_input_tokens_seen": 216759875, + "step": 10067, + "time_per_iteration": 2.81003999710083 + }, + { + "auxiliary_loss_clip": 0.01094022, + "auxiliary_loss_mlp": 0.00770796, + "balance_loss_clip": 1.04127932, + "balance_loss_mlp": 1.00018191, + "epoch": 0.6053209078611153, + "flos": 20741141975040.0, + "grad_norm": 1.357631083448857, + "language_loss": 0.68994391, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.70859212, + "num_input_tokens_seen": 216780705, + "step": 10068, + "time_per_iteration": 2.7258529663085938 + }, + { + "auxiliary_loss_clip": 0.010988, + "auxiliary_loss_mlp": 0.01031117, + "balance_loss_clip": 1.03859472, + "balance_loss_mlp": 1.01821876, + "epoch": 0.6053810311137833, + "flos": 18953149749120.0, + "grad_norm": 3.7091992376991096, + "language_loss": 0.870857, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.89215624, + "num_input_tokens_seen": 216797625, + "step": 10069, + "time_per_iteration": 2.57892107963562 + }, + { + "auxiliary_loss_clip": 0.01081389, + "auxiliary_loss_mlp": 0.01029042, + "balance_loss_clip": 1.03757524, + "balance_loss_mlp": 1.01611388, + "epoch": 0.6054411543664512, + "flos": 23951124794880.0, + "grad_norm": 1.6595378120531261, + "language_loss": 0.83174849, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.85285282, + "num_input_tokens_seen": 216817610, + "step": 10070, + "time_per_iteration": 2.7172200679779053 + }, + { + "auxiliary_loss_clip": 0.01100339, + "auxiliary_loss_mlp": 0.01034986, + "balance_loss_clip": 1.04162169, + "balance_loss_mlp": 1.02189064, + "epoch": 0.6055012776191192, + "flos": 20593979953920.0, + "grad_norm": 1.9849870448156475, + "language_loss": 0.85964417, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.88099742, + "num_input_tokens_seen": 216836835, + "step": 10071, + "time_per_iteration": 2.677682638168335 + }, + { + "auxiliary_loss_clip": 0.01109082, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.04172432, + "balance_loss_mlp": 1.02102959, + "epoch": 0.6055614008717871, + "flos": 30298191782400.0, + "grad_norm": 1.8218197035918635, + "language_loss": 0.77151179, + "learning_rate": 1.421655540088603e-06, + "loss": 0.79295927, + "num_input_tokens_seen": 216856760, + "step": 10072, + "time_per_iteration": 2.806692123413086 + }, + { + "auxiliary_loss_clip": 0.01094577, + "auxiliary_loss_mlp": 0.01028599, + "balance_loss_clip": 1.0381639, + "balance_loss_mlp": 1.01447272, + "epoch": 0.6056215241244551, + "flos": 27125017424640.0, + "grad_norm": 1.5487316274587832, + "language_loss": 0.74428165, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.76551342, + "num_input_tokens_seen": 216878795, + "step": 10073, + "time_per_iteration": 2.746279239654541 + }, + { + "auxiliary_loss_clip": 0.00997245, + "auxiliary_loss_mlp": 0.01001533, + "balance_loss_clip": 1.01025248, + "balance_loss_mlp": 1.00035894, + "epoch": 0.6056816473771232, + "flos": 56007323925120.0, + "grad_norm": 0.7538510449367495, + "language_loss": 0.55113828, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.57112598, + "num_input_tokens_seen": 216937800, + "step": 10074, + "time_per_iteration": 3.3036320209503174 + }, + { + "auxiliary_loss_clip": 0.01075201, + "auxiliary_loss_mlp": 0.01042355, + "balance_loss_clip": 1.03847015, + "balance_loss_mlp": 1.02714372, + "epoch": 0.6057417706297911, + "flos": 23549499849600.0, + "grad_norm": 1.7766669021243995, + "language_loss": 0.81689596, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.83807153, + "num_input_tokens_seen": 216955280, + "step": 10075, + "time_per_iteration": 2.731048583984375 + }, + { + "auxiliary_loss_clip": 0.01107881, + "auxiliary_loss_mlp": 0.01025575, + "balance_loss_clip": 1.04031885, + "balance_loss_mlp": 1.01165175, + "epoch": 0.6058018938824591, + "flos": 27744296832000.0, + "grad_norm": 1.740054911914685, + "language_loss": 0.77907681, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.80041134, + "num_input_tokens_seen": 216976950, + "step": 10076, + "time_per_iteration": 2.6934380531311035 + }, + { + "auxiliary_loss_clip": 0.01106108, + "auxiliary_loss_mlp": 0.01036344, + "balance_loss_clip": 1.03907084, + "balance_loss_mlp": 1.02240217, + "epoch": 0.605862017135127, + "flos": 22783381643520.0, + "grad_norm": 1.6512555736365901, + "language_loss": 0.72421932, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.74564385, + "num_input_tokens_seen": 216996945, + "step": 10077, + "time_per_iteration": 2.6461181640625 + }, + { + "auxiliary_loss_clip": 0.01117207, + "auxiliary_loss_mlp": 0.0103146, + "balance_loss_clip": 1.04170644, + "balance_loss_mlp": 1.01762605, + "epoch": 0.605922140387795, + "flos": 21215019127680.0, + "grad_norm": 1.9059777517343863, + "language_loss": 0.55426162, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.57574832, + "num_input_tokens_seen": 217016580, + "step": 10078, + "time_per_iteration": 2.6261439323425293 + }, + { + "auxiliary_loss_clip": 0.01073319, + "auxiliary_loss_mlp": 0.01031331, + "balance_loss_clip": 1.03767908, + "balance_loss_mlp": 1.0181284, + "epoch": 0.6059822636404629, + "flos": 27268372604160.0, + "grad_norm": 1.6895659179757812, + "language_loss": 0.70538819, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.72643465, + "num_input_tokens_seen": 217037300, + "step": 10079, + "time_per_iteration": 2.830202102661133 + }, + { + "auxiliary_loss_clip": 0.01092187, + "auxiliary_loss_mlp": 0.01039196, + "balance_loss_clip": 1.03862, + "balance_loss_mlp": 1.02637529, + "epoch": 0.606042386893131, + "flos": 20631327120000.0, + "grad_norm": 1.6859252666783793, + "language_loss": 0.6267547, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.64806855, + "num_input_tokens_seen": 217055805, + "step": 10080, + "time_per_iteration": 2.6813855171203613 + }, + { + "auxiliary_loss_clip": 0.01094103, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.03858209, + "balance_loss_mlp": 1.01722753, + "epoch": 0.6061025101457989, + "flos": 23002293081600.0, + "grad_norm": 2.6314265017345613, + "language_loss": 0.71340102, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.73465574, + "num_input_tokens_seen": 217074175, + "step": 10081, + "time_per_iteration": 2.750216007232666 + }, + { + "auxiliary_loss_clip": 0.01091896, + "auxiliary_loss_mlp": 0.01029512, + "balance_loss_clip": 1.03969479, + "balance_loss_mlp": 1.01624978, + "epoch": 0.6061626333984669, + "flos": 29898937134720.0, + "grad_norm": 1.724175069330151, + "language_loss": 0.69190812, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.71312225, + "num_input_tokens_seen": 217095695, + "step": 10082, + "time_per_iteration": 2.7279422283172607 + }, + { + "auxiliary_loss_clip": 0.01117243, + "auxiliary_loss_mlp": 0.01032658, + "balance_loss_clip": 1.04338622, + "balance_loss_mlp": 1.01974726, + "epoch": 0.6062227566511348, + "flos": 25009196745600.0, + "grad_norm": 1.3736157370589637, + "language_loss": 0.65741009, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.67890906, + "num_input_tokens_seen": 217116260, + "step": 10083, + "time_per_iteration": 2.6431922912597656 + }, + { + "auxiliary_loss_clip": 0.01104697, + "auxiliary_loss_mlp": 0.01033772, + "balance_loss_clip": 1.03986526, + "balance_loss_mlp": 1.02053976, + "epoch": 0.6062828799038028, + "flos": 19463943104640.0, + "grad_norm": 1.8569136538666067, + "language_loss": 0.74291378, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.7642985, + "num_input_tokens_seen": 217134465, + "step": 10084, + "time_per_iteration": 2.693331003189087 + }, + { + "auxiliary_loss_clip": 0.01089491, + "auxiliary_loss_mlp": 0.01040114, + "balance_loss_clip": 1.03806448, + "balance_loss_mlp": 1.02654219, + "epoch": 0.6063430031564707, + "flos": 13589568120960.0, + "grad_norm": 14.01820477469797, + "language_loss": 0.72177935, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.74307537, + "num_input_tokens_seen": 217149920, + "step": 10085, + "time_per_iteration": 2.6207504272460938 + }, + { + "auxiliary_loss_clip": 0.01115179, + "auxiliary_loss_mlp": 0.0103546, + "balance_loss_clip": 1.04101026, + "balance_loss_mlp": 1.02325344, + "epoch": 0.6064031264091387, + "flos": 23255499029760.0, + "grad_norm": 1.9650382613535748, + "language_loss": 0.76113385, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.78264022, + "num_input_tokens_seen": 217168165, + "step": 10086, + "time_per_iteration": 2.6350982189178467 + }, + { + "auxiliary_loss_clip": 0.01079834, + "auxiliary_loss_mlp": 0.01033915, + "balance_loss_clip": 1.03654695, + "balance_loss_mlp": 1.02082586, + "epoch": 0.6064632496618068, + "flos": 22458462192000.0, + "grad_norm": 1.6281495100420569, + "language_loss": 0.72623181, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.74736929, + "num_input_tokens_seen": 217190070, + "step": 10087, + "time_per_iteration": 2.7133493423461914 + }, + { + "auxiliary_loss_clip": 0.01101404, + "auxiliary_loss_mlp": 0.0103704, + "balance_loss_clip": 1.03922224, + "balance_loss_mlp": 1.02555394, + "epoch": 0.6065233729144747, + "flos": 25118652464640.0, + "grad_norm": 1.8336458297983596, + "language_loss": 0.83669853, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.85808301, + "num_input_tokens_seen": 217209370, + "step": 10088, + "time_per_iteration": 2.6913206577301025 + }, + { + "auxiliary_loss_clip": 0.01058404, + "auxiliary_loss_mlp": 0.00771924, + "balance_loss_clip": 1.03367972, + "balance_loss_mlp": 1.00013125, + "epoch": 0.6065834961671427, + "flos": 23477355383040.0, + "grad_norm": 2.41510818695702, + "language_loss": 0.7150932, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.73339653, + "num_input_tokens_seen": 217226990, + "step": 10089, + "time_per_iteration": 2.71724271774292 + }, + { + "auxiliary_loss_clip": 0.01104996, + "auxiliary_loss_mlp": 0.01039267, + "balance_loss_clip": 1.04092312, + "balance_loss_mlp": 1.02694106, + "epoch": 0.6066436194198106, + "flos": 17019396132480.0, + "grad_norm": 3.755310304725579, + "language_loss": 0.82807851, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.84952104, + "num_input_tokens_seen": 217244585, + "step": 10090, + "time_per_iteration": 2.600306510925293 + }, + { + "auxiliary_loss_clip": 0.01082916, + "auxiliary_loss_mlp": 0.01036874, + "balance_loss_clip": 1.04005432, + "balance_loss_mlp": 1.02280176, + "epoch": 0.6067037426724786, + "flos": 18514752255360.0, + "grad_norm": 2.395523786898732, + "language_loss": 0.75284386, + "learning_rate": 1.4145758826341e-06, + "loss": 0.77404171, + "num_input_tokens_seen": 217263435, + "step": 10091, + "time_per_iteration": 2.7555627822875977 + }, + { + "auxiliary_loss_clip": 0.0111346, + "auxiliary_loss_mlp": 0.01037619, + "balance_loss_clip": 1.04098213, + "balance_loss_mlp": 1.02436924, + "epoch": 0.6067638659251465, + "flos": 22345989730560.0, + "grad_norm": 1.5349996815844518, + "language_loss": 0.79607046, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.81758124, + "num_input_tokens_seen": 217283725, + "step": 10092, + "time_per_iteration": 2.607757568359375 + }, + { + "auxiliary_loss_clip": 0.01094482, + "auxiliary_loss_mlp": 0.01037242, + "balance_loss_clip": 1.03954625, + "balance_loss_mlp": 1.02349734, + "epoch": 0.6068239891778145, + "flos": 12451019748480.0, + "grad_norm": 1.7756923536136626, + "language_loss": 0.7618677, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.78318495, + "num_input_tokens_seen": 217301120, + "step": 10093, + "time_per_iteration": 2.730297327041626 + }, + { + "auxiliary_loss_clip": 0.01088328, + "auxiliary_loss_mlp": 0.01043446, + "balance_loss_clip": 1.0393225, + "balance_loss_mlp": 1.02897358, + "epoch": 0.6068841124304825, + "flos": 23185868515200.0, + "grad_norm": 1.8396370870528131, + "language_loss": 0.87565696, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.89697462, + "num_input_tokens_seen": 217319585, + "step": 10094, + "time_per_iteration": 2.7664146423339844 + }, + { + "auxiliary_loss_clip": 0.01107836, + "auxiliary_loss_mlp": 0.01029861, + "balance_loss_clip": 1.04203224, + "balance_loss_mlp": 1.01675439, + "epoch": 0.6069442356831505, + "flos": 18587902302720.0, + "grad_norm": 1.805883260375072, + "language_loss": 0.71895981, + "learning_rate": 1.413086446353919e-06, + "loss": 0.74033689, + "num_input_tokens_seen": 217338880, + "step": 10095, + "time_per_iteration": 2.610901355743408 + }, + { + "auxiliary_loss_clip": 0.01089454, + "auxiliary_loss_mlp": 0.01034743, + "balance_loss_clip": 1.03730071, + "balance_loss_mlp": 1.02213049, + "epoch": 0.6070043589358184, + "flos": 20960340721920.0, + "grad_norm": 1.8353844932279613, + "language_loss": 0.76935136, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.79059333, + "num_input_tokens_seen": 217357480, + "step": 10096, + "time_per_iteration": 5.823329925537109 + }, + { + "auxiliary_loss_clip": 0.01119601, + "auxiliary_loss_mlp": 0.01041655, + "balance_loss_clip": 1.04269695, + "balance_loss_mlp": 1.02889967, + "epoch": 0.6070644821884864, + "flos": 11692443398400.0, + "grad_norm": 2.030764189672632, + "language_loss": 0.80070782, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.82232034, + "num_input_tokens_seen": 217374575, + "step": 10097, + "time_per_iteration": 2.63212513923645 + }, + { + "auxiliary_loss_clip": 0.01090335, + "auxiliary_loss_mlp": 0.01032018, + "balance_loss_clip": 1.04231095, + "balance_loss_mlp": 1.01897645, + "epoch": 0.6071246054411543, + "flos": 19310568030720.0, + "grad_norm": 1.5236568833124404, + "language_loss": 0.67320025, + "learning_rate": 1.411969602780478e-06, + "loss": 0.69442379, + "num_input_tokens_seen": 217392950, + "step": 10098, + "time_per_iteration": 2.6840009689331055 + }, + { + "auxiliary_loss_clip": 0.01114691, + "auxiliary_loss_mlp": 0.01029516, + "balance_loss_clip": 1.04036307, + "balance_loss_mlp": 1.0169934, + "epoch": 0.6071847286938223, + "flos": 17749029098880.0, + "grad_norm": 2.4274073378556125, + "language_loss": 0.80730307, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.82874513, + "num_input_tokens_seen": 217412145, + "step": 10099, + "time_per_iteration": 2.5781733989715576 + }, + { + "auxiliary_loss_clip": 0.01085094, + "auxiliary_loss_mlp": 0.01039748, + "balance_loss_clip": 1.03784657, + "balance_loss_mlp": 1.02517462, + "epoch": 0.6072448519464904, + "flos": 22637512512000.0, + "grad_norm": 2.246118750219277, + "language_loss": 0.70420504, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.7254535, + "num_input_tokens_seen": 217432080, + "step": 10100, + "time_per_iteration": 4.310024738311768 + }, + { + "auxiliary_loss_clip": 0.01077866, + "auxiliary_loss_mlp": 0.01036569, + "balance_loss_clip": 1.03830409, + "balance_loss_mlp": 1.02207279, + "epoch": 0.6073049751991583, + "flos": 19537308633600.0, + "grad_norm": 1.6047311801163284, + "language_loss": 0.70821762, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.72936189, + "num_input_tokens_seen": 217450945, + "step": 10101, + "time_per_iteration": 2.726445198059082 + }, + { + "auxiliary_loss_clip": 0.01084441, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.03571582, + "balance_loss_mlp": 1.02082181, + "epoch": 0.6073650984518263, + "flos": 28294233033600.0, + "grad_norm": 2.197032989023165, + "language_loss": 0.69728243, + "learning_rate": 1.410480790256154e-06, + "loss": 0.71846825, + "num_input_tokens_seen": 217473105, + "step": 10102, + "time_per_iteration": 2.7282192707061768 + }, + { + "auxiliary_loss_clip": 0.0111817, + "auxiliary_loss_mlp": 0.01035861, + "balance_loss_clip": 1.04134989, + "balance_loss_mlp": 1.0230341, + "epoch": 0.6074252217044942, + "flos": 25664422688640.0, + "grad_norm": 1.8635985471124068, + "language_loss": 0.73704481, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.7585851, + "num_input_tokens_seen": 217491780, + "step": 10103, + "time_per_iteration": 2.6332626342773438 + }, + { + "auxiliary_loss_clip": 0.01077723, + "auxiliary_loss_mlp": 0.01037617, + "balance_loss_clip": 1.04122865, + "balance_loss_mlp": 1.02333558, + "epoch": 0.6074853449571622, + "flos": 22857106308480.0, + "grad_norm": 1.5666292395017738, + "language_loss": 0.76782012, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.78897351, + "num_input_tokens_seen": 217510605, + "step": 10104, + "time_per_iteration": 2.7046008110046387 + }, + { + "auxiliary_loss_clip": 0.01012823, + "auxiliary_loss_mlp": 0.01009652, + "balance_loss_clip": 1.01738811, + "balance_loss_mlp": 1.00849557, + "epoch": 0.6075454682098301, + "flos": 67111406547840.0, + "grad_norm": 0.7409971394494129, + "language_loss": 0.55891275, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.57913756, + "num_input_tokens_seen": 217574815, + "step": 10105, + "time_per_iteration": 3.2526538372039795 + }, + { + "auxiliary_loss_clip": 0.01030607, + "auxiliary_loss_mlp": 0.01011283, + "balance_loss_clip": 1.01659429, + "balance_loss_mlp": 1.01022172, + "epoch": 0.6076055914624982, + "flos": 70712024751360.0, + "grad_norm": 0.768019180696257, + "language_loss": 0.56802553, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.58844441, + "num_input_tokens_seen": 217632375, + "step": 10106, + "time_per_iteration": 3.158289909362793 + }, + { + "auxiliary_loss_clip": 0.01063356, + "auxiliary_loss_mlp": 0.01035278, + "balance_loss_clip": 1.03482223, + "balance_loss_mlp": 1.02204537, + "epoch": 0.6076657147151661, + "flos": 28364545906560.0, + "grad_norm": 1.5438087958158528, + "language_loss": 0.68604589, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.7070322, + "num_input_tokens_seen": 217653055, + "step": 10107, + "time_per_iteration": 2.922015905380249 + }, + { + "auxiliary_loss_clip": 0.01104951, + "auxiliary_loss_mlp": 0.01029821, + "balance_loss_clip": 1.03881001, + "balance_loss_mlp": 1.01635098, + "epoch": 0.6077258379678341, + "flos": 15049767807360.0, + "grad_norm": 1.8478390173687478, + "language_loss": 0.81575567, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.83710343, + "num_input_tokens_seen": 217671520, + "step": 10108, + "time_per_iteration": 2.6345651149749756 + }, + { + "auxiliary_loss_clip": 0.01090498, + "auxiliary_loss_mlp": 0.01037826, + "balance_loss_clip": 1.03763413, + "balance_loss_mlp": 1.02306151, + "epoch": 0.607785961220502, + "flos": 36167251553280.0, + "grad_norm": 2.15332165440763, + "language_loss": 0.71337903, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.73466218, + "num_input_tokens_seen": 217691880, + "step": 10109, + "time_per_iteration": 2.874757766723633 + }, + { + "auxiliary_loss_clip": 0.01090295, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.03903341, + "balance_loss_mlp": 1.02175713, + "epoch": 0.60784608447317, + "flos": 22524249951360.0, + "grad_norm": 1.6052444437933584, + "language_loss": 0.79990447, + "learning_rate": 1.407504239132653e-06, + "loss": 0.82114512, + "num_input_tokens_seen": 217710530, + "step": 10110, + "time_per_iteration": 2.6963181495666504 + }, + { + "auxiliary_loss_clip": 0.01089001, + "auxiliary_loss_mlp": 0.01029745, + "balance_loss_clip": 1.03760231, + "balance_loss_mlp": 1.01529717, + "epoch": 0.6079062077258379, + "flos": 23841166285440.0, + "grad_norm": 2.270664246588292, + "language_loss": 0.70269084, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.72387832, + "num_input_tokens_seen": 217728650, + "step": 10111, + "time_per_iteration": 2.6903553009033203 + }, + { + "auxiliary_loss_clip": 0.01085414, + "auxiliary_loss_mlp": 0.010291, + "balance_loss_clip": 1.04066074, + "balance_loss_mlp": 1.01539087, + "epoch": 0.6079663309785059, + "flos": 23367037737600.0, + "grad_norm": 1.6556748056408641, + "language_loss": 0.65621054, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.67735571, + "num_input_tokens_seen": 217747135, + "step": 10112, + "time_per_iteration": 2.7705774307250977 + }, + { + "auxiliary_loss_clip": 0.01029897, + "auxiliary_loss_mlp": 0.00999602, + "balance_loss_clip": 1.01457083, + "balance_loss_mlp": 0.99854136, + "epoch": 0.6080264542311739, + "flos": 71382873110400.0, + "grad_norm": 0.6359208638260742, + "language_loss": 0.49526292, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.51555794, + "num_input_tokens_seen": 217811860, + "step": 10113, + "time_per_iteration": 3.2169973850250244 + }, + { + "auxiliary_loss_clip": 0.01030037, + "auxiliary_loss_mlp": 0.01000401, + "balance_loss_clip": 1.01493645, + "balance_loss_mlp": 0.99929249, + "epoch": 0.6080865774838419, + "flos": 66529833442560.0, + "grad_norm": 0.8386978497659568, + "language_loss": 0.56947362, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.58977795, + "num_input_tokens_seen": 217866510, + "step": 10114, + "time_per_iteration": 3.1260786056518555 + }, + { + "auxiliary_loss_clip": 0.01118489, + "auxiliary_loss_mlp": 0.01029714, + "balance_loss_clip": 1.04061675, + "balance_loss_mlp": 1.01540279, + "epoch": 0.6081467007365099, + "flos": 19207935895680.0, + "grad_norm": 2.0279729583270405, + "language_loss": 0.70046329, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.72194529, + "num_input_tokens_seen": 217885650, + "step": 10115, + "time_per_iteration": 2.627066135406494 + }, + { + "auxiliary_loss_clip": 0.01076474, + "auxiliary_loss_mlp": 0.01030702, + "balance_loss_clip": 1.03560662, + "balance_loss_mlp": 1.01668298, + "epoch": 0.6082068239891778, + "flos": 24167737762560.0, + "grad_norm": 1.5787360311779992, + "language_loss": 0.72676456, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.74783635, + "num_input_tokens_seen": 217905300, + "step": 10116, + "time_per_iteration": 2.713207721710205 + }, + { + "auxiliary_loss_clip": 0.01090032, + "auxiliary_loss_mlp": 0.01036221, + "balance_loss_clip": 1.03843713, + "balance_loss_mlp": 1.02108169, + "epoch": 0.6082669472418458, + "flos": 37413316310400.0, + "grad_norm": 1.6151215779769803, + "language_loss": 0.53940326, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.56066579, + "num_input_tokens_seen": 217927845, + "step": 10117, + "time_per_iteration": 2.809150218963623 + }, + { + "auxiliary_loss_clip": 0.01097513, + "auxiliary_loss_mlp": 0.01030118, + "balance_loss_clip": 1.04143286, + "balance_loss_mlp": 1.01718414, + "epoch": 0.6083270704945137, + "flos": 15085534775040.0, + "grad_norm": 1.724080776440041, + "language_loss": 0.70168173, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.72295797, + "num_input_tokens_seen": 217946145, + "step": 10118, + "time_per_iteration": 2.6340367794036865 + }, + { + "auxiliary_loss_clip": 0.01051915, + "auxiliary_loss_mlp": 0.01030313, + "balance_loss_clip": 1.03519964, + "balance_loss_mlp": 1.01717043, + "epoch": 0.6083871937471818, + "flos": 20668458804480.0, + "grad_norm": 1.7207126950990799, + "language_loss": 0.74843824, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.76926053, + "num_input_tokens_seen": 217965190, + "step": 10119, + "time_per_iteration": 2.7610390186309814 + }, + { + "auxiliary_loss_clip": 0.01102909, + "auxiliary_loss_mlp": 0.01034672, + "balance_loss_clip": 1.04056787, + "balance_loss_mlp": 1.02195883, + "epoch": 0.6084473169998497, + "flos": 21506901045120.0, + "grad_norm": 1.7294665557102438, + "language_loss": 0.67426908, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.69564486, + "num_input_tokens_seen": 217983625, + "step": 10120, + "time_per_iteration": 2.6205523014068604 + }, + { + "auxiliary_loss_clip": 0.01108129, + "auxiliary_loss_mlp": 0.01033833, + "balance_loss_clip": 1.04188108, + "balance_loss_mlp": 1.02035594, + "epoch": 0.6085074402525177, + "flos": 26870051710080.0, + "grad_norm": 1.6306465435700652, + "language_loss": 0.74561995, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.76703954, + "num_input_tokens_seen": 218006005, + "step": 10121, + "time_per_iteration": 2.6655447483062744 + }, + { + "auxiliary_loss_clip": 0.01103879, + "auxiliary_loss_mlp": 0.01034712, + "balance_loss_clip": 1.03920245, + "balance_loss_mlp": 1.02181315, + "epoch": 0.6085675635051856, + "flos": 10889839952640.0, + "grad_norm": 1.8102237735068374, + "language_loss": 0.80563319, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.8270191, + "num_input_tokens_seen": 218024195, + "step": 10122, + "time_per_iteration": 2.5725269317626953 + }, + { + "auxiliary_loss_clip": 0.011003, + "auxiliary_loss_mlp": 0.01033892, + "balance_loss_clip": 1.03930187, + "balance_loss_mlp": 1.01991475, + "epoch": 0.6086276867578536, + "flos": 34862186707200.0, + "grad_norm": 2.5216051585049994, + "language_loss": 0.55656278, + "learning_rate": 1.402670413578284e-06, + "loss": 0.5779047, + "num_input_tokens_seen": 218047190, + "step": 10123, + "time_per_iteration": 2.7452590465545654 + }, + { + "auxiliary_loss_clip": 0.01107373, + "auxiliary_loss_mlp": 0.01041375, + "balance_loss_clip": 1.0430057, + "balance_loss_mlp": 1.02773786, + "epoch": 0.6086878100105215, + "flos": 20047706939520.0, + "grad_norm": 2.4791520044019526, + "language_loss": 0.73864502, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.76013255, + "num_input_tokens_seen": 218065945, + "step": 10124, + "time_per_iteration": 2.6622564792633057 + }, + { + "auxiliary_loss_clip": 0.01089528, + "auxiliary_loss_mlp": 0.01035903, + "balance_loss_clip": 1.03544164, + "balance_loss_mlp": 1.0215143, + "epoch": 0.6087479332631895, + "flos": 18332469711360.0, + "grad_norm": 2.9318658727845577, + "language_loss": 0.65483487, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.67608917, + "num_input_tokens_seen": 218085285, + "step": 10125, + "time_per_iteration": 2.677290439605713 + }, + { + "auxiliary_loss_clip": 0.01116071, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.04222536, + "balance_loss_mlp": 1.0202589, + "epoch": 0.6088080565158575, + "flos": 24493411399680.0, + "grad_norm": 1.769901084210043, + "language_loss": 0.76367819, + "learning_rate": 1.40155545786479e-06, + "loss": 0.785173, + "num_input_tokens_seen": 218104735, + "step": 10126, + "time_per_iteration": 2.6574339866638184 + }, + { + "auxiliary_loss_clip": 0.01079175, + "auxiliary_loss_mlp": 0.01032387, + "balance_loss_clip": 1.04002953, + "balance_loss_mlp": 1.01883876, + "epoch": 0.6088681797685255, + "flos": 10269016260480.0, + "grad_norm": 2.3378560015936705, + "language_loss": 0.70790273, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.72901833, + "num_input_tokens_seen": 218121855, + "step": 10127, + "time_per_iteration": 2.6849265098571777 + }, + { + "auxiliary_loss_clip": 0.01121141, + "auxiliary_loss_mlp": 0.01035296, + "balance_loss_clip": 1.04394543, + "balance_loss_mlp": 1.02121782, + "epoch": 0.6089283030211935, + "flos": 21973703218560.0, + "grad_norm": 2.1716351382875874, + "language_loss": 0.72938377, + "learning_rate": 1.400812267497691e-06, + "loss": 0.75094813, + "num_input_tokens_seen": 218137325, + "step": 10128, + "time_per_iteration": 2.5779154300689697 + }, + { + "auxiliary_loss_clip": 0.01065888, + "auxiliary_loss_mlp": 0.01033014, + "balance_loss_clip": 1.03911877, + "balance_loss_mlp": 1.02046144, + "epoch": 0.6089884262738614, + "flos": 17785191116160.0, + "grad_norm": 2.2560816683992075, + "language_loss": 0.7314086, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.7523976, + "num_input_tokens_seen": 218155530, + "step": 10129, + "time_per_iteration": 2.765955924987793 + }, + { + "auxiliary_loss_clip": 0.01113573, + "auxiliary_loss_mlp": 0.01033476, + "balance_loss_clip": 1.03910589, + "balance_loss_mlp": 1.02067268, + "epoch": 0.6090485495265294, + "flos": 36910423946880.0, + "grad_norm": 1.6122727527780822, + "language_loss": 0.65641886, + "learning_rate": 1.400069168015626e-06, + "loss": 0.67788941, + "num_input_tokens_seen": 218182535, + "step": 10130, + "time_per_iteration": 2.78676438331604 + }, + { + "auxiliary_loss_clip": 0.01086426, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.03784049, + "balance_loss_mlp": 1.01903617, + "epoch": 0.6091086727791973, + "flos": 19899036547200.0, + "grad_norm": 1.8425930589011128, + "language_loss": 0.76978183, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.79095113, + "num_input_tokens_seen": 218201740, + "step": 10131, + "time_per_iteration": 2.5955772399902344 + }, + { + "auxiliary_loss_clip": 0.0108451, + "auxiliary_loss_mlp": 0.01035484, + "balance_loss_clip": 1.04026079, + "balance_loss_mlp": 1.02387881, + "epoch": 0.6091687960318654, + "flos": 22163635359360.0, + "grad_norm": 1.697349608419957, + "language_loss": 0.76859689, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.78979683, + "num_input_tokens_seen": 218219800, + "step": 10132, + "time_per_iteration": 2.7611875534057617 + }, + { + "auxiliary_loss_clip": 0.01112693, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.04171348, + "balance_loss_mlp": 1.02192605, + "epoch": 0.6092289192845333, + "flos": 21465280160640.0, + "grad_norm": 1.734329950775569, + "language_loss": 0.75766826, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.77913374, + "num_input_tokens_seen": 218237585, + "step": 10133, + "time_per_iteration": 2.5794837474823 + }, + { + "auxiliary_loss_clip": 0.0110335, + "auxiliary_loss_mlp": 0.01034974, + "balance_loss_clip": 1.03942025, + "balance_loss_mlp": 1.0217123, + "epoch": 0.6092890425372013, + "flos": 28694924225280.0, + "grad_norm": 1.7908978482931064, + "language_loss": 0.63917655, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.66055977, + "num_input_tokens_seen": 218258700, + "step": 10134, + "time_per_iteration": 2.7197823524475098 + }, + { + "auxiliary_loss_clip": 0.01091736, + "auxiliary_loss_mlp": 0.01033364, + "balance_loss_clip": 1.04008186, + "balance_loss_mlp": 1.02060866, + "epoch": 0.6093491657898692, + "flos": 20813178700800.0, + "grad_norm": 1.9213179565189793, + "language_loss": 0.7841872, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.80543816, + "num_input_tokens_seen": 218275655, + "step": 10135, + "time_per_iteration": 4.243841171264648 + }, + { + "auxiliary_loss_clip": 0.01093049, + "auxiliary_loss_mlp": 0.01030115, + "balance_loss_clip": 1.04067171, + "balance_loss_mlp": 1.0183785, + "epoch": 0.6094092890425372, + "flos": 25446983708160.0, + "grad_norm": 1.9609713951304055, + "language_loss": 0.72346425, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.7446959, + "num_input_tokens_seen": 218295720, + "step": 10136, + "time_per_iteration": 4.175207853317261 + }, + { + "auxiliary_loss_clip": 0.01118097, + "auxiliary_loss_mlp": 0.01029914, + "balance_loss_clip": 1.04258895, + "balance_loss_mlp": 1.01715255, + "epoch": 0.6094694122952051, + "flos": 35621265847680.0, + "grad_norm": 1.7802525821484743, + "language_loss": 0.74853754, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.77001762, + "num_input_tokens_seen": 218316745, + "step": 10137, + "time_per_iteration": 2.7007157802581787 + }, + { + "auxiliary_loss_clip": 0.01100831, + "auxiliary_loss_mlp": 0.0104803, + "balance_loss_clip": 1.0380677, + "balance_loss_mlp": 1.03291392, + "epoch": 0.6095295355478731, + "flos": 24456962073600.0, + "grad_norm": 2.07495429210998, + "language_loss": 0.80021697, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.82170558, + "num_input_tokens_seen": 218335385, + "step": 10138, + "time_per_iteration": 2.642719268798828 + }, + { + "auxiliary_loss_clip": 0.01085336, + "auxiliary_loss_mlp": 0.01035484, + "balance_loss_clip": 1.03812051, + "balance_loss_mlp": 1.02355671, + "epoch": 0.6095896588005411, + "flos": 15633208419840.0, + "grad_norm": 2.0335546536806746, + "language_loss": 0.81230104, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.83350921, + "num_input_tokens_seen": 218353320, + "step": 10139, + "time_per_iteration": 4.268277645111084 + }, + { + "auxiliary_loss_clip": 0.01077185, + "auxiliary_loss_mlp": 0.01037802, + "balance_loss_clip": 1.03831828, + "balance_loss_mlp": 1.02411103, + "epoch": 0.6096497820532091, + "flos": 15550577182080.0, + "grad_norm": 2.12947943365166, + "language_loss": 0.83466005, + "learning_rate": 1.396355037825315e-06, + "loss": 0.85580993, + "num_input_tokens_seen": 218365620, + "step": 10140, + "time_per_iteration": 2.8175792694091797 + }, + { + "auxiliary_loss_clip": 0.01105576, + "auxiliary_loss_mlp": 0.01033997, + "balance_loss_clip": 1.04053175, + "balance_loss_mlp": 1.02132499, + "epoch": 0.6097099053058771, + "flos": 24204474397440.0, + "grad_norm": 1.6865480520512064, + "language_loss": 0.7552228, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.77661854, + "num_input_tokens_seen": 218383785, + "step": 10141, + "time_per_iteration": 2.6393468379974365 + }, + { + "auxiliary_loss_clip": 0.01087905, + "auxiliary_loss_mlp": 0.01037932, + "balance_loss_clip": 1.03879404, + "balance_loss_mlp": 1.02400196, + "epoch": 0.609770028558545, + "flos": 19570238426880.0, + "grad_norm": 2.2429886109126955, + "language_loss": 0.76329374, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.7845521, + "num_input_tokens_seen": 218399055, + "step": 10142, + "time_per_iteration": 2.6924803256988525 + }, + { + "auxiliary_loss_clip": 0.01117011, + "auxiliary_loss_mlp": 0.01034845, + "balance_loss_clip": 1.04226887, + "balance_loss_mlp": 1.02116001, + "epoch": 0.609830151811213, + "flos": 23949185460480.0, + "grad_norm": 1.9503172342998385, + "language_loss": 0.77012557, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.7916441, + "num_input_tokens_seen": 218419120, + "step": 10143, + "time_per_iteration": 2.8441388607025146 + }, + { + "auxiliary_loss_clip": 0.01100288, + "auxiliary_loss_mlp": 0.01040669, + "balance_loss_clip": 1.0388006, + "balance_loss_mlp": 1.02585077, + "epoch": 0.6098902750638809, + "flos": 16179732829440.0, + "grad_norm": 1.761002506839972, + "language_loss": 0.75323224, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.77464181, + "num_input_tokens_seen": 218435290, + "step": 10144, + "time_per_iteration": 2.6133413314819336 + }, + { + "auxiliary_loss_clip": 0.01087547, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.03644156, + "balance_loss_mlp": 1.02106476, + "epoch": 0.609950398316549, + "flos": 44526393763200.0, + "grad_norm": 2.2237996959363566, + "language_loss": 0.72757131, + "learning_rate": 1.394498830235383e-06, + "loss": 0.7487936, + "num_input_tokens_seen": 218457880, + "step": 10145, + "time_per_iteration": 2.939194679260254 + }, + { + "auxiliary_loss_clip": 0.01090456, + "auxiliary_loss_mlp": 0.0103494, + "balance_loss_clip": 1.03637707, + "balance_loss_mlp": 1.02223277, + "epoch": 0.6100105215692169, + "flos": 23221743223680.0, + "grad_norm": 1.7269520496185313, + "language_loss": 0.69230616, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.7135601, + "num_input_tokens_seen": 218475930, + "step": 10146, + "time_per_iteration": 2.6565699577331543 + }, + { + "auxiliary_loss_clip": 0.01068091, + "auxiliary_loss_mlp": 0.00769179, + "balance_loss_clip": 1.03681684, + "balance_loss_mlp": 1.00011373, + "epoch": 0.6100706448218849, + "flos": 15012564295680.0, + "grad_norm": 1.635331048644393, + "language_loss": 0.77205098, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.79042363, + "num_input_tokens_seen": 218493675, + "step": 10147, + "time_per_iteration": 2.7440903186798096 + }, + { + "auxiliary_loss_clip": 0.01093041, + "auxiliary_loss_mlp": 0.01032822, + "balance_loss_clip": 1.03794551, + "balance_loss_mlp": 1.01953053, + "epoch": 0.6101307680745528, + "flos": 19639976682240.0, + "grad_norm": 2.1080947760938895, + "language_loss": 0.78184944, + "learning_rate": 1.393385381096786e-06, + "loss": 0.80310804, + "num_input_tokens_seen": 218511780, + "step": 10148, + "time_per_iteration": 2.638685703277588 + }, + { + "auxiliary_loss_clip": 0.01080447, + "auxiliary_loss_mlp": 0.01036994, + "balance_loss_clip": 1.0334245, + "balance_loss_mlp": 1.02181888, + "epoch": 0.6101908913272208, + "flos": 29935566028800.0, + "grad_norm": 2.0657205711801776, + "language_loss": 0.54227436, + "learning_rate": 1.39301427737093e-06, + "loss": 0.56344879, + "num_input_tokens_seen": 218531850, + "step": 10149, + "time_per_iteration": 2.800041437149048 + }, + { + "auxiliary_loss_clip": 0.01092603, + "auxiliary_loss_mlp": 0.01036713, + "balance_loss_clip": 1.04295909, + "balance_loss_mlp": 1.02440453, + "epoch": 0.6102510145798887, + "flos": 21798639308160.0, + "grad_norm": 1.8291736341547842, + "language_loss": 0.8044911, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.82578421, + "num_input_tokens_seen": 218551245, + "step": 10150, + "time_per_iteration": 2.725109577178955 + }, + { + "auxiliary_loss_clip": 0.01091495, + "auxiliary_loss_mlp": 0.01041201, + "balance_loss_clip": 1.04189467, + "balance_loss_mlp": 1.02752137, + "epoch": 0.6103111378325567, + "flos": 20706129192960.0, + "grad_norm": 3.373563414576853, + "language_loss": 0.68982595, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.71115291, + "num_input_tokens_seen": 218571365, + "step": 10151, + "time_per_iteration": 2.672344923019409 + }, + { + "auxiliary_loss_clip": 0.01114149, + "auxiliary_loss_mlp": 0.01030926, + "balance_loss_clip": 1.03988385, + "balance_loss_mlp": 1.01889241, + "epoch": 0.6103712610852247, + "flos": 29381643417600.0, + "grad_norm": 1.7910960351729457, + "language_loss": 0.7080698, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.72952056, + "num_input_tokens_seen": 218588315, + "step": 10152, + "time_per_iteration": 2.687704086303711 + }, + { + "auxiliary_loss_clip": 0.01081357, + "auxiliary_loss_mlp": 0.01034717, + "balance_loss_clip": 1.04208827, + "balance_loss_mlp": 1.02176535, + "epoch": 0.6104313843378927, + "flos": 20813035046400.0, + "grad_norm": 1.9308044180202404, + "language_loss": 0.77972472, + "learning_rate": 1.391530092777811e-06, + "loss": 0.80088544, + "num_input_tokens_seen": 218605940, + "step": 10153, + "time_per_iteration": 2.737981081008911 + }, + { + "auxiliary_loss_clip": 0.01090347, + "auxiliary_loss_mlp": 0.01032715, + "balance_loss_clip": 1.03806591, + "balance_loss_mlp": 1.01951218, + "epoch": 0.6104915075905607, + "flos": 26578457101440.0, + "grad_norm": 3.840439775750313, + "language_loss": 0.79736745, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.81859809, + "num_input_tokens_seen": 218626100, + "step": 10154, + "time_per_iteration": 2.737769365310669 + }, + { + "auxiliary_loss_clip": 0.01105395, + "auxiliary_loss_mlp": 0.01031455, + "balance_loss_clip": 1.04265189, + "balance_loss_mlp": 1.01911139, + "epoch": 0.6105516308432286, + "flos": 23915788790400.0, + "grad_norm": 1.6402345774234983, + "language_loss": 0.70273185, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.72410041, + "num_input_tokens_seen": 218645060, + "step": 10155, + "time_per_iteration": 2.710547924041748 + }, + { + "auxiliary_loss_clip": 0.0110624, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.04239929, + "balance_loss_mlp": 1.01990473, + "epoch": 0.6106117540958966, + "flos": 31577365900800.0, + "grad_norm": 1.4885347094481511, + "language_loss": 0.71531796, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.73671484, + "num_input_tokens_seen": 218667690, + "step": 10156, + "time_per_iteration": 2.7332398891448975 + }, + { + "auxiliary_loss_clip": 0.0109286, + "auxiliary_loss_mlp": 0.01032773, + "balance_loss_clip": 1.04169869, + "balance_loss_mlp": 1.01951134, + "epoch": 0.6106718773485645, + "flos": 19608160210560.0, + "grad_norm": 1.5588894396348068, + "language_loss": 0.6765914, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.69784772, + "num_input_tokens_seen": 218687505, + "step": 10157, + "time_per_iteration": 2.7539916038513184 + }, + { + "auxiliary_loss_clip": 0.01075332, + "auxiliary_loss_mlp": 0.01028524, + "balance_loss_clip": 1.03566861, + "balance_loss_mlp": 1.01551235, + "epoch": 0.6107320006012326, + "flos": 17123895774720.0, + "grad_norm": 1.7948221929891892, + "language_loss": 0.72670758, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.74774617, + "num_input_tokens_seen": 218705315, + "step": 10158, + "time_per_iteration": 2.7469441890716553 + }, + { + "auxiliary_loss_clip": 0.01103253, + "auxiliary_loss_mlp": 0.01033152, + "balance_loss_clip": 1.0429678, + "balance_loss_mlp": 1.02017009, + "epoch": 0.6107921238539005, + "flos": 30148228500480.0, + "grad_norm": 1.507227050717275, + "language_loss": 0.69370097, + "learning_rate": 1.389304508366635e-06, + "loss": 0.715065, + "num_input_tokens_seen": 218725735, + "step": 10159, + "time_per_iteration": 2.7083382606506348 + }, + { + "auxiliary_loss_clip": 0.01118821, + "auxiliary_loss_mlp": 0.01031902, + "balance_loss_clip": 1.04300821, + "balance_loss_mlp": 1.01859236, + "epoch": 0.6108522471065685, + "flos": 18440273404800.0, + "grad_norm": 1.9516164322769225, + "language_loss": 0.78660917, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.80811644, + "num_input_tokens_seen": 218743215, + "step": 10160, + "time_per_iteration": 2.5400774478912354 + }, + { + "auxiliary_loss_clip": 0.01029498, + "auxiliary_loss_mlp": 0.01003038, + "balance_loss_clip": 1.01565576, + "balance_loss_mlp": 1.00198889, + "epoch": 0.6109123703592364, + "flos": 64135454791680.0, + "grad_norm": 0.8179177002663486, + "language_loss": 0.61458665, + "learning_rate": 1.388562832007295e-06, + "loss": 0.63491201, + "num_input_tokens_seen": 218806440, + "step": 10161, + "time_per_iteration": 3.3134469985961914 + }, + { + "auxiliary_loss_clip": 0.01099659, + "auxiliary_loss_mlp": 0.00772317, + "balance_loss_clip": 1.04388893, + "balance_loss_mlp": 1.00015724, + "epoch": 0.6109724936119044, + "flos": 20667848273280.0, + "grad_norm": 4.3292370915840666, + "language_loss": 0.76713967, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.78585941, + "num_input_tokens_seen": 218825720, + "step": 10162, + "time_per_iteration": 2.666212797164917 + }, + { + "auxiliary_loss_clip": 0.01115754, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.04164445, + "balance_loss_mlp": 1.0187993, + "epoch": 0.6110326168645723, + "flos": 31351882273920.0, + "grad_norm": 1.703540773348326, + "language_loss": 0.71334386, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.73482347, + "num_input_tokens_seen": 218847735, + "step": 10163, + "time_per_iteration": 2.65462327003479 + }, + { + "auxiliary_loss_clip": 0.01112689, + "auxiliary_loss_mlp": 0.0102818, + "balance_loss_clip": 1.03985834, + "balance_loss_mlp": 1.01618207, + "epoch": 0.6110927401172404, + "flos": 25003378742400.0, + "grad_norm": 1.8985707771161122, + "language_loss": 0.59787023, + "learning_rate": 1.387450491396625e-06, + "loss": 0.61927891, + "num_input_tokens_seen": 218866585, + "step": 10164, + "time_per_iteration": 2.5967462062835693 + }, + { + "auxiliary_loss_clip": 0.01098803, + "auxiliary_loss_mlp": 0.01031481, + "balance_loss_clip": 1.04045308, + "balance_loss_mlp": 1.01886845, + "epoch": 0.6111528633699083, + "flos": 26248078782720.0, + "grad_norm": 1.6376390692210252, + "language_loss": 0.75717723, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.77848011, + "num_input_tokens_seen": 218885560, + "step": 10165, + "time_per_iteration": 2.706014633178711 + }, + { + "auxiliary_loss_clip": 0.01092416, + "auxiliary_loss_mlp": 0.01029472, + "balance_loss_clip": 1.04052818, + "balance_loss_mlp": 1.01629317, + "epoch": 0.6112129866225763, + "flos": 22382474970240.0, + "grad_norm": 1.6019347929518222, + "language_loss": 0.79179376, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.81301266, + "num_input_tokens_seen": 218905055, + "step": 10166, + "time_per_iteration": 2.648865222930908 + }, + { + "auxiliary_loss_clip": 0.01089634, + "auxiliary_loss_mlp": 0.01029582, + "balance_loss_clip": 1.03888917, + "balance_loss_mlp": 1.01634979, + "epoch": 0.6112731098752443, + "flos": 25227892702080.0, + "grad_norm": 1.8171337399867642, + "language_loss": 0.67561293, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.69680506, + "num_input_tokens_seen": 218924030, + "step": 10167, + "time_per_iteration": 2.700876474380493 + }, + { + "auxiliary_loss_clip": 0.01114313, + "auxiliary_loss_mlp": 0.01035637, + "balance_loss_clip": 1.04177189, + "balance_loss_mlp": 1.02360249, + "epoch": 0.6113332331279122, + "flos": 22893160584960.0, + "grad_norm": 1.916507906954157, + "language_loss": 0.79281151, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.81431103, + "num_input_tokens_seen": 218943750, + "step": 10168, + "time_per_iteration": 2.7144253253936768 + }, + { + "auxiliary_loss_clip": 0.01121355, + "auxiliary_loss_mlp": 0.01040045, + "balance_loss_clip": 1.04141057, + "balance_loss_mlp": 1.02516127, + "epoch": 0.6113933563805802, + "flos": 18620329305600.0, + "grad_norm": 5.812497502727784, + "language_loss": 0.85299641, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.87461042, + "num_input_tokens_seen": 218957585, + "step": 10169, + "time_per_iteration": 2.5470833778381348 + }, + { + "auxiliary_loss_clip": 0.01112463, + "auxiliary_loss_mlp": 0.01031208, + "balance_loss_clip": 1.03939247, + "balance_loss_mlp": 1.01953125, + "epoch": 0.6114534796332481, + "flos": 41866275317760.0, + "grad_norm": 1.6486085762416796, + "language_loss": 0.78718483, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.80862153, + "num_input_tokens_seen": 218980025, + "step": 10170, + "time_per_iteration": 2.729773998260498 + }, + { + "auxiliary_loss_clip": 0.01098191, + "auxiliary_loss_mlp": 0.01039397, + "balance_loss_clip": 1.03988242, + "balance_loss_mlp": 1.02454388, + "epoch": 0.6115136028859162, + "flos": 21908454163200.0, + "grad_norm": 2.373771480482072, + "language_loss": 0.68857706, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.70995295, + "num_input_tokens_seen": 218998200, + "step": 10171, + "time_per_iteration": 2.8418185710906982 + }, + { + "auxiliary_loss_clip": 0.01084168, + "auxiliary_loss_mlp": 0.01037331, + "balance_loss_clip": 1.03639293, + "balance_loss_mlp": 1.0224179, + "epoch": 0.6115737261385841, + "flos": 28804846821120.0, + "grad_norm": 1.5357621569118813, + "language_loss": 0.79195881, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.81317377, + "num_input_tokens_seen": 219017910, + "step": 10172, + "time_per_iteration": 2.7578020095825195 + }, + { + "auxiliary_loss_clip": 0.01083831, + "auxiliary_loss_mlp": 0.0103911, + "balance_loss_clip": 1.03985405, + "balance_loss_mlp": 1.02511525, + "epoch": 0.6116338493912521, + "flos": 21251468453760.0, + "grad_norm": 2.0161581722139252, + "language_loss": 0.67301053, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.69423997, + "num_input_tokens_seen": 219037730, + "step": 10173, + "time_per_iteration": 2.767425298690796 + }, + { + "auxiliary_loss_clip": 0.01093328, + "auxiliary_loss_mlp": 0.01039514, + "balance_loss_clip": 1.0412842, + "balance_loss_mlp": 1.02572155, + "epoch": 0.61169397264392, + "flos": 17530189488000.0, + "grad_norm": 1.7178312614749116, + "language_loss": 0.55863279, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.57996118, + "num_input_tokens_seen": 219056755, + "step": 10174, + "time_per_iteration": 4.367191314697266 + }, + { + "auxiliary_loss_clip": 0.01098909, + "auxiliary_loss_mlp": 0.0103905, + "balance_loss_clip": 1.0425694, + "balance_loss_mlp": 1.02503705, + "epoch": 0.611754095896588, + "flos": 23951555758080.0, + "grad_norm": 1.9312072143196408, + "language_loss": 0.66054702, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.68192655, + "num_input_tokens_seen": 219076985, + "step": 10175, + "time_per_iteration": 4.3369996547698975 + }, + { + "auxiliary_loss_clip": 0.0110119, + "auxiliary_loss_mlp": 0.00770739, + "balance_loss_clip": 1.03765738, + "balance_loss_mlp": 1.00009918, + "epoch": 0.6118142191492559, + "flos": 25994872834560.0, + "grad_norm": 2.243694545574497, + "language_loss": 0.83143312, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.85015237, + "num_input_tokens_seen": 219096050, + "step": 10176, + "time_per_iteration": 2.6386196613311768 + }, + { + "auxiliary_loss_clip": 0.01097242, + "auxiliary_loss_mlp": 0.01040776, + "balance_loss_clip": 1.04172039, + "balance_loss_mlp": 1.02604187, + "epoch": 0.611874342401924, + "flos": 24603190341120.0, + "grad_norm": 1.9428160095597935, + "language_loss": 0.77491206, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.79629225, + "num_input_tokens_seen": 219112665, + "step": 10177, + "time_per_iteration": 2.68098521232605 + }, + { + "auxiliary_loss_clip": 0.01100764, + "auxiliary_loss_mlp": 0.00771744, + "balance_loss_clip": 1.03818965, + "balance_loss_mlp": 1.00019312, + "epoch": 0.6119344656545919, + "flos": 15887132640000.0, + "grad_norm": 3.7169342629070505, + "language_loss": 0.75467336, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.77339846, + "num_input_tokens_seen": 219129120, + "step": 10178, + "time_per_iteration": 2.600816011428833 + }, + { + "auxiliary_loss_clip": 0.01088953, + "auxiliary_loss_mlp": 0.01045893, + "balance_loss_clip": 1.03788781, + "balance_loss_mlp": 1.03069353, + "epoch": 0.6119945889072599, + "flos": 21652877917440.0, + "grad_norm": 1.6054240792862575, + "language_loss": 0.67197716, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.69332558, + "num_input_tokens_seen": 219148950, + "step": 10179, + "time_per_iteration": 4.199966669082642 + }, + { + "auxiliary_loss_clip": 0.0109746, + "auxiliary_loss_mlp": 0.01034998, + "balance_loss_clip": 1.0424819, + "balance_loss_mlp": 1.02241588, + "epoch": 0.6120547121599279, + "flos": 13772533023360.0, + "grad_norm": 1.7918927990683708, + "language_loss": 0.83621407, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.85753864, + "num_input_tokens_seen": 219165585, + "step": 10180, + "time_per_iteration": 2.617266893386841 + }, + { + "auxiliary_loss_clip": 0.01117181, + "auxiliary_loss_mlp": 0.01032718, + "balance_loss_clip": 1.04272151, + "balance_loss_mlp": 1.01881814, + "epoch": 0.6121148354125958, + "flos": 20079164275200.0, + "grad_norm": 1.5733186243311148, + "language_loss": 0.7745713, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.79607022, + "num_input_tokens_seen": 219183280, + "step": 10181, + "time_per_iteration": 2.542682409286499 + }, + { + "auxiliary_loss_clip": 0.01117399, + "auxiliary_loss_mlp": 0.01035122, + "balance_loss_clip": 1.041821, + "balance_loss_mlp": 1.02228367, + "epoch": 0.6121749586652638, + "flos": 13471313569920.0, + "grad_norm": 15.811946001131306, + "language_loss": 0.80652797, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.82805324, + "num_input_tokens_seen": 219197200, + "step": 10182, + "time_per_iteration": 2.6980040073394775 + }, + { + "auxiliary_loss_clip": 0.01077836, + "auxiliary_loss_mlp": 0.01037108, + "balance_loss_clip": 1.03717065, + "balance_loss_mlp": 1.02501428, + "epoch": 0.6122350819179317, + "flos": 20120533764480.0, + "grad_norm": 1.5906642026710172, + "language_loss": 0.82815677, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.84930623, + "num_input_tokens_seen": 219216825, + "step": 10183, + "time_per_iteration": 2.808246612548828 + }, + { + "auxiliary_loss_clip": 0.01025033, + "auxiliary_loss_mlp": 0.01005337, + "balance_loss_clip": 1.01312232, + "balance_loss_mlp": 1.00417471, + "epoch": 0.6122952051705998, + "flos": 65429242767360.0, + "grad_norm": 0.7045561187177276, + "language_loss": 0.62833804, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.64864177, + "num_input_tokens_seen": 219283795, + "step": 10184, + "time_per_iteration": 3.2871408462524414 + }, + { + "auxiliary_loss_clip": 0.01108097, + "auxiliary_loss_mlp": 0.01037833, + "balance_loss_clip": 1.042454, + "balance_loss_mlp": 1.02517307, + "epoch": 0.6123553284232677, + "flos": 20376253664640.0, + "grad_norm": 1.792613488461195, + "language_loss": 0.82103658, + "learning_rate": 1.379669981812101e-06, + "loss": 0.8424958, + "num_input_tokens_seen": 219302385, + "step": 10185, + "time_per_iteration": 2.623692750930786 + }, + { + "auxiliary_loss_clip": 0.0109256, + "auxiliary_loss_mlp": 0.01038333, + "balance_loss_clip": 1.04070401, + "balance_loss_mlp": 1.02442169, + "epoch": 0.6124154516759357, + "flos": 23987645948160.0, + "grad_norm": 1.7206353448570937, + "language_loss": 0.74358237, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.76489139, + "num_input_tokens_seen": 219319765, + "step": 10186, + "time_per_iteration": 2.657557725906372 + }, + { + "auxiliary_loss_clip": 0.01099771, + "auxiliary_loss_mlp": 0.0103362, + "balance_loss_clip": 1.03756428, + "balance_loss_mlp": 1.021294, + "epoch": 0.6124755749286036, + "flos": 21468799693440.0, + "grad_norm": 1.5881045533275502, + "language_loss": 0.7818836, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.80321753, + "num_input_tokens_seen": 219337440, + "step": 10187, + "time_per_iteration": 2.625558376312256 + }, + { + "auxiliary_loss_clip": 0.01113087, + "auxiliary_loss_mlp": 0.01033487, + "balance_loss_clip": 1.03851271, + "balance_loss_mlp": 1.02038562, + "epoch": 0.6125356981812716, + "flos": 23879195809920.0, + "grad_norm": 1.8256616870215527, + "language_loss": 0.83049744, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.85196316, + "num_input_tokens_seen": 219357525, + "step": 10188, + "time_per_iteration": 2.6045219898223877 + }, + { + "auxiliary_loss_clip": 0.0108702, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.04232693, + "balance_loss_mlp": 1.01822519, + "epoch": 0.6125958214339395, + "flos": 14425604150400.0, + "grad_norm": 1.7058207723590004, + "language_loss": 0.7547375, + "learning_rate": 1.378189152155896e-06, + "loss": 0.77592176, + "num_input_tokens_seen": 219374855, + "step": 10189, + "time_per_iteration": 2.7627220153808594 + }, + { + "auxiliary_loss_clip": 0.01101171, + "auxiliary_loss_mlp": 0.0104025, + "balance_loss_clip": 1.03780556, + "balance_loss_mlp": 1.02642107, + "epoch": 0.6126559446866076, + "flos": 23259090389760.0, + "grad_norm": 1.513309715943079, + "language_loss": 0.74214786, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.76356208, + "num_input_tokens_seen": 219394740, + "step": 10190, + "time_per_iteration": 2.617075204849243 + }, + { + "auxiliary_loss_clip": 0.01104454, + "auxiliary_loss_mlp": 0.0103683, + "balance_loss_clip": 1.04099751, + "balance_loss_mlp": 1.02338362, + "epoch": 0.6127160679392755, + "flos": 26864808324480.0, + "grad_norm": 1.7858486096662998, + "language_loss": 0.68623936, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.70765221, + "num_input_tokens_seen": 219413755, + "step": 10191, + "time_per_iteration": 2.681180477142334 + }, + { + "auxiliary_loss_clip": 0.0110296, + "auxiliary_loss_mlp": 0.0103819, + "balance_loss_clip": 1.0385741, + "balance_loss_mlp": 1.02425456, + "epoch": 0.6127761911919435, + "flos": 26396425952640.0, + "grad_norm": 2.13769200790618, + "language_loss": 0.73452723, + "learning_rate": 1.377078777445467e-06, + "loss": 0.75593865, + "num_input_tokens_seen": 219433560, + "step": 10192, + "time_per_iteration": 2.6742324829101562 + }, + { + "auxiliary_loss_clip": 0.01075917, + "auxiliary_loss_mlp": 0.01033242, + "balance_loss_clip": 1.03988755, + "balance_loss_mlp": 1.02090943, + "epoch": 0.6128363144446115, + "flos": 22634747164800.0, + "grad_norm": 2.0088299944144636, + "language_loss": 0.83632165, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.85741329, + "num_input_tokens_seen": 219452640, + "step": 10193, + "time_per_iteration": 2.701087474822998 + }, + { + "auxiliary_loss_clip": 0.01082703, + "auxiliary_loss_mlp": 0.01035438, + "balance_loss_clip": 1.03853893, + "balance_loss_mlp": 1.02231348, + "epoch": 0.6128964376972794, + "flos": 26759051706240.0, + "grad_norm": 2.1771802645074105, + "language_loss": 0.6991539, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.72033525, + "num_input_tokens_seen": 219468585, + "step": 10194, + "time_per_iteration": 2.6878440380096436 + }, + { + "auxiliary_loss_clip": 0.01010189, + "auxiliary_loss_mlp": 0.01003845, + "balance_loss_clip": 1.01538479, + "balance_loss_mlp": 1.002653, + "epoch": 0.6129565609499474, + "flos": 65567929178880.0, + "grad_norm": 0.8185640373649049, + "language_loss": 0.58629549, + "learning_rate": 1.375968615326149e-06, + "loss": 0.60643584, + "num_input_tokens_seen": 219523015, + "step": 10195, + "time_per_iteration": 3.05383038520813 + }, + { + "auxiliary_loss_clip": 0.01095455, + "auxiliary_loss_mlp": 0.01035767, + "balance_loss_clip": 1.04045796, + "balance_loss_mlp": 1.02244508, + "epoch": 0.6130166842026153, + "flos": 16362087200640.0, + "grad_norm": 2.135532256863793, + "language_loss": 0.69762802, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.71894026, + "num_input_tokens_seen": 219539980, + "step": 10196, + "time_per_iteration": 2.6125218868255615 + }, + { + "auxiliary_loss_clip": 0.01089403, + "auxiliary_loss_mlp": 0.01036656, + "balance_loss_clip": 1.03711545, + "balance_loss_mlp": 1.02356637, + "epoch": 0.6130768074552834, + "flos": 23652455207040.0, + "grad_norm": 1.7041901113683988, + "language_loss": 0.71497107, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.73623163, + "num_input_tokens_seen": 219556980, + "step": 10197, + "time_per_iteration": 2.687622547149658 + }, + { + "auxiliary_loss_clip": 0.01102107, + "auxiliary_loss_mlp": 0.01046474, + "balance_loss_clip": 1.03892088, + "balance_loss_mlp": 1.03226423, + "epoch": 0.6131369307079513, + "flos": 20047455544320.0, + "grad_norm": 2.1425841144655533, + "language_loss": 0.79149073, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.81297648, + "num_input_tokens_seen": 219576410, + "step": 10198, + "time_per_iteration": 2.6697170734405518 + }, + { + "auxiliary_loss_clip": 0.01092328, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.04398417, + "balance_loss_mlp": 1.01794744, + "epoch": 0.6131970539606193, + "flos": 22672166158080.0, + "grad_norm": 1.4352269101197792, + "language_loss": 0.74505019, + "learning_rate": 1.374488730519181e-06, + "loss": 0.76628667, + "num_input_tokens_seen": 219597180, + "step": 10199, + "time_per_iteration": 2.789501905441284 + }, + { + "auxiliary_loss_clip": 0.01092976, + "auxiliary_loss_mlp": 0.01040283, + "balance_loss_clip": 1.03864002, + "balance_loss_mlp": 1.02596581, + "epoch": 0.6132571772132872, + "flos": 26870913636480.0, + "grad_norm": 2.276152956596312, + "language_loss": 0.62111485, + "learning_rate": 1.374118818580993e-06, + "loss": 0.64244747, + "num_input_tokens_seen": 219617630, + "step": 10200, + "time_per_iteration": 2.7012946605682373 + }, + { + "auxiliary_loss_clip": 0.01092122, + "auxiliary_loss_mlp": 0.01030786, + "balance_loss_clip": 1.04091394, + "balance_loss_mlp": 1.01772022, + "epoch": 0.6133173004659552, + "flos": 22892657794560.0, + "grad_norm": 2.1392566641947464, + "language_loss": 0.6911571, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.71238619, + "num_input_tokens_seen": 219637025, + "step": 10201, + "time_per_iteration": 2.7815003395080566 + }, + { + "auxiliary_loss_clip": 0.01091125, + "auxiliary_loss_mlp": 0.01031281, + "balance_loss_clip": 1.03879607, + "balance_loss_mlp": 1.018466, + "epoch": 0.6133774237186231, + "flos": 20485098852480.0, + "grad_norm": 1.7663162719665984, + "language_loss": 0.83417988, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.85540396, + "num_input_tokens_seen": 219656625, + "step": 10202, + "time_per_iteration": 2.6809394359588623 + }, + { + "auxiliary_loss_clip": 0.01037873, + "auxiliary_loss_mlp": 0.0100084, + "balance_loss_clip": 1.01421046, + "balance_loss_mlp": 0.99977362, + "epoch": 0.6134375469712912, + "flos": 69413065217280.0, + "grad_norm": 0.8928245444729744, + "language_loss": 0.67083746, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.69122458, + "num_input_tokens_seen": 219718090, + "step": 10203, + "time_per_iteration": 3.153150796890259 + }, + { + "auxiliary_loss_clip": 0.01107329, + "auxiliary_loss_mlp": 0.01030437, + "balance_loss_clip": 1.04093874, + "balance_loss_mlp": 1.01783061, + "epoch": 0.6134976702239591, + "flos": 41281541815680.0, + "grad_norm": 1.5881826993460113, + "language_loss": 0.61211205, + "learning_rate": 1.37263940830327e-06, + "loss": 0.63348967, + "num_input_tokens_seen": 219740100, + "step": 10204, + "time_per_iteration": 2.8730733394622803 + }, + { + "auxiliary_loss_clip": 0.01079745, + "auxiliary_loss_mlp": 0.0102996, + "balance_loss_clip": 1.03856349, + "balance_loss_mlp": 1.0171572, + "epoch": 0.6135577934766271, + "flos": 22346600261760.0, + "grad_norm": 1.8494988248574857, + "language_loss": 0.72484612, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.74594319, + "num_input_tokens_seen": 219761225, + "step": 10205, + "time_per_iteration": 2.789635419845581 + }, + { + "auxiliary_loss_clip": 0.0110225, + "auxiliary_loss_mlp": 0.01027505, + "balance_loss_clip": 1.04025602, + "balance_loss_mlp": 1.01416492, + "epoch": 0.6136179167292951, + "flos": 23728155120000.0, + "grad_norm": 1.726684216662775, + "language_loss": 0.76188898, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.78318655, + "num_input_tokens_seen": 219780085, + "step": 10206, + "time_per_iteration": 2.6312029361724854 + }, + { + "auxiliary_loss_clip": 0.01085288, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.04444122, + "balance_loss_mlp": 1.0182395, + "epoch": 0.613678039981963, + "flos": 26024678144640.0, + "grad_norm": 2.2620215533288013, + "language_loss": 0.7565456, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.7777158, + "num_input_tokens_seen": 219797895, + "step": 10207, + "time_per_iteration": 2.768277645111084 + }, + { + "auxiliary_loss_clip": 0.01103864, + "auxiliary_loss_mlp": 0.01034203, + "balance_loss_clip": 1.04067349, + "balance_loss_mlp": 1.02212703, + "epoch": 0.613738163234631, + "flos": 9859957200000.0, + "grad_norm": 2.399068150435751, + "language_loss": 0.83005822, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.85143888, + "num_input_tokens_seen": 219811295, + "step": 10208, + "time_per_iteration": 2.5925726890563965 + }, + { + "auxiliary_loss_clip": 0.0109897, + "auxiliary_loss_mlp": 0.01034142, + "balance_loss_clip": 1.04265046, + "balance_loss_mlp": 1.01999795, + "epoch": 0.613798286487299, + "flos": 33182070001920.0, + "grad_norm": 1.8170662176874706, + "language_loss": 0.72382063, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.74515176, + "num_input_tokens_seen": 219832735, + "step": 10209, + "time_per_iteration": 2.7966833114624023 + }, + { + "auxiliary_loss_clip": 0.01115107, + "auxiliary_loss_mlp": 0.01038487, + "balance_loss_clip": 1.04209638, + "balance_loss_mlp": 1.02545118, + "epoch": 0.613858409739967, + "flos": 25627901535360.0, + "grad_norm": 1.6402518788547593, + "language_loss": 0.74474829, + "learning_rate": 1.37042100685438e-06, + "loss": 0.76628423, + "num_input_tokens_seen": 219852755, + "step": 10210, + "time_per_iteration": 2.615272045135498 + }, + { + "auxiliary_loss_clip": 0.01010153, + "auxiliary_loss_mlp": 0.01001177, + "balance_loss_clip": 1.01308346, + "balance_loss_mlp": 0.99999064, + "epoch": 0.6139185329926349, + "flos": 67192313932800.0, + "grad_norm": 0.8597503962544338, + "language_loss": 0.64958251, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.66969585, + "num_input_tokens_seen": 219922785, + "step": 10211, + "time_per_iteration": 3.410182476043701 + }, + { + "auxiliary_loss_clip": 0.01093321, + "auxiliary_loss_mlp": 0.00771551, + "balance_loss_clip": 1.03993869, + "balance_loss_mlp": 1.00015092, + "epoch": 0.6139786562453029, + "flos": 21543637680000.0, + "grad_norm": 2.0754424248675893, + "language_loss": 0.7585628, + "learning_rate": 1.369681730544801e-06, + "loss": 0.77721149, + "num_input_tokens_seen": 219942215, + "step": 10212, + "time_per_iteration": 3.0132839679718018 + }, + { + "auxiliary_loss_clip": 0.01087691, + "auxiliary_loss_mlp": 0.01041947, + "balance_loss_clip": 1.03709769, + "balance_loss_mlp": 1.02745092, + "epoch": 0.6140387794979708, + "flos": 26068489758720.0, + "grad_norm": 1.8964126815157365, + "language_loss": 0.74028683, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.76158321, + "num_input_tokens_seen": 219963830, + "step": 10213, + "time_per_iteration": 2.757840871810913 + }, + { + "auxiliary_loss_clip": 0.01100654, + "auxiliary_loss_mlp": 0.01037388, + "balance_loss_clip": 1.04215121, + "balance_loss_mlp": 1.02341056, + "epoch": 0.6140989027506388, + "flos": 23694614795520.0, + "grad_norm": 1.4673821315924696, + "language_loss": 0.73059738, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.7519778, + "num_input_tokens_seen": 219983815, + "step": 10214, + "time_per_iteration": 5.874944686889648 + }, + { + "auxiliary_loss_clip": 0.01119065, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.04233837, + "balance_loss_mlp": 1.01859856, + "epoch": 0.6141590260033067, + "flos": 22231721589120.0, + "grad_norm": 1.979046579810642, + "language_loss": 0.74642611, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.76794291, + "num_input_tokens_seen": 220003165, + "step": 10215, + "time_per_iteration": 4.103458404541016 + }, + { + "auxiliary_loss_clip": 0.0110334, + "auxiliary_loss_mlp": 0.01036271, + "balance_loss_clip": 1.04110682, + "balance_loss_mlp": 1.02308035, + "epoch": 0.6142191492559748, + "flos": 23871653953920.0, + "grad_norm": 3.0132083118300526, + "language_loss": 0.78161263, + "learning_rate": 1.368203464858542e-06, + "loss": 0.80300874, + "num_input_tokens_seen": 220021015, + "step": 10216, + "time_per_iteration": 2.6554577350616455 + }, + { + "auxiliary_loss_clip": 0.01116166, + "auxiliary_loss_mlp": 0.01038341, + "balance_loss_clip": 1.04212427, + "balance_loss_mlp": 1.02428079, + "epoch": 0.6142792725086427, + "flos": 15042513260160.0, + "grad_norm": 2.385226690327553, + "language_loss": 0.80102211, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.82256722, + "num_input_tokens_seen": 220035780, + "step": 10217, + "time_per_iteration": 2.5665090084075928 + }, + { + "auxiliary_loss_clip": 0.01096361, + "auxiliary_loss_mlp": 0.01032764, + "balance_loss_clip": 1.04036403, + "balance_loss_mlp": 1.0193646, + "epoch": 0.6143393957613107, + "flos": 23330947547520.0, + "grad_norm": 2.363056906877031, + "language_loss": 0.7822212, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.80351239, + "num_input_tokens_seen": 220054280, + "step": 10218, + "time_per_iteration": 2.659820795059204 + }, + { + "auxiliary_loss_clip": 0.01108038, + "auxiliary_loss_mlp": 0.01034908, + "balance_loss_clip": 1.04290485, + "balance_loss_mlp": 1.02203321, + "epoch": 0.6143995190139786, + "flos": 20117086058880.0, + "grad_norm": 1.5950804577882065, + "language_loss": 0.8189528, + "learning_rate": 1.367095017101569e-06, + "loss": 0.84038228, + "num_input_tokens_seen": 220074120, + "step": 10219, + "time_per_iteration": 4.207094192504883 + }, + { + "auxiliary_loss_clip": 0.01098839, + "auxiliary_loss_mlp": 0.01035321, + "balance_loss_clip": 1.0370295, + "balance_loss_mlp": 1.02146316, + "epoch": 0.6144596422666466, + "flos": 42303559489920.0, + "grad_norm": 2.5627103076938424, + "language_loss": 0.66738832, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.68872988, + "num_input_tokens_seen": 220096320, + "step": 10220, + "time_per_iteration": 2.7829878330230713 + }, + { + "auxiliary_loss_clip": 0.01103534, + "auxiliary_loss_mlp": 0.01029408, + "balance_loss_clip": 1.03913307, + "balance_loss_mlp": 1.01669455, + "epoch": 0.6145197655193146, + "flos": 21573622558080.0, + "grad_norm": 1.8637833274966709, + "language_loss": 0.71766376, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.73899317, + "num_input_tokens_seen": 220114850, + "step": 10221, + "time_per_iteration": 2.621060609817505 + }, + { + "auxiliary_loss_clip": 0.01066987, + "auxiliary_loss_mlp": 0.01030601, + "balance_loss_clip": 1.03472996, + "balance_loss_mlp": 1.01779163, + "epoch": 0.6145798887719826, + "flos": 21471098163840.0, + "grad_norm": 1.725179067455254, + "language_loss": 0.79747754, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.81845343, + "num_input_tokens_seen": 220133395, + "step": 10222, + "time_per_iteration": 2.7557356357574463 + }, + { + "auxiliary_loss_clip": 0.01092387, + "auxiliary_loss_mlp": 0.01042666, + "balance_loss_clip": 1.04074025, + "balance_loss_mlp": 1.02842045, + "epoch": 0.6146400120246506, + "flos": 20777016683520.0, + "grad_norm": 1.8633750333173091, + "language_loss": 0.76163048, + "learning_rate": 1.365617422821788e-06, + "loss": 0.78298092, + "num_input_tokens_seen": 220152790, + "step": 10223, + "time_per_iteration": 2.649580717086792 + }, + { + "auxiliary_loss_clip": 0.01093219, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.04123545, + "balance_loss_mlp": 1.02193058, + "epoch": 0.6147001352773185, + "flos": 13881306384000.0, + "grad_norm": 1.8872928812493461, + "language_loss": 0.78260607, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.80388576, + "num_input_tokens_seen": 220169535, + "step": 10224, + "time_per_iteration": 2.6781771183013916 + }, + { + "auxiliary_loss_clip": 0.01076582, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.03500175, + "balance_loss_mlp": 1.02076793, + "epoch": 0.6147602585299865, + "flos": 56641791807360.0, + "grad_norm": 1.4371349679447827, + "language_loss": 0.66419935, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.68529069, + "num_input_tokens_seen": 220195305, + "step": 10225, + "time_per_iteration": 3.0390100479125977 + }, + { + "auxiliary_loss_clip": 0.01103654, + "auxiliary_loss_mlp": 0.00771954, + "balance_loss_clip": 1.04197466, + "balance_loss_mlp": 1.00015104, + "epoch": 0.6148203817826544, + "flos": 32817217605120.0, + "grad_norm": 2.067542776960223, + "language_loss": 0.6355052, + "learning_rate": 1.364509479649357e-06, + "loss": 0.65426129, + "num_input_tokens_seen": 220215040, + "step": 10226, + "time_per_iteration": 2.744330644607544 + }, + { + "auxiliary_loss_clip": 0.01090925, + "auxiliary_loss_mlp": 0.01037806, + "balance_loss_clip": 1.03825569, + "balance_loss_mlp": 1.02304804, + "epoch": 0.6148805050353224, + "flos": 18332038748160.0, + "grad_norm": 1.7718988021259403, + "language_loss": 0.75872779, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.78001511, + "num_input_tokens_seen": 220234205, + "step": 10227, + "time_per_iteration": 2.7481887340545654 + }, + { + "auxiliary_loss_clip": 0.01054701, + "auxiliary_loss_mlp": 0.01043082, + "balance_loss_clip": 1.03239739, + "balance_loss_mlp": 1.02689981, + "epoch": 0.6149406282879903, + "flos": 14063983977600.0, + "grad_norm": 2.209409032208413, + "language_loss": 0.62177163, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.64274943, + "num_input_tokens_seen": 220252730, + "step": 10228, + "time_per_iteration": 2.797832489013672 + }, + { + "auxiliary_loss_clip": 0.0109079, + "auxiliary_loss_mlp": 0.01033221, + "balance_loss_clip": 1.03737903, + "balance_loss_mlp": 1.0200839, + "epoch": 0.6150007515406584, + "flos": 25190186400000.0, + "grad_norm": 2.3158396173840683, + "language_loss": 0.74483359, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.7660737, + "num_input_tokens_seen": 220273345, + "step": 10229, + "time_per_iteration": 2.7949423789978027 + }, + { + "auxiliary_loss_clip": 0.01118363, + "auxiliary_loss_mlp": 0.01039286, + "balance_loss_clip": 1.04305434, + "balance_loss_mlp": 1.02533805, + "epoch": 0.6150608747933263, + "flos": 21945262625280.0, + "grad_norm": 1.6423781673268174, + "language_loss": 0.7801019, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.80167842, + "num_input_tokens_seen": 220293845, + "step": 10230, + "time_per_iteration": 2.666316509246826 + }, + { + "auxiliary_loss_clip": 0.01086667, + "auxiliary_loss_mlp": 0.01029888, + "balance_loss_clip": 1.03686535, + "balance_loss_mlp": 1.01674509, + "epoch": 0.6151209980459943, + "flos": 30117453523200.0, + "grad_norm": 1.4482184431954421, + "language_loss": 0.73085076, + "learning_rate": 1.36266338983927e-06, + "loss": 0.75201631, + "num_input_tokens_seen": 220316070, + "step": 10231, + "time_per_iteration": 2.7693657875061035 + }, + { + "auxiliary_loss_clip": 0.01095915, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.04084241, + "balance_loss_mlp": 1.02099395, + "epoch": 0.6151811212986622, + "flos": 30008356940160.0, + "grad_norm": 1.525819080770735, + "language_loss": 0.69824755, + "learning_rate": 1.362294244324858e-06, + "loss": 0.71954578, + "num_input_tokens_seen": 220335695, + "step": 10232, + "time_per_iteration": 2.682452917098999 + }, + { + "auxiliary_loss_clip": 0.01099274, + "auxiliary_loss_mlp": 0.00770809, + "balance_loss_clip": 1.03777719, + "balance_loss_mlp": 1.00007868, + "epoch": 0.6152412445513302, + "flos": 18872888808960.0, + "grad_norm": 2.3038424014240215, + "language_loss": 0.91654289, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.93524379, + "num_input_tokens_seen": 220353720, + "step": 10233, + "time_per_iteration": 2.6199569702148438 + }, + { + "auxiliary_loss_clip": 0.01083051, + "auxiliary_loss_mlp": 0.01033569, + "balance_loss_clip": 1.04041195, + "balance_loss_mlp": 1.02191687, + "epoch": 0.6153013678039982, + "flos": 25703601448320.0, + "grad_norm": 1.8226041312601646, + "language_loss": 0.71622181, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.73738801, + "num_input_tokens_seen": 220372515, + "step": 10234, + "time_per_iteration": 2.6806395053863525 + }, + { + "auxiliary_loss_clip": 0.01107194, + "auxiliary_loss_mlp": 0.00771951, + "balance_loss_clip": 1.04099405, + "balance_loss_mlp": 1.0002284, + "epoch": 0.6153614910566662, + "flos": 28510271383680.0, + "grad_norm": 2.918285420802953, + "language_loss": 0.66839552, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.68718696, + "num_input_tokens_seen": 220393490, + "step": 10235, + "time_per_iteration": 2.896367073059082 + }, + { + "auxiliary_loss_clip": 0.01102816, + "auxiliary_loss_mlp": 0.0103213, + "balance_loss_clip": 1.04112911, + "balance_loss_mlp": 1.01878452, + "epoch": 0.6154216143093342, + "flos": 23549787158400.0, + "grad_norm": 1.534901762766115, + "language_loss": 0.81011724, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.83146667, + "num_input_tokens_seen": 220412855, + "step": 10236, + "time_per_iteration": 2.679506301879883 + }, + { + "auxiliary_loss_clip": 0.01117813, + "auxiliary_loss_mlp": 0.01032266, + "balance_loss_clip": 1.04047644, + "balance_loss_mlp": 1.01949906, + "epoch": 0.6154817375620021, + "flos": 22748081552640.0, + "grad_norm": 1.522081781804378, + "language_loss": 0.80553526, + "learning_rate": 1.360448879760721e-06, + "loss": 0.82703608, + "num_input_tokens_seen": 220433440, + "step": 10237, + "time_per_iteration": 2.6127498149871826 + }, + { + "auxiliary_loss_clip": 0.011004, + "auxiliary_loss_mlp": 0.01042204, + "balance_loss_clip": 1.04215753, + "balance_loss_mlp": 1.02890038, + "epoch": 0.6155418608146701, + "flos": 27162975121920.0, + "grad_norm": 1.7653521660078044, + "language_loss": 0.75694555, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.77837157, + "num_input_tokens_seen": 220453445, + "step": 10238, + "time_per_iteration": 2.7021820545196533 + }, + { + "auxiliary_loss_clip": 0.00990356, + "auxiliary_loss_mlp": 0.01013988, + "balance_loss_clip": 1.01446486, + "balance_loss_mlp": 1.01235473, + "epoch": 0.615601984067338, + "flos": 68811165014400.0, + "grad_norm": 0.760761219232036, + "language_loss": 0.57602662, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.59607005, + "num_input_tokens_seen": 220509730, + "step": 10239, + "time_per_iteration": 3.3009963035583496 + }, + { + "auxiliary_loss_clip": 0.01096252, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.03823948, + "balance_loss_mlp": 1.01997805, + "epoch": 0.615662107320006, + "flos": 15517144598400.0, + "grad_norm": 1.7796767695280624, + "language_loss": 0.77439094, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.79568958, + "num_input_tokens_seen": 220527295, + "step": 10240, + "time_per_iteration": 2.7327582836151123 + }, + { + "auxiliary_loss_clip": 0.011174, + "auxiliary_loss_mlp": 0.01036437, + "balance_loss_clip": 1.04190874, + "balance_loss_mlp": 1.02288342, + "epoch": 0.615722230572674, + "flos": 21063691128960.0, + "grad_norm": 3.4544456151934315, + "language_loss": 0.73013711, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.75167549, + "num_input_tokens_seen": 220542730, + "step": 10241, + "time_per_iteration": 2.6023552417755127 + }, + { + "auxiliary_loss_clip": 0.01112719, + "auxiliary_loss_mlp": 0.01028871, + "balance_loss_clip": 1.03958392, + "balance_loss_mlp": 1.01619887, + "epoch": 0.615782353825342, + "flos": 23256791919360.0, + "grad_norm": 1.6070807308789545, + "language_loss": 0.72045815, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.7418741, + "num_input_tokens_seen": 220562995, + "step": 10242, + "time_per_iteration": 2.6226117610931396 + }, + { + "auxiliary_loss_clip": 0.0110498, + "auxiliary_loss_mlp": 0.01029697, + "balance_loss_clip": 1.04025722, + "balance_loss_mlp": 1.01723933, + "epoch": 0.6158424770780099, + "flos": 21103911383040.0, + "grad_norm": 3.2758585328662693, + "language_loss": 0.72332186, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.74466866, + "num_input_tokens_seen": 220581775, + "step": 10243, + "time_per_iteration": 2.6781527996063232 + }, + { + "auxiliary_loss_clip": 0.01030422, + "auxiliary_loss_mlp": 0.01003075, + "balance_loss_clip": 1.01600218, + "balance_loss_mlp": 1.00200224, + "epoch": 0.6159026003306779, + "flos": 70333276769280.0, + "grad_norm": 0.7877801586989086, + "language_loss": 0.56873554, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.5890705, + "num_input_tokens_seen": 220646395, + "step": 10244, + "time_per_iteration": 3.2125418186187744 + }, + { + "auxiliary_loss_clip": 0.01114981, + "auxiliary_loss_mlp": 0.01034292, + "balance_loss_clip": 1.03982329, + "balance_loss_mlp": 1.02022541, + "epoch": 0.6159627235833458, + "flos": 33874355802240.0, + "grad_norm": 1.5742269245602847, + "language_loss": 0.63524461, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.65673733, + "num_input_tokens_seen": 220668335, + "step": 10245, + "time_per_iteration": 2.7619571685791016 + }, + { + "auxiliary_loss_clip": 0.01065921, + "auxiliary_loss_mlp": 0.01029863, + "balance_loss_clip": 1.03640854, + "balance_loss_mlp": 1.01676226, + "epoch": 0.6160228468360138, + "flos": 26575440359040.0, + "grad_norm": 2.04251017264565, + "language_loss": 0.79142463, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.81238246, + "num_input_tokens_seen": 220688915, + "step": 10246, + "time_per_iteration": 2.799443483352661 + }, + { + "auxiliary_loss_clip": 0.01079892, + "auxiliary_loss_mlp": 0.00772846, + "balance_loss_clip": 1.03852773, + "balance_loss_mlp": 1.00013709, + "epoch": 0.6160829700886818, + "flos": 17193274894080.0, + "grad_norm": 3.4946061818357115, + "language_loss": 0.87453389, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.89306134, + "num_input_tokens_seen": 220703465, + "step": 10247, + "time_per_iteration": 2.652655839920044 + }, + { + "auxiliary_loss_clip": 0.01044965, + "auxiliary_loss_mlp": 0.01035448, + "balance_loss_clip": 1.03624761, + "balance_loss_mlp": 1.02157784, + "epoch": 0.6161430933413498, + "flos": 23623547736960.0, + "grad_norm": 1.6669970799602325, + "language_loss": 0.79791045, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.81871456, + "num_input_tokens_seen": 220722090, + "step": 10248, + "time_per_iteration": 2.742093563079834 + }, + { + "auxiliary_loss_clip": 0.01068661, + "auxiliary_loss_mlp": 0.010344, + "balance_loss_clip": 1.03618228, + "balance_loss_mlp": 1.02193117, + "epoch": 0.6162032165940178, + "flos": 23002436736000.0, + "grad_norm": 3.2255403010195884, + "language_loss": 0.87085855, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.89188921, + "num_input_tokens_seen": 220741075, + "step": 10249, + "time_per_iteration": 2.7385706901550293 + }, + { + "auxiliary_loss_clip": 0.01115811, + "auxiliary_loss_mlp": 0.01026714, + "balance_loss_clip": 1.04125154, + "balance_loss_mlp": 1.01251006, + "epoch": 0.6162633398466857, + "flos": 39421979740800.0, + "grad_norm": 2.234106446125174, + "language_loss": 0.69080746, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.71223265, + "num_input_tokens_seen": 220763395, + "step": 10250, + "time_per_iteration": 2.736942768096924 + }, + { + "auxiliary_loss_clip": 0.0108508, + "auxiliary_loss_mlp": 0.0102783, + "balance_loss_clip": 1.03718221, + "balance_loss_mlp": 1.01533055, + "epoch": 0.6163234630993537, + "flos": 19244672530560.0, + "grad_norm": 1.84490130709099, + "language_loss": 0.74013072, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.76125979, + "num_input_tokens_seen": 220780640, + "step": 10251, + "time_per_iteration": 2.736994504928589 + }, + { + "auxiliary_loss_clip": 0.01098297, + "auxiliary_loss_mlp": 0.01035781, + "balance_loss_clip": 1.03710103, + "balance_loss_mlp": 1.02119529, + "epoch": 0.6163835863520216, + "flos": 15961791058560.0, + "grad_norm": 2.3749552307580615, + "language_loss": 0.68138051, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.7027213, + "num_input_tokens_seen": 220797960, + "step": 10252, + "time_per_iteration": 2.5879385471343994 + }, + { + "auxiliary_loss_clip": 0.00977001, + "auxiliary_loss_mlp": 0.01000711, + "balance_loss_clip": 1.01395059, + "balance_loss_mlp": 0.9993996, + "epoch": 0.6164437096046896, + "flos": 68103834393600.0, + "grad_norm": 0.8911370167619598, + "language_loss": 0.57833099, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.59810811, + "num_input_tokens_seen": 220856930, + "step": 10253, + "time_per_iteration": 6.5962769985198975 + }, + { + "auxiliary_loss_clip": 0.0109176, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.03666162, + "balance_loss_mlp": 1.01960826, + "epoch": 0.6165038328573575, + "flos": 21361211481600.0, + "grad_norm": 1.506953433371801, + "language_loss": 0.80028725, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.82153523, + "num_input_tokens_seen": 220877595, + "step": 10254, + "time_per_iteration": 4.457768678665161 + }, + { + "auxiliary_loss_clip": 0.01092373, + "auxiliary_loss_mlp": 0.01030068, + "balance_loss_clip": 1.04135227, + "balance_loss_mlp": 1.01689541, + "epoch": 0.6165639561100256, + "flos": 21101972048640.0, + "grad_norm": 2.217497401179692, + "language_loss": 0.80495244, + "learning_rate": 1.353810600008846e-06, + "loss": 0.82617688, + "num_input_tokens_seen": 220896880, + "step": 10255, + "time_per_iteration": 2.730621814727783 + }, + { + "auxiliary_loss_clip": 0.010977, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.04147696, + "balance_loss_mlp": 1.01882291, + "epoch": 0.6166240793626935, + "flos": 25338533569920.0, + "grad_norm": 2.145694668444534, + "language_loss": 0.65628386, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.67758632, + "num_input_tokens_seen": 220916425, + "step": 10256, + "time_per_iteration": 2.7114098072052 + }, + { + "auxiliary_loss_clip": 0.01103834, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.04223847, + "balance_loss_mlp": 1.01924038, + "epoch": 0.6166842026153615, + "flos": 19682639061120.0, + "grad_norm": 1.5926214774863399, + "language_loss": 0.7198689, + "learning_rate": 1.353073501949825e-06, + "loss": 0.74122202, + "num_input_tokens_seen": 220935050, + "step": 10257, + "time_per_iteration": 2.633733034133911 + }, + { + "auxiliary_loss_clip": 0.01096088, + "auxiliary_loss_mlp": 0.01034844, + "balance_loss_clip": 1.04075146, + "balance_loss_mlp": 1.02102792, + "epoch": 0.6167443258680294, + "flos": 19318361281920.0, + "grad_norm": 1.5727725354833466, + "language_loss": 0.72232676, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.74363607, + "num_input_tokens_seen": 220953085, + "step": 10258, + "time_per_iteration": 4.227793455123901 + }, + { + "auxiliary_loss_clip": 0.010877, + "auxiliary_loss_mlp": 0.01041882, + "balance_loss_clip": 1.03643775, + "balance_loss_mlp": 1.02724326, + "epoch": 0.6168044491206974, + "flos": 25265239868160.0, + "grad_norm": 2.5764422484709026, + "language_loss": 0.63939095, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.66068673, + "num_input_tokens_seen": 220969050, + "step": 10259, + "time_per_iteration": 2.66133713722229 + }, + { + "auxiliary_loss_clip": 0.01079598, + "auxiliary_loss_mlp": 0.01032477, + "balance_loss_clip": 1.04043519, + "balance_loss_mlp": 1.01882792, + "epoch": 0.6168645723733654, + "flos": 13219903301760.0, + "grad_norm": 1.7806797732317314, + "language_loss": 0.71367824, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.73479903, + "num_input_tokens_seen": 220985825, + "step": 10260, + "time_per_iteration": 2.7046947479248047 + }, + { + "auxiliary_loss_clip": 0.01112627, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.04544723, + "balance_loss_mlp": 1.02161956, + "epoch": 0.6169246956260334, + "flos": 26652038112000.0, + "grad_norm": 2.1654324787038366, + "language_loss": 0.68724519, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.70873445, + "num_input_tokens_seen": 221004465, + "step": 10261, + "time_per_iteration": 2.6891751289367676 + }, + { + "auxiliary_loss_clip": 0.01077329, + "auxiliary_loss_mlp": 0.01039226, + "balance_loss_clip": 1.03780389, + "balance_loss_mlp": 1.02766895, + "epoch": 0.6169848188787014, + "flos": 23148413608320.0, + "grad_norm": 2.004758584780846, + "language_loss": 0.71780062, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.73896611, + "num_input_tokens_seen": 221023260, + "step": 10262, + "time_per_iteration": 2.7089951038360596 + }, + { + "auxiliary_loss_clip": 0.01096265, + "auxiliary_loss_mlp": 0.01036729, + "balance_loss_clip": 1.0397017, + "balance_loss_mlp": 1.02370548, + "epoch": 0.6170449421313693, + "flos": 23331917214720.0, + "grad_norm": 1.9399509227658047, + "language_loss": 0.70199084, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.72332084, + "num_input_tokens_seen": 221043090, + "step": 10263, + "time_per_iteration": 2.750321865081787 + }, + { + "auxiliary_loss_clip": 0.01051355, + "auxiliary_loss_mlp": 0.01030928, + "balance_loss_clip": 1.03560829, + "balance_loss_mlp": 1.01777899, + "epoch": 0.6171050653840373, + "flos": 15851617067520.0, + "grad_norm": 2.2572438712768217, + "language_loss": 0.75942671, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.78024954, + "num_input_tokens_seen": 221061435, + "step": 10264, + "time_per_iteration": 2.868535041809082 + }, + { + "auxiliary_loss_clip": 0.0111535, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.04105282, + "balance_loss_mlp": 1.0207653, + "epoch": 0.6171651886367052, + "flos": 20045516209920.0, + "grad_norm": 2.372576687009926, + "language_loss": 0.85552394, + "learning_rate": 1.350126092092247e-06, + "loss": 0.87702072, + "num_input_tokens_seen": 221078705, + "step": 10265, + "time_per_iteration": 2.8565142154693604 + }, + { + "auxiliary_loss_clip": 0.01067477, + "auxiliary_loss_mlp": 0.01039322, + "balance_loss_clip": 1.04373622, + "balance_loss_mlp": 1.0262332, + "epoch": 0.6172253118893732, + "flos": 26432695710720.0, + "grad_norm": 1.8305416019911092, + "language_loss": 0.64584678, + "learning_rate": 1.349757776608153e-06, + "loss": 0.66691476, + "num_input_tokens_seen": 221099245, + "step": 10266, + "time_per_iteration": 2.8642327785491943 + }, + { + "auxiliary_loss_clip": 0.01077105, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.03542173, + "balance_loss_mlp": 1.02038074, + "epoch": 0.6172854351420412, + "flos": 22632879657600.0, + "grad_norm": 1.5801224931645446, + "language_loss": 0.75690526, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.77801126, + "num_input_tokens_seen": 221116930, + "step": 10267, + "time_per_iteration": 2.6700358390808105 + }, + { + "auxiliary_loss_clip": 0.01085691, + "auxiliary_loss_mlp": 0.01032961, + "balance_loss_clip": 1.03821349, + "balance_loss_mlp": 1.01927543, + "epoch": 0.6173455583947092, + "flos": 21212936138880.0, + "grad_norm": 1.8670913933452218, + "language_loss": 0.75156605, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.77275252, + "num_input_tokens_seen": 221137660, + "step": 10268, + "time_per_iteration": 2.696876287460327 + }, + { + "auxiliary_loss_clip": 0.01094834, + "auxiliary_loss_mlp": 0.01028833, + "balance_loss_clip": 1.03917122, + "balance_loss_mlp": 1.01574397, + "epoch": 0.6174056816473771, + "flos": 19500284689920.0, + "grad_norm": 1.6535000846549075, + "language_loss": 0.75516117, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.77639782, + "num_input_tokens_seen": 221156225, + "step": 10269, + "time_per_iteration": 2.602811098098755 + }, + { + "auxiliary_loss_clip": 0.01112983, + "auxiliary_loss_mlp": 0.01032291, + "balance_loss_clip": 1.03888416, + "balance_loss_mlp": 1.01934433, + "epoch": 0.6174658049000451, + "flos": 15997342544640.0, + "grad_norm": 2.0658565412775864, + "language_loss": 0.76633871, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.78779137, + "num_input_tokens_seen": 221173820, + "step": 10270, + "time_per_iteration": 2.4974937438964844 + }, + { + "auxiliary_loss_clip": 0.01094367, + "auxiliary_loss_mlp": 0.01029373, + "balance_loss_clip": 1.03897905, + "balance_loss_mlp": 1.01614046, + "epoch": 0.617525928152713, + "flos": 21903893136000.0, + "grad_norm": 1.7132984501088018, + "language_loss": 0.82571089, + "learning_rate": 1.347916569325736e-06, + "loss": 0.84694827, + "num_input_tokens_seen": 221191815, + "step": 10271, + "time_per_iteration": 2.5579023361206055 + }, + { + "auxiliary_loss_clip": 0.01117578, + "auxiliary_loss_mlp": 0.00770278, + "balance_loss_clip": 1.04181647, + "balance_loss_mlp": 1.00026119, + "epoch": 0.617586051405381, + "flos": 21105958458240.0, + "grad_norm": 1.7753710890796277, + "language_loss": 0.77119303, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.79007161, + "num_input_tokens_seen": 221211205, + "step": 10272, + "time_per_iteration": 2.5040929317474365 + }, + { + "auxiliary_loss_clip": 0.01010193, + "auxiliary_loss_mlp": 0.01008445, + "balance_loss_clip": 1.01500225, + "balance_loss_mlp": 1.00734258, + "epoch": 0.617646174658049, + "flos": 58610776665600.0, + "grad_norm": 0.8102559733678494, + "language_loss": 0.59036177, + "learning_rate": 1.347180259404513e-06, + "loss": 0.61054814, + "num_input_tokens_seen": 221268430, + "step": 10273, + "time_per_iteration": 3.0667202472686768 + }, + { + "auxiliary_loss_clip": 0.0108364, + "auxiliary_loss_mlp": 0.01039906, + "balance_loss_clip": 1.03496802, + "balance_loss_mlp": 1.02545786, + "epoch": 0.617706297910717, + "flos": 13878684691200.0, + "grad_norm": 2.411144915020525, + "language_loss": 0.73045421, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.75168967, + "num_input_tokens_seen": 221281930, + "step": 10274, + "time_per_iteration": 2.608651638031006 + }, + { + "auxiliary_loss_clip": 0.0110423, + "auxiliary_loss_mlp": 0.00770133, + "balance_loss_clip": 1.04004967, + "balance_loss_mlp": 1.00015223, + "epoch": 0.617766421163385, + "flos": 19208438686080.0, + "grad_norm": 2.134780547516878, + "language_loss": 0.77694172, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.79568529, + "num_input_tokens_seen": 221301605, + "step": 10275, + "time_per_iteration": 2.588878631591797 + }, + { + "auxiliary_loss_clip": 0.01073523, + "auxiliary_loss_mlp": 0.01029844, + "balance_loss_clip": 1.03674793, + "balance_loss_mlp": 1.01733303, + "epoch": 0.6178265444160529, + "flos": 22565978576640.0, + "grad_norm": 2.554653383498776, + "language_loss": 0.79304695, + "learning_rate": 1.346075980219998e-06, + "loss": 0.8140806, + "num_input_tokens_seen": 221320105, + "step": 10276, + "time_per_iteration": 2.704596757888794 + }, + { + "auxiliary_loss_clip": 0.0104785, + "auxiliary_loss_mlp": 0.0103935, + "balance_loss_clip": 1.03442883, + "balance_loss_mlp": 1.02518225, + "epoch": 0.6178866676687209, + "flos": 11984289402240.0, + "grad_norm": 1.984181156670454, + "language_loss": 0.80967486, + "learning_rate": 1.345707936733612e-06, + "loss": 0.83054686, + "num_input_tokens_seen": 221335915, + "step": 10277, + "time_per_iteration": 2.7356364727020264 + }, + { + "auxiliary_loss_clip": 0.01088845, + "auxiliary_loss_mlp": 0.01030881, + "balance_loss_clip": 1.04154968, + "balance_loss_mlp": 1.01682067, + "epoch": 0.6179467909213888, + "flos": 20991510748800.0, + "grad_norm": 1.5775634797191704, + "language_loss": 0.81279171, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.83398896, + "num_input_tokens_seen": 221353965, + "step": 10278, + "time_per_iteration": 2.703054666519165 + }, + { + "auxiliary_loss_clip": 0.0106686, + "auxiliary_loss_mlp": 0.00769812, + "balance_loss_clip": 1.03503084, + "balance_loss_mlp": 1.00006652, + "epoch": 0.6180069141740568, + "flos": 25338102606720.0, + "grad_norm": 1.5156506321196916, + "language_loss": 0.74347699, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.76184368, + "num_input_tokens_seen": 221374080, + "step": 10279, + "time_per_iteration": 2.777080774307251 + }, + { + "auxiliary_loss_clip": 0.01096628, + "auxiliary_loss_mlp": 0.010318, + "balance_loss_clip": 1.03583622, + "balance_loss_mlp": 1.01950932, + "epoch": 0.6180670374267248, + "flos": 19645722858240.0, + "grad_norm": 1.5230976022896776, + "language_loss": 0.70880997, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.73009425, + "num_input_tokens_seen": 221392910, + "step": 10280, + "time_per_iteration": 2.682345151901245 + }, + { + "auxiliary_loss_clip": 0.01116485, + "auxiliary_loss_mlp": 0.01035107, + "balance_loss_clip": 1.04136443, + "balance_loss_mlp": 1.02197635, + "epoch": 0.6181271606793928, + "flos": 19464876858240.0, + "grad_norm": 1.5388475151652443, + "language_loss": 0.72637439, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.74789023, + "num_input_tokens_seen": 221410990, + "step": 10281, + "time_per_iteration": 2.546891927719116 + }, + { + "auxiliary_loss_clip": 0.01091569, + "auxiliary_loss_mlp": 0.0102895, + "balance_loss_clip": 1.04059482, + "balance_loss_mlp": 1.01733923, + "epoch": 0.6181872839320607, + "flos": 25594289383680.0, + "grad_norm": 1.5263826245103997, + "language_loss": 0.76680994, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.78801513, + "num_input_tokens_seen": 221431020, + "step": 10282, + "time_per_iteration": 2.6794841289520264 + }, + { + "auxiliary_loss_clip": 0.0108706, + "auxiliary_loss_mlp": 0.01034652, + "balance_loss_clip": 1.03559065, + "balance_loss_mlp": 1.01857102, + "epoch": 0.6182474071847287, + "flos": 25551806572800.0, + "grad_norm": 1.675077981875324, + "language_loss": 0.69088876, + "learning_rate": 1.343500197330931e-06, + "loss": 0.71210587, + "num_input_tokens_seen": 221453235, + "step": 10283, + "time_per_iteration": 2.704653263092041 + }, + { + "auxiliary_loss_clip": 0.01110364, + "auxiliary_loss_mlp": 0.01029861, + "balance_loss_clip": 1.03980327, + "balance_loss_mlp": 1.01613414, + "epoch": 0.6183075304373966, + "flos": 22123738327680.0, + "grad_norm": 1.6796519430341141, + "language_loss": 0.75191927, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.77332163, + "num_input_tokens_seen": 221472560, + "step": 10284, + "time_per_iteration": 2.613283395767212 + }, + { + "auxiliary_loss_clip": 0.010977, + "auxiliary_loss_mlp": 0.01036897, + "balance_loss_clip": 1.04041815, + "balance_loss_mlp": 1.02422476, + "epoch": 0.6183676536900646, + "flos": 22455589104000.0, + "grad_norm": 1.4535785054838537, + "language_loss": 0.75249875, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.77384472, + "num_input_tokens_seen": 221492835, + "step": 10285, + "time_per_iteration": 2.661404848098755 + }, + { + "auxiliary_loss_clip": 0.01076492, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.03464127, + "balance_loss_mlp": 1.02047253, + "epoch": 0.6184277769427327, + "flos": 23364128736000.0, + "grad_norm": 1.9348516071602069, + "language_loss": 0.72801822, + "learning_rate": 1.342396663517503e-06, + "loss": 0.74911165, + "num_input_tokens_seen": 221511870, + "step": 10286, + "time_per_iteration": 2.7692575454711914 + }, + { + "auxiliary_loss_clip": 0.01112181, + "auxiliary_loss_mlp": 0.01029502, + "balance_loss_clip": 1.03996992, + "balance_loss_mlp": 1.01705098, + "epoch": 0.6184879001954006, + "flos": 22711057608960.0, + "grad_norm": 1.6994058202973141, + "language_loss": 0.76147521, + "learning_rate": 1.342028868767199e-06, + "loss": 0.78289199, + "num_input_tokens_seen": 221529915, + "step": 10287, + "time_per_iteration": 2.737244129180908 + }, + { + "auxiliary_loss_clip": 0.01075986, + "auxiliary_loss_mlp": 0.01033857, + "balance_loss_clip": 1.038939, + "balance_loss_mlp": 1.02116728, + "epoch": 0.6185480234480686, + "flos": 23841920471040.0, + "grad_norm": 1.661792493637227, + "language_loss": 0.73342609, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.75452453, + "num_input_tokens_seen": 221549745, + "step": 10288, + "time_per_iteration": 2.738234281539917 + }, + { + "auxiliary_loss_clip": 0.01099888, + "auxiliary_loss_mlp": 0.01035399, + "balance_loss_clip": 1.03925002, + "balance_loss_mlp": 1.0233885, + "epoch": 0.6186081467007365, + "flos": 45477595774080.0, + "grad_norm": 1.4788464659042324, + "language_loss": 0.72843671, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.7497896, + "num_input_tokens_seen": 221572455, + "step": 10289, + "time_per_iteration": 2.870210886001587 + }, + { + "auxiliary_loss_clip": 0.01088106, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.0376215, + "balance_loss_mlp": 1.01749849, + "epoch": 0.6186682699534045, + "flos": 23550864566400.0, + "grad_norm": 1.4742798847115595, + "language_loss": 0.79430723, + "learning_rate": 1.340925634274056e-06, + "loss": 0.81549788, + "num_input_tokens_seen": 221591325, + "step": 10290, + "time_per_iteration": 2.7061526775360107 + }, + { + "auxiliary_loss_clip": 0.01104029, + "auxiliary_loss_mlp": 0.01033504, + "balance_loss_clip": 1.03934646, + "balance_loss_mlp": 1.02068937, + "epoch": 0.6187283932060724, + "flos": 25774201630080.0, + "grad_norm": 1.6274786697127714, + "language_loss": 0.81492877, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.83630407, + "num_input_tokens_seen": 221611640, + "step": 10291, + "time_per_iteration": 2.664706230163574 + }, + { + "auxiliary_loss_clip": 0.01114199, + "auxiliary_loss_mlp": 0.01034165, + "balance_loss_clip": 1.04050338, + "balance_loss_mlp": 1.02185655, + "epoch": 0.6187885164587404, + "flos": 25265203954560.0, + "grad_norm": 1.5926453232151345, + "language_loss": 0.77492392, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.79640758, + "num_input_tokens_seen": 221631225, + "step": 10292, + "time_per_iteration": 4.222437381744385 + }, + { + "auxiliary_loss_clip": 0.01085532, + "auxiliary_loss_mlp": 0.01041109, + "balance_loss_clip": 1.03610599, + "balance_loss_mlp": 1.02526617, + "epoch": 0.6188486397114084, + "flos": 26250772302720.0, + "grad_norm": 2.004631291368857, + "language_loss": 0.7354871, + "learning_rate": 1.339822624710401e-06, + "loss": 0.75675344, + "num_input_tokens_seen": 221651035, + "step": 10293, + "time_per_iteration": 4.283612251281738 + }, + { + "auxiliary_loss_clip": 0.01083695, + "auxiliary_loss_mlp": 0.0077033, + "balance_loss_clip": 1.03986382, + "balance_loss_mlp": 1.00014317, + "epoch": 0.6189087629640764, + "flos": 20923388605440.0, + "grad_norm": 1.9118403389506524, + "language_loss": 0.8346625, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.85320276, + "num_input_tokens_seen": 221671300, + "step": 10294, + "time_per_iteration": 4.339020013809204 + }, + { + "auxiliary_loss_clip": 0.01097661, + "auxiliary_loss_mlp": 0.01034696, + "balance_loss_clip": 1.04166722, + "balance_loss_mlp": 1.02219725, + "epoch": 0.6189688862167443, + "flos": 14829814874880.0, + "grad_norm": 2.141454584748579, + "language_loss": 0.706837, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.72816062, + "num_input_tokens_seen": 221687320, + "step": 10295, + "time_per_iteration": 2.631901264190674 + }, + { + "auxiliary_loss_clip": 0.01115282, + "auxiliary_loss_mlp": 0.01039296, + "balance_loss_clip": 1.04228771, + "balance_loss_mlp": 1.02599859, + "epoch": 0.6190290094694123, + "flos": 24285058560000.0, + "grad_norm": 1.7512650583676883, + "language_loss": 0.70329851, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.72484434, + "num_input_tokens_seen": 221710175, + "step": 10296, + "time_per_iteration": 2.689392566680908 + }, + { + "auxiliary_loss_clip": 0.01081279, + "auxiliary_loss_mlp": 0.01034769, + "balance_loss_clip": 1.03957784, + "balance_loss_mlp": 1.02048767, + "epoch": 0.6190891327220802, + "flos": 22529457423360.0, + "grad_norm": 1.9634695381003797, + "language_loss": 0.71536231, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.73652285, + "num_input_tokens_seen": 221728145, + "step": 10297, + "time_per_iteration": 2.7065582275390625 + }, + { + "auxiliary_loss_clip": 0.01036404, + "auxiliary_loss_mlp": 0.01000643, + "balance_loss_clip": 1.01235867, + "balance_loss_mlp": 0.99964732, + "epoch": 0.6191492559747482, + "flos": 67729357152000.0, + "grad_norm": 0.8790158844538737, + "language_loss": 0.64109659, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.66146708, + "num_input_tokens_seen": 221786100, + "step": 10298, + "time_per_iteration": 4.634017467498779 + }, + { + "auxiliary_loss_clip": 0.01116645, + "auxiliary_loss_mlp": 0.01033648, + "balance_loss_clip": 1.04158056, + "balance_loss_mlp": 1.02121425, + "epoch": 0.6192093792274163, + "flos": 22346672088960.0, + "grad_norm": 1.7807348336033566, + "language_loss": 0.74117303, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.762676, + "num_input_tokens_seen": 221806450, + "step": 10299, + "time_per_iteration": 2.6040680408477783 + }, + { + "auxiliary_loss_clip": 0.01108454, + "auxiliary_loss_mlp": 0.01030477, + "balance_loss_clip": 1.0418222, + "balance_loss_mlp": 1.01792383, + "epoch": 0.6192695024800842, + "flos": 13553944807680.0, + "grad_norm": 1.8290075775776669, + "language_loss": 0.68678868, + "learning_rate": 1.337249812568732e-06, + "loss": 0.70817792, + "num_input_tokens_seen": 221823330, + "step": 10300, + "time_per_iteration": 2.641167163848877 + }, + { + "auxiliary_loss_clip": 0.01101551, + "auxiliary_loss_mlp": 0.00770748, + "balance_loss_clip": 1.04044676, + "balance_loss_mlp": 1.00015926, + "epoch": 0.6193296257327522, + "flos": 17415310815360.0, + "grad_norm": 1.7786248978132038, + "language_loss": 0.66813135, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.68685436, + "num_input_tokens_seen": 221839360, + "step": 10301, + "time_per_iteration": 2.639004945755005 + }, + { + "auxiliary_loss_clip": 0.01072819, + "auxiliary_loss_mlp": 0.01035838, + "balance_loss_clip": 1.03622746, + "balance_loss_mlp": 1.02365446, + "epoch": 0.6193897489854201, + "flos": 31101118450560.0, + "grad_norm": 1.5932766793388897, + "language_loss": 0.72753853, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.74862504, + "num_input_tokens_seen": 221859465, + "step": 10302, + "time_per_iteration": 2.7263267040252686 + }, + { + "auxiliary_loss_clip": 0.01090931, + "auxiliary_loss_mlp": 0.01030697, + "balance_loss_clip": 1.0426929, + "balance_loss_mlp": 1.01734614, + "epoch": 0.6194498722380881, + "flos": 19134031662720.0, + "grad_norm": 1.7635802463343486, + "language_loss": 0.80626869, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.82748497, + "num_input_tokens_seen": 221878555, + "step": 10303, + "time_per_iteration": 2.674865961074829 + }, + { + "auxiliary_loss_clip": 0.01117513, + "auxiliary_loss_mlp": 0.01032597, + "balance_loss_clip": 1.04101253, + "balance_loss_mlp": 1.01882231, + "epoch": 0.619509995490756, + "flos": 21835088634240.0, + "grad_norm": 4.546006861231834, + "language_loss": 0.76722652, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.78872764, + "num_input_tokens_seen": 221898790, + "step": 10304, + "time_per_iteration": 2.578068256378174 + }, + { + "auxiliary_loss_clip": 0.01085456, + "auxiliary_loss_mlp": 0.0103497, + "balance_loss_clip": 1.04078317, + "balance_loss_mlp": 1.02160096, + "epoch": 0.619570118743424, + "flos": 23806548552960.0, + "grad_norm": 2.037308303130727, + "language_loss": 0.77085918, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.79206347, + "num_input_tokens_seen": 221918875, + "step": 10305, + "time_per_iteration": 2.6557652950286865 + }, + { + "auxiliary_loss_clip": 0.01112573, + "auxiliary_loss_mlp": 0.01033009, + "balance_loss_clip": 1.04317331, + "balance_loss_mlp": 1.01832271, + "epoch": 0.619630241996092, + "flos": 21101612912640.0, + "grad_norm": 1.5905815409224004, + "language_loss": 0.7876581, + "learning_rate": 1.335045524968045e-06, + "loss": 0.80911398, + "num_input_tokens_seen": 221937895, + "step": 10306, + "time_per_iteration": 2.58312726020813 + }, + { + "auxiliary_loss_clip": 0.01056494, + "auxiliary_loss_mlp": 0.01030878, + "balance_loss_clip": 1.03859866, + "balance_loss_mlp": 1.0192728, + "epoch": 0.61969036524876, + "flos": 27308269635840.0, + "grad_norm": 1.649742748314876, + "language_loss": 0.80246294, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.82333666, + "num_input_tokens_seen": 221955920, + "step": 10307, + "time_per_iteration": 2.7693941593170166 + }, + { + "auxiliary_loss_clip": 0.01001046, + "auxiliary_loss_mlp": 0.01015241, + "balance_loss_clip": 1.01444507, + "balance_loss_mlp": 1.0141207, + "epoch": 0.6197504885014279, + "flos": 51648955384320.0, + "grad_norm": 0.8068090756771118, + "language_loss": 0.59387553, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.61403841, + "num_input_tokens_seen": 222011405, + "step": 10308, + "time_per_iteration": 3.2183339595794678 + }, + { + "auxiliary_loss_clip": 0.01087174, + "auxiliary_loss_mlp": 0.01030837, + "balance_loss_clip": 1.03852654, + "balance_loss_mlp": 1.01956522, + "epoch": 0.6198106117540959, + "flos": 30557107992960.0, + "grad_norm": 1.7201847601109612, + "language_loss": 0.67907512, + "learning_rate": 1.333943721384037e-06, + "loss": 0.70025527, + "num_input_tokens_seen": 222034545, + "step": 10309, + "time_per_iteration": 2.728565216064453 + }, + { + "auxiliary_loss_clip": 0.01083478, + "auxiliary_loss_mlp": 0.0103687, + "balance_loss_clip": 1.03543091, + "balance_loss_mlp": 1.02430511, + "epoch": 0.6198707350067638, + "flos": 18909733184640.0, + "grad_norm": 1.5362872726536445, + "language_loss": 0.72323126, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.74443471, + "num_input_tokens_seen": 222052690, + "step": 10310, + "time_per_iteration": 2.7349348068237305 + }, + { + "auxiliary_loss_clip": 0.01098291, + "auxiliary_loss_mlp": 0.01037346, + "balance_loss_clip": 1.04345024, + "balance_loss_mlp": 1.02295148, + "epoch": 0.6199308582594318, + "flos": 21433858738560.0, + "grad_norm": 2.3493071886977948, + "language_loss": 0.79078376, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.81214017, + "num_input_tokens_seen": 222069095, + "step": 10311, + "time_per_iteration": 2.682654857635498 + }, + { + "auxiliary_loss_clip": 0.01081352, + "auxiliary_loss_mlp": 0.01035506, + "balance_loss_clip": 1.0394609, + "balance_loss_mlp": 1.02252364, + "epoch": 0.6199909815120999, + "flos": 18407379525120.0, + "grad_norm": 1.7307569913604643, + "language_loss": 0.72513938, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.74630797, + "num_input_tokens_seen": 222087360, + "step": 10312, + "time_per_iteration": 2.677211284637451 + }, + { + "auxiliary_loss_clip": 0.01071298, + "auxiliary_loss_mlp": 0.01034778, + "balance_loss_clip": 1.04210687, + "balance_loss_mlp": 1.02137268, + "epoch": 0.6200511047647678, + "flos": 21466860359040.0, + "grad_norm": 3.7235217852030926, + "language_loss": 0.72115338, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.74221408, + "num_input_tokens_seen": 222106130, + "step": 10313, + "time_per_iteration": 2.7689011096954346 + }, + { + "auxiliary_loss_clip": 0.01108898, + "auxiliary_loss_mlp": 0.01033235, + "balance_loss_clip": 1.04191053, + "balance_loss_mlp": 1.01971102, + "epoch": 0.6201112280174358, + "flos": 18215903099520.0, + "grad_norm": 1.7819666620639945, + "language_loss": 0.78249431, + "learning_rate": 1.332107887401416e-06, + "loss": 0.80391562, + "num_input_tokens_seen": 222123125, + "step": 10314, + "time_per_iteration": 2.618197441101074 + }, + { + "auxiliary_loss_clip": 0.01102699, + "auxiliary_loss_mlp": 0.01031591, + "balance_loss_clip": 1.03891587, + "balance_loss_mlp": 1.01907969, + "epoch": 0.6201713512701037, + "flos": 20011185786240.0, + "grad_norm": 1.747606387674539, + "language_loss": 0.78019774, + "learning_rate": 1.331740796528812e-06, + "loss": 0.80154061, + "num_input_tokens_seen": 222140655, + "step": 10315, + "time_per_iteration": 2.6219210624694824 + }, + { + "auxiliary_loss_clip": 0.01081861, + "auxiliary_loss_mlp": 0.01033596, + "balance_loss_clip": 1.0434972, + "balance_loss_mlp": 1.02088857, + "epoch": 0.6202314745227717, + "flos": 22487692884480.0, + "grad_norm": 2.153515207542012, + "language_loss": 0.76050055, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.78165507, + "num_input_tokens_seen": 222160450, + "step": 10316, + "time_per_iteration": 2.766108989715576 + }, + { + "auxiliary_loss_clip": 0.01115322, + "auxiliary_loss_mlp": 0.01032068, + "balance_loss_clip": 1.03810644, + "balance_loss_mlp": 1.01903796, + "epoch": 0.6202915977754396, + "flos": 26828682220800.0, + "grad_norm": 2.0292313073024366, + "language_loss": 0.77797258, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.79944646, + "num_input_tokens_seen": 222179170, + "step": 10317, + "time_per_iteration": 2.66479754447937 + }, + { + "auxiliary_loss_clip": 0.01017104, + "auxiliary_loss_mlp": 0.01000773, + "balance_loss_clip": 1.01230764, + "balance_loss_mlp": 0.99964064, + "epoch": 0.6203517210281076, + "flos": 62742694890240.0, + "grad_norm": 0.6983272901342329, + "language_loss": 0.59043646, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.61061525, + "num_input_tokens_seen": 222242660, + "step": 10318, + "time_per_iteration": 3.26334547996521 + }, + { + "auxiliary_loss_clip": 0.01087685, + "auxiliary_loss_mlp": 0.01036361, + "balance_loss_clip": 1.04098892, + "balance_loss_mlp": 1.02262819, + "epoch": 0.6204118442807756, + "flos": 23404277162880.0, + "grad_norm": 1.7402353399266621, + "language_loss": 0.77895933, + "learning_rate": 1.330272686582143e-06, + "loss": 0.80019981, + "num_input_tokens_seen": 222262170, + "step": 10319, + "time_per_iteration": 2.729206085205078 + }, + { + "auxiliary_loss_clip": 0.01095977, + "auxiliary_loss_mlp": 0.01036689, + "balance_loss_clip": 1.04197454, + "balance_loss_mlp": 1.02473831, + "epoch": 0.6204719675334436, + "flos": 20193647898240.0, + "grad_norm": 1.990293472142164, + "language_loss": 0.66651958, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.6878463, + "num_input_tokens_seen": 222280375, + "step": 10320, + "time_per_iteration": 2.6254241466522217 + }, + { + "auxiliary_loss_clip": 0.0107265, + "auxiliary_loss_mlp": 0.01032632, + "balance_loss_clip": 1.03743291, + "balance_loss_mlp": 1.02023411, + "epoch": 0.6205320907861115, + "flos": 13188050916480.0, + "grad_norm": 1.82656973457559, + "language_loss": 0.76147729, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.78253013, + "num_input_tokens_seen": 222297325, + "step": 10321, + "time_per_iteration": 2.7273271083831787 + }, + { + "auxiliary_loss_clip": 0.01086085, + "auxiliary_loss_mlp": 0.01026791, + "balance_loss_clip": 1.03792763, + "balance_loss_mlp": 1.01485252, + "epoch": 0.6205922140387795, + "flos": 20668386977280.0, + "grad_norm": 1.806601811467465, + "language_loss": 0.73700678, + "learning_rate": 1.329171870732758e-06, + "loss": 0.75813556, + "num_input_tokens_seen": 222317095, + "step": 10322, + "time_per_iteration": 2.699514627456665 + }, + { + "auxiliary_loss_clip": 0.01074398, + "auxiliary_loss_mlp": 0.01028622, + "balance_loss_clip": 1.03568387, + "balance_loss_mlp": 1.01665354, + "epoch": 0.6206523372914474, + "flos": 23877831093120.0, + "grad_norm": 1.7201277094728098, + "language_loss": 0.72919118, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.75022137, + "num_input_tokens_seen": 222337055, + "step": 10323, + "time_per_iteration": 2.743650436401367 + }, + { + "auxiliary_loss_clip": 0.01111352, + "auxiliary_loss_mlp": 0.0103222, + "balance_loss_clip": 1.04181314, + "balance_loss_mlp": 1.0182364, + "epoch": 0.6207124605441154, + "flos": 13406603218560.0, + "grad_norm": 2.6397698912445495, + "language_loss": 0.58581293, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.60724854, + "num_input_tokens_seen": 222354515, + "step": 10324, + "time_per_iteration": 2.624112129211426 + }, + { + "auxiliary_loss_clip": 0.0107635, + "auxiliary_loss_mlp": 0.01039987, + "balance_loss_clip": 1.03851843, + "balance_loss_mlp": 1.02483535, + "epoch": 0.6207725837967835, + "flos": 18916341287040.0, + "grad_norm": 1.9960731674186785, + "language_loss": 0.77214384, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.79330719, + "num_input_tokens_seen": 222372755, + "step": 10325, + "time_per_iteration": 2.7152631282806396 + }, + { + "auxiliary_loss_clip": 0.01106149, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.03993106, + "balance_loss_mlp": 1.01689494, + "epoch": 0.6208327070494514, + "flos": 23980211832960.0, + "grad_norm": 1.8479718801200147, + "language_loss": 0.72421134, + "learning_rate": 1.327704472462003e-06, + "loss": 0.74557668, + "num_input_tokens_seen": 222391380, + "step": 10326, + "time_per_iteration": 2.7786142826080322 + }, + { + "auxiliary_loss_clip": 0.01108733, + "auxiliary_loss_mlp": 0.01040278, + "balance_loss_clip": 1.04103386, + "balance_loss_mlp": 1.02686155, + "epoch": 0.6208928303021194, + "flos": 22820405587200.0, + "grad_norm": 2.631988552550178, + "language_loss": 0.74086714, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.76235723, + "num_input_tokens_seen": 222411165, + "step": 10327, + "time_per_iteration": 2.6204168796539307 + }, + { + "auxiliary_loss_clip": 0.01090969, + "auxiliary_loss_mlp": 0.01032322, + "balance_loss_clip": 1.03982306, + "balance_loss_mlp": 1.01871443, + "epoch": 0.6209529535547873, + "flos": 17564519911680.0, + "grad_norm": 1.9488386802913455, + "language_loss": 0.79213655, + "learning_rate": 1.326970926232066e-06, + "loss": 0.81336939, + "num_input_tokens_seen": 222428110, + "step": 10328, + "time_per_iteration": 2.678966522216797 + }, + { + "auxiliary_loss_clip": 0.01080917, + "auxiliary_loss_mlp": 0.01040936, + "balance_loss_clip": 1.03594792, + "balance_loss_mlp": 1.02738202, + "epoch": 0.6210130768074553, + "flos": 22011912311040.0, + "grad_norm": 1.6747137440925206, + "language_loss": 0.77850568, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.79972422, + "num_input_tokens_seen": 222446385, + "step": 10329, + "time_per_iteration": 2.7247962951660156 + }, + { + "auxiliary_loss_clip": 0.01022383, + "auxiliary_loss_mlp": 0.01002444, + "balance_loss_clip": 1.00971746, + "balance_loss_mlp": 1.00120986, + "epoch": 0.6210732000601232, + "flos": 63676873854720.0, + "grad_norm": 0.8323168859834922, + "language_loss": 0.62231028, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.64255857, + "num_input_tokens_seen": 222502150, + "step": 10330, + "time_per_iteration": 3.1397132873535156 + }, + { + "auxiliary_loss_clip": 0.01109711, + "auxiliary_loss_mlp": 0.01039515, + "balance_loss_clip": 1.04052687, + "balance_loss_mlp": 1.02538919, + "epoch": 0.6211333233127913, + "flos": 24243365848320.0, + "grad_norm": 1.916638297562339, + "language_loss": 0.77865416, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.80014634, + "num_input_tokens_seen": 222519880, + "step": 10331, + "time_per_iteration": 4.165555715560913 + }, + { + "auxiliary_loss_clip": 0.01119225, + "auxiliary_loss_mlp": 0.01036042, + "balance_loss_clip": 1.04211998, + "balance_loss_mlp": 1.0226016, + "epoch": 0.6211934465654592, + "flos": 16943803960320.0, + "grad_norm": 2.274669690788456, + "language_loss": 0.67796123, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.69951391, + "num_input_tokens_seen": 222538545, + "step": 10332, + "time_per_iteration": 4.209641933441162 + }, + { + "auxiliary_loss_clip": 0.01082735, + "auxiliary_loss_mlp": 0.01033197, + "balance_loss_clip": 1.03757524, + "balance_loss_mlp": 1.0202266, + "epoch": 0.6212535698181272, + "flos": 15267386355840.0, + "grad_norm": 1.6414227257739276, + "language_loss": 0.76285797, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.78401732, + "num_input_tokens_seen": 222556935, + "step": 10333, + "time_per_iteration": 4.338353157043457 + }, + { + "auxiliary_loss_clip": 0.01086354, + "auxiliary_loss_mlp": 0.01035943, + "balance_loss_clip": 1.03819084, + "balance_loss_mlp": 1.02344966, + "epoch": 0.6213136930707951, + "flos": 13443950384640.0, + "grad_norm": 2.217560323857708, + "language_loss": 0.69773704, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.71896005, + "num_input_tokens_seen": 222574035, + "step": 10334, + "time_per_iteration": 2.6839816570281982 + }, + { + "auxiliary_loss_clip": 0.01092709, + "auxiliary_loss_mlp": 0.00770618, + "balance_loss_clip": 1.03960049, + "balance_loss_mlp": 1.00011337, + "epoch": 0.6213738163234631, + "flos": 18111223889280.0, + "grad_norm": 1.6672758368774196, + "language_loss": 0.69724143, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.71587467, + "num_input_tokens_seen": 222592290, + "step": 10335, + "time_per_iteration": 2.6737349033355713 + }, + { + "auxiliary_loss_clip": 0.01059124, + "auxiliary_loss_mlp": 0.01035916, + "balance_loss_clip": 1.03123188, + "balance_loss_mlp": 1.02301764, + "epoch": 0.621433939576131, + "flos": 25337348421120.0, + "grad_norm": 1.5976161024349493, + "language_loss": 0.79976332, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.82071376, + "num_input_tokens_seen": 222612805, + "step": 10336, + "time_per_iteration": 2.747412919998169 + }, + { + "auxiliary_loss_clip": 0.01113717, + "auxiliary_loss_mlp": 0.01036201, + "balance_loss_clip": 1.04143834, + "balance_loss_mlp": 1.02375555, + "epoch": 0.621494062828799, + "flos": 22565619440640.0, + "grad_norm": 1.7008650000144097, + "language_loss": 0.73422229, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.75572157, + "num_input_tokens_seen": 222632260, + "step": 10337, + "time_per_iteration": 4.168013334274292 + }, + { + "auxiliary_loss_clip": 0.01118051, + "auxiliary_loss_mlp": 0.01039175, + "balance_loss_clip": 1.04091513, + "balance_loss_mlp": 1.0258832, + "epoch": 0.621554186081467, + "flos": 27417976750080.0, + "grad_norm": 4.811980339506567, + "language_loss": 0.63192534, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.65349758, + "num_input_tokens_seen": 222653570, + "step": 10338, + "time_per_iteration": 2.640453815460205 + }, + { + "auxiliary_loss_clip": 0.01103195, + "auxiliary_loss_mlp": 0.01037115, + "balance_loss_clip": 1.0407145, + "balance_loss_mlp": 1.0245744, + "epoch": 0.621614309334135, + "flos": 22346815743360.0, + "grad_norm": 1.5973259219647309, + "language_loss": 0.71490097, + "learning_rate": 1.322938249724991e-06, + "loss": 0.73630404, + "num_input_tokens_seen": 222672480, + "step": 10339, + "time_per_iteration": 2.6346054077148438 + }, + { + "auxiliary_loss_clip": 0.01062852, + "auxiliary_loss_mlp": 0.01037431, + "balance_loss_clip": 1.03612769, + "balance_loss_mlp": 1.02370453, + "epoch": 0.621674432586803, + "flos": 19281229597440.0, + "grad_norm": 1.7281695006377986, + "language_loss": 0.69872439, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.71972716, + "num_input_tokens_seen": 222691200, + "step": 10340, + "time_per_iteration": 2.7176573276519775 + }, + { + "auxiliary_loss_clip": 0.01067449, + "auxiliary_loss_mlp": 0.01032185, + "balance_loss_clip": 1.03537023, + "balance_loss_mlp": 1.01955473, + "epoch": 0.6217345558394709, + "flos": 21609533180160.0, + "grad_norm": 2.160368660473176, + "language_loss": 0.68745732, + "learning_rate": 1.322205369037788e-06, + "loss": 0.70845366, + "num_input_tokens_seen": 222709975, + "step": 10341, + "time_per_iteration": 2.667415142059326 + }, + { + "auxiliary_loss_clip": 0.01105428, + "auxiliary_loss_mlp": 0.01033663, + "balance_loss_clip": 1.04163766, + "balance_loss_mlp": 1.01951921, + "epoch": 0.6217946790921389, + "flos": 18004102554240.0, + "grad_norm": 1.857842108735868, + "language_loss": 0.8084417, + "learning_rate": 1.321838967240299e-06, + "loss": 0.82983261, + "num_input_tokens_seen": 222729005, + "step": 10342, + "time_per_iteration": 2.6358642578125 + }, + { + "auxiliary_loss_clip": 0.01016012, + "auxiliary_loss_mlp": 0.01001969, + "balance_loss_clip": 1.01067889, + "balance_loss_mlp": 1.00081241, + "epoch": 0.6218548023448068, + "flos": 61973631768960.0, + "grad_norm": 0.7777664565041693, + "language_loss": 0.57339287, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.59357268, + "num_input_tokens_seen": 222786090, + "step": 10343, + "time_per_iteration": 3.105703830718994 + }, + { + "auxiliary_loss_clip": 0.01071779, + "auxiliary_loss_mlp": 0.01031556, + "balance_loss_clip": 1.03384042, + "balance_loss_mlp": 1.01972461, + "epoch": 0.6219149255974749, + "flos": 25739152934400.0, + "grad_norm": 1.873733183159078, + "language_loss": 0.7244643, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.74549764, + "num_input_tokens_seen": 222806100, + "step": 10344, + "time_per_iteration": 2.7128279209136963 + }, + { + "auxiliary_loss_clip": 0.01106863, + "auxiliary_loss_mlp": 0.01045674, + "balance_loss_clip": 1.04245842, + "balance_loss_mlp": 1.03368115, + "epoch": 0.6219750488501428, + "flos": 25411073086080.0, + "grad_norm": 3.095022336982982, + "language_loss": 0.60327411, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.62479943, + "num_input_tokens_seen": 222826575, + "step": 10345, + "time_per_iteration": 2.741757392883301 + }, + { + "auxiliary_loss_clip": 0.01048609, + "auxiliary_loss_mlp": 0.01041234, + "balance_loss_clip": 1.03204262, + "balance_loss_mlp": 1.02753103, + "epoch": 0.6220351721028108, + "flos": 20047383717120.0, + "grad_norm": 1.8310337674001005, + "language_loss": 0.77749038, + "learning_rate": 1.320373617348614e-06, + "loss": 0.79838884, + "num_input_tokens_seen": 222845285, + "step": 10346, + "time_per_iteration": 2.770772695541382 + }, + { + "auxiliary_loss_clip": 0.01080995, + "auxiliary_loss_mlp": 0.01037279, + "balance_loss_clip": 1.03780663, + "balance_loss_mlp": 1.02326, + "epoch": 0.6220952953554787, + "flos": 27488397363840.0, + "grad_norm": 1.684158236808197, + "language_loss": 0.71739966, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.73858243, + "num_input_tokens_seen": 222864575, + "step": 10347, + "time_per_iteration": 2.708918333053589 + }, + { + "auxiliary_loss_clip": 0.01099172, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.03707337, + "balance_loss_mlp": 1.01956046, + "epoch": 0.6221554186081467, + "flos": 19207612673280.0, + "grad_norm": 1.7479247707562864, + "language_loss": 0.71972638, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.74103796, + "num_input_tokens_seen": 222884420, + "step": 10348, + "time_per_iteration": 2.7594058513641357 + }, + { + "auxiliary_loss_clip": 0.01001862, + "auxiliary_loss_mlp": 0.01006112, + "balance_loss_clip": 1.01154137, + "balance_loss_mlp": 1.00479472, + "epoch": 0.6222155418608146, + "flos": 62950939989120.0, + "grad_norm": 0.816855091094188, + "language_loss": 0.54121429, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.56129414, + "num_input_tokens_seen": 222944690, + "step": 10349, + "time_per_iteration": 3.2531776428222656 + }, + { + "auxiliary_loss_clip": 0.0107704, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.03621447, + "balance_loss_mlp": 1.01792502, + "epoch": 0.6222756651134826, + "flos": 22601099099520.0, + "grad_norm": 2.4846967665996234, + "language_loss": 0.69486421, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.71594191, + "num_input_tokens_seen": 222962990, + "step": 10350, + "time_per_iteration": 2.7475686073303223 + }, + { + "auxiliary_loss_clip": 0.01116919, + "auxiliary_loss_mlp": 0.01038055, + "balance_loss_clip": 1.04172456, + "balance_loss_mlp": 1.02536559, + "epoch": 0.6223357883661506, + "flos": 21142228216320.0, + "grad_norm": 1.8297714166368801, + "language_loss": 0.5704937, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.59204346, + "num_input_tokens_seen": 222980715, + "step": 10351, + "time_per_iteration": 2.675811290740967 + }, + { + "auxiliary_loss_clip": 0.01024035, + "auxiliary_loss_mlp": 0.01004222, + "balance_loss_clip": 1.01215839, + "balance_loss_mlp": 1.00262439, + "epoch": 0.6223959116188186, + "flos": 63765071700480.0, + "grad_norm": 0.8048031710876978, + "language_loss": 0.61121249, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.63149512, + "num_input_tokens_seen": 223040685, + "step": 10352, + "time_per_iteration": 3.2121970653533936 + }, + { + "auxiliary_loss_clip": 0.01111121, + "auxiliary_loss_mlp": 0.01037194, + "balance_loss_clip": 1.03907847, + "balance_loss_mlp": 1.02456367, + "epoch": 0.6224560348714866, + "flos": 22565727181440.0, + "grad_norm": 2.8594267132643267, + "language_loss": 0.82211882, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.84360194, + "num_input_tokens_seen": 223059000, + "step": 10353, + "time_per_iteration": 2.6481454372406006 + }, + { + "auxiliary_loss_clip": 0.01097506, + "auxiliary_loss_mlp": 0.01033284, + "balance_loss_clip": 1.03879428, + "balance_loss_mlp": 1.02166736, + "epoch": 0.6225161581241545, + "flos": 24097748112000.0, + "grad_norm": 1.6101266746131675, + "language_loss": 0.75329089, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.77459884, + "num_input_tokens_seen": 223079345, + "step": 10354, + "time_per_iteration": 2.672100067138672 + }, + { + "auxiliary_loss_clip": 0.01071329, + "auxiliary_loss_mlp": 0.01033068, + "balance_loss_clip": 1.03829408, + "balance_loss_mlp": 1.02011561, + "epoch": 0.6225762813768225, + "flos": 20443513881600.0, + "grad_norm": 1.4917034506382563, + "language_loss": 0.78818482, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.80922878, + "num_input_tokens_seen": 223097880, + "step": 10355, + "time_per_iteration": 2.6894590854644775 + }, + { + "auxiliary_loss_clip": 0.0110748, + "auxiliary_loss_mlp": 0.01038653, + "balance_loss_clip": 1.04353356, + "balance_loss_mlp": 1.0261302, + "epoch": 0.6226364046294904, + "flos": 27198131558400.0, + "grad_norm": 1.5243384390478247, + "language_loss": 0.7810744, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.80253577, + "num_input_tokens_seen": 223118185, + "step": 10356, + "time_per_iteration": 2.662597417831421 + }, + { + "auxiliary_loss_clip": 0.01095206, + "auxiliary_loss_mlp": 0.00771022, + "balance_loss_clip": 1.03841674, + "balance_loss_mlp": 1.0001657, + "epoch": 0.6226965278821585, + "flos": 20445776438400.0, + "grad_norm": 1.8782312562736863, + "language_loss": 0.6801585, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.69882077, + "num_input_tokens_seen": 223137600, + "step": 10357, + "time_per_iteration": 2.630401611328125 + }, + { + "auxiliary_loss_clip": 0.01095487, + "auxiliary_loss_mlp": 0.01037985, + "balance_loss_clip": 1.03887713, + "balance_loss_mlp": 1.02341211, + "epoch": 0.6227566511348264, + "flos": 22162737519360.0, + "grad_norm": 2.8474094143077453, + "language_loss": 0.76153404, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.78286874, + "num_input_tokens_seen": 223154360, + "step": 10358, + "time_per_iteration": 2.661013126373291 + }, + { + "auxiliary_loss_clip": 0.01092746, + "auxiliary_loss_mlp": 0.01033714, + "balance_loss_clip": 1.03905225, + "balance_loss_mlp": 1.02091646, + "epoch": 0.6228167743874944, + "flos": 18040875102720.0, + "grad_norm": 2.1492109037016287, + "language_loss": 0.82438827, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.84565282, + "num_input_tokens_seen": 223172255, + "step": 10359, + "time_per_iteration": 2.75612211227417 + }, + { + "auxiliary_loss_clip": 0.01084816, + "auxiliary_loss_mlp": 0.0105208, + "balance_loss_clip": 1.03617096, + "balance_loss_mlp": 1.0374589, + "epoch": 0.6228768976401623, + "flos": 17742851959680.0, + "grad_norm": 3.2541550800674046, + "language_loss": 0.73383337, + "learning_rate": 1.315248145768822e-06, + "loss": 0.75520235, + "num_input_tokens_seen": 223186965, + "step": 10360, + "time_per_iteration": 2.761385440826416 + }, + { + "auxiliary_loss_clip": 0.01103199, + "auxiliary_loss_mlp": 0.01038017, + "balance_loss_clip": 1.03837395, + "balance_loss_mlp": 1.025244, + "epoch": 0.6229370208928303, + "flos": 17894934144000.0, + "grad_norm": 1.937323368007563, + "language_loss": 0.77496618, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.79637837, + "num_input_tokens_seen": 223206045, + "step": 10361, + "time_per_iteration": 2.7078726291656494 + }, + { + "auxiliary_loss_clip": 0.0107034, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.0354948, + "balance_loss_mlp": 1.01774836, + "epoch": 0.6229971441454982, + "flos": 17347763289600.0, + "grad_norm": 2.088996135555703, + "language_loss": 0.6762352, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.69723737, + "num_input_tokens_seen": 223224820, + "step": 10362, + "time_per_iteration": 2.693016529083252 + }, + { + "auxiliary_loss_clip": 0.01095554, + "auxiliary_loss_mlp": 0.01033637, + "balance_loss_clip": 1.03886461, + "balance_loss_mlp": 1.02005267, + "epoch": 0.6230572673981662, + "flos": 29241376807680.0, + "grad_norm": 2.450509773967882, + "language_loss": 0.67575699, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.6970489, + "num_input_tokens_seen": 223243205, + "step": 10363, + "time_per_iteration": 2.7115700244903564 + }, + { + "auxiliary_loss_clip": 0.01068138, + "auxiliary_loss_mlp": 0.01032085, + "balance_loss_clip": 1.03868961, + "balance_loss_mlp": 1.01858449, + "epoch": 0.6231173906508342, + "flos": 16325961096960.0, + "grad_norm": 1.7878512911378444, + "language_loss": 0.86638474, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.88738704, + "num_input_tokens_seen": 223261370, + "step": 10364, + "time_per_iteration": 2.6732850074768066 + }, + { + "auxiliary_loss_clip": 0.01017483, + "auxiliary_loss_mlp": 0.01010257, + "balance_loss_clip": 1.01340818, + "balance_loss_mlp": 1.00900543, + "epoch": 0.6231775139035022, + "flos": 68702032517760.0, + "grad_norm": 0.8935233084209503, + "language_loss": 0.60708529, + "learning_rate": 1.313418851605015e-06, + "loss": 0.62736267, + "num_input_tokens_seen": 223315050, + "step": 10365, + "time_per_iteration": 3.2580301761627197 + }, + { + "auxiliary_loss_clip": 0.01085426, + "auxiliary_loss_mlp": 0.007721, + "balance_loss_clip": 1.04356837, + "balance_loss_mlp": 1.00019813, + "epoch": 0.6232376371561702, + "flos": 19821038163840.0, + "grad_norm": 1.9797808373338666, + "language_loss": 0.75283766, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.77141291, + "num_input_tokens_seen": 223332130, + "step": 10366, + "time_per_iteration": 2.695686101913452 + }, + { + "auxiliary_loss_clip": 0.01107257, + "auxiliary_loss_mlp": 0.01040192, + "balance_loss_clip": 1.04238236, + "balance_loss_mlp": 1.0269959, + "epoch": 0.6232977604088381, + "flos": 23258264376960.0, + "grad_norm": 3.5413788647782978, + "language_loss": 0.76049531, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.78196979, + "num_input_tokens_seen": 223351605, + "step": 10367, + "time_per_iteration": 2.6170830726623535 + }, + { + "auxiliary_loss_clip": 0.01102139, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.04015589, + "balance_loss_mlp": 1.02192068, + "epoch": 0.6233578836615061, + "flos": 21106425335040.0, + "grad_norm": 1.5257334476599056, + "language_loss": 0.78428042, + "learning_rate": 1.312321587418457e-06, + "loss": 0.80564719, + "num_input_tokens_seen": 223372090, + "step": 10368, + "time_per_iteration": 2.625438928604126 + }, + { + "auxiliary_loss_clip": 0.01052163, + "auxiliary_loss_mlp": 0.01035756, + "balance_loss_clip": 1.03783369, + "balance_loss_mlp": 1.02115321, + "epoch": 0.623418006914174, + "flos": 23769416868480.0, + "grad_norm": 2.0197111691245735, + "language_loss": 0.68460292, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.70548213, + "num_input_tokens_seen": 223390110, + "step": 10369, + "time_per_iteration": 2.808359146118164 + }, + { + "auxiliary_loss_clip": 0.01117993, + "auxiliary_loss_mlp": 0.0103678, + "balance_loss_clip": 1.04215741, + "balance_loss_mlp": 1.02325583, + "epoch": 0.6234781301668421, + "flos": 17890480857600.0, + "grad_norm": 2.044462972771541, + "language_loss": 0.88031048, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.90185821, + "num_input_tokens_seen": 223404205, + "step": 10370, + "time_per_iteration": 4.117987155914307 + }, + { + "auxiliary_loss_clip": 0.0111332, + "auxiliary_loss_mlp": 0.01029208, + "balance_loss_clip": 1.039994, + "balance_loss_mlp": 1.01634502, + "epoch": 0.62353825341951, + "flos": 26175503352960.0, + "grad_norm": 1.608857921427384, + "language_loss": 0.66079128, + "learning_rate": 1.311224557923402e-06, + "loss": 0.68221653, + "num_input_tokens_seen": 223424855, + "step": 10371, + "time_per_iteration": 4.359363079071045 + }, + { + "auxiliary_loss_clip": 0.01098316, + "auxiliary_loss_mlp": 0.01029741, + "balance_loss_clip": 1.03844571, + "balance_loss_mlp": 1.01849937, + "epoch": 0.623598376672178, + "flos": 31139902160640.0, + "grad_norm": 1.3363320294252738, + "language_loss": 0.77749312, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.79877365, + "num_input_tokens_seen": 223447225, + "step": 10372, + "time_per_iteration": 4.2803263664245605 + }, + { + "auxiliary_loss_clip": 0.01105747, + "auxiliary_loss_mlp": 0.01033888, + "balance_loss_clip": 1.0399971, + "balance_loss_mlp": 1.02063167, + "epoch": 0.6236584999248459, + "flos": 23730202195200.0, + "grad_norm": 1.86692873433382, + "language_loss": 0.77388912, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.79528546, + "num_input_tokens_seen": 223467520, + "step": 10373, + "time_per_iteration": 2.6164214611053467 + }, + { + "auxiliary_loss_clip": 0.01099988, + "auxiliary_loss_mlp": 0.01029626, + "balance_loss_clip": 1.03910232, + "balance_loss_mlp": 1.0177052, + "epoch": 0.6237186231775139, + "flos": 21762764599680.0, + "grad_norm": 1.5661441130214229, + "language_loss": 0.69628543, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.71758157, + "num_input_tokens_seen": 223488130, + "step": 10374, + "time_per_iteration": 2.620152711868286 + }, + { + "auxiliary_loss_clip": 0.0109877, + "auxiliary_loss_mlp": 0.01027687, + "balance_loss_clip": 1.04083633, + "balance_loss_mlp": 1.01481199, + "epoch": 0.6237787464301818, + "flos": 14939486075520.0, + "grad_norm": 1.8629467261116164, + "language_loss": 0.77406085, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.7953254, + "num_input_tokens_seen": 223505105, + "step": 10375, + "time_per_iteration": 2.662888526916504 + }, + { + "auxiliary_loss_clip": 0.0108805, + "auxiliary_loss_mlp": 0.01027669, + "balance_loss_clip": 1.03999758, + "balance_loss_mlp": 1.01531863, + "epoch": 0.6238388696828499, + "flos": 35590311302400.0, + "grad_norm": 1.5320519249858895, + "language_loss": 0.70062512, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.72178227, + "num_input_tokens_seen": 223528065, + "step": 10376, + "time_per_iteration": 2.7455239295959473 + }, + { + "auxiliary_loss_clip": 0.01087005, + "auxiliary_loss_mlp": 0.01030618, + "balance_loss_clip": 1.04036319, + "balance_loss_mlp": 1.01622987, + "epoch": 0.6238989929355178, + "flos": 23623511823360.0, + "grad_norm": 1.5317768555363875, + "language_loss": 0.76383424, + "learning_rate": 1.309031204505301e-06, + "loss": 0.78501046, + "num_input_tokens_seen": 223547305, + "step": 10377, + "time_per_iteration": 4.217595338821411 + }, + { + "auxiliary_loss_clip": 0.01095365, + "auxiliary_loss_mlp": 0.01032947, + "balance_loss_clip": 1.04230881, + "balance_loss_mlp": 1.02149701, + "epoch": 0.6239591161881858, + "flos": 22087468569600.0, + "grad_norm": 1.9922863755635154, + "language_loss": 0.68561447, + "learning_rate": 1.308665737227052e-06, + "loss": 0.70689762, + "num_input_tokens_seen": 223567205, + "step": 10378, + "time_per_iteration": 2.668548822402954 + }, + { + "auxiliary_loss_clip": 0.01089219, + "auxiliary_loss_mlp": 0.01032012, + "balance_loss_clip": 1.03835332, + "balance_loss_mlp": 1.01904845, + "epoch": 0.6240192394408538, + "flos": 24535930124160.0, + "grad_norm": 1.8244104489721222, + "language_loss": 0.76516432, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.78637671, + "num_input_tokens_seen": 223586560, + "step": 10379, + "time_per_iteration": 2.636387825012207 + }, + { + "auxiliary_loss_clip": 0.01091775, + "auxiliary_loss_mlp": 0.01029494, + "balance_loss_clip": 1.03986144, + "balance_loss_mlp": 1.01667941, + "epoch": 0.6240793626935217, + "flos": 27931930502400.0, + "grad_norm": 1.3063592721987374, + "language_loss": 0.79515195, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.81636459, + "num_input_tokens_seen": 223610595, + "step": 10380, + "time_per_iteration": 2.7264626026153564 + }, + { + "auxiliary_loss_clip": 0.01098611, + "auxiliary_loss_mlp": 0.01032438, + "balance_loss_clip": 1.04053771, + "balance_loss_mlp": 1.02064252, + "epoch": 0.6241394859461897, + "flos": 22892514140160.0, + "grad_norm": 1.5486607590861352, + "language_loss": 0.80008709, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.82139754, + "num_input_tokens_seen": 223630230, + "step": 10381, + "time_per_iteration": 2.6646101474761963 + }, + { + "auxiliary_loss_clip": 0.01089557, + "auxiliary_loss_mlp": 0.01035129, + "balance_loss_clip": 1.03794694, + "balance_loss_mlp": 1.0213902, + "epoch": 0.6241996091988576, + "flos": 12750766744320.0, + "grad_norm": 2.2250038803258256, + "language_loss": 0.74777293, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.76901984, + "num_input_tokens_seen": 223648360, + "step": 10382, + "time_per_iteration": 2.7230777740478516 + }, + { + "auxiliary_loss_clip": 0.01101818, + "auxiliary_loss_mlp": 0.01025191, + "balance_loss_clip": 1.03977156, + "balance_loss_mlp": 1.01332331, + "epoch": 0.6242597324515257, + "flos": 25851302173440.0, + "grad_norm": 1.6487787752646939, + "language_loss": 0.78440118, + "learning_rate": 1.306838794344911e-06, + "loss": 0.80567122, + "num_input_tokens_seen": 223671255, + "step": 10383, + "time_per_iteration": 2.7347943782806396 + }, + { + "auxiliary_loss_clip": 0.01078794, + "auxiliary_loss_mlp": 0.01030474, + "balance_loss_clip": 1.03457248, + "balance_loss_mlp": 1.01803493, + "epoch": 0.6243198557041936, + "flos": 19937712516480.0, + "grad_norm": 1.7448881929287328, + "language_loss": 0.74959773, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.77069044, + "num_input_tokens_seen": 223689860, + "step": 10384, + "time_per_iteration": 2.715670347213745 + }, + { + "auxiliary_loss_clip": 0.01090865, + "auxiliary_loss_mlp": 0.01039296, + "balance_loss_clip": 1.03685331, + "balance_loss_mlp": 1.02441823, + "epoch": 0.6243799789568616, + "flos": 18406194376320.0, + "grad_norm": 1.703443113253697, + "language_loss": 0.66354865, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.68485022, + "num_input_tokens_seen": 223707835, + "step": 10385, + "time_per_iteration": 2.6395132541656494 + }, + { + "auxiliary_loss_clip": 0.01017413, + "auxiliary_loss_mlp": 0.00999729, + "balance_loss_clip": 1.01207745, + "balance_loss_mlp": 0.998501, + "epoch": 0.6244401022095295, + "flos": 66027587523840.0, + "grad_norm": 0.7616108367019777, + "language_loss": 0.6200667, + "learning_rate": 1.305742943921692e-06, + "loss": 0.64023811, + "num_input_tokens_seen": 223771875, + "step": 10386, + "time_per_iteration": 3.2555150985717773 + }, + { + "auxiliary_loss_clip": 0.01103744, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.03913903, + "balance_loss_mlp": 1.01928985, + "epoch": 0.6245002254621975, + "flos": 24571266128640.0, + "grad_norm": 2.488369520959267, + "language_loss": 0.7205711, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.74193048, + "num_input_tokens_seen": 223788895, + "step": 10387, + "time_per_iteration": 2.6242222785949707 + }, + { + "auxiliary_loss_clip": 0.01111553, + "auxiliary_loss_mlp": 0.01040034, + "balance_loss_clip": 1.04189062, + "balance_loss_mlp": 1.0254066, + "epoch": 0.6245603487148654, + "flos": 29168837291520.0, + "grad_norm": 2.3305483255195787, + "language_loss": 0.65657806, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.67809391, + "num_input_tokens_seen": 223810385, + "step": 10388, + "time_per_iteration": 2.659313440322876 + }, + { + "auxiliary_loss_clip": 0.0107602, + "auxiliary_loss_mlp": 0.01029774, + "balance_loss_clip": 1.03905761, + "balance_loss_mlp": 1.01803207, + "epoch": 0.6246204719675335, + "flos": 14790097411200.0, + "grad_norm": 1.9152677822128796, + "language_loss": 0.79151481, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.81257272, + "num_input_tokens_seen": 223826040, + "step": 10389, + "time_per_iteration": 2.6531307697296143 + }, + { + "auxiliary_loss_clip": 0.0108775, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.036906, + "balance_loss_mlp": 1.02176762, + "epoch": 0.6246805952202014, + "flos": 12493538472960.0, + "grad_norm": 4.707823306169989, + "language_loss": 0.60542148, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.62664247, + "num_input_tokens_seen": 223842300, + "step": 10390, + "time_per_iteration": 2.6380884647369385 + }, + { + "auxiliary_loss_clip": 0.01095689, + "auxiliary_loss_mlp": 0.01032943, + "balance_loss_clip": 1.03998685, + "balance_loss_mlp": 1.02047336, + "epoch": 0.6247407184728694, + "flos": 12786677366400.0, + "grad_norm": 1.9478919515008288, + "language_loss": 0.76811498, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.78940129, + "num_input_tokens_seen": 223858320, + "step": 10391, + "time_per_iteration": 2.6485612392425537 + }, + { + "auxiliary_loss_clip": 0.01095815, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.0409584, + "balance_loss_mlp": 1.02074265, + "epoch": 0.6248008417255374, + "flos": 40629188960640.0, + "grad_norm": 1.4703588614112992, + "language_loss": 0.64372337, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.66502333, + "num_input_tokens_seen": 223883545, + "step": 10392, + "time_per_iteration": 2.8461811542510986 + }, + { + "auxiliary_loss_clip": 0.01096988, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.04133046, + "balance_loss_mlp": 1.02083135, + "epoch": 0.6248609649782053, + "flos": 19902017376000.0, + "grad_norm": 1.871769291735266, + "language_loss": 0.76746744, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.78878057, + "num_input_tokens_seen": 223901445, + "step": 10393, + "time_per_iteration": 2.637190818786621 + }, + { + "auxiliary_loss_clip": 0.01078713, + "auxiliary_loss_mlp": 0.00772119, + "balance_loss_clip": 1.03866291, + "balance_loss_mlp": 1.00009847, + "epoch": 0.6249210882308733, + "flos": 19682746801920.0, + "grad_norm": 1.7077234803990555, + "language_loss": 0.82370424, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.84221256, + "num_input_tokens_seen": 223920170, + "step": 10394, + "time_per_iteration": 2.6997132301330566 + }, + { + "auxiliary_loss_clip": 0.01095186, + "auxiliary_loss_mlp": 0.01037496, + "balance_loss_clip": 1.03878772, + "balance_loss_mlp": 1.02421618, + "epoch": 0.6249812114835412, + "flos": 13990726189440.0, + "grad_norm": 1.9873009659143388, + "language_loss": 0.75021064, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.77153742, + "num_input_tokens_seen": 223936495, + "step": 10395, + "time_per_iteration": 2.6623713970184326 + }, + { + "auxiliary_loss_clip": 0.01095635, + "auxiliary_loss_mlp": 0.01033862, + "balance_loss_clip": 1.03662229, + "balance_loss_mlp": 1.0203023, + "epoch": 0.6250413347362093, + "flos": 14530031965440.0, + "grad_norm": 3.229511376831138, + "language_loss": 0.72134733, + "learning_rate": 1.302091822487119e-06, + "loss": 0.74264228, + "num_input_tokens_seen": 223950070, + "step": 10396, + "time_per_iteration": 2.677992820739746 + }, + { + "auxiliary_loss_clip": 0.01075755, + "auxiliary_loss_mlp": 0.0103795, + "balance_loss_clip": 1.04014516, + "balance_loss_mlp": 1.0248127, + "epoch": 0.6251014579888772, + "flos": 22963006581120.0, + "grad_norm": 1.7904379273274065, + "language_loss": 0.75906593, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.78020298, + "num_input_tokens_seen": 223970065, + "step": 10397, + "time_per_iteration": 2.722014904022217 + }, + { + "auxiliary_loss_clip": 0.01092491, + "auxiliary_loss_mlp": 0.0103722, + "balance_loss_clip": 1.04022741, + "balance_loss_mlp": 1.02372003, + "epoch": 0.6251615812415452, + "flos": 28111232217600.0, + "grad_norm": 4.888327827010162, + "language_loss": 0.74880314, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.77010036, + "num_input_tokens_seen": 223990315, + "step": 10398, + "time_per_iteration": 2.7456398010253906 + }, + { + "auxiliary_loss_clip": 0.01117793, + "auxiliary_loss_mlp": 0.01031154, + "balance_loss_clip": 1.03983879, + "balance_loss_mlp": 1.01699233, + "epoch": 0.6252217044942131, + "flos": 26724469887360.0, + "grad_norm": 1.9767586095703997, + "language_loss": 0.73813987, + "learning_rate": 1.300997001489483e-06, + "loss": 0.75962937, + "num_input_tokens_seen": 224009960, + "step": 10399, + "time_per_iteration": 2.6542532444000244 + }, + { + "auxiliary_loss_clip": 0.01077509, + "auxiliary_loss_mlp": 0.01036637, + "balance_loss_clip": 1.03692555, + "balance_loss_mlp": 1.02285028, + "epoch": 0.6252818277468811, + "flos": 20006768413440.0, + "grad_norm": 1.7877165034058586, + "language_loss": 0.74266648, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.76380795, + "num_input_tokens_seen": 224028870, + "step": 10400, + "time_per_iteration": 2.6837148666381836 + }, + { + "auxiliary_loss_clip": 0.0101245, + "auxiliary_loss_mlp": 0.01001226, + "balance_loss_clip": 1.01475704, + "balance_loss_mlp": 0.99997389, + "epoch": 0.625341950999549, + "flos": 59278285059840.0, + "grad_norm": 0.8429284892848663, + "language_loss": 0.56419927, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.58433604, + "num_input_tokens_seen": 224094140, + "step": 10401, + "time_per_iteration": 3.3155579566955566 + }, + { + "auxiliary_loss_clip": 0.01107517, + "auxiliary_loss_mlp": 0.01034196, + "balance_loss_clip": 1.04071486, + "balance_loss_mlp": 1.02067709, + "epoch": 0.625402074252217, + "flos": 20157090831360.0, + "grad_norm": 2.04205601235836, + "language_loss": 0.83276439, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.85418153, + "num_input_tokens_seen": 224113235, + "step": 10402, + "time_per_iteration": 2.691084146499634 + }, + { + "auxiliary_loss_clip": 0.01036621, + "auxiliary_loss_mlp": 0.01034014, + "balance_loss_clip": 1.036587, + "balance_loss_mlp": 1.02084088, + "epoch": 0.625462197504885, + "flos": 29132531619840.0, + "grad_norm": 2.64185876470146, + "language_loss": 0.69291663, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.71362293, + "num_input_tokens_seen": 224134530, + "step": 10403, + "time_per_iteration": 2.9650638103485107 + }, + { + "auxiliary_loss_clip": 0.01081288, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.03741455, + "balance_loss_mlp": 1.01692796, + "epoch": 0.625522320757553, + "flos": 26104436294400.0, + "grad_norm": 1.773424214610222, + "language_loss": 0.71938539, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.74051046, + "num_input_tokens_seen": 224154170, + "step": 10404, + "time_per_iteration": 3.032392978668213 + }, + { + "auxiliary_loss_clip": 0.01071553, + "auxiliary_loss_mlp": 0.01037364, + "balance_loss_clip": 1.03673673, + "balance_loss_mlp": 1.02419138, + "epoch": 0.625582444010221, + "flos": 20630967984000.0, + "grad_norm": 1.988268046568807, + "language_loss": 0.69859874, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.71968794, + "num_input_tokens_seen": 224172730, + "step": 10405, + "time_per_iteration": 2.752593994140625 + }, + { + "auxiliary_loss_clip": 0.01088298, + "auxiliary_loss_mlp": 0.01038031, + "balance_loss_clip": 1.03901088, + "balance_loss_mlp": 1.02447712, + "epoch": 0.6256425672628889, + "flos": 20521512264960.0, + "grad_norm": 1.8903848634840759, + "language_loss": 0.7935456, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.81480896, + "num_input_tokens_seen": 224192620, + "step": 10406, + "time_per_iteration": 2.6944150924682617 + }, + { + "auxiliary_loss_clip": 0.01078593, + "auxiliary_loss_mlp": 0.01035645, + "balance_loss_clip": 1.0391295, + "balance_loss_mlp": 1.02321792, + "epoch": 0.6257026905155569, + "flos": 29529200488320.0, + "grad_norm": 1.7747095604461551, + "language_loss": 0.68853474, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.70967722, + "num_input_tokens_seen": 224214660, + "step": 10407, + "time_per_iteration": 2.7394134998321533 + }, + { + "auxiliary_loss_clip": 0.01101618, + "auxiliary_loss_mlp": 0.00769457, + "balance_loss_clip": 1.03912544, + "balance_loss_mlp": 1.00006318, + "epoch": 0.6257628137682248, + "flos": 24024885373440.0, + "grad_norm": 1.6542698687790116, + "language_loss": 0.8580991, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.87680984, + "num_input_tokens_seen": 224234170, + "step": 10408, + "time_per_iteration": 2.647240400314331 + }, + { + "auxiliary_loss_clip": 0.01090915, + "auxiliary_loss_mlp": 0.00769522, + "balance_loss_clip": 1.03742266, + "balance_loss_mlp": 1.00007892, + "epoch": 0.6258229370208929, + "flos": 20850956830080.0, + "grad_norm": 1.8769352919555562, + "language_loss": 0.79664773, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.81525207, + "num_input_tokens_seen": 224253115, + "step": 10409, + "time_per_iteration": 5.298889636993408 + }, + { + "auxiliary_loss_clip": 0.01091226, + "auxiliary_loss_mlp": 0.01033686, + "balance_loss_clip": 1.03762126, + "balance_loss_mlp": 1.02168143, + "epoch": 0.6258830602735608, + "flos": 22231542021120.0, + "grad_norm": 2.146507314015339, + "language_loss": 0.69629455, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.71754372, + "num_input_tokens_seen": 224271375, + "step": 10410, + "time_per_iteration": 2.7642364501953125 + }, + { + "auxiliary_loss_clip": 0.01066453, + "auxiliary_loss_mlp": 0.01032082, + "balance_loss_clip": 1.03571606, + "balance_loss_mlp": 1.01986265, + "epoch": 0.6259431835262288, + "flos": 25076887925760.0, + "grad_norm": 2.4453810502825153, + "language_loss": 0.67605823, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.6970436, + "num_input_tokens_seen": 224290315, + "step": 10411, + "time_per_iteration": 4.3257997035980225 + }, + { + "auxiliary_loss_clip": 0.0106799, + "auxiliary_loss_mlp": 0.01040597, + "balance_loss_clip": 1.03715658, + "balance_loss_mlp": 1.02818179, + "epoch": 0.6260033067788967, + "flos": 28252288926720.0, + "grad_norm": 1.954494979108325, + "language_loss": 0.69357151, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.71465743, + "num_input_tokens_seen": 224310545, + "step": 10412, + "time_per_iteration": 4.512540578842163 + }, + { + "auxiliary_loss_clip": 0.01080692, + "auxiliary_loss_mlp": 0.0104025, + "balance_loss_clip": 1.03551555, + "balance_loss_mlp": 1.02700531, + "epoch": 0.6260634300315647, + "flos": 23367432787200.0, + "grad_norm": 1.4726479761814617, + "language_loss": 0.6975283, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.71873772, + "num_input_tokens_seen": 224331115, + "step": 10413, + "time_per_iteration": 2.715327262878418 + }, + { + "auxiliary_loss_clip": 0.01083008, + "auxiliary_loss_mlp": 0.01034077, + "balance_loss_clip": 1.03659189, + "balance_loss_mlp": 1.01976025, + "epoch": 0.6261235532842326, + "flos": 18035308494720.0, + "grad_norm": 2.5748151630879277, + "language_loss": 0.80629605, + "learning_rate": 1.295526482316796e-06, + "loss": 0.82746685, + "num_input_tokens_seen": 224347525, + "step": 10414, + "time_per_iteration": 2.7809388637542725 + }, + { + "auxiliary_loss_clip": 0.0110639, + "auxiliary_loss_mlp": 0.01037432, + "balance_loss_clip": 1.04208875, + "balance_loss_mlp": 1.0249393, + "epoch": 0.6261836765369007, + "flos": 22011265866240.0, + "grad_norm": 1.7429212772998885, + "language_loss": 0.74786866, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.7693069, + "num_input_tokens_seen": 224367045, + "step": 10415, + "time_per_iteration": 2.790271282196045 + }, + { + "auxiliary_loss_clip": 0.01062067, + "auxiliary_loss_mlp": 0.01034612, + "balance_loss_clip": 1.03746879, + "balance_loss_mlp": 1.0215826, + "epoch": 0.6262437997895686, + "flos": 24936010784640.0, + "grad_norm": 1.5794864494822807, + "language_loss": 0.74193609, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.76290286, + "num_input_tokens_seen": 224388860, + "step": 10416, + "time_per_iteration": 2.7647581100463867 + }, + { + "auxiliary_loss_clip": 0.01086432, + "auxiliary_loss_mlp": 0.01033307, + "balance_loss_clip": 1.04012477, + "balance_loss_mlp": 1.02101088, + "epoch": 0.6263039230422366, + "flos": 31608428186880.0, + "grad_norm": 1.6472166500534797, + "language_loss": 0.84573495, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.86693239, + "num_input_tokens_seen": 224409645, + "step": 10417, + "time_per_iteration": 4.274592638015747 + }, + { + "auxiliary_loss_clip": 0.01105981, + "auxiliary_loss_mlp": 0.0103493, + "balance_loss_clip": 1.04019403, + "balance_loss_mlp": 1.02186441, + "epoch": 0.6263640462949046, + "flos": 17639465639040.0, + "grad_norm": 2.0790985994239066, + "language_loss": 0.56728101, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.58869016, + "num_input_tokens_seen": 224428530, + "step": 10418, + "time_per_iteration": 2.691500186920166 + }, + { + "auxiliary_loss_clip": 0.01110622, + "auxiliary_loss_mlp": 0.0104313, + "balance_loss_clip": 1.04013753, + "balance_loss_mlp": 1.0293498, + "epoch": 0.6264241695475725, + "flos": 19974951941760.0, + "grad_norm": 1.8736530467564598, + "language_loss": 0.8455261, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.86706358, + "num_input_tokens_seen": 224447175, + "step": 10419, + "time_per_iteration": 2.739027261734009 + }, + { + "auxiliary_loss_clip": 0.01119559, + "auxiliary_loss_mlp": 0.01031671, + "balance_loss_clip": 1.04406238, + "balance_loss_mlp": 1.01907599, + "epoch": 0.6264842928002405, + "flos": 27344323912320.0, + "grad_norm": 1.509247263381085, + "language_loss": 0.6426456, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.66415787, + "num_input_tokens_seen": 224469445, + "step": 10420, + "time_per_iteration": 2.7180798053741455 + }, + { + "auxiliary_loss_clip": 0.01076087, + "auxiliary_loss_mlp": 0.01035851, + "balance_loss_clip": 1.03824723, + "balance_loss_mlp": 1.02204061, + "epoch": 0.6265444160529084, + "flos": 22997265177600.0, + "grad_norm": 2.1707304020443527, + "language_loss": 0.86138391, + "learning_rate": 1.292975627485741e-06, + "loss": 0.88250327, + "num_input_tokens_seen": 224486590, + "step": 10421, + "time_per_iteration": 2.7487831115722656 + }, + { + "auxiliary_loss_clip": 0.01078665, + "auxiliary_loss_mlp": 0.01036628, + "balance_loss_clip": 1.03799725, + "balance_loss_mlp": 1.02374697, + "epoch": 0.6266045393055765, + "flos": 19938323047680.0, + "grad_norm": 2.422674057917065, + "language_loss": 0.79407763, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.81523055, + "num_input_tokens_seen": 224502795, + "step": 10422, + "time_per_iteration": 2.8828704357147217 + }, + { + "auxiliary_loss_clip": 0.01104293, + "auxiliary_loss_mlp": 0.01027022, + "balance_loss_clip": 1.04006767, + "balance_loss_mlp": 1.01370621, + "epoch": 0.6266646625582444, + "flos": 24389091325440.0, + "grad_norm": 2.2930026415354368, + "language_loss": 0.74455339, + "learning_rate": 1.292247052906389e-06, + "loss": 0.76586652, + "num_input_tokens_seen": 224522300, + "step": 10423, + "time_per_iteration": 2.7208752632141113 + }, + { + "auxiliary_loss_clip": 0.01114032, + "auxiliary_loss_mlp": 0.01028546, + "balance_loss_clip": 1.04019392, + "balance_loss_mlp": 1.01625562, + "epoch": 0.6267247858109124, + "flos": 14683802088960.0, + "grad_norm": 1.9557551713522223, + "language_loss": 0.7775594, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.79898518, + "num_input_tokens_seen": 224538260, + "step": 10424, + "time_per_iteration": 2.592926263809204 + }, + { + "auxiliary_loss_clip": 0.01113819, + "auxiliary_loss_mlp": 0.01032907, + "balance_loss_clip": 1.04032254, + "balance_loss_mlp": 1.01879287, + "epoch": 0.6267849090635803, + "flos": 24929977299840.0, + "grad_norm": 2.1677847187028605, + "language_loss": 0.6903978, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.71186507, + "num_input_tokens_seen": 224559155, + "step": 10425, + "time_per_iteration": 2.668877363204956 + }, + { + "auxiliary_loss_clip": 0.01089804, + "auxiliary_loss_mlp": 0.01029639, + "balance_loss_clip": 1.03939557, + "balance_loss_mlp": 1.01808131, + "epoch": 0.6268450323162483, + "flos": 25337851211520.0, + "grad_norm": 1.4857408938723873, + "language_loss": 0.74492955, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.76612389, + "num_input_tokens_seen": 224578660, + "step": 10426, + "time_per_iteration": 2.720566987991333 + }, + { + "auxiliary_loss_clip": 0.01106657, + "auxiliary_loss_mlp": 0.00770492, + "balance_loss_clip": 1.04118848, + "balance_loss_mlp": 1.00016105, + "epoch": 0.6269051555689162, + "flos": 26177299032960.0, + "grad_norm": 2.445291482107416, + "language_loss": 0.80835652, + "learning_rate": 1.290790225914929e-06, + "loss": 0.82712793, + "num_input_tokens_seen": 224599080, + "step": 10427, + "time_per_iteration": 2.6930294036865234 + }, + { + "auxiliary_loss_clip": 0.01083192, + "auxiliary_loss_mlp": 0.01039458, + "balance_loss_clip": 1.03919089, + "balance_loss_mlp": 1.02608228, + "epoch": 0.6269652788215843, + "flos": 18256877539200.0, + "grad_norm": 2.002033794251086, + "language_loss": 0.68361104, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.70483756, + "num_input_tokens_seen": 224614225, + "step": 10428, + "time_per_iteration": 2.750072717666626 + }, + { + "auxiliary_loss_clip": 0.01070825, + "auxiliary_loss_mlp": 0.01048713, + "balance_loss_clip": 1.03721058, + "balance_loss_mlp": 1.03428292, + "epoch": 0.6270254020742522, + "flos": 11765413877760.0, + "grad_norm": 1.948024958379765, + "language_loss": 0.71860063, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.73979598, + "num_input_tokens_seen": 224632365, + "step": 10429, + "time_per_iteration": 2.746628761291504 + }, + { + "auxiliary_loss_clip": 0.01109377, + "auxiliary_loss_mlp": 0.01032535, + "balance_loss_clip": 1.04220653, + "balance_loss_mlp": 1.01867652, + "epoch": 0.6270855253269202, + "flos": 23475631530240.0, + "grad_norm": 1.6097875593140534, + "language_loss": 0.79522586, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.81664503, + "num_input_tokens_seen": 224651125, + "step": 10430, + "time_per_iteration": 2.7708442211151123 + }, + { + "auxiliary_loss_clip": 0.01033801, + "auxiliary_loss_mlp": 0.01002127, + "balance_loss_clip": 1.01011229, + "balance_loss_mlp": 1.00103593, + "epoch": 0.6271456485795882, + "flos": 70064520232320.0, + "grad_norm": 1.3411395578732954, + "language_loss": 0.59105575, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.61141503, + "num_input_tokens_seen": 224716115, + "step": 10431, + "time_per_iteration": 3.284141778945923 + }, + { + "auxiliary_loss_clip": 0.01016087, + "auxiliary_loss_mlp": 0.01003696, + "balance_loss_clip": 1.01267934, + "balance_loss_mlp": 1.00251579, + "epoch": 0.6272057718322561, + "flos": 65156718280320.0, + "grad_norm": 0.8756941222650257, + "language_loss": 0.63814843, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.65834618, + "num_input_tokens_seen": 224782930, + "step": 10432, + "time_per_iteration": 3.315559148788452 + }, + { + "auxiliary_loss_clip": 0.0109102, + "auxiliary_loss_mlp": 0.01033306, + "balance_loss_clip": 1.03992772, + "balance_loss_mlp": 1.02161813, + "epoch": 0.6272658950849241, + "flos": 24389342720640.0, + "grad_norm": 1.881228339897183, + "language_loss": 0.64901084, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.67025411, + "num_input_tokens_seen": 224802010, + "step": 10433, + "time_per_iteration": 2.7182137966156006 + }, + { + "auxiliary_loss_clip": 0.01108511, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.04193711, + "balance_loss_mlp": 1.01966476, + "epoch": 0.627326018337592, + "flos": 17966001202560.0, + "grad_norm": 2.029162826422426, + "language_loss": 0.61656857, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.63798386, + "num_input_tokens_seen": 224818875, + "step": 10434, + "time_per_iteration": 2.698272228240967 + }, + { + "auxiliary_loss_clip": 0.0107895, + "auxiliary_loss_mlp": 0.01026455, + "balance_loss_clip": 1.03706336, + "balance_loss_mlp": 1.01392627, + "epoch": 0.6273861415902601, + "flos": 20230097224320.0, + "grad_norm": 1.7060876035395582, + "language_loss": 0.84624016, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.86729419, + "num_input_tokens_seen": 224837790, + "step": 10435, + "time_per_iteration": 2.7053635120391846 + }, + { + "auxiliary_loss_clip": 0.01033575, + "auxiliary_loss_mlp": 0.01005985, + "balance_loss_clip": 1.00981998, + "balance_loss_mlp": 1.00484645, + "epoch": 0.627446264842928, + "flos": 64953210798720.0, + "grad_norm": 0.7308695189229724, + "language_loss": 0.61571616, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.63611174, + "num_input_tokens_seen": 224899685, + "step": 10436, + "time_per_iteration": 3.1732895374298096 + }, + { + "auxiliary_loss_clip": 0.01099296, + "auxiliary_loss_mlp": 0.01040375, + "balance_loss_clip": 1.04577446, + "balance_loss_mlp": 1.02651119, + "epoch": 0.627506388095596, + "flos": 23584261236480.0, + "grad_norm": 1.4615085745823022, + "language_loss": 0.77539217, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.79678893, + "num_input_tokens_seen": 224918650, + "step": 10437, + "time_per_iteration": 2.8112289905548096 + }, + { + "auxiliary_loss_clip": 0.0102524, + "auxiliary_loss_mlp": 0.01007069, + "balance_loss_clip": 1.01128411, + "balance_loss_mlp": 1.00595462, + "epoch": 0.6275665113482639, + "flos": 67583631674880.0, + "grad_norm": 0.7245410806399479, + "language_loss": 0.54275799, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.56308109, + "num_input_tokens_seen": 224981575, + "step": 10438, + "time_per_iteration": 3.1365692615509033 + }, + { + "auxiliary_loss_clip": 0.01063228, + "auxiliary_loss_mlp": 0.01041641, + "balance_loss_clip": 1.03674674, + "balance_loss_mlp": 1.02898097, + "epoch": 0.6276266346009319, + "flos": 27636924101760.0, + "grad_norm": 1.7255538562739963, + "language_loss": 0.84122932, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.86227804, + "num_input_tokens_seen": 225000820, + "step": 10439, + "time_per_iteration": 2.909126043319702 + }, + { + "auxiliary_loss_clip": 0.01077398, + "auxiliary_loss_mlp": 0.01044046, + "balance_loss_clip": 1.04187262, + "balance_loss_mlp": 1.03006864, + "epoch": 0.6276867578535998, + "flos": 22746142218240.0, + "grad_norm": 2.0752652164499783, + "language_loss": 0.80063027, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.8218447, + "num_input_tokens_seen": 225017585, + "step": 10440, + "time_per_iteration": 2.7793238162994385 + }, + { + "auxiliary_loss_clip": 0.01059905, + "auxiliary_loss_mlp": 0.01030462, + "balance_loss_clip": 1.03476882, + "balance_loss_mlp": 1.01888728, + "epoch": 0.6277468811062679, + "flos": 24644200694400.0, + "grad_norm": 1.357982638723412, + "language_loss": 0.74566025, + "learning_rate": 1.285694725799337e-06, + "loss": 0.76656389, + "num_input_tokens_seen": 225039085, + "step": 10441, + "time_per_iteration": 2.9267096519470215 + }, + { + "auxiliary_loss_clip": 0.01095865, + "auxiliary_loss_mlp": 0.0103067, + "balance_loss_clip": 1.03701901, + "balance_loss_mlp": 1.01759267, + "epoch": 0.6278070043589358, + "flos": 19678975873920.0, + "grad_norm": 2.0708219033723316, + "language_loss": 0.72098005, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.74224538, + "num_input_tokens_seen": 225058105, + "step": 10442, + "time_per_iteration": 2.6998653411865234 + }, + { + "auxiliary_loss_clip": 0.01081918, + "auxiliary_loss_mlp": 0.01030205, + "balance_loss_clip": 1.03865194, + "balance_loss_mlp": 1.01750898, + "epoch": 0.6278671276116038, + "flos": 22121834906880.0, + "grad_norm": 1.6030154795021492, + "language_loss": 0.7134285, + "learning_rate": 1.284967229712762e-06, + "loss": 0.73454976, + "num_input_tokens_seen": 225077605, + "step": 10443, + "time_per_iteration": 2.8322415351867676 + }, + { + "auxiliary_loss_clip": 0.0111667, + "auxiliary_loss_mlp": 0.01031963, + "balance_loss_clip": 1.04252923, + "balance_loss_mlp": 1.01954722, + "epoch": 0.6279272508642717, + "flos": 23038562839680.0, + "grad_norm": 2.1504215551644523, + "language_loss": 0.73254573, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.75403202, + "num_input_tokens_seen": 225097775, + "step": 10444, + "time_per_iteration": 2.6936285495758057 + }, + { + "auxiliary_loss_clip": 0.01085082, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.04689062, + "balance_loss_mlp": 1.01936126, + "epoch": 0.6279873741169397, + "flos": 19824090819840.0, + "grad_norm": 2.0098765769795697, + "language_loss": 0.724576, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.74574882, + "num_input_tokens_seen": 225115585, + "step": 10445, + "time_per_iteration": 2.7513034343719482 + }, + { + "auxiliary_loss_clip": 0.01101735, + "auxiliary_loss_mlp": 0.01029744, + "balance_loss_clip": 1.03916216, + "balance_loss_mlp": 1.0170486, + "epoch": 0.6280474973696077, + "flos": 23915393740800.0, + "grad_norm": 1.5354377153299141, + "language_loss": 0.692366, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.71368074, + "num_input_tokens_seen": 225135575, + "step": 10446, + "time_per_iteration": 2.7197511196136475 + }, + { + "auxiliary_loss_clip": 0.01075612, + "auxiliary_loss_mlp": 0.0103331, + "balance_loss_clip": 1.03858328, + "balance_loss_mlp": 1.01901674, + "epoch": 0.6281076206222757, + "flos": 17967976450560.0, + "grad_norm": 2.0624649000071638, + "language_loss": 0.73082191, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.75191116, + "num_input_tokens_seen": 225154230, + "step": 10447, + "time_per_iteration": 2.8416759967803955 + }, + { + "auxiliary_loss_clip": 0.01024228, + "auxiliary_loss_mlp": 0.00999654, + "balance_loss_clip": 1.00985765, + "balance_loss_mlp": 0.99855727, + "epoch": 0.6281677438749437, + "flos": 66778370622720.0, + "grad_norm": 0.6739953142314802, + "language_loss": 0.52296638, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.54320526, + "num_input_tokens_seen": 225213650, + "step": 10448, + "time_per_iteration": 5.136569976806641 + }, + { + "auxiliary_loss_clip": 0.01089733, + "auxiliary_loss_mlp": 0.01050472, + "balance_loss_clip": 1.0385865, + "balance_loss_mlp": 1.03579164, + "epoch": 0.6282278671276116, + "flos": 11656173640320.0, + "grad_norm": 2.2865528324647744, + "language_loss": 0.91361725, + "learning_rate": 1.282785392633079e-06, + "loss": 0.93501937, + "num_input_tokens_seen": 225230135, + "step": 10449, + "time_per_iteration": 2.7638633251190186 + }, + { + "auxiliary_loss_clip": 0.01112884, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.03918815, + "balance_loss_mlp": 1.02023697, + "epoch": 0.6282879903802796, + "flos": 42741597847680.0, + "grad_norm": 1.5879286033336677, + "language_loss": 0.60231853, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.6237675, + "num_input_tokens_seen": 225253520, + "step": 10450, + "time_per_iteration": 4.464092493057251 + }, + { + "auxiliary_loss_clip": 0.01089139, + "auxiliary_loss_mlp": 0.01032278, + "balance_loss_clip": 1.04133666, + "balance_loss_mlp": 1.01986873, + "epoch": 0.6283481136329475, + "flos": 20009210538240.0, + "grad_norm": 1.522481037470791, + "language_loss": 0.76846904, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.78968322, + "num_input_tokens_seen": 225272460, + "step": 10451, + "time_per_iteration": 4.40496563911438 + }, + { + "auxiliary_loss_clip": 0.01090661, + "auxiliary_loss_mlp": 0.01030764, + "balance_loss_clip": 1.03676105, + "balance_loss_mlp": 1.01773453, + "epoch": 0.6284082368856155, + "flos": 21904431840000.0, + "grad_norm": 1.614739235308552, + "language_loss": 0.77571416, + "learning_rate": 1.281694841064566e-06, + "loss": 0.79692847, + "num_input_tokens_seen": 225291700, + "step": 10452, + "time_per_iteration": 2.7239017486572266 + }, + { + "auxiliary_loss_clip": 0.01088221, + "auxiliary_loss_mlp": 0.01034824, + "balance_loss_clip": 1.04302955, + "balance_loss_mlp": 1.02150226, + "epoch": 0.6284683601382834, + "flos": 25484187219840.0, + "grad_norm": 1.7878849951641813, + "language_loss": 0.72469395, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.74592441, + "num_input_tokens_seen": 225311470, + "step": 10453, + "time_per_iteration": 2.9393930435180664 + }, + { + "auxiliary_loss_clip": 0.01053587, + "auxiliary_loss_mlp": 0.01040763, + "balance_loss_clip": 1.03172648, + "balance_loss_mlp": 1.02527809, + "epoch": 0.6285284833909515, + "flos": 16538695395840.0, + "grad_norm": 1.709886822132608, + "language_loss": 0.80723816, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.82818168, + "num_input_tokens_seen": 225328385, + "step": 10454, + "time_per_iteration": 2.8191676139831543 + }, + { + "auxiliary_loss_clip": 0.01086328, + "auxiliary_loss_mlp": 0.01036988, + "balance_loss_clip": 1.04401016, + "balance_loss_mlp": 1.02476287, + "epoch": 0.6285886066436194, + "flos": 22820692896000.0, + "grad_norm": 1.9883426544542775, + "language_loss": 0.82205665, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.84328985, + "num_input_tokens_seen": 225348415, + "step": 10455, + "time_per_iteration": 2.778773784637451 + }, + { + "auxiliary_loss_clip": 0.01066143, + "auxiliary_loss_mlp": 0.00771548, + "balance_loss_clip": 1.03564739, + "balance_loss_mlp": 1.00019312, + "epoch": 0.6286487298962874, + "flos": 24715734629760.0, + "grad_norm": 1.5354473458638056, + "language_loss": 0.81757617, + "learning_rate": 1.280241153705706e-06, + "loss": 0.83595306, + "num_input_tokens_seen": 225367740, + "step": 10456, + "time_per_iteration": 4.4299633502960205 + }, + { + "auxiliary_loss_clip": 0.0108958, + "auxiliary_loss_mlp": 0.01031242, + "balance_loss_clip": 1.04148746, + "balance_loss_mlp": 1.01731229, + "epoch": 0.6287088531489553, + "flos": 20740818752640.0, + "grad_norm": 1.6813486630133685, + "language_loss": 0.71938455, + "learning_rate": 1.27987780006486e-06, + "loss": 0.74059272, + "num_input_tokens_seen": 225388405, + "step": 10457, + "time_per_iteration": 2.7010886669158936 + }, + { + "auxiliary_loss_clip": 0.0110824, + "auxiliary_loss_mlp": 0.01035135, + "balance_loss_clip": 1.03882265, + "balance_loss_mlp": 1.02124166, + "epoch": 0.6287689764016233, + "flos": 23070630706560.0, + "grad_norm": 1.8855739678870833, + "language_loss": 0.79754472, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.81897843, + "num_input_tokens_seen": 225408360, + "step": 10458, + "time_per_iteration": 2.826195478439331 + }, + { + "auxiliary_loss_clip": 0.01110415, + "auxiliary_loss_mlp": 0.01033556, + "balance_loss_clip": 1.0434413, + "balance_loss_mlp": 1.02032971, + "epoch": 0.6288290996542913, + "flos": 32233669251840.0, + "grad_norm": 1.613153759988395, + "language_loss": 0.61056519, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.63200486, + "num_input_tokens_seen": 225431310, + "step": 10459, + "time_per_iteration": 2.8198750019073486 + }, + { + "auxiliary_loss_clip": 0.01090967, + "auxiliary_loss_mlp": 0.0103353, + "balance_loss_clip": 1.03930306, + "balance_loss_mlp": 1.02109075, + "epoch": 0.6288892229069593, + "flos": 24641327606400.0, + "grad_norm": 1.6884168463635612, + "language_loss": 0.78966278, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.81090778, + "num_input_tokens_seen": 225450385, + "step": 10460, + "time_per_iteration": 2.8095743656158447 + }, + { + "auxiliary_loss_clip": 0.01074125, + "auxiliary_loss_mlp": 0.01031631, + "balance_loss_clip": 1.0369761, + "balance_loss_mlp": 1.01822627, + "epoch": 0.6289493461596273, + "flos": 17858341163520.0, + "grad_norm": 1.6519482013468527, + "language_loss": 0.73814094, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.75919855, + "num_input_tokens_seen": 225467325, + "step": 10461, + "time_per_iteration": 2.754106044769287 + }, + { + "auxiliary_loss_clip": 0.01093245, + "auxiliary_loss_mlp": 0.01040397, + "balance_loss_clip": 1.03983331, + "balance_loss_mlp": 1.02764726, + "epoch": 0.6290094694122952, + "flos": 22345379199360.0, + "grad_norm": 1.7440118950274472, + "language_loss": 0.69962513, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.72096151, + "num_input_tokens_seen": 225487370, + "step": 10462, + "time_per_iteration": 2.721280574798584 + }, + { + "auxiliary_loss_clip": 0.01109582, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.04013419, + "balance_loss_mlp": 1.01948082, + "epoch": 0.6290695926649632, + "flos": 28402431776640.0, + "grad_norm": 2.4371122708038846, + "language_loss": 0.7249735, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.74637896, + "num_input_tokens_seen": 225506915, + "step": 10463, + "time_per_iteration": 2.7322490215301514 + }, + { + "auxiliary_loss_clip": 0.01094633, + "auxiliary_loss_mlp": 0.0104, + "balance_loss_clip": 1.04333925, + "balance_loss_mlp": 1.02713692, + "epoch": 0.6291297159176311, + "flos": 21505464501120.0, + "grad_norm": 1.7167597419504528, + "language_loss": 0.72533494, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.74668121, + "num_input_tokens_seen": 225525670, + "step": 10464, + "time_per_iteration": 2.7556610107421875 + }, + { + "auxiliary_loss_clip": 0.01086904, + "auxiliary_loss_mlp": 0.01034283, + "balance_loss_clip": 1.04166722, + "balance_loss_mlp": 1.02168906, + "epoch": 0.6291898391702991, + "flos": 12203308581120.0, + "grad_norm": 1.750105989459617, + "language_loss": 0.69012117, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.71133304, + "num_input_tokens_seen": 225542235, + "step": 10465, + "time_per_iteration": 2.6720523834228516 + }, + { + "auxiliary_loss_clip": 0.01026598, + "auxiliary_loss_mlp": 0.01001492, + "balance_loss_clip": 1.0124836, + "balance_loss_mlp": 1.00023413, + "epoch": 0.629249962422967, + "flos": 69299479434240.0, + "grad_norm": 0.6784608705879751, + "language_loss": 0.59741104, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.61769187, + "num_input_tokens_seen": 225607185, + "step": 10466, + "time_per_iteration": 3.353839635848999 + }, + { + "auxiliary_loss_clip": 0.01073177, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.03545153, + "balance_loss_mlp": 1.02020311, + "epoch": 0.6293100856756351, + "flos": 40077888042240.0, + "grad_norm": 1.835286938356158, + "language_loss": 0.64667165, + "learning_rate": 1.276245767820154e-06, + "loss": 0.66772521, + "num_input_tokens_seen": 225628785, + "step": 10467, + "time_per_iteration": 2.921297550201416 + }, + { + "auxiliary_loss_clip": 0.01014455, + "auxiliary_loss_mlp": 0.01000173, + "balance_loss_clip": 1.01132929, + "balance_loss_mlp": 0.9989695, + "epoch": 0.629370208928303, + "flos": 67501108177920.0, + "grad_norm": 0.7915302961276658, + "language_loss": 0.56811368, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.58825994, + "num_input_tokens_seen": 225678980, + "step": 10468, + "time_per_iteration": 3.01094126701355 + }, + { + "auxiliary_loss_clip": 0.00999481, + "auxiliary_loss_mlp": 0.00999518, + "balance_loss_clip": 1.01559901, + "balance_loss_mlp": 0.9980635, + "epoch": 0.629430332180971, + "flos": 60660450449280.0, + "grad_norm": 0.7367622716998392, + "language_loss": 0.57934558, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.59933555, + "num_input_tokens_seen": 225740295, + "step": 10469, + "time_per_iteration": 3.254342555999756 + }, + { + "auxiliary_loss_clip": 0.01032056, + "auxiliary_loss_mlp": 0.01005271, + "balance_loss_clip": 1.02417684, + "balance_loss_mlp": 1.00394154, + "epoch": 0.6294904554336389, + "flos": 66869764778880.0, + "grad_norm": 0.6802993920043705, + "language_loss": 0.5213244, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.54169762, + "num_input_tokens_seen": 225805615, + "step": 10470, + "time_per_iteration": 3.2833499908447266 + }, + { + "auxiliary_loss_clip": 0.01099474, + "auxiliary_loss_mlp": 0.0103723, + "balance_loss_clip": 1.03933227, + "balance_loss_mlp": 1.02434301, + "epoch": 0.6295505786863069, + "flos": 42522794150400.0, + "grad_norm": 1.6833251005433751, + "language_loss": 0.7409395, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.76230645, + "num_input_tokens_seen": 225826585, + "step": 10471, + "time_per_iteration": 2.839749574661255 + }, + { + "auxiliary_loss_clip": 0.0108924, + "auxiliary_loss_mlp": 0.0103139, + "balance_loss_clip": 1.0421524, + "balance_loss_mlp": 1.01881981, + "epoch": 0.629610701938975, + "flos": 17384140788480.0, + "grad_norm": 1.8072062146815357, + "language_loss": 0.63223195, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.65343827, + "num_input_tokens_seen": 225844095, + "step": 10472, + "time_per_iteration": 2.72947359085083 + }, + { + "auxiliary_loss_clip": 0.01121891, + "auxiliary_loss_mlp": 0.01039125, + "balance_loss_clip": 1.04511738, + "balance_loss_mlp": 1.02616739, + "epoch": 0.6296708251916429, + "flos": 24242934885120.0, + "grad_norm": 1.6320866537592498, + "language_loss": 0.69356817, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.71517837, + "num_input_tokens_seen": 225864310, + "step": 10473, + "time_per_iteration": 2.68420672416687 + }, + { + "auxiliary_loss_clip": 0.01090218, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.03732657, + "balance_loss_mlp": 1.02030838, + "epoch": 0.6297309484443109, + "flos": 19278536077440.0, + "grad_norm": 1.63494515725041, + "language_loss": 0.7420494, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.7632792, + "num_input_tokens_seen": 225883830, + "step": 10474, + "time_per_iteration": 2.7413995265960693 + }, + { + "auxiliary_loss_clip": 0.01090194, + "auxiliary_loss_mlp": 0.00769939, + "balance_loss_clip": 1.03743196, + "balance_loss_mlp": 1.0001384, + "epoch": 0.6297910716969788, + "flos": 30662685043200.0, + "grad_norm": 1.4351205807606953, + "language_loss": 0.66564953, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.68425083, + "num_input_tokens_seen": 225905755, + "step": 10475, + "time_per_iteration": 2.7660322189331055 + }, + { + "auxiliary_loss_clip": 0.0106541, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.03863168, + "balance_loss_mlp": 1.01878738, + "epoch": 0.6298511949496468, + "flos": 14423018371200.0, + "grad_norm": 1.9836644906797416, + "language_loss": 0.9036352, + "learning_rate": 1.272979284940101e-06, + "loss": 0.92459542, + "num_input_tokens_seen": 225922155, + "step": 10476, + "time_per_iteration": 2.758232593536377 + }, + { + "auxiliary_loss_clip": 0.01114316, + "auxiliary_loss_mlp": 0.01035706, + "balance_loss_clip": 1.04105282, + "balance_loss_mlp": 1.02374947, + "epoch": 0.6299113182023147, + "flos": 23514163845120.0, + "grad_norm": 5.4120485720423055, + "language_loss": 0.75543785, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.77693808, + "num_input_tokens_seen": 225941060, + "step": 10477, + "time_per_iteration": 2.689332962036133 + }, + { + "auxiliary_loss_clip": 0.01100017, + "auxiliary_loss_mlp": 0.01035097, + "balance_loss_clip": 1.03945518, + "balance_loss_mlp": 1.02181101, + "epoch": 0.6299714414549827, + "flos": 22674500542080.0, + "grad_norm": 1.792423931833335, + "language_loss": 0.70299745, + "learning_rate": 1.272253702758138e-06, + "loss": 0.7243486, + "num_input_tokens_seen": 225960870, + "step": 10478, + "time_per_iteration": 2.641702651977539 + }, + { + "auxiliary_loss_clip": 0.011102, + "auxiliary_loss_mlp": 0.01032825, + "balance_loss_clip": 1.04167068, + "balance_loss_mlp": 1.01943791, + "epoch": 0.6300315647076506, + "flos": 14501735026560.0, + "grad_norm": 2.1836774795585012, + "language_loss": 0.66761291, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.68904316, + "num_input_tokens_seen": 225977895, + "step": 10479, + "time_per_iteration": 2.6688246726989746 + }, + { + "auxiliary_loss_clip": 0.01090005, + "auxiliary_loss_mlp": 0.0077118, + "balance_loss_clip": 1.03907907, + "balance_loss_mlp": 1.0001682, + "epoch": 0.6300916879603187, + "flos": 21871681614720.0, + "grad_norm": 2.512846896597075, + "language_loss": 0.73645091, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.7550627, + "num_input_tokens_seen": 225997835, + "step": 10480, + "time_per_iteration": 2.7305657863616943 + }, + { + "auxiliary_loss_clip": 0.011053, + "auxiliary_loss_mlp": 0.01035523, + "balance_loss_clip": 1.04060471, + "balance_loss_mlp": 1.02189767, + "epoch": 0.6301518112129866, + "flos": 21834047139840.0, + "grad_norm": 1.8722485238317301, + "language_loss": 0.79015726, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.81156552, + "num_input_tokens_seen": 226017620, + "step": 10481, + "time_per_iteration": 2.687849283218384 + }, + { + "auxiliary_loss_clip": 0.01021696, + "auxiliary_loss_mlp": 0.01011899, + "balance_loss_clip": 1.01580834, + "balance_loss_mlp": 1.01079035, + "epoch": 0.6302119344656546, + "flos": 44334237957120.0, + "grad_norm": 0.8976146461078123, + "language_loss": 0.61833119, + "learning_rate": 1.2708028696588e-06, + "loss": 0.63866711, + "num_input_tokens_seen": 226068755, + "step": 10482, + "time_per_iteration": 3.008683681488037 + }, + { + "auxiliary_loss_clip": 0.01109585, + "auxiliary_loss_mlp": 0.01034347, + "balance_loss_clip": 1.04106355, + "balance_loss_mlp": 1.02004182, + "epoch": 0.6302720577183225, + "flos": 11217919800960.0, + "grad_norm": 2.2108979789482635, + "language_loss": 0.8277266, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.84916592, + "num_input_tokens_seen": 226084395, + "step": 10483, + "time_per_iteration": 2.623480796813965 + }, + { + "auxiliary_loss_clip": 0.01094195, + "auxiliary_loss_mlp": 0.01042488, + "balance_loss_clip": 1.03946197, + "balance_loss_mlp": 1.03022778, + "epoch": 0.6303321809709905, + "flos": 27964932122880.0, + "grad_norm": 1.5219185358756147, + "language_loss": 0.72691327, + "learning_rate": 1.270077618961487e-06, + "loss": 0.74828005, + "num_input_tokens_seen": 226105890, + "step": 10484, + "time_per_iteration": 2.7577946186065674 + }, + { + "auxiliary_loss_clip": 0.0108643, + "auxiliary_loss_mlp": 0.01033065, + "balance_loss_clip": 1.04017258, + "balance_loss_mlp": 1.01970792, + "epoch": 0.6303923042236586, + "flos": 28220759763840.0, + "grad_norm": 2.7543040419083606, + "language_loss": 0.74625325, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.76744819, + "num_input_tokens_seen": 226126760, + "step": 10485, + "time_per_iteration": 2.8124029636383057 + }, + { + "auxiliary_loss_clip": 0.01093712, + "auxiliary_loss_mlp": 0.00771476, + "balance_loss_clip": 1.04156017, + "balance_loss_mlp": 1.00019419, + "epoch": 0.6304524274763265, + "flos": 27631034271360.0, + "grad_norm": 1.7508926529215563, + "language_loss": 0.81359017, + "learning_rate": 1.269352478979093e-06, + "loss": 0.83224207, + "num_input_tokens_seen": 226147315, + "step": 10486, + "time_per_iteration": 2.8222594261169434 + }, + { + "auxiliary_loss_clip": 0.0109264, + "auxiliary_loss_mlp": 0.01040277, + "balance_loss_clip": 1.04081047, + "balance_loss_mlp": 1.02773643, + "epoch": 0.6305125507289945, + "flos": 17311313963520.0, + "grad_norm": 1.7524407832841304, + "language_loss": 0.63269603, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.6540252, + "num_input_tokens_seen": 226165935, + "step": 10487, + "time_per_iteration": 2.629199743270874 + }, + { + "auxiliary_loss_clip": 0.01116472, + "auxiliary_loss_mlp": 0.01040106, + "balance_loss_clip": 1.04161322, + "balance_loss_mlp": 1.0270344, + "epoch": 0.6305726739816624, + "flos": 25808280658560.0, + "grad_norm": 1.6120412913951392, + "language_loss": 0.66997957, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.69154537, + "num_input_tokens_seen": 226186890, + "step": 10488, + "time_per_iteration": 4.3398730754852295 + }, + { + "auxiliary_loss_clip": 0.01096551, + "auxiliary_loss_mlp": 0.01032615, + "balance_loss_clip": 1.04035902, + "balance_loss_mlp": 1.02013993, + "epoch": 0.6306327972343304, + "flos": 21797454159360.0, + "grad_norm": 1.6559636367213997, + "language_loss": 0.67318177, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.69447345, + "num_input_tokens_seen": 226206710, + "step": 10489, + "time_per_iteration": 4.3245344161987305 + }, + { + "auxiliary_loss_clip": 0.01079741, + "auxiliary_loss_mlp": 0.01044256, + "balance_loss_clip": 1.03847003, + "balance_loss_mlp": 1.02838886, + "epoch": 0.6306929204869983, + "flos": 20777375819520.0, + "grad_norm": 1.8294067402999528, + "language_loss": 0.6980201, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.7192601, + "num_input_tokens_seen": 226225565, + "step": 10490, + "time_per_iteration": 2.7364768981933594 + }, + { + "auxiliary_loss_clip": 0.0109348, + "auxiliary_loss_mlp": 0.01037174, + "balance_loss_clip": 1.03807712, + "balance_loss_mlp": 1.02385783, + "epoch": 0.6307530437396663, + "flos": 23654214973440.0, + "grad_norm": 3.3228808138384545, + "language_loss": 0.78209651, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.80340308, + "num_input_tokens_seen": 226243680, + "step": 10491, + "time_per_iteration": 4.192841053009033 + }, + { + "auxiliary_loss_clip": 0.01089569, + "auxiliary_loss_mlp": 0.01036924, + "balance_loss_clip": 1.03836989, + "balance_loss_mlp": 1.02435327, + "epoch": 0.6308131669923343, + "flos": 24719002767360.0, + "grad_norm": 2.408793546436542, + "language_loss": 0.55951095, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.58077586, + "num_input_tokens_seen": 226264345, + "step": 10492, + "time_per_iteration": 2.7634830474853516 + }, + { + "auxiliary_loss_clip": 0.01118182, + "auxiliary_loss_mlp": 0.01040842, + "balance_loss_clip": 1.04113233, + "balance_loss_mlp": 1.026793, + "epoch": 0.6308732902450023, + "flos": 22565403959040.0, + "grad_norm": 1.8001504389218699, + "language_loss": 0.64376915, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.66535938, + "num_input_tokens_seen": 226283165, + "step": 10493, + "time_per_iteration": 2.617398977279663 + }, + { + "auxiliary_loss_clip": 0.01079208, + "auxiliary_loss_mlp": 0.01031715, + "balance_loss_clip": 1.03931165, + "balance_loss_mlp": 1.01834536, + "epoch": 0.6309334134976702, + "flos": 24644200694400.0, + "grad_norm": 1.3815551057795799, + "language_loss": 0.82869065, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.84979987, + "num_input_tokens_seen": 226304080, + "step": 10494, + "time_per_iteration": 2.9209089279174805 + }, + { + "auxiliary_loss_clip": 0.01102712, + "auxiliary_loss_mlp": 0.01035887, + "balance_loss_clip": 1.04531574, + "balance_loss_mlp": 1.02259517, + "epoch": 0.6309935367503382, + "flos": 41427949651200.0, + "grad_norm": 1.8103540070682869, + "language_loss": 0.79647011, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.81785613, + "num_input_tokens_seen": 226325925, + "step": 10495, + "time_per_iteration": 2.913984775543213 + }, + { + "auxiliary_loss_clip": 0.0108712, + "auxiliary_loss_mlp": 0.01035788, + "balance_loss_clip": 1.03742623, + "balance_loss_mlp": 1.02182817, + "epoch": 0.6310536600030061, + "flos": 15118931445120.0, + "grad_norm": 1.9837558740535257, + "language_loss": 0.70338362, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.72461271, + "num_input_tokens_seen": 226344190, + "step": 10496, + "time_per_iteration": 4.195697546005249 + }, + { + "auxiliary_loss_clip": 0.01097081, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.04069757, + "balance_loss_mlp": 1.02359533, + "epoch": 0.6311137832556741, + "flos": 15231619388160.0, + "grad_norm": 2.0479454703454616, + "language_loss": 0.79674435, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.81808245, + "num_input_tokens_seen": 226361520, + "step": 10497, + "time_per_iteration": 2.7244081497192383 + }, + { + "auxiliary_loss_clip": 0.01080809, + "auxiliary_loss_mlp": 0.01033562, + "balance_loss_clip": 1.03673339, + "balance_loss_mlp": 1.02134275, + "epoch": 0.6311739065083422, + "flos": 22018664067840.0, + "grad_norm": 1.9007003272679206, + "language_loss": 0.73755234, + "learning_rate": 1.265003970256247e-06, + "loss": 0.75869608, + "num_input_tokens_seen": 226381920, + "step": 10498, + "time_per_iteration": 2.702826976776123 + }, + { + "auxiliary_loss_clip": 0.01106258, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.03932881, + "balance_loss_mlp": 1.02077663, + "epoch": 0.6312340297610101, + "flos": 22710770300160.0, + "grad_norm": 2.137540621016438, + "language_loss": 0.70001101, + "learning_rate": 1.264641775364217e-06, + "loss": 0.72141325, + "num_input_tokens_seen": 226400035, + "step": 10499, + "time_per_iteration": 2.6359314918518066 + }, + { + "auxiliary_loss_clip": 0.01105058, + "auxiliary_loss_mlp": 0.0104466, + "balance_loss_clip": 1.04247713, + "balance_loss_mlp": 1.03126705, + "epoch": 0.6312941530136781, + "flos": 24280102483200.0, + "grad_norm": 1.7496076109467864, + "language_loss": 0.69836605, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.7198633, + "num_input_tokens_seen": 226418280, + "step": 10500, + "time_per_iteration": 2.6434264183044434 + }, + { + "auxiliary_loss_clip": 0.01117728, + "auxiliary_loss_mlp": 0.01037176, + "balance_loss_clip": 1.04233432, + "balance_loss_mlp": 1.02433133, + "epoch": 0.631354276266346, + "flos": 21725956137600.0, + "grad_norm": 1.767641766149829, + "language_loss": 0.74439371, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.76594275, + "num_input_tokens_seen": 226436650, + "step": 10501, + "time_per_iteration": 2.6442511081695557 + }, + { + "auxiliary_loss_clip": 0.01104233, + "auxiliary_loss_mlp": 0.00770378, + "balance_loss_clip": 1.04097271, + "balance_loss_mlp": 1.00013256, + "epoch": 0.631414399519014, + "flos": 24025100855040.0, + "grad_norm": 2.125617189575791, + "language_loss": 0.75111711, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.76986325, + "num_input_tokens_seen": 226456275, + "step": 10502, + "time_per_iteration": 2.6732592582702637 + }, + { + "auxiliary_loss_clip": 0.01108933, + "auxiliary_loss_mlp": 0.01052555, + "balance_loss_clip": 1.04151106, + "balance_loss_mlp": 1.03879273, + "epoch": 0.6314745227716819, + "flos": 24315797623680.0, + "grad_norm": 2.013663319345679, + "language_loss": 0.85323668, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.87485158, + "num_input_tokens_seen": 226473610, + "step": 10503, + "time_per_iteration": 2.7602460384368896 + }, + { + "auxiliary_loss_clip": 0.01084517, + "auxiliary_loss_mlp": 0.01034434, + "balance_loss_clip": 1.03906107, + "balance_loss_mlp": 1.02097487, + "epoch": 0.6315346460243499, + "flos": 23366391292800.0, + "grad_norm": 1.6896389995545142, + "language_loss": 0.86806571, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.88925523, + "num_input_tokens_seen": 226493665, + "step": 10504, + "time_per_iteration": 2.6560161113739014 + }, + { + "auxiliary_loss_clip": 0.0108443, + "auxiliary_loss_mlp": 0.0103934, + "balance_loss_clip": 1.0409503, + "balance_loss_mlp": 1.02557158, + "epoch": 0.6315947692770179, + "flos": 20260333497600.0, + "grad_norm": 1.5595011849504998, + "language_loss": 0.76756787, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.78880554, + "num_input_tokens_seen": 226511625, + "step": 10505, + "time_per_iteration": 2.7035913467407227 + }, + { + "auxiliary_loss_clip": 0.01073251, + "auxiliary_loss_mlp": 0.01035629, + "balance_loss_clip": 1.03666878, + "balance_loss_mlp": 1.02143097, + "epoch": 0.6316548925296859, + "flos": 25265850399360.0, + "grad_norm": 2.3166055953098774, + "language_loss": 0.81818491, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.83927369, + "num_input_tokens_seen": 226530085, + "step": 10506, + "time_per_iteration": 2.762647867202759 + }, + { + "auxiliary_loss_clip": 0.01118108, + "auxiliary_loss_mlp": 0.01035819, + "balance_loss_clip": 1.0422647, + "balance_loss_mlp": 1.02248573, + "epoch": 0.6317150157823538, + "flos": 22930579578240.0, + "grad_norm": 1.8490757285143165, + "language_loss": 0.74521178, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.76675105, + "num_input_tokens_seen": 226548115, + "step": 10507, + "time_per_iteration": 2.598595380783081 + }, + { + "auxiliary_loss_clip": 0.01094729, + "auxiliary_loss_mlp": 0.01038809, + "balance_loss_clip": 1.04198813, + "balance_loss_mlp": 1.02511764, + "epoch": 0.6317751390350218, + "flos": 22527051212160.0, + "grad_norm": 2.137138504509131, + "language_loss": 0.67884028, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.7001757, + "num_input_tokens_seen": 226567955, + "step": 10508, + "time_per_iteration": 2.6457536220550537 + }, + { + "auxiliary_loss_clip": 0.01081753, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.03684628, + "balance_loss_mlp": 1.02029264, + "epoch": 0.6318352622876897, + "flos": 23294749616640.0, + "grad_norm": 1.726891076070715, + "language_loss": 0.70810485, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.72925943, + "num_input_tokens_seen": 226588205, + "step": 10509, + "time_per_iteration": 2.7340633869171143 + }, + { + "auxiliary_loss_clip": 0.01100032, + "auxiliary_loss_mlp": 0.01030079, + "balance_loss_clip": 1.0408802, + "balance_loss_mlp": 1.01750255, + "epoch": 0.6318953855403577, + "flos": 20704082117760.0, + "grad_norm": 2.059347572016265, + "language_loss": 0.79585326, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.81715441, + "num_input_tokens_seen": 226606965, + "step": 10510, + "time_per_iteration": 2.7126991748809814 + }, + { + "auxiliary_loss_clip": 0.01073398, + "auxiliary_loss_mlp": 0.00771235, + "balance_loss_clip": 1.03949821, + "balance_loss_mlp": 1.00013995, + "epoch": 0.6319555087930258, + "flos": 22820046451200.0, + "grad_norm": 2.029248251908187, + "language_loss": 0.70844626, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.72689259, + "num_input_tokens_seen": 226627845, + "step": 10511, + "time_per_iteration": 2.862959384918213 + }, + { + "auxiliary_loss_clip": 0.01113995, + "auxiliary_loss_mlp": 0.01035402, + "balance_loss_clip": 1.04076004, + "balance_loss_mlp": 1.02298617, + "epoch": 0.6320156320456937, + "flos": 19970929618560.0, + "grad_norm": 1.5814642404723724, + "language_loss": 0.80147332, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.82296729, + "num_input_tokens_seen": 226645855, + "step": 10512, + "time_per_iteration": 2.599238872528076 + }, + { + "auxiliary_loss_clip": 0.01104767, + "auxiliary_loss_mlp": 0.01033707, + "balance_loss_clip": 1.04045844, + "balance_loss_mlp": 1.01971221, + "epoch": 0.6320757552983617, + "flos": 27013406889600.0, + "grad_norm": 2.290319172186619, + "language_loss": 0.70844841, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.72983325, + "num_input_tokens_seen": 226665375, + "step": 10513, + "time_per_iteration": 2.706372022628784 + }, + { + "auxiliary_loss_clip": 0.01107929, + "auxiliary_loss_mlp": 0.01034926, + "balance_loss_clip": 1.03973472, + "balance_loss_mlp": 1.02081192, + "epoch": 0.6321358785510296, + "flos": 23695943598720.0, + "grad_norm": 2.242079914271032, + "language_loss": 0.6665644, + "learning_rate": 1.259212205855459e-06, + "loss": 0.68799293, + "num_input_tokens_seen": 226685270, + "step": 10514, + "time_per_iteration": 2.6768577098846436 + }, + { + "auxiliary_loss_clip": 0.01080896, + "auxiliary_loss_mlp": 0.0103395, + "balance_loss_clip": 1.03646874, + "balance_loss_mlp": 1.02114093, + "epoch": 0.6321960018036976, + "flos": 25995231970560.0, + "grad_norm": 1.8993538704282873, + "language_loss": 0.74367702, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.76482546, + "num_input_tokens_seen": 226705325, + "step": 10515, + "time_per_iteration": 2.8709843158721924 + }, + { + "auxiliary_loss_clip": 0.01089992, + "auxiliary_loss_mlp": 0.01031214, + "balance_loss_clip": 1.04074252, + "balance_loss_mlp": 1.01873255, + "epoch": 0.6322561250563655, + "flos": 22821016118400.0, + "grad_norm": 1.7638160656735167, + "language_loss": 0.90024698, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.92145908, + "num_input_tokens_seen": 226723815, + "step": 10516, + "time_per_iteration": 2.691826343536377 + }, + { + "auxiliary_loss_clip": 0.0112538, + "auxiliary_loss_mlp": 0.01036003, + "balance_loss_clip": 1.04528499, + "balance_loss_mlp": 1.02075589, + "epoch": 0.6323162483090335, + "flos": 18988413926400.0, + "grad_norm": 1.6560830086526979, + "language_loss": 0.81829578, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.83990955, + "num_input_tokens_seen": 226741550, + "step": 10517, + "time_per_iteration": 2.620199203491211 + }, + { + "auxiliary_loss_clip": 0.01061827, + "auxiliary_loss_mlp": 0.01039321, + "balance_loss_clip": 1.03930223, + "balance_loss_mlp": 1.02642882, + "epoch": 0.6323763715617015, + "flos": 19865173000320.0, + "grad_norm": 1.7035542921935394, + "language_loss": 0.7784009, + "learning_rate": 1.257765386189541e-06, + "loss": 0.79941237, + "num_input_tokens_seen": 226761115, + "step": 10518, + "time_per_iteration": 2.91979718208313 + }, + { + "auxiliary_loss_clip": 0.01096755, + "auxiliary_loss_mlp": 0.0103349, + "balance_loss_clip": 1.03876209, + "balance_loss_mlp": 1.02090716, + "epoch": 0.6324364948143695, + "flos": 22782699285120.0, + "grad_norm": 1.44276453327461, + "language_loss": 0.85200572, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.87330812, + "num_input_tokens_seen": 226782225, + "step": 10519, + "time_per_iteration": 2.74233078956604 + }, + { + "auxiliary_loss_clip": 0.01088566, + "auxiliary_loss_mlp": 0.01039518, + "balance_loss_clip": 1.03878999, + "balance_loss_mlp": 1.02666724, + "epoch": 0.6324966180670374, + "flos": 22235923480320.0, + "grad_norm": 2.1806676145694692, + "language_loss": 0.71964407, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.74092495, + "num_input_tokens_seen": 226802375, + "step": 10520, + "time_per_iteration": 2.682180404663086 + }, + { + "auxiliary_loss_clip": 0.01103452, + "auxiliary_loss_mlp": 0.01035272, + "balance_loss_clip": 1.03956473, + "balance_loss_mlp": 1.02224886, + "epoch": 0.6325567413197054, + "flos": 21689183589120.0, + "grad_norm": 1.7702779314390575, + "language_loss": 0.71439731, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.73578453, + "num_input_tokens_seen": 226822165, + "step": 10521, + "time_per_iteration": 2.657323122024536 + }, + { + "auxiliary_loss_clip": 0.01076504, + "auxiliary_loss_mlp": 0.01041724, + "balance_loss_clip": 1.03893948, + "balance_loss_mlp": 1.0255115, + "epoch": 0.6326168645723733, + "flos": 19937137898880.0, + "grad_norm": 1.7329974565509776, + "language_loss": 0.721259, + "learning_rate": 1.256319016853377e-06, + "loss": 0.74244124, + "num_input_tokens_seen": 226841645, + "step": 10522, + "time_per_iteration": 2.746037721633911 + }, + { + "auxiliary_loss_clip": 0.01074288, + "auxiliary_loss_mlp": 0.01034292, + "balance_loss_clip": 1.04106843, + "balance_loss_mlp": 1.02167988, + "epoch": 0.6326769878250413, + "flos": 20230348619520.0, + "grad_norm": 1.8934714872441534, + "language_loss": 0.81941485, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.84050065, + "num_input_tokens_seen": 226860355, + "step": 10523, + "time_per_iteration": 2.761061906814575 + }, + { + "auxiliary_loss_clip": 0.01103759, + "auxiliary_loss_mlp": 0.01030499, + "balance_loss_clip": 1.03989744, + "balance_loss_mlp": 1.01712918, + "epoch": 0.6327371110777094, + "flos": 20775759707520.0, + "grad_norm": 2.3750030560810163, + "language_loss": 0.73983908, + "learning_rate": 1.255596001333195e-06, + "loss": 0.76118159, + "num_input_tokens_seen": 226878390, + "step": 10524, + "time_per_iteration": 2.677591323852539 + }, + { + "auxiliary_loss_clip": 0.01101897, + "auxiliary_loss_mlp": 0.01041422, + "balance_loss_clip": 1.04099619, + "balance_loss_mlp": 1.02719402, + "epoch": 0.6327972343303773, + "flos": 30336544529280.0, + "grad_norm": 1.9503552038514373, + "language_loss": 0.84243858, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.86387181, + "num_input_tokens_seen": 226898420, + "step": 10525, + "time_per_iteration": 2.7905821800231934 + }, + { + "auxiliary_loss_clip": 0.0108609, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_clip": 1.03651416, + "balance_loss_mlp": 1.01617217, + "epoch": 0.6328573575830453, + "flos": 17092258871040.0, + "grad_norm": 1.6646724041083503, + "language_loss": 0.6700424, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.6912058, + "num_input_tokens_seen": 226916305, + "step": 10526, + "time_per_iteration": 2.658766031265259 + }, + { + "auxiliary_loss_clip": 0.01111357, + "auxiliary_loss_mlp": 0.01036081, + "balance_loss_clip": 1.04416919, + "balance_loss_mlp": 1.02141845, + "epoch": 0.6329174808357132, + "flos": 25047154442880.0, + "grad_norm": 2.0355958158409346, + "language_loss": 0.73648405, + "learning_rate": 1.254511689796244e-06, + "loss": 0.75795841, + "num_input_tokens_seen": 226937705, + "step": 10527, + "time_per_iteration": 5.2298712730407715 + }, + { + "auxiliary_loss_clip": 0.01105368, + "auxiliary_loss_mlp": 0.01035127, + "balance_loss_clip": 1.04319382, + "balance_loss_mlp": 1.02256858, + "epoch": 0.6329776040883812, + "flos": 16836826279680.0, + "grad_norm": 2.5914253744426614, + "language_loss": 0.71704459, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.73844951, + "num_input_tokens_seen": 226954880, + "step": 10528, + "time_per_iteration": 2.6561360359191895 + }, + { + "auxiliary_loss_clip": 0.01104345, + "auxiliary_loss_mlp": 0.01031745, + "balance_loss_clip": 1.04158008, + "balance_loss_mlp": 1.01763082, + "epoch": 0.6330377273410491, + "flos": 13516705382400.0, + "grad_norm": 1.8004698597026916, + "language_loss": 0.66514266, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.68650359, + "num_input_tokens_seen": 226972595, + "step": 10529, + "time_per_iteration": 4.169236421585083 + }, + { + "auxiliary_loss_clip": 0.01109158, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.0410428, + "balance_loss_mlp": 1.01895428, + "epoch": 0.6330978505937171, + "flos": 21538825257600.0, + "grad_norm": 2.1634257180763545, + "language_loss": 0.75199169, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.77341741, + "num_input_tokens_seen": 226991910, + "step": 10530, + "time_per_iteration": 4.1243627071380615 + }, + { + "auxiliary_loss_clip": 0.0111004, + "auxiliary_loss_mlp": 0.00770904, + "balance_loss_clip": 1.04529655, + "balance_loss_mlp": 1.00030541, + "epoch": 0.6331579738463851, + "flos": 25009484054400.0, + "grad_norm": 1.5033967127528767, + "language_loss": 0.73765004, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.75645947, + "num_input_tokens_seen": 227010175, + "step": 10531, + "time_per_iteration": 2.757310152053833 + }, + { + "auxiliary_loss_clip": 0.010819, + "auxiliary_loss_mlp": 0.0103456, + "balance_loss_clip": 1.0428102, + "balance_loss_mlp": 1.02120292, + "epoch": 0.6332180970990531, + "flos": 14976007228800.0, + "grad_norm": 2.152892996011048, + "language_loss": 0.79560679, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.81677139, + "num_input_tokens_seen": 227025540, + "step": 10532, + "time_per_iteration": 2.693357229232788 + }, + { + "auxiliary_loss_clip": 0.01106096, + "auxiliary_loss_mlp": 0.01033111, + "balance_loss_clip": 1.04273748, + "balance_loss_mlp": 1.02105284, + "epoch": 0.633278220351721, + "flos": 22706963458560.0, + "grad_norm": 1.5569394240480623, + "language_loss": 0.74720097, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.76859295, + "num_input_tokens_seen": 227045520, + "step": 10533, + "time_per_iteration": 2.6261446475982666 + }, + { + "auxiliary_loss_clip": 0.01096787, + "auxiliary_loss_mlp": 0.01038782, + "balance_loss_clip": 1.04458022, + "balance_loss_mlp": 1.02473903, + "epoch": 0.633338343604389, + "flos": 12602922364800.0, + "grad_norm": 2.379717167307364, + "language_loss": 0.77104855, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.79240417, + "num_input_tokens_seen": 227059420, + "step": 10534, + "time_per_iteration": 2.6211531162261963 + }, + { + "auxiliary_loss_clip": 0.01080216, + "auxiliary_loss_mlp": 0.01043157, + "balance_loss_clip": 1.03751063, + "balance_loss_mlp": 1.02861977, + "epoch": 0.6333984668570569, + "flos": 25960111447680.0, + "grad_norm": 1.7545098866738538, + "language_loss": 0.86108071, + "learning_rate": 1.251621437204777e-06, + "loss": 0.88231444, + "num_input_tokens_seen": 227081310, + "step": 10535, + "time_per_iteration": 4.269057035446167 + }, + { + "auxiliary_loss_clip": 0.01110282, + "auxiliary_loss_mlp": 0.01037711, + "balance_loss_clip": 1.04232645, + "balance_loss_mlp": 1.02399635, + "epoch": 0.6334585901097249, + "flos": 23659242877440.0, + "grad_norm": 1.7414784178378062, + "language_loss": 0.76938647, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.79086637, + "num_input_tokens_seen": 227100365, + "step": 10536, + "time_per_iteration": 2.6666407585144043 + }, + { + "auxiliary_loss_clip": 0.01102168, + "auxiliary_loss_mlp": 0.01038535, + "balance_loss_clip": 1.04189527, + "balance_loss_mlp": 1.02443218, + "epoch": 0.633518713362393, + "flos": 28760496503040.0, + "grad_norm": 2.0392502828924353, + "language_loss": 0.60273743, + "learning_rate": 1.250899157568855e-06, + "loss": 0.62414443, + "num_input_tokens_seen": 227119680, + "step": 10537, + "time_per_iteration": 2.7295584678649902 + }, + { + "auxiliary_loss_clip": 0.01012372, + "auxiliary_loss_mlp": 0.01000462, + "balance_loss_clip": 1.01797509, + "balance_loss_mlp": 0.99907935, + "epoch": 0.6335788366150609, + "flos": 70420322401920.0, + "grad_norm": 0.7714446209447136, + "language_loss": 0.52451682, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.54464519, + "num_input_tokens_seen": 227184465, + "step": 10538, + "time_per_iteration": 3.3442068099975586 + }, + { + "auxiliary_loss_clip": 0.01100864, + "auxiliary_loss_mlp": 0.01035126, + "balance_loss_clip": 1.04384637, + "balance_loss_mlp": 1.02057028, + "epoch": 0.6336389598677289, + "flos": 23732069702400.0, + "grad_norm": 1.8384221769935791, + "language_loss": 0.83274323, + "learning_rate": 1.250176991556848e-06, + "loss": 0.85410309, + "num_input_tokens_seen": 227202185, + "step": 10539, + "time_per_iteration": 2.696904182434082 + }, + { + "auxiliary_loss_clip": 0.01090255, + "auxiliary_loss_mlp": 0.01032507, + "balance_loss_clip": 1.03990459, + "balance_loss_mlp": 1.01798737, + "epoch": 0.6336990831203968, + "flos": 29276676898560.0, + "grad_norm": 1.6347430731245383, + "language_loss": 0.86721331, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.88844097, + "num_input_tokens_seen": 227222020, + "step": 10540, + "time_per_iteration": 2.7495079040527344 + }, + { + "auxiliary_loss_clip": 0.01091229, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.04014003, + "balance_loss_mlp": 1.02244198, + "epoch": 0.6337592063730648, + "flos": 29096836479360.0, + "grad_norm": 2.116079588237037, + "language_loss": 0.7269882, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.74824154, + "num_input_tokens_seen": 227240885, + "step": 10541, + "time_per_iteration": 2.750035285949707 + }, + { + "auxiliary_loss_clip": 0.01111525, + "auxiliary_loss_mlp": 0.010355, + "balance_loss_clip": 1.04309511, + "balance_loss_mlp": 1.02114677, + "epoch": 0.6338193296257327, + "flos": 34706477249280.0, + "grad_norm": 2.608261813881904, + "language_loss": 0.85043848, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.87190866, + "num_input_tokens_seen": 227257880, + "step": 10542, + "time_per_iteration": 2.7066802978515625 + }, + { + "auxiliary_loss_clip": 0.01107251, + "auxiliary_loss_mlp": 0.01033519, + "balance_loss_clip": 1.04289162, + "balance_loss_mlp": 1.01898777, + "epoch": 0.6338794528784008, + "flos": 16687581269760.0, + "grad_norm": 1.8074408618170101, + "language_loss": 0.77832586, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.79973352, + "num_input_tokens_seen": 227274840, + "step": 10543, + "time_per_iteration": 2.6362385749816895 + }, + { + "auxiliary_loss_clip": 0.01065317, + "auxiliary_loss_mlp": 0.0104211, + "balance_loss_clip": 1.04040122, + "balance_loss_mlp": 1.02933073, + "epoch": 0.6339395761310687, + "flos": 22346600261760.0, + "grad_norm": 1.5926861927585991, + "language_loss": 0.73305023, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.75412452, + "num_input_tokens_seen": 227294835, + "step": 10544, + "time_per_iteration": 2.7428245544433594 + }, + { + "auxiliary_loss_clip": 0.01089874, + "auxiliary_loss_mlp": 0.01039428, + "balance_loss_clip": 1.04020858, + "balance_loss_mlp": 1.02617836, + "epoch": 0.6339996993837367, + "flos": 18551812112640.0, + "grad_norm": 4.4072583606750895, + "language_loss": 0.68668348, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.70797652, + "num_input_tokens_seen": 227314935, + "step": 10545, + "time_per_iteration": 2.8335583209991455 + }, + { + "auxiliary_loss_clip": 0.01092777, + "auxiliary_loss_mlp": 0.01037678, + "balance_loss_clip": 1.03954399, + "balance_loss_mlp": 1.02418935, + "epoch": 0.6340598226364046, + "flos": 12969498614400.0, + "grad_norm": 1.9287987147307617, + "language_loss": 0.70950794, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.73081255, + "num_input_tokens_seen": 227332905, + "step": 10546, + "time_per_iteration": 2.6343114376068115 + }, + { + "auxiliary_loss_clip": 0.01103009, + "auxiliary_loss_mlp": 0.01031279, + "balance_loss_clip": 1.0436604, + "balance_loss_mlp": 1.01867259, + "epoch": 0.6341199458890726, + "flos": 26687984647680.0, + "grad_norm": 1.2499026086544156, + "language_loss": 0.77873629, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.80007923, + "num_input_tokens_seen": 227354915, + "step": 10547, + "time_per_iteration": 2.704674005508423 + }, + { + "auxiliary_loss_clip": 0.01072985, + "auxiliary_loss_mlp": 0.0104046, + "balance_loss_clip": 1.03441143, + "balance_loss_mlp": 1.02732289, + "epoch": 0.6341800691417405, + "flos": 18734274224640.0, + "grad_norm": 1.6184133650868997, + "language_loss": 0.62827075, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.64940524, + "num_input_tokens_seen": 227372990, + "step": 10548, + "time_per_iteration": 2.7401933670043945 + }, + { + "auxiliary_loss_clip": 0.01089619, + "auxiliary_loss_mlp": 0.01038454, + "balance_loss_clip": 1.03783989, + "balance_loss_mlp": 1.02509081, + "epoch": 0.6342401923944085, + "flos": 26249443499520.0, + "grad_norm": 2.3059628412520308, + "language_loss": 0.62195736, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.64323807, + "num_input_tokens_seen": 227393270, + "step": 10549, + "time_per_iteration": 2.825896739959717 + }, + { + "auxiliary_loss_clip": 0.0106782, + "auxiliary_loss_mlp": 0.01035303, + "balance_loss_clip": 1.0408318, + "balance_loss_mlp": 1.02268422, + "epoch": 0.6343003156470765, + "flos": 24680937329280.0, + "grad_norm": 1.71498279606421, + "language_loss": 0.73401284, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.75504404, + "num_input_tokens_seen": 227413630, + "step": 10550, + "time_per_iteration": 2.780163049697876 + }, + { + "auxiliary_loss_clip": 0.0100437, + "auxiliary_loss_mlp": 0.01001031, + "balance_loss_clip": 1.0126493, + "balance_loss_mlp": 0.99974936, + "epoch": 0.6343604388997445, + "flos": 69805352626560.0, + "grad_norm": 0.6910389749764038, + "language_loss": 0.57719415, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.59724814, + "num_input_tokens_seen": 227476630, + "step": 10551, + "time_per_iteration": 3.286808729171753 + }, + { + "auxiliary_loss_clip": 0.01082742, + "auxiliary_loss_mlp": 0.01030085, + "balance_loss_clip": 1.04196656, + "balance_loss_mlp": 1.01796162, + "epoch": 0.6344205621524125, + "flos": 21982430223360.0, + "grad_norm": 1.74505505177434, + "language_loss": 0.67322063, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.69434893, + "num_input_tokens_seen": 227496060, + "step": 10552, + "time_per_iteration": 2.7764453887939453 + }, + { + "auxiliary_loss_clip": 0.01080056, + "auxiliary_loss_mlp": 0.01034576, + "balance_loss_clip": 1.03920615, + "balance_loss_mlp": 1.02086091, + "epoch": 0.6344806854050804, + "flos": 20448865008000.0, + "grad_norm": 1.5562703117807677, + "language_loss": 0.81798071, + "learning_rate": 1.24512502014147e-06, + "loss": 0.839127, + "num_input_tokens_seen": 227513440, + "step": 10553, + "time_per_iteration": 2.7851717472076416 + }, + { + "auxiliary_loss_clip": 0.01106231, + "auxiliary_loss_mlp": 0.0103609, + "balance_loss_clip": 1.04020214, + "balance_loss_mlp": 1.02246475, + "epoch": 0.6345408086577484, + "flos": 40510611187200.0, + "grad_norm": 1.7532654974316204, + "language_loss": 0.5476743, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.56909752, + "num_input_tokens_seen": 227535395, + "step": 10554, + "time_per_iteration": 2.79447078704834 + }, + { + "auxiliary_loss_clip": 0.01096611, + "auxiliary_loss_mlp": 0.01034981, + "balance_loss_clip": 1.0413723, + "balance_loss_mlp": 1.02187991, + "epoch": 0.6346009319104163, + "flos": 21361319222400.0, + "grad_norm": 2.4671241924977583, + "language_loss": 0.70400488, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.72532082, + "num_input_tokens_seen": 227554545, + "step": 10555, + "time_per_iteration": 2.6849427223205566 + }, + { + "auxiliary_loss_clip": 0.01017602, + "auxiliary_loss_mlp": 0.01006112, + "balance_loss_clip": 1.0127604, + "balance_loss_mlp": 1.00490761, + "epoch": 0.6346610551630844, + "flos": 71365419100800.0, + "grad_norm": 0.773594882523352, + "language_loss": 0.55296588, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.57320297, + "num_input_tokens_seen": 227608575, + "step": 10556, + "time_per_iteration": 3.1463379859924316 + }, + { + "auxiliary_loss_clip": 0.01095791, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.0396291, + "balance_loss_mlp": 1.01756358, + "epoch": 0.6347211784157523, + "flos": 25411504049280.0, + "grad_norm": 2.5502749141285848, + "language_loss": 0.67922962, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.70050198, + "num_input_tokens_seen": 227628175, + "step": 10557, + "time_per_iteration": 2.693422794342041 + }, + { + "auxiliary_loss_clip": 0.0108673, + "auxiliary_loss_mlp": 0.01038794, + "balance_loss_clip": 1.03953815, + "balance_loss_mlp": 1.02604496, + "epoch": 0.6347813016684203, + "flos": 15742735966080.0, + "grad_norm": 1.602709548432784, + "language_loss": 0.70369065, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.72494584, + "num_input_tokens_seen": 227645330, + "step": 10558, + "time_per_iteration": 2.671268939971924 + }, + { + "auxiliary_loss_clip": 0.01083073, + "auxiliary_loss_mlp": 0.01034562, + "balance_loss_clip": 1.03938115, + "balance_loss_mlp": 1.02120471, + "epoch": 0.6348414249210882, + "flos": 21464777370240.0, + "grad_norm": 1.4417814449763804, + "language_loss": 0.78316975, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.80434608, + "num_input_tokens_seen": 227665250, + "step": 10559, + "time_per_iteration": 2.7575199604034424 + }, + { + "auxiliary_loss_clip": 0.01090706, + "auxiliary_loss_mlp": 0.01041541, + "balance_loss_clip": 1.03786755, + "balance_loss_mlp": 1.02740252, + "epoch": 0.6349015481737562, + "flos": 21653057485440.0, + "grad_norm": 1.8349318523473441, + "language_loss": 0.67984653, + "learning_rate": 1.242601136020078e-06, + "loss": 0.70116907, + "num_input_tokens_seen": 227685070, + "step": 10560, + "time_per_iteration": 2.6403374671936035 + }, + { + "auxiliary_loss_clip": 0.01089304, + "auxiliary_loss_mlp": 0.01045217, + "balance_loss_clip": 1.03931737, + "balance_loss_mlp": 1.03085184, + "epoch": 0.6349616714264241, + "flos": 22194984954240.0, + "grad_norm": 1.606240636171636, + "language_loss": 0.76797289, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.78931808, + "num_input_tokens_seen": 227704430, + "step": 10561, + "time_per_iteration": 2.7372517585754395 + }, + { + "auxiliary_loss_clip": 0.01093461, + "auxiliary_loss_mlp": 0.01035075, + "balance_loss_clip": 1.03962195, + "balance_loss_mlp": 1.02203918, + "epoch": 0.6350217946790921, + "flos": 25410354814080.0, + "grad_norm": 2.1365474752692966, + "language_loss": 0.71962273, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.74090809, + "num_input_tokens_seen": 227724920, + "step": 10562, + "time_per_iteration": 2.7133450508117676 + }, + { + "auxiliary_loss_clip": 0.01105126, + "auxiliary_loss_mlp": 0.01034495, + "balance_loss_clip": 1.04334474, + "balance_loss_mlp": 1.02005243, + "epoch": 0.63508191793176, + "flos": 19718944732800.0, + "grad_norm": 2.0107972952363413, + "language_loss": 0.80757058, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.8289668, + "num_input_tokens_seen": 227743400, + "step": 10563, + "time_per_iteration": 2.6585617065429688 + }, + { + "auxiliary_loss_clip": 0.01091086, + "auxiliary_loss_mlp": 0.01038953, + "balance_loss_clip": 1.04419041, + "balance_loss_mlp": 1.02567887, + "epoch": 0.6351420411844281, + "flos": 18186923802240.0, + "grad_norm": 2.444256209228289, + "language_loss": 0.81206977, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.83337021, + "num_input_tokens_seen": 227759990, + "step": 10564, + "time_per_iteration": 2.705941915512085 + }, + { + "auxiliary_loss_clip": 0.01087784, + "auxiliary_loss_mlp": 0.01045814, + "balance_loss_clip": 1.04181719, + "balance_loss_mlp": 1.03100812, + "epoch": 0.6352021644370961, + "flos": 33726511422720.0, + "grad_norm": 1.5889053443954093, + "language_loss": 0.72453761, + "learning_rate": 1.240799222993407e-06, + "loss": 0.74587357, + "num_input_tokens_seen": 227780835, + "step": 10565, + "time_per_iteration": 2.765345335006714 + }, + { + "auxiliary_loss_clip": 0.01102461, + "auxiliary_loss_mlp": 0.01033958, + "balance_loss_clip": 1.04256928, + "balance_loss_mlp": 1.01919961, + "epoch": 0.635262287689764, + "flos": 20374781207040.0, + "grad_norm": 2.121063161403432, + "language_loss": 0.69596386, + "learning_rate": 1.240438926700324e-06, + "loss": 0.71732807, + "num_input_tokens_seen": 227798580, + "step": 10566, + "time_per_iteration": 4.550225496292114 + }, + { + "auxiliary_loss_clip": 0.01103568, + "auxiliary_loss_mlp": 0.01033127, + "balance_loss_clip": 1.04312527, + "balance_loss_mlp": 1.0210278, + "epoch": 0.635322410942432, + "flos": 27525421307520.0, + "grad_norm": 1.5800197118440122, + "language_loss": 0.69619238, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.71755934, + "num_input_tokens_seen": 227819210, + "step": 10567, + "time_per_iteration": 2.6888957023620605 + }, + { + "auxiliary_loss_clip": 0.01100039, + "auxiliary_loss_mlp": 0.01031317, + "balance_loss_clip": 1.04216862, + "balance_loss_mlp": 1.01925862, + "epoch": 0.6353825341950999, + "flos": 21543601766400.0, + "grad_norm": 2.2757897203537976, + "language_loss": 0.8449024, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.86621594, + "num_input_tokens_seen": 227838340, + "step": 10568, + "time_per_iteration": 4.255465030670166 + }, + { + "auxiliary_loss_clip": 0.01056215, + "auxiliary_loss_mlp": 0.01041007, + "balance_loss_clip": 1.03819847, + "balance_loss_mlp": 1.026559, + "epoch": 0.635442657447768, + "flos": 31759756185600.0, + "grad_norm": 1.8323936037096342, + "language_loss": 0.84063637, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.86160862, + "num_input_tokens_seen": 227859170, + "step": 10569, + "time_per_iteration": 4.377737760543823 + }, + { + "auxiliary_loss_clip": 0.01104285, + "auxiliary_loss_mlp": 0.01032738, + "balance_loss_clip": 1.04183245, + "balance_loss_mlp": 1.01939797, + "epoch": 0.6355027807004359, + "flos": 19828831415040.0, + "grad_norm": 1.6700504081300207, + "language_loss": 0.69352221, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.71489245, + "num_input_tokens_seen": 227878545, + "step": 10570, + "time_per_iteration": 2.6112160682678223 + }, + { + "auxiliary_loss_clip": 0.01107497, + "auxiliary_loss_mlp": 0.01036944, + "balance_loss_clip": 1.04085815, + "balance_loss_mlp": 1.02342606, + "epoch": 0.6355629039531039, + "flos": 30372383324160.0, + "grad_norm": 1.7288699826037912, + "language_loss": 0.65762198, + "learning_rate": 1.2386378775476e-06, + "loss": 0.67906642, + "num_input_tokens_seen": 227898875, + "step": 10571, + "time_per_iteration": 2.7335216999053955 + }, + { + "auxiliary_loss_clip": 0.01113018, + "auxiliary_loss_mlp": 0.01029154, + "balance_loss_clip": 1.04446983, + "balance_loss_mlp": 1.01616585, + "epoch": 0.6356230272057718, + "flos": 17932065828480.0, + "grad_norm": 1.9788287371045428, + "language_loss": 0.71541518, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.73683691, + "num_input_tokens_seen": 227917130, + "step": 10572, + "time_per_iteration": 2.6052427291870117 + }, + { + "auxiliary_loss_clip": 0.01084769, + "auxiliary_loss_mlp": 0.01034, + "balance_loss_clip": 1.04089427, + "balance_loss_mlp": 1.02181661, + "epoch": 0.6356831504584398, + "flos": 25375844822400.0, + "grad_norm": 1.6900483013767176, + "language_loss": 0.81165767, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.83284533, + "num_input_tokens_seen": 227939550, + "step": 10573, + "time_per_iteration": 2.8153634071350098 + }, + { + "auxiliary_loss_clip": 0.0109877, + "auxiliary_loss_mlp": 0.01033009, + "balance_loss_clip": 1.04272556, + "balance_loss_mlp": 1.02006316, + "epoch": 0.6357432737111077, + "flos": 46500331720320.0, + "grad_norm": 1.6632630908080246, + "language_loss": 0.68936265, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.71068037, + "num_input_tokens_seen": 227962200, + "step": 10574, + "time_per_iteration": 4.407367467880249 + }, + { + "auxiliary_loss_clip": 0.01116558, + "auxiliary_loss_mlp": 0.01031438, + "balance_loss_clip": 1.04334235, + "balance_loss_mlp": 1.01825356, + "epoch": 0.6358033969637757, + "flos": 17274361847040.0, + "grad_norm": 2.216480993085757, + "language_loss": 0.86364478, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.88512474, + "num_input_tokens_seen": 227979270, + "step": 10575, + "time_per_iteration": 2.59047532081604 + }, + { + "auxiliary_loss_clip": 0.01116011, + "auxiliary_loss_mlp": 0.01037179, + "balance_loss_clip": 1.04200649, + "balance_loss_mlp": 1.02420902, + "epoch": 0.6358635202164437, + "flos": 27125520215040.0, + "grad_norm": 1.527365029746322, + "language_loss": 0.72139943, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.74293131, + "num_input_tokens_seen": 228000550, + "step": 10576, + "time_per_iteration": 2.6213035583496094 + }, + { + "auxiliary_loss_clip": 0.01094385, + "auxiliary_loss_mlp": 0.0103256, + "balance_loss_clip": 1.0408107, + "balance_loss_mlp": 1.01913691, + "epoch": 0.6359236434691117, + "flos": 27525205825920.0, + "grad_norm": 1.587362724967965, + "language_loss": 0.69232905, + "learning_rate": 1.236477571455085e-06, + "loss": 0.71359849, + "num_input_tokens_seen": 228022005, + "step": 10577, + "time_per_iteration": 2.6874570846557617 + }, + { + "auxiliary_loss_clip": 0.01076719, + "auxiliary_loss_mlp": 0.01031904, + "balance_loss_clip": 1.04086065, + "balance_loss_mlp": 1.01938713, + "epoch": 0.6359837667217797, + "flos": 39348290989440.0, + "grad_norm": 1.631898217557544, + "language_loss": 0.71984881, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.74093509, + "num_input_tokens_seen": 228043770, + "step": 10578, + "time_per_iteration": 2.956587314605713 + }, + { + "auxiliary_loss_clip": 0.01011581, + "auxiliary_loss_mlp": 0.00752167, + "balance_loss_clip": 1.01532173, + "balance_loss_mlp": 0.99992144, + "epoch": 0.6360438899744476, + "flos": 56413797206400.0, + "grad_norm": 0.7005664562343583, + "language_loss": 0.5446803, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.56231779, + "num_input_tokens_seen": 228104985, + "step": 10579, + "time_per_iteration": 3.3165230751037598 + }, + { + "auxiliary_loss_clip": 0.01090928, + "auxiliary_loss_mlp": 0.01034048, + "balance_loss_clip": 1.03814209, + "balance_loss_mlp": 1.02082229, + "epoch": 0.6361040132271156, + "flos": 24973106555520.0, + "grad_norm": 1.557921238837489, + "language_loss": 0.77395153, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.7952013, + "num_input_tokens_seen": 228125620, + "step": 10580, + "time_per_iteration": 2.712324857711792 + }, + { + "auxiliary_loss_clip": 0.01087081, + "auxiliary_loss_mlp": 0.00770805, + "balance_loss_clip": 1.04100418, + "balance_loss_mlp": 1.00011897, + "epoch": 0.6361641364797835, + "flos": 23259198130560.0, + "grad_norm": 2.013936086375126, + "language_loss": 0.66709065, + "learning_rate": 1.235037946268301e-06, + "loss": 0.68566948, + "num_input_tokens_seen": 228143495, + "step": 10581, + "time_per_iteration": 2.7856929302215576 + }, + { + "auxiliary_loss_clip": 0.01102449, + "auxiliary_loss_mlp": 0.01034551, + "balance_loss_clip": 1.0404247, + "balance_loss_mlp": 1.02227867, + "epoch": 0.6362242597324516, + "flos": 25994513698560.0, + "grad_norm": 1.9398130134586062, + "language_loss": 0.68718088, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.70855093, + "num_input_tokens_seen": 228166500, + "step": 10582, + "time_per_iteration": 2.737300395965576 + }, + { + "auxiliary_loss_clip": 0.01089734, + "auxiliary_loss_mlp": 0.01038152, + "balance_loss_clip": 1.04106402, + "balance_loss_mlp": 1.02545059, + "epoch": 0.6362843829851195, + "flos": 25703242312320.0, + "grad_norm": 2.1615330133159305, + "language_loss": 0.84382987, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.86510873, + "num_input_tokens_seen": 228185325, + "step": 10583, + "time_per_iteration": 2.736928939819336 + }, + { + "auxiliary_loss_clip": 0.01094529, + "auxiliary_loss_mlp": 0.01034443, + "balance_loss_clip": 1.04331303, + "balance_loss_mlp": 1.02157402, + "epoch": 0.6363445062377875, + "flos": 20522912895360.0, + "grad_norm": 1.8294448915060182, + "language_loss": 0.75581825, + "learning_rate": 1.233958531908538e-06, + "loss": 0.77710795, + "num_input_tokens_seen": 228204050, + "step": 10584, + "time_per_iteration": 2.66745662689209 + }, + { + "auxiliary_loss_clip": 0.01092434, + "auxiliary_loss_mlp": 0.01035865, + "balance_loss_clip": 1.04142356, + "balance_loss_mlp": 1.02158976, + "epoch": 0.6364046294904554, + "flos": 19463799450240.0, + "grad_norm": 1.8372541511316505, + "language_loss": 0.72750449, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.74878752, + "num_input_tokens_seen": 228222430, + "step": 10585, + "time_per_iteration": 2.7207906246185303 + }, + { + "auxiliary_loss_clip": 0.01078843, + "auxiliary_loss_mlp": 0.01028745, + "balance_loss_clip": 1.03947806, + "balance_loss_mlp": 1.01638353, + "epoch": 0.6364647527431234, + "flos": 20995892208000.0, + "grad_norm": 1.8754451190030996, + "language_loss": 0.82982284, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.85089874, + "num_input_tokens_seen": 228241925, + "step": 10586, + "time_per_iteration": 2.883169174194336 + }, + { + "auxiliary_loss_clip": 0.01104026, + "auxiliary_loss_mlp": 0.01024669, + "balance_loss_clip": 1.04210103, + "balance_loss_mlp": 1.01253915, + "epoch": 0.6365248759957913, + "flos": 25770789838080.0, + "grad_norm": 2.4347749012599382, + "language_loss": 0.72591609, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.74720299, + "num_input_tokens_seen": 228262535, + "step": 10587, + "time_per_iteration": 2.696120500564575 + }, + { + "auxiliary_loss_clip": 0.01095392, + "auxiliary_loss_mlp": 0.01030465, + "balance_loss_clip": 1.04264998, + "balance_loss_mlp": 1.01770997, + "epoch": 0.6365849992484593, + "flos": 22455589104000.0, + "grad_norm": 2.0432270596750395, + "language_loss": 0.77210999, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.79336858, + "num_input_tokens_seen": 228281340, + "step": 10588, + "time_per_iteration": 2.7811734676361084 + }, + { + "auxiliary_loss_clip": 0.0106633, + "auxiliary_loss_mlp": 0.01028817, + "balance_loss_clip": 1.03860903, + "balance_loss_mlp": 1.0154599, + "epoch": 0.6366451225011273, + "flos": 19025689265280.0, + "grad_norm": 1.4710865244749312, + "language_loss": 0.79949176, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.82044327, + "num_input_tokens_seen": 228300865, + "step": 10589, + "time_per_iteration": 2.8011467456817627 + }, + { + "auxiliary_loss_clip": 0.01093718, + "auxiliary_loss_mlp": 0.01032855, + "balance_loss_clip": 1.03902805, + "balance_loss_mlp": 1.02014768, + "epoch": 0.6367052457537953, + "flos": 25228395492480.0, + "grad_norm": 2.226066060883624, + "language_loss": 0.67151499, + "learning_rate": 1.231800487863257e-06, + "loss": 0.69278073, + "num_input_tokens_seen": 228320815, + "step": 10590, + "time_per_iteration": 2.709080934524536 + }, + { + "auxiliary_loss_clip": 0.01111263, + "auxiliary_loss_mlp": 0.01033159, + "balance_loss_clip": 1.04165292, + "balance_loss_mlp": 1.01980138, + "epoch": 0.6367653690064633, + "flos": 19208438686080.0, + "grad_norm": 2.18709267526875, + "language_loss": 0.78891504, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.81035924, + "num_input_tokens_seen": 228339065, + "step": 10591, + "time_per_iteration": 2.636992931365967 + }, + { + "auxiliary_loss_clip": 0.01092014, + "auxiliary_loss_mlp": 0.01028959, + "balance_loss_clip": 1.04065537, + "balance_loss_mlp": 1.01711535, + "epoch": 0.6368254922591312, + "flos": 23546806329600.0, + "grad_norm": 1.430576733389061, + "language_loss": 0.89153397, + "learning_rate": 1.231081372744317e-06, + "loss": 0.91274369, + "num_input_tokens_seen": 228359210, + "step": 10592, + "time_per_iteration": 2.7107973098754883 + }, + { + "auxiliary_loss_clip": 0.01099214, + "auxiliary_loss_mlp": 0.01027902, + "balance_loss_clip": 1.03750551, + "balance_loss_mlp": 1.01598144, + "epoch": 0.6368856155117992, + "flos": 26467313443200.0, + "grad_norm": 1.4034572445207882, + "language_loss": 0.68212253, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.7033937, + "num_input_tokens_seen": 228379630, + "step": 10593, + "time_per_iteration": 2.807321786880493 + }, + { + "auxiliary_loss_clip": 0.01061371, + "auxiliary_loss_mlp": 0.01042752, + "balance_loss_clip": 1.03203607, + "balance_loss_mlp": 1.02891731, + "epoch": 0.6369457387644671, + "flos": 33692432394240.0, + "grad_norm": 1.761330533007529, + "language_loss": 0.63678664, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.65782785, + "num_input_tokens_seen": 228401410, + "step": 10594, + "time_per_iteration": 2.856600046157837 + }, + { + "auxiliary_loss_clip": 0.01023648, + "auxiliary_loss_mlp": 0.01001204, + "balance_loss_clip": 1.01176047, + "balance_loss_mlp": 0.99982756, + "epoch": 0.6370058620171352, + "flos": 70908600908160.0, + "grad_norm": 0.7623002997880329, + "language_loss": 0.54635006, + "learning_rate": 1.230002918781022e-06, + "loss": 0.56659859, + "num_input_tokens_seen": 228470335, + "step": 10595, + "time_per_iteration": 3.2980732917785645 + }, + { + "auxiliary_loss_clip": 0.01118729, + "auxiliary_loss_mlp": 0.01042081, + "balance_loss_clip": 1.04251242, + "balance_loss_mlp": 1.02855635, + "epoch": 0.6370659852698031, + "flos": 21141940907520.0, + "grad_norm": 2.0781706076151445, + "language_loss": 0.67100823, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.69261628, + "num_input_tokens_seen": 228490765, + "step": 10596, + "time_per_iteration": 2.6011126041412354 + }, + { + "auxiliary_loss_clip": 0.01099686, + "auxiliary_loss_mlp": 0.01037794, + "balance_loss_clip": 1.04006338, + "balance_loss_mlp": 1.02463365, + "epoch": 0.6371261085224711, + "flos": 20193288762240.0, + "grad_norm": 2.011756808968462, + "language_loss": 0.7937991, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.81517392, + "num_input_tokens_seen": 228509700, + "step": 10597, + "time_per_iteration": 2.6972439289093018 + }, + { + "auxiliary_loss_clip": 0.01108387, + "auxiliary_loss_mlp": 0.01037543, + "balance_loss_clip": 1.04363835, + "balance_loss_mlp": 1.02541316, + "epoch": 0.637186231775139, + "flos": 19683536901120.0, + "grad_norm": 1.60919791295429, + "language_loss": 0.74850726, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.76996648, + "num_input_tokens_seen": 228529050, + "step": 10598, + "time_per_iteration": 2.6332266330718994 + }, + { + "auxiliary_loss_clip": 0.01084454, + "auxiliary_loss_mlp": 0.00771297, + "balance_loss_clip": 1.03999043, + "balance_loss_mlp": 1.00013983, + "epoch": 0.637246355027807, + "flos": 13071196995840.0, + "grad_norm": 1.9548116793493355, + "language_loss": 0.68556929, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.70412678, + "num_input_tokens_seen": 228544665, + "step": 10599, + "time_per_iteration": 2.6878466606140137 + }, + { + "auxiliary_loss_clip": 0.01077983, + "auxiliary_loss_mlp": 0.01031504, + "balance_loss_clip": 1.03724337, + "balance_loss_mlp": 1.01745534, + "epoch": 0.6373064782804749, + "flos": 18222654856320.0, + "grad_norm": 2.0583135447897933, + "language_loss": 0.80303937, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.82413423, + "num_input_tokens_seen": 228562060, + "step": 10600, + "time_per_iteration": 2.653907060623169 + }, + { + "auxiliary_loss_clip": 0.01101937, + "auxiliary_loss_mlp": 0.01036294, + "balance_loss_clip": 1.03776395, + "balance_loss_mlp": 1.02380645, + "epoch": 0.637366601533143, + "flos": 24498475217280.0, + "grad_norm": 1.4639641102491714, + "language_loss": 0.79828721, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.81966954, + "num_input_tokens_seen": 228582550, + "step": 10601, + "time_per_iteration": 2.797588586807251 + }, + { + "auxiliary_loss_clip": 0.01085997, + "auxiliary_loss_mlp": 0.01032519, + "balance_loss_clip": 1.04335141, + "balance_loss_mlp": 1.01989436, + "epoch": 0.6374267247858109, + "flos": 26359042872960.0, + "grad_norm": 2.3452009289064737, + "language_loss": 0.6766789, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.69786406, + "num_input_tokens_seen": 228604960, + "step": 10602, + "time_per_iteration": 2.742664098739624 + }, + { + "auxiliary_loss_clip": 0.01037986, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.03193176, + "balance_loss_mlp": 1.02034974, + "epoch": 0.6374868480384789, + "flos": 20371728551040.0, + "grad_norm": 2.210099504390163, + "language_loss": 0.79618657, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.81690341, + "num_input_tokens_seen": 228622195, + "step": 10603, + "time_per_iteration": 2.8134090900421143 + }, + { + "auxiliary_loss_clip": 0.0107315, + "auxiliary_loss_mlp": 0.00770892, + "balance_loss_clip": 1.03933704, + "balance_loss_mlp": 1.00014615, + "epoch": 0.6375469712911469, + "flos": 20996251344000.0, + "grad_norm": 1.8573318102619591, + "language_loss": 0.76802522, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.78646559, + "num_input_tokens_seen": 228639735, + "step": 10604, + "time_per_iteration": 2.7761478424072266 + }, + { + "auxiliary_loss_clip": 0.01095415, + "auxiliary_loss_mlp": 0.01031172, + "balance_loss_clip": 1.03836191, + "balance_loss_mlp": 1.01792753, + "epoch": 0.6376070945438148, + "flos": 19715748422400.0, + "grad_norm": 1.6662789413728705, + "language_loss": 0.76640069, + "learning_rate": 1.226409972197281e-06, + "loss": 0.78766656, + "num_input_tokens_seen": 228658195, + "step": 10605, + "time_per_iteration": 4.650303602218628 + }, + { + "auxiliary_loss_clip": 0.01057897, + "auxiliary_loss_mlp": 0.01038795, + "balance_loss_clip": 1.03824091, + "balance_loss_mlp": 1.02234411, + "epoch": 0.6376672177964828, + "flos": 21506757390720.0, + "grad_norm": 1.7802518386545212, + "language_loss": 0.65565449, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.67662132, + "num_input_tokens_seen": 228677415, + "step": 10606, + "time_per_iteration": 2.8175783157348633 + }, + { + "auxiliary_loss_clip": 0.01090718, + "auxiliary_loss_mlp": 0.01037026, + "balance_loss_clip": 1.04083657, + "balance_loss_mlp": 1.02489638, + "epoch": 0.6377273410491507, + "flos": 18843873598080.0, + "grad_norm": 1.601218417819437, + "language_loss": 0.75069982, + "learning_rate": 1.225691734459971e-06, + "loss": 0.77197731, + "num_input_tokens_seen": 228696450, + "step": 10607, + "time_per_iteration": 2.6365914344787598 + }, + { + "auxiliary_loss_clip": 0.01091801, + "auxiliary_loss_mlp": 0.01037938, + "balance_loss_clip": 1.04039049, + "balance_loss_mlp": 1.02553403, + "epoch": 0.6377874643018188, + "flos": 53062970181120.0, + "grad_norm": 1.5840122270167216, + "language_loss": 0.65928984, + "learning_rate": 1.225332659627278e-06, + "loss": 0.68058717, + "num_input_tokens_seen": 228721600, + "step": 10608, + "time_per_iteration": 4.558081150054932 + }, + { + "auxiliary_loss_clip": 0.00982544, + "auxiliary_loss_mlp": 0.01007387, + "balance_loss_clip": 1.01596785, + "balance_loss_mlp": 1.00617146, + "epoch": 0.6378475875544867, + "flos": 65135026465920.0, + "grad_norm": 0.7133010996130292, + "language_loss": 0.51879215, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.53869152, + "num_input_tokens_seen": 228784535, + "step": 10609, + "time_per_iteration": 3.3632545471191406 + }, + { + "auxiliary_loss_clip": 0.0109935, + "auxiliary_loss_mlp": 0.01025243, + "balance_loss_clip": 1.03736722, + "balance_loss_mlp": 1.01379943, + "epoch": 0.6379077108071547, + "flos": 23002759958400.0, + "grad_norm": 1.6332455111471063, + "language_loss": 0.74713194, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.7683779, + "num_input_tokens_seen": 228804110, + "step": 10610, + "time_per_iteration": 3.2196428775787354 + }, + { + "auxiliary_loss_clip": 0.0101651, + "auxiliary_loss_mlp": 0.0100476, + "balance_loss_clip": 1.01297092, + "balance_loss_mlp": 1.00353765, + "epoch": 0.6379678340598226, + "flos": 67601947610880.0, + "grad_norm": 0.8493432056950548, + "language_loss": 0.63061231, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.65082502, + "num_input_tokens_seen": 228867705, + "step": 10611, + "time_per_iteration": 3.272512435913086 + }, + { + "auxiliary_loss_clip": 0.01103402, + "auxiliary_loss_mlp": 0.01034119, + "balance_loss_clip": 1.04139113, + "balance_loss_mlp": 1.0207442, + "epoch": 0.6380279573124906, + "flos": 29680061610240.0, + "grad_norm": 1.8312259315457267, + "language_loss": 0.72302759, + "learning_rate": 1.223896654187282e-06, + "loss": 0.74440277, + "num_input_tokens_seen": 228889215, + "step": 10612, + "time_per_iteration": 2.7299270629882812 + }, + { + "auxiliary_loss_clip": 0.01015421, + "auxiliary_loss_mlp": 0.0100432, + "balance_loss_clip": 1.0106107, + "balance_loss_mlp": 1.00311053, + "epoch": 0.6380880805651585, + "flos": 66484046580480.0, + "grad_norm": 0.7098749409658618, + "language_loss": 0.57844174, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.59863913, + "num_input_tokens_seen": 228948465, + "step": 10613, + "time_per_iteration": 4.943511009216309 + }, + { + "auxiliary_loss_clip": 0.01071494, + "auxiliary_loss_mlp": 0.01035158, + "balance_loss_clip": 1.03659904, + "balance_loss_mlp": 1.02168155, + "epoch": 0.6381482038178266, + "flos": 23914998691200.0, + "grad_norm": 1.7198956941454036, + "language_loss": 0.75381726, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.77488375, + "num_input_tokens_seen": 228967955, + "step": 10614, + "time_per_iteration": 2.8167922496795654 + }, + { + "auxiliary_loss_clip": 0.01094834, + "auxiliary_loss_mlp": 0.00770691, + "balance_loss_clip": 1.04056311, + "balance_loss_mlp": 1.00018597, + "epoch": 0.6382083270704945, + "flos": 24243042625920.0, + "grad_norm": 1.8795242058434967, + "language_loss": 0.79825491, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.81691015, + "num_input_tokens_seen": 228985495, + "step": 10615, + "time_per_iteration": 2.769399642944336 + }, + { + "auxiliary_loss_clip": 0.01013557, + "auxiliary_loss_mlp": 0.01001876, + "balance_loss_clip": 1.01154137, + "balance_loss_mlp": 1.00048769, + "epoch": 0.6382684503231625, + "flos": 70775552931840.0, + "grad_norm": 0.6556730902042093, + "language_loss": 0.55564505, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.57579941, + "num_input_tokens_seen": 229052995, + "step": 10616, + "time_per_iteration": 3.277085542678833 + }, + { + "auxiliary_loss_clip": 0.01086789, + "auxiliary_loss_mlp": 0.01036908, + "balance_loss_clip": 1.0364368, + "balance_loss_mlp": 1.0233475, + "epoch": 0.6383285735758305, + "flos": 16544836621440.0, + "grad_norm": 1.9142103073146424, + "language_loss": 0.83900499, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.86024189, + "num_input_tokens_seen": 229071030, + "step": 10617, + "time_per_iteration": 2.712834119796753 + }, + { + "auxiliary_loss_clip": 0.0110772, + "auxiliary_loss_mlp": 0.0104261, + "balance_loss_clip": 1.04189885, + "balance_loss_mlp": 1.02821589, + "epoch": 0.6383886968284984, + "flos": 14427651225600.0, + "grad_norm": 1.8904429928249138, + "language_loss": 0.87499708, + "learning_rate": 1.221743529196936e-06, + "loss": 0.89650035, + "num_input_tokens_seen": 229088275, + "step": 10618, + "time_per_iteration": 2.6345932483673096 + }, + { + "auxiliary_loss_clip": 0.01068321, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.04150379, + "balance_loss_mlp": 1.02012992, + "epoch": 0.6384488200811664, + "flos": 17929659617280.0, + "grad_norm": 1.7304686428232843, + "language_loss": 0.73287666, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.75388002, + "num_input_tokens_seen": 229105190, + "step": 10619, + "time_per_iteration": 2.777869701385498 + }, + { + "auxiliary_loss_clip": 0.0109667, + "auxiliary_loss_mlp": 0.01037459, + "balance_loss_clip": 1.04080129, + "balance_loss_mlp": 1.02271247, + "epoch": 0.6385089433338343, + "flos": 18515578268160.0, + "grad_norm": 1.9267832317981652, + "language_loss": 0.76312691, + "learning_rate": 1.221026056814193e-06, + "loss": 0.78446817, + "num_input_tokens_seen": 229122290, + "step": 10620, + "time_per_iteration": 2.701122760772705 + }, + { + "auxiliary_loss_clip": 0.01093794, + "auxiliary_loss_mlp": 0.01029286, + "balance_loss_clip": 1.04239035, + "balance_loss_mlp": 1.01672101, + "epoch": 0.6385690665865024, + "flos": 24753620499840.0, + "grad_norm": 2.5441546745937114, + "language_loss": 0.70669818, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.727929, + "num_input_tokens_seen": 229141620, + "step": 10621, + "time_per_iteration": 2.7129428386688232 + }, + { + "auxiliary_loss_clip": 0.01085349, + "auxiliary_loss_mlp": 0.0102653, + "balance_loss_clip": 1.03596258, + "balance_loss_mlp": 1.01482916, + "epoch": 0.6386291898391703, + "flos": 20120569678080.0, + "grad_norm": 1.616578696475536, + "language_loss": 0.77862823, + "learning_rate": 1.220308702586529e-06, + "loss": 0.79974699, + "num_input_tokens_seen": 229161570, + "step": 10622, + "time_per_iteration": 2.722543954849243 + }, + { + "auxiliary_loss_clip": 0.01075591, + "auxiliary_loss_mlp": 0.01030912, + "balance_loss_clip": 1.03845859, + "balance_loss_mlp": 1.01837754, + "epoch": 0.6386893130918383, + "flos": 16867278034560.0, + "grad_norm": 1.771071416148221, + "language_loss": 0.74746549, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.76853049, + "num_input_tokens_seen": 229178465, + "step": 10623, + "time_per_iteration": 2.728158712387085 + }, + { + "auxiliary_loss_clip": 0.0109049, + "auxiliary_loss_mlp": 0.01029194, + "balance_loss_clip": 1.03953004, + "balance_loss_mlp": 1.01796472, + "epoch": 0.6387494363445062, + "flos": 22966274718720.0, + "grad_norm": 1.3721054330124807, + "language_loss": 0.76588684, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.78708369, + "num_input_tokens_seen": 229198975, + "step": 10624, + "time_per_iteration": 2.833406925201416 + }, + { + "auxiliary_loss_clip": 0.0105041, + "auxiliary_loss_mlp": 0.0103608, + "balance_loss_clip": 1.03588271, + "balance_loss_mlp": 1.02247274, + "epoch": 0.6388095595971742, + "flos": 22857716839680.0, + "grad_norm": 1.873995828783276, + "language_loss": 0.80408549, + "learning_rate": 1.21923289302382e-06, + "loss": 0.82495034, + "num_input_tokens_seen": 229218825, + "step": 10625, + "time_per_iteration": 2.810683488845825 + }, + { + "auxiliary_loss_clip": 0.01094331, + "auxiliary_loss_mlp": 0.01033337, + "balance_loss_clip": 1.04317892, + "balance_loss_mlp": 1.02039063, + "epoch": 0.6388696828498421, + "flos": 17311529445120.0, + "grad_norm": 1.9242726484746675, + "language_loss": 0.72490007, + "learning_rate": 1.218874349031654e-06, + "loss": 0.74617672, + "num_input_tokens_seen": 229236060, + "step": 10626, + "time_per_iteration": 2.667686939239502 + }, + { + "auxiliary_loss_clip": 0.01093032, + "auxiliary_loss_mlp": 0.01033337, + "balance_loss_clip": 1.03836656, + "balance_loss_mlp": 1.02036738, + "epoch": 0.6389298061025102, + "flos": 17128636369920.0, + "grad_norm": 1.8547762721762564, + "language_loss": 0.72446245, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.74572611, + "num_input_tokens_seen": 229255160, + "step": 10627, + "time_per_iteration": 2.681147575378418 + }, + { + "auxiliary_loss_clip": 0.01095264, + "auxiliary_loss_mlp": 0.01034256, + "balance_loss_clip": 1.04398704, + "balance_loss_mlp": 1.01995111, + "epoch": 0.6389899293551781, + "flos": 27710971989120.0, + "grad_norm": 1.6812239823438198, + "language_loss": 0.67369878, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.69499397, + "num_input_tokens_seen": 229278705, + "step": 10628, + "time_per_iteration": 2.7938716411590576 + }, + { + "auxiliary_loss_clip": 0.0111173, + "auxiliary_loss_mlp": 0.01029902, + "balance_loss_clip": 1.04083705, + "balance_loss_mlp": 1.01804066, + "epoch": 0.6390500526078461, + "flos": 21215701486080.0, + "grad_norm": 1.7139884939852632, + "language_loss": 0.68161869, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.703035, + "num_input_tokens_seen": 229299990, + "step": 10629, + "time_per_iteration": 2.644061803817749 + }, + { + "auxiliary_loss_clip": 0.01079014, + "auxiliary_loss_mlp": 0.01040793, + "balance_loss_clip": 1.03948665, + "balance_loss_mlp": 1.02554584, + "epoch": 0.6391101758605141, + "flos": 21581056673280.0, + "grad_norm": 1.5487398291576047, + "language_loss": 0.75722307, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.77842116, + "num_input_tokens_seen": 229319230, + "step": 10630, + "time_per_iteration": 2.7381680011749268 + }, + { + "auxiliary_loss_clip": 0.01089485, + "auxiliary_loss_mlp": 0.01035881, + "balance_loss_clip": 1.03773403, + "balance_loss_mlp": 1.02401352, + "epoch": 0.639170299113182, + "flos": 19900473091200.0, + "grad_norm": 1.4699321095065776, + "language_loss": 0.7028895, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.72414321, + "num_input_tokens_seen": 229338600, + "step": 10631, + "time_per_iteration": 2.76301908493042 + }, + { + "auxiliary_loss_clip": 0.01010735, + "auxiliary_loss_mlp": 0.01020885, + "balance_loss_clip": 1.00987029, + "balance_loss_mlp": 1.01946056, + "epoch": 0.63923042236585, + "flos": 69877604833920.0, + "grad_norm": 1.2867788563374962, + "language_loss": 0.62960958, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.64992577, + "num_input_tokens_seen": 229402420, + "step": 10632, + "time_per_iteration": 3.23628306388855 + }, + { + "auxiliary_loss_clip": 0.01092617, + "auxiliary_loss_mlp": 0.01034269, + "balance_loss_clip": 1.04134142, + "balance_loss_mlp": 1.02143598, + "epoch": 0.639290545618518, + "flos": 22674823764480.0, + "grad_norm": 11.316815321652387, + "language_loss": 0.66998363, + "learning_rate": 1.216365371217893e-06, + "loss": 0.69125253, + "num_input_tokens_seen": 229419185, + "step": 10633, + "time_per_iteration": 2.719403028488159 + }, + { + "auxiliary_loss_clip": 0.01051248, + "auxiliary_loss_mlp": 0.01028599, + "balance_loss_clip": 1.04067874, + "balance_loss_mlp": 1.01645792, + "epoch": 0.639350668871186, + "flos": 19829190551040.0, + "grad_norm": 2.281228369443932, + "language_loss": 0.81935, + "learning_rate": 1.216007064569225e-06, + "loss": 0.84014845, + "num_input_tokens_seen": 229436735, + "step": 10634, + "time_per_iteration": 2.8779945373535156 + }, + { + "auxiliary_loss_clip": 0.01089506, + "auxiliary_loss_mlp": 0.01036012, + "balance_loss_clip": 1.0404712, + "balance_loss_mlp": 1.02211165, + "epoch": 0.6394107921238539, + "flos": 20553328736640.0, + "grad_norm": 1.5224758560315717, + "language_loss": 0.74918383, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.77043903, + "num_input_tokens_seen": 229455595, + "step": 10635, + "time_per_iteration": 2.7275381088256836 + }, + { + "auxiliary_loss_clip": 0.0110297, + "auxiliary_loss_mlp": 0.01033837, + "balance_loss_clip": 1.04365182, + "balance_loss_mlp": 1.02071238, + "epoch": 0.6394709153765219, + "flos": 25774991729280.0, + "grad_norm": 1.6416528841405902, + "language_loss": 0.71164483, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.73301286, + "num_input_tokens_seen": 229476230, + "step": 10636, + "time_per_iteration": 2.6989855766296387 + }, + { + "auxiliary_loss_clip": 0.0109626, + "auxiliary_loss_mlp": 0.01037788, + "balance_loss_clip": 1.04154992, + "balance_loss_mlp": 1.02471662, + "epoch": 0.6395310386291898, + "flos": 17530153574400.0, + "grad_norm": 1.863216274856941, + "language_loss": 0.73810291, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.7594434, + "num_input_tokens_seen": 229494300, + "step": 10637, + "time_per_iteration": 2.7064554691314697 + }, + { + "auxiliary_loss_clip": 0.01102986, + "auxiliary_loss_mlp": 0.01035332, + "balance_loss_clip": 1.04232454, + "balance_loss_mlp": 1.0214324, + "epoch": 0.6395911618818578, + "flos": 18588225525120.0, + "grad_norm": 1.8583759044592125, + "language_loss": 0.77674294, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.7981261, + "num_input_tokens_seen": 229512985, + "step": 10638, + "time_per_iteration": 2.742272138595581 + }, + { + "auxiliary_loss_clip": 0.01092544, + "auxiliary_loss_mlp": 0.01035401, + "balance_loss_clip": 1.039186, + "balance_loss_mlp": 1.02218056, + "epoch": 0.6396512851345257, + "flos": 28366557068160.0, + "grad_norm": 1.7706841809309422, + "language_loss": 0.81434906, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.83562851, + "num_input_tokens_seen": 229534270, + "step": 10639, + "time_per_iteration": 2.7076473236083984 + }, + { + "auxiliary_loss_clip": 0.0101793, + "auxiliary_loss_mlp": 0.0099976, + "balance_loss_clip": 1.01366258, + "balance_loss_mlp": 0.9985556, + "epoch": 0.6397114083871938, + "flos": 70724307202560.0, + "grad_norm": 0.8066832194631076, + "language_loss": 0.58980644, + "learning_rate": 1.21385784946359e-06, + "loss": 0.60998333, + "num_input_tokens_seen": 229596455, + "step": 10640, + "time_per_iteration": 3.175328254699707 + }, + { + "auxiliary_loss_clip": 0.01081778, + "auxiliary_loss_mlp": 0.01030879, + "balance_loss_clip": 1.03485847, + "balance_loss_mlp": 1.01876175, + "epoch": 0.6397715316398617, + "flos": 18142537570560.0, + "grad_norm": 1.8250663746988522, + "language_loss": 0.78291178, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.80403835, + "num_input_tokens_seen": 229612860, + "step": 10641, + "time_per_iteration": 2.6736910343170166 + }, + { + "auxiliary_loss_clip": 0.01069736, + "auxiliary_loss_mlp": 0.01041571, + "balance_loss_clip": 1.03781104, + "balance_loss_mlp": 1.02828479, + "epoch": 0.6398316548925297, + "flos": 25739512070400.0, + "grad_norm": 1.5814049726496198, + "language_loss": 0.63194126, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.65305436, + "num_input_tokens_seen": 229633960, + "step": 10642, + "time_per_iteration": 2.840916156768799 + }, + { + "auxiliary_loss_clip": 0.01004085, + "auxiliary_loss_mlp": 0.01008093, + "balance_loss_clip": 1.00885439, + "balance_loss_mlp": 1.00666296, + "epoch": 0.6398917781451977, + "flos": 71214234756480.0, + "grad_norm": 0.9138015475084418, + "language_loss": 0.55936515, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.57948697, + "num_input_tokens_seen": 229686730, + "step": 10643, + "time_per_iteration": 3.134157419204712 + }, + { + "auxiliary_loss_clip": 0.01082549, + "auxiliary_loss_mlp": 0.01028613, + "balance_loss_clip": 1.03844333, + "balance_loss_mlp": 1.01577973, + "epoch": 0.6399519013978656, + "flos": 20521835487360.0, + "grad_norm": 2.4755783411685055, + "language_loss": 0.76844835, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.78955996, + "num_input_tokens_seen": 229704800, + "step": 10644, + "time_per_iteration": 2.750016212463379 + }, + { + "auxiliary_loss_clip": 0.01083772, + "auxiliary_loss_mlp": 0.0103714, + "balance_loss_clip": 1.04259241, + "balance_loss_mlp": 1.02343059, + "epoch": 0.6400120246505336, + "flos": 24460840742400.0, + "grad_norm": 1.476966637211995, + "language_loss": 0.82139534, + "learning_rate": 1.212067656542203e-06, + "loss": 0.84260446, + "num_input_tokens_seen": 229725265, + "step": 10645, + "time_per_iteration": 4.434756755828857 + }, + { + "auxiliary_loss_clip": 0.01108206, + "auxiliary_loss_mlp": 0.01043381, + "balance_loss_clip": 1.0400579, + "balance_loss_mlp": 1.02844369, + "epoch": 0.6400721479032015, + "flos": 28366090191360.0, + "grad_norm": 1.9873684481859661, + "language_loss": 0.73491621, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.75643206, + "num_input_tokens_seen": 229744840, + "step": 10646, + "time_per_iteration": 2.790422201156616 + }, + { + "auxiliary_loss_clip": 0.01076409, + "auxiliary_loss_mlp": 0.01036032, + "balance_loss_clip": 1.037462, + "balance_loss_mlp": 1.02220368, + "epoch": 0.6401322711558696, + "flos": 17816540711040.0, + "grad_norm": 2.1141413827607227, + "language_loss": 0.79825467, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.81937909, + "num_input_tokens_seen": 229759095, + "step": 10647, + "time_per_iteration": 6.299994707107544 + }, + { + "auxiliary_loss_clip": 0.0106918, + "auxiliary_loss_mlp": 0.01033575, + "balance_loss_clip": 1.03744197, + "balance_loss_mlp": 1.02105761, + "epoch": 0.6401923944085375, + "flos": 26030855283840.0, + "grad_norm": 1.5992559976065106, + "language_loss": 0.75935119, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.7803787, + "num_input_tokens_seen": 229777750, + "step": 10648, + "time_per_iteration": 2.823535680770874 + }, + { + "auxiliary_loss_clip": 0.01088631, + "auxiliary_loss_mlp": 0.01035586, + "balance_loss_clip": 1.03901458, + "balance_loss_mlp": 1.02278256, + "epoch": 0.6402525176612055, + "flos": 23586451966080.0, + "grad_norm": 3.2506814778416566, + "language_loss": 0.78615916, + "learning_rate": 1.210636039936138e-06, + "loss": 0.80740136, + "num_input_tokens_seen": 229796785, + "step": 10649, + "time_per_iteration": 2.7334954738616943 + }, + { + "auxiliary_loss_clip": 0.01058756, + "auxiliary_loss_mlp": 0.01037312, + "balance_loss_clip": 1.03965068, + "balance_loss_mlp": 1.02403259, + "epoch": 0.6403126409138734, + "flos": 18041413806720.0, + "grad_norm": 4.7583637580681515, + "language_loss": 0.75450838, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.77546906, + "num_input_tokens_seen": 229815425, + "step": 10650, + "time_per_iteration": 2.834925651550293 + }, + { + "auxiliary_loss_clip": 0.01114658, + "auxiliary_loss_mlp": 0.01038182, + "balance_loss_clip": 1.04058218, + "balance_loss_mlp": 1.02501488, + "epoch": 0.6403727641665414, + "flos": 21979485308160.0, + "grad_norm": 1.5877577982319235, + "language_loss": 0.7111091, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.73263752, + "num_input_tokens_seen": 229834545, + "step": 10651, + "time_per_iteration": 2.599517345428467 + }, + { + "auxiliary_loss_clip": 0.01082313, + "auxiliary_loss_mlp": 0.01041331, + "balance_loss_clip": 1.03811073, + "balance_loss_mlp": 1.02803898, + "epoch": 0.6404328874192093, + "flos": 24895539135360.0, + "grad_norm": 2.6398543727492494, + "language_loss": 0.63837707, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.65961355, + "num_input_tokens_seen": 229849175, + "step": 10652, + "time_per_iteration": 4.367003679275513 + }, + { + "auxiliary_loss_clip": 0.0109017, + "auxiliary_loss_mlp": 0.01029675, + "balance_loss_clip": 1.03734291, + "balance_loss_mlp": 1.01728261, + "epoch": 0.6404930106718774, + "flos": 17597198309760.0, + "grad_norm": 2.0413197407443247, + "language_loss": 0.79417443, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.81537288, + "num_input_tokens_seen": 229865400, + "step": 10653, + "time_per_iteration": 2.672642707824707 + }, + { + "auxiliary_loss_clip": 0.01089835, + "auxiliary_loss_mlp": 0.01057293, + "balance_loss_clip": 1.03523707, + "balance_loss_mlp": 1.04088974, + "epoch": 0.6405531339245453, + "flos": 20157880930560.0, + "grad_norm": 2.1735639110567884, + "language_loss": 0.70573318, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.72720444, + "num_input_tokens_seen": 229882945, + "step": 10654, + "time_per_iteration": 2.6905150413513184 + }, + { + "auxiliary_loss_clip": 0.01109265, + "auxiliary_loss_mlp": 0.01041023, + "balance_loss_clip": 1.04214334, + "balance_loss_mlp": 1.02721834, + "epoch": 0.6406132571772133, + "flos": 21942281796480.0, + "grad_norm": 1.704852134606112, + "language_loss": 0.73023099, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.75173384, + "num_input_tokens_seen": 229901590, + "step": 10655, + "time_per_iteration": 2.6235902309417725 + }, + { + "auxiliary_loss_clip": 0.01082305, + "auxiliary_loss_mlp": 0.01040345, + "balance_loss_clip": 1.04245615, + "balance_loss_mlp": 1.0268271, + "epoch": 0.6406733804298813, + "flos": 28768002445440.0, + "grad_norm": 1.5348114269310231, + "language_loss": 0.82592511, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.84715158, + "num_input_tokens_seen": 229922535, + "step": 10656, + "time_per_iteration": 2.786027193069458 + }, + { + "auxiliary_loss_clip": 0.01057312, + "auxiliary_loss_mlp": 0.01037289, + "balance_loss_clip": 1.034778, + "balance_loss_mlp": 1.02465284, + "epoch": 0.6407335036825492, + "flos": 17457183095040.0, + "grad_norm": 2.2686127713919566, + "language_loss": 0.72339928, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.74434525, + "num_input_tokens_seen": 229939575, + "step": 10657, + "time_per_iteration": 2.7300093173980713 + }, + { + "auxiliary_loss_clip": 0.01080913, + "auxiliary_loss_mlp": 0.01039634, + "balance_loss_clip": 1.03770339, + "balance_loss_mlp": 1.0274924, + "epoch": 0.6407936269352172, + "flos": 22125282612480.0, + "grad_norm": 2.024621973540982, + "language_loss": 0.77556098, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.7967664, + "num_input_tokens_seen": 229958840, + "step": 10658, + "time_per_iteration": 2.7543232440948486 + }, + { + "auxiliary_loss_clip": 0.01119551, + "auxiliary_loss_mlp": 0.01041614, + "balance_loss_clip": 1.04269636, + "balance_loss_mlp": 1.02797651, + "epoch": 0.6408537501878852, + "flos": 23110635479040.0, + "grad_norm": 2.31675003494523, + "language_loss": 0.76086068, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.78247231, + "num_input_tokens_seen": 229979680, + "step": 10659, + "time_per_iteration": 2.64536190032959 + }, + { + "auxiliary_loss_clip": 0.01105159, + "auxiliary_loss_mlp": 0.01032937, + "balance_loss_clip": 1.04132307, + "balance_loss_mlp": 1.01971078, + "epoch": 0.6409138734405532, + "flos": 16472440759680.0, + "grad_norm": 1.82994064834737, + "language_loss": 0.78033829, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.80171925, + "num_input_tokens_seen": 229996830, + "step": 10660, + "time_per_iteration": 2.6234161853790283 + }, + { + "auxiliary_loss_clip": 0.01092799, + "auxiliary_loss_mlp": 0.01035048, + "balance_loss_clip": 1.03941202, + "balance_loss_mlp": 1.02086258, + "epoch": 0.6409739966932211, + "flos": 22777922776320.0, + "grad_norm": 1.735823034314566, + "language_loss": 0.68326354, + "learning_rate": 1.206344067135727e-06, + "loss": 0.70454198, + "num_input_tokens_seen": 230015115, + "step": 10661, + "time_per_iteration": 2.7175955772399902 + }, + { + "auxiliary_loss_clip": 0.01114459, + "auxiliary_loss_mlp": 0.01038734, + "balance_loss_clip": 1.04276872, + "balance_loss_mlp": 1.02682471, + "epoch": 0.6410341199458891, + "flos": 25152049134720.0, + "grad_norm": 1.9252684871674384, + "language_loss": 0.75755298, + "learning_rate": 1.205986598033362e-06, + "loss": 0.77908492, + "num_input_tokens_seen": 230035515, + "step": 10662, + "time_per_iteration": 2.633653402328491 + }, + { + "auxiliary_loss_clip": 0.01098112, + "auxiliary_loss_mlp": 0.01035568, + "balance_loss_clip": 1.03684235, + "balance_loss_mlp": 1.02221704, + "epoch": 0.641094243198557, + "flos": 27046193028480.0, + "grad_norm": 2.784052529669845, + "language_loss": 0.70107532, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.72241217, + "num_input_tokens_seen": 230054355, + "step": 10663, + "time_per_iteration": 2.7310519218444824 + }, + { + "auxiliary_loss_clip": 0.01083056, + "auxiliary_loss_mlp": 0.0104552, + "balance_loss_clip": 1.04077351, + "balance_loss_mlp": 1.03102446, + "epoch": 0.641154366451225, + "flos": 25374551932800.0, + "grad_norm": 1.9822481402863719, + "language_loss": 0.67971885, + "learning_rate": 1.205271750169389e-06, + "loss": 0.70100462, + "num_input_tokens_seen": 230074605, + "step": 10664, + "time_per_iteration": 2.773348093032837 + }, + { + "auxiliary_loss_clip": 0.01087025, + "auxiliary_loss_mlp": 0.01033843, + "balance_loss_clip": 1.03581822, + "balance_loss_mlp": 1.02188087, + "epoch": 0.6412144897038929, + "flos": 25153342024320.0, + "grad_norm": 1.8870991168532496, + "language_loss": 0.66328347, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.68449211, + "num_input_tokens_seen": 230093820, + "step": 10665, + "time_per_iteration": 2.6490859985351562 + }, + { + "auxiliary_loss_clip": 0.01103479, + "auxiliary_loss_mlp": 0.01027966, + "balance_loss_clip": 1.04036629, + "balance_loss_mlp": 1.01522827, + "epoch": 0.641274612956561, + "flos": 23440762402560.0, + "grad_norm": 1.6713056871656586, + "language_loss": 0.6435259, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.66484034, + "num_input_tokens_seen": 230114285, + "step": 10666, + "time_per_iteration": 2.667050361633301 + }, + { + "auxiliary_loss_clip": 0.01105312, + "auxiliary_loss_mlp": 0.01033422, + "balance_loss_clip": 1.04096031, + "balance_loss_mlp": 1.02103066, + "epoch": 0.6413347362092289, + "flos": 19427493778560.0, + "grad_norm": 1.5002235875983176, + "language_loss": 0.70960593, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.73099327, + "num_input_tokens_seen": 230132760, + "step": 10667, + "time_per_iteration": 2.701289176940918 + }, + { + "auxiliary_loss_clip": 0.01066227, + "auxiliary_loss_mlp": 0.00773491, + "balance_loss_clip": 1.0367496, + "balance_loss_mlp": 1.00030184, + "epoch": 0.6413948594618969, + "flos": 17196578945280.0, + "grad_norm": 2.416405769977824, + "language_loss": 0.77665913, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.79505634, + "num_input_tokens_seen": 230149690, + "step": 10668, + "time_per_iteration": 2.746056079864502 + }, + { + "auxiliary_loss_clip": 0.01108161, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.04348612, + "balance_loss_mlp": 1.02366185, + "epoch": 0.6414549827145648, + "flos": 22269787027200.0, + "grad_norm": 1.4845911693701175, + "language_loss": 0.67707181, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.69851947, + "num_input_tokens_seen": 230166950, + "step": 10669, + "time_per_iteration": 2.7345635890960693 + }, + { + "auxiliary_loss_clip": 0.0111572, + "auxiliary_loss_mlp": 0.01038211, + "balance_loss_clip": 1.04545701, + "balance_loss_mlp": 1.02449608, + "epoch": 0.6415151059672328, + "flos": 19640192163840.0, + "grad_norm": 2.894165174832574, + "language_loss": 0.78665972, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.80819899, + "num_input_tokens_seen": 230184785, + "step": 10670, + "time_per_iteration": 2.6661479473114014 + }, + { + "auxiliary_loss_clip": 0.01081535, + "auxiliary_loss_mlp": 0.01035319, + "balance_loss_clip": 1.03874564, + "balance_loss_mlp": 1.02164531, + "epoch": 0.6415752292199008, + "flos": 14865833237760.0, + "grad_norm": 2.1933536907134554, + "language_loss": 0.88588488, + "learning_rate": 1.20277073264638e-06, + "loss": 0.90705341, + "num_input_tokens_seen": 230201385, + "step": 10671, + "time_per_iteration": 2.641057252883911 + }, + { + "auxiliary_loss_clip": 0.01104202, + "auxiliary_loss_mlp": 0.01028531, + "balance_loss_clip": 1.04201674, + "balance_loss_mlp": 1.01649058, + "epoch": 0.6416353524725688, + "flos": 13735580906880.0, + "grad_norm": 1.6223655469146963, + "language_loss": 0.68986869, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.71119601, + "num_input_tokens_seen": 230220380, + "step": 10672, + "time_per_iteration": 2.6609199047088623 + }, + { + "auxiliary_loss_clip": 0.01111137, + "auxiliary_loss_mlp": 0.01033932, + "balance_loss_clip": 1.04236984, + "balance_loss_mlp": 1.01922166, + "epoch": 0.6416954757252368, + "flos": 24534924543360.0, + "grad_norm": 2.291371400531435, + "language_loss": 0.73951614, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.76096678, + "num_input_tokens_seen": 230239845, + "step": 10673, + "time_per_iteration": 2.7125818729400635 + }, + { + "auxiliary_loss_clip": 0.01076968, + "auxiliary_loss_mlp": 0.01038267, + "balance_loss_clip": 1.03657365, + "balance_loss_mlp": 1.02410507, + "epoch": 0.6417555989779047, + "flos": 27710002321920.0, + "grad_norm": 25.869198527491033, + "language_loss": 0.69720078, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.71835309, + "num_input_tokens_seen": 230262420, + "step": 10674, + "time_per_iteration": 2.8267860412597656 + }, + { + "auxiliary_loss_clip": 0.01119164, + "auxiliary_loss_mlp": 0.01029301, + "balance_loss_clip": 1.04007125, + "balance_loss_mlp": 1.01571679, + "epoch": 0.6418157222305727, + "flos": 20556632787840.0, + "grad_norm": 1.784339148090001, + "language_loss": 0.66459048, + "learning_rate": 1.201342244560338e-06, + "loss": 0.68607509, + "num_input_tokens_seen": 230279950, + "step": 10675, + "time_per_iteration": 2.6572489738464355 + }, + { + "auxiliary_loss_clip": 0.01117705, + "auxiliary_loss_mlp": 0.01037266, + "balance_loss_clip": 1.04312348, + "balance_loss_mlp": 1.02500582, + "epoch": 0.6418758454832406, + "flos": 22601530062720.0, + "grad_norm": 1.859703676283548, + "language_loss": 0.66479051, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.68634021, + "num_input_tokens_seen": 230299705, + "step": 10676, + "time_per_iteration": 2.6424221992492676 + }, + { + "auxiliary_loss_clip": 0.01119453, + "auxiliary_loss_mlp": 0.01034897, + "balance_loss_clip": 1.04334652, + "balance_loss_mlp": 1.02030003, + "epoch": 0.6419359687359086, + "flos": 27375098889600.0, + "grad_norm": 1.821732847085161, + "language_loss": 0.75731808, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.77886158, + "num_input_tokens_seen": 230320030, + "step": 10677, + "time_per_iteration": 2.651279926300049 + }, + { + "auxiliary_loss_clip": 0.01017238, + "auxiliary_loss_mlp": 0.0100428, + "balance_loss_clip": 1.01344991, + "balance_loss_mlp": 1.00320745, + "epoch": 0.6419960919885765, + "flos": 67251924552960.0, + "grad_norm": 0.7863000332751263, + "language_loss": 0.60634637, + "learning_rate": 1.200271196442818e-06, + "loss": 0.62656152, + "num_input_tokens_seen": 230381495, + "step": 10678, + "time_per_iteration": 3.29689359664917 + }, + { + "auxiliary_loss_clip": 0.01100247, + "auxiliary_loss_mlp": 0.01035517, + "balance_loss_clip": 1.03918314, + "balance_loss_mlp": 1.02296972, + "epoch": 0.6420562152412446, + "flos": 19901873721600.0, + "grad_norm": 1.6874144871208372, + "language_loss": 0.6772809, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.69863856, + "num_input_tokens_seen": 230401385, + "step": 10679, + "time_per_iteration": 2.656188488006592 + }, + { + "auxiliary_loss_clip": 0.01103127, + "auxiliary_loss_mlp": 0.01041549, + "balance_loss_clip": 1.04055119, + "balance_loss_mlp": 1.02634931, + "epoch": 0.6421163384939125, + "flos": 24790177566720.0, + "grad_norm": 2.4808394593739123, + "language_loss": 0.73067611, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.75212288, + "num_input_tokens_seen": 230421340, + "step": 10680, + "time_per_iteration": 2.6635870933532715 + }, + { + "auxiliary_loss_clip": 0.01079924, + "auxiliary_loss_mlp": 0.01028158, + "balance_loss_clip": 1.03821039, + "balance_loss_mlp": 1.01660097, + "epoch": 0.6421764617465805, + "flos": 25592816926080.0, + "grad_norm": 1.6629690093206273, + "language_loss": 0.67730248, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.69838333, + "num_input_tokens_seen": 230441270, + "step": 10681, + "time_per_iteration": 2.7426977157592773 + }, + { + "auxiliary_loss_clip": 0.0111386, + "auxiliary_loss_mlp": 0.01031892, + "balance_loss_clip": 1.04021406, + "balance_loss_mlp": 1.01944685, + "epoch": 0.6422365849992484, + "flos": 14134727813760.0, + "grad_norm": 1.7354882322045777, + "language_loss": 0.74501145, + "learning_rate": 1.198843556910427e-06, + "loss": 0.76646894, + "num_input_tokens_seen": 230457455, + "step": 10682, + "time_per_iteration": 2.5474164485931396 + }, + { + "auxiliary_loss_clip": 0.01051042, + "auxiliary_loss_mlp": 0.01032531, + "balance_loss_clip": 1.03735995, + "balance_loss_mlp": 1.02086592, + "epoch": 0.6422967082519164, + "flos": 22383911514240.0, + "grad_norm": 1.4579009699070558, + "language_loss": 0.79108202, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.81191772, + "num_input_tokens_seen": 230478955, + "step": 10683, + "time_per_iteration": 2.913137435913086 + }, + { + "auxiliary_loss_clip": 0.01118799, + "auxiliary_loss_mlp": 0.01035941, + "balance_loss_clip": 1.04291272, + "balance_loss_mlp": 1.0225358, + "epoch": 0.6423568315045844, + "flos": 14647927380480.0, + "grad_norm": 1.7236127231650058, + "language_loss": 0.67390025, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.69544768, + "num_input_tokens_seen": 230496425, + "step": 10684, + "time_per_iteration": 4.21756386756897 + }, + { + "auxiliary_loss_clip": 0.0110472, + "auxiliary_loss_mlp": 0.0103372, + "balance_loss_clip": 1.04010284, + "balance_loss_mlp": 1.02044034, + "epoch": 0.6424169547572524, + "flos": 26833925606400.0, + "grad_norm": 2.002909718847722, + "language_loss": 0.7144649, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.73584938, + "num_input_tokens_seen": 230516245, + "step": 10685, + "time_per_iteration": 2.715785026550293 + }, + { + "auxiliary_loss_clip": 0.0107774, + "auxiliary_loss_mlp": 0.01037662, + "balance_loss_clip": 1.03614187, + "balance_loss_mlp": 1.02484107, + "epoch": 0.6424770780099204, + "flos": 22707430335360.0, + "grad_norm": 1.5191327003401023, + "language_loss": 0.75144935, + "learning_rate": 1.197416403456935e-06, + "loss": 0.77260327, + "num_input_tokens_seen": 230534745, + "step": 10686, + "time_per_iteration": 4.366745948791504 + }, + { + "auxiliary_loss_clip": 0.01082252, + "auxiliary_loss_mlp": 0.01034259, + "balance_loss_clip": 1.04008722, + "balance_loss_mlp": 1.01991844, + "epoch": 0.6425372012625883, + "flos": 28469512425600.0, + "grad_norm": 6.424850822093427, + "language_loss": 0.68726957, + "learning_rate": 1.197059691144867e-06, + "loss": 0.7084347, + "num_input_tokens_seen": 230555895, + "step": 10687, + "time_per_iteration": 4.32355523109436 + }, + { + "auxiliary_loss_clip": 0.01092278, + "auxiliary_loss_mlp": 0.0103296, + "balance_loss_clip": 1.03951168, + "balance_loss_mlp": 1.02028227, + "epoch": 0.6425973245152563, + "flos": 29351694453120.0, + "grad_norm": 1.9785933660475024, + "language_loss": 0.66424388, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.68549621, + "num_input_tokens_seen": 230577460, + "step": 10688, + "time_per_iteration": 2.8096606731414795 + }, + { + "auxiliary_loss_clip": 0.01114997, + "auxiliary_loss_mlp": 0.01034078, + "balance_loss_clip": 1.04043853, + "balance_loss_mlp": 1.02081013, + "epoch": 0.6426574477679242, + "flos": 16430388912000.0, + "grad_norm": 1.653295180436115, + "language_loss": 0.73148823, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.75297892, + "num_input_tokens_seen": 230595030, + "step": 10689, + "time_per_iteration": 2.5335159301757812 + }, + { + "auxiliary_loss_clip": 0.01097981, + "auxiliary_loss_mlp": 0.01032097, + "balance_loss_clip": 1.04061198, + "balance_loss_mlp": 1.0200088, + "epoch": 0.6427175710205922, + "flos": 21835914647040.0, + "grad_norm": 2.974297200312542, + "language_loss": 0.72271609, + "learning_rate": 1.195989736948226e-06, + "loss": 0.74401689, + "num_input_tokens_seen": 230615135, + "step": 10690, + "time_per_iteration": 2.678732395172119 + }, + { + "auxiliary_loss_clip": 0.01087197, + "auxiliary_loss_mlp": 0.01034747, + "balance_loss_clip": 1.03962326, + "balance_loss_mlp": 1.02202129, + "epoch": 0.6427776942732601, + "flos": 17786627660160.0, + "grad_norm": 1.747376446154191, + "language_loss": 0.77734852, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.79856801, + "num_input_tokens_seen": 230631965, + "step": 10691, + "time_per_iteration": 2.659553050994873 + }, + { + "auxiliary_loss_clip": 0.01094577, + "auxiliary_loss_mlp": 0.01035823, + "balance_loss_clip": 1.03965449, + "balance_loss_mlp": 1.02299619, + "epoch": 0.6428378175259282, + "flos": 15085893911040.0, + "grad_norm": 1.8605559166150418, + "language_loss": 0.74422169, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.76552576, + "num_input_tokens_seen": 230649565, + "step": 10692, + "time_per_iteration": 4.251460790634155 + }, + { + "auxiliary_loss_clip": 0.01104664, + "auxiliary_loss_mlp": 0.01034677, + "balance_loss_clip": 1.04084218, + "balance_loss_mlp": 1.02164721, + "epoch": 0.6428979407785961, + "flos": 23841776816640.0, + "grad_norm": 1.9248860914210837, + "language_loss": 0.61550558, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.63689899, + "num_input_tokens_seen": 230669265, + "step": 10693, + "time_per_iteration": 2.6779651641845703 + }, + { + "auxiliary_loss_clip": 0.01080488, + "auxiliary_loss_mlp": 0.0102922, + "balance_loss_clip": 1.04029202, + "balance_loss_mlp": 1.016065, + "epoch": 0.6429580640312641, + "flos": 32926852892160.0, + "grad_norm": 2.329079095224612, + "language_loss": 0.59532356, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.61642069, + "num_input_tokens_seen": 230690575, + "step": 10694, + "time_per_iteration": 2.8363914489746094 + }, + { + "auxiliary_loss_clip": 0.01089804, + "auxiliary_loss_mlp": 0.01035527, + "balance_loss_clip": 1.03853726, + "balance_loss_mlp": 1.02255106, + "epoch": 0.643018187283932, + "flos": 21068359896960.0, + "grad_norm": 1.4014414192812676, + "language_loss": 0.80109406, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.82234728, + "num_input_tokens_seen": 230709420, + "step": 10695, + "time_per_iteration": 2.6794557571411133 + }, + { + "auxiliary_loss_clip": 0.01116687, + "auxiliary_loss_mlp": 0.01040293, + "balance_loss_clip": 1.04089379, + "balance_loss_mlp": 1.02677488, + "epoch": 0.6430783105366, + "flos": 26724649455360.0, + "grad_norm": 1.7759454400987778, + "language_loss": 0.73687971, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.75844944, + "num_input_tokens_seen": 230729350, + "step": 10696, + "time_per_iteration": 2.7068281173706055 + }, + { + "auxiliary_loss_clip": 0.01078835, + "auxiliary_loss_mlp": 0.01029709, + "balance_loss_clip": 1.03717327, + "balance_loss_mlp": 1.01736438, + "epoch": 0.643138433789268, + "flos": 23696841438720.0, + "grad_norm": 1.6299732646475602, + "language_loss": 0.75820529, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.7792908, + "num_input_tokens_seen": 230749220, + "step": 10697, + "time_per_iteration": 2.8328888416290283 + }, + { + "auxiliary_loss_clip": 0.01091041, + "auxiliary_loss_mlp": 0.01032937, + "balance_loss_clip": 1.03859711, + "balance_loss_mlp": 1.02061689, + "epoch": 0.643198557041936, + "flos": 34202184255360.0, + "grad_norm": 1.3945921589698136, + "language_loss": 0.65932959, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.68056941, + "num_input_tokens_seen": 230770245, + "step": 10698, + "time_per_iteration": 2.784822702407837 + }, + { + "auxiliary_loss_clip": 0.01036478, + "auxiliary_loss_mlp": 0.01005901, + "balance_loss_clip": 1.01277423, + "balance_loss_mlp": 1.00470889, + "epoch": 0.643258680294604, + "flos": 67626473621760.0, + "grad_norm": 0.8642865572859256, + "language_loss": 0.63445872, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.65488249, + "num_input_tokens_seen": 230837030, + "step": 10699, + "time_per_iteration": 3.1397321224212646 + }, + { + "auxiliary_loss_clip": 0.01103425, + "auxiliary_loss_mlp": 0.01028666, + "balance_loss_clip": 1.04155254, + "balance_loss_mlp": 1.01698923, + "epoch": 0.6433188035472719, + "flos": 25185984508800.0, + "grad_norm": 1.8812795881876412, + "language_loss": 0.69277722, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.71409816, + "num_input_tokens_seen": 230856845, + "step": 10700, + "time_per_iteration": 2.6566555500030518 + }, + { + "auxiliary_loss_clip": 0.01115928, + "auxiliary_loss_mlp": 0.01028377, + "balance_loss_clip": 1.04087234, + "balance_loss_mlp": 1.01547289, + "epoch": 0.6433789267999399, + "flos": 24973573432320.0, + "grad_norm": 2.050726314143076, + "language_loss": 0.7285673, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.75001037, + "num_input_tokens_seen": 230878785, + "step": 10701, + "time_per_iteration": 2.7663381099700928 + }, + { + "auxiliary_loss_clip": 0.01106257, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.03919315, + "balance_loss_mlp": 1.01695347, + "epoch": 0.6434390500526078, + "flos": 17566028282880.0, + "grad_norm": 1.983939492381853, + "language_loss": 0.82094157, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.84232259, + "num_input_tokens_seen": 230895445, + "step": 10702, + "time_per_iteration": 2.634734630584717 + }, + { + "auxiliary_loss_clip": 0.01084567, + "auxiliary_loss_mlp": 0.01040406, + "balance_loss_clip": 1.03733373, + "balance_loss_mlp": 1.02802002, + "epoch": 0.6434991733052758, + "flos": 20843594542080.0, + "grad_norm": 2.1366744665576536, + "language_loss": 0.74528348, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.76653326, + "num_input_tokens_seen": 230911375, + "step": 10703, + "time_per_iteration": 2.712024688720703 + }, + { + "auxiliary_loss_clip": 0.00980042, + "auxiliary_loss_mlp": 0.01002542, + "balance_loss_clip": 1.00990796, + "balance_loss_mlp": 1.00124288, + "epoch": 0.6435592965579437, + "flos": 66094596345600.0, + "grad_norm": 0.6668164665543085, + "language_loss": 0.54507017, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.56489605, + "num_input_tokens_seen": 230975990, + "step": 10704, + "time_per_iteration": 3.391496419906616 + }, + { + "auxiliary_loss_clip": 0.01074279, + "auxiliary_loss_mlp": 0.01024183, + "balance_loss_clip": 1.03965342, + "balance_loss_mlp": 1.01269126, + "epoch": 0.6436194198106118, + "flos": 23768842250880.0, + "grad_norm": 1.6398007726436414, + "language_loss": 0.76942575, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.79041034, + "num_input_tokens_seen": 230997110, + "step": 10705, + "time_per_iteration": 3.151123523712158 + }, + { + "auxiliary_loss_clip": 0.01080341, + "auxiliary_loss_mlp": 0.01040696, + "balance_loss_clip": 1.03794503, + "balance_loss_mlp": 1.02824438, + "epoch": 0.6436795430632797, + "flos": 20230312705920.0, + "grad_norm": 1.6220851966206657, + "language_loss": 0.78966212, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.81087244, + "num_input_tokens_seen": 231015590, + "step": 10706, + "time_per_iteration": 2.7351467609405518 + }, + { + "auxiliary_loss_clip": 0.01073614, + "auxiliary_loss_mlp": 0.01037334, + "balance_loss_clip": 1.03537798, + "balance_loss_mlp": 1.02350581, + "epoch": 0.6437396663159477, + "flos": 20301846641280.0, + "grad_norm": 1.995060337991945, + "language_loss": 0.80729055, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.82840002, + "num_input_tokens_seen": 231033800, + "step": 10707, + "time_per_iteration": 2.8090367317199707 + }, + { + "auxiliary_loss_clip": 0.01102074, + "auxiliary_loss_mlp": 0.01034341, + "balance_loss_clip": 1.03903484, + "balance_loss_mlp": 1.02176499, + "epoch": 0.6437997895686156, + "flos": 23878585278720.0, + "grad_norm": 1.8783086721412918, + "language_loss": 0.85947567, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.88083982, + "num_input_tokens_seen": 231053160, + "step": 10708, + "time_per_iteration": 2.7102444171905518 + }, + { + "auxiliary_loss_clip": 0.01070026, + "auxiliary_loss_mlp": 0.0104392, + "balance_loss_clip": 1.04000461, + "balance_loss_mlp": 1.02895367, + "epoch": 0.6438599128212836, + "flos": 18989275852800.0, + "grad_norm": 2.169380763975439, + "language_loss": 0.65262228, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.67376173, + "num_input_tokens_seen": 231069470, + "step": 10709, + "time_per_iteration": 2.6978535652160645 + }, + { + "auxiliary_loss_clip": 0.01115477, + "auxiliary_loss_mlp": 0.01032813, + "balance_loss_clip": 1.04076731, + "balance_loss_mlp": 1.02048671, + "epoch": 0.6439200360739517, + "flos": 24096347481600.0, + "grad_norm": 1.8116959175260157, + "language_loss": 0.80929708, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.83077991, + "num_input_tokens_seen": 231088205, + "step": 10710, + "time_per_iteration": 2.6809825897216797 + }, + { + "auxiliary_loss_clip": 0.0110175, + "auxiliary_loss_mlp": 0.01032632, + "balance_loss_clip": 1.03748906, + "balance_loss_mlp": 1.0203414, + "epoch": 0.6439801593266196, + "flos": 31902141697920.0, + "grad_norm": 2.6140299044708106, + "language_loss": 0.6634506, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.68479443, + "num_input_tokens_seen": 231107850, + "step": 10711, + "time_per_iteration": 2.71571946144104 + }, + { + "auxiliary_loss_clip": 0.01077359, + "auxiliary_loss_mlp": 0.01033147, + "balance_loss_clip": 1.0414753, + "balance_loss_mlp": 1.02000391, + "epoch": 0.6440402825792876, + "flos": 27125879351040.0, + "grad_norm": 2.2683722533974437, + "language_loss": 0.78656554, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.80767059, + "num_input_tokens_seen": 231127200, + "step": 10712, + "time_per_iteration": 2.785280466079712 + }, + { + "auxiliary_loss_clip": 0.01103094, + "auxiliary_loss_mlp": 0.01037973, + "balance_loss_clip": 1.03856206, + "balance_loss_mlp": 1.02487159, + "epoch": 0.6441004058319555, + "flos": 20667704618880.0, + "grad_norm": 1.6337129224497011, + "language_loss": 0.82845241, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.84986305, + "num_input_tokens_seen": 231146360, + "step": 10713, + "time_per_iteration": 2.6682519912719727 + }, + { + "auxiliary_loss_clip": 0.01111989, + "auxiliary_loss_mlp": 0.0103674, + "balance_loss_clip": 1.04118943, + "balance_loss_mlp": 1.02455091, + "epoch": 0.6441605290846235, + "flos": 26026006947840.0, + "grad_norm": 1.377683768387238, + "language_loss": 0.78550875, + "learning_rate": 1.187440012188684e-06, + "loss": 0.80699605, + "num_input_tokens_seen": 231168350, + "step": 10714, + "time_per_iteration": 2.6294350624084473 + }, + { + "auxiliary_loss_clip": 0.01081537, + "auxiliary_loss_mlp": 0.01031396, + "balance_loss_clip": 1.03937292, + "balance_loss_mlp": 1.01982093, + "epoch": 0.6442206523372914, + "flos": 24899489631360.0, + "grad_norm": 1.6804962466974145, + "language_loss": 0.8137539, + "learning_rate": 1.187084157517583e-06, + "loss": 0.83488327, + "num_input_tokens_seen": 231188385, + "step": 10715, + "time_per_iteration": 2.7179040908813477 + }, + { + "auxiliary_loss_clip": 0.01083275, + "auxiliary_loss_mlp": 0.01033506, + "balance_loss_clip": 1.03462327, + "balance_loss_mlp": 1.02041125, + "epoch": 0.6442807755899594, + "flos": 25156322853120.0, + "grad_norm": 2.56330690161098, + "language_loss": 0.81656396, + "learning_rate": 1.186728333672332e-06, + "loss": 0.83773172, + "num_input_tokens_seen": 231209880, + "step": 10716, + "time_per_iteration": 2.71616268157959 + }, + { + "auxiliary_loss_clip": 0.01080679, + "auxiliary_loss_mlp": 0.01037142, + "balance_loss_clip": 1.03870273, + "balance_loss_mlp": 1.02335536, + "epoch": 0.6443408988426274, + "flos": 27344503480320.0, + "grad_norm": 2.019166193158946, + "language_loss": 0.78575444, + "learning_rate": 1.186372540666424e-06, + "loss": 0.80693269, + "num_input_tokens_seen": 231230765, + "step": 10717, + "time_per_iteration": 2.7821998596191406 + }, + { + "auxiliary_loss_clip": 0.01111081, + "auxiliary_loss_mlp": 0.01033784, + "balance_loss_clip": 1.03954279, + "balance_loss_mlp": 1.0215416, + "epoch": 0.6444010220952954, + "flos": 27928339142400.0, + "grad_norm": 1.554880211694131, + "language_loss": 0.68287563, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.70432431, + "num_input_tokens_seen": 231252350, + "step": 10718, + "time_per_iteration": 2.619870662689209 + }, + { + "auxiliary_loss_clip": 0.01025406, + "auxiliary_loss_mlp": 0.01008951, + "balance_loss_clip": 1.01146674, + "balance_loss_mlp": 1.00788391, + "epoch": 0.6444611453479633, + "flos": 71215024855680.0, + "grad_norm": 0.7631804630715925, + "language_loss": 0.49633595, + "learning_rate": 1.185661047226603e-06, + "loss": 0.51667953, + "num_input_tokens_seen": 231313865, + "step": 10719, + "time_per_iteration": 3.3252131938934326 + }, + { + "auxiliary_loss_clip": 0.01118591, + "auxiliary_loss_mlp": 0.01039818, + "balance_loss_clip": 1.04287648, + "balance_loss_mlp": 1.02602601, + "epoch": 0.6445212686006313, + "flos": 22705131864960.0, + "grad_norm": 2.1022111741366603, + "language_loss": 0.77604353, + "learning_rate": 1.18530534681967e-06, + "loss": 0.79762757, + "num_input_tokens_seen": 231331710, + "step": 10720, + "time_per_iteration": 2.6171679496765137 + }, + { + "auxiliary_loss_clip": 0.01094489, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.04128611, + "balance_loss_mlp": 1.02126074, + "epoch": 0.6445813918532992, + "flos": 21178821196800.0, + "grad_norm": 1.7066840296237504, + "language_loss": 0.76980746, + "learning_rate": 1.18494967730604e-06, + "loss": 0.79110014, + "num_input_tokens_seen": 231350705, + "step": 10721, + "time_per_iteration": 2.8883464336395264 + }, + { + "auxiliary_loss_clip": 0.01077386, + "auxiliary_loss_mlp": 0.01035031, + "balance_loss_clip": 1.03889298, + "balance_loss_mlp": 1.02178049, + "epoch": 0.6446415151059672, + "flos": 25191910252800.0, + "grad_norm": 2.156937552750908, + "language_loss": 0.73425972, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.75538391, + "num_input_tokens_seen": 231369550, + "step": 10722, + "time_per_iteration": 3.0992050170898438 + }, + { + "auxiliary_loss_clip": 0.0111233, + "auxiliary_loss_mlp": 0.01033624, + "balance_loss_clip": 1.03991735, + "balance_loss_mlp": 1.02135682, + "epoch": 0.6447016383586353, + "flos": 25302227898240.0, + "grad_norm": 1.8325068766714112, + "language_loss": 0.77818036, + "learning_rate": 1.184238431012635e-06, + "loss": 0.79963994, + "num_input_tokens_seen": 231389285, + "step": 10723, + "time_per_iteration": 2.6199328899383545 + }, + { + "auxiliary_loss_clip": 0.01104393, + "auxiliary_loss_mlp": 0.01038636, + "balance_loss_clip": 1.03816402, + "balance_loss_mlp": 1.02488565, + "epoch": 0.6447617616113032, + "flos": 27703142824320.0, + "grad_norm": 2.2443871002503903, + "language_loss": 0.58686608, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.60829639, + "num_input_tokens_seen": 231408820, + "step": 10724, + "time_per_iteration": 4.554950475692749 + }, + { + "auxiliary_loss_clip": 0.01102176, + "auxiliary_loss_mlp": 0.01033682, + "balance_loss_clip": 1.0418992, + "balance_loss_mlp": 1.02188635, + "epoch": 0.6448218848639712, + "flos": 23039101543680.0, + "grad_norm": 1.7131170240074274, + "language_loss": 0.83707219, + "learning_rate": 1.183527308454271e-06, + "loss": 0.8584308, + "num_input_tokens_seen": 231428100, + "step": 10725, + "time_per_iteration": 2.5963871479034424 + }, + { + "auxiliary_loss_clip": 0.01089104, + "auxiliary_loss_mlp": 0.01037801, + "balance_loss_clip": 1.03586388, + "balance_loss_mlp": 1.02444363, + "epoch": 0.6448820081166391, + "flos": 24496104919680.0, + "grad_norm": 1.7945503193220944, + "language_loss": 0.82327414, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.84454322, + "num_input_tokens_seen": 231445810, + "step": 10726, + "time_per_iteration": 6.177702188491821 + }, + { + "auxiliary_loss_clip": 0.0110184, + "auxiliary_loss_mlp": 0.01037744, + "balance_loss_clip": 1.03911293, + "balance_loss_mlp": 1.02391601, + "epoch": 0.6449421313693071, + "flos": 22419283432320.0, + "grad_norm": 5.950779634023435, + "language_loss": 0.81306756, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.83446342, + "num_input_tokens_seen": 231463570, + "step": 10727, + "time_per_iteration": 2.646756172180176 + }, + { + "auxiliary_loss_clip": 0.01114052, + "auxiliary_loss_mlp": 0.01035116, + "balance_loss_clip": 1.04432821, + "balance_loss_mlp": 1.02101326, + "epoch": 0.645002254621975, + "flos": 20225715765120.0, + "grad_norm": 2.0767423550252047, + "language_loss": 0.79137063, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.81286234, + "num_input_tokens_seen": 231482155, + "step": 10728, + "time_per_iteration": 2.6014702320098877 + }, + { + "auxiliary_loss_clip": 0.01018281, + "auxiliary_loss_mlp": 0.01043432, + "balance_loss_clip": 1.03341746, + "balance_loss_mlp": 1.02857876, + "epoch": 0.645062377874643, + "flos": 27855440490240.0, + "grad_norm": 1.6698019924695346, + "language_loss": 0.74069214, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.76130933, + "num_input_tokens_seen": 231502465, + "step": 10729, + "time_per_iteration": 2.9942080974578857 + }, + { + "auxiliary_loss_clip": 0.01072033, + "auxiliary_loss_mlp": 0.01034846, + "balance_loss_clip": 1.03895199, + "balance_loss_mlp": 1.0206902, + "epoch": 0.645122501127311, + "flos": 25301509626240.0, + "grad_norm": 1.675292027703949, + "language_loss": 0.66314375, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.68421257, + "num_input_tokens_seen": 231522740, + "step": 10730, + "time_per_iteration": 3.029480218887329 + }, + { + "auxiliary_loss_clip": 0.01053326, + "auxiliary_loss_mlp": 0.01035886, + "balance_loss_clip": 1.03969455, + "balance_loss_mlp": 1.02077615, + "epoch": 0.645182624379979, + "flos": 18807352444800.0, + "grad_norm": 1.6301580114634824, + "language_loss": 0.63516945, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.65606159, + "num_input_tokens_seen": 231542050, + "step": 10731, + "time_per_iteration": 4.425801038742065 + }, + { + "auxiliary_loss_clip": 0.01111857, + "auxiliary_loss_mlp": 0.01032419, + "balance_loss_clip": 1.03885424, + "balance_loss_mlp": 1.01941907, + "epoch": 0.6452427476326469, + "flos": 18332182402560.0, + "grad_norm": 1.6688797138193545, + "language_loss": 0.68021357, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.70165634, + "num_input_tokens_seen": 231560380, + "step": 10732, + "time_per_iteration": 2.531669855117798 + }, + { + "auxiliary_loss_clip": 0.01104232, + "auxiliary_loss_mlp": 0.01036786, + "balance_loss_clip": 1.04108346, + "balance_loss_mlp": 1.0236969, + "epoch": 0.6453028708853149, + "flos": 22784746360320.0, + "grad_norm": 2.2675077381725557, + "language_loss": 0.75637865, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.77778876, + "num_input_tokens_seen": 231580810, + "step": 10733, + "time_per_iteration": 2.6263926029205322 + }, + { + "auxiliary_loss_clip": 0.01104718, + "auxiliary_loss_mlp": 0.01039721, + "balance_loss_clip": 1.03942811, + "balance_loss_mlp": 1.02548099, + "epoch": 0.6453629941379828, + "flos": 23945989150080.0, + "grad_norm": 2.5422080980889903, + "language_loss": 0.66799378, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.6894381, + "num_input_tokens_seen": 231600585, + "step": 10734, + "time_per_iteration": 2.639566421508789 + }, + { + "auxiliary_loss_clip": 0.01113842, + "auxiliary_loss_mlp": 0.01041504, + "balance_loss_clip": 1.04339838, + "balance_loss_mlp": 1.028898, + "epoch": 0.6454231173906508, + "flos": 17676381841920.0, + "grad_norm": 1.794099580406708, + "language_loss": 0.73622543, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.75777888, + "num_input_tokens_seen": 231618765, + "step": 10735, + "time_per_iteration": 2.5158708095550537 + }, + { + "auxiliary_loss_clip": 0.01052163, + "auxiliary_loss_mlp": 0.00771954, + "balance_loss_clip": 1.03596699, + "balance_loss_mlp": 1.00020361, + "epoch": 0.6454832406433189, + "flos": 23292774368640.0, + "grad_norm": 1.8433870916344732, + "language_loss": 0.74927819, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.76751935, + "num_input_tokens_seen": 231638525, + "step": 10736, + "time_per_iteration": 2.781177282333374 + }, + { + "auxiliary_loss_clip": 0.01109179, + "auxiliary_loss_mlp": 0.01033432, + "balance_loss_clip": 1.04235053, + "balance_loss_mlp": 1.01909697, + "epoch": 0.6455433638959868, + "flos": 20157198572160.0, + "grad_norm": 1.9123509169430688, + "language_loss": 0.70616424, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.72759038, + "num_input_tokens_seen": 231656785, + "step": 10737, + "time_per_iteration": 2.5800046920776367 + }, + { + "auxiliary_loss_clip": 0.0102545, + "auxiliary_loss_mlp": 0.01002929, + "balance_loss_clip": 1.01085997, + "balance_loss_mlp": 1.00164151, + "epoch": 0.6456034871486548, + "flos": 66532922012160.0, + "grad_norm": 0.7817772178911736, + "language_loss": 0.58405674, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.60434055, + "num_input_tokens_seen": 231719075, + "step": 10738, + "time_per_iteration": 3.238203287124634 + }, + { + "auxiliary_loss_clip": 0.01079809, + "auxiliary_loss_mlp": 0.01029827, + "balance_loss_clip": 1.0387454, + "balance_loss_mlp": 1.01666009, + "epoch": 0.6456636104013227, + "flos": 24206090509440.0, + "grad_norm": 1.920167100598176, + "language_loss": 0.74507523, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.76617157, + "num_input_tokens_seen": 231737810, + "step": 10739, + "time_per_iteration": 2.704909324645996 + }, + { + "auxiliary_loss_clip": 0.01096514, + "auxiliary_loss_mlp": 0.00771409, + "balance_loss_clip": 1.04137897, + "balance_loss_mlp": 1.00027609, + "epoch": 0.6457237336539907, + "flos": 23624086440960.0, + "grad_norm": 1.8028230929667255, + "language_loss": 0.70776832, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.72644746, + "num_input_tokens_seen": 231756140, + "step": 10740, + "time_per_iteration": 2.6947245597839355 + }, + { + "auxiliary_loss_clip": 0.01016337, + "auxiliary_loss_mlp": 0.01004394, + "balance_loss_clip": 1.01068592, + "balance_loss_mlp": 1.00314224, + "epoch": 0.6457838569066586, + "flos": 65846023251840.0, + "grad_norm": 0.8728350789543404, + "language_loss": 0.55255193, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.57275927, + "num_input_tokens_seen": 231823665, + "step": 10741, + "time_per_iteration": 3.214613676071167 + }, + { + "auxiliary_loss_clip": 0.01113695, + "auxiliary_loss_mlp": 0.01034634, + "balance_loss_clip": 1.04090226, + "balance_loss_mlp": 1.02212918, + "epoch": 0.6458439801593266, + "flos": 22381972179840.0, + "grad_norm": 1.5851201591734638, + "language_loss": 0.80647045, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.8279537, + "num_input_tokens_seen": 231844500, + "step": 10742, + "time_per_iteration": 2.6147494316101074 + }, + { + "auxiliary_loss_clip": 0.01089275, + "auxiliary_loss_mlp": 0.01034555, + "balance_loss_clip": 1.03800607, + "balance_loss_mlp": 1.02160883, + "epoch": 0.6459041034119946, + "flos": 24789243813120.0, + "grad_norm": 1.493920390788815, + "language_loss": 0.81934315, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.84058142, + "num_input_tokens_seen": 231864510, + "step": 10743, + "time_per_iteration": 2.7598674297332764 + }, + { + "auxiliary_loss_clip": 0.01088471, + "auxiliary_loss_mlp": 0.01032232, + "balance_loss_clip": 1.03786039, + "balance_loss_mlp": 1.01933324, + "epoch": 0.6459642266646626, + "flos": 18325358818560.0, + "grad_norm": 5.256757204998113, + "language_loss": 0.7177366, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.73894364, + "num_input_tokens_seen": 231881555, + "step": 10744, + "time_per_iteration": 2.620422840118408 + }, + { + "auxiliary_loss_clip": 0.01114623, + "auxiliary_loss_mlp": 0.01029271, + "balance_loss_clip": 1.04074514, + "balance_loss_mlp": 1.01683736, + "epoch": 0.6460243499173305, + "flos": 43581368891520.0, + "grad_norm": 1.6850885635931934, + "language_loss": 0.66688418, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.68832302, + "num_input_tokens_seen": 231905945, + "step": 10745, + "time_per_iteration": 2.7924861907958984 + }, + { + "auxiliary_loss_clip": 0.01101668, + "auxiliary_loss_mlp": 0.01034902, + "balance_loss_clip": 1.03878927, + "balance_loss_mlp": 1.02122271, + "epoch": 0.6460844731699985, + "flos": 19244026085760.0, + "grad_norm": 2.3841357931880536, + "language_loss": 0.73933601, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.76070166, + "num_input_tokens_seen": 231922535, + "step": 10746, + "time_per_iteration": 2.607113838195801 + }, + { + "auxiliary_loss_clip": 0.01106683, + "auxiliary_loss_mlp": 0.01035848, + "balance_loss_clip": 1.04162467, + "balance_loss_mlp": 1.02289009, + "epoch": 0.6461445964226664, + "flos": 27453348668160.0, + "grad_norm": 1.3562492191561222, + "language_loss": 0.66809833, + "learning_rate": 1.175713157660413e-06, + "loss": 0.6895237, + "num_input_tokens_seen": 231944800, + "step": 10747, + "time_per_iteration": 2.7339725494384766 + }, + { + "auxiliary_loss_clip": 0.01082798, + "auxiliary_loss_mlp": 0.0104212, + "balance_loss_clip": 1.03962016, + "balance_loss_mlp": 1.02953124, + "epoch": 0.6462047196753344, + "flos": 20295489934080.0, + "grad_norm": 1.7696623956762259, + "language_loss": 0.67370367, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.69495285, + "num_input_tokens_seen": 231962970, + "step": 10748, + "time_per_iteration": 2.733555555343628 + }, + { + "auxiliary_loss_clip": 0.01117812, + "auxiliary_loss_mlp": 0.01044313, + "balance_loss_clip": 1.04119956, + "balance_loss_mlp": 1.03015089, + "epoch": 0.6462648429280025, + "flos": 22018340845440.0, + "grad_norm": 1.9035207458082712, + "language_loss": 0.75889313, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.78051442, + "num_input_tokens_seen": 231981195, + "step": 10749, + "time_per_iteration": 2.6402747631073 + }, + { + "auxiliary_loss_clip": 0.01075833, + "auxiliary_loss_mlp": 0.01041632, + "balance_loss_clip": 1.03445184, + "balance_loss_mlp": 1.02752352, + "epoch": 0.6463249661806704, + "flos": 27781141207680.0, + "grad_norm": 1.5147294862876182, + "language_loss": 0.77007931, + "learning_rate": 1.17464876058473e-06, + "loss": 0.79125392, + "num_input_tokens_seen": 232001735, + "step": 10750, + "time_per_iteration": 2.7375411987304688 + }, + { + "auxiliary_loss_clip": 0.01097872, + "auxiliary_loss_mlp": 0.01038153, + "balance_loss_clip": 1.03953791, + "balance_loss_mlp": 1.02282298, + "epoch": 0.6463850894333384, + "flos": 22050588280320.0, + "grad_norm": 2.1693323351013496, + "language_loss": 0.68254787, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.70390815, + "num_input_tokens_seen": 232019830, + "step": 10751, + "time_per_iteration": 2.757457733154297 + }, + { + "auxiliary_loss_clip": 0.01088079, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.03963614, + "balance_loss_mlp": 1.0185945, + "epoch": 0.6464452126860063, + "flos": 21106245767040.0, + "grad_norm": 1.9208554879181607, + "language_loss": 0.71538639, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.73659164, + "num_input_tokens_seen": 232039625, + "step": 10752, + "time_per_iteration": 2.702068328857422 + }, + { + "auxiliary_loss_clip": 0.0108316, + "auxiliary_loss_mlp": 0.0104047, + "balance_loss_clip": 1.03569722, + "balance_loss_mlp": 1.02468061, + "epoch": 0.6465053359386743, + "flos": 16028045694720.0, + "grad_norm": 1.6304463713193273, + "language_loss": 0.78174138, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.80297774, + "num_input_tokens_seen": 232055855, + "step": 10753, + "time_per_iteration": 2.679288387298584 + }, + { + "auxiliary_loss_clip": 0.01114663, + "auxiliary_loss_mlp": 0.01041928, + "balance_loss_clip": 1.04108715, + "balance_loss_mlp": 1.02888012, + "epoch": 0.6465654591913422, + "flos": 23398674641280.0, + "grad_norm": 1.8389919923642137, + "language_loss": 0.85325253, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.87481844, + "num_input_tokens_seen": 232073475, + "step": 10754, + "time_per_iteration": 2.7047979831695557 + }, + { + "auxiliary_loss_clip": 0.01089928, + "auxiliary_loss_mlp": 0.01033615, + "balance_loss_clip": 1.0371294, + "balance_loss_mlp": 1.02018571, + "epoch": 0.6466255824440102, + "flos": 15377273038080.0, + "grad_norm": 2.0086067487297203, + "language_loss": 0.596542, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.61777741, + "num_input_tokens_seen": 232091090, + "step": 10755, + "time_per_iteration": 2.660458564758301 + }, + { + "auxiliary_loss_clip": 0.01070404, + "auxiliary_loss_mlp": 0.01034574, + "balance_loss_clip": 1.03757024, + "balance_loss_mlp": 1.02103186, + "epoch": 0.6466857056966782, + "flos": 16252846963200.0, + "grad_norm": 2.348911212047805, + "language_loss": 0.68158704, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.70263684, + "num_input_tokens_seen": 232107320, + "step": 10756, + "time_per_iteration": 2.667661190032959 + }, + { + "auxiliary_loss_clip": 0.0107653, + "auxiliary_loss_mlp": 0.01039991, + "balance_loss_clip": 1.03933072, + "balance_loss_mlp": 1.02511406, + "epoch": 0.6467458289493462, + "flos": 21178246579200.0, + "grad_norm": 2.3037886815422772, + "language_loss": 0.74333578, + "learning_rate": 1.172166263444844e-06, + "loss": 0.76450104, + "num_input_tokens_seen": 232123930, + "step": 10757, + "time_per_iteration": 2.752260446548462 + }, + { + "auxiliary_loss_clip": 0.01064083, + "auxiliary_loss_mlp": 0.01037594, + "balance_loss_clip": 1.0400213, + "balance_loss_mlp": 1.02434397, + "epoch": 0.6468059522020141, + "flos": 17968299672960.0, + "grad_norm": 1.4896032445983383, + "language_loss": 0.74085969, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.76187646, + "num_input_tokens_seen": 232142905, + "step": 10758, + "time_per_iteration": 2.752277135848999 + }, + { + "auxiliary_loss_clip": 0.01078484, + "auxiliary_loss_mlp": 0.0103444, + "balance_loss_clip": 1.04134357, + "balance_loss_mlp": 1.02081478, + "epoch": 0.6468660754546821, + "flos": 17890157635200.0, + "grad_norm": 1.5569302711566517, + "language_loss": 0.67830229, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.69943154, + "num_input_tokens_seen": 232162230, + "step": 10759, + "time_per_iteration": 2.6961419582366943 + }, + { + "auxiliary_loss_clip": 0.01078582, + "auxiliary_loss_mlp": 0.01038565, + "balance_loss_clip": 1.03437579, + "balance_loss_mlp": 1.02430177, + "epoch": 0.64692619870735, + "flos": 22600991358720.0, + "grad_norm": 1.7675629477863553, + "language_loss": 0.75511646, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.77628791, + "num_input_tokens_seen": 232182700, + "step": 10760, + "time_per_iteration": 2.7628531455993652 + }, + { + "auxiliary_loss_clip": 0.01088869, + "auxiliary_loss_mlp": 0.01035724, + "balance_loss_clip": 1.03735101, + "balance_loss_mlp": 1.02188993, + "epoch": 0.646986321960018, + "flos": 49600786993920.0, + "grad_norm": 1.635479063212096, + "language_loss": 0.65361971, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.6748656, + "num_input_tokens_seen": 232208235, + "step": 10761, + "time_per_iteration": 2.939115047454834 + }, + { + "auxiliary_loss_clip": 0.01069611, + "auxiliary_loss_mlp": 0.01035372, + "balance_loss_clip": 1.03998923, + "balance_loss_mlp": 1.02115035, + "epoch": 0.6470464452126861, + "flos": 21908454163200.0, + "grad_norm": 2.1978879485100014, + "language_loss": 0.6946497, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.71569955, + "num_input_tokens_seen": 232228720, + "step": 10762, + "time_per_iteration": 4.4654014110565186 + }, + { + "auxiliary_loss_clip": 0.01117949, + "auxiliary_loss_mlp": 0.01037436, + "balance_loss_clip": 1.04075444, + "balance_loss_mlp": 1.02360213, + "epoch": 0.647106568465354, + "flos": 18106124158080.0, + "grad_norm": 1.972655429723057, + "language_loss": 0.82998466, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.85153854, + "num_input_tokens_seen": 232244655, + "step": 10763, + "time_per_iteration": 2.592090129852295 + }, + { + "auxiliary_loss_clip": 0.0103456, + "auxiliary_loss_mlp": 0.01005031, + "balance_loss_clip": 1.01049972, + "balance_loss_mlp": 1.00385058, + "epoch": 0.647166691718022, + "flos": 69480038125440.0, + "grad_norm": 0.712357320853497, + "language_loss": 0.57828617, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.59868205, + "num_input_tokens_seen": 232308685, + "step": 10764, + "time_per_iteration": 3.3077809810638428 + }, + { + "auxiliary_loss_clip": 0.01077866, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.03704214, + "balance_loss_mlp": 1.02015924, + "epoch": 0.6472268149706899, + "flos": 34095170661120.0, + "grad_norm": 2.021573071850794, + "language_loss": 0.6068002, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.62791574, + "num_input_tokens_seen": 232327520, + "step": 10765, + "time_per_iteration": 2.940326690673828 + }, + { + "auxiliary_loss_clip": 0.01113775, + "auxiliary_loss_mlp": 0.01033181, + "balance_loss_clip": 1.04050612, + "balance_loss_mlp": 1.02059865, + "epoch": 0.6472869382233579, + "flos": 28111232217600.0, + "grad_norm": 1.7427036976648405, + "language_loss": 0.62848121, + "learning_rate": 1.168976742243437e-06, + "loss": 0.64995074, + "num_input_tokens_seen": 232349025, + "step": 10766, + "time_per_iteration": 5.861475229263306 + }, + { + "auxiliary_loss_clip": 0.01090186, + "auxiliary_loss_mlp": 0.01036411, + "balance_loss_clip": 1.04002905, + "balance_loss_mlp": 1.02172494, + "epoch": 0.6473470614760258, + "flos": 22492146170880.0, + "grad_norm": 2.0617673547917255, + "language_loss": 0.75767088, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.77893686, + "num_input_tokens_seen": 232367835, + "step": 10767, + "time_per_iteration": 2.7045323848724365 + }, + { + "auxiliary_loss_clip": 0.01096864, + "auxiliary_loss_mlp": 0.01033099, + "balance_loss_clip": 1.03984213, + "balance_loss_mlp": 1.02028418, + "epoch": 0.6474071847286939, + "flos": 14538938538240.0, + "grad_norm": 1.9988107632557572, + "language_loss": 0.78334147, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.80464113, + "num_input_tokens_seen": 232385840, + "step": 10768, + "time_per_iteration": 2.603180170059204 + }, + { + "auxiliary_loss_clip": 0.01056997, + "auxiliary_loss_mlp": 0.01034297, + "balance_loss_clip": 1.03838003, + "balance_loss_mlp": 1.02096355, + "epoch": 0.6474673079813618, + "flos": 24098214988800.0, + "grad_norm": 1.607650242718932, + "language_loss": 0.71857584, + "learning_rate": 1.167914135250663e-06, + "loss": 0.73948884, + "num_input_tokens_seen": 232406205, + "step": 10769, + "time_per_iteration": 2.7530863285064697 + }, + { + "auxiliary_loss_clip": 0.01113406, + "auxiliary_loss_mlp": 0.0103478, + "balance_loss_clip": 1.04209769, + "balance_loss_mlp": 1.02214372, + "epoch": 0.6475274312340298, + "flos": 14976186796800.0, + "grad_norm": 1.9573022312706896, + "language_loss": 0.71980953, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.74129134, + "num_input_tokens_seen": 232424995, + "step": 10770, + "time_per_iteration": 4.22503137588501 + }, + { + "auxiliary_loss_clip": 0.01073177, + "auxiliary_loss_mlp": 0.01031965, + "balance_loss_clip": 1.03501081, + "balance_loss_mlp": 1.01759458, + "epoch": 0.6475875544866977, + "flos": 25045322849280.0, + "grad_norm": 1.5542236081497367, + "language_loss": 0.73281699, + "learning_rate": 1.167205888330325e-06, + "loss": 0.75386834, + "num_input_tokens_seen": 232445870, + "step": 10771, + "time_per_iteration": 2.841069459915161 + }, + { + "auxiliary_loss_clip": 0.01074703, + "auxiliary_loss_mlp": 0.0103808, + "balance_loss_clip": 1.03516805, + "balance_loss_mlp": 1.02413297, + "epoch": 0.6476476777393657, + "flos": 16472153450880.0, + "grad_norm": 1.9087232907246778, + "language_loss": 0.74044871, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.76157653, + "num_input_tokens_seen": 232464285, + "step": 10772, + "time_per_iteration": 2.775754690170288 + }, + { + "auxiliary_loss_clip": 0.01088465, + "auxiliary_loss_mlp": 0.01031281, + "balance_loss_clip": 1.03951991, + "balance_loss_mlp": 1.01950288, + "epoch": 0.6477078009920336, + "flos": 25812267068160.0, + "grad_norm": 1.563820733818388, + "language_loss": 0.8277418, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.84893924, + "num_input_tokens_seen": 232485815, + "step": 10773, + "time_per_iteration": 2.7739098072052 + }, + { + "auxiliary_loss_clip": 0.01100228, + "auxiliary_loss_mlp": 0.00769385, + "balance_loss_clip": 1.03956735, + "balance_loss_mlp": 1.00008345, + "epoch": 0.6477679242447016, + "flos": 17676130446720.0, + "grad_norm": 1.451687382466444, + "language_loss": 0.78496003, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.80365622, + "num_input_tokens_seen": 232504875, + "step": 10774, + "time_per_iteration": 2.7035605907440186 + }, + { + "auxiliary_loss_clip": 0.01104625, + "auxiliary_loss_mlp": 0.01040629, + "balance_loss_clip": 1.04012299, + "balance_loss_mlp": 1.02751637, + "epoch": 0.6478280474973696, + "flos": 21032305620480.0, + "grad_norm": 2.3182968489247986, + "language_loss": 0.68886763, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.71032017, + "num_input_tokens_seen": 232521945, + "step": 10775, + "time_per_iteration": 2.7283878326416016 + }, + { + "auxiliary_loss_clip": 0.01078255, + "auxiliary_loss_mlp": 0.0104184, + "balance_loss_clip": 1.03620017, + "balance_loss_mlp": 1.02827358, + "epoch": 0.6478881707500376, + "flos": 21616931381760.0, + "grad_norm": 1.867125007130101, + "language_loss": 0.65918481, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.68038571, + "num_input_tokens_seen": 232541500, + "step": 10776, + "time_per_iteration": 2.792161226272583 + }, + { + "auxiliary_loss_clip": 0.01086281, + "auxiliary_loss_mlp": 0.01040573, + "balance_loss_clip": 1.03693199, + "balance_loss_mlp": 1.0267868, + "epoch": 0.6479482940027056, + "flos": 18442571875200.0, + "grad_norm": 2.7363901491618297, + "language_loss": 0.7900703, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.81133884, + "num_input_tokens_seen": 232559720, + "step": 10777, + "time_per_iteration": 2.6817147731781006 + }, + { + "auxiliary_loss_clip": 0.01101857, + "auxiliary_loss_mlp": 0.01033898, + "balance_loss_clip": 1.04061663, + "balance_loss_mlp": 1.0203439, + "epoch": 0.6480084172553735, + "flos": 22164066322560.0, + "grad_norm": 2.418675876930909, + "language_loss": 0.73090535, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.75226295, + "num_input_tokens_seen": 232579370, + "step": 10778, + "time_per_iteration": 2.7519023418426514 + }, + { + "auxiliary_loss_clip": 0.01098704, + "auxiliary_loss_mlp": 0.01030563, + "balance_loss_clip": 1.03796005, + "balance_loss_mlp": 1.01817703, + "epoch": 0.6480685405080415, + "flos": 24316228586880.0, + "grad_norm": 1.4697687567373847, + "language_loss": 0.78067875, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.80197144, + "num_input_tokens_seen": 232600495, + "step": 10779, + "time_per_iteration": 2.667295455932617 + }, + { + "auxiliary_loss_clip": 0.01021608, + "auxiliary_loss_mlp": 0.01004834, + "balance_loss_clip": 1.00979376, + "balance_loss_mlp": 1.00352228, + "epoch": 0.6481286637607094, + "flos": 59891207760000.0, + "grad_norm": 0.722667977363254, + "language_loss": 0.59406435, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.61432874, + "num_input_tokens_seen": 232663165, + "step": 10780, + "time_per_iteration": 3.146688461303711 + }, + { + "auxiliary_loss_clip": 0.01013668, + "auxiliary_loss_mlp": 0.01032512, + "balance_loss_clip": 1.03276062, + "balance_loss_mlp": 1.02043653, + "epoch": 0.6481887870133775, + "flos": 25484187219840.0, + "grad_norm": 1.9346405521822077, + "language_loss": 0.79079604, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.81125784, + "num_input_tokens_seen": 232683385, + "step": 10781, + "time_per_iteration": 3.1543314456939697 + }, + { + "auxiliary_loss_clip": 0.01117668, + "auxiliary_loss_mlp": 0.01036085, + "balance_loss_clip": 1.04143655, + "balance_loss_mlp": 1.02158904, + "epoch": 0.6482489102660454, + "flos": 19930206574080.0, + "grad_norm": 2.567868177502946, + "language_loss": 0.79041505, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.81195259, + "num_input_tokens_seen": 232699095, + "step": 10782, + "time_per_iteration": 2.8998003005981445 + }, + { + "auxiliary_loss_clip": 0.01106141, + "auxiliary_loss_mlp": 0.007711, + "balance_loss_clip": 1.04090714, + "balance_loss_mlp": 1.0001415, + "epoch": 0.6483090335187134, + "flos": 26979471515520.0, + "grad_norm": 2.672580630052252, + "language_loss": 0.64563107, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.66440344, + "num_input_tokens_seen": 232717920, + "step": 10783, + "time_per_iteration": 2.807725191116333 + }, + { + "auxiliary_loss_clip": 0.01119847, + "auxiliary_loss_mlp": 0.01038004, + "balance_loss_clip": 1.04234159, + "balance_loss_mlp": 1.02316856, + "epoch": 0.6483691567713813, + "flos": 25077965333760.0, + "grad_norm": 1.6110368507909019, + "language_loss": 0.88390124, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.90547979, + "num_input_tokens_seen": 232737605, + "step": 10784, + "time_per_iteration": 2.640153169631958 + }, + { + "auxiliary_loss_clip": 0.01089797, + "auxiliary_loss_mlp": 0.01033124, + "balance_loss_clip": 1.03887093, + "balance_loss_mlp": 1.02020776, + "epoch": 0.6484292800240493, + "flos": 16105972250880.0, + "grad_norm": 2.090784466794914, + "language_loss": 0.72988814, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.75111735, + "num_input_tokens_seen": 232755110, + "step": 10785, + "time_per_iteration": 2.6515488624572754 + }, + { + "auxiliary_loss_clip": 0.01078138, + "auxiliary_loss_mlp": 0.0103076, + "balance_loss_clip": 1.03758073, + "balance_loss_mlp": 1.01802194, + "epoch": 0.6484894032767172, + "flos": 28840398307200.0, + "grad_norm": 1.5672388778764104, + "language_loss": 0.69397259, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.71506155, + "num_input_tokens_seen": 232779040, + "step": 10786, + "time_per_iteration": 2.831984519958496 + }, + { + "auxiliary_loss_clip": 0.01075224, + "auxiliary_loss_mlp": 0.01031746, + "balance_loss_clip": 1.03817129, + "balance_loss_mlp": 1.01922286, + "epoch": 0.6485495265293852, + "flos": 30227052896640.0, + "grad_norm": 2.0612082804403404, + "language_loss": 0.71243078, + "learning_rate": 1.161544469455041e-06, + "loss": 0.73350048, + "num_input_tokens_seen": 232800515, + "step": 10787, + "time_per_iteration": 2.793691635131836 + }, + { + "auxiliary_loss_clip": 0.0111836, + "auxiliary_loss_mlp": 0.01035294, + "balance_loss_clip": 1.0412823, + "balance_loss_mlp": 1.0220623, + "epoch": 0.6486096497820532, + "flos": 20082181017600.0, + "grad_norm": 1.9333037316798733, + "language_loss": 0.84715712, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.86869359, + "num_input_tokens_seen": 232818450, + "step": 10788, + "time_per_iteration": 2.606229543685913 + }, + { + "auxiliary_loss_clip": 0.01078244, + "auxiliary_loss_mlp": 0.01034763, + "balance_loss_clip": 1.04034448, + "balance_loss_mlp": 1.02126873, + "epoch": 0.6486697730347212, + "flos": 17129067333120.0, + "grad_norm": 2.006310721450953, + "language_loss": 0.7757296, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.79685968, + "num_input_tokens_seen": 232834785, + "step": 10789, + "time_per_iteration": 2.689147710800171 + }, + { + "auxiliary_loss_clip": 0.01096496, + "auxiliary_loss_mlp": 0.01031686, + "balance_loss_clip": 1.03580093, + "balance_loss_mlp": 1.01941395, + "epoch": 0.6487298962873892, + "flos": 38911940570880.0, + "grad_norm": 1.6467685685264215, + "language_loss": 0.75511503, + "learning_rate": 1.160483857897479e-06, + "loss": 0.77639687, + "num_input_tokens_seen": 232856050, + "step": 10790, + "time_per_iteration": 2.8264946937561035 + }, + { + "auxiliary_loss_clip": 0.01113527, + "auxiliary_loss_mlp": 0.01036831, + "balance_loss_clip": 1.04156542, + "balance_loss_mlp": 1.02490366, + "epoch": 0.6487900195400571, + "flos": 11947840076160.0, + "grad_norm": 2.307183406251666, + "language_loss": 0.60332596, + "learning_rate": 1.160130384362823e-06, + "loss": 0.62482953, + "num_input_tokens_seen": 232873945, + "step": 10791, + "time_per_iteration": 2.5990047454833984 + }, + { + "auxiliary_loss_clip": 0.01076606, + "auxiliary_loss_mlp": 0.01034239, + "balance_loss_clip": 1.03773832, + "balance_loss_mlp": 1.0215373, + "epoch": 0.6488501427927251, + "flos": 22344445445760.0, + "grad_norm": 1.759760291391278, + "language_loss": 0.86496675, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.88607526, + "num_input_tokens_seen": 232892160, + "step": 10792, + "time_per_iteration": 2.771683692932129 + }, + { + "auxiliary_loss_clip": 0.01093434, + "auxiliary_loss_mlp": 0.01039713, + "balance_loss_clip": 1.03958428, + "balance_loss_mlp": 1.02602792, + "epoch": 0.648910266045393, + "flos": 22236282616320.0, + "grad_norm": 2.0358486422598445, + "language_loss": 0.78231007, + "learning_rate": 1.159423532850735e-06, + "loss": 0.8036415, + "num_input_tokens_seen": 232911725, + "step": 10793, + "time_per_iteration": 2.67922043800354 + }, + { + "auxiliary_loss_clip": 0.0108252, + "auxiliary_loss_mlp": 0.01032713, + "balance_loss_clip": 1.0395385, + "balance_loss_mlp": 1.0193671, + "epoch": 0.6489703892980611, + "flos": 25301258231040.0, + "grad_norm": 2.0060089316964667, + "language_loss": 0.75005889, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.77121115, + "num_input_tokens_seen": 232929085, + "step": 10794, + "time_per_iteration": 2.740185022354126 + }, + { + "auxiliary_loss_clip": 0.01102066, + "auxiliary_loss_mlp": 0.00770842, + "balance_loss_clip": 1.03801179, + "balance_loss_mlp": 1.00016379, + "epoch": 0.649030512550729, + "flos": 24571912573440.0, + "grad_norm": 1.6388436552304226, + "language_loss": 0.70095515, + "learning_rate": 1.158716808837621e-06, + "loss": 0.71968424, + "num_input_tokens_seen": 232949455, + "step": 10795, + "time_per_iteration": 2.7056167125701904 + }, + { + "auxiliary_loss_clip": 0.01092893, + "auxiliary_loss_mlp": 0.01034868, + "balance_loss_clip": 1.03938341, + "balance_loss_mlp": 1.02145672, + "epoch": 0.649090635803397, + "flos": 26244702904320.0, + "grad_norm": 1.931230622678825, + "language_loss": 0.54384381, + "learning_rate": 1.158363494676679e-06, + "loss": 0.56512141, + "num_input_tokens_seen": 232969445, + "step": 10796, + "time_per_iteration": 2.70178484916687 + }, + { + "auxiliary_loss_clip": 0.0110304, + "auxiliary_loss_mlp": 0.010382, + "balance_loss_clip": 1.04058564, + "balance_loss_mlp": 1.02635705, + "epoch": 0.6491507590560649, + "flos": 24937375501440.0, + "grad_norm": 1.521654875765255, + "language_loss": 0.77584833, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.7972607, + "num_input_tokens_seen": 232988900, + "step": 10797, + "time_per_iteration": 2.740236759185791 + }, + { + "auxiliary_loss_clip": 0.010649, + "auxiliary_loss_mlp": 0.01033495, + "balance_loss_clip": 1.03765631, + "balance_loss_mlp": 1.02110291, + "epoch": 0.6492108823087329, + "flos": 19499781899520.0, + "grad_norm": 2.1886950551197835, + "language_loss": 0.7017765, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.72276044, + "num_input_tokens_seen": 233005060, + "step": 10798, + "time_per_iteration": 2.7228379249572754 + }, + { + "auxiliary_loss_clip": 0.01059107, + "auxiliary_loss_mlp": 0.01032641, + "balance_loss_clip": 1.03400683, + "balance_loss_mlp": 1.02048159, + "epoch": 0.6492710055614008, + "flos": 19719303868800.0, + "grad_norm": 1.8018305819700693, + "language_loss": 0.76899987, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.78991735, + "num_input_tokens_seen": 233023375, + "step": 10799, + "time_per_iteration": 2.7452025413513184 + }, + { + "auxiliary_loss_clip": 0.01102121, + "auxiliary_loss_mlp": 0.0103607, + "balance_loss_clip": 1.03952456, + "balance_loss_mlp": 1.02255809, + "epoch": 0.6493311288140688, + "flos": 24317018686080.0, + "grad_norm": 1.8690878603480447, + "language_loss": 0.71881801, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.74019992, + "num_input_tokens_seen": 233043130, + "step": 10800, + "time_per_iteration": 2.681090831756592 + }, + { + "auxiliary_loss_clip": 0.01025406, + "auxiliary_loss_mlp": 0.01015193, + "balance_loss_clip": 1.01085913, + "balance_loss_mlp": 1.01379859, + "epoch": 0.6493912520667368, + "flos": 70934635290240.0, + "grad_norm": 0.7781340996665279, + "language_loss": 0.60211796, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.62252396, + "num_input_tokens_seen": 233110560, + "step": 10801, + "time_per_iteration": 3.3247601985931396 + }, + { + "auxiliary_loss_clip": 0.01104473, + "auxiliary_loss_mlp": 0.01042076, + "balance_loss_clip": 1.04024768, + "balance_loss_mlp": 1.02764034, + "epoch": 0.6494513753194048, + "flos": 25337779384320.0, + "grad_norm": 2.523744267443401, + "language_loss": 0.78645104, + "learning_rate": 1.156244280393614e-06, + "loss": 0.80791658, + "num_input_tokens_seen": 233130080, + "step": 10802, + "time_per_iteration": 4.631081581115723 + }, + { + "auxiliary_loss_clip": 0.01114091, + "auxiliary_loss_mlp": 0.01039322, + "balance_loss_clip": 1.03890288, + "balance_loss_mlp": 1.02562487, + "epoch": 0.6495114985720728, + "flos": 24681978823680.0, + "grad_norm": 1.6103480042358926, + "language_loss": 0.74409741, + "learning_rate": 1.155891189918541e-06, + "loss": 0.76563156, + "num_input_tokens_seen": 233150235, + "step": 10803, + "time_per_iteration": 2.6966469287872314 + }, + { + "auxiliary_loss_clip": 0.01052817, + "auxiliary_loss_mlp": 0.01033534, + "balance_loss_clip": 1.03642201, + "balance_loss_mlp": 1.02049232, + "epoch": 0.6495716218247407, + "flos": 23651162317440.0, + "grad_norm": 2.357483246632454, + "language_loss": 0.70044661, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.72131014, + "num_input_tokens_seen": 233166710, + "step": 10804, + "time_per_iteration": 2.8469581604003906 + }, + { + "auxiliary_loss_clip": 0.01100022, + "auxiliary_loss_mlp": 0.01031932, + "balance_loss_clip": 1.03885949, + "balance_loss_mlp": 1.01822269, + "epoch": 0.6496317450774087, + "flos": 22346169298560.0, + "grad_norm": 1.6424372411167527, + "language_loss": 0.72557664, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.74689615, + "num_input_tokens_seen": 233185445, + "step": 10805, + "time_per_iteration": 4.559306621551514 + }, + { + "auxiliary_loss_clip": 0.01088097, + "auxiliary_loss_mlp": 0.01030943, + "balance_loss_clip": 1.03999364, + "balance_loss_mlp": 1.01886106, + "epoch": 0.6496918683300766, + "flos": 30518647505280.0, + "grad_norm": 2.1225421947180467, + "language_loss": 0.65710378, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.67829412, + "num_input_tokens_seen": 233205805, + "step": 10806, + "time_per_iteration": 4.271615266799927 + }, + { + "auxiliary_loss_clip": 0.01093074, + "auxiliary_loss_mlp": 0.00771144, + "balance_loss_clip": 1.03741765, + "balance_loss_mlp": 1.00009441, + "epoch": 0.6497519915827447, + "flos": 12458992567680.0, + "grad_norm": 1.9214718172589236, + "language_loss": 0.78912604, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.80776823, + "num_input_tokens_seen": 233224215, + "step": 10807, + "time_per_iteration": 2.7781808376312256 + }, + { + "auxiliary_loss_clip": 0.01014724, + "auxiliary_loss_mlp": 0.0100172, + "balance_loss_clip": 1.0100404, + "balance_loss_mlp": 1.00033116, + "epoch": 0.6498121148354126, + "flos": 69093748287360.0, + "grad_norm": 0.7866075869002591, + "language_loss": 0.58888513, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.60904956, + "num_input_tokens_seen": 233294440, + "step": 10808, + "time_per_iteration": 3.3867762088775635 + }, + { + "auxiliary_loss_clip": 0.01091297, + "auxiliary_loss_mlp": 0.01027122, + "balance_loss_clip": 1.04009056, + "balance_loss_mlp": 1.01453352, + "epoch": 0.6498722380880806, + "flos": 36897135914880.0, + "grad_norm": 1.7443140014145102, + "language_loss": 0.63562334, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.65680754, + "num_input_tokens_seen": 233316125, + "step": 10809, + "time_per_iteration": 4.545352220535278 + }, + { + "auxiliary_loss_clip": 0.01101385, + "auxiliary_loss_mlp": 0.00769706, + "balance_loss_clip": 1.04086709, + "balance_loss_mlp": 1.00011587, + "epoch": 0.6499323613407485, + "flos": 29017760688000.0, + "grad_norm": 1.6271930156290193, + "language_loss": 0.81576955, + "learning_rate": 1.153420453586008e-06, + "loss": 0.8344804, + "num_input_tokens_seen": 233336140, + "step": 10810, + "time_per_iteration": 2.6756200790405273 + }, + { + "auxiliary_loss_clip": 0.01071315, + "auxiliary_loss_mlp": 0.01036036, + "balance_loss_clip": 1.0380795, + "balance_loss_mlp": 1.02466989, + "epoch": 0.6499924845934165, + "flos": 20119240874880.0, + "grad_norm": 1.6231866882582067, + "language_loss": 0.72109252, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.74216604, + "num_input_tokens_seen": 233356095, + "step": 10811, + "time_per_iteration": 2.6948235034942627 + }, + { + "auxiliary_loss_clip": 0.01053868, + "auxiliary_loss_mlp": 0.01028108, + "balance_loss_clip": 1.04128838, + "balance_loss_mlp": 1.01610339, + "epoch": 0.6500526078460844, + "flos": 24421338760320.0, + "grad_norm": 1.6351468205414483, + "language_loss": 0.77842551, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.79924524, + "num_input_tokens_seen": 233376830, + "step": 10812, + "time_per_iteration": 2.8678853511810303 + }, + { + "auxiliary_loss_clip": 0.01098947, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.04008079, + "balance_loss_mlp": 1.02321005, + "epoch": 0.6501127310987524, + "flos": 23331019374720.0, + "grad_norm": 1.6938636909154852, + "language_loss": 0.85069716, + "learning_rate": 1.152362047854413e-06, + "loss": 0.8720504, + "num_input_tokens_seen": 233395275, + "step": 10813, + "time_per_iteration": 2.618603467941284 + }, + { + "auxiliary_loss_clip": 0.01071283, + "auxiliary_loss_mlp": 0.01035396, + "balance_loss_clip": 1.03572655, + "balance_loss_mlp": 1.02187157, + "epoch": 0.6501728543514204, + "flos": 18697824898560.0, + "grad_norm": 2.609145629781726, + "language_loss": 0.79691541, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.8179822, + "num_input_tokens_seen": 233413345, + "step": 10814, + "time_per_iteration": 2.742004156112671 + }, + { + "auxiliary_loss_clip": 0.01064254, + "auxiliary_loss_mlp": 0.00773576, + "balance_loss_clip": 1.03794754, + "balance_loss_mlp": 1.00018024, + "epoch": 0.6502329776040884, + "flos": 44199858199680.0, + "grad_norm": 1.9285039571390825, + "language_loss": 0.65348196, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.67186022, + "num_input_tokens_seen": 233436105, + "step": 10815, + "time_per_iteration": 2.967710256576538 + }, + { + "auxiliary_loss_clip": 0.01118333, + "auxiliary_loss_mlp": 0.01032665, + "balance_loss_clip": 1.04089963, + "balance_loss_mlp": 1.01759648, + "epoch": 0.6502931008567564, + "flos": 14574741419520.0, + "grad_norm": 1.878543508830568, + "language_loss": 0.75245708, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.77396703, + "num_input_tokens_seen": 233452320, + "step": 10816, + "time_per_iteration": 2.619370698928833 + }, + { + "auxiliary_loss_clip": 0.01085538, + "auxiliary_loss_mlp": 0.01031729, + "balance_loss_clip": 1.03892541, + "balance_loss_mlp": 1.01897991, + "epoch": 0.6503532241094243, + "flos": 21395003201280.0, + "grad_norm": 1.8185101411846911, + "language_loss": 0.73227775, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.75345039, + "num_input_tokens_seen": 233469920, + "step": 10817, + "time_per_iteration": 2.758009672164917 + }, + { + "auxiliary_loss_clip": 0.01071537, + "auxiliary_loss_mlp": 0.01046459, + "balance_loss_clip": 1.03518438, + "balance_loss_mlp": 1.03168857, + "epoch": 0.6504133473620923, + "flos": 74740840986240.0, + "grad_norm": 1.5063120441652318, + "language_loss": 0.72075009, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.74193007, + "num_input_tokens_seen": 233499780, + "step": 10818, + "time_per_iteration": 3.143178701400757 + }, + { + "auxiliary_loss_clip": 0.01085148, + "auxiliary_loss_mlp": 0.01030336, + "balance_loss_clip": 1.03872418, + "balance_loss_mlp": 1.01738429, + "epoch": 0.6504734706147602, + "flos": 19713270384000.0, + "grad_norm": 2.002053752481776, + "language_loss": 0.65038371, + "learning_rate": 1.150246104600249e-06, + "loss": 0.67153859, + "num_input_tokens_seen": 233518235, + "step": 10819, + "time_per_iteration": 2.704205274581909 + }, + { + "auxiliary_loss_clip": 0.01077923, + "auxiliary_loss_mlp": 0.01031636, + "balance_loss_clip": 1.03569567, + "balance_loss_mlp": 1.01811743, + "epoch": 0.6505335938674283, + "flos": 25556870390400.0, + "grad_norm": 1.8302178372953948, + "language_loss": 0.83782417, + "learning_rate": 1.14989356009286e-06, + "loss": 0.85891974, + "num_input_tokens_seen": 233535215, + "step": 10820, + "time_per_iteration": 2.762343645095825 + }, + { + "auxiliary_loss_clip": 0.01106479, + "auxiliary_loss_mlp": 0.01030319, + "balance_loss_clip": 1.03934109, + "balance_loss_mlp": 1.01703274, + "epoch": 0.6505937171200962, + "flos": 17821424960640.0, + "grad_norm": 2.074138898013104, + "language_loss": 0.77881086, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.80017889, + "num_input_tokens_seen": 233552775, + "step": 10821, + "time_per_iteration": 2.6239891052246094 + }, + { + "auxiliary_loss_clip": 0.01077516, + "auxiliary_loss_mlp": 0.01028396, + "balance_loss_clip": 1.03843164, + "balance_loss_mlp": 1.01721418, + "epoch": 0.6506538403727642, + "flos": 20668135582080.0, + "grad_norm": 1.4292101756111668, + "language_loss": 0.80072695, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.82178605, + "num_input_tokens_seen": 233572080, + "step": 10822, + "time_per_iteration": 2.7913742065429688 + }, + { + "auxiliary_loss_clip": 0.01084959, + "auxiliary_loss_mlp": 0.01029649, + "balance_loss_clip": 1.04204702, + "balance_loss_mlp": 1.01634574, + "epoch": 0.6507139636254321, + "flos": 11721422695680.0, + "grad_norm": 2.216597297898186, + "language_loss": 0.8719157, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.89306176, + "num_input_tokens_seen": 233589155, + "step": 10823, + "time_per_iteration": 2.7045187950134277 + }, + { + "auxiliary_loss_clip": 0.01114569, + "auxiliary_loss_mlp": 0.01031767, + "balance_loss_clip": 1.0398941, + "balance_loss_mlp": 1.01913643, + "epoch": 0.6507740868781001, + "flos": 26761745226240.0, + "grad_norm": 1.6940233286010407, + "language_loss": 0.66299087, + "learning_rate": 1.148483704558183e-06, + "loss": 0.6844542, + "num_input_tokens_seen": 233608180, + "step": 10824, + "time_per_iteration": 2.609870433807373 + }, + { + "auxiliary_loss_clip": 0.01096015, + "auxiliary_loss_mlp": 0.01031659, + "balance_loss_clip": 1.04038215, + "balance_loss_mlp": 1.01846242, + "epoch": 0.650834210130768, + "flos": 16471722487680.0, + "grad_norm": 5.416027486189251, + "language_loss": 0.87431592, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.89559269, + "num_input_tokens_seen": 233625750, + "step": 10825, + "time_per_iteration": 2.649099588394165 + }, + { + "auxiliary_loss_clip": 0.01092468, + "auxiliary_loss_mlp": 0.01028826, + "balance_loss_clip": 1.03650379, + "balance_loss_mlp": 1.01514649, + "epoch": 0.650894333383436, + "flos": 17128672283520.0, + "grad_norm": 2.103621809336841, + "language_loss": 0.73180604, + "learning_rate": 1.147778970474885e-06, + "loss": 0.75301898, + "num_input_tokens_seen": 233644235, + "step": 10826, + "time_per_iteration": 2.6394810676574707 + }, + { + "auxiliary_loss_clip": 0.01104739, + "auxiliary_loss_mlp": 0.01027729, + "balance_loss_clip": 1.04116881, + "balance_loss_mlp": 1.01562333, + "epoch": 0.650954456636104, + "flos": 18734238311040.0, + "grad_norm": 1.7744084173415924, + "language_loss": 0.68743241, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.70875704, + "num_input_tokens_seen": 233662845, + "step": 10827, + "time_per_iteration": 2.5662622451782227 + }, + { + "auxiliary_loss_clip": 0.01089545, + "auxiliary_loss_mlp": 0.01031977, + "balance_loss_clip": 1.03715336, + "balance_loss_mlp": 1.02000248, + "epoch": 0.651014579888772, + "flos": 24528244613760.0, + "grad_norm": 1.7280110593006797, + "language_loss": 0.76715839, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.78837359, + "num_input_tokens_seen": 233681990, + "step": 10828, + "time_per_iteration": 2.6430130004882812 + }, + { + "auxiliary_loss_clip": 0.01101657, + "auxiliary_loss_mlp": 0.0102849, + "balance_loss_clip": 1.0396359, + "balance_loss_mlp": 1.01659322, + "epoch": 0.65107470314144, + "flos": 24061083304320.0, + "grad_norm": 2.028448280689147, + "language_loss": 0.89382976, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.91513121, + "num_input_tokens_seen": 233698930, + "step": 10829, + "time_per_iteration": 2.676887273788452 + }, + { + "auxiliary_loss_clip": 0.01033575, + "auxiliary_loss_mlp": 0.01003174, + "balance_loss_clip": 1.00994611, + "balance_loss_mlp": 1.00192249, + "epoch": 0.6511348263941079, + "flos": 72480734352000.0, + "grad_norm": 0.6385058987930536, + "language_loss": 0.55351257, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.57388008, + "num_input_tokens_seen": 233769825, + "step": 10830, + "time_per_iteration": 3.283604383468628 + }, + { + "auxiliary_loss_clip": 0.01080445, + "auxiliary_loss_mlp": 0.01033977, + "balance_loss_clip": 1.03753436, + "balance_loss_mlp": 1.02031004, + "epoch": 0.6511949496467759, + "flos": 23367684182400.0, + "grad_norm": 2.2496423265989263, + "language_loss": 0.74632305, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.76746726, + "num_input_tokens_seen": 233787095, + "step": 10831, + "time_per_iteration": 2.6958060264587402 + }, + { + "auxiliary_loss_clip": 0.01016148, + "auxiliary_loss_mlp": 0.01001305, + "balance_loss_clip": 1.01118171, + "balance_loss_mlp": 0.99989206, + "epoch": 0.6512550728994438, + "flos": 67333191073920.0, + "grad_norm": 0.6457874133081085, + "language_loss": 0.50977135, + "learning_rate": 1.145665544243828e-06, + "loss": 0.52994585, + "num_input_tokens_seen": 233853050, + "step": 10832, + "time_per_iteration": 3.3019638061523438 + }, + { + "auxiliary_loss_clip": 0.01094456, + "auxiliary_loss_mlp": 0.0103476, + "balance_loss_clip": 1.03838396, + "balance_loss_mlp": 1.02121806, + "epoch": 0.6513151961521119, + "flos": 21141689512320.0, + "grad_norm": 2.261964071454772, + "language_loss": 0.83006239, + "learning_rate": 1.145313419848316e-06, + "loss": 0.85135454, + "num_input_tokens_seen": 233871385, + "step": 10833, + "time_per_iteration": 2.643763542175293 + }, + { + "auxiliary_loss_clip": 0.01096358, + "auxiliary_loss_mlp": 0.01034172, + "balance_loss_clip": 1.04303241, + "balance_loss_mlp": 1.02144599, + "epoch": 0.6513753194047798, + "flos": 15158828476800.0, + "grad_norm": 2.0262015833742937, + "language_loss": 0.83040363, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.85170895, + "num_input_tokens_seen": 233888175, + "step": 10834, + "time_per_iteration": 2.696136713027954 + }, + { + "auxiliary_loss_clip": 0.01102331, + "auxiliary_loss_mlp": 0.01040155, + "balance_loss_clip": 1.039487, + "balance_loss_mlp": 1.02702951, + "epoch": 0.6514354426574478, + "flos": 30226621933440.0, + "grad_norm": 1.5116925476060534, + "language_loss": 0.7712391, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.79266393, + "num_input_tokens_seen": 233911470, + "step": 10835, + "time_per_iteration": 2.733752965927124 + }, + { + "auxiliary_loss_clip": 0.01087811, + "auxiliary_loss_mlp": 0.01038329, + "balance_loss_clip": 1.0393815, + "balance_loss_mlp": 1.02551985, + "epoch": 0.6514955659101157, + "flos": 24205587719040.0, + "grad_norm": 1.603053369126082, + "language_loss": 0.77712744, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.79838884, + "num_input_tokens_seen": 233932135, + "step": 10836, + "time_per_iteration": 2.7181618213653564 + }, + { + "auxiliary_loss_clip": 0.01076915, + "auxiliary_loss_mlp": 0.01034338, + "balance_loss_clip": 1.037691, + "balance_loss_mlp": 1.02143383, + "epoch": 0.6515556891627837, + "flos": 12377761960320.0, + "grad_norm": 2.035005351868823, + "language_loss": 0.82812917, + "learning_rate": 1.143905246497783e-06, + "loss": 0.84924167, + "num_input_tokens_seen": 233947880, + "step": 10837, + "time_per_iteration": 2.6514079570770264 + }, + { + "auxiliary_loss_clip": 0.01073313, + "auxiliary_loss_mlp": 0.01035471, + "balance_loss_clip": 1.03897333, + "balance_loss_mlp": 1.0211482, + "epoch": 0.6516158124154516, + "flos": 49601217957120.0, + "grad_norm": 1.8965490798746285, + "language_loss": 0.5910452, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.61213303, + "num_input_tokens_seen": 233971475, + "step": 10838, + "time_per_iteration": 2.955751419067383 + }, + { + "auxiliary_loss_clip": 0.01033147, + "auxiliary_loss_mlp": 0.01008878, + "balance_loss_clip": 1.0095979, + "balance_loss_mlp": 1.00770998, + "epoch": 0.6516759356681197, + "flos": 59702748076800.0, + "grad_norm": 0.7683915325325666, + "language_loss": 0.60835862, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.62877893, + "num_input_tokens_seen": 234030690, + "step": 10839, + "time_per_iteration": 3.200835943222046 + }, + { + "auxiliary_loss_clip": 0.0109233, + "auxiliary_loss_mlp": 0.0103157, + "balance_loss_clip": 1.04093075, + "balance_loss_mlp": 1.01998901, + "epoch": 0.6517360589207876, + "flos": 37450807130880.0, + "grad_norm": 1.7743025760939068, + "language_loss": 0.67926049, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.70049942, + "num_input_tokens_seen": 234052470, + "step": 10840, + "time_per_iteration": 2.8348867893218994 + }, + { + "auxiliary_loss_clip": 0.01067745, + "auxiliary_loss_mlp": 0.01034413, + "balance_loss_clip": 1.03654337, + "balance_loss_mlp": 1.02269483, + "epoch": 0.6517961821734556, + "flos": 25374911068800.0, + "grad_norm": 2.0615754511911306, + "language_loss": 0.73519421, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.75621581, + "num_input_tokens_seen": 234071495, + "step": 10841, + "time_per_iteration": 4.435396671295166 + }, + { + "auxiliary_loss_clip": 0.01114891, + "auxiliary_loss_mlp": 0.01038031, + "balance_loss_clip": 1.03930378, + "balance_loss_mlp": 1.02487588, + "epoch": 0.6518563054261236, + "flos": 28766996864640.0, + "grad_norm": 1.4942272074667713, + "language_loss": 0.62317944, + "learning_rate": 1.142145760331648e-06, + "loss": 0.64470863, + "num_input_tokens_seen": 234092325, + "step": 10842, + "time_per_iteration": 2.6767518520355225 + }, + { + "auxiliary_loss_clip": 0.01024949, + "auxiliary_loss_mlp": 0.01006106, + "balance_loss_clip": 1.01075029, + "balance_loss_mlp": 1.00497305, + "epoch": 0.6519164286787915, + "flos": 68924750797440.0, + "grad_norm": 0.8104047899585891, + "language_loss": 0.5617612, + "learning_rate": 1.141793960634807e-06, + "loss": 0.58207178, + "num_input_tokens_seen": 234148005, + "step": 10843, + "time_per_iteration": 3.0310990810394287 + }, + { + "auxiliary_loss_clip": 0.01104455, + "auxiliary_loss_mlp": 0.01039452, + "balance_loss_clip": 1.03846788, + "balance_loss_mlp": 1.02576053, + "epoch": 0.6519765519314595, + "flos": 20441933683200.0, + "grad_norm": 1.5675649945193708, + "language_loss": 0.82750475, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.84894383, + "num_input_tokens_seen": 234164280, + "step": 10844, + "time_per_iteration": 5.7787792682647705 + }, + { + "auxiliary_loss_clip": 0.01104311, + "auxiliary_loss_mlp": 0.01034465, + "balance_loss_clip": 1.04057419, + "balance_loss_mlp": 1.02136445, + "epoch": 0.6520366751841274, + "flos": 28402970480640.0, + "grad_norm": 1.85573565019848, + "language_loss": 0.59983897, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.62122673, + "num_input_tokens_seen": 234185090, + "step": 10845, + "time_per_iteration": 2.7293028831481934 + }, + { + "auxiliary_loss_clip": 0.0110391, + "auxiliary_loss_mlp": 0.01032078, + "balance_loss_clip": 1.04017997, + "balance_loss_mlp": 1.01897073, + "epoch": 0.6520967984367955, + "flos": 22273414300800.0, + "grad_norm": 1.668141485329768, + "language_loss": 0.79591072, + "learning_rate": 1.140738756857194e-06, + "loss": 0.81727064, + "num_input_tokens_seen": 234204050, + "step": 10846, + "time_per_iteration": 2.6495091915130615 + }, + { + "auxiliary_loss_clip": 0.01025275, + "auxiliary_loss_mlp": 0.01003438, + "balance_loss_clip": 1.01079941, + "balance_loss_mlp": 1.00228775, + "epoch": 0.6521569216894634, + "flos": 68917140092160.0, + "grad_norm": 0.709011283257112, + "language_loss": 0.60191703, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.62220418, + "num_input_tokens_seen": 234269790, + "step": 10847, + "time_per_iteration": 3.282104730606079 + }, + { + "auxiliary_loss_clip": 0.0111717, + "auxiliary_loss_mlp": 0.01037772, + "balance_loss_clip": 1.0412842, + "balance_loss_mlp": 1.02495718, + "epoch": 0.6522170449421314, + "flos": 29130520458240.0, + "grad_norm": 1.5919105635369972, + "language_loss": 0.81118578, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.8327353, + "num_input_tokens_seen": 234290135, + "step": 10848, + "time_per_iteration": 2.6569244861602783 + }, + { + "auxiliary_loss_clip": 0.01084019, + "auxiliary_loss_mlp": 0.01035374, + "balance_loss_clip": 1.03738701, + "balance_loss_mlp": 1.02265429, + "epoch": 0.6522771681947993, + "flos": 26651930371200.0, + "grad_norm": 2.586521111897064, + "language_loss": 0.74449492, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.7656889, + "num_input_tokens_seen": 234309535, + "step": 10849, + "time_per_iteration": 4.26736044883728 + }, + { + "auxiliary_loss_clip": 0.0106317, + "auxiliary_loss_mlp": 0.01031804, + "balance_loss_clip": 1.03691697, + "balance_loss_mlp": 1.0188818, + "epoch": 0.6523372914474673, + "flos": 25739763465600.0, + "grad_norm": 1.4022053902069738, + "language_loss": 0.67808872, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.69903851, + "num_input_tokens_seen": 234328755, + "step": 10850, + "time_per_iteration": 2.8357365131378174 + }, + { + "auxiliary_loss_clip": 0.01089828, + "auxiliary_loss_mlp": 0.00769863, + "balance_loss_clip": 1.03987718, + "balance_loss_mlp": 1.00014496, + "epoch": 0.6523974147001352, + "flos": 24827345164800.0, + "grad_norm": 1.627745472842777, + "language_loss": 0.66696799, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.68556488, + "num_input_tokens_seen": 234348655, + "step": 10851, + "time_per_iteration": 2.702782154083252 + }, + { + "auxiliary_loss_clip": 0.01092324, + "auxiliary_loss_mlp": 0.0103014, + "balance_loss_clip": 1.04054999, + "balance_loss_mlp": 1.01776636, + "epoch": 0.6524575379528033, + "flos": 26317637470080.0, + "grad_norm": 2.9837115627224238, + "language_loss": 0.73833734, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.75956196, + "num_input_tokens_seen": 234367445, + "step": 10852, + "time_per_iteration": 2.7116212844848633 + }, + { + "auxiliary_loss_clip": 0.0109357, + "auxiliary_loss_mlp": 0.01030172, + "balance_loss_clip": 1.04287267, + "balance_loss_mlp": 1.01617694, + "epoch": 0.6525176612054712, + "flos": 19494143464320.0, + "grad_norm": 1.9044884730623952, + "language_loss": 0.66662163, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.68785906, + "num_input_tokens_seen": 234384825, + "step": 10853, + "time_per_iteration": 2.7027504444122314 + }, + { + "auxiliary_loss_clip": 0.01002155, + "auxiliary_loss_mlp": 0.01000193, + "balance_loss_clip": 1.01079071, + "balance_loss_mlp": 0.99902517, + "epoch": 0.6525777844581392, + "flos": 71706894721920.0, + "grad_norm": 0.7271722971933409, + "language_loss": 0.62995195, + "learning_rate": 1.137926314758634e-06, + "loss": 0.64997554, + "num_input_tokens_seen": 234450630, + "step": 10854, + "time_per_iteration": 3.330467462539673 + }, + { + "auxiliary_loss_clip": 0.01098588, + "auxiliary_loss_mlp": 0.01040453, + "balance_loss_clip": 1.03749895, + "balance_loss_mlp": 1.02501512, + "epoch": 0.6526379077108072, + "flos": 26653115520000.0, + "grad_norm": 1.9818066069545293, + "language_loss": 0.77810514, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.79949546, + "num_input_tokens_seen": 234473505, + "step": 10855, + "time_per_iteration": 2.856804132461548 + }, + { + "auxiliary_loss_clip": 0.01073699, + "auxiliary_loss_mlp": 0.01028598, + "balance_loss_clip": 1.03438473, + "balance_loss_mlp": 1.01601565, + "epoch": 0.6526980309634751, + "flos": 22820369673600.0, + "grad_norm": 1.8477737717286657, + "language_loss": 0.78975284, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.81077588, + "num_input_tokens_seen": 234492485, + "step": 10856, + "time_per_iteration": 2.7385408878326416 + }, + { + "auxiliary_loss_clip": 0.01114282, + "auxiliary_loss_mlp": 0.01034182, + "balance_loss_clip": 1.04025459, + "balance_loss_mlp": 1.0199244, + "epoch": 0.6527581542161431, + "flos": 28365048696960.0, + "grad_norm": 3.158979628826276, + "language_loss": 0.73701787, + "learning_rate": 1.136872187988815e-06, + "loss": 0.75850254, + "num_input_tokens_seen": 234512645, + "step": 10857, + "time_per_iteration": 2.6843883991241455 + }, + { + "auxiliary_loss_clip": 0.01090082, + "auxiliary_loss_mlp": 0.01035453, + "balance_loss_clip": 1.03591764, + "balance_loss_mlp": 1.02337718, + "epoch": 0.652818277468811, + "flos": 18369206346240.0, + "grad_norm": 3.7655949608052257, + "language_loss": 0.6289376, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.65019298, + "num_input_tokens_seen": 234529310, + "step": 10858, + "time_per_iteration": 2.72822904586792 + }, + { + "auxiliary_loss_clip": 0.01110966, + "auxiliary_loss_mlp": 0.01034439, + "balance_loss_clip": 1.03902686, + "balance_loss_mlp": 1.02228558, + "epoch": 0.6528784007214791, + "flos": 18036170421120.0, + "grad_norm": 1.6211282430818235, + "language_loss": 0.78672451, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.80817854, + "num_input_tokens_seen": 234546685, + "step": 10859, + "time_per_iteration": 2.5962581634521484 + }, + { + "auxiliary_loss_clip": 0.01104671, + "auxiliary_loss_mlp": 0.01033239, + "balance_loss_clip": 1.03923452, + "balance_loss_mlp": 1.02013731, + "epoch": 0.652938523974147, + "flos": 22382008093440.0, + "grad_norm": 1.697122178276391, + "language_loss": 0.67908686, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.70046592, + "num_input_tokens_seen": 234566255, + "step": 10860, + "time_per_iteration": 2.7275006771087646 + }, + { + "auxiliary_loss_clip": 0.01105971, + "auxiliary_loss_mlp": 0.01029587, + "balance_loss_clip": 1.04165852, + "balance_loss_mlp": 1.01677179, + "epoch": 0.652998647226815, + "flos": 16764035368320.0, + "grad_norm": 2.149849017639803, + "language_loss": 0.67175591, + "learning_rate": 1.135467143909712e-06, + "loss": 0.69311142, + "num_input_tokens_seen": 234585405, + "step": 10861, + "time_per_iteration": 2.700737237930298 + }, + { + "auxiliary_loss_clip": 0.01093061, + "auxiliary_loss_mlp": 0.01034965, + "balance_loss_clip": 1.03918886, + "balance_loss_mlp": 1.02101707, + "epoch": 0.6530587704794829, + "flos": 35772522019200.0, + "grad_norm": 1.8900448169789823, + "language_loss": 0.64973295, + "learning_rate": 1.135115964814572e-06, + "loss": 0.67101324, + "num_input_tokens_seen": 234608095, + "step": 10862, + "time_per_iteration": 2.8191120624542236 + }, + { + "auxiliary_loss_clip": 0.01090214, + "auxiliary_loss_mlp": 0.01035944, + "balance_loss_clip": 1.03788185, + "balance_loss_mlp": 1.02351046, + "epoch": 0.6531188937321509, + "flos": 19316134638720.0, + "grad_norm": 1.7201949909347662, + "language_loss": 0.77214205, + "learning_rate": 1.13476481851592e-06, + "loss": 0.79340369, + "num_input_tokens_seen": 234627335, + "step": 10863, + "time_per_iteration": 2.7301394939422607 + }, + { + "auxiliary_loss_clip": 0.01086865, + "auxiliary_loss_mlp": 0.01035498, + "balance_loss_clip": 1.03934371, + "balance_loss_mlp": 1.0234524, + "epoch": 0.6531790169848188, + "flos": 22893771116160.0, + "grad_norm": 5.89922160085871, + "language_loss": 0.74717021, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.76839387, + "num_input_tokens_seen": 234646540, + "step": 10864, + "time_per_iteration": 2.694638729095459 + }, + { + "auxiliary_loss_clip": 0.01101868, + "auxiliary_loss_mlp": 0.01037064, + "balance_loss_clip": 1.03954864, + "balance_loss_mlp": 1.02464223, + "epoch": 0.6532391402374869, + "flos": 29563530912000.0, + "grad_norm": 1.7565530493907513, + "language_loss": 0.86014044, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.88152981, + "num_input_tokens_seen": 234665470, + "step": 10865, + "time_per_iteration": 2.6702401638031006 + }, + { + "auxiliary_loss_clip": 0.01084878, + "auxiliary_loss_mlp": 0.00771127, + "balance_loss_clip": 1.0366689, + "balance_loss_mlp": 1.00016713, + "epoch": 0.6532992634901548, + "flos": 23105463920640.0, + "grad_norm": 1.5997360666048854, + "language_loss": 0.81537604, + "learning_rate": 1.133711576532051e-06, + "loss": 0.8339361, + "num_input_tokens_seen": 234683955, + "step": 10866, + "time_per_iteration": 2.7677865028381348 + }, + { + "auxiliary_loss_clip": 0.01092326, + "auxiliary_loss_mlp": 0.01027552, + "balance_loss_clip": 1.04049444, + "balance_loss_mlp": 1.0153923, + "epoch": 0.6533593867428228, + "flos": 26067340523520.0, + "grad_norm": 1.499689557141503, + "language_loss": 0.82382023, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.84501904, + "num_input_tokens_seen": 234704595, + "step": 10867, + "time_per_iteration": 2.67887020111084 + }, + { + "auxiliary_loss_clip": 0.01086387, + "auxiliary_loss_mlp": 0.01028819, + "balance_loss_clip": 1.03923059, + "balance_loss_mlp": 1.01656437, + "epoch": 0.6534195099954908, + "flos": 21212469262080.0, + "grad_norm": 1.9931778054716736, + "language_loss": 0.81410849, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.83526063, + "num_input_tokens_seen": 234724090, + "step": 10868, + "time_per_iteration": 2.692563533782959 + }, + { + "auxiliary_loss_clip": 0.01085283, + "auxiliary_loss_mlp": 0.0103014, + "balance_loss_clip": 1.04046869, + "balance_loss_mlp": 1.01654446, + "epoch": 0.6534796332481587, + "flos": 19646584784640.0, + "grad_norm": 1.7926198693955093, + "language_loss": 0.79652596, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.81768018, + "num_input_tokens_seen": 234742560, + "step": 10869, + "time_per_iteration": 2.6747188568115234 + }, + { + "auxiliary_loss_clip": 0.01107733, + "auxiliary_loss_mlp": 0.01034253, + "balance_loss_clip": 1.04306769, + "balance_loss_mlp": 1.02144957, + "epoch": 0.6535397565008267, + "flos": 24022479162240.0, + "grad_norm": 1.9247655195442634, + "language_loss": 0.72409803, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.74551791, + "num_input_tokens_seen": 234762315, + "step": 10870, + "time_per_iteration": 2.6496713161468506 + }, + { + "auxiliary_loss_clip": 0.01073837, + "auxiliary_loss_mlp": 0.01040127, + "balance_loss_clip": 1.0374316, + "balance_loss_mlp": 1.02689457, + "epoch": 0.6535998797534947, + "flos": 24602759377920.0, + "grad_norm": 2.0567680865886797, + "language_loss": 0.7481339, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.76927352, + "num_input_tokens_seen": 234781300, + "step": 10871, + "time_per_iteration": 2.738467216491699 + }, + { + "auxiliary_loss_clip": 0.01094755, + "auxiliary_loss_mlp": 0.00768767, + "balance_loss_clip": 1.04057598, + "balance_loss_mlp": 1.00008535, + "epoch": 0.6536600030061627, + "flos": 23364164649600.0, + "grad_norm": 1.631721616705098, + "language_loss": 0.55669373, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.57532895, + "num_input_tokens_seen": 234801040, + "step": 10872, + "time_per_iteration": 2.7837493419647217 + }, + { + "auxiliary_loss_clip": 0.01089558, + "auxiliary_loss_mlp": 0.01033451, + "balance_loss_clip": 1.03836048, + "balance_loss_mlp": 1.02150071, + "epoch": 0.6537201262588306, + "flos": 23878477537920.0, + "grad_norm": 1.5206380793292014, + "language_loss": 0.74701464, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.76824474, + "num_input_tokens_seen": 234821415, + "step": 10873, + "time_per_iteration": 2.6991825103759766 + }, + { + "auxiliary_loss_clip": 0.01103837, + "auxiliary_loss_mlp": 0.01031753, + "balance_loss_clip": 1.04124331, + "balance_loss_mlp": 1.01923621, + "epoch": 0.6537802495114986, + "flos": 24354760901760.0, + "grad_norm": 1.5572607769752447, + "language_loss": 0.75670367, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.7780596, + "num_input_tokens_seen": 234843795, + "step": 10874, + "time_per_iteration": 2.78080153465271 + }, + { + "auxiliary_loss_clip": 0.01071596, + "auxiliary_loss_mlp": 0.01032474, + "balance_loss_clip": 1.03871393, + "balance_loss_mlp": 1.01939058, + "epoch": 0.6538403727641665, + "flos": 27996892248960.0, + "grad_norm": 1.5478335962993721, + "language_loss": 0.81636667, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.83740735, + "num_input_tokens_seen": 234862350, + "step": 10875, + "time_per_iteration": 2.8029510974884033 + }, + { + "auxiliary_loss_clip": 0.01113458, + "auxiliary_loss_mlp": 0.01038052, + "balance_loss_clip": 1.03928709, + "balance_loss_mlp": 1.0257194, + "epoch": 0.6539004960168345, + "flos": 27563594486400.0, + "grad_norm": 1.7147154744114859, + "language_loss": 0.70016718, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.72168231, + "num_input_tokens_seen": 234881790, + "step": 10876, + "time_per_iteration": 2.7378597259521484 + }, + { + "auxiliary_loss_clip": 0.01019889, + "auxiliary_loss_mlp": 0.01040083, + "balance_loss_clip": 1.03454161, + "balance_loss_mlp": 1.02664804, + "epoch": 0.6539606192695024, + "flos": 14530067879040.0, + "grad_norm": 5.2813318768904, + "language_loss": 0.79471064, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.81531036, + "num_input_tokens_seen": 234897775, + "step": 10877, + "time_per_iteration": 2.9654347896575928 + }, + { + "auxiliary_loss_clip": 0.0109536, + "auxiliary_loss_mlp": 0.00770832, + "balance_loss_clip": 1.04007304, + "balance_loss_mlp": 1.00019956, + "epoch": 0.6540207425221705, + "flos": 21616356764160.0, + "grad_norm": 10.000647074298708, + "language_loss": 0.79720318, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.81586516, + "num_input_tokens_seen": 234918395, + "step": 10878, + "time_per_iteration": 3.0778963565826416 + }, + { + "auxiliary_loss_clip": 0.01091014, + "auxiliary_loss_mlp": 0.01032504, + "balance_loss_clip": 1.03766847, + "balance_loss_mlp": 1.01898539, + "epoch": 0.6540808657748384, + "flos": 17668983640320.0, + "grad_norm": 1.8035849841716871, + "language_loss": 0.84622979, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.8674649, + "num_input_tokens_seen": 234936260, + "step": 10879, + "time_per_iteration": 2.668922185897827 + }, + { + "auxiliary_loss_clip": 0.01093903, + "auxiliary_loss_mlp": 0.01030306, + "balance_loss_clip": 1.03903461, + "balance_loss_mlp": 1.01730609, + "epoch": 0.6541409890275064, + "flos": 14538292093440.0, + "grad_norm": 2.263757202052665, + "language_loss": 0.71778309, + "learning_rate": 1.128800362199601e-06, + "loss": 0.73902524, + "num_input_tokens_seen": 234952110, + "step": 10880, + "time_per_iteration": 2.662271499633789 + }, + { + "auxiliary_loss_clip": 0.0107269, + "auxiliary_loss_mlp": 0.01037728, + "balance_loss_clip": 1.03594911, + "balance_loss_mlp": 1.02518129, + "epoch": 0.6542011122801744, + "flos": 17165301177600.0, + "grad_norm": 2.432806470924959, + "language_loss": 0.8439703, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.86507452, + "num_input_tokens_seen": 234970810, + "step": 10881, + "time_per_iteration": 4.583907127380371 + }, + { + "auxiliary_loss_clip": 0.01081012, + "auxiliary_loss_mlp": 0.01035122, + "balance_loss_clip": 1.03797197, + "balance_loss_mlp": 1.02026868, + "epoch": 0.6542612355328423, + "flos": 18186600579840.0, + "grad_norm": 1.797675187581498, + "language_loss": 0.78180546, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.80296683, + "num_input_tokens_seen": 234989565, + "step": 10882, + "time_per_iteration": 2.7273218631744385 + }, + { + "auxiliary_loss_clip": 0.01117869, + "auxiliary_loss_mlp": 0.01030964, + "balance_loss_clip": 1.04191113, + "balance_loss_mlp": 1.01693869, + "epoch": 0.6543213587855103, + "flos": 19792453916160.0, + "grad_norm": 2.0896373641472716, + "language_loss": 0.82002509, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.84151345, + "num_input_tokens_seen": 235007955, + "step": 10883, + "time_per_iteration": 2.6430859565734863 + }, + { + "auxiliary_loss_clip": 0.01063765, + "auxiliary_loss_mlp": 0.01039023, + "balance_loss_clip": 1.03828621, + "balance_loss_mlp": 1.02518272, + "epoch": 0.6543814820381783, + "flos": 21105096531840.0, + "grad_norm": 2.092498099252334, + "language_loss": 0.85347474, + "learning_rate": 1.127398345803988e-06, + "loss": 0.8745026, + "num_input_tokens_seen": 235024860, + "step": 10884, + "time_per_iteration": 6.071943998336792 + }, + { + "auxiliary_loss_clip": 0.01092231, + "auxiliary_loss_mlp": 0.01036915, + "balance_loss_clip": 1.03901005, + "balance_loss_mlp": 1.02371883, + "epoch": 0.6544416052908463, + "flos": 20194042947840.0, + "grad_norm": 2.4941716916648367, + "language_loss": 0.79124463, + "learning_rate": 1.127047924394715e-06, + "loss": 0.81253612, + "num_input_tokens_seen": 235043815, + "step": 10885, + "time_per_iteration": 2.675748586654663 + }, + { + "auxiliary_loss_clip": 0.01074538, + "auxiliary_loss_mlp": 0.01031139, + "balance_loss_clip": 1.03618622, + "balance_loss_mlp": 1.01794887, + "epoch": 0.6545017285435142, + "flos": 23368258800000.0, + "grad_norm": 1.8639137549782854, + "language_loss": 0.72277772, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.7438345, + "num_input_tokens_seen": 235062985, + "step": 10886, + "time_per_iteration": 2.750396490097046 + }, + { + "auxiliary_loss_clip": 0.0109826, + "auxiliary_loss_mlp": 0.01029695, + "balance_loss_clip": 1.04163647, + "balance_loss_mlp": 1.01777434, + "epoch": 0.6545618517961822, + "flos": 19134714021120.0, + "grad_norm": 1.7570103692481698, + "language_loss": 0.77918178, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.80046129, + "num_input_tokens_seen": 235081670, + "step": 10887, + "time_per_iteration": 2.6504671573638916 + }, + { + "auxiliary_loss_clip": 0.01087762, + "auxiliary_loss_mlp": 0.01034009, + "balance_loss_clip": 1.03893995, + "balance_loss_mlp": 1.02152205, + "epoch": 0.6546219750488501, + "flos": 14938624149120.0, + "grad_norm": 3.1473995780079567, + "language_loss": 0.78907198, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.81028962, + "num_input_tokens_seen": 235098510, + "step": 10888, + "time_per_iteration": 4.194061040878296 + }, + { + "auxiliary_loss_clip": 0.01101212, + "auxiliary_loss_mlp": 0.01030296, + "balance_loss_clip": 1.03934383, + "balance_loss_mlp": 1.0185833, + "epoch": 0.6546820983015181, + "flos": 36320518886400.0, + "grad_norm": 1.6496831253156983, + "language_loss": 0.66765958, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.68897462, + "num_input_tokens_seen": 235119990, + "step": 10889, + "time_per_iteration": 2.784081220626831 + }, + { + "auxiliary_loss_clip": 0.01087306, + "auxiliary_loss_mlp": 0.01041216, + "balance_loss_clip": 1.03762484, + "balance_loss_mlp": 1.02561128, + "epoch": 0.654742221554186, + "flos": 20411446014720.0, + "grad_norm": 1.423388332820949, + "language_loss": 0.7975992, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.81888443, + "num_input_tokens_seen": 235139255, + "step": 10890, + "time_per_iteration": 2.630934000015259 + }, + { + "auxiliary_loss_clip": 0.01103288, + "auxiliary_loss_mlp": 0.00771276, + "balance_loss_clip": 1.0388689, + "balance_loss_mlp": 1.00018215, + "epoch": 0.6548023448068541, + "flos": 24863650836480.0, + "grad_norm": 4.747441832744551, + "language_loss": 0.66281724, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.6815629, + "num_input_tokens_seen": 235158455, + "step": 10891, + "time_per_iteration": 2.7071638107299805 + }, + { + "auxiliary_loss_clip": 0.01100507, + "auxiliary_loss_mlp": 0.01034408, + "balance_loss_clip": 1.03802693, + "balance_loss_mlp": 1.02253485, + "epoch": 0.654862468059522, + "flos": 21427573858560.0, + "grad_norm": 1.8230572175778426, + "language_loss": 0.79398739, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.81533659, + "num_input_tokens_seen": 235177350, + "step": 10892, + "time_per_iteration": 2.7039225101470947 + }, + { + "auxiliary_loss_clip": 0.01109845, + "auxiliary_loss_mlp": 0.01032726, + "balance_loss_clip": 1.04345989, + "balance_loss_mlp": 1.01996517, + "epoch": 0.65492259131219, + "flos": 26577846570240.0, + "grad_norm": 1.9941624602256833, + "language_loss": 0.7830174, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.80444312, + "num_input_tokens_seen": 235196435, + "step": 10893, + "time_per_iteration": 2.6736834049224854 + }, + { + "auxiliary_loss_clip": 0.01119127, + "auxiliary_loss_mlp": 0.01033009, + "balance_loss_clip": 1.04234505, + "balance_loss_mlp": 1.01919901, + "epoch": 0.6549827145648579, + "flos": 21501334437120.0, + "grad_norm": 1.6280761795880925, + "language_loss": 0.70089674, + "learning_rate": 1.123895622914766e-06, + "loss": 0.72241807, + "num_input_tokens_seen": 235215430, + "step": 10894, + "time_per_iteration": 2.5782406330108643 + }, + { + "auxiliary_loss_clip": 0.01108084, + "auxiliary_loss_mlp": 0.01033898, + "balance_loss_clip": 1.03990614, + "balance_loss_mlp": 1.02057683, + "epoch": 0.6550428378175259, + "flos": 22594275515520.0, + "grad_norm": 3.549181275643373, + "language_loss": 0.63655615, + "learning_rate": 1.123545533127549e-06, + "loss": 0.65797597, + "num_input_tokens_seen": 235232015, + "step": 10895, + "time_per_iteration": 2.629176139831543 + }, + { + "auxiliary_loss_clip": 0.0109961, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.03651488, + "balance_loss_mlp": 1.02231681, + "epoch": 0.655102961070194, + "flos": 12823809050880.0, + "grad_norm": 1.94601425933446, + "language_loss": 0.78524303, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.80658519, + "num_input_tokens_seen": 235248115, + "step": 10896, + "time_per_iteration": 2.5840821266174316 + }, + { + "auxiliary_loss_clip": 0.01092224, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.04114115, + "balance_loss_mlp": 1.02101183, + "epoch": 0.6551630843228619, + "flos": 24791075406720.0, + "grad_norm": 1.3806195961019156, + "language_loss": 0.70286167, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.72411478, + "num_input_tokens_seen": 235270785, + "step": 10897, + "time_per_iteration": 2.7511391639709473 + }, + { + "auxiliary_loss_clip": 0.01117369, + "auxiliary_loss_mlp": 0.01034749, + "balance_loss_clip": 1.0412488, + "balance_loss_mlp": 1.02182722, + "epoch": 0.6552232075755299, + "flos": 16724461559040.0, + "grad_norm": 1.8561946448451885, + "language_loss": 0.75477493, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.77629614, + "num_input_tokens_seen": 235287905, + "step": 10898, + "time_per_iteration": 2.5865721702575684 + }, + { + "auxiliary_loss_clip": 0.0109408, + "auxiliary_loss_mlp": 0.01033257, + "balance_loss_clip": 1.03979027, + "balance_loss_mlp": 1.0210743, + "epoch": 0.6552833308281978, + "flos": 22016473338240.0, + "grad_norm": 3.2174058784853634, + "language_loss": 0.73745394, + "learning_rate": 1.122145506463827e-06, + "loss": 0.75872725, + "num_input_tokens_seen": 235305525, + "step": 10899, + "time_per_iteration": 2.6415457725524902 + }, + { + "auxiliary_loss_clip": 0.01092854, + "auxiliary_loss_mlp": 0.0103035, + "balance_loss_clip": 1.0398674, + "balance_loss_mlp": 1.0178864, + "epoch": 0.6553434540808658, + "flos": 24863399441280.0, + "grad_norm": 1.7775828030787661, + "language_loss": 0.5608502, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.58208227, + "num_input_tokens_seen": 235324415, + "step": 10900, + "time_per_iteration": 2.6782078742980957 + }, + { + "auxiliary_loss_clip": 0.0110767, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.04541218, + "balance_loss_mlp": 1.02259791, + "epoch": 0.6554035773335337, + "flos": 23221060865280.0, + "grad_norm": 1.7239848151303507, + "language_loss": 0.76706004, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.78849864, + "num_input_tokens_seen": 235341595, + "step": 10901, + "time_per_iteration": 2.6912708282470703 + }, + { + "auxiliary_loss_clip": 0.01116025, + "auxiliary_loss_mlp": 0.01030659, + "balance_loss_clip": 1.04228628, + "balance_loss_mlp": 1.01741457, + "epoch": 0.6554637005862017, + "flos": 22783597125120.0, + "grad_norm": 1.8287933063935295, + "language_loss": 0.73178118, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.7532481, + "num_input_tokens_seen": 235361700, + "step": 10902, + "time_per_iteration": 2.602215528488159 + }, + { + "auxiliary_loss_clip": 0.01116289, + "auxiliary_loss_mlp": 0.0103284, + "balance_loss_clip": 1.04363585, + "balance_loss_mlp": 1.020293, + "epoch": 0.6555238238388696, + "flos": 21507224267520.0, + "grad_norm": 2.8262041202402806, + "language_loss": 0.68081355, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.7023049, + "num_input_tokens_seen": 235382065, + "step": 10903, + "time_per_iteration": 2.6410489082336426 + }, + { + "auxiliary_loss_clip": 0.01095479, + "auxiliary_loss_mlp": 0.00772021, + "balance_loss_clip": 1.0381676, + "balance_loss_mlp": 1.00024486, + "epoch": 0.6555839470915377, + "flos": 30519473518080.0, + "grad_norm": 1.6908937242491595, + "language_loss": 0.66551757, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.6841926, + "num_input_tokens_seen": 235402130, + "step": 10904, + "time_per_iteration": 2.790280342102051 + }, + { + "auxiliary_loss_clip": 0.01106834, + "auxiliary_loss_mlp": 0.01041591, + "balance_loss_clip": 1.0399909, + "balance_loss_mlp": 1.02686858, + "epoch": 0.6556440703442056, + "flos": 24642943718400.0, + "grad_norm": 1.7449585350931947, + "language_loss": 0.90588987, + "learning_rate": 1.120046465383464e-06, + "loss": 0.92737412, + "num_input_tokens_seen": 235420435, + "step": 10905, + "time_per_iteration": 2.6630730628967285 + }, + { + "auxiliary_loss_clip": 0.01101239, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.0387404, + "balance_loss_mlp": 1.02384353, + "epoch": 0.6557041935968736, + "flos": 23732464752000.0, + "grad_norm": 1.68326433592196, + "language_loss": 0.75189042, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.77326465, + "num_input_tokens_seen": 235439960, + "step": 10906, + "time_per_iteration": 2.808749198913574 + }, + { + "auxiliary_loss_clip": 0.01120903, + "auxiliary_loss_mlp": 0.01039658, + "balance_loss_clip": 1.04417121, + "balance_loss_mlp": 1.02620482, + "epoch": 0.6557643168495415, + "flos": 11102753819520.0, + "grad_norm": 2.6025393297474753, + "language_loss": 0.74533153, + "learning_rate": 1.119347051825267e-06, + "loss": 0.76693714, + "num_input_tokens_seen": 235457495, + "step": 10907, + "time_per_iteration": 2.593248128890991 + }, + { + "auxiliary_loss_clip": 0.01074084, + "auxiliary_loss_mlp": 0.01033534, + "balance_loss_clip": 1.03740346, + "balance_loss_mlp": 1.01887107, + "epoch": 0.6558244401022095, + "flos": 30191034533760.0, + "grad_norm": 1.4237999067012654, + "language_loss": 0.72347319, + "learning_rate": 1.118997395131211e-06, + "loss": 0.74454939, + "num_input_tokens_seen": 235479525, + "step": 10908, + "time_per_iteration": 2.82675838470459 + }, + { + "auxiliary_loss_clip": 0.01119224, + "auxiliary_loss_mlp": 0.01039345, + "balance_loss_clip": 1.04407787, + "balance_loss_mlp": 1.02501035, + "epoch": 0.6558845633548775, + "flos": 17931060247680.0, + "grad_norm": 2.1324653040060206, + "language_loss": 0.81237155, + "learning_rate": 1.118647771844861e-06, + "loss": 0.83395725, + "num_input_tokens_seen": 235496305, + "step": 10909, + "time_per_iteration": 2.5471675395965576 + }, + { + "auxiliary_loss_clip": 0.01118639, + "auxiliary_loss_mlp": 0.01037445, + "balance_loss_clip": 1.04318082, + "balance_loss_mlp": 1.02355766, + "epoch": 0.6559446866075455, + "flos": 21904144531200.0, + "grad_norm": 2.016309466872126, + "language_loss": 0.6391021, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.66066295, + "num_input_tokens_seen": 235512545, + "step": 10910, + "time_per_iteration": 2.5981180667877197 + }, + { + "auxiliary_loss_clip": 0.01094899, + "auxiliary_loss_mlp": 0.01035785, + "balance_loss_clip": 1.03948653, + "balance_loss_mlp": 1.02022815, + "epoch": 0.6560048098602135, + "flos": 14127976056960.0, + "grad_norm": 3.167812850459713, + "language_loss": 0.75653553, + "learning_rate": 1.117948625548313e-06, + "loss": 0.7778424, + "num_input_tokens_seen": 235526045, + "step": 10911, + "time_per_iteration": 2.6054794788360596 + }, + { + "auxiliary_loss_clip": 0.01110901, + "auxiliary_loss_mlp": 0.01032832, + "balance_loss_clip": 1.03947508, + "balance_loss_mlp": 1.02068496, + "epoch": 0.6560649331128814, + "flos": 18807567926400.0, + "grad_norm": 1.6537881729795834, + "language_loss": 0.75314403, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.77458137, + "num_input_tokens_seen": 235545285, + "step": 10912, + "time_per_iteration": 2.5621368885040283 + }, + { + "auxiliary_loss_clip": 0.01080239, + "auxiliary_loss_mlp": 0.00773337, + "balance_loss_clip": 1.04076517, + "balance_loss_mlp": 1.00024402, + "epoch": 0.6561250563655494, + "flos": 17053618815360.0, + "grad_norm": 1.7152126223100395, + "language_loss": 0.77399373, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.79252946, + "num_input_tokens_seen": 235563150, + "step": 10913, + "time_per_iteration": 2.6770215034484863 + }, + { + "auxiliary_loss_clip": 0.01082486, + "auxiliary_loss_mlp": 0.01031683, + "balance_loss_clip": 1.03641891, + "balance_loss_mlp": 1.0197978, + "epoch": 0.6561851796182173, + "flos": 22637656166400.0, + "grad_norm": 1.7806335935721003, + "language_loss": 0.71243644, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.73357815, + "num_input_tokens_seen": 235582535, + "step": 10914, + "time_per_iteration": 2.667307138442993 + }, + { + "auxiliary_loss_clip": 0.01083296, + "auxiliary_loss_mlp": 0.01037173, + "balance_loss_clip": 1.03966224, + "balance_loss_mlp": 1.02398872, + "epoch": 0.6562453028708853, + "flos": 19239213663360.0, + "grad_norm": 1.6513290970886485, + "language_loss": 0.73859835, + "learning_rate": 1.116550734430958e-06, + "loss": 0.75980306, + "num_input_tokens_seen": 235601490, + "step": 10915, + "time_per_iteration": 2.6983346939086914 + }, + { + "auxiliary_loss_clip": 0.01073456, + "auxiliary_loss_mlp": 0.01034672, + "balance_loss_clip": 1.03744984, + "balance_loss_mlp": 1.02053952, + "epoch": 0.6563054261235532, + "flos": 23801305167360.0, + "grad_norm": 1.7082646806866446, + "language_loss": 0.79868412, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.81976539, + "num_input_tokens_seen": 235619165, + "step": 10916, + "time_per_iteration": 2.7007508277893066 + }, + { + "auxiliary_loss_clip": 0.01085821, + "auxiliary_loss_mlp": 0.01034159, + "balance_loss_clip": 1.03703237, + "balance_loss_mlp": 1.02174914, + "epoch": 0.6563655493762213, + "flos": 19240039676160.0, + "grad_norm": 7.0038314681057265, + "language_loss": 0.76291168, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.78411144, + "num_input_tokens_seen": 235637115, + "step": 10917, + "time_per_iteration": 2.6554038524627686 + }, + { + "auxiliary_loss_clip": 0.01114484, + "auxiliary_loss_mlp": 0.00770758, + "balance_loss_clip": 1.04096055, + "balance_loss_mlp": 1.00018668, + "epoch": 0.6564256726288892, + "flos": 25556439427200.0, + "grad_norm": 1.7912511669436304, + "language_loss": 0.69599342, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.7148459, + "num_input_tokens_seen": 235656330, + "step": 10918, + "time_per_iteration": 2.658940315246582 + }, + { + "auxiliary_loss_clip": 0.0108095, + "auxiliary_loss_mlp": 0.01038011, + "balance_loss_clip": 1.04091477, + "balance_loss_mlp": 1.02542877, + "epoch": 0.6564857958815572, + "flos": 22200623389440.0, + "grad_norm": 1.5721628638219425, + "language_loss": 0.76389003, + "learning_rate": 1.115153379321332e-06, + "loss": 0.78507966, + "num_input_tokens_seen": 235674510, + "step": 10919, + "time_per_iteration": 2.8179666996002197 + }, + { + "auxiliary_loss_clip": 0.01024309, + "auxiliary_loss_mlp": 0.00751654, + "balance_loss_clip": 1.01056981, + "balance_loss_mlp": 0.99972719, + "epoch": 0.6565459191342251, + "flos": 58123144604160.0, + "grad_norm": 0.7147618349733724, + "language_loss": 0.52982259, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.54758221, + "num_input_tokens_seen": 235735050, + "step": 10920, + "time_per_iteration": 4.864136457443237 + }, + { + "auxiliary_loss_clip": 0.01102705, + "auxiliary_loss_mlp": 0.01032103, + "balance_loss_clip": 1.03955173, + "balance_loss_mlp": 1.01899636, + "epoch": 0.6566060423868931, + "flos": 30809631582720.0, + "grad_norm": 1.4970588684029809, + "language_loss": 0.65309536, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.67444336, + "num_input_tokens_seen": 235757545, + "step": 10921, + "time_per_iteration": 2.6399025917053223 + }, + { + "auxiliary_loss_clip": 0.01088773, + "auxiliary_loss_mlp": 0.01042354, + "balance_loss_clip": 1.03777099, + "balance_loss_mlp": 1.02691627, + "epoch": 0.6566661656395612, + "flos": 23367432787200.0, + "grad_norm": 1.7149236463781705, + "language_loss": 0.81306088, + "learning_rate": 1.114105715254205e-06, + "loss": 0.83437216, + "num_input_tokens_seen": 235777265, + "step": 10922, + "time_per_iteration": 2.6043496131896973 + }, + { + "auxiliary_loss_clip": 0.0105706, + "auxiliary_loss_mlp": 0.00773782, + "balance_loss_clip": 1.03730524, + "balance_loss_mlp": 1.00019729, + "epoch": 0.6567262888922291, + "flos": 25735597488000.0, + "grad_norm": 1.8848622596547697, + "language_loss": 0.71114737, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.72945583, + "num_input_tokens_seen": 235796565, + "step": 10923, + "time_per_iteration": 4.080937385559082 + }, + { + "auxiliary_loss_clip": 0.01080403, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.04141772, + "balance_loss_mlp": 1.02234805, + "epoch": 0.6567864121448971, + "flos": 17123716206720.0, + "grad_norm": 1.9659077727339813, + "language_loss": 0.80819428, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.82934988, + "num_input_tokens_seen": 235814805, + "step": 10924, + "time_per_iteration": 4.207550287246704 + }, + { + "auxiliary_loss_clip": 0.01098058, + "auxiliary_loss_mlp": 0.01028766, + "balance_loss_clip": 1.03715539, + "balance_loss_mlp": 1.01637435, + "epoch": 0.656846535397565, + "flos": 22419319345920.0, + "grad_norm": 4.832574898603098, + "language_loss": 0.7250914, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.74635959, + "num_input_tokens_seen": 235833405, + "step": 10925, + "time_per_iteration": 2.637345790863037 + }, + { + "auxiliary_loss_clip": 0.01101634, + "auxiliary_loss_mlp": 0.01030115, + "balance_loss_clip": 1.03830063, + "balance_loss_mlp": 1.01710916, + "epoch": 0.656906658650233, + "flos": 17704535126400.0, + "grad_norm": 2.262744420383479, + "language_loss": 0.72445238, + "learning_rate": 1.112709300197942e-06, + "loss": 0.74576986, + "num_input_tokens_seen": 235848530, + "step": 10926, + "time_per_iteration": 2.6307756900787354 + }, + { + "auxiliary_loss_clip": 0.0106886, + "auxiliary_loss_mlp": 0.01034286, + "balance_loss_clip": 1.03765988, + "balance_loss_mlp": 1.02080905, + "epoch": 0.6569667819029009, + "flos": 21175158009600.0, + "grad_norm": 1.7200943700135627, + "language_loss": 0.72494638, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.74597794, + "num_input_tokens_seen": 235867225, + "step": 10927, + "time_per_iteration": 4.311558246612549 + }, + { + "auxiliary_loss_clip": 0.01005194, + "auxiliary_loss_mlp": 0.01007222, + "balance_loss_clip": 1.01187444, + "balance_loss_mlp": 1.00603569, + "epoch": 0.6570269051555689, + "flos": 68761897511040.0, + "grad_norm": 0.7266677598408974, + "language_loss": 0.64416504, + "learning_rate": 1.112011294493775e-06, + "loss": 0.66428924, + "num_input_tokens_seen": 235932925, + "step": 10928, + "time_per_iteration": 3.2423789501190186 + }, + { + "auxiliary_loss_clip": 0.01100905, + "auxiliary_loss_mlp": 0.01034907, + "balance_loss_clip": 1.03707099, + "balance_loss_mlp": 1.02176392, + "epoch": 0.6570870284082369, + "flos": 26319289495680.0, + "grad_norm": 1.7795232563837846, + "language_loss": 0.77698803, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.79834616, + "num_input_tokens_seen": 235952680, + "step": 10929, + "time_per_iteration": 2.6381664276123047 + }, + { + "auxiliary_loss_clip": 0.01078467, + "auxiliary_loss_mlp": 0.01030363, + "balance_loss_clip": 1.03687572, + "balance_loss_mlp": 1.01705337, + "epoch": 0.6571471516609049, + "flos": 26174749167360.0, + "grad_norm": 2.3625903698766826, + "language_loss": 0.65178704, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.67287529, + "num_input_tokens_seen": 235972075, + "step": 10930, + "time_per_iteration": 2.7424116134643555 + }, + { + "auxiliary_loss_clip": 0.01063728, + "auxiliary_loss_mlp": 0.01033942, + "balance_loss_clip": 1.03379416, + "balance_loss_mlp": 1.02037621, + "epoch": 0.6572072749135728, + "flos": 20376253664640.0, + "grad_norm": 1.690752691180261, + "language_loss": 0.70888293, + "learning_rate": 1.110964538515258e-06, + "loss": 0.72985959, + "num_input_tokens_seen": 235990340, + "step": 10931, + "time_per_iteration": 2.7526936531066895 + }, + { + "auxiliary_loss_clip": 0.01070712, + "auxiliary_loss_mlp": 0.01038764, + "balance_loss_clip": 1.03789568, + "balance_loss_mlp": 1.02569246, + "epoch": 0.6572673981662408, + "flos": 17128744110720.0, + "grad_norm": 2.7494594651926763, + "language_loss": 0.68903434, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.71012914, + "num_input_tokens_seen": 236007470, + "step": 10932, + "time_per_iteration": 2.699676036834717 + }, + { + "auxiliary_loss_clip": 0.01088862, + "auxiliary_loss_mlp": 0.0077114, + "balance_loss_clip": 1.03621304, + "balance_loss_mlp": 1.00018311, + "epoch": 0.6573275214189087, + "flos": 41275113281280.0, + "grad_norm": 1.7103641293724658, + "language_loss": 0.80041671, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.8190167, + "num_input_tokens_seen": 236029030, + "step": 10933, + "time_per_iteration": 2.884944200515747 + }, + { + "auxiliary_loss_clip": 0.01066755, + "auxiliary_loss_mlp": 0.01038188, + "balance_loss_clip": 1.03784192, + "balance_loss_mlp": 1.02397847, + "epoch": 0.6573876446715767, + "flos": 22890143842560.0, + "grad_norm": 1.944468432565168, + "language_loss": 0.73796332, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.75901282, + "num_input_tokens_seen": 236047160, + "step": 10934, + "time_per_iteration": 2.689169406890869 + }, + { + "auxiliary_loss_clip": 0.01097012, + "auxiliary_loss_mlp": 0.01038397, + "balance_loss_clip": 1.0375042, + "balance_loss_mlp": 1.02410352, + "epoch": 0.6574477679242448, + "flos": 44018150273280.0, + "grad_norm": 1.510657094056813, + "language_loss": 0.76061821, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.78197235, + "num_input_tokens_seen": 236069215, + "step": 10935, + "time_per_iteration": 2.798928737640381 + }, + { + "auxiliary_loss_clip": 0.01075783, + "auxiliary_loss_mlp": 0.01039916, + "balance_loss_clip": 1.03844082, + "balance_loss_mlp": 1.02543783, + "epoch": 0.6575078911769127, + "flos": 24571517523840.0, + "grad_norm": 1.6442083694725653, + "language_loss": 0.78311378, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.80427074, + "num_input_tokens_seen": 236088335, + "step": 10936, + "time_per_iteration": 2.718698263168335 + }, + { + "auxiliary_loss_clip": 0.01065449, + "auxiliary_loss_mlp": 0.01032937, + "balance_loss_clip": 1.0363667, + "balance_loss_mlp": 1.02052104, + "epoch": 0.6575680144295807, + "flos": 20924035050240.0, + "grad_norm": 1.7517271883506782, + "language_loss": 0.68920904, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.71019292, + "num_input_tokens_seen": 236108540, + "step": 10937, + "time_per_iteration": 2.7036542892456055 + }, + { + "auxiliary_loss_clip": 0.01087739, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.03832746, + "balance_loss_mlp": 1.01813471, + "epoch": 0.6576281376822486, + "flos": 10925642833920.0, + "grad_norm": 2.652931448732022, + "language_loss": 0.6823296, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.70351958, + "num_input_tokens_seen": 236124495, + "step": 10938, + "time_per_iteration": 2.6599676609039307 + }, + { + "auxiliary_loss_clip": 0.01085941, + "auxiliary_loss_mlp": 0.01033843, + "balance_loss_clip": 1.03766704, + "balance_loss_mlp": 1.02009773, + "epoch": 0.6576882609349166, + "flos": 19281552819840.0, + "grad_norm": 3.453384337403157, + "language_loss": 0.71610057, + "learning_rate": 1.108174673550927e-06, + "loss": 0.73729843, + "num_input_tokens_seen": 236142550, + "step": 10939, + "time_per_iteration": 2.650425672531128 + }, + { + "auxiliary_loss_clip": 0.01092138, + "auxiliary_loss_mlp": 0.00771382, + "balance_loss_clip": 1.03735209, + "balance_loss_mlp": 1.00023603, + "epoch": 0.6577483841875845, + "flos": 20220544206720.0, + "grad_norm": 2.2437103704575345, + "language_loss": 0.77729875, + "learning_rate": 1.107826092473037e-06, + "loss": 0.79593396, + "num_input_tokens_seen": 236156620, + "step": 10940, + "time_per_iteration": 2.669313669204712 + }, + { + "auxiliary_loss_clip": 0.01071259, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.03549123, + "balance_loss_mlp": 1.01780236, + "epoch": 0.6578085074402525, + "flos": 34751078962560.0, + "grad_norm": 2.3851655144351818, + "language_loss": 0.68552613, + "learning_rate": 1.107477545226471e-06, + "loss": 0.70655704, + "num_input_tokens_seen": 236177095, + "step": 10941, + "time_per_iteration": 2.8323819637298584 + }, + { + "auxiliary_loss_clip": 0.01098124, + "auxiliary_loss_mlp": 0.00771304, + "balance_loss_clip": 1.03532124, + "balance_loss_mlp": 1.00012338, + "epoch": 0.6578686306929205, + "flos": 23470998675840.0, + "grad_norm": 2.4287401057679436, + "language_loss": 0.68286288, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.70155716, + "num_input_tokens_seen": 236194695, + "step": 10942, + "time_per_iteration": 2.662338972091675 + }, + { + "auxiliary_loss_clip": 0.01082673, + "auxiliary_loss_mlp": 0.01036222, + "balance_loss_clip": 1.03803504, + "balance_loss_mlp": 1.02132106, + "epoch": 0.6579287539455885, + "flos": 18077073033600.0, + "grad_norm": 1.9182303150374724, + "language_loss": 0.71618617, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.73737514, + "num_input_tokens_seen": 236213885, + "step": 10943, + "time_per_iteration": 2.6217944622039795 + }, + { + "auxiliary_loss_clip": 0.01070671, + "auxiliary_loss_mlp": 0.01032935, + "balance_loss_clip": 1.03640389, + "balance_loss_mlp": 1.01936865, + "epoch": 0.6579888771982564, + "flos": 28661383900800.0, + "grad_norm": 1.8289069022809952, + "language_loss": 0.59149086, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.61252689, + "num_input_tokens_seen": 236237315, + "step": 10944, + "time_per_iteration": 2.8202292919158936 + }, + { + "auxiliary_loss_clip": 0.01109311, + "auxiliary_loss_mlp": 0.01034463, + "balance_loss_clip": 1.04082966, + "balance_loss_mlp": 1.02081347, + "epoch": 0.6580490004509244, + "flos": 25046543911680.0, + "grad_norm": 1.5174772565974388, + "language_loss": 0.7224496, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.74388736, + "num_input_tokens_seen": 236256345, + "step": 10945, + "time_per_iteration": 2.658428430557251 + }, + { + "auxiliary_loss_clip": 0.0109325, + "auxiliary_loss_mlp": 0.0102876, + "balance_loss_clip": 1.04045701, + "balance_loss_mlp": 1.0164274, + "epoch": 0.6581091237035923, + "flos": 43508793461760.0, + "grad_norm": 1.5303954795060517, + "language_loss": 0.70540607, + "learning_rate": 1.105735316926046e-06, + "loss": 0.72662616, + "num_input_tokens_seen": 236281890, + "step": 10946, + "time_per_iteration": 2.859764814376831 + }, + { + "auxiliary_loss_clip": 0.01103097, + "auxiliary_loss_mlp": 0.01034987, + "balance_loss_clip": 1.04042983, + "balance_loss_mlp": 1.02167702, + "epoch": 0.6581692469562603, + "flos": 22415404763520.0, + "grad_norm": 2.072130981046482, + "language_loss": 0.82211119, + "learning_rate": 1.105386972944934e-06, + "loss": 0.84349203, + "num_input_tokens_seen": 236298370, + "step": 10947, + "time_per_iteration": 2.630653142929077 + }, + { + "auxiliary_loss_clip": 0.01056612, + "auxiliary_loss_mlp": 0.00771489, + "balance_loss_clip": 1.0330416, + "balance_loss_mlp": 1.0001905, + "epoch": 0.6582293702089284, + "flos": 24859772167680.0, + "grad_norm": 1.881732401940151, + "language_loss": 0.77187896, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.79015994, + "num_input_tokens_seen": 236317380, + "step": 10948, + "time_per_iteration": 2.7764172554016113 + }, + { + "auxiliary_loss_clip": 0.01105319, + "auxiliary_loss_mlp": 0.01030601, + "balance_loss_clip": 1.04180968, + "balance_loss_mlp": 1.01791072, + "epoch": 0.6582894934615963, + "flos": 23039676161280.0, + "grad_norm": 2.2574860884284793, + "language_loss": 0.79085296, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.81221217, + "num_input_tokens_seen": 236336210, + "step": 10949, + "time_per_iteration": 2.7244157791137695 + }, + { + "auxiliary_loss_clip": 0.0102471, + "auxiliary_loss_mlp": 0.01003119, + "balance_loss_clip": 1.01120281, + "balance_loss_mlp": 1.00195682, + "epoch": 0.6583496167142643, + "flos": 72551980978560.0, + "grad_norm": 0.7330189150463328, + "language_loss": 0.6181432, + "learning_rate": 1.104342144597323e-06, + "loss": 0.63842142, + "num_input_tokens_seen": 236403090, + "step": 10950, + "time_per_iteration": 3.2641515731811523 + }, + { + "auxiliary_loss_clip": 0.01100983, + "auxiliary_loss_mlp": 0.01032251, + "balance_loss_clip": 1.0385226, + "balance_loss_mlp": 1.02026415, + "epoch": 0.6584097399669322, + "flos": 13078846592640.0, + "grad_norm": 2.3980091828088144, + "language_loss": 0.67179585, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.69312811, + "num_input_tokens_seen": 236420475, + "step": 10951, + "time_per_iteration": 2.619748115539551 + }, + { + "auxiliary_loss_clip": 0.01100086, + "auxiliary_loss_mlp": 0.01034456, + "balance_loss_clip": 1.03776073, + "balance_loss_mlp": 1.02158761, + "epoch": 0.6584698632196002, + "flos": 28693164458880.0, + "grad_norm": 1.4089441578543043, + "language_loss": 0.76300871, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.78435409, + "num_input_tokens_seen": 236441915, + "step": 10952, + "time_per_iteration": 2.7250633239746094 + }, + { + "auxiliary_loss_clip": 0.0111349, + "auxiliary_loss_mlp": 0.0103139, + "balance_loss_clip": 1.04090011, + "balance_loss_mlp": 1.018188, + "epoch": 0.6585299864722681, + "flos": 14319272914560.0, + "grad_norm": 1.8443663213164305, + "language_loss": 0.73402822, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.75547707, + "num_input_tokens_seen": 236460340, + "step": 10953, + "time_per_iteration": 2.566080331802368 + }, + { + "auxiliary_loss_clip": 0.01082894, + "auxiliary_loss_mlp": 0.0104307, + "balance_loss_clip": 1.03907454, + "balance_loss_mlp": 1.02891934, + "epoch": 0.6585901097249361, + "flos": 26797907243520.0, + "grad_norm": 2.1744380051357, + "language_loss": 0.78487962, + "learning_rate": 1.102949515683546e-06, + "loss": 0.80613929, + "num_input_tokens_seen": 236478280, + "step": 10954, + "time_per_iteration": 2.724165678024292 + }, + { + "auxiliary_loss_clip": 0.01088368, + "auxiliary_loss_mlp": 0.01037943, + "balance_loss_clip": 1.03427434, + "balance_loss_mlp": 1.0242219, + "epoch": 0.658650232977604, + "flos": 18733124989440.0, + "grad_norm": 2.555140209313338, + "language_loss": 0.69544291, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.71670604, + "num_input_tokens_seen": 236493225, + "step": 10955, + "time_per_iteration": 2.6414260864257812 + }, + { + "auxiliary_loss_clip": 0.01082497, + "auxiliary_loss_mlp": 0.01033927, + "balance_loss_clip": 1.03517938, + "balance_loss_mlp": 1.02191079, + "epoch": 0.6587103562302721, + "flos": 24753440931840.0, + "grad_norm": 2.1706102915019434, + "language_loss": 0.80620337, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.82736766, + "num_input_tokens_seen": 236514420, + "step": 10956, + "time_per_iteration": 2.679706335067749 + }, + { + "auxiliary_loss_clip": 0.01104337, + "auxiliary_loss_mlp": 0.01038231, + "balance_loss_clip": 1.04236186, + "balance_loss_mlp": 1.02459347, + "epoch": 0.65877047948294, + "flos": 22346133384960.0, + "grad_norm": 2.024941431440732, + "language_loss": 0.81428325, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.83570898, + "num_input_tokens_seen": 236532785, + "step": 10957, + "time_per_iteration": 2.7104432582855225 + }, + { + "auxiliary_loss_clip": 0.01091788, + "auxiliary_loss_mlp": 0.01030278, + "balance_loss_clip": 1.04065537, + "balance_loss_mlp": 1.01872087, + "epoch": 0.658830602735608, + "flos": 45180542298240.0, + "grad_norm": 1.6614910080791612, + "language_loss": 0.75887316, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.78009385, + "num_input_tokens_seen": 236553330, + "step": 10958, + "time_per_iteration": 2.829876661300659 + }, + { + "auxiliary_loss_clip": 0.01070256, + "auxiliary_loss_mlp": 0.01040301, + "balance_loss_clip": 1.0364852, + "balance_loss_mlp": 1.02578747, + "epoch": 0.6588907259882759, + "flos": 19901622326400.0, + "grad_norm": 1.76623385890274, + "language_loss": 0.74976909, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.77087468, + "num_input_tokens_seen": 236572960, + "step": 10959, + "time_per_iteration": 4.3221375942230225 + }, + { + "auxiliary_loss_clip": 0.01103616, + "auxiliary_loss_mlp": 0.01030743, + "balance_loss_clip": 1.03967154, + "balance_loss_mlp": 1.01826799, + "epoch": 0.6589508492409439, + "flos": 24133766474880.0, + "grad_norm": 1.6028003647190308, + "language_loss": 0.6497494, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.67109299, + "num_input_tokens_seen": 236594090, + "step": 10960, + "time_per_iteration": 2.685056209564209 + }, + { + "auxiliary_loss_clip": 0.01119947, + "auxiliary_loss_mlp": 0.01034422, + "balance_loss_clip": 1.04166222, + "balance_loss_mlp": 1.0203135, + "epoch": 0.659010972493612, + "flos": 18222906251520.0, + "grad_norm": 3.156226944144234, + "language_loss": 0.81759185, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.83913553, + "num_input_tokens_seen": 236610190, + "step": 10961, + "time_per_iteration": 2.6374056339263916 + }, + { + "auxiliary_loss_clip": 0.01076452, + "auxiliary_loss_mlp": 0.01033701, + "balance_loss_clip": 1.03810012, + "balance_loss_mlp": 1.01989698, + "epoch": 0.6590710957462799, + "flos": 27600007898880.0, + "grad_norm": 1.7436713822775258, + "language_loss": 0.73479664, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.75589824, + "num_input_tokens_seen": 236631575, + "step": 10962, + "time_per_iteration": 4.275976181030273 + }, + { + "auxiliary_loss_clip": 0.0109814, + "auxiliary_loss_mlp": 0.01033187, + "balance_loss_clip": 1.03807235, + "balance_loss_mlp": 1.01996064, + "epoch": 0.6591312189989479, + "flos": 20302959962880.0, + "grad_norm": 1.9404531224692678, + "language_loss": 0.80004346, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.82135677, + "num_input_tokens_seen": 236649815, + "step": 10963, + "time_per_iteration": 4.260782480239868 + }, + { + "auxiliary_loss_clip": 0.01062785, + "auxiliary_loss_mlp": 0.00769293, + "balance_loss_clip": 1.0372498, + "balance_loss_mlp": 1.00011432, + "epoch": 0.6591913422516158, + "flos": 12312943868160.0, + "grad_norm": 1.8441045997478804, + "language_loss": 0.78224564, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.80056643, + "num_input_tokens_seen": 236668335, + "step": 10964, + "time_per_iteration": 2.6830945014953613 + }, + { + "auxiliary_loss_clip": 0.01075287, + "auxiliary_loss_mlp": 0.01040262, + "balance_loss_clip": 1.03438485, + "balance_loss_mlp": 1.02721417, + "epoch": 0.6592514655042838, + "flos": 25884591102720.0, + "grad_norm": 1.683709186180651, + "language_loss": 0.73955643, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.76071191, + "num_input_tokens_seen": 236688945, + "step": 10965, + "time_per_iteration": 2.687619924545288 + }, + { + "auxiliary_loss_clip": 0.01081038, + "auxiliary_loss_mlp": 0.01038566, + "balance_loss_clip": 1.03631306, + "balance_loss_mlp": 1.02412999, + "epoch": 0.6593115887569517, + "flos": 14063624841600.0, + "grad_norm": 2.0913085470177943, + "language_loss": 0.73648584, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.75768185, + "num_input_tokens_seen": 236707055, + "step": 10966, + "time_per_iteration": 2.6525564193725586 + }, + { + "auxiliary_loss_clip": 0.01102724, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.03741455, + "balance_loss_mlp": 1.01951456, + "epoch": 0.6593717120096197, + "flos": 24717925359360.0, + "grad_norm": 1.533295813226106, + "language_loss": 0.76610076, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.78745747, + "num_input_tokens_seen": 236725900, + "step": 10967, + "time_per_iteration": 4.112145900726318 + }, + { + "auxiliary_loss_clip": 0.01023116, + "auxiliary_loss_mlp": 0.0100237, + "balance_loss_clip": 1.00873816, + "balance_loss_mlp": 1.00111854, + "epoch": 0.6594318352622877, + "flos": 55558083502080.0, + "grad_norm": 0.6961608375444348, + "language_loss": 0.48445863, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.50471348, + "num_input_tokens_seen": 236788415, + "step": 10968, + "time_per_iteration": 3.0989933013916016 + }, + { + "auxiliary_loss_clip": 0.01066259, + "auxiliary_loss_mlp": 0.01036629, + "balance_loss_clip": 1.03324318, + "balance_loss_mlp": 1.02168036, + "epoch": 0.6594919585149557, + "flos": 17456931699840.0, + "grad_norm": 1.7813410381881563, + "language_loss": 0.79142725, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.81245613, + "num_input_tokens_seen": 236805155, + "step": 10969, + "time_per_iteration": 2.6929845809936523 + }, + { + "auxiliary_loss_clip": 0.01103958, + "auxiliary_loss_mlp": 0.01031334, + "balance_loss_clip": 1.0396595, + "balance_loss_mlp": 1.01903188, + "epoch": 0.6595520817676236, + "flos": 18223229473920.0, + "grad_norm": 2.1653605986578137, + "language_loss": 0.65524602, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.67659903, + "num_input_tokens_seen": 236824360, + "step": 10970, + "time_per_iteration": 2.5729503631591797 + }, + { + "auxiliary_loss_clip": 0.01098998, + "auxiliary_loss_mlp": 0.01031128, + "balance_loss_clip": 1.03612995, + "balance_loss_mlp": 1.01827741, + "epoch": 0.6596122050202916, + "flos": 22199761463040.0, + "grad_norm": 1.6607954000770715, + "language_loss": 0.7680558, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.78935707, + "num_input_tokens_seen": 236844640, + "step": 10971, + "time_per_iteration": 2.699892997741699 + }, + { + "auxiliary_loss_clip": 0.01045077, + "auxiliary_loss_mlp": 0.01047077, + "balance_loss_clip": 1.03190637, + "balance_loss_mlp": 1.03174686, + "epoch": 0.6596723282729595, + "flos": 14173834746240.0, + "grad_norm": 2.880961149913922, + "language_loss": 0.70055163, + "learning_rate": 1.096689432978629e-06, + "loss": 0.72147322, + "num_input_tokens_seen": 236861160, + "step": 10972, + "time_per_iteration": 2.7359213829040527 + }, + { + "auxiliary_loss_clip": 0.01101135, + "auxiliary_loss_mlp": 0.01025815, + "balance_loss_clip": 1.03941655, + "balance_loss_mlp": 1.01266074, + "epoch": 0.6597324515256275, + "flos": 30553193410560.0, + "grad_norm": 9.926316428888306, + "language_loss": 0.55695325, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.57822275, + "num_input_tokens_seen": 236880465, + "step": 10973, + "time_per_iteration": 2.69612455368042 + }, + { + "auxiliary_loss_clip": 0.01099195, + "auxiliary_loss_mlp": 0.01040169, + "balance_loss_clip": 1.0419203, + "balance_loss_mlp": 1.02660263, + "epoch": 0.6597925747782956, + "flos": 17639860688640.0, + "grad_norm": 2.5012890193080026, + "language_loss": 0.78572869, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.80712223, + "num_input_tokens_seen": 236897730, + "step": 10974, + "time_per_iteration": 2.6455633640289307 + }, + { + "auxiliary_loss_clip": 0.01100482, + "auxiliary_loss_mlp": 0.01037502, + "balance_loss_clip": 1.04022431, + "balance_loss_mlp": 1.02422214, + "epoch": 0.6598526980309635, + "flos": 22819112697600.0, + "grad_norm": 2.251661999993696, + "language_loss": 0.68701649, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.70839626, + "num_input_tokens_seen": 236917300, + "step": 10975, + "time_per_iteration": 2.6761295795440674 + }, + { + "auxiliary_loss_clip": 0.01097399, + "auxiliary_loss_mlp": 0.01032564, + "balance_loss_clip": 1.03912926, + "balance_loss_mlp": 1.02060747, + "epoch": 0.6599128212836315, + "flos": 21068036674560.0, + "grad_norm": 1.6540937029958567, + "language_loss": 0.70881736, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.73011696, + "num_input_tokens_seen": 236935590, + "step": 10976, + "time_per_iteration": 2.5975265502929688 + }, + { + "auxiliary_loss_clip": 0.01083365, + "auxiliary_loss_mlp": 0.01033377, + "balance_loss_clip": 1.03734148, + "balance_loss_mlp": 1.02016902, + "epoch": 0.6599729445362994, + "flos": 22163527618560.0, + "grad_norm": 1.6096140121507374, + "language_loss": 0.67765009, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.69881749, + "num_input_tokens_seen": 236952830, + "step": 10977, + "time_per_iteration": 2.676992177963257 + }, + { + "auxiliary_loss_clip": 0.01079353, + "auxiliary_loss_mlp": 0.01037069, + "balance_loss_clip": 1.03872538, + "balance_loss_mlp": 1.02254331, + "epoch": 0.6600330677889674, + "flos": 18150079426560.0, + "grad_norm": 2.028840451789988, + "language_loss": 0.80975902, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.8309232, + "num_input_tokens_seen": 236971930, + "step": 10978, + "time_per_iteration": 2.670058488845825 + }, + { + "auxiliary_loss_clip": 0.01084138, + "auxiliary_loss_mlp": 0.01037844, + "balance_loss_clip": 1.03935933, + "balance_loss_mlp": 1.02446318, + "epoch": 0.6600931910416353, + "flos": 18150115340160.0, + "grad_norm": 3.3630669376979534, + "language_loss": 0.67552471, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.69674456, + "num_input_tokens_seen": 236989920, + "step": 10979, + "time_per_iteration": 2.6543848514556885 + }, + { + "auxiliary_loss_clip": 0.01082232, + "auxiliary_loss_mlp": 0.01035503, + "balance_loss_clip": 1.03750384, + "balance_loss_mlp": 1.02135265, + "epoch": 0.6601533142943034, + "flos": 17420733768960.0, + "grad_norm": 2.7062652296553793, + "language_loss": 0.7310946, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.75227201, + "num_input_tokens_seen": 237006570, + "step": 10980, + "time_per_iteration": 2.614719867706299 + }, + { + "auxiliary_loss_clip": 0.01075162, + "auxiliary_loss_mlp": 0.01033537, + "balance_loss_clip": 1.0369494, + "balance_loss_mlp": 1.02132368, + "epoch": 0.6602134375469713, + "flos": 28219574615040.0, + "grad_norm": 1.6769637422208983, + "language_loss": 0.72674447, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.74783146, + "num_input_tokens_seen": 237028415, + "step": 10981, + "time_per_iteration": 2.7521674633026123 + }, + { + "auxiliary_loss_clip": 0.01059889, + "auxiliary_loss_mlp": 0.0103708, + "balance_loss_clip": 1.03629518, + "balance_loss_mlp": 1.02407432, + "epoch": 0.6602735607996393, + "flos": 29418056830080.0, + "grad_norm": 2.169047564074697, + "language_loss": 0.68625891, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.70722854, + "num_input_tokens_seen": 237046595, + "step": 10982, + "time_per_iteration": 2.791590690612793 + }, + { + "auxiliary_loss_clip": 0.01102094, + "auxiliary_loss_mlp": 0.01028932, + "balance_loss_clip": 1.03903246, + "balance_loss_mlp": 1.01600397, + "epoch": 0.6603336840523072, + "flos": 18588045957120.0, + "grad_norm": 1.9479050528854345, + "language_loss": 0.69151658, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.71282685, + "num_input_tokens_seen": 237066150, + "step": 10983, + "time_per_iteration": 2.662109851837158 + }, + { + "auxiliary_loss_clip": 0.01102705, + "auxiliary_loss_mlp": 0.01032804, + "balance_loss_clip": 1.03690076, + "balance_loss_mlp": 1.0190587, + "epoch": 0.6603938073049752, + "flos": 33254860913280.0, + "grad_norm": 1.7348773084229319, + "language_loss": 0.70333445, + "learning_rate": 1.092522205413239e-06, + "loss": 0.72468954, + "num_input_tokens_seen": 237087060, + "step": 10984, + "time_per_iteration": 2.732595443725586 + }, + { + "auxiliary_loss_clip": 0.01077924, + "auxiliary_loss_mlp": 0.01038628, + "balance_loss_clip": 1.03689432, + "balance_loss_mlp": 1.02587259, + "epoch": 0.6604539305576431, + "flos": 17384284442880.0, + "grad_norm": 1.6767760179184985, + "language_loss": 0.83797729, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.85914278, + "num_input_tokens_seen": 237103825, + "step": 10985, + "time_per_iteration": 2.654433250427246 + }, + { + "auxiliary_loss_clip": 0.01105556, + "auxiliary_loss_mlp": 0.01034686, + "balance_loss_clip": 1.0407331, + "balance_loss_mlp": 1.02094078, + "epoch": 0.6605140538103111, + "flos": 21251145231360.0, + "grad_norm": 2.384704611416695, + "language_loss": 0.74183935, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.76324177, + "num_input_tokens_seen": 237121740, + "step": 10986, + "time_per_iteration": 2.6019506454467773 + }, + { + "auxiliary_loss_clip": 0.01100549, + "auxiliary_loss_mlp": 0.01029651, + "balance_loss_clip": 1.03883743, + "balance_loss_mlp": 1.01647878, + "epoch": 0.6605741770629792, + "flos": 13881701433600.0, + "grad_norm": 1.9122697335713108, + "language_loss": 0.78908652, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.81038857, + "num_input_tokens_seen": 237139565, + "step": 10987, + "time_per_iteration": 2.5722427368164062 + }, + { + "auxiliary_loss_clip": 0.01008768, + "auxiliary_loss_mlp": 0.01002668, + "balance_loss_clip": 1.00836062, + "balance_loss_mlp": 1.0013566, + "epoch": 0.6606343003156471, + "flos": 69316215171840.0, + "grad_norm": 0.8094121865469099, + "language_loss": 0.541363, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.5614773, + "num_input_tokens_seen": 237201055, + "step": 10988, + "time_per_iteration": 3.272397994995117 + }, + { + "auxiliary_loss_clip": 0.01053267, + "auxiliary_loss_mlp": 0.01036624, + "balance_loss_clip": 1.03639925, + "balance_loss_mlp": 1.02483392, + "epoch": 0.6606944235683151, + "flos": 27272394927360.0, + "grad_norm": 1.725996965304981, + "language_loss": 0.77469909, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.79559803, + "num_input_tokens_seen": 237221805, + "step": 10989, + "time_per_iteration": 2.911433458328247 + }, + { + "auxiliary_loss_clip": 0.01092952, + "auxiliary_loss_mlp": 0.01034291, + "balance_loss_clip": 1.04096937, + "balance_loss_mlp": 1.02172589, + "epoch": 0.660754546820983, + "flos": 13772820332160.0, + "grad_norm": 2.2526328276614542, + "language_loss": 0.77053428, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.7918067, + "num_input_tokens_seen": 237238270, + "step": 10990, + "time_per_iteration": 2.6875393390655518 + }, + { + "auxiliary_loss_clip": 0.01116631, + "auxiliary_loss_mlp": 0.01032525, + "balance_loss_clip": 1.04041815, + "balance_loss_mlp": 1.01960862, + "epoch": 0.660814670073651, + "flos": 15705209232000.0, + "grad_norm": 4.452653785760573, + "language_loss": 0.60725391, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.62874544, + "num_input_tokens_seen": 237255400, + "step": 10991, + "time_per_iteration": 2.581926107406616 + }, + { + "auxiliary_loss_clip": 0.01088945, + "auxiliary_loss_mlp": 0.01037016, + "balance_loss_clip": 1.03823137, + "balance_loss_mlp": 1.02305102, + "epoch": 0.6608747933263189, + "flos": 20850023076480.0, + "grad_norm": 2.2752499400269057, + "language_loss": 0.68441308, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.70567274, + "num_input_tokens_seen": 237273105, + "step": 10992, + "time_per_iteration": 2.6633994579315186 + }, + { + "auxiliary_loss_clip": 0.01102357, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.03874791, + "balance_loss_mlp": 1.02007651, + "epoch": 0.660934916578987, + "flos": 20632117219200.0, + "grad_norm": 1.7286431682231886, + "language_loss": 0.87802613, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.89938569, + "num_input_tokens_seen": 237292650, + "step": 10993, + "time_per_iteration": 2.618743419647217 + }, + { + "auxiliary_loss_clip": 0.01111168, + "auxiliary_loss_mlp": 0.01033402, + "balance_loss_clip": 1.04143643, + "balance_loss_mlp": 1.01837611, + "epoch": 0.6609950398316549, + "flos": 25113588647040.0, + "grad_norm": 1.7020728160261662, + "language_loss": 0.66939056, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.69083625, + "num_input_tokens_seen": 237312865, + "step": 10994, + "time_per_iteration": 2.694892406463623 + }, + { + "auxiliary_loss_clip": 0.01078298, + "auxiliary_loss_mlp": 0.01039322, + "balance_loss_clip": 1.03795636, + "balance_loss_mlp": 1.02551126, + "epoch": 0.6610551630843229, + "flos": 18661196004480.0, + "grad_norm": 2.5249476876910277, + "language_loss": 0.77071732, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.79189348, + "num_input_tokens_seen": 237331210, + "step": 10995, + "time_per_iteration": 2.6232664585113525 + }, + { + "auxiliary_loss_clip": 0.01093968, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.03934228, + "balance_loss_mlp": 1.01722097, + "epoch": 0.6611152863369908, + "flos": 23258192549760.0, + "grad_norm": 1.8438791376239891, + "language_loss": 0.74463415, + "learning_rate": 1.088359933123053e-06, + "loss": 0.76587015, + "num_input_tokens_seen": 237349455, + "step": 10996, + "time_per_iteration": 2.628135919570923 + }, + { + "auxiliary_loss_clip": 0.01115792, + "auxiliary_loss_mlp": 0.01034651, + "balance_loss_clip": 1.04123545, + "balance_loss_mlp": 1.02159739, + "epoch": 0.6611754095896588, + "flos": 22159720776960.0, + "grad_norm": 1.8400435689118084, + "language_loss": 0.69207805, + "learning_rate": 1.088013301487126e-06, + "loss": 0.71358246, + "num_input_tokens_seen": 237367100, + "step": 10997, + "time_per_iteration": 2.5729880332946777 + }, + { + "auxiliary_loss_clip": 0.01095929, + "auxiliary_loss_mlp": 0.01033874, + "balance_loss_clip": 1.0389818, + "balance_loss_mlp": 1.02096367, + "epoch": 0.6612355328423267, + "flos": 13991228979840.0, + "grad_norm": 2.212339587573469, + "language_loss": 0.68443197, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.70572996, + "num_input_tokens_seen": 237384840, + "step": 10998, + "time_per_iteration": 4.240036249160767 + }, + { + "auxiliary_loss_clip": 0.01026396, + "auxiliary_loss_mlp": 0.01003226, + "balance_loss_clip": 1.01201963, + "balance_loss_mlp": 1.00200462, + "epoch": 0.6612956560949947, + "flos": 61453716359040.0, + "grad_norm": 0.6556172869742106, + "language_loss": 0.51124817, + "learning_rate": 1.087320141976297e-06, + "loss": 0.53154439, + "num_input_tokens_seen": 237443355, + "step": 10999, + "time_per_iteration": 3.0903005599975586 + }, + { + "auxiliary_loss_clip": 0.01117437, + "auxiliary_loss_mlp": 0.00771071, + "balance_loss_clip": 1.04025114, + "balance_loss_mlp": 1.00016904, + "epoch": 0.6613557793476627, + "flos": 21616644072960.0, + "grad_norm": 2.396543073072743, + "language_loss": 0.70902514, + "learning_rate": 1.086973614127679e-06, + "loss": 0.72791028, + "num_input_tokens_seen": 237459205, + "step": 11000, + "time_per_iteration": 2.5685982704162598 + }, + { + "auxiliary_loss_clip": 0.01082819, + "auxiliary_loss_mlp": 0.01036908, + "balance_loss_clip": 1.03847837, + "balance_loss_mlp": 1.024737, + "epoch": 0.6614159026003307, + "flos": 34020117192960.0, + "grad_norm": 1.430398099595452, + "language_loss": 0.65089309, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.67209029, + "num_input_tokens_seen": 237483580, + "step": 11001, + "time_per_iteration": 4.2755303382873535 + }, + { + "auxiliary_loss_clip": 0.01112876, + "auxiliary_loss_mlp": 0.0103109, + "balance_loss_clip": 1.03954029, + "balance_loss_mlp": 1.01845384, + "epoch": 0.6614760258529987, + "flos": 24097281235200.0, + "grad_norm": 1.7701672836009255, + "language_loss": 0.7300179, + "learning_rate": 1.086280662309739e-06, + "loss": 0.75145757, + "num_input_tokens_seen": 237502860, + "step": 11002, + "time_per_iteration": 2.6314847469329834 + }, + { + "auxiliary_loss_clip": 0.01097492, + "auxiliary_loss_mlp": 0.01037063, + "balance_loss_clip": 1.03688526, + "balance_loss_mlp": 1.02355647, + "epoch": 0.6615361491056666, + "flos": 14903790935040.0, + "grad_norm": 1.9389438141435231, + "language_loss": 0.79010653, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.81145215, + "num_input_tokens_seen": 237521030, + "step": 11003, + "time_per_iteration": 4.314274072647095 + }, + { + "auxiliary_loss_clip": 0.01104366, + "auxiliary_loss_mlp": 0.01038348, + "balance_loss_clip": 1.03993845, + "balance_loss_mlp": 1.02454972, + "epoch": 0.6615962723583346, + "flos": 15304877176320.0, + "grad_norm": 1.933163608906101, + "language_loss": 0.69039351, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.7118206, + "num_input_tokens_seen": 237539585, + "step": 11004, + "time_per_iteration": 2.6783957481384277 + }, + { + "auxiliary_loss_clip": 0.01104574, + "auxiliary_loss_mlp": 0.0103687, + "balance_loss_clip": 1.03920364, + "balance_loss_mlp": 1.02226102, + "epoch": 0.6616563956110025, + "flos": 18732586285440.0, + "grad_norm": 2.0685835155239487, + "language_loss": 0.69767517, + "learning_rate": 1.085241494478132e-06, + "loss": 0.71908963, + "num_input_tokens_seen": 237557655, + "step": 11005, + "time_per_iteration": 2.5958964824676514 + }, + { + "auxiliary_loss_clip": 0.01094809, + "auxiliary_loss_mlp": 0.01030023, + "balance_loss_clip": 1.04032111, + "balance_loss_mlp": 1.01691008, + "epoch": 0.6617165188636706, + "flos": 24495063425280.0, + "grad_norm": 4.5323320504778035, + "language_loss": 0.78211862, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.80336696, + "num_input_tokens_seen": 237577000, + "step": 11006, + "time_per_iteration": 4.20892596244812 + }, + { + "auxiliary_loss_clip": 0.01102255, + "auxiliary_loss_mlp": 0.01033472, + "balance_loss_clip": 1.03898382, + "balance_loss_mlp": 1.02004886, + "epoch": 0.6617766421163385, + "flos": 22379673709440.0, + "grad_norm": 1.4341781143462713, + "language_loss": 0.76336843, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.78472567, + "num_input_tokens_seen": 237597960, + "step": 11007, + "time_per_iteration": 2.6313998699188232 + }, + { + "auxiliary_loss_clip": 0.0110241, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.0410744, + "balance_loss_mlp": 1.02089977, + "epoch": 0.6618367653690065, + "flos": 20850418126080.0, + "grad_norm": 1.678556210667641, + "language_loss": 0.78647077, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.80783153, + "num_input_tokens_seen": 237616385, + "step": 11008, + "time_per_iteration": 2.6336562633514404 + }, + { + "auxiliary_loss_clip": 0.01117118, + "auxiliary_loss_mlp": 0.01030319, + "balance_loss_clip": 1.0386076, + "balance_loss_mlp": 1.01620448, + "epoch": 0.6618968886216744, + "flos": 17712328377600.0, + "grad_norm": 1.8062458144067923, + "language_loss": 0.81780714, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.83928156, + "num_input_tokens_seen": 237634930, + "step": 11009, + "time_per_iteration": 2.559891939163208 + }, + { + "auxiliary_loss_clip": 0.01003698, + "auxiliary_loss_mlp": 0.01000096, + "balance_loss_clip": 1.01631284, + "balance_loss_mlp": 0.99864715, + "epoch": 0.6619570118743424, + "flos": 67035347498880.0, + "grad_norm": 1.1306824373429385, + "language_loss": 0.67373979, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.69377768, + "num_input_tokens_seen": 237693175, + "step": 11010, + "time_per_iteration": 3.1341817378997803 + }, + { + "auxiliary_loss_clip": 0.01103659, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_clip": 1.03835106, + "balance_loss_mlp": 1.02063894, + "epoch": 0.6620171351270103, + "flos": 18660908695680.0, + "grad_norm": 1.5388019077167303, + "language_loss": 0.71031803, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.73169947, + "num_input_tokens_seen": 237713160, + "step": 11011, + "time_per_iteration": 2.6373953819274902 + }, + { + "auxiliary_loss_clip": 0.01106184, + "auxiliary_loss_mlp": 0.01032454, + "balance_loss_clip": 1.04299128, + "balance_loss_mlp": 1.01964521, + "epoch": 0.6620772583796783, + "flos": 24170503109760.0, + "grad_norm": 1.4417744086263622, + "language_loss": 0.7236765, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.74506283, + "num_input_tokens_seen": 237733600, + "step": 11012, + "time_per_iteration": 2.6834990978240967 + }, + { + "auxiliary_loss_clip": 0.01098433, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.03888941, + "balance_loss_mlp": 1.02273691, + "epoch": 0.6621373816323463, + "flos": 23623547736960.0, + "grad_norm": 1.657176750213381, + "language_loss": 0.79366904, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.81499881, + "num_input_tokens_seen": 237752135, + "step": 11013, + "time_per_iteration": 2.6497538089752197 + }, + { + "auxiliary_loss_clip": 0.01092428, + "auxiliary_loss_mlp": 0.01032971, + "balance_loss_clip": 1.03971934, + "balance_loss_mlp": 1.01944637, + "epoch": 0.6621975048850143, + "flos": 18442212739200.0, + "grad_norm": 2.6791842321865698, + "language_loss": 0.70635635, + "learning_rate": 1.082125865538971e-06, + "loss": 0.72761035, + "num_input_tokens_seen": 237770735, + "step": 11014, + "time_per_iteration": 2.6886751651763916 + }, + { + "auxiliary_loss_clip": 0.01083433, + "auxiliary_loss_mlp": 0.00768947, + "balance_loss_clip": 1.03894365, + "balance_loss_mlp": 1.00011313, + "epoch": 0.6622576281376823, + "flos": 14063876236800.0, + "grad_norm": 1.8642341672837748, + "language_loss": 0.77003562, + "learning_rate": 1.081779858400137e-06, + "loss": 0.78855944, + "num_input_tokens_seen": 237789005, + "step": 11015, + "time_per_iteration": 2.7417409420013428 + }, + { + "auxiliary_loss_clip": 0.01104344, + "auxiliary_loss_mlp": 0.0077007, + "balance_loss_clip": 1.04066467, + "balance_loss_mlp": 1.00019598, + "epoch": 0.6623177513903502, + "flos": 17018965169280.0, + "grad_norm": 1.678948777257364, + "language_loss": 0.82612354, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.84486771, + "num_input_tokens_seen": 237807740, + "step": 11016, + "time_per_iteration": 2.6134469509124756 + }, + { + "auxiliary_loss_clip": 0.01098949, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.03807402, + "balance_loss_mlp": 1.02006221, + "epoch": 0.6623778746430182, + "flos": 17271021882240.0, + "grad_norm": 1.953011458286016, + "language_loss": 0.69714379, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.71846962, + "num_input_tokens_seen": 237826340, + "step": 11017, + "time_per_iteration": 2.58854079246521 + }, + { + "auxiliary_loss_clip": 0.01083899, + "auxiliary_loss_mlp": 0.01039946, + "balance_loss_clip": 1.0361867, + "balance_loss_mlp": 1.02632689, + "epoch": 0.6624379978956861, + "flos": 48792688767360.0, + "grad_norm": 1.7400500770773162, + "language_loss": 0.774885, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.79612345, + "num_input_tokens_seen": 237848305, + "step": 11018, + "time_per_iteration": 2.9582974910736084 + }, + { + "auxiliary_loss_clip": 0.01091037, + "auxiliary_loss_mlp": 0.01042104, + "balance_loss_clip": 1.03768778, + "balance_loss_mlp": 1.02916956, + "epoch": 0.6624981211483542, + "flos": 18952431477120.0, + "grad_norm": 2.014925244928839, + "language_loss": 0.83705002, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.85838139, + "num_input_tokens_seen": 237867020, + "step": 11019, + "time_per_iteration": 2.684549331665039 + }, + { + "auxiliary_loss_clip": 0.01097432, + "auxiliary_loss_mlp": 0.00772198, + "balance_loss_clip": 1.03844643, + "balance_loss_mlp": 1.00007081, + "epoch": 0.6625582444010221, + "flos": 23256576437760.0, + "grad_norm": 1.6087102704367435, + "language_loss": 0.71948653, + "learning_rate": 1.080050345253328e-06, + "loss": 0.73818284, + "num_input_tokens_seen": 237886710, + "step": 11020, + "time_per_iteration": 2.6002566814422607 + }, + { + "auxiliary_loss_clip": 0.01092653, + "auxiliary_loss_mlp": 0.01030916, + "balance_loss_clip": 1.03763211, + "balance_loss_mlp": 1.01636672, + "epoch": 0.6626183676536901, + "flos": 21394823633280.0, + "grad_norm": 1.6700673315170445, + "language_loss": 0.72552252, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.74675822, + "num_input_tokens_seen": 237904795, + "step": 11021, + "time_per_iteration": 2.677899122238159 + }, + { + "auxiliary_loss_clip": 0.01087084, + "auxiliary_loss_mlp": 0.0104125, + "balance_loss_clip": 1.03822863, + "balance_loss_mlp": 1.02790403, + "epoch": 0.662678490906358, + "flos": 14571293713920.0, + "grad_norm": 2.016335698142833, + "language_loss": 0.83232486, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.85360825, + "num_input_tokens_seen": 237921320, + "step": 11022, + "time_per_iteration": 2.62428879737854 + }, + { + "auxiliary_loss_clip": 0.01099654, + "auxiliary_loss_mlp": 0.01034417, + "balance_loss_clip": 1.04019356, + "balance_loss_mlp": 1.01928318, + "epoch": 0.662738614159026, + "flos": 15992350554240.0, + "grad_norm": 2.476624679148487, + "language_loss": 0.72735739, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.74869806, + "num_input_tokens_seen": 237933525, + "step": 11023, + "time_per_iteration": 2.632291316986084 + }, + { + "auxiliary_loss_clip": 0.01079183, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.03499722, + "balance_loss_mlp": 1.02165151, + "epoch": 0.6627987374116939, + "flos": 19536338966400.0, + "grad_norm": 1.8699789342451163, + "language_loss": 0.75085115, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.7719931, + "num_input_tokens_seen": 237953395, + "step": 11024, + "time_per_iteration": 2.7034032344818115 + }, + { + "auxiliary_loss_clip": 0.01083517, + "auxiliary_loss_mlp": 0.01031822, + "balance_loss_clip": 1.0384872, + "balance_loss_mlp": 1.01755285, + "epoch": 0.662858860664362, + "flos": 15702838934400.0, + "grad_norm": 2.491473090515614, + "language_loss": 0.69829249, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.71944588, + "num_input_tokens_seen": 237971445, + "step": 11025, + "time_per_iteration": 2.7056894302368164 + }, + { + "auxiliary_loss_clip": 0.01118609, + "auxiliary_loss_mlp": 0.01038933, + "balance_loss_clip": 1.04383016, + "balance_loss_mlp": 1.02548599, + "epoch": 0.6629189839170299, + "flos": 20154289570560.0, + "grad_norm": 1.5605432120454088, + "language_loss": 0.79108787, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.81266326, + "num_input_tokens_seen": 237989965, + "step": 11026, + "time_per_iteration": 2.6094040870666504 + }, + { + "auxiliary_loss_clip": 0.01104761, + "auxiliary_loss_mlp": 0.01030194, + "balance_loss_clip": 1.04092979, + "balance_loss_mlp": 1.01790905, + "epoch": 0.6629791071696979, + "flos": 20915415786240.0, + "grad_norm": 1.5667845463950276, + "language_loss": 0.75913531, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.7804848, + "num_input_tokens_seen": 238006820, + "step": 11027, + "time_per_iteration": 2.6272130012512207 + }, + { + "auxiliary_loss_clip": 0.01088271, + "auxiliary_loss_mlp": 0.01038744, + "balance_loss_clip": 1.03918552, + "balance_loss_mlp": 1.02465403, + "epoch": 0.6630392304223659, + "flos": 20846898593280.0, + "grad_norm": 2.126605601662929, + "language_loss": 0.703035, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.72430521, + "num_input_tokens_seen": 238022560, + "step": 11028, + "time_per_iteration": 2.7173945903778076 + }, + { + "auxiliary_loss_clip": 0.01103236, + "auxiliary_loss_mlp": 0.01033808, + "balance_loss_clip": 1.03955865, + "balance_loss_mlp": 1.0220722, + "epoch": 0.6630993536750338, + "flos": 20995820380800.0, + "grad_norm": 1.8721020554211893, + "language_loss": 0.79606169, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.81743217, + "num_input_tokens_seen": 238041895, + "step": 11029, + "time_per_iteration": 2.5954697132110596 + }, + { + "auxiliary_loss_clip": 0.01116256, + "auxiliary_loss_mlp": 0.01035331, + "balance_loss_clip": 1.03937316, + "balance_loss_mlp": 1.02168214, + "epoch": 0.6631594769277018, + "flos": 18259032355200.0, + "grad_norm": 2.1557545389807617, + "language_loss": 0.76608872, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.78760457, + "num_input_tokens_seen": 238060445, + "step": 11030, + "time_per_iteration": 2.5441596508026123 + }, + { + "auxiliary_loss_clip": 0.01113502, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.04352438, + "balance_loss_mlp": 1.02144003, + "epoch": 0.6632196001803697, + "flos": 17820491207040.0, + "grad_norm": 2.2370976778546803, + "language_loss": 0.75485003, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.77634418, + "num_input_tokens_seen": 238077080, + "step": 11031, + "time_per_iteration": 2.607260227203369 + }, + { + "auxiliary_loss_clip": 0.01106421, + "auxiliary_loss_mlp": 0.01038808, + "balance_loss_clip": 1.04007494, + "balance_loss_mlp": 1.02508116, + "epoch": 0.6632797234330378, + "flos": 12670182581760.0, + "grad_norm": 4.999518522839319, + "language_loss": 0.74670291, + "learning_rate": 1.075903075048228e-06, + "loss": 0.76815522, + "num_input_tokens_seen": 238091045, + "step": 11032, + "time_per_iteration": 2.594426393508911 + }, + { + "auxiliary_loss_clip": 0.01072119, + "auxiliary_loss_mlp": 0.01033962, + "balance_loss_clip": 1.0367676, + "balance_loss_mlp": 1.02086639, + "epoch": 0.6633398466857057, + "flos": 23584728113280.0, + "grad_norm": 1.76988392785946, + "language_loss": 0.80491328, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.82597411, + "num_input_tokens_seen": 238110220, + "step": 11033, + "time_per_iteration": 2.7742807865142822 + }, + { + "auxiliary_loss_clip": 0.01098023, + "auxiliary_loss_mlp": 0.01031641, + "balance_loss_clip": 1.04221106, + "balance_loss_mlp": 1.01806927, + "epoch": 0.6633999699383737, + "flos": 20631686256000.0, + "grad_norm": 1.7697764735120445, + "language_loss": 0.80480468, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.82610136, + "num_input_tokens_seen": 238130400, + "step": 11034, + "time_per_iteration": 2.72609543800354 + }, + { + "auxiliary_loss_clip": 0.01098853, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.03850234, + "balance_loss_mlp": 1.01725149, + "epoch": 0.6634600931910416, + "flos": 21797095023360.0, + "grad_norm": 1.6912859958234545, + "language_loss": 0.7568692, + "learning_rate": 1.074867045054166e-06, + "loss": 0.77815193, + "num_input_tokens_seen": 238148165, + "step": 11035, + "time_per_iteration": 2.6851565837860107 + }, + { + "auxiliary_loss_clip": 0.01080784, + "auxiliary_loss_mlp": 0.01029042, + "balance_loss_clip": 1.03555465, + "balance_loss_mlp": 1.01562476, + "epoch": 0.6635202164437096, + "flos": 18732873594240.0, + "grad_norm": 1.9570572428830155, + "language_loss": 0.8271299, + "learning_rate": 1.074521771867622e-06, + "loss": 0.84822816, + "num_input_tokens_seen": 238166360, + "step": 11036, + "time_per_iteration": 2.6795291900634766 + }, + { + "auxiliary_loss_clip": 0.01034271, + "auxiliary_loss_mlp": 0.01004373, + "balance_loss_clip": 1.01085413, + "balance_loss_mlp": 1.00327635, + "epoch": 0.6635803396963775, + "flos": 60222771227520.0, + "grad_norm": 0.7751211897866269, + "language_loss": 0.52259576, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.54298222, + "num_input_tokens_seen": 238227630, + "step": 11037, + "time_per_iteration": 4.7726218700408936 + }, + { + "auxiliary_loss_clip": 0.01060431, + "auxiliary_loss_mlp": 0.01041224, + "balance_loss_clip": 1.03799784, + "balance_loss_mlp": 1.0276525, + "epoch": 0.6636404629490456, + "flos": 29167041611520.0, + "grad_norm": 1.5502874196412986, + "language_loss": 0.79120708, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.81222361, + "num_input_tokens_seen": 238248435, + "step": 11038, + "time_per_iteration": 2.8115994930267334 + }, + { + "auxiliary_loss_clip": 0.01082049, + "auxiliary_loss_mlp": 0.01043022, + "balance_loss_clip": 1.03767705, + "balance_loss_mlp": 1.02863979, + "epoch": 0.6637005862017135, + "flos": 38907702766080.0, + "grad_norm": 1.791707577314863, + "language_loss": 0.63976014, + "learning_rate": 1.073486162925716e-06, + "loss": 0.66101086, + "num_input_tokens_seen": 238268755, + "step": 11039, + "time_per_iteration": 2.8266031742095947 + }, + { + "auxiliary_loss_clip": 0.0107412, + "auxiliary_loss_mlp": 0.01031845, + "balance_loss_clip": 1.03814256, + "balance_loss_mlp": 1.01877952, + "epoch": 0.6637607094543815, + "flos": 22783345729920.0, + "grad_norm": 1.6823578045159262, + "language_loss": 0.63401222, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.65507191, + "num_input_tokens_seen": 238290120, + "step": 11040, + "time_per_iteration": 2.705897569656372 + }, + { + "auxiliary_loss_clip": 0.01074324, + "auxiliary_loss_mlp": 0.01044015, + "balance_loss_clip": 1.03504574, + "balance_loss_mlp": 1.02977514, + "epoch": 0.6638208327070495, + "flos": 18114096977280.0, + "grad_norm": 1.8896789484535716, + "language_loss": 0.71718216, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.73836553, + "num_input_tokens_seen": 238309290, + "step": 11041, + "time_per_iteration": 4.213087320327759 + }, + { + "auxiliary_loss_clip": 0.01097087, + "auxiliary_loss_mlp": 0.01048475, + "balance_loss_clip": 1.03642857, + "balance_loss_mlp": 1.03349042, + "epoch": 0.6638809559597174, + "flos": 29424880414080.0, + "grad_norm": 2.2565600398451795, + "language_loss": 0.61915213, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.64060771, + "num_input_tokens_seen": 238327280, + "step": 11042, + "time_per_iteration": 4.279943943023682 + }, + { + "auxiliary_loss_clip": 0.011055, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.03810656, + "balance_loss_mlp": 1.01686156, + "epoch": 0.6639410792123854, + "flos": 28072699902720.0, + "grad_norm": 2.105682360594448, + "language_loss": 0.68285942, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.7042259, + "num_input_tokens_seen": 238346330, + "step": 11043, + "time_per_iteration": 2.6422598361968994 + }, + { + "auxiliary_loss_clip": 0.01101764, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.04116786, + "balance_loss_mlp": 1.018767, + "epoch": 0.6640012024650533, + "flos": 25556367600000.0, + "grad_norm": 1.5365440611155503, + "language_loss": 0.83934712, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.8606683, + "num_input_tokens_seen": 238364650, + "step": 11044, + "time_per_iteration": 2.732520341873169 + }, + { + "auxiliary_loss_clip": 0.01073049, + "auxiliary_loss_mlp": 0.01031878, + "balance_loss_clip": 1.03586829, + "balance_loss_mlp": 1.0185442, + "epoch": 0.6640613257177214, + "flos": 14866946559360.0, + "grad_norm": 2.1294485287076315, + "language_loss": 0.6951791, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.71622837, + "num_input_tokens_seen": 238381630, + "step": 11045, + "time_per_iteration": 2.6816322803497314 + }, + { + "auxiliary_loss_clip": 0.01104183, + "auxiliary_loss_mlp": 0.01027582, + "balance_loss_clip": 1.04048705, + "balance_loss_mlp": 1.0148927, + "epoch": 0.6641214489703893, + "flos": 23221096778880.0, + "grad_norm": 2.227953338696249, + "language_loss": 0.64640826, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.66772592, + "num_input_tokens_seen": 238402595, + "step": 11046, + "time_per_iteration": 4.160333156585693 + }, + { + "auxiliary_loss_clip": 0.01085109, + "auxiliary_loss_mlp": 0.01027972, + "balance_loss_clip": 1.03931284, + "balance_loss_mlp": 1.01488853, + "epoch": 0.6641815722230573, + "flos": 37742617221120.0, + "grad_norm": 1.6669339663762488, + "language_loss": 0.71004307, + "learning_rate": 1.070726085914088e-06, + "loss": 0.73117387, + "num_input_tokens_seen": 238426860, + "step": 11047, + "time_per_iteration": 2.8554368019104004 + }, + { + "auxiliary_loss_clip": 0.01049735, + "auxiliary_loss_mlp": 0.01036523, + "balance_loss_clip": 1.04015899, + "balance_loss_mlp": 1.02316511, + "epoch": 0.6642416954757252, + "flos": 17931132074880.0, + "grad_norm": 1.8883257209384914, + "language_loss": 0.77274108, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.79360354, + "num_input_tokens_seen": 238443990, + "step": 11048, + "time_per_iteration": 2.755452871322632 + }, + { + "auxiliary_loss_clip": 0.01010482, + "auxiliary_loss_mlp": 0.01002664, + "balance_loss_clip": 1.01594365, + "balance_loss_mlp": 1.00137699, + "epoch": 0.6643018187283932, + "flos": 51995384104320.0, + "grad_norm": 0.747851272534148, + "language_loss": 0.55009979, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.57023126, + "num_input_tokens_seen": 238503045, + "step": 11049, + "time_per_iteration": 3.232647180557251 + }, + { + "auxiliary_loss_clip": 0.01103139, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.04035759, + "balance_loss_mlp": 1.02035105, + "epoch": 0.6643619419810611, + "flos": 30226657847040.0, + "grad_norm": 1.8987287972691187, + "language_loss": 0.63542056, + "learning_rate": 1.069691638104648e-06, + "loss": 0.65677476, + "num_input_tokens_seen": 238527320, + "step": 11050, + "time_per_iteration": 2.712871551513672 + }, + { + "auxiliary_loss_clip": 0.01110292, + "auxiliary_loss_mlp": 0.01033953, + "balance_loss_clip": 1.03804648, + "balance_loss_mlp": 1.02145386, + "epoch": 0.6644220652337292, + "flos": 22966131064320.0, + "grad_norm": 2.56878578960884, + "language_loss": 0.78747934, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.80892181, + "num_input_tokens_seen": 238546030, + "step": 11051, + "time_per_iteration": 2.5602593421936035 + }, + { + "auxiliary_loss_clip": 0.01090775, + "auxiliary_loss_mlp": 0.01036923, + "balance_loss_clip": 1.04075074, + "balance_loss_mlp": 1.02409577, + "epoch": 0.6644821884863971, + "flos": 21142228216320.0, + "grad_norm": 1.6830071971795009, + "language_loss": 0.85365808, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.87493503, + "num_input_tokens_seen": 238564175, + "step": 11052, + "time_per_iteration": 2.6400978565216064 + }, + { + "auxiliary_loss_clip": 0.0106864, + "auxiliary_loss_mlp": 0.01035785, + "balance_loss_clip": 1.03640008, + "balance_loss_mlp": 1.02115774, + "epoch": 0.6645423117390651, + "flos": 20192821885440.0, + "grad_norm": 2.468702862512036, + "language_loss": 0.7442345, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.7652787, + "num_input_tokens_seen": 238581010, + "step": 11053, + "time_per_iteration": 2.7525177001953125 + }, + { + "auxiliary_loss_clip": 0.01081443, + "auxiliary_loss_mlp": 0.0103047, + "balance_loss_clip": 1.03704178, + "balance_loss_mlp": 1.01803088, + "epoch": 0.6646024349917331, + "flos": 24351959640960.0, + "grad_norm": 1.6350550334521685, + "language_loss": 0.7937814, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.81490058, + "num_input_tokens_seen": 238601365, + "step": 11054, + "time_per_iteration": 2.6874406337738037 + }, + { + "auxiliary_loss_clip": 0.01067976, + "auxiliary_loss_mlp": 0.01035163, + "balance_loss_clip": 1.03798532, + "balance_loss_mlp": 1.02267623, + "epoch": 0.664662558244401, + "flos": 18806706000000.0, + "grad_norm": 1.6423825875919162, + "language_loss": 0.73928297, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.76031435, + "num_input_tokens_seen": 238619850, + "step": 11055, + "time_per_iteration": 2.733832597732544 + }, + { + "auxiliary_loss_clip": 0.01082031, + "auxiliary_loss_mlp": 0.01043702, + "balance_loss_clip": 1.03823996, + "balance_loss_mlp": 1.02983165, + "epoch": 0.664722681497069, + "flos": 18952790613120.0, + "grad_norm": 1.8844406603153, + "language_loss": 0.7300725, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.75132978, + "num_input_tokens_seen": 238637635, + "step": 11056, + "time_per_iteration": 2.6787209510803223 + }, + { + "auxiliary_loss_clip": 0.01069462, + "auxiliary_loss_mlp": 0.01036287, + "balance_loss_clip": 1.0367837, + "balance_loss_mlp": 1.02314389, + "epoch": 0.6647828047497369, + "flos": 19571279921280.0, + "grad_norm": 3.230794817750296, + "language_loss": 0.69325733, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.71431488, + "num_input_tokens_seen": 238656200, + "step": 11057, + "time_per_iteration": 2.749843120574951 + }, + { + "auxiliary_loss_clip": 0.01103707, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.03987014, + "balance_loss_mlp": 1.0206548, + "epoch": 0.664842928002405, + "flos": 23149455102720.0, + "grad_norm": 1.6131185292636203, + "language_loss": 0.80123711, + "learning_rate": 1.066934663776291e-06, + "loss": 0.82261384, + "num_input_tokens_seen": 238675005, + "step": 11058, + "time_per_iteration": 2.6598408222198486 + }, + { + "auxiliary_loss_clip": 0.01008973, + "auxiliary_loss_mlp": 0.01008338, + "balance_loss_clip": 1.01433647, + "balance_loss_mlp": 1.00715828, + "epoch": 0.6649030512550729, + "flos": 65244913148160.0, + "grad_norm": 0.802003162122869, + "language_loss": 0.62611187, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.64628494, + "num_input_tokens_seen": 238731425, + "step": 11059, + "time_per_iteration": 3.12062668800354 + }, + { + "auxiliary_loss_clip": 0.01102046, + "auxiliary_loss_mlp": 0.01038723, + "balance_loss_clip": 1.03967965, + "balance_loss_mlp": 1.026546, + "epoch": 0.6649631745077409, + "flos": 20194797133440.0, + "grad_norm": 1.442710173280966, + "language_loss": 0.7869736, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.80838132, + "num_input_tokens_seen": 238752020, + "step": 11060, + "time_per_iteration": 2.776430606842041 + }, + { + "auxiliary_loss_clip": 0.01082742, + "auxiliary_loss_mlp": 0.01038039, + "balance_loss_clip": 1.03887463, + "balance_loss_mlp": 1.02412772, + "epoch": 0.6650232977604088, + "flos": 17238558965760.0, + "grad_norm": 1.6774634080954063, + "language_loss": 0.78738892, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.80859673, + "num_input_tokens_seen": 238769665, + "step": 11061, + "time_per_iteration": 2.6786346435546875 + }, + { + "auxiliary_loss_clip": 0.01092682, + "auxiliary_loss_mlp": 0.01030082, + "balance_loss_clip": 1.0417732, + "balance_loss_mlp": 1.01765454, + "epoch": 0.6650834210130768, + "flos": 10006867825920.0, + "grad_norm": 2.291207929066884, + "language_loss": 0.56939697, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.59062469, + "num_input_tokens_seen": 238782180, + "step": 11062, + "time_per_iteration": 2.6440412998199463 + }, + { + "auxiliary_loss_clip": 0.01100317, + "auxiliary_loss_mlp": 0.01037299, + "balance_loss_clip": 1.03600216, + "balance_loss_mlp": 1.02142608, + "epoch": 0.6651435442657447, + "flos": 10452088903680.0, + "grad_norm": 1.8230256032266374, + "language_loss": 0.75959098, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.78096718, + "num_input_tokens_seen": 238800315, + "step": 11063, + "time_per_iteration": 2.592930555343628 + }, + { + "auxiliary_loss_clip": 0.01056354, + "auxiliary_loss_mlp": 0.01044348, + "balance_loss_clip": 1.03860426, + "balance_loss_mlp": 1.03033507, + "epoch": 0.6652036675184128, + "flos": 22344229964160.0, + "grad_norm": 1.2698232462033214, + "language_loss": 0.70678842, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.72779548, + "num_input_tokens_seen": 238822250, + "step": 11064, + "time_per_iteration": 2.800218105316162 + }, + { + "auxiliary_loss_clip": 0.01032183, + "auxiliary_loss_mlp": 0.01006383, + "balance_loss_clip": 1.00864732, + "balance_loss_mlp": 1.00513113, + "epoch": 0.6652637907710807, + "flos": 52909633998720.0, + "grad_norm": 0.8463523903026119, + "language_loss": 0.629758, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.65014362, + "num_input_tokens_seen": 238877190, + "step": 11065, + "time_per_iteration": 3.1035780906677246 + }, + { + "auxiliary_loss_clip": 0.01099093, + "auxiliary_loss_mlp": 0.01039736, + "balance_loss_clip": 1.03762209, + "balance_loss_mlp": 1.02464366, + "epoch": 0.6653239140237487, + "flos": 23104637907840.0, + "grad_norm": 1.610155502063491, + "language_loss": 0.62464315, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.64603138, + "num_input_tokens_seen": 238896010, + "step": 11066, + "time_per_iteration": 2.6371681690216064 + }, + { + "auxiliary_loss_clip": 0.01074468, + "auxiliary_loss_mlp": 0.01041028, + "balance_loss_clip": 1.03320074, + "balance_loss_mlp": 1.02528, + "epoch": 0.6653840372764167, + "flos": 25959393175680.0, + "grad_norm": 1.5735109104273866, + "language_loss": 0.70316392, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.72431886, + "num_input_tokens_seen": 238918990, + "step": 11067, + "time_per_iteration": 2.712170362472534 + }, + { + "auxiliary_loss_clip": 0.01015121, + "auxiliary_loss_mlp": 0.0100891, + "balance_loss_clip": 1.01019919, + "balance_loss_mlp": 1.00739563, + "epoch": 0.6654441605290846, + "flos": 66041985899520.0, + "grad_norm": 0.9248325292472583, + "language_loss": 0.72063255, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.74087286, + "num_input_tokens_seen": 238975735, + "step": 11068, + "time_per_iteration": 3.188148021697998 + }, + { + "auxiliary_loss_clip": 0.01006694, + "auxiliary_loss_mlp": 0.01006942, + "balance_loss_clip": 1.01129699, + "balance_loss_mlp": 1.00560117, + "epoch": 0.6655042837817526, + "flos": 65196112521600.0, + "grad_norm": 0.8265951746379137, + "language_loss": 0.57727754, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.5974139, + "num_input_tokens_seen": 239042360, + "step": 11069, + "time_per_iteration": 3.3526012897491455 + }, + { + "auxiliary_loss_clip": 0.01011659, + "auxiliary_loss_mlp": 0.0100159, + "balance_loss_clip": 1.00811982, + "balance_loss_mlp": 1.00046349, + "epoch": 0.6655644070344205, + "flos": 69008746752000.0, + "grad_norm": 0.7554433068818003, + "language_loss": 0.63502038, + "learning_rate": 1.062803450204029e-06, + "loss": 0.65515292, + "num_input_tokens_seen": 239109410, + "step": 11070, + "time_per_iteration": 3.189624071121216 + }, + { + "auxiliary_loss_clip": 0.0111185, + "auxiliary_loss_mlp": 0.01029573, + "balance_loss_clip": 1.03767705, + "balance_loss_mlp": 1.01622725, + "epoch": 0.6656245302870886, + "flos": 36315562809600.0, + "grad_norm": 1.5957526683817405, + "language_loss": 0.58635205, + "learning_rate": 1.062459413096116e-06, + "loss": 0.60776627, + "num_input_tokens_seen": 239135345, + "step": 11071, + "time_per_iteration": 2.7373464107513428 + }, + { + "auxiliary_loss_clip": 0.01107113, + "auxiliary_loss_mlp": 0.01030576, + "balance_loss_clip": 1.04254675, + "balance_loss_mlp": 1.01822627, + "epoch": 0.6656846535397565, + "flos": 21794832466560.0, + "grad_norm": 1.7851142792546852, + "language_loss": 0.72693968, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.74831653, + "num_input_tokens_seen": 239154340, + "step": 11072, + "time_per_iteration": 2.6327590942382812 + }, + { + "auxiliary_loss_clip": 0.01103867, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.04155874, + "balance_loss_mlp": 1.01859617, + "epoch": 0.6657447767924245, + "flos": 37487615592960.0, + "grad_norm": 1.879864387077726, + "language_loss": 0.70789611, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.72926104, + "num_input_tokens_seen": 239177815, + "step": 11073, + "time_per_iteration": 2.704252243041992 + }, + { + "auxiliary_loss_clip": 0.01084232, + "auxiliary_loss_mlp": 0.0103114, + "balance_loss_clip": 1.03998876, + "balance_loss_mlp": 1.01784229, + "epoch": 0.6658049000450924, + "flos": 16837688206080.0, + "grad_norm": 2.6568090318066475, + "language_loss": 0.56073666, + "learning_rate": 1.061427515134354e-06, + "loss": 0.5818904, + "num_input_tokens_seen": 239195735, + "step": 11074, + "time_per_iteration": 2.6551811695098877 + }, + { + "auxiliary_loss_clip": 0.01116885, + "auxiliary_loss_mlp": 0.00770661, + "balance_loss_clip": 1.04282713, + "balance_loss_mlp": 1.00006819, + "epoch": 0.6658650232977604, + "flos": 33510975863040.0, + "grad_norm": 1.424580138870233, + "language_loss": 0.7252624, + "learning_rate": 1.061083620311235e-06, + "loss": 0.74413788, + "num_input_tokens_seen": 239217535, + "step": 11075, + "time_per_iteration": 2.7062625885009766 + }, + { + "auxiliary_loss_clip": 0.01100028, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_clip": 1.03886509, + "balance_loss_mlp": 1.01981592, + "epoch": 0.6659251465504283, + "flos": 37706311549440.0, + "grad_norm": 1.4599897176092982, + "language_loss": 0.66246772, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.68378824, + "num_input_tokens_seen": 239241975, + "step": 11076, + "time_per_iteration": 2.804659605026245 + }, + { + "auxiliary_loss_clip": 0.01087468, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.03394532, + "balance_loss_mlp": 1.01870489, + "epoch": 0.6659852698030964, + "flos": 24893420232960.0, + "grad_norm": 1.6180459271945493, + "language_loss": 0.75299704, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.77420044, + "num_input_tokens_seen": 239262025, + "step": 11077, + "time_per_iteration": 4.274590253829956 + }, + { + "auxiliary_loss_clip": 0.0108965, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.0374043, + "balance_loss_mlp": 1.01802957, + "epoch": 0.6660453930557643, + "flos": 24352821567360.0, + "grad_norm": 1.5713803954899295, + "language_loss": 0.66825247, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.68946046, + "num_input_tokens_seen": 239282775, + "step": 11078, + "time_per_iteration": 2.7334680557250977 + }, + { + "auxiliary_loss_clip": 0.01115428, + "auxiliary_loss_mlp": 0.01033982, + "balance_loss_clip": 1.03945637, + "balance_loss_mlp": 1.01990974, + "epoch": 0.6661055163084323, + "flos": 10597814380800.0, + "grad_norm": 2.400926553792791, + "language_loss": 0.69900686, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.72050095, + "num_input_tokens_seen": 239299775, + "step": 11079, + "time_per_iteration": 2.6223835945129395 + }, + { + "auxiliary_loss_clip": 0.01089448, + "auxiliary_loss_mlp": 0.01030137, + "balance_loss_clip": 1.03717136, + "balance_loss_mlp": 1.01738119, + "epoch": 0.6661656395611003, + "flos": 24057491944320.0, + "grad_norm": 1.61546827465866, + "language_loss": 0.80478466, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.82598048, + "num_input_tokens_seen": 239319660, + "step": 11080, + "time_per_iteration": 4.228775978088379 + }, + { + "auxiliary_loss_clip": 0.01075927, + "auxiliary_loss_mlp": 0.01033579, + "balance_loss_clip": 1.03583407, + "balance_loss_mlp": 1.02147329, + "epoch": 0.6662257628137682, + "flos": 23036192542080.0, + "grad_norm": 1.8384302926010723, + "language_loss": 0.78062707, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.80172205, + "num_input_tokens_seen": 239339215, + "step": 11081, + "time_per_iteration": 4.32209324836731 + }, + { + "auxiliary_loss_clip": 0.01076143, + "auxiliary_loss_mlp": 0.01039547, + "balance_loss_clip": 1.03748226, + "balance_loss_mlp": 1.02387714, + "epoch": 0.6662858860664362, + "flos": 24754446512640.0, + "grad_norm": 1.6809862267344533, + "language_loss": 0.80329323, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.82445014, + "num_input_tokens_seen": 239358545, + "step": 11082, + "time_per_iteration": 2.7251505851745605 + }, + { + "auxiliary_loss_clip": 0.01076739, + "auxiliary_loss_mlp": 0.01033286, + "balance_loss_clip": 1.04017997, + "balance_loss_mlp": 1.02098405, + "epoch": 0.6663460093191041, + "flos": 20009066883840.0, + "grad_norm": 1.4477945081554633, + "language_loss": 0.83849418, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.85959446, + "num_input_tokens_seen": 239376665, + "step": 11083, + "time_per_iteration": 2.669404983520508 + }, + { + "auxiliary_loss_clip": 0.01079397, + "auxiliary_loss_mlp": 0.01036057, + "balance_loss_clip": 1.04023921, + "balance_loss_mlp": 1.02203834, + "epoch": 0.6664061325717722, + "flos": 17821389047040.0, + "grad_norm": 2.7255574695502216, + "language_loss": 0.85510308, + "learning_rate": 1.057990170638731e-06, + "loss": 0.87625766, + "num_input_tokens_seen": 239394345, + "step": 11084, + "time_per_iteration": 2.663749933242798 + }, + { + "auxiliary_loss_clip": 0.01094685, + "auxiliary_loss_mlp": 0.01031242, + "balance_loss_clip": 1.03857958, + "balance_loss_mlp": 1.01727629, + "epoch": 0.6664662558244401, + "flos": 18076893465600.0, + "grad_norm": 2.199200259512602, + "language_loss": 0.73457599, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.75583529, + "num_input_tokens_seen": 239410605, + "step": 11085, + "time_per_iteration": 4.193335771560669 + }, + { + "auxiliary_loss_clip": 0.01087888, + "auxiliary_loss_mlp": 0.01031014, + "balance_loss_clip": 1.03528535, + "balance_loss_mlp": 1.01760268, + "epoch": 0.6665263790771081, + "flos": 21574197175680.0, + "grad_norm": 1.9746802098097909, + "language_loss": 0.80359179, + "learning_rate": 1.057303129975894e-06, + "loss": 0.82478082, + "num_input_tokens_seen": 239427155, + "step": 11086, + "time_per_iteration": 2.6708765029907227 + }, + { + "auxiliary_loss_clip": 0.01090857, + "auxiliary_loss_mlp": 0.01032379, + "balance_loss_clip": 1.03936315, + "balance_loss_mlp": 1.018646, + "epoch": 0.666586502329776, + "flos": 24206629213440.0, + "grad_norm": 1.7936971088038383, + "language_loss": 0.74496621, + "learning_rate": 1.056959663258702e-06, + "loss": 0.76619852, + "num_input_tokens_seen": 239445510, + "step": 11087, + "time_per_iteration": 2.7366881370544434 + }, + { + "auxiliary_loss_clip": 0.01101311, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.03835797, + "balance_loss_mlp": 1.02250183, + "epoch": 0.666646625582444, + "flos": 22200515648640.0, + "grad_norm": 1.692056233669114, + "language_loss": 0.64937711, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.67074656, + "num_input_tokens_seen": 239464805, + "step": 11088, + "time_per_iteration": 2.652937412261963 + }, + { + "auxiliary_loss_clip": 0.01099844, + "auxiliary_loss_mlp": 0.01029761, + "balance_loss_clip": 1.03648591, + "balance_loss_mlp": 1.01637387, + "epoch": 0.6667067488351119, + "flos": 18259930195200.0, + "grad_norm": 2.239140495962673, + "language_loss": 0.64203691, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.66333294, + "num_input_tokens_seen": 239483890, + "step": 11089, + "time_per_iteration": 2.6637988090515137 + }, + { + "auxiliary_loss_clip": 0.01113447, + "auxiliary_loss_mlp": 0.01031785, + "balance_loss_clip": 1.03998184, + "balance_loss_mlp": 1.01876771, + "epoch": 0.66676687208778, + "flos": 17236547804160.0, + "grad_norm": 2.535090345981802, + "language_loss": 0.80804038, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.82949275, + "num_input_tokens_seen": 239500080, + "step": 11090, + "time_per_iteration": 2.581758737564087 + }, + { + "auxiliary_loss_clip": 0.01092289, + "auxiliary_loss_mlp": 0.01035587, + "balance_loss_clip": 1.03686905, + "balance_loss_mlp": 1.02217007, + "epoch": 0.6668269953404479, + "flos": 19752197748480.0, + "grad_norm": 1.9927096976475185, + "language_loss": 0.77528715, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.79656601, + "num_input_tokens_seen": 239517335, + "step": 11091, + "time_per_iteration": 2.673798084259033 + }, + { + "auxiliary_loss_clip": 0.01114388, + "auxiliary_loss_mlp": 0.01033443, + "balance_loss_clip": 1.04016709, + "balance_loss_mlp": 1.02024066, + "epoch": 0.6668871185931159, + "flos": 20558428467840.0, + "grad_norm": 1.9227343143607547, + "language_loss": 0.79361308, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.81509137, + "num_input_tokens_seen": 239536240, + "step": 11092, + "time_per_iteration": 2.6783652305603027 + }, + { + "auxiliary_loss_clip": 0.01010839, + "auxiliary_loss_mlp": 0.01001852, + "balance_loss_clip": 1.01392734, + "balance_loss_mlp": 1.00064206, + "epoch": 0.6669472418457839, + "flos": 58088167735680.0, + "grad_norm": 1.5742465893545905, + "language_loss": 0.57764924, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.59777617, + "num_input_tokens_seen": 239598000, + "step": 11093, + "time_per_iteration": 3.25225567817688 + }, + { + "auxiliary_loss_clip": 0.011126, + "auxiliary_loss_mlp": 0.01032323, + "balance_loss_clip": 1.03999138, + "balance_loss_mlp": 1.01957977, + "epoch": 0.6670073650984518, + "flos": 26065113880320.0, + "grad_norm": 1.5604249547045095, + "language_loss": 0.76737595, + "learning_rate": 1.054556398252703e-06, + "loss": 0.78882521, + "num_input_tokens_seen": 239617650, + "step": 11094, + "time_per_iteration": 2.6441400051116943 + }, + { + "auxiliary_loss_clip": 0.01114242, + "auxiliary_loss_mlp": 0.01034632, + "balance_loss_clip": 1.03926766, + "balance_loss_mlp": 1.02063107, + "epoch": 0.6670674883511198, + "flos": 32416849635840.0, + "grad_norm": 1.725805849880736, + "language_loss": 0.73280704, + "learning_rate": 1.05421321798155e-06, + "loss": 0.75429583, + "num_input_tokens_seen": 239639825, + "step": 11095, + "time_per_iteration": 2.6807525157928467 + }, + { + "auxiliary_loss_clip": 0.01100599, + "auxiliary_loss_mlp": 0.01038236, + "balance_loss_clip": 1.03832078, + "balance_loss_mlp": 1.02496827, + "epoch": 0.6671276116037878, + "flos": 18037786533120.0, + "grad_norm": 1.9301541125816652, + "language_loss": 0.73262459, + "learning_rate": 1.053870073574727e-06, + "loss": 0.75401294, + "num_input_tokens_seen": 239656300, + "step": 11096, + "time_per_iteration": 2.568824052810669 + }, + { + "auxiliary_loss_clip": 0.01069521, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.03659153, + "balance_loss_mlp": 1.01915956, + "epoch": 0.6671877348564558, + "flos": 23767046570880.0, + "grad_norm": 2.803880463620154, + "language_loss": 0.64528841, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.66630697, + "num_input_tokens_seen": 239676655, + "step": 11097, + "time_per_iteration": 2.7534751892089844 + }, + { + "auxiliary_loss_clip": 0.01101343, + "auxiliary_loss_mlp": 0.01036605, + "balance_loss_clip": 1.03823709, + "balance_loss_mlp": 1.0242486, + "epoch": 0.6672478581091237, + "flos": 20918360701440.0, + "grad_norm": 1.9121192931903639, + "language_loss": 0.75842595, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.77980542, + "num_input_tokens_seen": 239695430, + "step": 11098, + "time_per_iteration": 2.6095056533813477 + }, + { + "auxiliary_loss_clip": 0.01115287, + "auxiliary_loss_mlp": 0.0103552, + "balance_loss_clip": 1.04045045, + "balance_loss_mlp": 1.02328897, + "epoch": 0.6673079813617917, + "flos": 27855799626240.0, + "grad_norm": 1.5630193182693057, + "language_loss": 0.74190086, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.76340902, + "num_input_tokens_seen": 239717070, + "step": 11099, + "time_per_iteration": 2.673234224319458 + }, + { + "auxiliary_loss_clip": 0.01098732, + "auxiliary_loss_mlp": 0.01036855, + "balance_loss_clip": 1.03607726, + "balance_loss_mlp": 1.02412391, + "epoch": 0.6673681046144596, + "flos": 21616859554560.0, + "grad_norm": 1.7967972361910232, + "language_loss": 0.78233874, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.80369455, + "num_input_tokens_seen": 239737105, + "step": 11100, + "time_per_iteration": 2.637829303741455 + }, + { + "auxiliary_loss_clip": 0.01112293, + "auxiliary_loss_mlp": 0.01037899, + "balance_loss_clip": 1.03913033, + "balance_loss_mlp": 1.02564454, + "epoch": 0.6674282278671276, + "flos": 20889884194560.0, + "grad_norm": 3.1933899226541804, + "language_loss": 0.60124767, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.62274957, + "num_input_tokens_seen": 239757835, + "step": 11101, + "time_per_iteration": 2.649627685546875 + }, + { + "auxiliary_loss_clip": 0.01098761, + "auxiliary_loss_mlp": 0.01034633, + "balance_loss_clip": 1.03970337, + "balance_loss_mlp": 1.02054238, + "epoch": 0.6674883511197955, + "flos": 23624194181760.0, + "grad_norm": 2.1447614079629362, + "language_loss": 0.71100485, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.73233879, + "num_input_tokens_seen": 239775425, + "step": 11102, + "time_per_iteration": 2.7131104469299316 + }, + { + "auxiliary_loss_clip": 0.01103363, + "auxiliary_loss_mlp": 0.01031628, + "balance_loss_clip": 1.03698874, + "balance_loss_mlp": 1.01878357, + "epoch": 0.6675484743724636, + "flos": 19609668581760.0, + "grad_norm": 1.3386493038394256, + "language_loss": 0.84490895, + "learning_rate": 1.051469068021034e-06, + "loss": 0.8662588, + "num_input_tokens_seen": 239794605, + "step": 11103, + "time_per_iteration": 2.630141496658325 + }, + { + "auxiliary_loss_clip": 0.01091051, + "auxiliary_loss_mlp": 0.01027938, + "balance_loss_clip": 1.03639507, + "balance_loss_mlp": 1.01571894, + "epoch": 0.6676085976251315, + "flos": 14319452482560.0, + "grad_norm": 1.8538250094473767, + "language_loss": 0.77889514, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.80008507, + "num_input_tokens_seen": 239812135, + "step": 11104, + "time_per_iteration": 2.7340710163116455 + }, + { + "auxiliary_loss_clip": 0.01067144, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.03659081, + "balance_loss_mlp": 1.01740217, + "epoch": 0.6676687208777995, + "flos": 38104596529920.0, + "grad_norm": 5.138036678415969, + "language_loss": 0.58146316, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.60243386, + "num_input_tokens_seen": 239835845, + "step": 11105, + "time_per_iteration": 2.882567882537842 + }, + { + "auxiliary_loss_clip": 0.01107097, + "auxiliary_loss_mlp": 0.0103608, + "balance_loss_clip": 1.03966367, + "balance_loss_mlp": 1.02172112, + "epoch": 0.6677288441304675, + "flos": 23981576549760.0, + "grad_norm": 6.297152012729004, + "language_loss": 0.73476273, + "learning_rate": 1.0504406049066e-06, + "loss": 0.75619453, + "num_input_tokens_seen": 239853820, + "step": 11106, + "time_per_iteration": 2.6627464294433594 + }, + { + "auxiliary_loss_clip": 0.01113601, + "auxiliary_loss_mlp": 0.0103128, + "balance_loss_clip": 1.0392319, + "balance_loss_mlp": 1.01777363, + "epoch": 0.6677889673831354, + "flos": 24170682677760.0, + "grad_norm": 1.6820711130448331, + "language_loss": 0.76552516, + "learning_rate": 1.0500978558659e-06, + "loss": 0.78697395, + "num_input_tokens_seen": 239873365, + "step": 11107, + "time_per_iteration": 2.655085325241089 + }, + { + "auxiliary_loss_clip": 0.01089336, + "auxiliary_loss_mlp": 0.010326, + "balance_loss_clip": 1.03778529, + "balance_loss_mlp": 1.01969552, + "epoch": 0.6678490906358034, + "flos": 22309648145280.0, + "grad_norm": 2.264065486271505, + "language_loss": 0.90136391, + "learning_rate": 1.049755142845583e-06, + "loss": 0.92258334, + "num_input_tokens_seen": 239891215, + "step": 11108, + "time_per_iteration": 2.7129766941070557 + }, + { + "auxiliary_loss_clip": 0.01083707, + "auxiliary_loss_mlp": 0.01029485, + "balance_loss_clip": 1.04215026, + "balance_loss_mlp": 1.01795101, + "epoch": 0.6679092138884714, + "flos": 36898752026880.0, + "grad_norm": 1.413392892629677, + "language_loss": 0.82960904, + "learning_rate": 1.049412465858646e-06, + "loss": 0.85074097, + "num_input_tokens_seen": 239913490, + "step": 11109, + "time_per_iteration": 2.867154121398926 + }, + { + "auxiliary_loss_clip": 0.01087234, + "auxiliary_loss_mlp": 0.01035101, + "balance_loss_clip": 1.03826952, + "balance_loss_mlp": 1.02132595, + "epoch": 0.6679693371411394, + "flos": 18150294908160.0, + "grad_norm": 2.421344403388597, + "language_loss": 0.70021516, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.72143853, + "num_input_tokens_seen": 239931565, + "step": 11110, + "time_per_iteration": 2.6291885375976562 + }, + { + "auxiliary_loss_clip": 0.01087492, + "auxiliary_loss_mlp": 0.01037588, + "balance_loss_clip": 1.03955197, + "balance_loss_mlp": 1.02289498, + "epoch": 0.6680294603938073, + "flos": 27198167472000.0, + "grad_norm": 1.5840834743354089, + "language_loss": 0.73441553, + "learning_rate": 1.04872722003689e-06, + "loss": 0.75566632, + "num_input_tokens_seen": 239952395, + "step": 11111, + "time_per_iteration": 2.677231788635254 + }, + { + "auxiliary_loss_clip": 0.01110772, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.0385406, + "balance_loss_mlp": 1.01907229, + "epoch": 0.6680895836464753, + "flos": 21725309692800.0, + "grad_norm": 6.009810258732459, + "language_loss": 0.65599185, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.67741144, + "num_input_tokens_seen": 239968910, + "step": 11112, + "time_per_iteration": 2.5904297828674316 + }, + { + "auxiliary_loss_clip": 0.01086609, + "auxiliary_loss_mlp": 0.01030827, + "balance_loss_clip": 1.03706861, + "balance_loss_mlp": 1.01755929, + "epoch": 0.6681497068991432, + "flos": 19646477043840.0, + "grad_norm": 1.8628832000622026, + "language_loss": 0.6369822, + "learning_rate": 1.048042118504569e-06, + "loss": 0.65815663, + "num_input_tokens_seen": 239987680, + "step": 11113, + "time_per_iteration": 2.623263359069824 + }, + { + "auxiliary_loss_clip": 0.01072141, + "auxiliary_loss_mlp": 0.01037464, + "balance_loss_clip": 1.04164052, + "balance_loss_mlp": 1.02563202, + "epoch": 0.6682098301518112, + "flos": 17419153570560.0, + "grad_norm": 1.9634476729216852, + "language_loss": 0.6540277, + "learning_rate": 1.047699621879422e-06, + "loss": 0.67512381, + "num_input_tokens_seen": 240005790, + "step": 11114, + "time_per_iteration": 2.865252733230591 + }, + { + "auxiliary_loss_clip": 0.0110424, + "auxiliary_loss_mlp": 0.0103987, + "balance_loss_clip": 1.03883052, + "balance_loss_mlp": 1.0267992, + "epoch": 0.6682699534044791, + "flos": 22599016110720.0, + "grad_norm": 1.6562280172476918, + "language_loss": 0.78432989, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.80577099, + "num_input_tokens_seen": 240025895, + "step": 11115, + "time_per_iteration": 2.7281594276428223 + }, + { + "auxiliary_loss_clip": 0.0105862, + "auxiliary_loss_mlp": 0.00771764, + "balance_loss_clip": 1.0309999, + "balance_loss_mlp": 1.00021195, + "epoch": 0.6683300766571472, + "flos": 24863686750080.0, + "grad_norm": 1.6526033494173815, + "language_loss": 0.79655063, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.81485444, + "num_input_tokens_seen": 240044880, + "step": 11116, + "time_per_iteration": 4.51043963432312 + }, + { + "auxiliary_loss_clip": 0.01084566, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.03999686, + "balance_loss_mlp": 1.02240658, + "epoch": 0.6683901999098151, + "flos": 27126633536640.0, + "grad_norm": 2.4411111020753347, + "language_loss": 0.7904433, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.81165314, + "num_input_tokens_seen": 240065785, + "step": 11117, + "time_per_iteration": 2.748905897140503 + }, + { + "auxiliary_loss_clip": 0.01069081, + "auxiliary_loss_mlp": 0.01033165, + "balance_loss_clip": 1.03828013, + "balance_loss_mlp": 1.01844835, + "epoch": 0.6684503231624831, + "flos": 20739023072640.0, + "grad_norm": 3.4807828142340815, + "language_loss": 0.65610313, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.67712557, + "num_input_tokens_seen": 240085130, + "step": 11118, + "time_per_iteration": 2.707383871078491 + }, + { + "auxiliary_loss_clip": 0.01091583, + "auxiliary_loss_mlp": 0.01033843, + "balance_loss_clip": 1.03924751, + "balance_loss_mlp": 1.02176738, + "epoch": 0.668510446415151, + "flos": 21762189982080.0, + "grad_norm": 1.4374358637877027, + "language_loss": 0.68942273, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.71067697, + "num_input_tokens_seen": 240105495, + "step": 11119, + "time_per_iteration": 2.6769771575927734 + }, + { + "auxiliary_loss_clip": 0.01086506, + "auxiliary_loss_mlp": 0.01033952, + "balance_loss_clip": 1.03629112, + "balance_loss_mlp": 1.02011752, + "epoch": 0.668570569667819, + "flos": 30191250015360.0, + "grad_norm": 1.6841707968514, + "language_loss": 0.67587042, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.69707495, + "num_input_tokens_seen": 240125455, + "step": 11120, + "time_per_iteration": 5.847496509552002 + }, + { + "auxiliary_loss_clip": 0.01082761, + "auxiliary_loss_mlp": 0.01034422, + "balance_loss_clip": 1.03859222, + "balance_loss_mlp": 1.02105224, + "epoch": 0.668630692920487, + "flos": 24170646764160.0, + "grad_norm": 1.5497664343001825, + "language_loss": 0.72015131, + "learning_rate": 1.045303157347638e-06, + "loss": 0.74132311, + "num_input_tokens_seen": 240143870, + "step": 11121, + "time_per_iteration": 2.763155698776245 + }, + { + "auxiliary_loss_clip": 0.01090844, + "auxiliary_loss_mlp": 0.01037875, + "balance_loss_clip": 1.03589582, + "balance_loss_mlp": 1.02405834, + "epoch": 0.668690816173155, + "flos": 17457147181440.0, + "grad_norm": 2.929304268957898, + "language_loss": 0.70167738, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.72296458, + "num_input_tokens_seen": 240161020, + "step": 11122, + "time_per_iteration": 2.657095432281494 + }, + { + "auxiliary_loss_clip": 0.0105491, + "auxiliary_loss_mlp": 0.00772515, + "balance_loss_clip": 1.03529811, + "balance_loss_mlp": 1.00017619, + "epoch": 0.668750939425823, + "flos": 25005102595200.0, + "grad_norm": 1.8472771024518286, + "language_loss": 0.71752214, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.73579645, + "num_input_tokens_seen": 240179820, + "step": 11123, + "time_per_iteration": 2.811048984527588 + }, + { + "auxiliary_loss_clip": 0.01096616, + "auxiliary_loss_mlp": 0.01042035, + "balance_loss_clip": 1.04108119, + "balance_loss_mlp": 1.02759266, + "epoch": 0.6688110626784909, + "flos": 24096778444800.0, + "grad_norm": 1.6363097123873878, + "language_loss": 0.79147661, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.81286311, + "num_input_tokens_seen": 240200130, + "step": 11124, + "time_per_iteration": 4.317869663238525 + }, + { + "auxiliary_loss_clip": 0.01089397, + "auxiliary_loss_mlp": 0.01041307, + "balance_loss_clip": 1.04114437, + "balance_loss_mlp": 1.02808654, + "epoch": 0.6688711859311589, + "flos": 21759532375680.0, + "grad_norm": 1.733456144830199, + "language_loss": 0.74266189, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.76396894, + "num_input_tokens_seen": 240217945, + "step": 11125, + "time_per_iteration": 2.67317795753479 + }, + { + "auxiliary_loss_clip": 0.01076985, + "auxiliary_loss_mlp": 0.0104133, + "balance_loss_clip": 1.0369091, + "balance_loss_mlp": 1.02759719, + "epoch": 0.6689313091838268, + "flos": 22929645824640.0, + "grad_norm": 2.098915501123677, + "language_loss": 0.67166436, + "learning_rate": 1.043592482774116e-06, + "loss": 0.69284761, + "num_input_tokens_seen": 240237220, + "step": 11126, + "time_per_iteration": 2.739659547805786 + }, + { + "auxiliary_loss_clip": 0.01096554, + "auxiliary_loss_mlp": 0.01031996, + "balance_loss_clip": 1.03653789, + "balance_loss_mlp": 1.01875162, + "epoch": 0.6689914324364948, + "flos": 20886149180160.0, + "grad_norm": 1.7642293623874703, + "language_loss": 0.71071386, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.73199928, + "num_input_tokens_seen": 240256000, + "step": 11127, + "time_per_iteration": 2.729490041732788 + }, + { + "auxiliary_loss_clip": 0.01093813, + "auxiliary_loss_mlp": 0.01034088, + "balance_loss_clip": 1.03839648, + "balance_loss_mlp": 1.01959229, + "epoch": 0.6690515556891627, + "flos": 22748225207040.0, + "grad_norm": 1.9937177709857246, + "language_loss": 0.80368018, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.82495916, + "num_input_tokens_seen": 240275845, + "step": 11128, + "time_per_iteration": 2.6976559162139893 + }, + { + "auxiliary_loss_clip": 0.01114736, + "auxiliary_loss_mlp": 0.01031549, + "balance_loss_clip": 1.03945661, + "balance_loss_mlp": 1.01769066, + "epoch": 0.6691116789418308, + "flos": 23331450337920.0, + "grad_norm": 1.7224977753706385, + "language_loss": 0.80861622, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.83007908, + "num_input_tokens_seen": 240294095, + "step": 11129, + "time_per_iteration": 2.6617815494537354 + }, + { + "auxiliary_loss_clip": 0.01091652, + "auxiliary_loss_mlp": 0.010401, + "balance_loss_clip": 1.03546023, + "balance_loss_mlp": 1.02758944, + "epoch": 0.6691718021944987, + "flos": 32447014081920.0, + "grad_norm": 1.6214077068942991, + "language_loss": 0.70471781, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.72603536, + "num_input_tokens_seen": 240313460, + "step": 11130, + "time_per_iteration": 2.715178966522217 + }, + { + "auxiliary_loss_clip": 0.01088381, + "auxiliary_loss_mlp": 0.0103708, + "balance_loss_clip": 1.03720927, + "balance_loss_mlp": 1.02462888, + "epoch": 0.6692319254471667, + "flos": 23731602825600.0, + "grad_norm": 2.6655548100703643, + "language_loss": 0.70267725, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.72393191, + "num_input_tokens_seen": 240333540, + "step": 11131, + "time_per_iteration": 2.747252941131592 + }, + { + "auxiliary_loss_clip": 0.01104604, + "auxiliary_loss_mlp": 0.01034247, + "balance_loss_clip": 1.03865063, + "balance_loss_mlp": 1.01924431, + "epoch": 0.6692920486998346, + "flos": 14427902620800.0, + "grad_norm": 2.56171206247206, + "language_loss": 0.65588742, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.6772759, + "num_input_tokens_seen": 240350085, + "step": 11132, + "time_per_iteration": 2.697385311126709 + }, + { + "auxiliary_loss_clip": 0.01102641, + "auxiliary_loss_mlp": 0.01034541, + "balance_loss_clip": 1.03688669, + "balance_loss_mlp": 1.01992595, + "epoch": 0.6693521719525026, + "flos": 21507475662720.0, + "grad_norm": 1.866615287346346, + "language_loss": 0.74370456, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.7650764, + "num_input_tokens_seen": 240370015, + "step": 11133, + "time_per_iteration": 2.7032175064086914 + }, + { + "auxiliary_loss_clip": 0.01110623, + "auxiliary_loss_mlp": 0.01036691, + "balance_loss_clip": 1.04268622, + "balance_loss_mlp": 1.0218854, + "epoch": 0.6694122952051706, + "flos": 25406943022080.0, + "grad_norm": 1.7566380678066518, + "language_loss": 0.66696709, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.6884402, + "num_input_tokens_seen": 240390770, + "step": 11134, + "time_per_iteration": 2.7601702213287354 + }, + { + "auxiliary_loss_clip": 0.01106772, + "auxiliary_loss_mlp": 0.01043022, + "balance_loss_clip": 1.04027784, + "balance_loss_mlp": 1.0275842, + "epoch": 0.6694724184578386, + "flos": 25661729168640.0, + "grad_norm": 1.8684519143911829, + "language_loss": 0.77561742, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.79711533, + "num_input_tokens_seen": 240409590, + "step": 11135, + "time_per_iteration": 2.6581594944000244 + }, + { + "auxiliary_loss_clip": 0.01104169, + "auxiliary_loss_mlp": 0.01034806, + "balance_loss_clip": 1.04034281, + "balance_loss_mlp": 1.02143669, + "epoch": 0.6695325417105066, + "flos": 17709311635200.0, + "grad_norm": 1.6117039898518706, + "language_loss": 0.74245167, + "learning_rate": 1.040173855277898e-06, + "loss": 0.76384139, + "num_input_tokens_seen": 240428180, + "step": 11136, + "time_per_iteration": 2.7073006629943848 + }, + { + "auxiliary_loss_clip": 0.01109339, + "auxiliary_loss_mlp": 0.01037889, + "balance_loss_clip": 1.04210007, + "balance_loss_mlp": 1.0232265, + "epoch": 0.6695926649631745, + "flos": 24460050643200.0, + "grad_norm": 1.7129981010638282, + "language_loss": 0.62248957, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.64396185, + "num_input_tokens_seen": 240447815, + "step": 11137, + "time_per_iteration": 2.6636767387390137 + }, + { + "auxiliary_loss_clip": 0.01114546, + "auxiliary_loss_mlp": 0.01028612, + "balance_loss_clip": 1.04025912, + "balance_loss_mlp": 1.01526093, + "epoch": 0.6696527882158425, + "flos": 24280138396800.0, + "grad_norm": 1.688540284250028, + "language_loss": 0.66006732, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.68149894, + "num_input_tokens_seen": 240468635, + "step": 11138, + "time_per_iteration": 2.608583688735962 + }, + { + "auxiliary_loss_clip": 0.01077908, + "auxiliary_loss_mlp": 0.01040221, + "balance_loss_clip": 1.0351193, + "balance_loss_mlp": 1.02686357, + "epoch": 0.6697129114685104, + "flos": 23002759958400.0, + "grad_norm": 1.6525815819397558, + "language_loss": 0.73112983, + "learning_rate": 1.039148976175053e-06, + "loss": 0.75231111, + "num_input_tokens_seen": 240488550, + "step": 11139, + "time_per_iteration": 2.6988184452056885 + }, + { + "auxiliary_loss_clip": 0.01073576, + "auxiliary_loss_mlp": 0.01036448, + "balance_loss_clip": 1.0351299, + "balance_loss_mlp": 1.02378786, + "epoch": 0.6697730347211784, + "flos": 22638123043200.0, + "grad_norm": 1.9643022468042264, + "language_loss": 0.70518827, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.72628856, + "num_input_tokens_seen": 240508330, + "step": 11140, + "time_per_iteration": 2.782379150390625 + }, + { + "auxiliary_loss_clip": 0.01103316, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.0356679, + "balance_loss_mlp": 1.01478446, + "epoch": 0.6698331579738463, + "flos": 28877242682880.0, + "grad_norm": 1.8179612458816414, + "language_loss": 0.75826752, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.77959162, + "num_input_tokens_seen": 240528470, + "step": 11141, + "time_per_iteration": 2.662597417831421 + }, + { + "auxiliary_loss_clip": 0.01103859, + "auxiliary_loss_mlp": 0.01038503, + "balance_loss_clip": 1.03954339, + "balance_loss_mlp": 1.02456141, + "epoch": 0.6698932812265144, + "flos": 24207096090240.0, + "grad_norm": 1.817558320872016, + "language_loss": 0.81910652, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.84053016, + "num_input_tokens_seen": 240547815, + "step": 11142, + "time_per_iteration": 2.6364564895629883 + }, + { + "auxiliary_loss_clip": 0.01063471, + "auxiliary_loss_mlp": 0.01030688, + "balance_loss_clip": 1.03567362, + "balance_loss_mlp": 1.01705074, + "epoch": 0.6699534044791823, + "flos": 22090269830400.0, + "grad_norm": 1.605847382893669, + "language_loss": 0.70027417, + "learning_rate": 1.037782980862959e-06, + "loss": 0.72121578, + "num_input_tokens_seen": 240567765, + "step": 11143, + "time_per_iteration": 2.738811492919922 + }, + { + "auxiliary_loss_clip": 0.01071446, + "auxiliary_loss_mlp": 0.00771315, + "balance_loss_clip": 1.03594804, + "balance_loss_mlp": 1.00014567, + "epoch": 0.6700135277318503, + "flos": 25192377129600.0, + "grad_norm": 1.4724413771665843, + "language_loss": 0.70065033, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.71907794, + "num_input_tokens_seen": 240590750, + "step": 11144, + "time_per_iteration": 2.85090708732605 + }, + { + "auxiliary_loss_clip": 0.01087354, + "auxiliary_loss_mlp": 0.01033347, + "balance_loss_clip": 1.0364095, + "balance_loss_mlp": 1.02025223, + "epoch": 0.6700736509845182, + "flos": 23440187784960.0, + "grad_norm": 1.6283494272446573, + "language_loss": 0.74419498, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.76540208, + "num_input_tokens_seen": 240608875, + "step": 11145, + "time_per_iteration": 2.9192864894866943 + }, + { + "auxiliary_loss_clip": 0.0109431, + "auxiliary_loss_mlp": 0.01030391, + "balance_loss_clip": 1.03830147, + "balance_loss_mlp": 1.01683688, + "epoch": 0.6701337742371862, + "flos": 24389953251840.0, + "grad_norm": 5.654995580149679, + "language_loss": 0.7114135, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.73266053, + "num_input_tokens_seen": 240628565, + "step": 11146, + "time_per_iteration": 2.7690348625183105 + }, + { + "auxiliary_loss_clip": 0.01109374, + "auxiliary_loss_mlp": 0.00770286, + "balance_loss_clip": 1.03855777, + "balance_loss_mlp": 1.00021374, + "epoch": 0.6701938974898543, + "flos": 14793652857600.0, + "grad_norm": 1.9526898160613644, + "language_loss": 0.78687358, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.80567014, + "num_input_tokens_seen": 240646325, + "step": 11147, + "time_per_iteration": 2.6259043216705322 + }, + { + "auxiliary_loss_clip": 0.01104856, + "auxiliary_loss_mlp": 0.0077075, + "balance_loss_clip": 1.04050827, + "balance_loss_mlp": 1.00021648, + "epoch": 0.6702540207425222, + "flos": 20154002261760.0, + "grad_norm": 2.133465120376381, + "language_loss": 0.70325512, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.72201115, + "num_input_tokens_seen": 240666145, + "step": 11148, + "time_per_iteration": 2.6906309127807617 + }, + { + "auxiliary_loss_clip": 0.01094652, + "auxiliary_loss_mlp": 0.01033466, + "balance_loss_clip": 1.03719747, + "balance_loss_mlp": 1.02005529, + "epoch": 0.6703141439951902, + "flos": 21214157201280.0, + "grad_norm": 2.1690530349128148, + "language_loss": 0.7037127, + "learning_rate": 1.035735082774636e-06, + "loss": 0.72499388, + "num_input_tokens_seen": 240685570, + "step": 11149, + "time_per_iteration": 2.6307806968688965 + }, + { + "auxiliary_loss_clip": 0.01092611, + "auxiliary_loss_mlp": 0.010296, + "balance_loss_clip": 1.03670847, + "balance_loss_mlp": 1.01705897, + "epoch": 0.6703742672478581, + "flos": 23112538899840.0, + "grad_norm": 1.6662323590997945, + "language_loss": 0.73725748, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.75847954, + "num_input_tokens_seen": 240706945, + "step": 11150, + "time_per_iteration": 2.6917827129364014 + }, + { + "auxiliary_loss_clip": 0.01103639, + "auxiliary_loss_mlp": 0.01036073, + "balance_loss_clip": 1.0409379, + "balance_loss_mlp": 1.02276325, + "epoch": 0.6704343905005261, + "flos": 22528918719360.0, + "grad_norm": 1.705366168717962, + "language_loss": 0.78539407, + "learning_rate": 1.035052742460671e-06, + "loss": 0.80679119, + "num_input_tokens_seen": 240727990, + "step": 11151, + "time_per_iteration": 2.6567208766937256 + }, + { + "auxiliary_loss_clip": 0.00987579, + "auxiliary_loss_mlp": 0.01000572, + "balance_loss_clip": 1.01053739, + "balance_loss_mlp": 0.99935037, + "epoch": 0.670494513753194, + "flos": 64793158773120.0, + "grad_norm": 0.7884890805336543, + "language_loss": 0.55364567, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.57352722, + "num_input_tokens_seen": 240790380, + "step": 11152, + "time_per_iteration": 3.3006503582000732 + }, + { + "auxiliary_loss_clip": 0.0109132, + "auxiliary_loss_mlp": 0.01038631, + "balance_loss_clip": 1.03844714, + "balance_loss_mlp": 1.025244, + "epoch": 0.670554637005862, + "flos": 23511506238720.0, + "grad_norm": 1.9985135771335918, + "language_loss": 0.80859494, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.82989448, + "num_input_tokens_seen": 240811545, + "step": 11153, + "time_per_iteration": 2.7756435871124268 + }, + { + "auxiliary_loss_clip": 0.01076408, + "auxiliary_loss_mlp": 0.00771693, + "balance_loss_clip": 1.03820157, + "balance_loss_mlp": 1.00020981, + "epoch": 0.67061476025853, + "flos": 19463404400640.0, + "grad_norm": 1.6080859471988709, + "language_loss": 0.76408523, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.78256631, + "num_input_tokens_seen": 240831380, + "step": 11154, + "time_per_iteration": 2.8628106117248535 + }, + { + "auxiliary_loss_clip": 0.01094529, + "auxiliary_loss_mlp": 0.01041911, + "balance_loss_clip": 1.03737462, + "balance_loss_mlp": 1.02754045, + "epoch": 0.670674883511198, + "flos": 20519967980160.0, + "grad_norm": 1.6589905225029438, + "language_loss": 0.76200944, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.78337383, + "num_input_tokens_seen": 240851855, + "step": 11155, + "time_per_iteration": 4.394611120223999 + }, + { + "auxiliary_loss_clip": 0.01115828, + "auxiliary_loss_mlp": 0.01036265, + "balance_loss_clip": 1.04136384, + "balance_loss_mlp": 1.02283049, + "epoch": 0.6707350067638659, + "flos": 25483971738240.0, + "grad_norm": 2.252293716833977, + "language_loss": 0.82174289, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.8432638, + "num_input_tokens_seen": 240869980, + "step": 11156, + "time_per_iteration": 2.672253370285034 + }, + { + "auxiliary_loss_clip": 0.01114074, + "auxiliary_loss_mlp": 0.01037802, + "balance_loss_clip": 1.04081774, + "balance_loss_mlp": 1.02488017, + "epoch": 0.6707951300165339, + "flos": 22273450214400.0, + "grad_norm": 1.89603770151681, + "language_loss": 0.7505753, + "learning_rate": 1.033006600114165e-06, + "loss": 0.77209401, + "num_input_tokens_seen": 240888680, + "step": 11157, + "time_per_iteration": 2.6131577491760254 + }, + { + "auxiliary_loss_clip": 0.01109055, + "auxiliary_loss_mlp": 0.01042973, + "balance_loss_clip": 1.04226005, + "balance_loss_mlp": 1.02867961, + "epoch": 0.6708552532692018, + "flos": 23984593292160.0, + "grad_norm": 1.7747922187460388, + "language_loss": 0.74478519, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.76630545, + "num_input_tokens_seen": 240909050, + "step": 11158, + "time_per_iteration": 2.7293169498443604 + }, + { + "auxiliary_loss_clip": 0.01118082, + "auxiliary_loss_mlp": 0.01037488, + "balance_loss_clip": 1.04157019, + "balance_loss_mlp": 1.02339745, + "epoch": 0.6709153765218698, + "flos": 24937519155840.0, + "grad_norm": 1.5675836402135142, + "language_loss": 0.81520784, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.8367635, + "num_input_tokens_seen": 240930035, + "step": 11159, + "time_per_iteration": 5.697297811508179 + }, + { + "auxiliary_loss_clip": 0.01093112, + "auxiliary_loss_mlp": 0.01031466, + "balance_loss_clip": 1.037853, + "balance_loss_mlp": 1.01822233, + "epoch": 0.6709754997745379, + "flos": 17530225401600.0, + "grad_norm": 1.775658111941971, + "language_loss": 0.76943409, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.79067993, + "num_input_tokens_seen": 240948895, + "step": 11160, + "time_per_iteration": 2.649531602859497 + }, + { + "auxiliary_loss_clip": 0.01088534, + "auxiliary_loss_mlp": 0.01033293, + "balance_loss_clip": 1.03823304, + "balance_loss_mlp": 1.01970327, + "epoch": 0.6710356230272058, + "flos": 22090880361600.0, + "grad_norm": 1.7750116462358165, + "language_loss": 0.73715007, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.75836837, + "num_input_tokens_seen": 240967770, + "step": 11161, + "time_per_iteration": 2.677884817123413 + }, + { + "auxiliary_loss_clip": 0.01093874, + "auxiliary_loss_mlp": 0.01041282, + "balance_loss_clip": 1.03686976, + "balance_loss_mlp": 1.0268575, + "epoch": 0.6710957462798738, + "flos": 24206449645440.0, + "grad_norm": 1.942474500054277, + "language_loss": 0.68453658, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.70588821, + "num_input_tokens_seen": 240988985, + "step": 11162, + "time_per_iteration": 2.7426352500915527 + }, + { + "auxiliary_loss_clip": 0.01089967, + "auxiliary_loss_mlp": 0.01042721, + "balance_loss_clip": 1.03566909, + "balance_loss_mlp": 1.02965569, + "epoch": 0.6711558695325417, + "flos": 19093955063040.0, + "grad_norm": 2.157920195613674, + "language_loss": 0.70179218, + "learning_rate": 1.030961777833032e-06, + "loss": 0.72311902, + "num_input_tokens_seen": 241005455, + "step": 11163, + "time_per_iteration": 2.6737561225891113 + }, + { + "auxiliary_loss_clip": 0.01113094, + "auxiliary_loss_mlp": 0.0103505, + "balance_loss_clip": 1.0411427, + "balance_loss_mlp": 1.02216315, + "epoch": 0.6712159927852097, + "flos": 25557875971200.0, + "grad_norm": 1.7583635951984506, + "language_loss": 0.75421375, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.7756952, + "num_input_tokens_seen": 241026175, + "step": 11164, + "time_per_iteration": 4.2939674854278564 + }, + { + "auxiliary_loss_clip": 0.01115198, + "auxiliary_loss_mlp": 0.01033198, + "balance_loss_clip": 1.0404532, + "balance_loss_mlp": 1.01967335, + "epoch": 0.6712761160378776, + "flos": 22228812587520.0, + "grad_norm": 1.9153842218638528, + "language_loss": 0.65245664, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.67394054, + "num_input_tokens_seen": 241044040, + "step": 11165, + "time_per_iteration": 2.6558966636657715 + }, + { + "auxiliary_loss_clip": 0.01112642, + "auxiliary_loss_mlp": 0.01036219, + "balance_loss_clip": 1.03975642, + "balance_loss_mlp": 1.02284431, + "epoch": 0.6713362392905456, + "flos": 22455517276800.0, + "grad_norm": 2.4551304238389218, + "language_loss": 0.71630502, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.73779362, + "num_input_tokens_seen": 241063615, + "step": 11166, + "time_per_iteration": 2.594005823135376 + }, + { + "auxiliary_loss_clip": 0.01113176, + "auxiliary_loss_mlp": 0.01030674, + "balance_loss_clip": 1.04087472, + "balance_loss_mlp": 1.01834142, + "epoch": 0.6713963625432136, + "flos": 25630200005760.0, + "grad_norm": 2.7039163117890728, + "language_loss": 0.77024722, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.79168576, + "num_input_tokens_seen": 241082520, + "step": 11167, + "time_per_iteration": 2.630964517593384 + }, + { + "auxiliary_loss_clip": 0.01101695, + "auxiliary_loss_mlp": 0.01040458, + "balance_loss_clip": 1.03634501, + "balance_loss_mlp": 1.02711856, + "epoch": 0.6714564857958816, + "flos": 35006475640320.0, + "grad_norm": 1.6082371290328819, + "language_loss": 0.68865132, + "learning_rate": 1.029258769662629e-06, + "loss": 0.71007288, + "num_input_tokens_seen": 241103505, + "step": 11168, + "time_per_iteration": 2.845033884048462 + }, + { + "auxiliary_loss_clip": 0.01078889, + "auxiliary_loss_mlp": 0.01043458, + "balance_loss_clip": 1.03778422, + "balance_loss_mlp": 1.02867651, + "epoch": 0.6715166090485495, + "flos": 26279931168000.0, + "grad_norm": 1.9394421042077383, + "language_loss": 0.73349601, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.75471944, + "num_input_tokens_seen": 241122885, + "step": 11169, + "time_per_iteration": 2.9264886379241943 + }, + { + "auxiliary_loss_clip": 0.0110554, + "auxiliary_loss_mlp": 0.01039147, + "balance_loss_clip": 1.0378871, + "balance_loss_mlp": 1.02427554, + "epoch": 0.6715767323012175, + "flos": 15924156583680.0, + "grad_norm": 1.9283403176707277, + "language_loss": 0.76306462, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.78451145, + "num_input_tokens_seen": 241140865, + "step": 11170, + "time_per_iteration": 2.649400472640991 + }, + { + "auxiliary_loss_clip": 0.01095301, + "auxiliary_loss_mlp": 0.0103096, + "balance_loss_clip": 1.03898799, + "balance_loss_mlp": 1.01709008, + "epoch": 0.6716368555538854, + "flos": 17491441691520.0, + "grad_norm": 1.924665288480993, + "language_loss": 0.74140078, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.76266336, + "num_input_tokens_seen": 241158225, + "step": 11171, + "time_per_iteration": 2.672985076904297 + }, + { + "auxiliary_loss_clip": 0.0107518, + "auxiliary_loss_mlp": 0.01054034, + "balance_loss_clip": 1.03710866, + "balance_loss_mlp": 1.03831053, + "epoch": 0.6716969788065534, + "flos": 16761521416320.0, + "grad_norm": 1.4921239292463526, + "language_loss": 0.86225343, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.88354552, + "num_input_tokens_seen": 241175215, + "step": 11172, + "time_per_iteration": 2.720012664794922 + }, + { + "auxiliary_loss_clip": 0.01098137, + "auxiliary_loss_mlp": 0.01041037, + "balance_loss_clip": 1.0346545, + "balance_loss_mlp": 1.02693462, + "epoch": 0.6717571020592215, + "flos": 22709800632960.0, + "grad_norm": 1.8099463790548698, + "language_loss": 0.63222194, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.65361369, + "num_input_tokens_seen": 241195250, + "step": 11173, + "time_per_iteration": 2.6705803871154785 + }, + { + "auxiliary_loss_clip": 0.0111084, + "auxiliary_loss_mlp": 0.01040058, + "balance_loss_clip": 1.03873289, + "balance_loss_mlp": 1.02487719, + "epoch": 0.6718172253118894, + "flos": 18734094656640.0, + "grad_norm": 2.1708594678401707, + "language_loss": 0.71347594, + "learning_rate": 1.02721637475002e-06, + "loss": 0.73498487, + "num_input_tokens_seen": 241210720, + "step": 11174, + "time_per_iteration": 2.602283477783203 + }, + { + "auxiliary_loss_clip": 0.01075457, + "auxiliary_loss_mlp": 0.01030548, + "balance_loss_clip": 1.03783953, + "balance_loss_mlp": 1.01738167, + "epoch": 0.6718773485645574, + "flos": 15632526061440.0, + "grad_norm": 2.052442882823656, + "language_loss": 0.67971045, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.7007705, + "num_input_tokens_seen": 241227395, + "step": 11175, + "time_per_iteration": 2.669154644012451 + }, + { + "auxiliary_loss_clip": 0.01085, + "auxiliary_loss_mlp": 0.01037471, + "balance_loss_clip": 1.0389663, + "balance_loss_mlp": 1.02479339, + "epoch": 0.6719374718172253, + "flos": 19354774694400.0, + "grad_norm": 2.182967446535966, + "language_loss": 0.7362026, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.75742733, + "num_input_tokens_seen": 241246355, + "step": 11176, + "time_per_iteration": 2.644695997238159 + }, + { + "auxiliary_loss_clip": 0.01093824, + "auxiliary_loss_mlp": 0.01037825, + "balance_loss_clip": 1.03961146, + "balance_loss_mlp": 1.02334082, + "epoch": 0.6719975950698933, + "flos": 21981316901760.0, + "grad_norm": 1.8406147660483967, + "language_loss": 0.72720611, + "learning_rate": 1.026195675108182e-06, + "loss": 0.74852264, + "num_input_tokens_seen": 241264180, + "step": 11177, + "time_per_iteration": 2.6863327026367188 + }, + { + "auxiliary_loss_clip": 0.01115157, + "auxiliary_loss_mlp": 0.01038577, + "balance_loss_clip": 1.03991175, + "balance_loss_mlp": 1.0244683, + "epoch": 0.6720577183225612, + "flos": 25228072270080.0, + "grad_norm": 2.150822621130827, + "language_loss": 0.76274478, + "learning_rate": 1.025855515730551e-06, + "loss": 0.78428215, + "num_input_tokens_seen": 241282245, + "step": 11178, + "time_per_iteration": 2.580979108810425 + }, + { + "auxiliary_loss_clip": 0.01106474, + "auxiliary_loss_mlp": 0.0103827, + "balance_loss_clip": 1.04109895, + "balance_loss_mlp": 1.02494228, + "epoch": 0.6721178415752292, + "flos": 16945886949120.0, + "grad_norm": 1.6631958135032512, + "language_loss": 0.69917423, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.72062165, + "num_input_tokens_seen": 241300745, + "step": 11179, + "time_per_iteration": 2.765749454498291 + }, + { + "auxiliary_loss_clip": 0.01067075, + "auxiliary_loss_mlp": 0.01035482, + "balance_loss_clip": 1.03598237, + "balance_loss_mlp": 1.02269685, + "epoch": 0.6721779648278972, + "flos": 21541375123200.0, + "grad_norm": 1.5427953976374715, + "language_loss": 0.74147439, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.76249993, + "num_input_tokens_seen": 241319320, + "step": 11180, + "time_per_iteration": 2.7570419311523438 + }, + { + "auxiliary_loss_clip": 0.01094967, + "auxiliary_loss_mlp": 0.01032611, + "balance_loss_clip": 1.03934419, + "balance_loss_mlp": 1.01931906, + "epoch": 0.6722380880805652, + "flos": 22605444645120.0, + "grad_norm": 1.3453936001041888, + "language_loss": 0.75262862, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.77390438, + "num_input_tokens_seen": 241342225, + "step": 11181, + "time_per_iteration": 2.805821418762207 + }, + { + "auxiliary_loss_clip": 0.0109711, + "auxiliary_loss_mlp": 0.0103353, + "balance_loss_clip": 1.03977168, + "balance_loss_mlp": 1.0209651, + "epoch": 0.6722982113332331, + "flos": 15925269905280.0, + "grad_norm": 4.685407340613367, + "language_loss": 0.74491268, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.76621902, + "num_input_tokens_seen": 241358240, + "step": 11182, + "time_per_iteration": 2.7147958278656006 + }, + { + "auxiliary_loss_clip": 0.01098785, + "auxiliary_loss_mlp": 0.01033728, + "balance_loss_clip": 1.03787458, + "balance_loss_mlp": 1.02139592, + "epoch": 0.6723583345859011, + "flos": 20596170683520.0, + "grad_norm": 2.0288719371623323, + "language_loss": 0.69882548, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.72015059, + "num_input_tokens_seen": 241378420, + "step": 11183, + "time_per_iteration": 2.6687538623809814 + }, + { + "auxiliary_loss_clip": 0.01064932, + "auxiliary_loss_mlp": 0.01033349, + "balance_loss_clip": 1.0361743, + "balance_loss_mlp": 1.01995015, + "epoch": 0.672418457838569, + "flos": 21725848396800.0, + "grad_norm": 2.97348360718205, + "language_loss": 0.77805459, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.7990374, + "num_input_tokens_seen": 241397185, + "step": 11184, + "time_per_iteration": 2.777731418609619 + }, + { + "auxiliary_loss_clip": 0.0109739, + "auxiliary_loss_mlp": 0.00775757, + "balance_loss_clip": 1.04143977, + "balance_loss_mlp": 1.00022709, + "epoch": 0.672478581091237, + "flos": 21470379891840.0, + "grad_norm": 3.9325636134414426, + "language_loss": 0.66277105, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.68150252, + "num_input_tokens_seen": 241415785, + "step": 11185, + "time_per_iteration": 2.737527370452881 + }, + { + "auxiliary_loss_clip": 0.01076626, + "auxiliary_loss_mlp": 0.01036011, + "balance_loss_clip": 1.03503013, + "balance_loss_mlp": 1.02205157, + "epoch": 0.6725387043439051, + "flos": 30846763267200.0, + "grad_norm": 1.5938972508624505, + "language_loss": 0.80483949, + "learning_rate": 1.023135571620345e-06, + "loss": 0.82596588, + "num_input_tokens_seen": 241437390, + "step": 11186, + "time_per_iteration": 2.8201353549957275 + }, + { + "auxiliary_loss_clip": 0.01101545, + "auxiliary_loss_mlp": 0.01035806, + "balance_loss_clip": 1.04061747, + "balance_loss_mlp": 1.02350974, + "epoch": 0.672598827596573, + "flos": 24055947659520.0, + "grad_norm": 2.88496393330639, + "language_loss": 0.80385649, + "learning_rate": 1.022795745163813e-06, + "loss": 0.82523, + "num_input_tokens_seen": 241458085, + "step": 11187, + "time_per_iteration": 2.7198538780212402 + }, + { + "auxiliary_loss_clip": 0.0107469, + "auxiliary_loss_mlp": 0.01033866, + "balance_loss_clip": 1.04410124, + "balance_loss_mlp": 1.01917362, + "epoch": 0.672658950849241, + "flos": 21871861182720.0, + "grad_norm": 1.9454261923533847, + "language_loss": 0.7059114, + "learning_rate": 1.022455955762965e-06, + "loss": 0.7269969, + "num_input_tokens_seen": 241476880, + "step": 11188, + "time_per_iteration": 2.7985453605651855 + }, + { + "auxiliary_loss_clip": 0.01054991, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.04298103, + "balance_loss_mlp": 1.02394819, + "epoch": 0.6727190741019089, + "flos": 23222102359680.0, + "grad_norm": 1.8365043403177213, + "language_loss": 0.7589345, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.77985215, + "num_input_tokens_seen": 241496535, + "step": 11189, + "time_per_iteration": 2.905705213546753 + }, + { + "auxiliary_loss_clip": 0.01116413, + "auxiliary_loss_mlp": 0.01032806, + "balance_loss_clip": 1.0382818, + "balance_loss_mlp": 1.01785755, + "epoch": 0.6727791973545769, + "flos": 15778610674560.0, + "grad_norm": 2.0168522444965986, + "language_loss": 0.75364029, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.77513248, + "num_input_tokens_seen": 241513465, + "step": 11190, + "time_per_iteration": 2.833767890930176 + }, + { + "auxiliary_loss_clip": 0.01048034, + "auxiliary_loss_mlp": 0.0103557, + "balance_loss_clip": 1.03332615, + "balance_loss_mlp": 1.02153933, + "epoch": 0.6728393206072448, + "flos": 21249852341760.0, + "grad_norm": 2.7169326773783236, + "language_loss": 0.77364898, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.79448497, + "num_input_tokens_seen": 241534125, + "step": 11191, + "time_per_iteration": 2.782000780105591 + }, + { + "auxiliary_loss_clip": 0.01111788, + "auxiliary_loss_mlp": 0.01034042, + "balance_loss_clip": 1.03986657, + "balance_loss_mlp": 1.02100623, + "epoch": 0.6728994438599128, + "flos": 32123279779200.0, + "grad_norm": 2.4096830802466416, + "language_loss": 0.8635608, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.88501906, + "num_input_tokens_seen": 241556340, + "step": 11192, + "time_per_iteration": 2.7193620204925537 + }, + { + "auxiliary_loss_clip": 0.01104606, + "auxiliary_loss_mlp": 0.0103762, + "balance_loss_clip": 1.03892374, + "balance_loss_mlp": 1.023458, + "epoch": 0.6729595671125808, + "flos": 23112359331840.0, + "grad_norm": 2.0040177590782906, + "language_loss": 0.75960791, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.78103018, + "num_input_tokens_seen": 241575185, + "step": 11193, + "time_per_iteration": 2.713738441467285 + }, + { + "auxiliary_loss_clip": 0.01081133, + "auxiliary_loss_mlp": 0.01033198, + "balance_loss_clip": 1.04058063, + "balance_loss_mlp": 1.02000737, + "epoch": 0.6730196903652488, + "flos": 14611406227200.0, + "grad_norm": 1.775580074575331, + "language_loss": 0.78365123, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.80479455, + "num_input_tokens_seen": 241592970, + "step": 11194, + "time_per_iteration": 4.453005075454712 + }, + { + "auxiliary_loss_clip": 0.0110231, + "auxiliary_loss_mlp": 0.01028753, + "balance_loss_clip": 1.03805304, + "balance_loss_mlp": 1.01621783, + "epoch": 0.6730798136179167, + "flos": 21105922544640.0, + "grad_norm": 1.9211443871049516, + "language_loss": 0.89955217, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.92086279, + "num_input_tokens_seen": 241610245, + "step": 11195, + "time_per_iteration": 2.6450841426849365 + }, + { + "auxiliary_loss_clip": 0.01101967, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_clip": 1.03769374, + "balance_loss_mlp": 1.01929736, + "epoch": 0.6731399368705847, + "flos": 28986267438720.0, + "grad_norm": 1.64687980974086, + "language_loss": 0.72439396, + "learning_rate": 1.019738976106662e-06, + "loss": 0.74573386, + "num_input_tokens_seen": 241630350, + "step": 11196, + "time_per_iteration": 2.685826063156128 + }, + { + "auxiliary_loss_clip": 0.00973165, + "auxiliary_loss_mlp": 0.01004254, + "balance_loss_clip": 1.01249313, + "balance_loss_mlp": 1.00303793, + "epoch": 0.6732000601232526, + "flos": 64743708723840.0, + "grad_norm": 0.7752886509100162, + "language_loss": 0.5652535, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.58502769, + "num_input_tokens_seen": 241692380, + "step": 11197, + "time_per_iteration": 3.259193181991577 + }, + { + "auxiliary_loss_clip": 0.01093274, + "auxiliary_loss_mlp": 0.01029344, + "balance_loss_clip": 1.04169464, + "balance_loss_mlp": 1.01701725, + "epoch": 0.6732601833759206, + "flos": 17201642762880.0, + "grad_norm": 2.055708631074821, + "language_loss": 0.7532202, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.77444637, + "num_input_tokens_seen": 241710430, + "step": 11198, + "time_per_iteration": 5.827820777893066 + }, + { + "auxiliary_loss_clip": 0.01103142, + "auxiliary_loss_mlp": 0.01033637, + "balance_loss_clip": 1.03708792, + "balance_loss_mlp": 1.01949286, + "epoch": 0.6733203066285887, + "flos": 18658861620480.0, + "grad_norm": 2.036459352353542, + "language_loss": 0.81907552, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.84044337, + "num_input_tokens_seen": 241724775, + "step": 11199, + "time_per_iteration": 2.5949244499206543 + }, + { + "auxiliary_loss_clip": 0.01059201, + "auxiliary_loss_mlp": 0.01036318, + "balance_loss_clip": 1.03536808, + "balance_loss_mlp": 1.0218575, + "epoch": 0.6733804298812566, + "flos": 35809330481280.0, + "grad_norm": 1.7500176126376645, + "language_loss": 0.7166037, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.73755884, + "num_input_tokens_seen": 241744440, + "step": 11200, + "time_per_iteration": 2.9160830974578857 + }, + { + "auxiliary_loss_clip": 0.01115381, + "auxiliary_loss_mlp": 0.01035903, + "balance_loss_clip": 1.04130912, + "balance_loss_mlp": 1.02295125, + "epoch": 0.6734405531339246, + "flos": 61638833099520.0, + "grad_norm": 2.371327555495297, + "language_loss": 0.64769435, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.66920727, + "num_input_tokens_seen": 241771705, + "step": 11201, + "time_per_iteration": 2.9968230724334717 + }, + { + "auxiliary_loss_clip": 0.01096465, + "auxiliary_loss_mlp": 0.01040904, + "balance_loss_clip": 1.04056644, + "balance_loss_mlp": 1.02676558, + "epoch": 0.6735006763865925, + "flos": 20522338277760.0, + "grad_norm": 2.019287776053706, + "language_loss": 0.63276017, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.65413386, + "num_input_tokens_seen": 241790830, + "step": 11202, + "time_per_iteration": 2.7302961349487305 + }, + { + "auxiliary_loss_clip": 0.01112496, + "auxiliary_loss_mlp": 0.01028107, + "balance_loss_clip": 1.03865552, + "balance_loss_mlp": 1.01558411, + "epoch": 0.6735607996392605, + "flos": 13918869031680.0, + "grad_norm": 1.915253440556218, + "language_loss": 0.74535716, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.76676321, + "num_input_tokens_seen": 241808165, + "step": 11203, + "time_per_iteration": 4.089365243911743 + }, + { + "auxiliary_loss_clip": 0.01098401, + "auxiliary_loss_mlp": 0.01034017, + "balance_loss_clip": 1.04094148, + "balance_loss_mlp": 1.01900887, + "epoch": 0.6736209228919284, + "flos": 18807244704000.0, + "grad_norm": 1.6291352462615132, + "language_loss": 0.67681134, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.6981355, + "num_input_tokens_seen": 241826925, + "step": 11204, + "time_per_iteration": 2.6192142963409424 + }, + { + "auxiliary_loss_clip": 0.01110427, + "auxiliary_loss_mlp": 0.01034559, + "balance_loss_clip": 1.04293954, + "balance_loss_mlp": 1.02012277, + "epoch": 0.6736810461445965, + "flos": 20373129181440.0, + "grad_norm": 1.6630781718701608, + "language_loss": 0.74060369, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.76205349, + "num_input_tokens_seen": 241845525, + "step": 11205, + "time_per_iteration": 2.6068971157073975 + }, + { + "auxiliary_loss_clip": 0.01109012, + "auxiliary_loss_mlp": 0.01037861, + "balance_loss_clip": 1.03733087, + "balance_loss_mlp": 1.02507019, + "epoch": 0.6737411693972644, + "flos": 30007530927360.0, + "grad_norm": 1.5764181927902094, + "language_loss": 0.71426833, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.73573703, + "num_input_tokens_seen": 241866815, + "step": 11206, + "time_per_iteration": 2.6492159366607666 + }, + { + "auxiliary_loss_clip": 0.0107907, + "auxiliary_loss_mlp": 0.0077308, + "balance_loss_clip": 1.03777742, + "balance_loss_mlp": 1.00019574, + "epoch": 0.6738012926499324, + "flos": 25447342844160.0, + "grad_norm": 3.2743303537758712, + "language_loss": 0.67471528, + "learning_rate": 1.016007014855092e-06, + "loss": 0.69323683, + "num_input_tokens_seen": 241887050, + "step": 11207, + "time_per_iteration": 2.7261955738067627 + }, + { + "auxiliary_loss_clip": 0.01062123, + "auxiliary_loss_mlp": 0.01037554, + "balance_loss_clip": 1.035918, + "balance_loss_mlp": 1.02464974, + "epoch": 0.6738614159026003, + "flos": 20776873029120.0, + "grad_norm": 1.9421642242153492, + "language_loss": 0.73736989, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.7583667, + "num_input_tokens_seen": 241904280, + "step": 11208, + "time_per_iteration": 2.7930853366851807 + }, + { + "auxiliary_loss_clip": 0.01097466, + "auxiliary_loss_mlp": 0.01047913, + "balance_loss_clip": 1.03587651, + "balance_loss_mlp": 1.03142679, + "epoch": 0.6739215391552683, + "flos": 19566898462080.0, + "grad_norm": 2.600803881105225, + "language_loss": 0.75433391, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.77578771, + "num_input_tokens_seen": 241919190, + "step": 11209, + "time_per_iteration": 2.626483678817749 + }, + { + "auxiliary_loss_clip": 0.01073019, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.03657913, + "balance_loss_mlp": 1.02250111, + "epoch": 0.6739816624079362, + "flos": 24388193485440.0, + "grad_norm": 1.7906778547342261, + "language_loss": 0.66272515, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.68380302, + "num_input_tokens_seen": 241940525, + "step": 11210, + "time_per_iteration": 2.711866617202759 + }, + { + "auxiliary_loss_clip": 0.01108754, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.03769946, + "balance_loss_mlp": 1.02072227, + "epoch": 0.6740417856606042, + "flos": 22528164533760.0, + "grad_norm": 2.164396283420133, + "language_loss": 0.80170596, + "learning_rate": 1.014651056529377e-06, + "loss": 0.82311797, + "num_input_tokens_seen": 241959290, + "step": 11211, + "time_per_iteration": 2.650737762451172 + }, + { + "auxiliary_loss_clip": 0.01065338, + "auxiliary_loss_mlp": 0.01034108, + "balance_loss_clip": 1.03782678, + "balance_loss_mlp": 1.02107227, + "epoch": 0.6741019089132723, + "flos": 25775458606080.0, + "grad_norm": 1.391978533622519, + "language_loss": 0.76499903, + "learning_rate": 1.014312160327143e-06, + "loss": 0.78599358, + "num_input_tokens_seen": 241980715, + "step": 11212, + "time_per_iteration": 2.778548240661621 + }, + { + "auxiliary_loss_clip": 0.0107247, + "auxiliary_loss_mlp": 0.00773763, + "balance_loss_clip": 1.03496337, + "balance_loss_mlp": 1.00017405, + "epoch": 0.6741620321659402, + "flos": 21105671149440.0, + "grad_norm": 1.7101000867736138, + "language_loss": 0.7758128, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.79427516, + "num_input_tokens_seen": 241999985, + "step": 11213, + "time_per_iteration": 2.7835280895233154 + }, + { + "auxiliary_loss_clip": 0.01061037, + "auxiliary_loss_mlp": 0.01033911, + "balance_loss_clip": 1.03824186, + "balance_loss_mlp": 1.0204041, + "epoch": 0.6742221554186082, + "flos": 20740423703040.0, + "grad_norm": 1.981711873743371, + "language_loss": 0.67612016, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.69706964, + "num_input_tokens_seen": 242018990, + "step": 11214, + "time_per_iteration": 2.9053549766540527 + }, + { + "auxiliary_loss_clip": 0.01113738, + "auxiliary_loss_mlp": 0.00770067, + "balance_loss_clip": 1.03925085, + "balance_loss_mlp": 1.00014567, + "epoch": 0.6742822786712761, + "flos": 37774146384000.0, + "grad_norm": 1.8488975905792826, + "language_loss": 0.72834229, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.74718034, + "num_input_tokens_seen": 242039340, + "step": 11215, + "time_per_iteration": 2.7654783725738525 + }, + { + "auxiliary_loss_clip": 0.0110075, + "auxiliary_loss_mlp": 0.00770504, + "balance_loss_clip": 1.03589749, + "balance_loss_mlp": 1.00019991, + "epoch": 0.6743424019239441, + "flos": 37263891732480.0, + "grad_norm": 2.067200737701415, + "language_loss": 0.67394143, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.69265401, + "num_input_tokens_seen": 242062215, + "step": 11216, + "time_per_iteration": 2.7729885578155518 + }, + { + "auxiliary_loss_clip": 0.01032166, + "auxiliary_loss_mlp": 0.01006139, + "balance_loss_clip": 1.00926828, + "balance_loss_mlp": 1.0051198, + "epoch": 0.674402525176612, + "flos": 65997746300160.0, + "grad_norm": 0.6926580332237084, + "language_loss": 0.56280029, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.58318341, + "num_input_tokens_seen": 242131130, + "step": 11217, + "time_per_iteration": 3.255324125289917 + }, + { + "auxiliary_loss_clip": 0.01099919, + "auxiliary_loss_mlp": 0.0103498, + "balance_loss_clip": 1.0376718, + "balance_loss_mlp": 1.02188516, + "epoch": 0.67446264842928, + "flos": 26461208131200.0, + "grad_norm": 1.7874095302934647, + "language_loss": 0.74496436, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.76631337, + "num_input_tokens_seen": 242149720, + "step": 11218, + "time_per_iteration": 2.672130823135376 + }, + { + "auxiliary_loss_clip": 0.01080832, + "auxiliary_loss_mlp": 0.01049632, + "balance_loss_clip": 1.03884029, + "balance_loss_mlp": 1.03438509, + "epoch": 0.674522771681948, + "flos": 23732392924800.0, + "grad_norm": 1.6252161995703833, + "language_loss": 0.65911674, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.68042141, + "num_input_tokens_seen": 242168875, + "step": 11219, + "time_per_iteration": 2.734159469604492 + }, + { + "auxiliary_loss_clip": 0.01070647, + "auxiliary_loss_mlp": 0.01046329, + "balance_loss_clip": 1.0323844, + "balance_loss_mlp": 1.03093362, + "epoch": 0.674582894934616, + "flos": 24754338771840.0, + "grad_norm": 1.8389940842715735, + "language_loss": 0.75087273, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.77204245, + "num_input_tokens_seen": 242188465, + "step": 11220, + "time_per_iteration": 2.6810474395751953 + }, + { + "auxiliary_loss_clip": 0.01097202, + "auxiliary_loss_mlp": 0.01035624, + "balance_loss_clip": 1.03908563, + "balance_loss_mlp": 1.02207017, + "epoch": 0.6746430181872839, + "flos": 24826626892800.0, + "grad_norm": 1.6228841440103556, + "language_loss": 0.70216316, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.72349143, + "num_input_tokens_seen": 242208675, + "step": 11221, + "time_per_iteration": 2.655421733856201 + }, + { + "auxiliary_loss_clip": 0.01076344, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.0356853, + "balance_loss_mlp": 1.02112806, + "epoch": 0.6747031414399519, + "flos": 16873491087360.0, + "grad_norm": 2.1723447923231554, + "language_loss": 0.58043802, + "learning_rate": 1.010925256180498e-06, + "loss": 0.60153466, + "num_input_tokens_seen": 242227440, + "step": 11222, + "time_per_iteration": 2.698503255844116 + }, + { + "auxiliary_loss_clip": 0.01100055, + "auxiliary_loss_mlp": 0.01035283, + "balance_loss_clip": 1.03881896, + "balance_loss_mlp": 1.02253962, + "epoch": 0.6747632646926198, + "flos": 22784925928320.0, + "grad_norm": 2.113432638374052, + "language_loss": 0.76298106, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.78433442, + "num_input_tokens_seen": 242245240, + "step": 11223, + "time_per_iteration": 2.6607108116149902 + }, + { + "auxiliary_loss_clip": 0.01108219, + "auxiliary_loss_mlp": 0.01036403, + "balance_loss_clip": 1.04148507, + "balance_loss_mlp": 1.02318275, + "epoch": 0.6748233879452878, + "flos": 20046090827520.0, + "grad_norm": 1.767291040158093, + "language_loss": 0.75444579, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.77589202, + "num_input_tokens_seen": 242263435, + "step": 11224, + "time_per_iteration": 2.7242133617401123 + }, + { + "auxiliary_loss_clip": 0.01060708, + "auxiliary_loss_mlp": 0.01032334, + "balance_loss_clip": 1.03886676, + "balance_loss_mlp": 1.02131319, + "epoch": 0.6748835111979558, + "flos": 23002831785600.0, + "grad_norm": 1.693566744371799, + "language_loss": 0.6366834, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.65761381, + "num_input_tokens_seen": 242282765, + "step": 11225, + "time_per_iteration": 2.8525750637054443 + }, + { + "auxiliary_loss_clip": 0.01108343, + "auxiliary_loss_mlp": 0.00768466, + "balance_loss_clip": 1.03901696, + "balance_loss_mlp": 1.00013793, + "epoch": 0.6749436344506238, + "flos": 12197311009920.0, + "grad_norm": 2.4278333466029163, + "language_loss": 0.63865972, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.65742779, + "num_input_tokens_seen": 242298980, + "step": 11226, + "time_per_iteration": 2.5835680961608887 + }, + { + "auxiliary_loss_clip": 0.01105473, + "auxiliary_loss_mlp": 0.01037357, + "balance_loss_clip": 1.04047155, + "balance_loss_mlp": 1.02423763, + "epoch": 0.6750037577032918, + "flos": 11873720361600.0, + "grad_norm": 2.1970918660293717, + "language_loss": 0.71417314, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.73560148, + "num_input_tokens_seen": 242315420, + "step": 11227, + "time_per_iteration": 2.5965003967285156 + }, + { + "auxiliary_loss_clip": 0.01082342, + "auxiliary_loss_mlp": 0.01039904, + "balance_loss_clip": 1.0346818, + "balance_loss_mlp": 1.02601051, + "epoch": 0.6750638809559597, + "flos": 17019611614080.0, + "grad_norm": 1.9754224773045619, + "language_loss": 0.71259153, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.733814, + "num_input_tokens_seen": 242332805, + "step": 11228, + "time_per_iteration": 2.6131396293640137 + }, + { + "auxiliary_loss_clip": 0.01010708, + "auxiliary_loss_mlp": 0.01005158, + "balance_loss_clip": 1.01072896, + "balance_loss_mlp": 1.00386512, + "epoch": 0.6751240042086277, + "flos": 70951011891840.0, + "grad_norm": 0.7503769026779433, + "language_loss": 0.5320974, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.55225611, + "num_input_tokens_seen": 242396160, + "step": 11229, + "time_per_iteration": 3.22717022895813 + }, + { + "auxiliary_loss_clip": 0.01101526, + "auxiliary_loss_mlp": 0.01034212, + "balance_loss_clip": 1.03896284, + "balance_loss_mlp": 1.02234495, + "epoch": 0.6751841274612956, + "flos": 22675146986880.0, + "grad_norm": 1.7298636476457805, + "language_loss": 0.8039158, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.82527316, + "num_input_tokens_seen": 242414660, + "step": 11230, + "time_per_iteration": 2.6328141689300537 + }, + { + "auxiliary_loss_clip": 0.01082067, + "auxiliary_loss_mlp": 0.01035691, + "balance_loss_clip": 1.03726006, + "balance_loss_mlp": 1.02402639, + "epoch": 0.6752442507139637, + "flos": 21288636051840.0, + "grad_norm": 1.6008618014341174, + "language_loss": 0.65935898, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.68053663, + "num_input_tokens_seen": 242434225, + "step": 11231, + "time_per_iteration": 2.626856803894043 + }, + { + "auxiliary_loss_clip": 0.01078317, + "auxiliary_loss_mlp": 0.01042803, + "balance_loss_clip": 1.04247785, + "balance_loss_mlp": 1.02774644, + "epoch": 0.6753043739666316, + "flos": 28256921781120.0, + "grad_norm": 2.0251672391245936, + "language_loss": 0.66539383, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.68660504, + "num_input_tokens_seen": 242454355, + "step": 11232, + "time_per_iteration": 2.743908166885376 + }, + { + "auxiliary_loss_clip": 0.01066681, + "auxiliary_loss_mlp": 0.0103246, + "balance_loss_clip": 1.03211284, + "balance_loss_mlp": 1.01948404, + "epoch": 0.6753644972192996, + "flos": 21360349555200.0, + "grad_norm": 1.6294007960486003, + "language_loss": 0.72326458, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.74425602, + "num_input_tokens_seen": 242474935, + "step": 11233, + "time_per_iteration": 4.338082790374756 + }, + { + "auxiliary_loss_clip": 0.01103097, + "auxiliary_loss_mlp": 0.01037684, + "balance_loss_clip": 1.03895485, + "balance_loss_mlp": 1.02474928, + "epoch": 0.6754246204719675, + "flos": 26541971861760.0, + "grad_norm": 1.5686096839287218, + "language_loss": 0.76833057, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.7897383, + "num_input_tokens_seen": 242495530, + "step": 11234, + "time_per_iteration": 2.6492395401000977 + }, + { + "auxiliary_loss_clip": 0.01111909, + "auxiliary_loss_mlp": 0.01036672, + "balance_loss_clip": 1.03924251, + "balance_loss_mlp": 1.02342129, + "epoch": 0.6754847437246355, + "flos": 25556690822400.0, + "grad_norm": 1.5014850027131166, + "language_loss": 0.75410771, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.77559352, + "num_input_tokens_seen": 242514550, + "step": 11235, + "time_per_iteration": 2.5974621772766113 + }, + { + "auxiliary_loss_clip": 0.01025646, + "auxiliary_loss_mlp": 0.0100208, + "balance_loss_clip": 1.01184058, + "balance_loss_mlp": 1.00095963, + "epoch": 0.6755448669773034, + "flos": 59513318726400.0, + "grad_norm": 0.7779431781811396, + "language_loss": 0.51255912, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.53283638, + "num_input_tokens_seen": 242569200, + "step": 11236, + "time_per_iteration": 3.1667306423187256 + }, + { + "auxiliary_loss_clip": 0.0107986, + "auxiliary_loss_mlp": 0.01032431, + "balance_loss_clip": 1.03790748, + "balance_loss_mlp": 1.01711285, + "epoch": 0.6756049902299714, + "flos": 23294534135040.0, + "grad_norm": 2.192780802483493, + "language_loss": 0.75628972, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.77741265, + "num_input_tokens_seen": 242586950, + "step": 11237, + "time_per_iteration": 5.957702159881592 + }, + { + "auxiliary_loss_clip": 0.01086462, + "auxiliary_loss_mlp": 0.01041243, + "balance_loss_clip": 1.03836346, + "balance_loss_mlp": 1.0282433, + "epoch": 0.6756651134826394, + "flos": 31575426566400.0, + "grad_norm": 2.7350155461999184, + "language_loss": 0.77482605, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.79610306, + "num_input_tokens_seen": 242607380, + "step": 11238, + "time_per_iteration": 2.7448818683624268 + }, + { + "auxiliary_loss_clip": 0.01099837, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.03648901, + "balance_loss_mlp": 1.0227412, + "epoch": 0.6757252367353074, + "flos": 27272287186560.0, + "grad_norm": 1.6539290066506784, + "language_loss": 0.66314852, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.6845113, + "num_input_tokens_seen": 242628025, + "step": 11239, + "time_per_iteration": 2.740363597869873 + }, + { + "auxiliary_loss_clip": 0.01089775, + "auxiliary_loss_mlp": 0.01030806, + "balance_loss_clip": 1.04055905, + "balance_loss_mlp": 1.0182538, + "epoch": 0.6757853599879754, + "flos": 16830900535680.0, + "grad_norm": 1.7720867918116858, + "language_loss": 0.82882285, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.85002863, + "num_input_tokens_seen": 242643825, + "step": 11240, + "time_per_iteration": 2.7659623622894287 + }, + { + "auxiliary_loss_clip": 0.01090669, + "auxiliary_loss_mlp": 0.01035174, + "balance_loss_clip": 1.04133797, + "balance_loss_mlp": 1.01949787, + "epoch": 0.6758454832406433, + "flos": 23220055284480.0, + "grad_norm": 2.676956533168836, + "language_loss": 0.74727547, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.76853395, + "num_input_tokens_seen": 242661820, + "step": 11241, + "time_per_iteration": 2.7259037494659424 + }, + { + "auxiliary_loss_clip": 0.01064722, + "auxiliary_loss_mlp": 0.0103699, + "balance_loss_clip": 1.03947997, + "balance_loss_mlp": 1.02388871, + "epoch": 0.6759056064933113, + "flos": 16289547684480.0, + "grad_norm": 2.2859314322063415, + "language_loss": 0.80506319, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.82608032, + "num_input_tokens_seen": 242679890, + "step": 11242, + "time_per_iteration": 2.7591724395751953 + }, + { + "auxiliary_loss_clip": 0.01095714, + "auxiliary_loss_mlp": 0.01047852, + "balance_loss_clip": 1.03617179, + "balance_loss_mlp": 1.03376102, + "epoch": 0.6759657297459792, + "flos": 25922297404800.0, + "grad_norm": 1.8958528418461225, + "language_loss": 0.72530574, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.74674141, + "num_input_tokens_seen": 242699495, + "step": 11243, + "time_per_iteration": 4.2785255908966064 + }, + { + "auxiliary_loss_clip": 0.01102771, + "auxiliary_loss_mlp": 0.01038727, + "balance_loss_clip": 1.03992796, + "balance_loss_mlp": 1.02621591, + "epoch": 0.6760258529986473, + "flos": 23000820624000.0, + "grad_norm": 3.620795649046328, + "language_loss": 0.72916102, + "learning_rate": 1.003487287162221e-06, + "loss": 0.75057596, + "num_input_tokens_seen": 242719500, + "step": 11244, + "time_per_iteration": 2.656297445297241 + }, + { + "auxiliary_loss_clip": 0.01115915, + "auxiliary_loss_mlp": 0.01045105, + "balance_loss_clip": 1.04072213, + "balance_loss_mlp": 1.03150368, + "epoch": 0.6760859762513152, + "flos": 20959335141120.0, + "grad_norm": 2.083059893523475, + "language_loss": 0.86242104, + "learning_rate": 1.003149631190393e-06, + "loss": 0.8840313, + "num_input_tokens_seen": 242738325, + "step": 11245, + "time_per_iteration": 2.6280319690704346 + }, + { + "auxiliary_loss_clip": 0.01117876, + "auxiliary_loss_mlp": 0.0077189, + "balance_loss_clip": 1.04022503, + "balance_loss_mlp": 1.00016975, + "epoch": 0.6761460995039832, + "flos": 23622937205760.0, + "grad_norm": 2.1743867677621918, + "language_loss": 0.73484135, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.753739, + "num_input_tokens_seen": 242756620, + "step": 11246, + "time_per_iteration": 2.696730375289917 + }, + { + "auxiliary_loss_clip": 0.0109861, + "auxiliary_loss_mlp": 0.01029896, + "balance_loss_clip": 1.03731704, + "balance_loss_mlp": 1.01679528, + "epoch": 0.6762062227566511, + "flos": 20770875457920.0, + "grad_norm": 1.7495113919795662, + "language_loss": 0.87749994, + "learning_rate": 1.002474432661539e-06, + "loss": 0.89878494, + "num_input_tokens_seen": 242774505, + "step": 11247, + "time_per_iteration": 2.6828203201293945 + }, + { + "auxiliary_loss_clip": 0.01009927, + "auxiliary_loss_mlp": 0.01001384, + "balance_loss_clip": 1.00954247, + "balance_loss_mlp": 1.00016785, + "epoch": 0.6762663460093191, + "flos": 52818099166080.0, + "grad_norm": 0.8307013339921004, + "language_loss": 0.53909206, + "learning_rate": 1.002136890130115e-06, + "loss": 0.55920517, + "num_input_tokens_seen": 242828645, + "step": 11248, + "time_per_iteration": 3.2222228050231934 + }, + { + "auxiliary_loss_clip": 0.01057434, + "auxiliary_loss_mlp": 0.01030146, + "balance_loss_clip": 1.0432303, + "balance_loss_mlp": 1.01780176, + "epoch": 0.676326469261987, + "flos": 23696302734720.0, + "grad_norm": 1.557725146566793, + "language_loss": 0.73398393, + "learning_rate": 1.001799385437761e-06, + "loss": 0.75485975, + "num_input_tokens_seen": 242850100, + "step": 11249, + "time_per_iteration": 2.8122363090515137 + }, + { + "auxiliary_loss_clip": 0.01102856, + "auxiliary_loss_mlp": 0.01036455, + "balance_loss_clip": 1.03738058, + "balance_loss_mlp": 1.02277553, + "epoch": 0.676386592514655, + "flos": 14063732582400.0, + "grad_norm": 2.1313223732491506, + "language_loss": 0.73983771, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.76123083, + "num_input_tokens_seen": 242867775, + "step": 11250, + "time_per_iteration": 2.697199583053589 + }, + { + "auxiliary_loss_clip": 0.01113481, + "auxiliary_loss_mlp": 0.01031768, + "balance_loss_clip": 1.03948021, + "balance_loss_mlp": 1.01904869, + "epoch": 0.676446715767323, + "flos": 20412236113920.0, + "grad_norm": 1.816015271011089, + "language_loss": 0.75130785, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.77276027, + "num_input_tokens_seen": 242886865, + "step": 11251, + "time_per_iteration": 2.6333305835723877 + }, + { + "auxiliary_loss_clip": 0.01078452, + "auxiliary_loss_mlp": 0.01031314, + "balance_loss_clip": 1.04010725, + "balance_loss_mlp": 1.0182966, + "epoch": 0.676506839019991, + "flos": 21288241002240.0, + "grad_norm": 1.551518166422534, + "language_loss": 0.69901943, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.72011709, + "num_input_tokens_seen": 242906705, + "step": 11252, + "time_per_iteration": 2.9181244373321533 + }, + { + "auxiliary_loss_clip": 0.01064839, + "auxiliary_loss_mlp": 0.01033228, + "balance_loss_clip": 1.03892565, + "balance_loss_mlp": 1.02052665, + "epoch": 0.676566962272659, + "flos": 29932477459200.0, + "grad_norm": 1.6718962617994413, + "language_loss": 0.66779602, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.68877667, + "num_input_tokens_seen": 242925215, + "step": 11253, + "time_per_iteration": 2.8428003787994385 + }, + { + "auxiliary_loss_clip": 0.01070699, + "auxiliary_loss_mlp": 0.00775318, + "balance_loss_clip": 1.03454018, + "balance_loss_mlp": 1.00019038, + "epoch": 0.6766270855253269, + "flos": 17931203902080.0, + "grad_norm": 1.5527696111799332, + "language_loss": 0.7722379, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.79069805, + "num_input_tokens_seen": 242944750, + "step": 11254, + "time_per_iteration": 2.712817668914795 + }, + { + "auxiliary_loss_clip": 0.0110248, + "auxiliary_loss_mlp": 0.01035428, + "balance_loss_clip": 1.03869474, + "balance_loss_mlp": 1.02183247, + "epoch": 0.6766872087779949, + "flos": 23104853389440.0, + "grad_norm": 3.950409887802226, + "language_loss": 0.72361761, + "learning_rate": 9.997751526206835e-07, + "loss": 0.74499667, + "num_input_tokens_seen": 242963860, + "step": 11255, + "time_per_iteration": 2.6217257976531982 + }, + { + "auxiliary_loss_clip": 0.01061354, + "auxiliary_loss_mlp": 0.00771432, + "balance_loss_clip": 1.03389072, + "balance_loss_mlp": 1.00019884, + "epoch": 0.6767473320306628, + "flos": 26213137827840.0, + "grad_norm": 2.6592328120058584, + "language_loss": 0.75315595, + "learning_rate": 9.994379131600828e-07, + "loss": 0.7714839, + "num_input_tokens_seen": 242983050, + "step": 11256, + "time_per_iteration": 2.834801435470581 + }, + { + "auxiliary_loss_clip": 0.01105322, + "auxiliary_loss_mlp": 0.01036203, + "balance_loss_clip": 1.04157603, + "balance_loss_mlp": 1.0230726, + "epoch": 0.6768074552833309, + "flos": 18368739469440.0, + "grad_norm": 2.0954099595982836, + "language_loss": 0.6498003, + "learning_rate": 9.991007116408965e-07, + "loss": 0.67121565, + "num_input_tokens_seen": 243001125, + "step": 11257, + "time_per_iteration": 2.6306867599487305 + }, + { + "auxiliary_loss_clip": 0.01067487, + "auxiliary_loss_mlp": 0.01033246, + "balance_loss_clip": 1.04190707, + "balance_loss_mlp": 1.02090788, + "epoch": 0.6768675785359988, + "flos": 23039927556480.0, + "grad_norm": 1.6004297573343516, + "language_loss": 0.75491571, + "learning_rate": 9.987635480759109e-07, + "loss": 0.77592301, + "num_input_tokens_seen": 243021865, + "step": 11258, + "time_per_iteration": 2.9189696311950684 + }, + { + "auxiliary_loss_clip": 0.01089351, + "auxiliary_loss_mlp": 0.01036704, + "balance_loss_clip": 1.03900814, + "balance_loss_mlp": 1.02450931, + "epoch": 0.6769277017886668, + "flos": 33036524092800.0, + "grad_norm": 1.5823758287987741, + "language_loss": 0.66676503, + "learning_rate": 9.984264224779127e-07, + "loss": 0.68802559, + "num_input_tokens_seen": 243042970, + "step": 11259, + "time_per_iteration": 2.7564451694488525 + }, + { + "auxiliary_loss_clip": 0.01090564, + "auxiliary_loss_mlp": 0.01035059, + "balance_loss_clip": 1.03726125, + "balance_loss_mlp": 1.02206516, + "epoch": 0.6769878250413347, + "flos": 20848406964480.0, + "grad_norm": 2.48292750681471, + "language_loss": 0.85291499, + "learning_rate": 9.980893348596839e-07, + "loss": 0.8741712, + "num_input_tokens_seen": 243058470, + "step": 11260, + "time_per_iteration": 2.660332441329956 + }, + { + "auxiliary_loss_clip": 0.01085932, + "auxiliary_loss_mlp": 0.01039752, + "balance_loss_clip": 1.03486264, + "balance_loss_mlp": 1.02588189, + "epoch": 0.6770479482940027, + "flos": 15595968994560.0, + "grad_norm": 2.613252024528438, + "language_loss": 0.77209002, + "learning_rate": 9.977522852340081e-07, + "loss": 0.79334688, + "num_input_tokens_seen": 243076630, + "step": 11261, + "time_per_iteration": 2.6410372257232666 + }, + { + "auxiliary_loss_clip": 0.0109228, + "auxiliary_loss_mlp": 0.01040148, + "balance_loss_clip": 1.03776324, + "balance_loss_mlp": 1.02687383, + "epoch": 0.6771080715466706, + "flos": 18621011664000.0, + "grad_norm": 2.010792691714421, + "language_loss": 0.87528884, + "learning_rate": 9.97415273613666e-07, + "loss": 0.89661312, + "num_input_tokens_seen": 243092260, + "step": 11262, + "time_per_iteration": 2.6288645267486572 + }, + { + "auxiliary_loss_clip": 0.01089821, + "auxiliary_loss_mlp": 0.01035622, + "balance_loss_clip": 1.03876138, + "balance_loss_mlp": 1.02234757, + "epoch": 0.6771681947993387, + "flos": 12495441893760.0, + "grad_norm": 1.942743373156589, + "language_loss": 0.74668461, + "learning_rate": 9.97078300011439e-07, + "loss": 0.76793909, + "num_input_tokens_seen": 243109405, + "step": 11263, + "time_per_iteration": 2.666969060897827 + }, + { + "auxiliary_loss_clip": 0.01107967, + "auxiliary_loss_mlp": 0.01034392, + "balance_loss_clip": 1.04032826, + "balance_loss_mlp": 1.02013433, + "epoch": 0.6772283180520066, + "flos": 22236964974720.0, + "grad_norm": 3.4280923778329435, + "language_loss": 0.67490625, + "learning_rate": 9.967413644401016e-07, + "loss": 0.69632983, + "num_input_tokens_seen": 243128135, + "step": 11264, + "time_per_iteration": 2.620027780532837 + }, + { + "auxiliary_loss_clip": 0.01092011, + "auxiliary_loss_mlp": 0.01036919, + "balance_loss_clip": 1.04065371, + "balance_loss_mlp": 1.02333474, + "epoch": 0.6772884413046746, + "flos": 16143139848960.0, + "grad_norm": 1.9352746586576008, + "language_loss": 0.7301234, + "learning_rate": 9.964044669124324e-07, + "loss": 0.75141263, + "num_input_tokens_seen": 243146785, + "step": 11265, + "time_per_iteration": 2.638399600982666 + }, + { + "auxiliary_loss_clip": 0.0106857, + "auxiliary_loss_mlp": 0.01046347, + "balance_loss_clip": 1.03400207, + "balance_loss_mlp": 1.03247142, + "epoch": 0.6773485645573426, + "flos": 19135755515520.0, + "grad_norm": 2.1206255290710594, + "language_loss": 0.61617583, + "learning_rate": 9.96067607441207e-07, + "loss": 0.63732499, + "num_input_tokens_seen": 243165275, + "step": 11266, + "time_per_iteration": 2.6741204261779785 + }, + { + "auxiliary_loss_clip": 0.01086026, + "auxiliary_loss_mlp": 0.01036436, + "balance_loss_clip": 1.04107964, + "balance_loss_mlp": 1.02305472, + "epoch": 0.6774086878100105, + "flos": 14136918543360.0, + "grad_norm": 1.7639590037989088, + "language_loss": 0.7056399, + "learning_rate": 9.957307860391976e-07, + "loss": 0.72686452, + "num_input_tokens_seen": 243182845, + "step": 11267, + "time_per_iteration": 2.701676607131958 + }, + { + "auxiliary_loss_clip": 0.01112717, + "auxiliary_loss_mlp": 0.01034596, + "balance_loss_clip": 1.03907073, + "balance_loss_mlp": 1.02175152, + "epoch": 0.6774688110626785, + "flos": 22197067943040.0, + "grad_norm": 3.4663937575357093, + "language_loss": 0.71232986, + "learning_rate": 9.953940027191785e-07, + "loss": 0.73380297, + "num_input_tokens_seen": 243201475, + "step": 11268, + "time_per_iteration": 2.582158327102661 + }, + { + "auxiliary_loss_clip": 0.01089704, + "auxiliary_loss_mlp": 0.01038381, + "balance_loss_clip": 1.03727484, + "balance_loss_mlp": 1.02395701, + "epoch": 0.6775289343153464, + "flos": 23039963470080.0, + "grad_norm": 1.54975098078917, + "language_loss": 0.76582503, + "learning_rate": 9.950572574939194e-07, + "loss": 0.78710592, + "num_input_tokens_seen": 243221850, + "step": 11269, + "time_per_iteration": 2.6784608364105225 + }, + { + "auxiliary_loss_clip": 0.01079985, + "auxiliary_loss_mlp": 0.01039406, + "balance_loss_clip": 1.03688645, + "balance_loss_mlp": 1.02560711, + "epoch": 0.6775890575680145, + "flos": 18293506433280.0, + "grad_norm": 3.9513063189541895, + "language_loss": 0.74380577, + "learning_rate": 9.94720550376189e-07, + "loss": 0.76499963, + "num_input_tokens_seen": 243239855, + "step": 11270, + "time_per_iteration": 2.82761812210083 + }, + { + "auxiliary_loss_clip": 0.01059034, + "auxiliary_loss_mlp": 0.0104238, + "balance_loss_clip": 1.03957486, + "balance_loss_mlp": 1.02821255, + "epoch": 0.6776491808206824, + "flos": 25336450581120.0, + "grad_norm": 1.7088059356464216, + "language_loss": 0.73103487, + "learning_rate": 9.94383881378756e-07, + "loss": 0.75204897, + "num_input_tokens_seen": 243260085, + "step": 11271, + "time_per_iteration": 2.7955849170684814 + }, + { + "auxiliary_loss_clip": 0.01113021, + "auxiliary_loss_mlp": 0.01036559, + "balance_loss_clip": 1.03949916, + "balance_loss_mlp": 1.02401781, + "epoch": 0.6777093040733504, + "flos": 26028233591040.0, + "grad_norm": 3.147727016102492, + "language_loss": 0.68212342, + "learning_rate": 9.94047250514387e-07, + "loss": 0.70361924, + "num_input_tokens_seen": 243280065, + "step": 11272, + "time_per_iteration": 2.637103796005249 + }, + { + "auxiliary_loss_clip": 0.01103771, + "auxiliary_loss_mlp": 0.01035716, + "balance_loss_clip": 1.0390712, + "balance_loss_mlp": 1.02126229, + "epoch": 0.6777694273260183, + "flos": 18003599763840.0, + "grad_norm": 1.7828232071604915, + "language_loss": 0.73829705, + "learning_rate": 9.937106577958481e-07, + "loss": 0.75969195, + "num_input_tokens_seen": 243297775, + "step": 11273, + "time_per_iteration": 4.394399642944336 + }, + { + "auxiliary_loss_clip": 0.01094453, + "auxiliary_loss_mlp": 0.01041712, + "balance_loss_clip": 1.03916848, + "balance_loss_mlp": 1.02846813, + "epoch": 0.6778295505786863, + "flos": 23441085624960.0, + "grad_norm": 1.8919224773021028, + "language_loss": 0.70701563, + "learning_rate": 9.933741032359015e-07, + "loss": 0.72837734, + "num_input_tokens_seen": 243315760, + "step": 11274, + "time_per_iteration": 2.5985240936279297 + }, + { + "auxiliary_loss_clip": 0.01114225, + "auxiliary_loss_mlp": 0.01033716, + "balance_loss_clip": 1.0387696, + "balance_loss_mlp": 1.02027476, + "epoch": 0.6778896738313542, + "flos": 19098408349440.0, + "grad_norm": 4.65357993377392, + "language_loss": 0.6543079, + "learning_rate": 9.930375868473093e-07, + "loss": 0.67578733, + "num_input_tokens_seen": 243335715, + "step": 11275, + "time_per_iteration": 2.6151697635650635 + }, + { + "auxiliary_loss_clip": 0.01106727, + "auxiliary_loss_mlp": 0.01033632, + "balance_loss_clip": 1.04250121, + "balance_loss_mlp": 1.02126956, + "epoch": 0.6779497970840223, + "flos": 26103933504000.0, + "grad_norm": 1.5789952929470612, + "language_loss": 0.72767758, + "learning_rate": 9.927011086428335e-07, + "loss": 0.74908113, + "num_input_tokens_seen": 243356935, + "step": 11276, + "time_per_iteration": 5.899662017822266 + }, + { + "auxiliary_loss_clip": 0.01087765, + "auxiliary_loss_mlp": 0.00771415, + "balance_loss_clip": 1.03646386, + "balance_loss_mlp": 1.00016904, + "epoch": 0.6780099203366902, + "flos": 19719232041600.0, + "grad_norm": 1.681215818326951, + "language_loss": 0.76681376, + "learning_rate": 9.923646686352317e-07, + "loss": 0.78540558, + "num_input_tokens_seen": 243375625, + "step": 11277, + "time_per_iteration": 2.6914784908294678 + }, + { + "auxiliary_loss_clip": 0.01092848, + "auxiliary_loss_mlp": 0.01033178, + "balance_loss_clip": 1.03808713, + "balance_loss_mlp": 1.01976132, + "epoch": 0.6780700435893582, + "flos": 18214538382720.0, + "grad_norm": 2.7540725669591724, + "language_loss": 0.83632004, + "learning_rate": 9.920282668372627e-07, + "loss": 0.8575803, + "num_input_tokens_seen": 243390195, + "step": 11278, + "time_per_iteration": 2.637618064880371 + }, + { + "auxiliary_loss_clip": 0.01085002, + "auxiliary_loss_mlp": 0.00769413, + "balance_loss_clip": 1.04043651, + "balance_loss_mlp": 1.00012565, + "epoch": 0.6781301668420262, + "flos": 25376239872000.0, + "grad_norm": 1.5336771376537068, + "language_loss": 0.70519423, + "learning_rate": 9.916919032616844e-07, + "loss": 0.72373849, + "num_input_tokens_seen": 243411690, + "step": 11279, + "time_per_iteration": 2.7715609073638916 + }, + { + "auxiliary_loss_clip": 0.01105152, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.03994751, + "balance_loss_mlp": 1.0217998, + "epoch": 0.6781902900946941, + "flos": 24020432087040.0, + "grad_norm": 1.8650529100630782, + "language_loss": 0.73610586, + "learning_rate": 9.913555779212485e-07, + "loss": 0.75751317, + "num_input_tokens_seen": 243430280, + "step": 11280, + "time_per_iteration": 2.734544277191162 + }, + { + "auxiliary_loss_clip": 0.01103265, + "auxiliary_loss_mlp": 0.01036072, + "balance_loss_clip": 1.03754926, + "balance_loss_mlp": 1.02211285, + "epoch": 0.6782504133473621, + "flos": 19646764352640.0, + "grad_norm": 2.0625858122456178, + "language_loss": 0.70312506, + "learning_rate": 9.910192908287104e-07, + "loss": 0.72451842, + "num_input_tokens_seen": 243448690, + "step": 11281, + "time_per_iteration": 2.622098684310913 + }, + { + "auxiliary_loss_clip": 0.01111077, + "auxiliary_loss_mlp": 0.01028525, + "balance_loss_clip": 1.04020238, + "balance_loss_mlp": 1.01619864, + "epoch": 0.67831053660003, + "flos": 24932742647040.0, + "grad_norm": 1.4839814095064274, + "language_loss": 0.63879716, + "learning_rate": 9.906830419968217e-07, + "loss": 0.66019315, + "num_input_tokens_seen": 243470695, + "step": 11282, + "time_per_iteration": 4.292442798614502 + }, + { + "auxiliary_loss_clip": 0.01075036, + "auxiliary_loss_mlp": 0.01049322, + "balance_loss_clip": 1.03349972, + "balance_loss_mlp": 1.03204811, + "epoch": 0.6783706598526981, + "flos": 31208383440000.0, + "grad_norm": 1.7556170346158129, + "language_loss": 0.74497384, + "learning_rate": 9.90346831438334e-07, + "loss": 0.76621741, + "num_input_tokens_seen": 243493345, + "step": 11283, + "time_per_iteration": 2.9562923908233643 + }, + { + "auxiliary_loss_clip": 0.01103456, + "auxiliary_loss_mlp": 0.01029151, + "balance_loss_clip": 1.04012847, + "balance_loss_mlp": 1.01659822, + "epoch": 0.678430783105366, + "flos": 35441317687680.0, + "grad_norm": 1.5708851854862296, + "language_loss": 0.56767416, + "learning_rate": 9.900106591659948e-07, + "loss": 0.58900023, + "num_input_tokens_seen": 243515670, + "step": 11284, + "time_per_iteration": 2.8391168117523193 + }, + { + "auxiliary_loss_clip": 0.01090169, + "auxiliary_loss_mlp": 0.0103128, + "balance_loss_clip": 1.03865993, + "balance_loss_mlp": 1.01850688, + "epoch": 0.678490906358034, + "flos": 14428800460800.0, + "grad_norm": 1.928659697893085, + "language_loss": 0.75430858, + "learning_rate": 9.896745251925535e-07, + "loss": 0.77552313, + "num_input_tokens_seen": 243533625, + "step": 11285, + "time_per_iteration": 2.8025879859924316 + }, + { + "auxiliary_loss_clip": 0.01113003, + "auxiliary_loss_mlp": 0.01032581, + "balance_loss_clip": 1.04134154, + "balance_loss_mlp": 1.01964653, + "epoch": 0.6785510296107019, + "flos": 24311236596480.0, + "grad_norm": 1.7863771120930665, + "language_loss": 0.66262901, + "learning_rate": 9.893384295307557e-07, + "loss": 0.68408483, + "num_input_tokens_seen": 243553040, + "step": 11286, + "time_per_iteration": 2.6879425048828125 + }, + { + "auxiliary_loss_clip": 0.0109176, + "auxiliary_loss_mlp": 0.01029833, + "balance_loss_clip": 1.03810883, + "balance_loss_mlp": 1.01649332, + "epoch": 0.6786111528633699, + "flos": 26977244872320.0, + "grad_norm": 2.710702139669138, + "language_loss": 0.5293864, + "learning_rate": 9.890023721933447e-07, + "loss": 0.55060238, + "num_input_tokens_seen": 243572590, + "step": 11287, + "time_per_iteration": 2.6729018688201904 + }, + { + "auxiliary_loss_clip": 0.01070232, + "auxiliary_loss_mlp": 0.01039812, + "balance_loss_clip": 1.03733265, + "balance_loss_mlp": 1.0263176, + "epoch": 0.6786712761160378, + "flos": 24317557390080.0, + "grad_norm": 1.5085934530827387, + "language_loss": 0.77353847, + "learning_rate": 9.886663531930655e-07, + "loss": 0.79463893, + "num_input_tokens_seen": 243594140, + "step": 11288, + "time_per_iteration": 2.76521897315979 + }, + { + "auxiliary_loss_clip": 0.01106153, + "auxiliary_loss_mlp": 0.0103743, + "balance_loss_clip": 1.04171705, + "balance_loss_mlp": 1.0247159, + "epoch": 0.6787313993687059, + "flos": 22930435923840.0, + "grad_norm": 1.9499442288864346, + "language_loss": 0.73456311, + "learning_rate": 9.883303725426593e-07, + "loss": 0.75599885, + "num_input_tokens_seen": 243615170, + "step": 11289, + "time_per_iteration": 2.6673424243927 + }, + { + "auxiliary_loss_clip": 0.01114362, + "auxiliary_loss_mlp": 0.010388, + "balance_loss_clip": 1.04031169, + "balance_loss_mlp": 1.02534115, + "epoch": 0.6787915226213738, + "flos": 26868435598080.0, + "grad_norm": 1.6821989273437945, + "language_loss": 0.80101818, + "learning_rate": 9.879944302548682e-07, + "loss": 0.82254982, + "num_input_tokens_seen": 243635675, + "step": 11290, + "time_per_iteration": 2.632082223892212 + }, + { + "auxiliary_loss_clip": 0.01101296, + "auxiliary_loss_mlp": 0.010341, + "balance_loss_clip": 1.04066992, + "balance_loss_mlp": 1.02134442, + "epoch": 0.6788516458740418, + "flos": 20008851402240.0, + "grad_norm": 1.599385548142358, + "language_loss": 0.75065523, + "learning_rate": 9.87658526342428e-07, + "loss": 0.77200925, + "num_input_tokens_seen": 243654950, + "step": 11291, + "time_per_iteration": 2.6852645874023438 + }, + { + "auxiliary_loss_clip": 0.01096412, + "auxiliary_loss_mlp": 0.00771696, + "balance_loss_clip": 1.04071581, + "balance_loss_mlp": 1.0002079, + "epoch": 0.6789117691267098, + "flos": 28727099832960.0, + "grad_norm": 1.9085592005378407, + "language_loss": 0.75479198, + "learning_rate": 9.873226608180785e-07, + "loss": 0.77347308, + "num_input_tokens_seen": 243674970, + "step": 11292, + "time_per_iteration": 2.699632167816162 + }, + { + "auxiliary_loss_clip": 0.01073788, + "auxiliary_loss_mlp": 0.01034074, + "balance_loss_clip": 1.03495204, + "balance_loss_mlp": 1.02013278, + "epoch": 0.6789718923793777, + "flos": 23403451150080.0, + "grad_norm": 2.0284676657461858, + "language_loss": 0.84163547, + "learning_rate": 9.869868336945556e-07, + "loss": 0.86271405, + "num_input_tokens_seen": 243693440, + "step": 11293, + "time_per_iteration": 2.719419240951538 + }, + { + "auxiliary_loss_clip": 0.01119964, + "auxiliary_loss_mlp": 0.01040774, + "balance_loss_clip": 1.04201078, + "balance_loss_mlp": 1.02618265, + "epoch": 0.6790320156320457, + "flos": 20448865008000.0, + "grad_norm": 2.5902571187100722, + "language_loss": 0.79863316, + "learning_rate": 9.866510449845929e-07, + "loss": 0.8202405, + "num_input_tokens_seen": 243710055, + "step": 11294, + "time_per_iteration": 2.5868771076202393 + }, + { + "auxiliary_loss_clip": 0.0109056, + "auxiliary_loss_mlp": 0.01027861, + "balance_loss_clip": 1.03927612, + "balance_loss_mlp": 1.01579142, + "epoch": 0.6790921388847136, + "flos": 24167199058560.0, + "grad_norm": 1.662741005119297, + "language_loss": 0.79054183, + "learning_rate": 9.86315294700924e-07, + "loss": 0.81172609, + "num_input_tokens_seen": 243728635, + "step": 11295, + "time_per_iteration": 2.677510976791382 + }, + { + "auxiliary_loss_clip": 0.0108512, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.03939927, + "balance_loss_mlp": 1.02034116, + "epoch": 0.6791522621373817, + "flos": 21908095027200.0, + "grad_norm": 1.7946887652261734, + "language_loss": 0.71118504, + "learning_rate": 9.859795828562823e-07, + "loss": 0.7323516, + "num_input_tokens_seen": 243748330, + "step": 11296, + "time_per_iteration": 2.7060418128967285 + }, + { + "auxiliary_loss_clip": 0.01100933, + "auxiliary_loss_mlp": 0.01032318, + "balance_loss_clip": 1.03921032, + "balance_loss_mlp": 1.01968789, + "epoch": 0.6792123853900496, + "flos": 24826519152000.0, + "grad_norm": 1.4998043731898119, + "language_loss": 0.70986772, + "learning_rate": 9.856439094633949e-07, + "loss": 0.73120022, + "num_input_tokens_seen": 243769380, + "step": 11297, + "time_per_iteration": 2.6602540016174316 + }, + { + "auxiliary_loss_clip": 0.01086842, + "auxiliary_loss_mlp": 0.01036373, + "balance_loss_clip": 1.03981018, + "balance_loss_mlp": 1.02242589, + "epoch": 0.6792725086427176, + "flos": 17566279678080.0, + "grad_norm": 2.428106974293634, + "language_loss": 0.66109335, + "learning_rate": 9.853082745349918e-07, + "loss": 0.68232548, + "num_input_tokens_seen": 243785510, + "step": 11298, + "time_per_iteration": 2.694490671157837 + }, + { + "auxiliary_loss_clip": 0.01105001, + "auxiliary_loss_mlp": 0.01027328, + "balance_loss_clip": 1.03936362, + "balance_loss_mlp": 1.0155797, + "epoch": 0.6793326318953855, + "flos": 26941837040640.0, + "grad_norm": 1.6664116325381613, + "language_loss": 0.71988988, + "learning_rate": 9.84972678083801e-07, + "loss": 0.7412132, + "num_input_tokens_seen": 243805545, + "step": 11299, + "time_per_iteration": 2.713809013366699 + }, + { + "auxiliary_loss_clip": 0.0111669, + "auxiliary_loss_mlp": 0.01035745, + "balance_loss_clip": 1.04250383, + "balance_loss_mlp": 1.02194023, + "epoch": 0.6793927551480535, + "flos": 24318275662080.0, + "grad_norm": 1.2656496116170863, + "language_loss": 0.77410668, + "learning_rate": 9.846371201225488e-07, + "loss": 0.79563105, + "num_input_tokens_seen": 243825185, + "step": 11300, + "time_per_iteration": 2.6434032917022705 + }, + { + "auxiliary_loss_clip": 0.01101279, + "auxiliary_loss_mlp": 0.01035248, + "balance_loss_clip": 1.0383904, + "balance_loss_mlp": 1.02223039, + "epoch": 0.6794528784007214, + "flos": 11436615757440.0, + "grad_norm": 1.7784061043917232, + "language_loss": 0.63197196, + "learning_rate": 9.843016006639577e-07, + "loss": 0.65333718, + "num_input_tokens_seen": 243841600, + "step": 11301, + "time_per_iteration": 2.5723037719726562 + }, + { + "auxiliary_loss_clip": 0.0110239, + "auxiliary_loss_mlp": 0.01032092, + "balance_loss_clip": 1.03951216, + "balance_loss_mlp": 1.01922345, + "epoch": 0.6795130016533895, + "flos": 25229688382080.0, + "grad_norm": 1.6293976559584973, + "language_loss": 0.82879919, + "learning_rate": 9.839661197207525e-07, + "loss": 0.85014397, + "num_input_tokens_seen": 243862250, + "step": 11302, + "time_per_iteration": 2.8143625259399414 + }, + { + "auxiliary_loss_clip": 0.01105417, + "auxiliary_loss_mlp": 0.01036251, + "balance_loss_clip": 1.039042, + "balance_loss_mlp": 1.02304244, + "epoch": 0.6795731249060574, + "flos": 18296415434880.0, + "grad_norm": 2.345439766685576, + "language_loss": 0.69651306, + "learning_rate": 9.83630677305654e-07, + "loss": 0.71792972, + "num_input_tokens_seen": 243880560, + "step": 11303, + "time_per_iteration": 2.685213565826416 + }, + { + "auxiliary_loss_clip": 0.01084889, + "auxiliary_loss_mlp": 0.01035911, + "balance_loss_clip": 1.03954554, + "balance_loss_mlp": 1.02285755, + "epoch": 0.6796332481587254, + "flos": 20300374183680.0, + "grad_norm": 2.3397839406401864, + "language_loss": 0.70244884, + "learning_rate": 9.832952734313813e-07, + "loss": 0.72365683, + "num_input_tokens_seen": 243900635, + "step": 11304, + "time_per_iteration": 2.7310903072357178 + }, + { + "auxiliary_loss_clip": 0.01105112, + "auxiliary_loss_mlp": 0.01033253, + "balance_loss_clip": 1.04138947, + "balance_loss_mlp": 1.0197227, + "epoch": 0.6796933714113934, + "flos": 23586847015680.0, + "grad_norm": 2.2457086045709107, + "language_loss": 0.72435552, + "learning_rate": 9.829599081106536e-07, + "loss": 0.74573922, + "num_input_tokens_seen": 243920160, + "step": 11305, + "time_per_iteration": 2.651684522628784 + }, + { + "auxiliary_loss_clip": 0.01091817, + "auxiliary_loss_mlp": 0.01031313, + "balance_loss_clip": 1.03978157, + "balance_loss_mlp": 1.01869428, + "epoch": 0.6797534946640613, + "flos": 27119917693440.0, + "grad_norm": 2.0035628154788268, + "language_loss": 0.66448355, + "learning_rate": 9.826245813561882e-07, + "loss": 0.68571484, + "num_input_tokens_seen": 243939015, + "step": 11306, + "time_per_iteration": 2.655308723449707 + }, + { + "auxiliary_loss_clip": 0.01089759, + "auxiliary_loss_mlp": 0.0103013, + "balance_loss_clip": 1.03967357, + "balance_loss_mlp": 1.0164274, + "epoch": 0.6798136179167293, + "flos": 22127437428480.0, + "grad_norm": 1.6430540606311845, + "language_loss": 0.80062962, + "learning_rate": 9.822892931807021e-07, + "loss": 0.82182848, + "num_input_tokens_seen": 243958470, + "step": 11307, + "time_per_iteration": 2.661414861679077 + }, + { + "auxiliary_loss_clip": 0.01087499, + "auxiliary_loss_mlp": 0.01040608, + "balance_loss_clip": 1.03799939, + "balance_loss_mlp": 1.0259217, + "epoch": 0.6798737411693972, + "flos": 17488640430720.0, + "grad_norm": 1.645939087248785, + "language_loss": 0.89180249, + "learning_rate": 9.819540435969066e-07, + "loss": 0.91308355, + "num_input_tokens_seen": 243975450, + "step": 11308, + "time_per_iteration": 2.677755117416382 + }, + { + "auxiliary_loss_clip": 0.01075745, + "auxiliary_loss_mlp": 0.01043456, + "balance_loss_clip": 1.03412437, + "balance_loss_mlp": 1.02860808, + "epoch": 0.6799338644220653, + "flos": 22892262744960.0, + "grad_norm": 2.440998746341053, + "language_loss": 0.70999914, + "learning_rate": 9.816188326175154e-07, + "loss": 0.73119116, + "num_input_tokens_seen": 243994355, + "step": 11309, + "time_per_iteration": 2.716607093811035 + }, + { + "auxiliary_loss_clip": 0.01084669, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.0404917, + "balance_loss_mlp": 1.02482367, + "epoch": 0.6799939876747332, + "flos": 23180409648000.0, + "grad_norm": 1.9482500712222228, + "language_loss": 0.84240967, + "learning_rate": 9.812836602552411e-07, + "loss": 0.86364162, + "num_input_tokens_seen": 244011620, + "step": 11310, + "time_per_iteration": 2.722900152206421 + }, + { + "auxiliary_loss_clip": 0.01085075, + "auxiliary_loss_mlp": 0.01035424, + "balance_loss_clip": 1.03975177, + "balance_loss_mlp": 1.0229609, + "epoch": 0.6800541109274012, + "flos": 19499925553920.0, + "grad_norm": 2.93005287761764, + "language_loss": 0.83355272, + "learning_rate": 9.80948526522792e-07, + "loss": 0.85475767, + "num_input_tokens_seen": 244029925, + "step": 11311, + "time_per_iteration": 2.6596853733062744 + }, + { + "auxiliary_loss_clip": 0.01066687, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.03414321, + "balance_loss_mlp": 1.01699138, + "epoch": 0.6801142341800691, + "flos": 22277652105600.0, + "grad_norm": 2.3870630839480045, + "language_loss": 0.76332116, + "learning_rate": 9.806134314328767e-07, + "loss": 0.78430879, + "num_input_tokens_seen": 244051225, + "step": 11312, + "time_per_iteration": 4.449703693389893 + }, + { + "auxiliary_loss_clip": 0.01032073, + "auxiliary_loss_mlp": 0.01012541, + "balance_loss_clip": 1.00875306, + "balance_loss_mlp": 1.0114975, + "epoch": 0.6801743574327371, + "flos": 68714817759360.0, + "grad_norm": 0.6670891966515724, + "language_loss": 0.57208383, + "learning_rate": 9.802783749982038e-07, + "loss": 0.59253001, + "num_input_tokens_seen": 244115930, + "step": 11313, + "time_per_iteration": 3.371553897857666 + }, + { + "auxiliary_loss_clip": 0.01103732, + "auxiliary_loss_mlp": 0.01030366, + "balance_loss_clip": 1.03782439, + "balance_loss_mlp": 1.01666939, + "epoch": 0.680234480685405, + "flos": 29460467813760.0, + "grad_norm": 2.3056583415168075, + "language_loss": 0.69011742, + "learning_rate": 9.799433572314754e-07, + "loss": 0.71145844, + "num_input_tokens_seen": 244137320, + "step": 11314, + "time_per_iteration": 2.697596549987793 + }, + { + "auxiliary_loss_clip": 0.01097203, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.0348376, + "balance_loss_mlp": 1.01988137, + "epoch": 0.6802946039380731, + "flos": 15916866122880.0, + "grad_norm": 1.7422328614888773, + "language_loss": 0.81493306, + "learning_rate": 9.796083781453972e-07, + "loss": 0.83622658, + "num_input_tokens_seen": 244152755, + "step": 11315, + "time_per_iteration": 5.966327905654907 + }, + { + "auxiliary_loss_clip": 0.01074474, + "auxiliary_loss_mlp": 0.01028329, + "balance_loss_clip": 1.04119301, + "balance_loss_mlp": 1.01551998, + "epoch": 0.680354727190741, + "flos": 22018664067840.0, + "grad_norm": 1.6310079102471389, + "language_loss": 0.6954093, + "learning_rate": 9.792734377526718e-07, + "loss": 0.71643734, + "num_input_tokens_seen": 244171480, + "step": 11316, + "time_per_iteration": 2.767069101333618 + }, + { + "auxiliary_loss_clip": 0.01101612, + "auxiliary_loss_mlp": 0.01031421, + "balance_loss_clip": 1.04027951, + "balance_loss_mlp": 1.01897597, + "epoch": 0.680414850443409, + "flos": 18441494467200.0, + "grad_norm": 2.220463387251609, + "language_loss": 0.66746044, + "learning_rate": 9.789385360660003e-07, + "loss": 0.6887908, + "num_input_tokens_seen": 244187920, + "step": 11317, + "time_per_iteration": 2.6441752910614014 + }, + { + "auxiliary_loss_clip": 0.01104685, + "auxiliary_loss_mlp": 0.01039234, + "balance_loss_clip": 1.04243541, + "balance_loss_mlp": 1.02681887, + "epoch": 0.680474973696077, + "flos": 26358611909760.0, + "grad_norm": 1.689359585188632, + "language_loss": 0.74998158, + "learning_rate": 9.78603673098082e-07, + "loss": 0.77142078, + "num_input_tokens_seen": 244209565, + "step": 11318, + "time_per_iteration": 2.6722664833068848 + }, + { + "auxiliary_loss_clip": 0.01082639, + "auxiliary_loss_mlp": 0.01031826, + "balance_loss_clip": 1.03599584, + "balance_loss_mlp": 1.01942801, + "epoch": 0.6805350969487449, + "flos": 18333116156160.0, + "grad_norm": 1.901183060662594, + "language_loss": 0.67961919, + "learning_rate": 9.782688488616143e-07, + "loss": 0.70076376, + "num_input_tokens_seen": 244228015, + "step": 11319, + "time_per_iteration": 2.6768836975097656 + }, + { + "auxiliary_loss_clip": 0.01075168, + "auxiliary_loss_mlp": 0.00771315, + "balance_loss_clip": 1.04308462, + "balance_loss_mlp": 1.00015819, + "epoch": 0.6805952202014129, + "flos": 19937497034880.0, + "grad_norm": 2.0018434908969054, + "language_loss": 0.7674346, + "learning_rate": 9.779340633692945e-07, + "loss": 0.7858994, + "num_input_tokens_seen": 244245615, + "step": 11320, + "time_per_iteration": 2.7204952239990234 + }, + { + "auxiliary_loss_clip": 0.01085122, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.03866565, + "balance_loss_mlp": 1.01947236, + "epoch": 0.6806553434540809, + "flos": 25224301342080.0, + "grad_norm": 1.7764880825865026, + "language_loss": 0.74452037, + "learning_rate": 9.77599316633817e-07, + "loss": 0.76569694, + "num_input_tokens_seen": 244263625, + "step": 11321, + "time_per_iteration": 4.261602878570557 + }, + { + "auxiliary_loss_clip": 0.01093792, + "auxiliary_loss_mlp": 0.0103627, + "balance_loss_clip": 1.04168797, + "balance_loss_mlp": 1.02327621, + "epoch": 0.6807154667067489, + "flos": 17785586165760.0, + "grad_norm": 1.8638596765379807, + "language_loss": 0.72978008, + "learning_rate": 9.772646086678758e-07, + "loss": 0.75108075, + "num_input_tokens_seen": 244282745, + "step": 11322, + "time_per_iteration": 2.672649383544922 + }, + { + "auxiliary_loss_clip": 0.0106289, + "auxiliary_loss_mlp": 0.00772149, + "balance_loss_clip": 1.03630495, + "balance_loss_mlp": 1.00025296, + "epoch": 0.6807755899594168, + "flos": 22199905117440.0, + "grad_norm": 1.6392607041064693, + "language_loss": 0.78432202, + "learning_rate": 9.769299394841638e-07, + "loss": 0.80267245, + "num_input_tokens_seen": 244303770, + "step": 11323, + "time_per_iteration": 2.9052011966705322 + }, + { + "auxiliary_loss_clip": 0.01000379, + "auxiliary_loss_mlp": 0.01000283, + "balance_loss_clip": 1.00907302, + "balance_loss_mlp": 0.99898928, + "epoch": 0.6808357132120848, + "flos": 68631073200000.0, + "grad_norm": 0.7447348872303097, + "language_loss": 0.57086504, + "learning_rate": 9.765953090953714e-07, + "loss": 0.59087169, + "num_input_tokens_seen": 244355910, + "step": 11324, + "time_per_iteration": 3.0236268043518066 + }, + { + "auxiliary_loss_clip": 0.01094828, + "auxiliary_loss_mlp": 0.01037923, + "balance_loss_clip": 1.04058325, + "balance_loss_mlp": 1.02427304, + "epoch": 0.6808958364647527, + "flos": 23843357015040.0, + "grad_norm": 2.178701691002947, + "language_loss": 0.68127519, + "learning_rate": 9.76260717514186e-07, + "loss": 0.70260274, + "num_input_tokens_seen": 244376610, + "step": 11325, + "time_per_iteration": 2.6579439640045166 + }, + { + "auxiliary_loss_clip": 0.01104202, + "auxiliary_loss_mlp": 0.01032415, + "balance_loss_clip": 1.03789818, + "balance_loss_mlp": 1.01858699, + "epoch": 0.6809559597174207, + "flos": 17711717846400.0, + "grad_norm": 2.3301645499310655, + "language_loss": 0.70840496, + "learning_rate": 9.759261647532974e-07, + "loss": 0.72977114, + "num_input_tokens_seen": 244393000, + "step": 11326, + "time_per_iteration": 2.599581718444824 + }, + { + "auxiliary_loss_clip": 0.0111401, + "auxiliary_loss_mlp": 0.01033979, + "balance_loss_clip": 1.03960943, + "balance_loss_mlp": 1.02112806, + "epoch": 0.6810160829700886, + "flos": 22491894775680.0, + "grad_norm": 1.9162803943824422, + "language_loss": 0.73098135, + "learning_rate": 9.75591650825392e-07, + "loss": 0.75246119, + "num_input_tokens_seen": 244409515, + "step": 11327, + "time_per_iteration": 2.6066248416900635 + }, + { + "auxiliary_loss_clip": 0.01099529, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.03761911, + "balance_loss_mlp": 1.01766074, + "epoch": 0.6810762062227567, + "flos": 16832875783680.0, + "grad_norm": 1.8101774590253075, + "language_loss": 0.774257, + "learning_rate": 9.752571757431526e-07, + "loss": 0.79555899, + "num_input_tokens_seen": 244427165, + "step": 11328, + "time_per_iteration": 2.6368680000305176 + }, + { + "auxiliary_loss_clip": 0.01114029, + "auxiliary_loss_mlp": 0.01029553, + "balance_loss_clip": 1.03958964, + "balance_loss_mlp": 1.01668477, + "epoch": 0.6811363294754246, + "flos": 12714676554240.0, + "grad_norm": 2.0540376759628183, + "language_loss": 0.64911687, + "learning_rate": 9.74922739519265e-07, + "loss": 0.67055273, + "num_input_tokens_seen": 244445705, + "step": 11329, + "time_per_iteration": 2.573288679122925 + }, + { + "auxiliary_loss_clip": 0.0105984, + "auxiliary_loss_mlp": 0.00771154, + "balance_loss_clip": 1.03904939, + "balance_loss_mlp": 1.00018847, + "epoch": 0.6811964527280926, + "flos": 17711969241600.0, + "grad_norm": 1.9764612571544942, + "language_loss": 0.79003155, + "learning_rate": 9.745883421664096e-07, + "loss": 0.80834144, + "num_input_tokens_seen": 244460415, + "step": 11330, + "time_per_iteration": 2.773776054382324 + }, + { + "auxiliary_loss_clip": 0.01103225, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.03934312, + "balance_loss_mlp": 1.01867759, + "epoch": 0.6812565759807605, + "flos": 24863471268480.0, + "grad_norm": 2.2394798931993467, + "language_loss": 0.6390332, + "learning_rate": 9.742539836972665e-07, + "loss": 0.66039056, + "num_input_tokens_seen": 244480555, + "step": 11331, + "time_per_iteration": 2.648928165435791 + }, + { + "auxiliary_loss_clip": 0.01066258, + "auxiliary_loss_mlp": 0.01041405, + "balance_loss_clip": 1.0375067, + "balance_loss_mlp": 1.02576447, + "epoch": 0.6813166992334285, + "flos": 17166019449600.0, + "grad_norm": 1.609984813626945, + "language_loss": 0.72195572, + "learning_rate": 9.739196641245148e-07, + "loss": 0.74303234, + "num_input_tokens_seen": 244498540, + "step": 11332, + "time_per_iteration": 2.713545322418213 + }, + { + "auxiliary_loss_clip": 0.01103166, + "auxiliary_loss_mlp": 0.01035791, + "balance_loss_clip": 1.03957558, + "balance_loss_mlp": 1.02199841, + "epoch": 0.6813768224860965, + "flos": 18843550375680.0, + "grad_norm": 2.168847998609439, + "language_loss": 0.74432015, + "learning_rate": 9.735853834608326e-07, + "loss": 0.76570976, + "num_input_tokens_seen": 244517015, + "step": 11333, + "time_per_iteration": 2.6175012588500977 + }, + { + "auxiliary_loss_clip": 0.01105297, + "auxiliary_loss_mlp": 0.01033437, + "balance_loss_clip": 1.04025042, + "balance_loss_mlp": 1.01950097, + "epoch": 0.6814369457387645, + "flos": 24532733813760.0, + "grad_norm": 1.5744091613936917, + "language_loss": 0.71911031, + "learning_rate": 9.732511417188963e-07, + "loss": 0.74049771, + "num_input_tokens_seen": 244537450, + "step": 11334, + "time_per_iteration": 2.6650426387786865 + }, + { + "auxiliary_loss_clip": 0.0109758, + "auxiliary_loss_mlp": 0.01035782, + "balance_loss_clip": 1.0405134, + "balance_loss_mlp": 1.02300835, + "epoch": 0.6814970689914325, + "flos": 18222978078720.0, + "grad_norm": 1.6636732632213396, + "language_loss": 0.85627699, + "learning_rate": 9.729169389113791e-07, + "loss": 0.87761062, + "num_input_tokens_seen": 244555640, + "step": 11335, + "time_per_iteration": 2.6842143535614014 + }, + { + "auxiliary_loss_clip": 0.01094419, + "auxiliary_loss_mlp": 0.01029844, + "balance_loss_clip": 1.03577423, + "balance_loss_mlp": 1.01767254, + "epoch": 0.6815571922441004, + "flos": 25228790542080.0, + "grad_norm": 1.7085573075393508, + "language_loss": 0.82023531, + "learning_rate": 9.725827750509542e-07, + "loss": 0.84147793, + "num_input_tokens_seen": 244574005, + "step": 11336, + "time_per_iteration": 2.6614902019500732 + }, + { + "auxiliary_loss_clip": 0.01068778, + "auxiliary_loss_mlp": 0.01036536, + "balance_loss_clip": 1.03518701, + "balance_loss_mlp": 1.02383399, + "epoch": 0.6816173154967684, + "flos": 19456078026240.0, + "grad_norm": 2.367816368546713, + "language_loss": 0.81341016, + "learning_rate": 9.72248650150294e-07, + "loss": 0.83446324, + "num_input_tokens_seen": 244591395, + "step": 11337, + "time_per_iteration": 2.657960891723633 + }, + { + "auxiliary_loss_clip": 0.01066549, + "auxiliary_loss_mlp": 0.01031579, + "balance_loss_clip": 1.03811026, + "balance_loss_mlp": 1.01948464, + "epoch": 0.6816774387494363, + "flos": 17931455297280.0, + "grad_norm": 1.783864414164821, + "language_loss": 0.72693783, + "learning_rate": 9.719145642220673e-07, + "loss": 0.74791908, + "num_input_tokens_seen": 244610400, + "step": 11338, + "time_per_iteration": 2.7979798316955566 + }, + { + "auxiliary_loss_clip": 0.0107103, + "auxiliary_loss_mlp": 0.0103912, + "balance_loss_clip": 1.03607392, + "balance_loss_mlp": 1.02586377, + "epoch": 0.6817375620021043, + "flos": 22233014478720.0, + "grad_norm": 1.4931481110033054, + "language_loss": 0.775159, + "learning_rate": 9.715805172789435e-07, + "loss": 0.79626048, + "num_input_tokens_seen": 244630400, + "step": 11339, + "time_per_iteration": 2.795623540878296 + }, + { + "auxiliary_loss_clip": 0.01078643, + "auxiliary_loss_mlp": 0.01038418, + "balance_loss_clip": 1.03534794, + "balance_loss_mlp": 1.02518606, + "epoch": 0.6817976852547722, + "flos": 25374408278400.0, + "grad_norm": 2.310939550899193, + "language_loss": 0.70462239, + "learning_rate": 9.712465093335901e-07, + "loss": 0.72579294, + "num_input_tokens_seen": 244649155, + "step": 11340, + "time_per_iteration": 2.7247865200042725 + }, + { + "auxiliary_loss_clip": 0.01095095, + "auxiliary_loss_mlp": 0.01039919, + "balance_loss_clip": 1.04094124, + "balance_loss_mlp": 1.02693105, + "epoch": 0.6818578085074403, + "flos": 22265764704000.0, + "grad_norm": 2.4098119524731483, + "language_loss": 0.83344734, + "learning_rate": 9.709125403986722e-07, + "loss": 0.85479748, + "num_input_tokens_seen": 244665470, + "step": 11341, + "time_per_iteration": 2.693506956100464 + }, + { + "auxiliary_loss_clip": 0.01081074, + "auxiliary_loss_mlp": 0.01039727, + "balance_loss_clip": 1.03870487, + "balance_loss_mlp": 1.02477837, + "epoch": 0.6819179317601082, + "flos": 19318145800320.0, + "grad_norm": 2.2255629522007907, + "language_loss": 0.68262535, + "learning_rate": 9.705786104868531e-07, + "loss": 0.70383334, + "num_input_tokens_seen": 244684390, + "step": 11342, + "time_per_iteration": 2.73895263671875 + }, + { + "auxiliary_loss_clip": 0.01057789, + "auxiliary_loss_mlp": 0.01030928, + "balance_loss_clip": 1.0362978, + "balance_loss_mlp": 1.01732635, + "epoch": 0.6819780550127762, + "flos": 21104126864640.0, + "grad_norm": 1.497693268832665, + "language_loss": 0.74835187, + "learning_rate": 9.702447196107963e-07, + "loss": 0.76923907, + "num_input_tokens_seen": 244703370, + "step": 11343, + "time_per_iteration": 2.713353157043457 + }, + { + "auxiliary_loss_clip": 0.0107318, + "auxiliary_loss_mlp": 0.01047273, + "balance_loss_clip": 1.03880191, + "balance_loss_mlp": 1.03244925, + "epoch": 0.6820381782654441, + "flos": 29716403195520.0, + "grad_norm": 2.1197347783426714, + "language_loss": 0.79880822, + "learning_rate": 9.699108677831639e-07, + "loss": 0.82001281, + "num_input_tokens_seen": 244723325, + "step": 11344, + "time_per_iteration": 2.794928550720215 + }, + { + "auxiliary_loss_clip": 0.01076417, + "auxiliary_loss_mlp": 0.01035037, + "balance_loss_clip": 1.03809428, + "balance_loss_mlp": 1.02200782, + "epoch": 0.6820983015181121, + "flos": 29242130993280.0, + "grad_norm": 2.387575724266914, + "language_loss": 0.66499114, + "learning_rate": 9.695770550166136e-07, + "loss": 0.68610573, + "num_input_tokens_seen": 244745650, + "step": 11345, + "time_per_iteration": 2.7620160579681396 + }, + { + "auxiliary_loss_clip": 0.01095586, + "auxiliary_loss_mlp": 0.01037386, + "balance_loss_clip": 1.04086328, + "balance_loss_mlp": 1.02385557, + "epoch": 0.6821584247707801, + "flos": 18871775487360.0, + "grad_norm": 2.5184537784822347, + "language_loss": 0.64879942, + "learning_rate": 9.692432813238054e-07, + "loss": 0.67012918, + "num_input_tokens_seen": 244760270, + "step": 11346, + "time_per_iteration": 2.6018569469451904 + }, + { + "auxiliary_loss_clip": 0.01047778, + "auxiliary_loss_mlp": 0.00774999, + "balance_loss_clip": 1.02989614, + "balance_loss_mlp": 1.00022125, + "epoch": 0.6822185480234481, + "flos": 21324582587520.0, + "grad_norm": 1.6732433495926922, + "language_loss": 0.78631318, + "learning_rate": 9.689095467173952e-07, + "loss": 0.80454087, + "num_input_tokens_seen": 244779565, + "step": 11347, + "time_per_iteration": 2.7881743907928467 + }, + { + "auxiliary_loss_clip": 0.01023846, + "auxiliary_loss_mlp": 0.01003144, + "balance_loss_clip": 1.01006222, + "balance_loss_mlp": 1.00196934, + "epoch": 0.6822786712761161, + "flos": 63488306430720.0, + "grad_norm": 0.7197437780259376, + "language_loss": 0.5252403, + "learning_rate": 9.685758512100378e-07, + "loss": 0.54551017, + "num_input_tokens_seen": 244838480, + "step": 11348, + "time_per_iteration": 3.159472942352295 + }, + { + "auxiliary_loss_clip": 0.01111335, + "auxiliary_loss_mlp": 0.0103669, + "balance_loss_clip": 1.03910565, + "balance_loss_mlp": 1.02423859, + "epoch": 0.682338794528784, + "flos": 21068934514560.0, + "grad_norm": 1.699268260200451, + "language_loss": 0.79743314, + "learning_rate": 9.682421948143873e-07, + "loss": 0.8189134, + "num_input_tokens_seen": 244855265, + "step": 11349, + "time_per_iteration": 2.6090118885040283 + }, + { + "auxiliary_loss_clip": 0.01107133, + "auxiliary_loss_mlp": 0.01033347, + "balance_loss_clip": 1.03977346, + "balance_loss_mlp": 1.01808834, + "epoch": 0.682398917781452, + "flos": 36283243547520.0, + "grad_norm": 1.8874948598089236, + "language_loss": 0.73788822, + "learning_rate": 9.67908577543096e-07, + "loss": 0.75929302, + "num_input_tokens_seen": 244875555, + "step": 11350, + "time_per_iteration": 2.819202184677124 + }, + { + "auxiliary_loss_clip": 0.01113228, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.04043305, + "balance_loss_mlp": 1.01912093, + "epoch": 0.6824590410341199, + "flos": 24859197550080.0, + "grad_norm": 1.5956944903953967, + "language_loss": 0.79352248, + "learning_rate": 9.675749994088161e-07, + "loss": 0.81498003, + "num_input_tokens_seen": 244895270, + "step": 11351, + "time_per_iteration": 4.24803614616394 + }, + { + "auxiliary_loss_clip": 0.01100964, + "auxiliary_loss_mlp": 0.01038048, + "balance_loss_clip": 1.03889775, + "balance_loss_mlp": 1.02563834, + "epoch": 0.6825191642867879, + "flos": 22452392793600.0, + "grad_norm": 1.5926332734392936, + "language_loss": 0.73048198, + "learning_rate": 9.672414604241954e-07, + "loss": 0.75187206, + "num_input_tokens_seen": 244914535, + "step": 11352, + "time_per_iteration": 2.608426094055176 + }, + { + "auxiliary_loss_clip": 0.01066712, + "auxiliary_loss_mlp": 0.01039687, + "balance_loss_clip": 1.03466344, + "balance_loss_mlp": 1.0250721, + "epoch": 0.6825792875394558, + "flos": 29424377623680.0, + "grad_norm": 1.4647464086643525, + "language_loss": 0.79812884, + "learning_rate": 9.669079606018814e-07, + "loss": 0.81919283, + "num_input_tokens_seen": 244936095, + "step": 11353, + "time_per_iteration": 2.789823532104492 + }, + { + "auxiliary_loss_clip": 0.0110206, + "auxiliary_loss_mlp": 0.01030724, + "balance_loss_clip": 1.03809357, + "balance_loss_mlp": 1.01783192, + "epoch": 0.6826394107921239, + "flos": 18770974945920.0, + "grad_norm": 1.6494881368897465, + "language_loss": 0.78637832, + "learning_rate": 9.665744999545218e-07, + "loss": 0.80770618, + "num_input_tokens_seen": 244955290, + "step": 11354, + "time_per_iteration": 2.621384382247925 + }, + { + "auxiliary_loss_clip": 0.0105434, + "auxiliary_loss_mlp": 0.01030514, + "balance_loss_clip": 1.03731966, + "balance_loss_mlp": 1.01798463, + "epoch": 0.6826995340447918, + "flos": 16617591619200.0, + "grad_norm": 2.3579555752139107, + "language_loss": 0.61690813, + "learning_rate": 9.662410784947599e-07, + "loss": 0.63775671, + "num_input_tokens_seen": 244972935, + "step": 11355, + "time_per_iteration": 4.416518449783325 + }, + { + "auxiliary_loss_clip": 0.01059431, + "auxiliary_loss_mlp": 0.01031412, + "balance_loss_clip": 1.03248143, + "balance_loss_mlp": 1.01780415, + "epoch": 0.6827596572974598, + "flos": 20848299223680.0, + "grad_norm": 2.0827591580165525, + "language_loss": 0.81958997, + "learning_rate": 9.659076962352398e-07, + "loss": 0.84049839, + "num_input_tokens_seen": 244989440, + "step": 11356, + "time_per_iteration": 2.772223949432373 + }, + { + "auxiliary_loss_clip": 0.0109731, + "auxiliary_loss_mlp": 0.01033608, + "balance_loss_clip": 1.04186547, + "balance_loss_mlp": 1.01991129, + "epoch": 0.6828197805501277, + "flos": 22748081552640.0, + "grad_norm": 1.8123732324115438, + "language_loss": 0.78407294, + "learning_rate": 9.655743531886052e-07, + "loss": 0.80538213, + "num_input_tokens_seen": 245007830, + "step": 11357, + "time_per_iteration": 2.7943849563598633 + }, + { + "auxiliary_loss_clip": 0.01014132, + "auxiliary_loss_mlp": 0.01026339, + "balance_loss_clip": 1.008708, + "balance_loss_mlp": 1.02481925, + "epoch": 0.6828799038027957, + "flos": 71646565829760.0, + "grad_norm": 0.8539086135635664, + "language_loss": 0.59534943, + "learning_rate": 9.65241049367493e-07, + "loss": 0.61575413, + "num_input_tokens_seen": 245070720, + "step": 11358, + "time_per_iteration": 3.2622344493865967 + }, + { + "auxiliary_loss_clip": 0.0107463, + "auxiliary_loss_mlp": 0.01049087, + "balance_loss_clip": 1.03272629, + "balance_loss_mlp": 1.03269577, + "epoch": 0.6829400270554637, + "flos": 19829154637440.0, + "grad_norm": 1.8877966802390573, + "language_loss": 0.78321809, + "learning_rate": 9.64907784784544e-07, + "loss": 0.80445516, + "num_input_tokens_seen": 245089070, + "step": 11359, + "time_per_iteration": 2.7041001319885254 + }, + { + "auxiliary_loss_clip": 0.01102262, + "auxiliary_loss_mlp": 0.01035361, + "balance_loss_clip": 1.03907776, + "balance_loss_mlp": 1.02245069, + "epoch": 0.6830001503081317, + "flos": 21980634543360.0, + "grad_norm": 1.8916230773559268, + "language_loss": 0.81728172, + "learning_rate": 9.645745594523958e-07, + "loss": 0.83865792, + "num_input_tokens_seen": 245106500, + "step": 11360, + "time_per_iteration": 2.7257988452911377 + }, + { + "auxiliary_loss_clip": 0.01103488, + "auxiliary_loss_mlp": 0.01039736, + "balance_loss_clip": 1.04132986, + "balance_loss_mlp": 1.02476335, + "epoch": 0.6830602735607997, + "flos": 24316767290880.0, + "grad_norm": 1.928922588936481, + "language_loss": 0.75214481, + "learning_rate": 9.642413733836844e-07, + "loss": 0.77357709, + "num_input_tokens_seen": 245125260, + "step": 11361, + "time_per_iteration": 4.145911931991577 + }, + { + "auxiliary_loss_clip": 0.01014146, + "auxiliary_loss_mlp": 0.01006013, + "balance_loss_clip": 1.01607728, + "balance_loss_mlp": 1.00464237, + "epoch": 0.6831203968134676, + "flos": 57690062323200.0, + "grad_norm": 0.8723971229353714, + "language_loss": 0.59647572, + "learning_rate": 9.639082265910437e-07, + "loss": 0.6166774, + "num_input_tokens_seen": 245188730, + "step": 11362, + "time_per_iteration": 3.303649425506592 + }, + { + "auxiliary_loss_clip": 0.01085969, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.03544044, + "balance_loss_mlp": 1.01791525, + "epoch": 0.6831805200661356, + "flos": 14388436552320.0, + "grad_norm": 2.2849011537380384, + "language_loss": 0.75293076, + "learning_rate": 9.635751190871074e-07, + "loss": 0.77411127, + "num_input_tokens_seen": 245205065, + "step": 11363, + "time_per_iteration": 2.646646499633789 + }, + { + "auxiliary_loss_clip": 0.0109026, + "auxiliary_loss_mlp": 0.01039827, + "balance_loss_clip": 1.03785634, + "balance_loss_mlp": 1.02562308, + "epoch": 0.6832406433188035, + "flos": 22820297846400.0, + "grad_norm": 2.373792593478636, + "language_loss": 0.89238822, + "learning_rate": 9.632420508845063e-07, + "loss": 0.91368914, + "num_input_tokens_seen": 245224265, + "step": 11364, + "time_per_iteration": 2.7119343280792236 + }, + { + "auxiliary_loss_clip": 0.0108884, + "auxiliary_loss_mlp": 0.01037187, + "balance_loss_clip": 1.03655684, + "balance_loss_mlp": 1.02403259, + "epoch": 0.6833007665714715, + "flos": 17561718650880.0, + "grad_norm": 2.1030126962068634, + "language_loss": 0.88149464, + "learning_rate": 9.629090219958697e-07, + "loss": 0.9027549, + "num_input_tokens_seen": 245243360, + "step": 11365, + "time_per_iteration": 2.702363967895508 + }, + { + "auxiliary_loss_clip": 0.01078844, + "auxiliary_loss_mlp": 0.01042641, + "balance_loss_clip": 1.03964448, + "balance_loss_mlp": 1.02709591, + "epoch": 0.6833608898241395, + "flos": 22445928345600.0, + "grad_norm": 2.424437854190435, + "language_loss": 0.81156111, + "learning_rate": 9.625760324338272e-07, + "loss": 0.83277589, + "num_input_tokens_seen": 245256350, + "step": 11366, + "time_per_iteration": 2.674567937850952 + }, + { + "auxiliary_loss_clip": 0.01093776, + "auxiliary_loss_mlp": 0.01032035, + "balance_loss_clip": 1.03835893, + "balance_loss_mlp": 1.0188446, + "epoch": 0.6834210130768075, + "flos": 24534637234560.0, + "grad_norm": 6.88774223787515, + "language_loss": 0.76857549, + "learning_rate": 9.622430822110062e-07, + "loss": 0.78983361, + "num_input_tokens_seen": 245277575, + "step": 11367, + "time_per_iteration": 2.759528160095215 + }, + { + "auxiliary_loss_clip": 0.01087848, + "auxiliary_loss_mlp": 0.01037587, + "balance_loss_clip": 1.03855908, + "balance_loss_mlp": 1.0238061, + "epoch": 0.6834811363294754, + "flos": 20047132321920.0, + "grad_norm": 1.645021355885407, + "language_loss": 0.69147146, + "learning_rate": 9.619101713400312e-07, + "loss": 0.71272576, + "num_input_tokens_seen": 245296615, + "step": 11368, + "time_per_iteration": 2.730281352996826 + }, + { + "auxiliary_loss_clip": 0.01073853, + "auxiliary_loss_mlp": 0.01036705, + "balance_loss_clip": 1.03326845, + "balance_loss_mlp": 1.02335334, + "epoch": 0.6835412595821434, + "flos": 24790752184320.0, + "grad_norm": 2.698591110002851, + "language_loss": 0.73457599, + "learning_rate": 9.615772998335261e-07, + "loss": 0.75568151, + "num_input_tokens_seen": 245316275, + "step": 11369, + "time_per_iteration": 2.7577805519104004 + }, + { + "auxiliary_loss_clip": 0.01098451, + "auxiliary_loss_mlp": 0.01031594, + "balance_loss_clip": 1.03844953, + "balance_loss_mlp": 1.01816475, + "epoch": 0.6836013828348113, + "flos": 19500356517120.0, + "grad_norm": 1.7788685178761994, + "language_loss": 0.79114872, + "learning_rate": 9.612444677041138e-07, + "loss": 0.81244916, + "num_input_tokens_seen": 245334595, + "step": 11370, + "time_per_iteration": 2.6396684646606445 + }, + { + "auxiliary_loss_clip": 0.01022242, + "auxiliary_loss_mlp": 0.01001045, + "balance_loss_clip": 1.00799215, + "balance_loss_mlp": 0.99983543, + "epoch": 0.6836615060874793, + "flos": 58363999251840.0, + "grad_norm": 0.7422193722806905, + "language_loss": 0.59737813, + "learning_rate": 9.609116749644162e-07, + "loss": 0.61761105, + "num_input_tokens_seen": 245389750, + "step": 11371, + "time_per_iteration": 3.0740749835968018 + }, + { + "auxiliary_loss_clip": 0.01085535, + "auxiliary_loss_mlp": 0.01029543, + "balance_loss_clip": 1.03905046, + "balance_loss_mlp": 1.01730061, + "epoch": 0.6837216293401474, + "flos": 12166895168640.0, + "grad_norm": 1.4865479653921647, + "language_loss": 0.63814664, + "learning_rate": 9.605789216270511e-07, + "loss": 0.65929747, + "num_input_tokens_seen": 245407530, + "step": 11372, + "time_per_iteration": 2.7413337230682373 + }, + { + "auxiliary_loss_clip": 0.0110109, + "auxiliary_loss_mlp": 0.01031341, + "balance_loss_clip": 1.03960001, + "balance_loss_mlp": 1.01804972, + "epoch": 0.6837817525928153, + "flos": 22127581082880.0, + "grad_norm": 1.4899002874098461, + "language_loss": 0.71882284, + "learning_rate": 9.602462077046375e-07, + "loss": 0.74014717, + "num_input_tokens_seen": 245427000, + "step": 11373, + "time_per_iteration": 2.6774277687072754 + }, + { + "auxiliary_loss_clip": 0.01004865, + "auxiliary_loss_mlp": 0.01001461, + "balance_loss_clip": 1.00957799, + "balance_loss_mlp": 1.00026858, + "epoch": 0.6838418758454833, + "flos": 65005928985600.0, + "grad_norm": 1.2536847263503932, + "language_loss": 0.56630689, + "learning_rate": 9.599135332097935e-07, + "loss": 0.58637011, + "num_input_tokens_seen": 245491620, + "step": 11374, + "time_per_iteration": 3.388324499130249 + }, + { + "auxiliary_loss_clip": 0.01107854, + "auxiliary_loss_mlp": 0.01029868, + "balance_loss_clip": 1.04247069, + "balance_loss_mlp": 1.01605177, + "epoch": 0.6839019990981512, + "flos": 21030833162880.0, + "grad_norm": 1.4678437510466378, + "language_loss": 0.74034035, + "learning_rate": 9.595808981551312e-07, + "loss": 0.76171762, + "num_input_tokens_seen": 245511285, + "step": 11375, + "time_per_iteration": 2.6397507190704346 + }, + { + "auxiliary_loss_clip": 0.01095867, + "auxiliary_loss_mlp": 0.01034414, + "balance_loss_clip": 1.04174185, + "balance_loss_mlp": 1.02130103, + "epoch": 0.6839621223508192, + "flos": 24935543907840.0, + "grad_norm": 1.7532880441573435, + "language_loss": 0.70852029, + "learning_rate": 9.592483025532651e-07, + "loss": 0.72982311, + "num_input_tokens_seen": 245532910, + "step": 11376, + "time_per_iteration": 2.693699598312378 + }, + { + "auxiliary_loss_clip": 0.01115191, + "auxiliary_loss_mlp": 0.01033917, + "balance_loss_clip": 1.03887844, + "balance_loss_mlp": 1.02039814, + "epoch": 0.6840222456034871, + "flos": 26358827391360.0, + "grad_norm": 2.037488504873538, + "language_loss": 0.74301463, + "learning_rate": 9.58915746416808e-07, + "loss": 0.76450574, + "num_input_tokens_seen": 245550540, + "step": 11377, + "time_per_iteration": 2.5986266136169434 + }, + { + "auxiliary_loss_clip": 0.01014709, + "auxiliary_loss_mlp": 0.010028, + "balance_loss_clip": 1.00959396, + "balance_loss_mlp": 1.00172734, + "epoch": 0.6840823688561551, + "flos": 65988336936960.0, + "grad_norm": 0.7236208934827679, + "language_loss": 0.56872022, + "learning_rate": 9.585832297583707e-07, + "loss": 0.58889532, + "num_input_tokens_seen": 245619570, + "step": 11378, + "time_per_iteration": 3.304108142852783 + }, + { + "auxiliary_loss_clip": 0.01114944, + "auxiliary_loss_mlp": 0.01038581, + "balance_loss_clip": 1.04010487, + "balance_loss_mlp": 1.02452612, + "epoch": 0.684142492108823, + "flos": 21397588980480.0, + "grad_norm": 1.9771846674075895, + "language_loss": 0.78299057, + "learning_rate": 9.58250752590561e-07, + "loss": 0.80452579, + "num_input_tokens_seen": 245637980, + "step": 11379, + "time_per_iteration": 2.5876471996307373 + }, + { + "auxiliary_loss_clip": 0.01110374, + "auxiliary_loss_mlp": 0.0102753, + "balance_loss_clip": 1.04125404, + "balance_loss_mlp": 1.01560271, + "epoch": 0.6842026153614911, + "flos": 18801426700800.0, + "grad_norm": 1.9586312359498843, + "language_loss": 0.69083488, + "learning_rate": 9.57918314925988e-07, + "loss": 0.71221387, + "num_input_tokens_seen": 245655690, + "step": 11380, + "time_per_iteration": 2.565652847290039 + }, + { + "auxiliary_loss_clip": 0.0109036, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.03853393, + "balance_loss_mlp": 1.02266991, + "epoch": 0.684262738614159, + "flos": 19646405216640.0, + "grad_norm": 1.9286622353610317, + "language_loss": 0.78519118, + "learning_rate": 9.575859167772568e-07, + "loss": 0.80645669, + "num_input_tokens_seen": 245671525, + "step": 11381, + "time_per_iteration": 2.6301379203796387 + }, + { + "auxiliary_loss_clip": 0.010226, + "auxiliary_loss_mlp": 0.01003847, + "balance_loss_clip": 1.00947046, + "balance_loss_mlp": 1.00290525, + "epoch": 0.684322861866827, + "flos": 62354462739840.0, + "grad_norm": 0.864991455722599, + "language_loss": 0.67092407, + "learning_rate": 9.572535581569713e-07, + "loss": 0.69118857, + "num_input_tokens_seen": 245724115, + "step": 11382, + "time_per_iteration": 3.0039761066436768 + }, + { + "auxiliary_loss_clip": 0.01021817, + "auxiliary_loss_mlp": 0.01001983, + "balance_loss_clip": 1.0083313, + "balance_loss_mlp": 1.000862, + "epoch": 0.6843829851194949, + "flos": 65805048812160.0, + "grad_norm": 0.8192420417585807, + "language_loss": 0.58103538, + "learning_rate": 9.569212390777356e-07, + "loss": 0.60127336, + "num_input_tokens_seen": 245789245, + "step": 11383, + "time_per_iteration": 3.165360450744629 + }, + { + "auxiliary_loss_clip": 0.01062418, + "auxiliary_loss_mlp": 0.01038341, + "balance_loss_clip": 1.03478217, + "balance_loss_mlp": 1.02372622, + "epoch": 0.6844431083721629, + "flos": 27855153181440.0, + "grad_norm": 2.857238801522205, + "language_loss": 0.80316836, + "learning_rate": 9.565889595521517e-07, + "loss": 0.82417595, + "num_input_tokens_seen": 245812420, + "step": 11384, + "time_per_iteration": 2.770827054977417 + }, + { + "auxiliary_loss_clip": 0.01103805, + "auxiliary_loss_mlp": 0.01030338, + "balance_loss_clip": 1.03886342, + "balance_loss_mlp": 1.01740408, + "epoch": 0.684503231624831, + "flos": 18255010032000.0, + "grad_norm": 2.2679652674144157, + "language_loss": 0.77255201, + "learning_rate": 9.562567195928187e-07, + "loss": 0.79389346, + "num_input_tokens_seen": 245829135, + "step": 11385, + "time_per_iteration": 2.591132164001465 + }, + { + "auxiliary_loss_clip": 0.0108167, + "auxiliary_loss_mlp": 0.01042801, + "balance_loss_clip": 1.0381335, + "balance_loss_mlp": 1.02736902, + "epoch": 0.6845633548774989, + "flos": 17639681120640.0, + "grad_norm": 2.065101540426227, + "language_loss": 0.84796238, + "learning_rate": 9.55924519212335e-07, + "loss": 0.86920702, + "num_input_tokens_seen": 245847140, + "step": 11386, + "time_per_iteration": 2.6891727447509766 + }, + { + "auxiliary_loss_clip": 0.01103811, + "auxiliary_loss_mlp": 0.01041499, + "balance_loss_clip": 1.04075646, + "balance_loss_mlp": 1.02887416, + "epoch": 0.6846234781301669, + "flos": 20807576179200.0, + "grad_norm": 2.1990004125382634, + "language_loss": 0.83455414, + "learning_rate": 9.555923584232984e-07, + "loss": 0.85600722, + "num_input_tokens_seen": 245862855, + "step": 11387, + "time_per_iteration": 2.61997127532959 + }, + { + "auxiliary_loss_clip": 0.01092977, + "auxiliary_loss_mlp": 0.01030654, + "balance_loss_clip": 1.03438902, + "balance_loss_mlp": 1.01760602, + "epoch": 0.6846836013828348, + "flos": 36101176485120.0, + "grad_norm": 1.848194295156486, + "language_loss": 0.72119319, + "learning_rate": 9.552602372383047e-07, + "loss": 0.74242949, + "num_input_tokens_seen": 245885415, + "step": 11388, + "time_per_iteration": 2.7075023651123047 + }, + { + "auxiliary_loss_clip": 0.01098153, + "auxiliary_loss_mlp": 0.01027593, + "balance_loss_clip": 1.04095197, + "balance_loss_mlp": 1.01512408, + "epoch": 0.6847437246355028, + "flos": 43142468607360.0, + "grad_norm": 2.050560945832389, + "language_loss": 0.6225087, + "learning_rate": 9.549281556699469e-07, + "loss": 0.64376616, + "num_input_tokens_seen": 245906285, + "step": 11389, + "time_per_iteration": 2.8079371452331543 + }, + { + "auxiliary_loss_clip": 0.01011672, + "auxiliary_loss_mlp": 0.01004667, + "balance_loss_clip": 1.00851202, + "balance_loss_mlp": 1.00345695, + "epoch": 0.6848038478881707, + "flos": 71663729552640.0, + "grad_norm": 0.7413355837031695, + "language_loss": 0.5598197, + "learning_rate": 9.54596113730818e-07, + "loss": 0.57998312, + "num_input_tokens_seen": 245967620, + "step": 11390, + "time_per_iteration": 5.026982307434082 + }, + { + "auxiliary_loss_clip": 0.01076744, + "auxiliary_loss_mlp": 0.00771583, + "balance_loss_clip": 1.03878915, + "balance_loss_mlp": 1.00011551, + "epoch": 0.6848639711408387, + "flos": 19937820257280.0, + "grad_norm": 1.8276249174915487, + "language_loss": 0.87787604, + "learning_rate": 9.542641114335109e-07, + "loss": 0.89635926, + "num_input_tokens_seen": 245985075, + "step": 11391, + "time_per_iteration": 2.7455759048461914 + }, + { + "auxiliary_loss_clip": 0.01073324, + "auxiliary_loss_mlp": 0.01040704, + "balance_loss_clip": 1.03858793, + "balance_loss_mlp": 1.0274303, + "epoch": 0.6849240943935067, + "flos": 26867501844480.0, + "grad_norm": 1.7326264104251545, + "language_loss": 0.79257655, + "learning_rate": 9.539321487906117e-07, + "loss": 0.81371683, + "num_input_tokens_seen": 246003560, + "step": 11392, + "time_per_iteration": 2.8595845699310303 + }, + { + "auxiliary_loss_clip": 0.0108908, + "auxiliary_loss_mlp": 0.01032172, + "balance_loss_clip": 1.03764129, + "balance_loss_mlp": 1.01933873, + "epoch": 0.6849842176461747, + "flos": 13735365425280.0, + "grad_norm": 2.218400680256619, + "language_loss": 0.71076894, + "learning_rate": 9.536002258147104e-07, + "loss": 0.7319814, + "num_input_tokens_seen": 246019600, + "step": 11393, + "time_per_iteration": 2.680263042449951 + }, + { + "auxiliary_loss_clip": 0.01075845, + "auxiliary_loss_mlp": 0.01032815, + "balance_loss_clip": 1.03768921, + "balance_loss_mlp": 1.01831901, + "epoch": 0.6850443408988426, + "flos": 24973070641920.0, + "grad_norm": 1.8123459031815148, + "language_loss": 0.64661837, + "learning_rate": 9.532683425183936e-07, + "loss": 0.66770494, + "num_input_tokens_seen": 246038920, + "step": 11394, + "time_per_iteration": 4.561980724334717 + }, + { + "auxiliary_loss_clip": 0.01087026, + "auxiliary_loss_mlp": 0.00773484, + "balance_loss_clip": 1.03753853, + "balance_loss_mlp": 1.00009871, + "epoch": 0.6851044641515106, + "flos": 27744225004800.0, + "grad_norm": 2.9988719811827633, + "language_loss": 0.80739737, + "learning_rate": 9.529364989142468e-07, + "loss": 0.82600248, + "num_input_tokens_seen": 246060490, + "step": 11395, + "time_per_iteration": 2.758030891418457 + }, + { + "auxiliary_loss_clip": 0.01077162, + "auxiliary_loss_mlp": 0.01035315, + "balance_loss_clip": 1.03991926, + "balance_loss_mlp": 1.02056861, + "epoch": 0.6851645874041785, + "flos": 24351061800960.0, + "grad_norm": 1.836466665804894, + "language_loss": 0.73088896, + "learning_rate": 9.526046950148527e-07, + "loss": 0.75201374, + "num_input_tokens_seen": 246081465, + "step": 11396, + "time_per_iteration": 2.781780481338501 + }, + { + "auxiliary_loss_clip": 0.01084632, + "auxiliary_loss_mlp": 0.01031663, + "balance_loss_clip": 1.03876483, + "balance_loss_mlp": 1.01705348, + "epoch": 0.6852247106568465, + "flos": 15077849264640.0, + "grad_norm": 3.468595562954195, + "language_loss": 0.79397655, + "learning_rate": 9.522729308327931e-07, + "loss": 0.81513953, + "num_input_tokens_seen": 246096110, + "step": 11397, + "time_per_iteration": 2.759290933609009 + }, + { + "auxiliary_loss_clip": 0.01035311, + "auxiliary_loss_mlp": 0.01038237, + "balance_loss_clip": 1.03174019, + "balance_loss_mlp": 1.02346683, + "epoch": 0.6852848339095146, + "flos": 18770005278720.0, + "grad_norm": 1.7538021552670298, + "language_loss": 0.71620733, + "learning_rate": 9.519412063806493e-07, + "loss": 0.73694277, + "num_input_tokens_seen": 246114785, + "step": 11398, + "time_per_iteration": 2.8469512462615967 + }, + { + "auxiliary_loss_clip": 0.01063012, + "auxiliary_loss_mlp": 0.01031401, + "balance_loss_clip": 1.03693652, + "balance_loss_mlp": 1.01927114, + "epoch": 0.6853449571621825, + "flos": 27854363082240.0, + "grad_norm": 1.5995227781124475, + "language_loss": 0.70539916, + "learning_rate": 9.516095216709996e-07, + "loss": 0.72634327, + "num_input_tokens_seen": 246136375, + "step": 11399, + "time_per_iteration": 2.8067455291748047 + }, + { + "auxiliary_loss_clip": 0.01099638, + "auxiliary_loss_mlp": 0.01034955, + "balance_loss_clip": 1.03867149, + "balance_loss_mlp": 1.02175879, + "epoch": 0.6854050804148505, + "flos": 18150510389760.0, + "grad_norm": 1.5522963452984355, + "language_loss": 0.7023446, + "learning_rate": 9.512778767164217e-07, + "loss": 0.72369051, + "num_input_tokens_seen": 246155090, + "step": 11400, + "time_per_iteration": 4.245120286941528 + }, + { + "auxiliary_loss_clip": 0.01077599, + "auxiliary_loss_mlp": 0.01038032, + "balance_loss_clip": 1.04081964, + "balance_loss_mlp": 1.02140248, + "epoch": 0.6854652036675184, + "flos": 16326212492160.0, + "grad_norm": 2.0326109813245217, + "language_loss": 0.77846044, + "learning_rate": 9.509462715294927e-07, + "loss": 0.79961675, + "num_input_tokens_seen": 246172645, + "step": 11401, + "time_per_iteration": 2.758004665374756 + }, + { + "auxiliary_loss_clip": 0.01113766, + "auxiliary_loss_mlp": 0.01037682, + "balance_loss_clip": 1.04050303, + "balance_loss_mlp": 1.02477169, + "epoch": 0.6855253269201864, + "flos": 14940814878720.0, + "grad_norm": 1.868317908345602, + "language_loss": 0.75315881, + "learning_rate": 9.50614706122786e-07, + "loss": 0.77467334, + "num_input_tokens_seen": 246189055, + "step": 11402, + "time_per_iteration": 2.562199115753174 + }, + { + "auxiliary_loss_clip": 0.0109933, + "auxiliary_loss_mlp": 0.01041955, + "balance_loss_clip": 1.03740358, + "balance_loss_mlp": 1.02720892, + "epoch": 0.6855854501728543, + "flos": 23037736826880.0, + "grad_norm": 1.5371325501517963, + "language_loss": 0.72588831, + "learning_rate": 9.502831805088742e-07, + "loss": 0.74730122, + "num_input_tokens_seen": 246207990, + "step": 11403, + "time_per_iteration": 2.677266836166382 + }, + { + "auxiliary_loss_clip": 0.01114001, + "auxiliary_loss_mlp": 0.01034831, + "balance_loss_clip": 1.04157901, + "balance_loss_mlp": 1.02145004, + "epoch": 0.6856455734255223, + "flos": 13253623194240.0, + "grad_norm": 2.0747553095420175, + "language_loss": 0.81451255, + "learning_rate": 9.499516947003294e-07, + "loss": 0.83600086, + "num_input_tokens_seen": 246221595, + "step": 11404, + "time_per_iteration": 2.5857958793640137 + }, + { + "auxiliary_loss_clip": 0.01086293, + "auxiliary_loss_mlp": 0.01040032, + "balance_loss_clip": 1.0375222, + "balance_loss_mlp": 1.0263046, + "epoch": 0.6857056966781903, + "flos": 23333461499520.0, + "grad_norm": 1.3945702093947172, + "language_loss": 0.77848321, + "learning_rate": 9.496202487097222e-07, + "loss": 0.79974639, + "num_input_tokens_seen": 246242970, + "step": 11405, + "time_per_iteration": 2.743281364440918 + }, + { + "auxiliary_loss_clip": 0.01023454, + "auxiliary_loss_mlp": 0.00999881, + "balance_loss_clip": 1.00911474, + "balance_loss_mlp": 0.99873084, + "epoch": 0.6857658199308583, + "flos": 61852647784320.0, + "grad_norm": 0.7882286280493239, + "language_loss": 0.60976082, + "learning_rate": 9.492888425496199e-07, + "loss": 0.62999415, + "num_input_tokens_seen": 246300405, + "step": 11406, + "time_per_iteration": 3.236720085144043 + }, + { + "auxiliary_loss_clip": 0.01080565, + "auxiliary_loss_mlp": 0.01035523, + "balance_loss_clip": 1.03731775, + "balance_loss_mlp": 1.02062762, + "epoch": 0.6858259431835262, + "flos": 16654543735680.0, + "grad_norm": 1.6671355551751728, + "language_loss": 0.76914632, + "learning_rate": 9.489574762325907e-07, + "loss": 0.79030716, + "num_input_tokens_seen": 246318780, + "step": 11407, + "time_per_iteration": 2.7857916355133057 + }, + { + "auxiliary_loss_clip": 0.01092831, + "auxiliary_loss_mlp": 0.01039174, + "balance_loss_clip": 1.0389874, + "balance_loss_mlp": 1.02427292, + "epoch": 0.6858860664361942, + "flos": 21872974504320.0, + "grad_norm": 2.9798515710303572, + "language_loss": 0.71276259, + "learning_rate": 9.486261497711991e-07, + "loss": 0.7340827, + "num_input_tokens_seen": 246339405, + "step": 11408, + "time_per_iteration": 2.8327853679656982 + }, + { + "auxiliary_loss_clip": 0.01104322, + "auxiliary_loss_mlp": 0.01031824, + "balance_loss_clip": 1.03901792, + "balance_loss_mlp": 1.01819825, + "epoch": 0.6859461896888621, + "flos": 15267637751040.0, + "grad_norm": 1.7652749442295346, + "language_loss": 0.70438635, + "learning_rate": 9.482948631780087e-07, + "loss": 0.72574776, + "num_input_tokens_seen": 246357055, + "step": 11409, + "time_per_iteration": 2.6262388229370117 + }, + { + "auxiliary_loss_clip": 0.01069373, + "auxiliary_loss_mlp": 0.01029456, + "balance_loss_clip": 1.03979826, + "balance_loss_mlp": 1.01718974, + "epoch": 0.6860063129415301, + "flos": 18620293392000.0, + "grad_norm": 1.5800008029842278, + "language_loss": 0.78244615, + "learning_rate": 9.479636164655825e-07, + "loss": 0.80343449, + "num_input_tokens_seen": 246374050, + "step": 11410, + "time_per_iteration": 2.742436408996582 + }, + { + "auxiliary_loss_clip": 0.01104718, + "auxiliary_loss_mlp": 0.0103935, + "balance_loss_clip": 1.03746653, + "balance_loss_mlp": 1.02479458, + "epoch": 0.6860664361941982, + "flos": 23951376190080.0, + "grad_norm": 1.9022000774970669, + "language_loss": 0.71458399, + "learning_rate": 9.476324096464821e-07, + "loss": 0.73602462, + "num_input_tokens_seen": 246392910, + "step": 11411, + "time_per_iteration": 2.7334024906158447 + }, + { + "auxiliary_loss_clip": 0.01062107, + "auxiliary_loss_mlp": 0.01047011, + "balance_loss_clip": 1.03538156, + "balance_loss_mlp": 1.03152537, + "epoch": 0.6861265594468661, + "flos": 20407782827520.0, + "grad_norm": 2.454164229167477, + "language_loss": 0.70101523, + "learning_rate": 9.473012427332654e-07, + "loss": 0.7221064, + "num_input_tokens_seen": 246411540, + "step": 11412, + "time_per_iteration": 2.830611228942871 + }, + { + "auxiliary_loss_clip": 0.01114643, + "auxiliary_loss_mlp": 0.01034015, + "balance_loss_clip": 1.03966832, + "balance_loss_mlp": 1.02018094, + "epoch": 0.6861866826995341, + "flos": 11428571111040.0, + "grad_norm": 3.537487518671294, + "language_loss": 0.71493286, + "learning_rate": 9.469701157384919e-07, + "loss": 0.73641944, + "num_input_tokens_seen": 246423295, + "step": 11413, + "time_per_iteration": 2.5294950008392334 + }, + { + "auxiliary_loss_clip": 0.01104826, + "auxiliary_loss_mlp": 0.01034314, + "balance_loss_clip": 1.03952384, + "balance_loss_mlp": 1.02099848, + "epoch": 0.686246805952202, + "flos": 15997593939840.0, + "grad_norm": 1.8251318339835605, + "language_loss": 0.73947906, + "learning_rate": 9.466390286747164e-07, + "loss": 0.7608704, + "num_input_tokens_seen": 246441045, + "step": 11414, + "time_per_iteration": 2.5965075492858887 + }, + { + "auxiliary_loss_clip": 0.01090896, + "auxiliary_loss_mlp": 0.01033258, + "balance_loss_clip": 1.03907096, + "balance_loss_mlp": 1.01883936, + "epoch": 0.68630692920487, + "flos": 19826712512640.0, + "grad_norm": 2.434529931317787, + "language_loss": 0.8682794, + "learning_rate": 9.46307981554495e-07, + "loss": 0.88952088, + "num_input_tokens_seen": 246456905, + "step": 11415, + "time_per_iteration": 2.6476597785949707 + }, + { + "auxiliary_loss_clip": 0.01106277, + "auxiliary_loss_mlp": 0.01036888, + "balance_loss_clip": 1.04034388, + "balance_loss_mlp": 1.02316129, + "epoch": 0.6863670524575379, + "flos": 26286216048000.0, + "grad_norm": 1.8704963128355632, + "language_loss": 0.67290139, + "learning_rate": 9.459769743903801e-07, + "loss": 0.69433296, + "num_input_tokens_seen": 246477545, + "step": 11416, + "time_per_iteration": 2.658177137374878 + }, + { + "auxiliary_loss_clip": 0.01090013, + "auxiliary_loss_mlp": 0.01041407, + "balance_loss_clip": 1.03826094, + "balance_loss_mlp": 1.02668476, + "epoch": 0.686427175710206, + "flos": 19173138595200.0, + "grad_norm": 1.424267552511055, + "language_loss": 0.76128805, + "learning_rate": 9.456460071949237e-07, + "loss": 0.78260225, + "num_input_tokens_seen": 246496705, + "step": 11417, + "time_per_iteration": 2.6901679039001465 + }, + { + "auxiliary_loss_clip": 0.01087664, + "auxiliary_loss_mlp": 0.01036409, + "balance_loss_clip": 1.03694177, + "balance_loss_mlp": 1.02199018, + "epoch": 0.6864872989628739, + "flos": 18916628595840.0, + "grad_norm": 2.686574302160358, + "language_loss": 0.7732662, + "learning_rate": 9.45315079980678e-07, + "loss": 0.79450691, + "num_input_tokens_seen": 246514860, + "step": 11418, + "time_per_iteration": 2.755699872970581 + }, + { + "auxiliary_loss_clip": 0.01066399, + "auxiliary_loss_mlp": 0.01031853, + "balance_loss_clip": 1.03764701, + "balance_loss_mlp": 1.01901984, + "epoch": 0.6865474222155419, + "flos": 25956196865280.0, + "grad_norm": 1.6068340325317958, + "language_loss": 0.76434135, + "learning_rate": 9.449841927601887e-07, + "loss": 0.78532386, + "num_input_tokens_seen": 246536145, + "step": 11419, + "time_per_iteration": 2.865663766860962 + }, + { + "auxiliary_loss_clip": 0.01111545, + "auxiliary_loss_mlp": 0.0103699, + "balance_loss_clip": 1.03836358, + "balance_loss_mlp": 1.02422845, + "epoch": 0.6866075454682098, + "flos": 18478087447680.0, + "grad_norm": 2.1745569847310624, + "language_loss": 0.71438152, + "learning_rate": 9.446533455460044e-07, + "loss": 0.7358669, + "num_input_tokens_seen": 246553265, + "step": 11420, + "time_per_iteration": 2.6367876529693604 + }, + { + "auxiliary_loss_clip": 0.01071734, + "auxiliary_loss_mlp": 0.01035393, + "balance_loss_clip": 1.03420091, + "balance_loss_mlp": 1.02145147, + "epoch": 0.6866676687208778, + "flos": 34239998298240.0, + "grad_norm": 1.4612378577280256, + "language_loss": 0.74987674, + "learning_rate": 9.443225383506712e-07, + "loss": 0.77094799, + "num_input_tokens_seen": 246575130, + "step": 11421, + "time_per_iteration": 2.905451774597168 + }, + { + "auxiliary_loss_clip": 0.01099049, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.03840101, + "balance_loss_mlp": 1.01820481, + "epoch": 0.6867277919735457, + "flos": 21721754246400.0, + "grad_norm": 1.8216780462844224, + "language_loss": 0.76901162, + "learning_rate": 9.439917711867338e-07, + "loss": 0.79031521, + "num_input_tokens_seen": 246593095, + "step": 11422, + "time_per_iteration": 2.7650146484375 + }, + { + "auxiliary_loss_clip": 0.01107124, + "auxiliary_loss_mlp": 0.01039503, + "balance_loss_clip": 1.04101586, + "balance_loss_mlp": 1.02516782, + "epoch": 0.6867879152262137, + "flos": 24097999507200.0, + "grad_norm": 1.6784649507913934, + "language_loss": 0.77082187, + "learning_rate": 9.436610440667334e-07, + "loss": 0.79228812, + "num_input_tokens_seen": 246612165, + "step": 11423, + "time_per_iteration": 2.8607351779937744 + }, + { + "auxiliary_loss_clip": 0.01082395, + "auxiliary_loss_mlp": 0.01033174, + "balance_loss_clip": 1.03814936, + "balance_loss_mlp": 1.01943564, + "epoch": 0.6868480384788818, + "flos": 21615818060160.0, + "grad_norm": 1.4803784362494392, + "language_loss": 0.72793746, + "learning_rate": 9.433303570032129e-07, + "loss": 0.74909317, + "num_input_tokens_seen": 246632065, + "step": 11424, + "time_per_iteration": 2.8126673698425293 + }, + { + "auxiliary_loss_clip": 0.01092944, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.03935122, + "balance_loss_mlp": 1.01783705, + "epoch": 0.6869081617315497, + "flos": 26286144220800.0, + "grad_norm": 1.8921444035478678, + "language_loss": 0.65257877, + "learning_rate": 9.429997100087112e-07, + "loss": 0.67382109, + "num_input_tokens_seen": 246651245, + "step": 11425, + "time_per_iteration": 2.7407920360565186 + }, + { + "auxiliary_loss_clip": 0.01073701, + "auxiliary_loss_mlp": 0.01028234, + "balance_loss_clip": 1.03880644, + "balance_loss_mlp": 1.01543677, + "epoch": 0.6869682849842177, + "flos": 21105096531840.0, + "grad_norm": 1.3754232219458198, + "language_loss": 0.71719813, + "learning_rate": 9.426691030957657e-07, + "loss": 0.73821747, + "num_input_tokens_seen": 246672225, + "step": 11426, + "time_per_iteration": 2.821906089782715 + }, + { + "auxiliary_loss_clip": 0.01060498, + "auxiliary_loss_mlp": 0.01034142, + "balance_loss_clip": 1.03450513, + "balance_loss_mlp": 1.02006936, + "epoch": 0.6870284082368856, + "flos": 17092653920640.0, + "grad_norm": 2.015308300418605, + "language_loss": 0.84978002, + "learning_rate": 9.423385362769136e-07, + "loss": 0.87072647, + "num_input_tokens_seen": 246688385, + "step": 11427, + "time_per_iteration": 2.769426107406616 + }, + { + "auxiliary_loss_clip": 0.01100329, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.03816628, + "balance_loss_mlp": 1.02096152, + "epoch": 0.6870885314895536, + "flos": 27308090067840.0, + "grad_norm": 1.7434629559161423, + "language_loss": 0.76254469, + "learning_rate": 9.420080095646909e-07, + "loss": 0.78388786, + "num_input_tokens_seen": 246710730, + "step": 11428, + "time_per_iteration": 2.708268165588379 + }, + { + "auxiliary_loss_clip": 0.01079241, + "auxiliary_loss_mlp": 0.01041381, + "balance_loss_clip": 1.03692877, + "balance_loss_mlp": 1.02690864, + "epoch": 0.6871486547422215, + "flos": 20814543417600.0, + "grad_norm": 2.419770929596293, + "language_loss": 0.73118293, + "learning_rate": 9.4167752297163e-07, + "loss": 0.75238913, + "num_input_tokens_seen": 246730350, + "step": 11429, + "time_per_iteration": 2.7956650257110596 + }, + { + "auxiliary_loss_clip": 0.01089951, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.03737235, + "balance_loss_mlp": 1.01640451, + "epoch": 0.6872087779948896, + "flos": 30154118330880.0, + "grad_norm": 1.9861165427275798, + "language_loss": 0.83426887, + "learning_rate": 9.413470765102643e-07, + "loss": 0.8554647, + "num_input_tokens_seen": 246751700, + "step": 11430, + "time_per_iteration": 4.525273084640503 + }, + { + "auxiliary_loss_clip": 0.01105193, + "auxiliary_loss_mlp": 0.01038794, + "balance_loss_clip": 1.03941309, + "balance_loss_mlp": 1.02549052, + "epoch": 0.6872689012475575, + "flos": 20704584908160.0, + "grad_norm": 2.206529961181577, + "language_loss": 0.7042433, + "learning_rate": 9.410166701931225e-07, + "loss": 0.72568321, + "num_input_tokens_seen": 246769860, + "step": 11431, + "time_per_iteration": 2.6291654109954834 + }, + { + "auxiliary_loss_clip": 0.01093068, + "auxiliary_loss_mlp": 0.00771593, + "balance_loss_clip": 1.03726888, + "balance_loss_mlp": 1.0001148, + "epoch": 0.6873290245002255, + "flos": 25520852027520.0, + "grad_norm": 1.7240375281978666, + "language_loss": 0.80058414, + "learning_rate": 9.406863040327355e-07, + "loss": 0.81923079, + "num_input_tokens_seen": 246789905, + "step": 11432, + "time_per_iteration": 2.7238457202911377 + }, + { + "auxiliary_loss_clip": 0.01089362, + "auxiliary_loss_mlp": 0.01029871, + "balance_loss_clip": 1.03868675, + "balance_loss_mlp": 1.01700783, + "epoch": 0.6873891477528934, + "flos": 25191479289600.0, + "grad_norm": 1.5401718085798923, + "language_loss": 0.67718959, + "learning_rate": 9.403559780416295e-07, + "loss": 0.6983819, + "num_input_tokens_seen": 246808815, + "step": 11433, + "time_per_iteration": 4.300631999969482 + }, + { + "auxiliary_loss_clip": 0.01108222, + "auxiliary_loss_mlp": 0.01044912, + "balance_loss_clip": 1.04331732, + "balance_loss_mlp": 1.03123283, + "epoch": 0.6874492710055614, + "flos": 35152380685440.0, + "grad_norm": 1.9633714481574007, + "language_loss": 0.73058158, + "learning_rate": 9.400256922323309e-07, + "loss": 0.75211298, + "num_input_tokens_seen": 246829775, + "step": 11434, + "time_per_iteration": 4.712211608886719 + }, + { + "auxiliary_loss_clip": 0.0107867, + "auxiliary_loss_mlp": 0.01034388, + "balance_loss_clip": 1.04082966, + "balance_loss_mlp": 1.02101231, + "epoch": 0.6875093942582293, + "flos": 17822215059840.0, + "grad_norm": 1.6101742183302694, + "language_loss": 0.80406773, + "learning_rate": 9.396954466173657e-07, + "loss": 0.82519835, + "num_input_tokens_seen": 246848045, + "step": 11435, + "time_per_iteration": 2.644397735595703 + }, + { + "auxiliary_loss_clip": 0.01116024, + "auxiliary_loss_mlp": 0.01035166, + "balance_loss_clip": 1.04015982, + "balance_loss_mlp": 1.02111077, + "epoch": 0.6875695175108973, + "flos": 20704548994560.0, + "grad_norm": 3.274458448067563, + "language_loss": 0.81117046, + "learning_rate": 9.393652412092538e-07, + "loss": 0.83268237, + "num_input_tokens_seen": 246866095, + "step": 11436, + "time_per_iteration": 2.600048303604126 + }, + { + "auxiliary_loss_clip": 0.0106725, + "auxiliary_loss_mlp": 0.0104019, + "balance_loss_clip": 1.03428948, + "balance_loss_mlp": 1.02743411, + "epoch": 0.6876296407635654, + "flos": 25374013228800.0, + "grad_norm": 1.9842620224172498, + "language_loss": 0.82207173, + "learning_rate": 9.390350760205183e-07, + "loss": 0.84314615, + "num_input_tokens_seen": 246883975, + "step": 11437, + "time_per_iteration": 2.7188313007354736 + }, + { + "auxiliary_loss_clip": 0.01097489, + "auxiliary_loss_mlp": 0.01042761, + "balance_loss_clip": 1.03876507, + "balance_loss_mlp": 1.02794886, + "epoch": 0.6876897640162333, + "flos": 23222317841280.0, + "grad_norm": 4.685984752688369, + "language_loss": 0.78381348, + "learning_rate": 9.387049510636793e-07, + "loss": 0.80521595, + "num_input_tokens_seen": 246901560, + "step": 11438, + "time_per_iteration": 2.6525228023529053 + }, + { + "auxiliary_loss_clip": 0.01108734, + "auxiliary_loss_mlp": 0.0103476, + "balance_loss_clip": 1.03871489, + "balance_loss_mlp": 1.02167058, + "epoch": 0.6877498872689013, + "flos": 27124335066240.0, + "grad_norm": 1.647979155501369, + "language_loss": 0.72087812, + "learning_rate": 9.383748663512554e-07, + "loss": 0.74231309, + "num_input_tokens_seen": 246922655, + "step": 11439, + "time_per_iteration": 4.218140363693237 + }, + { + "auxiliary_loss_clip": 0.01101936, + "auxiliary_loss_mlp": 0.01030006, + "balance_loss_clip": 1.03944337, + "balance_loss_mlp": 1.01658285, + "epoch": 0.6878100105215692, + "flos": 11581658876160.0, + "grad_norm": 1.9534001671179906, + "language_loss": 0.75862855, + "learning_rate": 9.380448218957623e-07, + "loss": 0.779948, + "num_input_tokens_seen": 246940100, + "step": 11440, + "time_per_iteration": 2.580472946166992 + }, + { + "auxiliary_loss_clip": 0.01066967, + "auxiliary_loss_mlp": 0.01040415, + "balance_loss_clip": 1.03528094, + "balance_loss_mlp": 1.02684307, + "epoch": 0.6878701337742372, + "flos": 20303175444480.0, + "grad_norm": 7.861818924260737, + "language_loss": 0.71750253, + "learning_rate": 9.377148177097167e-07, + "loss": 0.73857641, + "num_input_tokens_seen": 246958545, + "step": 11441, + "time_per_iteration": 2.706754207611084 + }, + { + "auxiliary_loss_clip": 0.01074524, + "auxiliary_loss_mlp": 0.01043281, + "balance_loss_clip": 1.03488159, + "balance_loss_mlp": 1.02677059, + "epoch": 0.6879302570269051, + "flos": 13840080549120.0, + "grad_norm": 1.6357806540454092, + "language_loss": 0.66401327, + "learning_rate": 9.373848538056317e-07, + "loss": 0.68519139, + "num_input_tokens_seen": 246974805, + "step": 11442, + "time_per_iteration": 2.7559654712677 + }, + { + "auxiliary_loss_clip": 0.0109822, + "auxiliary_loss_mlp": 0.01033105, + "balance_loss_clip": 1.03951812, + "balance_loss_mlp": 1.02001595, + "epoch": 0.6879903802795732, + "flos": 21324654414720.0, + "grad_norm": 4.42004898936703, + "language_loss": 0.69321597, + "learning_rate": 9.370549301960189e-07, + "loss": 0.71452922, + "num_input_tokens_seen": 246992505, + "step": 11443, + "time_per_iteration": 2.6616227626800537 + }, + { + "auxiliary_loss_clip": 0.0109609, + "auxiliary_loss_mlp": 0.01035788, + "balance_loss_clip": 1.03986192, + "balance_loss_mlp": 1.02196562, + "epoch": 0.6880505035322411, + "flos": 25152049134720.0, + "grad_norm": 2.6937329387099784, + "language_loss": 0.76372284, + "learning_rate": 9.367250468933893e-07, + "loss": 0.78504163, + "num_input_tokens_seen": 247013370, + "step": 11444, + "time_per_iteration": 2.8355183601379395 + }, + { + "auxiliary_loss_clip": 0.01110169, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.03819597, + "balance_loss_mlp": 1.02007592, + "epoch": 0.6881106267849091, + "flos": 23215530170880.0, + "grad_norm": 2.350463307106156, + "language_loss": 0.76555073, + "learning_rate": 9.363952039102536e-07, + "loss": 0.78698158, + "num_input_tokens_seen": 247029855, + "step": 11445, + "time_per_iteration": 2.567321300506592 + }, + { + "auxiliary_loss_clip": 0.01022025, + "auxiliary_loss_mlp": 0.01003467, + "balance_loss_clip": 1.00763083, + "balance_loss_mlp": 1.00232887, + "epoch": 0.688170750037577, + "flos": 48484397312640.0, + "grad_norm": 0.815591807379434, + "language_loss": 0.58349764, + "learning_rate": 9.360654012591183e-07, + "loss": 0.60375261, + "num_input_tokens_seen": 247085030, + "step": 11446, + "time_per_iteration": 3.1823232173919678 + }, + { + "auxiliary_loss_clip": 0.01102524, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.03622508, + "balance_loss_mlp": 1.01726246, + "epoch": 0.688230873290245, + "flos": 22783633038720.0, + "grad_norm": 1.4577025181029204, + "language_loss": 0.75851154, + "learning_rate": 9.357356389524886e-07, + "loss": 0.77984923, + "num_input_tokens_seen": 247104840, + "step": 11447, + "time_per_iteration": 2.6292076110839844 + }, + { + "auxiliary_loss_clip": 0.01092756, + "auxiliary_loss_mlp": 0.01038788, + "balance_loss_clip": 1.0371995, + "balance_loss_mlp": 1.02566266, + "epoch": 0.6882909965429129, + "flos": 22455660931200.0, + "grad_norm": 1.9523079882919305, + "language_loss": 0.73051161, + "learning_rate": 9.354059170028705e-07, + "loss": 0.75182706, + "num_input_tokens_seen": 247121905, + "step": 11448, + "time_per_iteration": 2.6177000999450684 + }, + { + "auxiliary_loss_clip": 0.01100637, + "auxiliary_loss_mlp": 0.01044689, + "balance_loss_clip": 1.0369277, + "balance_loss_mlp": 1.02910876, + "epoch": 0.688351119795581, + "flos": 26214143408640.0, + "grad_norm": 1.5228707353550825, + "language_loss": 0.74738759, + "learning_rate": 9.350762354227673e-07, + "loss": 0.76884079, + "num_input_tokens_seen": 247142375, + "step": 11449, + "time_per_iteration": 2.601680040359497 + }, + { + "auxiliary_loss_clip": 0.01111281, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.03867829, + "balance_loss_mlp": 1.02147889, + "epoch": 0.6884112430482489, + "flos": 22565260304640.0, + "grad_norm": 3.717332242852324, + "language_loss": 0.69703102, + "learning_rate": 9.34746594224679e-07, + "loss": 0.71848536, + "num_input_tokens_seen": 247161095, + "step": 11450, + "time_per_iteration": 2.664257764816284 + }, + { + "auxiliary_loss_clip": 0.0107707, + "auxiliary_loss_mlp": 0.01038466, + "balance_loss_clip": 1.03789186, + "balance_loss_mlp": 1.02427959, + "epoch": 0.6884713663009169, + "flos": 17341047446400.0, + "grad_norm": 1.8597549906829547, + "language_loss": 0.75942892, + "learning_rate": 9.344169934211068e-07, + "loss": 0.78058428, + "num_input_tokens_seen": 247178565, + "step": 11451, + "time_per_iteration": 2.6398167610168457 + }, + { + "auxiliary_loss_clip": 0.01101483, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.03904259, + "balance_loss_mlp": 1.01854348, + "epoch": 0.6885314895535849, + "flos": 26470832976000.0, + "grad_norm": 1.4408172988825247, + "language_loss": 0.69557142, + "learning_rate": 9.340874330245505e-07, + "loss": 0.71690023, + "num_input_tokens_seen": 247202345, + "step": 11452, + "time_per_iteration": 2.6441712379455566 + }, + { + "auxiliary_loss_clip": 0.01112297, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.03905725, + "balance_loss_mlp": 1.02143824, + "epoch": 0.6885916128062528, + "flos": 20521548178560.0, + "grad_norm": 1.603751678545201, + "language_loss": 0.71996975, + "learning_rate": 9.337579130475042e-07, + "loss": 0.74146044, + "num_input_tokens_seen": 247219240, + "step": 11453, + "time_per_iteration": 2.564039707183838 + }, + { + "auxiliary_loss_clip": 0.010232, + "auxiliary_loss_mlp": 0.0075158, + "balance_loss_clip": 1.00928593, + "balance_loss_mlp": 0.99959499, + "epoch": 0.6886517360589208, + "flos": 70715795679360.0, + "grad_norm": 0.7798992537715281, + "language_loss": 0.50685745, + "learning_rate": 9.334284335024644e-07, + "loss": 0.52460527, + "num_input_tokens_seen": 247272010, + "step": 11454, + "time_per_iteration": 3.016122341156006 + }, + { + "auxiliary_loss_clip": 0.01097098, + "auxiliary_loss_mlp": 0.01035719, + "balance_loss_clip": 1.03854132, + "balance_loss_mlp": 1.02329731, + "epoch": 0.6887118593115887, + "flos": 17893533513600.0, + "grad_norm": 2.526020135416449, + "language_loss": 0.75680363, + "learning_rate": 9.330989944019263e-07, + "loss": 0.77813178, + "num_input_tokens_seen": 247290630, + "step": 11455, + "time_per_iteration": 2.7730109691619873 + }, + { + "auxiliary_loss_clip": 0.01092116, + "auxiliary_loss_mlp": 0.0103676, + "balance_loss_clip": 1.03623128, + "balance_loss_mlp": 1.02249074, + "epoch": 0.6887719825642568, + "flos": 17453017117440.0, + "grad_norm": 2.7328430061690154, + "language_loss": 0.7254653, + "learning_rate": 9.327695957583803e-07, + "loss": 0.74675405, + "num_input_tokens_seen": 247304800, + "step": 11456, + "time_per_iteration": 2.7660651206970215 + }, + { + "auxiliary_loss_clip": 0.0108935, + "auxiliary_loss_mlp": 0.01035233, + "balance_loss_clip": 1.03873277, + "balance_loss_mlp": 1.02247739, + "epoch": 0.6888321058169247, + "flos": 23070199743360.0, + "grad_norm": 2.090937721500204, + "language_loss": 0.81322861, + "learning_rate": 9.32440237584319e-07, + "loss": 0.83447444, + "num_input_tokens_seen": 247323450, + "step": 11457, + "time_per_iteration": 2.691455841064453 + }, + { + "auxiliary_loss_clip": 0.01105328, + "auxiliary_loss_mlp": 0.00771348, + "balance_loss_clip": 1.04052448, + "balance_loss_mlp": 1.00017715, + "epoch": 0.6888922290695927, + "flos": 23368833417600.0, + "grad_norm": 1.548184255846192, + "language_loss": 0.76552927, + "learning_rate": 9.321109198922301e-07, + "loss": 0.78429604, + "num_input_tokens_seen": 247343845, + "step": 11458, + "time_per_iteration": 2.6362695693969727 + }, + { + "auxiliary_loss_clip": 0.01113281, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.03937232, + "balance_loss_mlp": 1.02138472, + "epoch": 0.6889523523222606, + "flos": 17631636474240.0, + "grad_norm": 2.7369612879197986, + "language_loss": 0.67654693, + "learning_rate": 9.31781642694603e-07, + "loss": 0.69802415, + "num_input_tokens_seen": 247356650, + "step": 11459, + "time_per_iteration": 2.6157007217407227 + }, + { + "auxiliary_loss_clip": 0.01064164, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.03582239, + "balance_loss_mlp": 1.01958048, + "epoch": 0.6890124755749286, + "flos": 25228144097280.0, + "grad_norm": 1.4844709645177188, + "language_loss": 0.68446231, + "learning_rate": 9.314524060039221e-07, + "loss": 0.70542651, + "num_input_tokens_seen": 247377340, + "step": 11460, + "time_per_iteration": 2.7714388370513916 + }, + { + "auxiliary_loss_clip": 0.01087273, + "auxiliary_loss_mlp": 0.01033379, + "balance_loss_clip": 1.03934288, + "balance_loss_mlp": 1.01844215, + "epoch": 0.6890725988275965, + "flos": 20230240878720.0, + "grad_norm": 1.8579339278918177, + "language_loss": 0.77017105, + "learning_rate": 9.311232098326731e-07, + "loss": 0.7913776, + "num_input_tokens_seen": 247395805, + "step": 11461, + "time_per_iteration": 2.7195050716400146 + }, + { + "auxiliary_loss_clip": 0.01091784, + "auxiliary_loss_mlp": 0.01037341, + "balance_loss_clip": 1.03789628, + "balance_loss_mlp": 1.02331018, + "epoch": 0.6891327220802645, + "flos": 14535311264640.0, + "grad_norm": 1.7919419635412812, + "language_loss": 0.6962589, + "learning_rate": 9.307940541933401e-07, + "loss": 0.71755016, + "num_input_tokens_seen": 247413165, + "step": 11462, + "time_per_iteration": 2.695122718811035 + }, + { + "auxiliary_loss_clip": 0.01105224, + "auxiliary_loss_mlp": 0.01028927, + "balance_loss_clip": 1.04118133, + "balance_loss_mlp": 1.01500297, + "epoch": 0.6891928453329325, + "flos": 21139139646720.0, + "grad_norm": 1.4465330715019271, + "language_loss": 0.8737253, + "learning_rate": 9.304649390984034e-07, + "loss": 0.89506674, + "num_input_tokens_seen": 247433140, + "step": 11463, + "time_per_iteration": 2.746290922164917 + }, + { + "auxiliary_loss_clip": 0.01064548, + "auxiliary_loss_mlp": 0.01030124, + "balance_loss_clip": 1.04010975, + "balance_loss_mlp": 1.01829851, + "epoch": 0.6892529685856005, + "flos": 17858520731520.0, + "grad_norm": 1.5297822834727555, + "language_loss": 0.68426907, + "learning_rate": 9.301358645603428e-07, + "loss": 0.70521581, + "num_input_tokens_seen": 247451265, + "step": 11464, + "time_per_iteration": 2.8325612545013428 + }, + { + "auxiliary_loss_clip": 0.01102764, + "auxiliary_loss_mlp": 0.01040883, + "balance_loss_clip": 1.03917408, + "balance_loss_mlp": 1.02711463, + "epoch": 0.6893130918382685, + "flos": 29934811843200.0, + "grad_norm": 2.288958108481903, + "language_loss": 0.65110016, + "learning_rate": 9.298068305916373e-07, + "loss": 0.67253661, + "num_input_tokens_seen": 247471645, + "step": 11465, + "time_per_iteration": 2.815046787261963 + }, + { + "auxiliary_loss_clip": 0.01104457, + "auxiliary_loss_mlp": 0.01038209, + "balance_loss_clip": 1.03854775, + "balance_loss_mlp": 1.02463746, + "epoch": 0.6893732150909364, + "flos": 24388516707840.0, + "grad_norm": 1.3495813204241554, + "language_loss": 0.72669965, + "learning_rate": 9.294778372047649e-07, + "loss": 0.74812633, + "num_input_tokens_seen": 247491170, + "step": 11466, + "time_per_iteration": 2.671194314956665 + }, + { + "auxiliary_loss_clip": 0.01114766, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.04005003, + "balance_loss_mlp": 1.02122736, + "epoch": 0.6894333383436044, + "flos": 16982874979200.0, + "grad_norm": 1.6856701084963044, + "language_loss": 0.71847236, + "learning_rate": 9.291488844121995e-07, + "loss": 0.73996592, + "num_input_tokens_seen": 247509005, + "step": 11467, + "time_per_iteration": 2.759052276611328 + }, + { + "auxiliary_loss_clip": 0.01096068, + "auxiliary_loss_mlp": 0.01036799, + "balance_loss_clip": 1.03972626, + "balance_loss_mlp": 1.02171886, + "epoch": 0.6894934615962723, + "flos": 18985540838400.0, + "grad_norm": 1.978085572567592, + "language_loss": 0.80877995, + "learning_rate": 9.288199722264156e-07, + "loss": 0.83010864, + "num_input_tokens_seen": 247527050, + "step": 11468, + "time_per_iteration": 2.8261470794677734 + }, + { + "auxiliary_loss_clip": 0.01116061, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.04050148, + "balance_loss_mlp": 1.02103066, + "epoch": 0.6895535848489404, + "flos": 34531664734080.0, + "grad_norm": 1.489529294726542, + "language_loss": 0.66164148, + "learning_rate": 9.284911006598875e-07, + "loss": 0.68314791, + "num_input_tokens_seen": 247547765, + "step": 11469, + "time_per_iteration": 5.082685232162476 + }, + { + "auxiliary_loss_clip": 0.01023211, + "auxiliary_loss_mlp": 0.01004328, + "balance_loss_clip": 1.00959301, + "balance_loss_mlp": 1.00309992, + "epoch": 0.6896137081016083, + "flos": 50075852273280.0, + "grad_norm": 0.7983802511717295, + "language_loss": 0.55211931, + "learning_rate": 9.281622697250824e-07, + "loss": 0.57239467, + "num_input_tokens_seen": 247603515, + "step": 11470, + "time_per_iteration": 3.123518228530884 + }, + { + "auxiliary_loss_clip": 0.01098666, + "auxiliary_loss_mlp": 0.01034034, + "balance_loss_clip": 1.03866851, + "balance_loss_mlp": 1.02299523, + "epoch": 0.6896738313542763, + "flos": 19938215306880.0, + "grad_norm": 1.7748421149249738, + "language_loss": 0.78111279, + "learning_rate": 9.278334794344715e-07, + "loss": 0.80243975, + "num_input_tokens_seen": 247622110, + "step": 11471, + "time_per_iteration": 2.6707584857940674 + }, + { + "auxiliary_loss_clip": 0.01088217, + "auxiliary_loss_mlp": 0.01034739, + "balance_loss_clip": 1.03501463, + "balance_loss_mlp": 1.02104771, + "epoch": 0.6897339546069442, + "flos": 21725489260800.0, + "grad_norm": 1.724757958239778, + "language_loss": 0.78451025, + "learning_rate": 9.275047298005232e-07, + "loss": 0.80573976, + "num_input_tokens_seen": 247641905, + "step": 11472, + "time_per_iteration": 4.256728887557983 + }, + { + "auxiliary_loss_clip": 0.01081643, + "auxiliary_loss_mlp": 0.01032054, + "balance_loss_clip": 1.03641033, + "balance_loss_mlp": 1.0195905, + "epoch": 0.6897940778596122, + "flos": 19826497031040.0, + "grad_norm": 1.5995976978854818, + "language_loss": 0.76272285, + "learning_rate": 9.271760208357024e-07, + "loss": 0.78385979, + "num_input_tokens_seen": 247660945, + "step": 11473, + "time_per_iteration": 4.321485757827759 + }, + { + "auxiliary_loss_clip": 0.01070517, + "auxiliary_loss_mlp": 0.0105009, + "balance_loss_clip": 1.03430462, + "balance_loss_mlp": 1.03352571, + "epoch": 0.6898542011122801, + "flos": 17310056987520.0, + "grad_norm": 1.7861232918293928, + "language_loss": 0.75359839, + "learning_rate": 9.268473525524751e-07, + "loss": 0.77480447, + "num_input_tokens_seen": 247678395, + "step": 11474, + "time_per_iteration": 2.788238525390625 + }, + { + "auxiliary_loss_clip": 0.01068006, + "auxiliary_loss_mlp": 0.01032395, + "balance_loss_clip": 1.04364872, + "balance_loss_mlp": 1.01921058, + "epoch": 0.6899143243649482, + "flos": 24754051463040.0, + "grad_norm": 1.4663281053614279, + "language_loss": 0.74502885, + "learning_rate": 9.26518724963303e-07, + "loss": 0.76603287, + "num_input_tokens_seen": 247698380, + "step": 11475, + "time_per_iteration": 2.878188371658325 + }, + { + "auxiliary_loss_clip": 0.01084179, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.03779638, + "balance_loss_mlp": 1.02154493, + "epoch": 0.6899744476176161, + "flos": 17234536642560.0, + "grad_norm": 1.9957028062650322, + "language_loss": 0.88603026, + "learning_rate": 9.261901380806491e-07, + "loss": 0.90722603, + "num_input_tokens_seen": 247716370, + "step": 11476, + "time_per_iteration": 2.7922370433807373 + }, + { + "auxiliary_loss_clip": 0.01112551, + "auxiliary_loss_mlp": 0.01037017, + "balance_loss_clip": 1.03934443, + "balance_loss_mlp": 1.02450645, + "epoch": 0.6900345708702841, + "flos": 25410678036480.0, + "grad_norm": 1.5288697914631357, + "language_loss": 0.70166922, + "learning_rate": 9.258615919169724e-07, + "loss": 0.72316492, + "num_input_tokens_seen": 247737335, + "step": 11477, + "time_per_iteration": 2.780515193939209 + }, + { + "auxiliary_loss_clip": 0.01107191, + "auxiliary_loss_mlp": 0.01045383, + "balance_loss_clip": 1.03964376, + "balance_loss_mlp": 1.03081584, + "epoch": 0.6900946941229521, + "flos": 23434190213760.0, + "grad_norm": 2.1987152086234723, + "language_loss": 0.68323863, + "learning_rate": 9.255330864847313e-07, + "loss": 0.70476437, + "num_input_tokens_seen": 247756680, + "step": 11478, + "time_per_iteration": 4.340089559555054 + }, + { + "auxiliary_loss_clip": 0.01104632, + "auxiliary_loss_mlp": 0.0103447, + "balance_loss_clip": 1.04020643, + "balance_loss_mlp": 1.02203012, + "epoch": 0.69015481737562, + "flos": 17820096157440.0, + "grad_norm": 1.918426525633328, + "language_loss": 0.76238775, + "learning_rate": 9.252046217963843e-07, + "loss": 0.78377873, + "num_input_tokens_seen": 247774265, + "step": 11479, + "time_per_iteration": 2.7662193775177 + }, + { + "auxiliary_loss_clip": 0.01104072, + "auxiliary_loss_mlp": 0.01030752, + "balance_loss_clip": 1.03842354, + "balance_loss_mlp": 1.01716816, + "epoch": 0.690214940628288, + "flos": 17456500736640.0, + "grad_norm": 1.8031624410020608, + "language_loss": 0.78769386, + "learning_rate": 9.248761978643856e-07, + "loss": 0.8090421, + "num_input_tokens_seen": 247792395, + "step": 11480, + "time_per_iteration": 2.6917519569396973 + }, + { + "auxiliary_loss_clip": 0.01071212, + "auxiliary_loss_mlp": 0.01033409, + "balance_loss_clip": 1.03474808, + "balance_loss_mlp": 1.01971793, + "epoch": 0.6902750638809559, + "flos": 29566691308800.0, + "grad_norm": 2.1117215547556922, + "language_loss": 0.75273913, + "learning_rate": 9.245478147011885e-07, + "loss": 0.77378535, + "num_input_tokens_seen": 247811985, + "step": 11481, + "time_per_iteration": 2.914005994796753 + }, + { + "auxiliary_loss_clip": 0.01078232, + "auxiliary_loss_mlp": 0.01031748, + "balance_loss_clip": 1.03950965, + "balance_loss_mlp": 1.01795578, + "epoch": 0.690335187133624, + "flos": 25557121785600.0, + "grad_norm": 1.8140875397528662, + "language_loss": 0.69146681, + "learning_rate": 9.24219472319246e-07, + "loss": 0.71256661, + "num_input_tokens_seen": 247831880, + "step": 11482, + "time_per_iteration": 2.888972759246826 + }, + { + "auxiliary_loss_clip": 0.01114892, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.04087675, + "balance_loss_mlp": 1.02031827, + "epoch": 0.6903953103862919, + "flos": 22488447070080.0, + "grad_norm": 1.4863828280794367, + "language_loss": 0.82752049, + "learning_rate": 9.238911707310096e-07, + "loss": 0.84900403, + "num_input_tokens_seen": 247851170, + "step": 11483, + "time_per_iteration": 2.6664347648620605 + }, + { + "auxiliary_loss_clip": 0.01116625, + "auxiliary_loss_mlp": 0.01030991, + "balance_loss_clip": 1.0412333, + "balance_loss_mlp": 1.01880169, + "epoch": 0.6904554336389599, + "flos": 26100521712000.0, + "grad_norm": 1.9326210731008662, + "language_loss": 0.65550387, + "learning_rate": 9.235629099489273e-07, + "loss": 0.67697996, + "num_input_tokens_seen": 247868950, + "step": 11484, + "time_per_iteration": 2.629709005355835 + }, + { + "auxiliary_loss_clip": 0.01079245, + "auxiliary_loss_mlp": 0.01044412, + "balance_loss_clip": 1.03618813, + "balance_loss_mlp": 1.03035724, + "epoch": 0.6905155568916278, + "flos": 31171754545920.0, + "grad_norm": 1.4648771757692296, + "language_loss": 0.7359699, + "learning_rate": 9.232346899854479e-07, + "loss": 0.75720656, + "num_input_tokens_seen": 247889805, + "step": 11485, + "time_per_iteration": 2.780137300491333 + }, + { + "auxiliary_loss_clip": 0.01100883, + "auxiliary_loss_mlp": 0.00771626, + "balance_loss_clip": 1.04121161, + "balance_loss_mlp": 1.00017738, + "epoch": 0.6905756801442958, + "flos": 17639681120640.0, + "grad_norm": 1.7496856130724467, + "language_loss": 0.84967637, + "learning_rate": 9.22906510853017e-07, + "loss": 0.86840141, + "num_input_tokens_seen": 247908585, + "step": 11486, + "time_per_iteration": 2.6427667140960693 + }, + { + "auxiliary_loss_clip": 0.01053468, + "auxiliary_loss_mlp": 0.01037616, + "balance_loss_clip": 1.03498769, + "balance_loss_mlp": 1.02395463, + "epoch": 0.6906358033969637, + "flos": 22343691260160.0, + "grad_norm": 1.463253599304318, + "language_loss": 0.72599518, + "learning_rate": 9.225783725640786e-07, + "loss": 0.74690592, + "num_input_tokens_seen": 247928480, + "step": 11487, + "time_per_iteration": 2.8396995067596436 + }, + { + "auxiliary_loss_clip": 0.01016718, + "auxiliary_loss_mlp": 0.0100075, + "balance_loss_clip": 1.01205254, + "balance_loss_mlp": 0.99957544, + "epoch": 0.6906959266496318, + "flos": 69747789081600.0, + "grad_norm": 0.9486957802927981, + "language_loss": 0.66587651, + "learning_rate": 9.222502751310759e-07, + "loss": 0.68605119, + "num_input_tokens_seen": 247988855, + "step": 11488, + "time_per_iteration": 3.256028175354004 + }, + { + "auxiliary_loss_clip": 0.01090242, + "auxiliary_loss_mlp": 0.01035444, + "balance_loss_clip": 1.039554, + "balance_loss_mlp": 1.02100825, + "epoch": 0.6907560499022997, + "flos": 21434253788160.0, + "grad_norm": 1.736123733035723, + "language_loss": 0.74721605, + "learning_rate": 9.219222185664519e-07, + "loss": 0.76847291, + "num_input_tokens_seen": 248007685, + "step": 11489, + "time_per_iteration": 2.6813058853149414 + }, + { + "auxiliary_loss_clip": 0.01102738, + "auxiliary_loss_mlp": 0.01041104, + "balance_loss_clip": 1.03759074, + "balance_loss_mlp": 1.0267272, + "epoch": 0.6908161731549677, + "flos": 14392207480320.0, + "grad_norm": 2.0464811594474006, + "language_loss": 0.62228811, + "learning_rate": 9.215942028826445e-07, + "loss": 0.64372647, + "num_input_tokens_seen": 248025145, + "step": 11490, + "time_per_iteration": 2.7024333477020264 + }, + { + "auxiliary_loss_clip": 0.01090002, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.03779197, + "balance_loss_mlp": 1.01960242, + "epoch": 0.6908762964076357, + "flos": 20010970304640.0, + "grad_norm": 1.709286193075208, + "language_loss": 0.72809607, + "learning_rate": 9.212662280920937e-07, + "loss": 0.74932313, + "num_input_tokens_seen": 248043750, + "step": 11491, + "time_per_iteration": 2.746288537979126 + }, + { + "auxiliary_loss_clip": 0.01089559, + "auxiliary_loss_mlp": 0.00771788, + "balance_loss_clip": 1.03801966, + "balance_loss_mlp": 1.00016296, + "epoch": 0.6909364196603036, + "flos": 28769079853440.0, + "grad_norm": 1.39649539646883, + "language_loss": 0.70297456, + "learning_rate": 9.20938294207235e-07, + "loss": 0.72158802, + "num_input_tokens_seen": 248065765, + "step": 11492, + "time_per_iteration": 2.7897520065307617 + }, + { + "auxiliary_loss_clip": 0.010831, + "auxiliary_loss_mlp": 0.01033716, + "balance_loss_clip": 1.04620051, + "balance_loss_mlp": 1.0190773, + "epoch": 0.6909965429129716, + "flos": 22528128620160.0, + "grad_norm": 1.7344123027630052, + "language_loss": 0.74773538, + "learning_rate": 9.206104012405049e-07, + "loss": 0.76890349, + "num_input_tokens_seen": 248083810, + "step": 11493, + "time_per_iteration": 2.9563519954681396 + }, + { + "auxiliary_loss_clip": 0.01114123, + "auxiliary_loss_mlp": 0.01030007, + "balance_loss_clip": 1.04090369, + "balance_loss_mlp": 1.01648879, + "epoch": 0.6910566661656395, + "flos": 18405942981120.0, + "grad_norm": 1.7115108202132974, + "language_loss": 0.74647975, + "learning_rate": 9.20282549204336e-07, + "loss": 0.76792109, + "num_input_tokens_seen": 248103185, + "step": 11494, + "time_per_iteration": 2.606947422027588 + }, + { + "auxiliary_loss_clip": 0.01086005, + "auxiliary_loss_mlp": 0.01030061, + "balance_loss_clip": 1.03727913, + "balance_loss_mlp": 1.01682854, + "epoch": 0.6911167894183076, + "flos": 30773972355840.0, + "grad_norm": 1.4748735208604244, + "language_loss": 0.68749166, + "learning_rate": 9.19954738111161e-07, + "loss": 0.70865232, + "num_input_tokens_seen": 248125665, + "step": 11495, + "time_per_iteration": 2.768889904022217 + }, + { + "auxiliary_loss_clip": 0.01089976, + "auxiliary_loss_mlp": 0.01029957, + "balance_loss_clip": 1.03646207, + "balance_loss_mlp": 1.01640916, + "epoch": 0.6911769126709755, + "flos": 13735724561280.0, + "grad_norm": 1.8085674885564547, + "language_loss": 0.74088383, + "learning_rate": 9.196269679734119e-07, + "loss": 0.76208317, + "num_input_tokens_seen": 248142545, + "step": 11496, + "time_per_iteration": 2.6374707221984863 + }, + { + "auxiliary_loss_clip": 0.01075882, + "auxiliary_loss_mlp": 0.01033532, + "balance_loss_clip": 1.03445745, + "balance_loss_mlp": 1.02084804, + "epoch": 0.6912370359236435, + "flos": 17566854295680.0, + "grad_norm": 2.1445438478171, + "language_loss": 0.80236906, + "learning_rate": 9.19299238803515e-07, + "loss": 0.82346314, + "num_input_tokens_seen": 248160225, + "step": 11497, + "time_per_iteration": 2.6873879432678223 + }, + { + "auxiliary_loss_clip": 0.01074496, + "auxiliary_loss_mlp": 0.01037916, + "balance_loss_clip": 1.03591168, + "balance_loss_mlp": 1.02401567, + "epoch": 0.6912971591763114, + "flos": 22090772620800.0, + "grad_norm": 1.4245028324847169, + "language_loss": 0.8060286, + "learning_rate": 9.189715506138993e-07, + "loss": 0.82715273, + "num_input_tokens_seen": 248180430, + "step": 11498, + "time_per_iteration": 2.7175493240356445 + }, + { + "auxiliary_loss_clip": 0.01099715, + "auxiliary_loss_mlp": 0.01033226, + "balance_loss_clip": 1.03920996, + "balance_loss_mlp": 1.01955223, + "epoch": 0.6913572824289794, + "flos": 29971476650880.0, + "grad_norm": 1.5050738051892152, + "language_loss": 0.86088848, + "learning_rate": 9.186439034169915e-07, + "loss": 0.88221788, + "num_input_tokens_seen": 248202365, + "step": 11499, + "time_per_iteration": 2.7579431533813477 + }, + { + "auxiliary_loss_clip": 0.01080625, + "auxiliary_loss_mlp": 0.00771124, + "balance_loss_clip": 1.040236, + "balance_loss_mlp": 1.00014019, + "epoch": 0.6914174056816473, + "flos": 20448936835200.0, + "grad_norm": 1.7961404954828535, + "language_loss": 0.75816536, + "learning_rate": 9.183162972252145e-07, + "loss": 0.77668285, + "num_input_tokens_seen": 248221750, + "step": 11500, + "time_per_iteration": 2.658766031265259 + }, + { + "auxiliary_loss_clip": 0.01058615, + "auxiliary_loss_mlp": 0.01050016, + "balance_loss_clip": 1.03728688, + "balance_loss_mlp": 1.03423262, + "epoch": 0.6914775289343154, + "flos": 21282530739840.0, + "grad_norm": 1.8214656574654693, + "language_loss": 0.77514184, + "learning_rate": 9.179887320509921e-07, + "loss": 0.79622817, + "num_input_tokens_seen": 248239535, + "step": 11501, + "time_per_iteration": 2.751330614089966 + }, + { + "auxiliary_loss_clip": 0.01099448, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.03807986, + "balance_loss_mlp": 1.02625489, + "epoch": 0.6915376521869833, + "flos": 23878118401920.0, + "grad_norm": 1.7335303734743124, + "language_loss": 0.73580784, + "learning_rate": 9.176612079067458e-07, + "loss": 0.75720799, + "num_input_tokens_seen": 248259055, + "step": 11502, + "time_per_iteration": 2.8098790645599365 + }, + { + "auxiliary_loss_clip": 0.01041175, + "auxiliary_loss_mlp": 0.01044606, + "balance_loss_clip": 1.034199, + "balance_loss_mlp": 1.02875125, + "epoch": 0.6915977754396513, + "flos": 11510268595200.0, + "grad_norm": 2.5749254426128743, + "language_loss": 0.73368824, + "learning_rate": 9.173337248048953e-07, + "loss": 0.75454605, + "num_input_tokens_seen": 248276765, + "step": 11503, + "time_per_iteration": 2.747083902359009 + }, + { + "auxiliary_loss_clip": 0.01098455, + "auxiliary_loss_mlp": 0.01041122, + "balance_loss_clip": 1.03777838, + "balance_loss_mlp": 1.02701986, + "epoch": 0.6916578986923193, + "flos": 22601278667520.0, + "grad_norm": 1.7356607503629284, + "language_loss": 0.77010226, + "learning_rate": 9.170062827578575e-07, + "loss": 0.79149806, + "num_input_tokens_seen": 248295310, + "step": 11504, + "time_per_iteration": 2.706209182739258 + }, + { + "auxiliary_loss_clip": 0.01069336, + "auxiliary_loss_mlp": 0.01039527, + "balance_loss_clip": 1.0342164, + "balance_loss_mlp": 1.02457845, + "epoch": 0.6917180219449872, + "flos": 23477355383040.0, + "grad_norm": 1.7715532639399074, + "language_loss": 0.73565066, + "learning_rate": 9.166788817780499e-07, + "loss": 0.75673938, + "num_input_tokens_seen": 248315230, + "step": 11505, + "time_per_iteration": 2.725203514099121 + }, + { + "auxiliary_loss_clip": 0.01054739, + "auxiliary_loss_mlp": 0.00772936, + "balance_loss_clip": 1.03434849, + "balance_loss_mlp": 1.00009656, + "epoch": 0.6917781451976552, + "flos": 23732536579200.0, + "grad_norm": 1.8090122394504842, + "language_loss": 0.88027036, + "learning_rate": 9.163515218778886e-07, + "loss": 0.89854711, + "num_input_tokens_seen": 248332980, + "step": 11506, + "time_per_iteration": 2.796102285385132 + }, + { + "auxiliary_loss_clip": 0.01086001, + "auxiliary_loss_mlp": 0.01030005, + "balance_loss_clip": 1.03935504, + "balance_loss_mlp": 1.01724994, + "epoch": 0.6918382684503231, + "flos": 31466760946560.0, + "grad_norm": 2.045878343291588, + "language_loss": 0.7011205, + "learning_rate": 9.160242030697856e-07, + "loss": 0.72228056, + "num_input_tokens_seen": 248352865, + "step": 11507, + "time_per_iteration": 2.755439043045044 + }, + { + "auxiliary_loss_clip": 0.01086914, + "auxiliary_loss_mlp": 0.01036889, + "balance_loss_clip": 1.03763783, + "balance_loss_mlp": 1.02344775, + "epoch": 0.6918983917029912, + "flos": 21650471706240.0, + "grad_norm": 1.853503068489786, + "language_loss": 0.76915097, + "learning_rate": 9.156969253661538e-07, + "loss": 0.79038906, + "num_input_tokens_seen": 248371125, + "step": 11508, + "time_per_iteration": 4.521030426025391 + }, + { + "auxiliary_loss_clip": 0.0109627, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.03752184, + "balance_loss_mlp": 1.02148128, + "epoch": 0.6919585149556591, + "flos": 25550082720000.0, + "grad_norm": 1.8821969374944694, + "language_loss": 0.75171518, + "learning_rate": 9.153696887794027e-07, + "loss": 0.77302009, + "num_input_tokens_seen": 248390455, + "step": 11509, + "time_per_iteration": 2.69903826713562 + }, + { + "auxiliary_loss_clip": 0.01062313, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.03829181, + "balance_loss_mlp": 1.02342892, + "epoch": 0.6920186382083271, + "flos": 23659781581440.0, + "grad_norm": 1.9874772496775723, + "language_loss": 0.64212132, + "learning_rate": 9.150424933219425e-07, + "loss": 0.66311103, + "num_input_tokens_seen": 248411305, + "step": 11510, + "time_per_iteration": 2.848520278930664 + }, + { + "auxiliary_loss_clip": 0.0108123, + "auxiliary_loss_mlp": 0.01034683, + "balance_loss_clip": 1.03798079, + "balance_loss_mlp": 1.02002048, + "epoch": 0.692078761460995, + "flos": 19061959023360.0, + "grad_norm": 1.835249008120565, + "language_loss": 0.75375962, + "learning_rate": 9.147153390061788e-07, + "loss": 0.77491868, + "num_input_tokens_seen": 248430190, + "step": 11511, + "time_per_iteration": 4.174523115158081 + }, + { + "auxiliary_loss_clip": 0.01084843, + "auxiliary_loss_mlp": 0.01029668, + "balance_loss_clip": 1.04214227, + "balance_loss_mlp": 1.01793194, + "epoch": 0.692138884713663, + "flos": 29023291382400.0, + "grad_norm": 1.7047662296255404, + "language_loss": 0.62659085, + "learning_rate": 9.143882258445184e-07, + "loss": 0.64773595, + "num_input_tokens_seen": 248450830, + "step": 11512, + "time_per_iteration": 4.534400224685669 + }, + { + "auxiliary_loss_clip": 0.01080139, + "auxiliary_loss_mlp": 0.01036917, + "balance_loss_clip": 1.03771234, + "balance_loss_mlp": 1.02366054, + "epoch": 0.6921990079663309, + "flos": 14757849976320.0, + "grad_norm": 1.7738066365158425, + "language_loss": 0.82885146, + "learning_rate": 9.140611538493666e-07, + "loss": 0.85002202, + "num_input_tokens_seen": 248468585, + "step": 11513, + "time_per_iteration": 2.744152545928955 + }, + { + "auxiliary_loss_clip": 0.01050332, + "auxiliary_loss_mlp": 0.0103433, + "balance_loss_clip": 1.03769469, + "balance_loss_mlp": 1.02236128, + "epoch": 0.692259131218999, + "flos": 23841848643840.0, + "grad_norm": 1.4085469853389758, + "language_loss": 0.78494793, + "learning_rate": 9.137341230331233e-07, + "loss": 0.8057946, + "num_input_tokens_seen": 248490535, + "step": 11514, + "time_per_iteration": 2.7933335304260254 + }, + { + "auxiliary_loss_clip": 0.0106844, + "auxiliary_loss_mlp": 0.01038567, + "balance_loss_clip": 1.03552842, + "balance_loss_mlp": 1.0250721, + "epoch": 0.6923192544716669, + "flos": 19135073157120.0, + "grad_norm": 2.196765924687951, + "language_loss": 0.75278533, + "learning_rate": 9.134071334081907e-07, + "loss": 0.77385533, + "num_input_tokens_seen": 248508575, + "step": 11515, + "time_per_iteration": 2.7745299339294434 + }, + { + "auxiliary_loss_clip": 0.01070009, + "auxiliary_loss_mlp": 0.01033102, + "balance_loss_clip": 1.03992462, + "balance_loss_mlp": 1.02032304, + "epoch": 0.6923793777243349, + "flos": 28074639237120.0, + "grad_norm": 1.799388111244089, + "language_loss": 0.53198493, + "learning_rate": 9.130801849869694e-07, + "loss": 0.55301601, + "num_input_tokens_seen": 248527025, + "step": 11516, + "time_per_iteration": 2.775190830230713 + }, + { + "auxiliary_loss_clip": 0.01097274, + "auxiliary_loss_mlp": 0.01037809, + "balance_loss_clip": 1.03787732, + "balance_loss_mlp": 1.02451098, + "epoch": 0.6924395009770029, + "flos": 16581250033920.0, + "grad_norm": 1.6962423082360507, + "language_loss": 0.72982675, + "learning_rate": 9.127532777818557e-07, + "loss": 0.75117755, + "num_input_tokens_seen": 248544275, + "step": 11517, + "time_per_iteration": 2.598116397857666 + }, + { + "auxiliary_loss_clip": 0.0111384, + "auxiliary_loss_mlp": 0.01037211, + "balance_loss_clip": 1.03925538, + "balance_loss_mlp": 1.02354932, + "epoch": 0.6924996242296708, + "flos": 16655297921280.0, + "grad_norm": 1.6338598791065129, + "language_loss": 0.76462078, + "learning_rate": 9.124264118052465e-07, + "loss": 0.78613126, + "num_input_tokens_seen": 248561870, + "step": 11518, + "time_per_iteration": 4.141700983047485 + }, + { + "auxiliary_loss_clip": 0.0110627, + "auxiliary_loss_mlp": 0.01040853, + "balance_loss_clip": 1.04075885, + "balance_loss_mlp": 1.02653027, + "epoch": 0.6925597474823388, + "flos": 34754167532160.0, + "grad_norm": 1.3592469216685072, + "language_loss": 0.64467025, + "learning_rate": 9.120995870695376e-07, + "loss": 0.66614151, + "num_input_tokens_seen": 248588190, + "step": 11519, + "time_per_iteration": 2.8347549438476562 + }, + { + "auxiliary_loss_clip": 0.01080573, + "auxiliary_loss_mlp": 0.0103987, + "balance_loss_clip": 1.03696394, + "balance_loss_mlp": 1.02670944, + "epoch": 0.6926198707350067, + "flos": 21871717528320.0, + "grad_norm": 2.1051263263306805, + "language_loss": 0.62538528, + "learning_rate": 9.117728035871212e-07, + "loss": 0.64658964, + "num_input_tokens_seen": 248606460, + "step": 11520, + "time_per_iteration": 2.7294435501098633 + }, + { + "auxiliary_loss_clip": 0.01075792, + "auxiliary_loss_mlp": 0.01037449, + "balance_loss_clip": 1.03631949, + "balance_loss_mlp": 1.0228461, + "epoch": 0.6926799939876748, + "flos": 13006271162880.0, + "grad_norm": 2.2378150496595013, + "language_loss": 0.77924216, + "learning_rate": 9.114460613703887e-07, + "loss": 0.80037463, + "num_input_tokens_seen": 248623715, + "step": 11521, + "time_per_iteration": 2.717240571975708 + }, + { + "auxiliary_loss_clip": 0.01100684, + "auxiliary_loss_mlp": 0.0103794, + "balance_loss_clip": 1.03691578, + "balance_loss_mlp": 1.02260375, + "epoch": 0.6927401172403427, + "flos": 16761234107520.0, + "grad_norm": 2.442345109030128, + "language_loss": 0.81992316, + "learning_rate": 9.111193604317304e-07, + "loss": 0.84130937, + "num_input_tokens_seen": 248640575, + "step": 11522, + "time_per_iteration": 2.6045098304748535 + }, + { + "auxiliary_loss_clip": 0.01100284, + "auxiliary_loss_mlp": 0.01034079, + "balance_loss_clip": 1.04276228, + "balance_loss_mlp": 1.02152598, + "epoch": 0.6928002404930107, + "flos": 25705648523520.0, + "grad_norm": 1.8984649858129847, + "language_loss": 0.76575756, + "learning_rate": 9.107927007835361e-07, + "loss": 0.78710121, + "num_input_tokens_seen": 248663535, + "step": 11523, + "time_per_iteration": 2.6705586910247803 + }, + { + "auxiliary_loss_clip": 0.01082858, + "auxiliary_loss_mlp": 0.01035266, + "balance_loss_clip": 1.03894114, + "balance_loss_mlp": 1.02276123, + "epoch": 0.6928603637456786, + "flos": 18588261438720.0, + "grad_norm": 2.087470687803226, + "language_loss": 0.68297094, + "learning_rate": 9.104660824381915e-07, + "loss": 0.70415223, + "num_input_tokens_seen": 248681125, + "step": 11524, + "time_per_iteration": 2.6786375045776367 + }, + { + "auxiliary_loss_clip": 0.0108268, + "auxiliary_loss_mlp": 0.01033665, + "balance_loss_clip": 1.03927469, + "balance_loss_mlp": 1.01960993, + "epoch": 0.6929204869983466, + "flos": 22200874784640.0, + "grad_norm": 1.782896915319788, + "language_loss": 0.64250147, + "learning_rate": 9.101395054080815e-07, + "loss": 0.66366494, + "num_input_tokens_seen": 248700555, + "step": 11525, + "time_per_iteration": 2.709665536880493 + }, + { + "auxiliary_loss_clip": 0.01076674, + "auxiliary_loss_mlp": 0.01040353, + "balance_loss_clip": 1.04186177, + "balance_loss_mlp": 1.02660835, + "epoch": 0.6929806102510145, + "flos": 17894754576000.0, + "grad_norm": 2.1892792904192366, + "language_loss": 0.70518214, + "learning_rate": 9.098129697055907e-07, + "loss": 0.72635239, + "num_input_tokens_seen": 248716095, + "step": 11526, + "time_per_iteration": 2.7389345169067383 + }, + { + "auxiliary_loss_clip": 0.01089418, + "auxiliary_loss_mlp": 0.01034739, + "balance_loss_clip": 1.03708529, + "balance_loss_mlp": 1.02210879, + "epoch": 0.6930407335036826, + "flos": 19755178577280.0, + "grad_norm": 2.017152131296503, + "language_loss": 0.76394051, + "learning_rate": 9.094864753431022e-07, + "loss": 0.78518212, + "num_input_tokens_seen": 248735330, + "step": 11527, + "time_per_iteration": 2.675387382507324 + }, + { + "auxiliary_loss_clip": 0.01084801, + "auxiliary_loss_mlp": 0.01040813, + "balance_loss_clip": 1.03604603, + "balance_loss_mlp": 1.02701497, + "epoch": 0.6931008567563505, + "flos": 21544248211200.0, + "grad_norm": 1.6619978055585172, + "language_loss": 0.7924946, + "learning_rate": 9.091600223329952e-07, + "loss": 0.81375074, + "num_input_tokens_seen": 248754530, + "step": 11528, + "time_per_iteration": 2.708937883377075 + }, + { + "auxiliary_loss_clip": 0.01097731, + "auxiliary_loss_mlp": 0.01032598, + "balance_loss_clip": 1.03879142, + "balance_loss_mlp": 1.02049828, + "epoch": 0.6931609800090185, + "flos": 26250018117120.0, + "grad_norm": 1.5147905000718478, + "language_loss": 0.76348805, + "learning_rate": 9.088336106876491e-07, + "loss": 0.78479135, + "num_input_tokens_seen": 248775825, + "step": 11529, + "time_per_iteration": 2.7546539306640625 + }, + { + "auxiliary_loss_clip": 0.01110971, + "auxiliary_loss_mlp": 0.00770303, + "balance_loss_clip": 1.03999567, + "balance_loss_mlp": 1.00013018, + "epoch": 0.6932211032616865, + "flos": 32343376366080.0, + "grad_norm": 1.6406393226660527, + "language_loss": 0.7214883, + "learning_rate": 9.085072404194436e-07, + "loss": 0.74030107, + "num_input_tokens_seen": 248796180, + "step": 11530, + "time_per_iteration": 2.6844561100006104 + }, + { + "auxiliary_loss_clip": 0.01098446, + "auxiliary_loss_mlp": 0.01035346, + "balance_loss_clip": 1.04138708, + "balance_loss_mlp": 1.02000356, + "epoch": 0.6932812265143544, + "flos": 22049079909120.0, + "grad_norm": 1.6484845997572906, + "language_loss": 0.78485453, + "learning_rate": 9.081809115407513e-07, + "loss": 0.80619252, + "num_input_tokens_seen": 248814735, + "step": 11531, + "time_per_iteration": 2.753316879272461 + }, + { + "auxiliary_loss_clip": 0.010964, + "auxiliary_loss_mlp": 0.01038589, + "balance_loss_clip": 1.03926003, + "balance_loss_mlp": 1.02656698, + "epoch": 0.6933413497670224, + "flos": 26256626219520.0, + "grad_norm": 1.5040049491252714, + "language_loss": 0.69552708, + "learning_rate": 9.078546240639484e-07, + "loss": 0.71687698, + "num_input_tokens_seen": 248839140, + "step": 11532, + "time_per_iteration": 2.7001755237579346 + }, + { + "auxiliary_loss_clip": 0.01087082, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.03650141, + "balance_loss_mlp": 1.01820564, + "epoch": 0.6934014730196904, + "flos": 19573003774080.0, + "grad_norm": 1.314927604950551, + "language_loss": 0.6689446, + "learning_rate": 9.075283780014082e-07, + "loss": 0.69013584, + "num_input_tokens_seen": 248858300, + "step": 11533, + "time_per_iteration": 2.761096239089966 + }, + { + "auxiliary_loss_clip": 0.01089563, + "auxiliary_loss_mlp": 0.01038191, + "balance_loss_clip": 1.04126263, + "balance_loss_mlp": 1.02426171, + "epoch": 0.6934615962723584, + "flos": 22119249127680.0, + "grad_norm": 3.125205881661687, + "language_loss": 0.58564359, + "learning_rate": 9.072021733655007e-07, + "loss": 0.60692114, + "num_input_tokens_seen": 248876310, + "step": 11534, + "time_per_iteration": 2.6929404735565186 + }, + { + "auxiliary_loss_clip": 0.01078734, + "auxiliary_loss_mlp": 0.01030158, + "balance_loss_clip": 1.03795767, + "balance_loss_mlp": 1.01613939, + "epoch": 0.6935217195250263, + "flos": 21360816432000.0, + "grad_norm": 2.25045203731707, + "language_loss": 0.71212113, + "learning_rate": 9.068760101685971e-07, + "loss": 0.73321003, + "num_input_tokens_seen": 248895650, + "step": 11535, + "time_per_iteration": 2.68656849861145 + }, + { + "auxiliary_loss_clip": 0.01013917, + "auxiliary_loss_mlp": 0.01003832, + "balance_loss_clip": 1.00924766, + "balance_loss_mlp": 1.00264609, + "epoch": 0.6935818427776943, + "flos": 64063813115520.0, + "grad_norm": 0.7110018734854711, + "language_loss": 0.59062427, + "learning_rate": 9.065498884230638e-07, + "loss": 0.61080176, + "num_input_tokens_seen": 248963920, + "step": 11536, + "time_per_iteration": 3.347024917602539 + }, + { + "auxiliary_loss_clip": 0.0110154, + "auxiliary_loss_mlp": 0.00771293, + "balance_loss_clip": 1.04176164, + "balance_loss_mlp": 1.00036359, + "epoch": 0.6936419660303622, + "flos": 20302564913280.0, + "grad_norm": 1.511578579133692, + "language_loss": 0.72917026, + "learning_rate": 9.062238081412692e-07, + "loss": 0.74789858, + "num_input_tokens_seen": 248983380, + "step": 11537, + "time_per_iteration": 2.7138421535491943 + }, + { + "auxiliary_loss_clip": 0.01022423, + "auxiliary_loss_mlp": 0.00751474, + "balance_loss_clip": 1.0083034, + "balance_loss_mlp": 0.99969625, + "epoch": 0.6937020892830302, + "flos": 67182581347200.0, + "grad_norm": 0.7456734981947979, + "language_loss": 0.55525714, + "learning_rate": 9.058977693355767e-07, + "loss": 0.57299614, + "num_input_tokens_seen": 249044680, + "step": 11538, + "time_per_iteration": 3.1686036586761475 + }, + { + "auxiliary_loss_clip": 0.01097095, + "auxiliary_loss_mlp": 0.01037153, + "balance_loss_clip": 1.03834343, + "balance_loss_mlp": 1.02519631, + "epoch": 0.6937622125356981, + "flos": 23878190229120.0, + "grad_norm": 1.582889813468805, + "language_loss": 0.77747178, + "learning_rate": 9.055717720183505e-07, + "loss": 0.79881424, + "num_input_tokens_seen": 249061060, + "step": 11539, + "time_per_iteration": 2.7487149238586426 + }, + { + "auxiliary_loss_clip": 0.01088793, + "auxiliary_loss_mlp": 0.01029242, + "balance_loss_clip": 1.03841698, + "balance_loss_mlp": 1.01741016, + "epoch": 0.6938223357883662, + "flos": 28730619365760.0, + "grad_norm": 1.696359380020658, + "language_loss": 0.63957608, + "learning_rate": 9.05245816201953e-07, + "loss": 0.66075647, + "num_input_tokens_seen": 249081430, + "step": 11540, + "time_per_iteration": 2.897141456604004 + }, + { + "auxiliary_loss_clip": 0.01064567, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.0352695, + "balance_loss_mlp": 1.01913404, + "epoch": 0.6938824590410341, + "flos": 28655027193600.0, + "grad_norm": 1.5143087108308135, + "language_loss": 0.86776996, + "learning_rate": 9.049199018987437e-07, + "loss": 0.8887341, + "num_input_tokens_seen": 249103020, + "step": 11541, + "time_per_iteration": 2.790721893310547 + }, + { + "auxiliary_loss_clip": 0.01113533, + "auxiliary_loss_mlp": 0.00771014, + "balance_loss_clip": 1.04010653, + "balance_loss_mlp": 1.00017405, + "epoch": 0.6939425822937021, + "flos": 18983062800000.0, + "grad_norm": 2.0467106914623483, + "language_loss": 0.84313244, + "learning_rate": 9.04594029121081e-07, + "loss": 0.86197793, + "num_input_tokens_seen": 249120810, + "step": 11542, + "time_per_iteration": 2.6897356510162354 + }, + { + "auxiliary_loss_clip": 0.01101602, + "auxiliary_loss_mlp": 0.0103373, + "balance_loss_clip": 1.03908658, + "balance_loss_mlp": 1.01946616, + "epoch": 0.6940027055463701, + "flos": 23075838178560.0, + "grad_norm": 1.712845406510252, + "language_loss": 0.75460529, + "learning_rate": 9.04268197881323e-07, + "loss": 0.7759586, + "num_input_tokens_seen": 249138050, + "step": 11543, + "time_per_iteration": 2.6957714557647705 + }, + { + "auxiliary_loss_clip": 0.01092628, + "auxiliary_loss_mlp": 0.01030945, + "balance_loss_clip": 1.03984666, + "balance_loss_mlp": 1.01842248, + "epoch": 0.694062828799038, + "flos": 18186564666240.0, + "grad_norm": 1.7601740067431768, + "language_loss": 0.76118124, + "learning_rate": 9.039424081918241e-07, + "loss": 0.782417, + "num_input_tokens_seen": 249155570, + "step": 11544, + "time_per_iteration": 2.6654560565948486 + }, + { + "auxiliary_loss_clip": 0.01059106, + "auxiliary_loss_mlp": 0.01041973, + "balance_loss_clip": 1.03483558, + "balance_loss_mlp": 1.02701259, + "epoch": 0.694122952051706, + "flos": 17821532701440.0, + "grad_norm": 1.7077891138664472, + "language_loss": 0.71304005, + "learning_rate": 9.036166600649388e-07, + "loss": 0.73405087, + "num_input_tokens_seen": 249172960, + "step": 11545, + "time_per_iteration": 2.6869020462036133 + }, + { + "auxiliary_loss_clip": 0.0109854, + "auxiliary_loss_mlp": 0.01030108, + "balance_loss_clip": 1.04018188, + "balance_loss_mlp": 1.01828814, + "epoch": 0.694183075304374, + "flos": 21215306436480.0, + "grad_norm": 1.7532682541368763, + "language_loss": 0.79367101, + "learning_rate": 9.0329095351302e-07, + "loss": 0.8149575, + "num_input_tokens_seen": 249192450, + "step": 11546, + "time_per_iteration": 2.6320011615753174 + }, + { + "auxiliary_loss_clip": 0.01080505, + "auxiliary_loss_mlp": 0.01029935, + "balance_loss_clip": 1.03777122, + "balance_loss_mlp": 1.01704824, + "epoch": 0.694243198557042, + "flos": 24060508686720.0, + "grad_norm": 1.4008683277346297, + "language_loss": 0.78635859, + "learning_rate": 9.029652885484194e-07, + "loss": 0.80746305, + "num_input_tokens_seen": 249214320, + "step": 11547, + "time_per_iteration": 2.7307076454162598 + }, + { + "auxiliary_loss_clip": 0.010916, + "auxiliary_loss_mlp": 0.00771764, + "balance_loss_clip": 1.04151332, + "balance_loss_mlp": 1.00021195, + "epoch": 0.6943033218097099, + "flos": 21141869080320.0, + "grad_norm": 2.101396590702846, + "language_loss": 0.80507267, + "learning_rate": 9.026396651834834e-07, + "loss": 0.82370633, + "num_input_tokens_seen": 249230925, + "step": 11548, + "time_per_iteration": 4.426462650299072 + }, + { + "auxiliary_loss_clip": 0.01032364, + "auxiliary_loss_mlp": 0.0075149, + "balance_loss_clip": 1.00922537, + "balance_loss_mlp": 0.99970454, + "epoch": 0.6943634450623779, + "flos": 57812015975040.0, + "grad_norm": 0.6903286764237632, + "language_loss": 0.53703904, + "learning_rate": 9.023140834305613e-07, + "loss": 0.55487758, + "num_input_tokens_seen": 249293975, + "step": 11549, + "time_per_iteration": 3.1308066844940186 + }, + { + "auxiliary_loss_clip": 0.01093982, + "auxiliary_loss_mlp": 0.01035759, + "balance_loss_clip": 1.03542507, + "balance_loss_mlp": 1.02189505, + "epoch": 0.6944235683150458, + "flos": 30590684231040.0, + "grad_norm": 1.55426436192092, + "language_loss": 0.73198104, + "learning_rate": 9.01988543302e-07, + "loss": 0.75327837, + "num_input_tokens_seen": 249315285, + "step": 11550, + "time_per_iteration": 5.8028564453125 + }, + { + "auxiliary_loss_clip": 0.010896, + "auxiliary_loss_mlp": 0.01039664, + "balance_loss_clip": 1.04099548, + "balance_loss_mlp": 1.02650332, + "epoch": 0.6944836915677138, + "flos": 19719447523200.0, + "grad_norm": 1.9506864007678324, + "language_loss": 0.74081314, + "learning_rate": 9.016630448101425e-07, + "loss": 0.76210582, + "num_input_tokens_seen": 249333505, + "step": 11551, + "time_per_iteration": 2.665813446044922 + }, + { + "auxiliary_loss_clip": 0.01114588, + "auxiliary_loss_mlp": 0.01038306, + "balance_loss_clip": 1.0404079, + "balance_loss_mlp": 1.0249548, + "epoch": 0.6945438148203817, + "flos": 24863579009280.0, + "grad_norm": 1.5863003603219143, + "language_loss": 0.84288925, + "learning_rate": 9.01337587967333e-07, + "loss": 0.86441821, + "num_input_tokens_seen": 249354180, + "step": 11552, + "time_per_iteration": 2.8407604694366455 + }, + { + "auxiliary_loss_clip": 0.01112485, + "auxiliary_loss_mlp": 0.01035825, + "balance_loss_clip": 1.03997219, + "balance_loss_mlp": 1.02287877, + "epoch": 0.6946039380730498, + "flos": 33326646243840.0, + "grad_norm": 1.6205787984736582, + "language_loss": 0.6727165, + "learning_rate": 9.010121727859117e-07, + "loss": 0.69419956, + "num_input_tokens_seen": 249377035, + "step": 11553, + "time_per_iteration": 2.7572171688079834 + }, + { + "auxiliary_loss_clip": 0.01097133, + "auxiliary_loss_mlp": 0.01031798, + "balance_loss_clip": 1.04150629, + "balance_loss_mlp": 1.0176357, + "epoch": 0.6946640613257177, + "flos": 20850956830080.0, + "grad_norm": 2.0885031059024017, + "language_loss": 0.79817116, + "learning_rate": 9.006867992782195e-07, + "loss": 0.81946045, + "num_input_tokens_seen": 249396155, + "step": 11554, + "time_per_iteration": 2.721204996109009 + }, + { + "auxiliary_loss_clip": 0.01101639, + "auxiliary_loss_mlp": 0.01028417, + "balance_loss_clip": 1.03683937, + "balance_loss_mlp": 1.01538706, + "epoch": 0.6947241845783857, + "flos": 19354846521600.0, + "grad_norm": 5.498909507177023, + "language_loss": 0.72485244, + "learning_rate": 9.003614674565934e-07, + "loss": 0.746153, + "num_input_tokens_seen": 249414555, + "step": 11555, + "time_per_iteration": 2.5764734745025635 + }, + { + "auxiliary_loss_clip": 0.01075985, + "auxiliary_loss_mlp": 0.01033395, + "balance_loss_clip": 1.0355674, + "balance_loss_mlp": 1.02071118, + "epoch": 0.6947843078310536, + "flos": 27120240915840.0, + "grad_norm": 1.691992683709007, + "language_loss": 0.78099442, + "learning_rate": 9.000361773333705e-07, + "loss": 0.80208826, + "num_input_tokens_seen": 249433570, + "step": 11556, + "time_per_iteration": 2.709371328353882 + }, + { + "auxiliary_loss_clip": 0.01053238, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.03567553, + "balance_loss_mlp": 1.02977192, + "epoch": 0.6948444310837216, + "flos": 28585109370240.0, + "grad_norm": 2.67608324512941, + "language_loss": 0.6078257, + "learning_rate": 8.997109289208869e-07, + "loss": 0.62878561, + "num_input_tokens_seen": 249453735, + "step": 11557, + "time_per_iteration": 2.802755832672119 + }, + { + "auxiliary_loss_clip": 0.01091412, + "auxiliary_loss_mlp": 0.01036617, + "balance_loss_clip": 1.04582477, + "balance_loss_mlp": 1.02432072, + "epoch": 0.6949045543363896, + "flos": 15669262696320.0, + "grad_norm": 1.8868639353826757, + "language_loss": 0.85245895, + "learning_rate": 8.993857222314752e-07, + "loss": 0.87373924, + "num_input_tokens_seen": 249470805, + "step": 11558, + "time_per_iteration": 4.191239595413208 + }, + { + "auxiliary_loss_clip": 0.01103665, + "auxiliary_loss_mlp": 0.01036679, + "balance_loss_clip": 1.03848016, + "balance_loss_mlp": 1.02259421, + "epoch": 0.6949646775890576, + "flos": 23259413612160.0, + "grad_norm": 1.6011995670577914, + "language_loss": 0.70525056, + "learning_rate": 8.990605572774664e-07, + "loss": 0.72665399, + "num_input_tokens_seen": 249491150, + "step": 11559, + "time_per_iteration": 2.7076830863952637 + }, + { + "auxiliary_loss_clip": 0.01078357, + "auxiliary_loss_mlp": 0.01032244, + "balance_loss_clip": 1.03816533, + "balance_loss_mlp": 1.01998925, + "epoch": 0.6950248008417256, + "flos": 22382546797440.0, + "grad_norm": 2.0259020832909234, + "language_loss": 0.78594178, + "learning_rate": 8.987354340711921e-07, + "loss": 0.80704772, + "num_input_tokens_seen": 249511560, + "step": 11560, + "time_per_iteration": 2.7197508811950684 + }, + { + "auxiliary_loss_clip": 0.01087442, + "auxiliary_loss_mlp": 0.01034646, + "balance_loss_clip": 1.03931344, + "balance_loss_mlp": 1.0221293, + "epoch": 0.6950849240943935, + "flos": 23477355383040.0, + "grad_norm": 1.532648657325296, + "language_loss": 0.76758087, + "learning_rate": 8.9841035262498e-07, + "loss": 0.78880179, + "num_input_tokens_seen": 249531910, + "step": 11561, + "time_per_iteration": 2.707702159881592 + }, + { + "auxiliary_loss_clip": 0.01108982, + "auxiliary_loss_mlp": 0.01032613, + "balance_loss_clip": 1.03717422, + "balance_loss_mlp": 1.01877272, + "epoch": 0.6951450473470615, + "flos": 17420554200960.0, + "grad_norm": 1.812422200416747, + "language_loss": 0.78550988, + "learning_rate": 8.980853129511577e-07, + "loss": 0.80692589, + "num_input_tokens_seen": 249550300, + "step": 11562, + "time_per_iteration": 2.5765740871429443 + }, + { + "auxiliary_loss_clip": 0.01104346, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.0394088, + "balance_loss_mlp": 1.02134836, + "epoch": 0.6952051705997294, + "flos": 20485745297280.0, + "grad_norm": 1.9668484309221967, + "language_loss": 0.69117391, + "learning_rate": 8.977603150620515e-07, + "loss": 0.7125628, + "num_input_tokens_seen": 249567740, + "step": 11563, + "time_per_iteration": 2.6727218627929688 + }, + { + "auxiliary_loss_clip": 0.01090765, + "auxiliary_loss_mlp": 0.0102846, + "balance_loss_clip": 1.03766811, + "balance_loss_mlp": 1.0160023, + "epoch": 0.6952652938523974, + "flos": 13989541040640.0, + "grad_norm": 2.495686990019142, + "language_loss": 0.73530227, + "learning_rate": 8.974353589699846e-07, + "loss": 0.75649452, + "num_input_tokens_seen": 249582700, + "step": 11564, + "time_per_iteration": 2.576385259628296 + }, + { + "auxiliary_loss_clip": 0.01083646, + "auxiliary_loss_mlp": 0.01038821, + "balance_loss_clip": 1.04269266, + "balance_loss_mlp": 1.02250147, + "epoch": 0.6953254171050653, + "flos": 30953956429440.0, + "grad_norm": 1.8121742039667086, + "language_loss": 0.71753776, + "learning_rate": 8.971104446872785e-07, + "loss": 0.73876244, + "num_input_tokens_seen": 249602920, + "step": 11565, + "time_per_iteration": 2.732823133468628 + }, + { + "auxiliary_loss_clip": 0.01016312, + "auxiliary_loss_mlp": 0.01000486, + "balance_loss_clip": 1.01167345, + "balance_loss_mlp": 0.99898958, + "epoch": 0.6953855403577334, + "flos": 61670257499520.0, + "grad_norm": 0.9560441236848968, + "language_loss": 0.58358735, + "learning_rate": 8.96785572226255e-07, + "loss": 0.60375541, + "num_input_tokens_seen": 249660400, + "step": 11566, + "time_per_iteration": 3.0193676948547363 + }, + { + "auxiliary_loss_clip": 0.01081084, + "auxiliary_loss_mlp": 0.01031181, + "balance_loss_clip": 1.04008102, + "balance_loss_mlp": 1.01653004, + "epoch": 0.6954456636104013, + "flos": 23039029716480.0, + "grad_norm": 1.9855993328717996, + "language_loss": 0.7417689, + "learning_rate": 8.964607415992338e-07, + "loss": 0.76289153, + "num_input_tokens_seen": 249679335, + "step": 11567, + "time_per_iteration": 2.72933030128479 + }, + { + "auxiliary_loss_clip": 0.01081196, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.03550458, + "balance_loss_mlp": 1.02039182, + "epoch": 0.6955057868630693, + "flos": 23918518224000.0, + "grad_norm": 1.2846819146580761, + "language_loss": 0.76948917, + "learning_rate": 8.961359528185313e-07, + "loss": 0.79064202, + "num_input_tokens_seen": 249701805, + "step": 11568, + "time_per_iteration": 2.715871572494507 + }, + { + "auxiliary_loss_clip": 0.01096832, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.04105902, + "balance_loss_mlp": 1.02265501, + "epoch": 0.6955659101157372, + "flos": 22594634651520.0, + "grad_norm": 1.619102090378134, + "language_loss": 0.72502244, + "learning_rate": 8.958112058964649e-07, + "loss": 0.74634463, + "num_input_tokens_seen": 249720550, + "step": 11569, + "time_per_iteration": 2.645249366760254 + }, + { + "auxiliary_loss_clip": 0.01091211, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.04227805, + "balance_loss_mlp": 1.01993299, + "epoch": 0.6956260333684052, + "flos": 24572523104640.0, + "grad_norm": 1.7249170948582337, + "language_loss": 0.76852113, + "learning_rate": 8.954865008453471e-07, + "loss": 0.78976554, + "num_input_tokens_seen": 249740325, + "step": 11570, + "time_per_iteration": 2.7455241680145264 + }, + { + "auxiliary_loss_clip": 0.01102536, + "auxiliary_loss_mlp": 0.01035635, + "balance_loss_clip": 1.03880751, + "balance_loss_mlp": 1.0223434, + "epoch": 0.6956861566210732, + "flos": 25846058787840.0, + "grad_norm": 2.852635379776112, + "language_loss": 0.7431376, + "learning_rate": 8.95161837677493e-07, + "loss": 0.76451933, + "num_input_tokens_seen": 249760570, + "step": 11571, + "time_per_iteration": 2.6448328495025635 + }, + { + "auxiliary_loss_clip": 0.01094888, + "auxiliary_loss_mlp": 0.01033197, + "balance_loss_clip": 1.03635645, + "balance_loss_mlp": 1.01997066, + "epoch": 0.6957462798737412, + "flos": 15301393557120.0, + "grad_norm": 1.759907053555304, + "language_loss": 0.74442685, + "learning_rate": 8.948372164052118e-07, + "loss": 0.76570773, + "num_input_tokens_seen": 249778290, + "step": 11572, + "time_per_iteration": 2.6260786056518555 + }, + { + "auxiliary_loss_clip": 0.01089599, + "auxiliary_loss_mlp": 0.01029265, + "balance_loss_clip": 1.03659272, + "balance_loss_mlp": 1.01614022, + "epoch": 0.6958064031264092, + "flos": 36246830135040.0, + "grad_norm": 1.9436396296550662, + "language_loss": 0.7025919, + "learning_rate": 8.94512637040814e-07, + "loss": 0.72378051, + "num_input_tokens_seen": 249800925, + "step": 11573, + "time_per_iteration": 2.783256769180298 + }, + { + "auxiliary_loss_clip": 0.0109259, + "auxiliary_loss_mlp": 0.01036465, + "balance_loss_clip": 1.04023504, + "balance_loss_mlp": 1.02252948, + "epoch": 0.6958665263790771, + "flos": 19208725994880.0, + "grad_norm": 2.0706527554899137, + "language_loss": 0.75003505, + "learning_rate": 8.941880995966095e-07, + "loss": 0.77132565, + "num_input_tokens_seen": 249820500, + "step": 11574, + "time_per_iteration": 2.684457540512085 + }, + { + "auxiliary_loss_clip": 0.01077067, + "auxiliary_loss_mlp": 0.01034435, + "balance_loss_clip": 1.03447127, + "balance_loss_mlp": 1.02117276, + "epoch": 0.6959266496317451, + "flos": 21795838047360.0, + "grad_norm": 1.601224427976484, + "language_loss": 0.74403846, + "learning_rate": 8.938636040849014e-07, + "loss": 0.76515353, + "num_input_tokens_seen": 249839845, + "step": 11575, + "time_per_iteration": 2.7856502532958984 + }, + { + "auxiliary_loss_clip": 0.01102844, + "auxiliary_loss_mlp": 0.0103293, + "balance_loss_clip": 1.03945291, + "balance_loss_mlp": 1.01965618, + "epoch": 0.695986772884413, + "flos": 20558248899840.0, + "grad_norm": 1.7874437641987468, + "language_loss": 0.78887069, + "learning_rate": 8.935391505179966e-07, + "loss": 0.81022847, + "num_input_tokens_seen": 249857400, + "step": 11576, + "time_per_iteration": 2.6610217094421387 + }, + { + "auxiliary_loss_clip": 0.01068698, + "auxiliary_loss_mlp": 0.01032851, + "balance_loss_clip": 1.03670073, + "balance_loss_mlp": 1.02041745, + "epoch": 0.696046896137081, + "flos": 14936217937920.0, + "grad_norm": 2.693764444619471, + "language_loss": 0.567918, + "learning_rate": 8.932147389081985e-07, + "loss": 0.58893347, + "num_input_tokens_seen": 249871645, + "step": 11577, + "time_per_iteration": 2.666973114013672 + }, + { + "auxiliary_loss_clip": 0.01034011, + "auxiliary_loss_mlp": 0.01034025, + "balance_loss_clip": 1.03415358, + "balance_loss_mlp": 1.02244925, + "epoch": 0.696107019389749, + "flos": 30740216549760.0, + "grad_norm": 1.3814908758376254, + "language_loss": 0.77030635, + "learning_rate": 8.928903692678081e-07, + "loss": 0.79098672, + "num_input_tokens_seen": 249894215, + "step": 11578, + "time_per_iteration": 2.8858745098114014 + }, + { + "auxiliary_loss_clip": 0.01078498, + "auxiliary_loss_mlp": 0.01037884, + "balance_loss_clip": 1.03798914, + "balance_loss_mlp": 1.02474117, + "epoch": 0.696167142642417, + "flos": 20776729374720.0, + "grad_norm": 1.8210944500658799, + "language_loss": 0.79498136, + "learning_rate": 8.925660416091254e-07, + "loss": 0.81614518, + "num_input_tokens_seen": 249912850, + "step": 11579, + "time_per_iteration": 2.664579153060913 + }, + { + "auxiliary_loss_clip": 0.01072667, + "auxiliary_loss_mlp": 0.01030035, + "balance_loss_clip": 1.03569424, + "balance_loss_mlp": 1.01685047, + "epoch": 0.6962272658950849, + "flos": 22565152563840.0, + "grad_norm": 1.691551451223947, + "language_loss": 0.72261667, + "learning_rate": 8.922417559444502e-07, + "loss": 0.7436437, + "num_input_tokens_seen": 249932650, + "step": 11580, + "time_per_iteration": 2.61865496635437 + }, + { + "auxiliary_loss_clip": 0.01096209, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.04078209, + "balance_loss_mlp": 1.01977623, + "epoch": 0.6962873891477529, + "flos": 22200156512640.0, + "grad_norm": 2.100362129300219, + "language_loss": 0.65822804, + "learning_rate": 8.919175122860787e-07, + "loss": 0.67952627, + "num_input_tokens_seen": 249951205, + "step": 11581, + "time_per_iteration": 2.559589385986328 + }, + { + "auxiliary_loss_clip": 0.0111328, + "auxiliary_loss_mlp": 0.01033054, + "balance_loss_clip": 1.03978491, + "balance_loss_mlp": 1.02038205, + "epoch": 0.6963475124004208, + "flos": 12489695717760.0, + "grad_norm": 1.963411305318447, + "language_loss": 0.76478052, + "learning_rate": 8.915933106463056e-07, + "loss": 0.78624392, + "num_input_tokens_seen": 249967045, + "step": 11582, + "time_per_iteration": 2.4627199172973633 + }, + { + "auxiliary_loss_clip": 0.01086826, + "auxiliary_loss_mlp": 0.01032933, + "balance_loss_clip": 1.03617883, + "balance_loss_mlp": 1.0209347, + "epoch": 0.6964076356530888, + "flos": 17165085696000.0, + "grad_norm": 2.040960017039001, + "language_loss": 0.69914186, + "learning_rate": 8.91269151037425e-07, + "loss": 0.72033942, + "num_input_tokens_seen": 249984565, + "step": 11583, + "time_per_iteration": 2.5302255153656006 + }, + { + "auxiliary_loss_clip": 0.01087174, + "auxiliary_loss_mlp": 0.01035499, + "balance_loss_clip": 1.0447557, + "balance_loss_mlp": 1.02192092, + "epoch": 0.6964677589057569, + "flos": 19937317466880.0, + "grad_norm": 4.628034393643301, + "language_loss": 0.8247509, + "learning_rate": 8.909450334717301e-07, + "loss": 0.84597766, + "num_input_tokens_seen": 250004235, + "step": 11584, + "time_per_iteration": 2.6446306705474854 + }, + { + "auxiliary_loss_clip": 0.01064623, + "auxiliary_loss_mlp": 0.01039599, + "balance_loss_clip": 1.04226518, + "balance_loss_mlp": 1.02518678, + "epoch": 0.6965278821584248, + "flos": 22784064001920.0, + "grad_norm": 2.204044356312338, + "language_loss": 0.80097932, + "learning_rate": 8.906209579615107e-07, + "loss": 0.82202154, + "num_input_tokens_seen": 250017645, + "step": 11585, + "time_per_iteration": 2.739288568496704 + }, + { + "auxiliary_loss_clip": 0.01109133, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.03880882, + "balance_loss_mlp": 1.02158463, + "epoch": 0.6965880054110928, + "flos": 20047563285120.0, + "grad_norm": 1.6413269485286424, + "language_loss": 0.7727071, + "learning_rate": 8.90296924519055e-07, + "loss": 0.79413539, + "num_input_tokens_seen": 250037640, + "step": 11586, + "time_per_iteration": 2.624624013900757 + }, + { + "auxiliary_loss_clip": 0.01098392, + "auxiliary_loss_mlp": 0.01031126, + "balance_loss_clip": 1.03904796, + "balance_loss_mlp": 1.0188241, + "epoch": 0.6966481286637607, + "flos": 21908238681600.0, + "grad_norm": 1.6313830444330661, + "language_loss": 0.78602171, + "learning_rate": 8.899729331566519e-07, + "loss": 0.8073169, + "num_input_tokens_seen": 250056490, + "step": 11587, + "time_per_iteration": 4.355099439620972 + }, + { + "auxiliary_loss_clip": 0.01088702, + "auxiliary_loss_mlp": 0.01033313, + "balance_loss_clip": 1.04040754, + "balance_loss_mlp": 1.02037311, + "epoch": 0.6967082519164287, + "flos": 15633172506240.0, + "grad_norm": 1.7929628743992274, + "language_loss": 0.72862899, + "learning_rate": 8.896489838865857e-07, + "loss": 0.74984908, + "num_input_tokens_seen": 250074285, + "step": 11588, + "time_per_iteration": 2.609231472015381 + }, + { + "auxiliary_loss_clip": 0.01084626, + "auxiliary_loss_mlp": 0.01027673, + "balance_loss_clip": 1.03783381, + "balance_loss_mlp": 1.01608634, + "epoch": 0.6967683751690966, + "flos": 24024598064640.0, + "grad_norm": 1.8203274112017525, + "language_loss": 0.75158805, + "learning_rate": 8.893250767211413e-07, + "loss": 0.77271104, + "num_input_tokens_seen": 250093350, + "step": 11589, + "time_per_iteration": 4.13300895690918 + }, + { + "auxiliary_loss_clip": 0.01092018, + "auxiliary_loss_mlp": 0.01029816, + "balance_loss_clip": 1.0398705, + "balance_loss_mlp": 1.01764512, + "epoch": 0.6968284984217646, + "flos": 31024700265600.0, + "grad_norm": 2.003391247755446, + "language_loss": 0.63547194, + "learning_rate": 8.890012116726012e-07, + "loss": 0.6566903, + "num_input_tokens_seen": 250114170, + "step": 11590, + "time_per_iteration": 4.382747411727905 + }, + { + "auxiliary_loss_clip": 0.0099554, + "auxiliary_loss_mlp": 0.01020589, + "balance_loss_clip": 1.01743388, + "balance_loss_mlp": 1.01875329, + "epoch": 0.6968886216744326, + "flos": 67622990002560.0, + "grad_norm": 0.7509859568380568, + "language_loss": 0.61225605, + "learning_rate": 8.88677388753248e-07, + "loss": 0.63241732, + "num_input_tokens_seen": 250178250, + "step": 11591, + "time_per_iteration": 3.3300459384918213 + }, + { + "auxiliary_loss_clip": 0.01070341, + "auxiliary_loss_mlp": 0.00770828, + "balance_loss_clip": 1.04759347, + "balance_loss_mlp": 1.0002979, + "epoch": 0.6969487449271006, + "flos": 24863686750080.0, + "grad_norm": 3.0435355552778893, + "language_loss": 0.69087148, + "learning_rate": 8.883536079753582e-07, + "loss": 0.70928317, + "num_input_tokens_seen": 250198420, + "step": 11592, + "time_per_iteration": 2.8862390518188477 + }, + { + "auxiliary_loss_clip": 0.010765, + "auxiliary_loss_mlp": 0.01030332, + "balance_loss_clip": 1.03645754, + "balance_loss_mlp": 1.01758289, + "epoch": 0.6970088681797685, + "flos": 28767858791040.0, + "grad_norm": 1.559625176666528, + "language_loss": 0.6217618, + "learning_rate": 8.880298693512109e-07, + "loss": 0.64283013, + "num_input_tokens_seen": 250220650, + "step": 11593, + "time_per_iteration": 2.743360757827759 + }, + { + "auxiliary_loss_clip": 0.01085759, + "auxiliary_loss_mlp": 0.01027148, + "balance_loss_clip": 1.0385946, + "balance_loss_mlp": 1.01526248, + "epoch": 0.6970689914324365, + "flos": 27308556944640.0, + "grad_norm": 4.989009563072009, + "language_loss": 0.54315436, + "learning_rate": 8.877061728930832e-07, + "loss": 0.56428343, + "num_input_tokens_seen": 250241750, + "step": 11594, + "time_per_iteration": 2.738746404647827 + }, + { + "auxiliary_loss_clip": 0.01100892, + "auxiliary_loss_mlp": 0.01029271, + "balance_loss_clip": 1.04011106, + "balance_loss_mlp": 1.01718903, + "epoch": 0.6971291146851044, + "flos": 19136258305920.0, + "grad_norm": 1.879143273787494, + "language_loss": 0.76764494, + "learning_rate": 8.87382518613248e-07, + "loss": 0.78894663, + "num_input_tokens_seen": 250259445, + "step": 11595, + "time_per_iteration": 2.6188101768493652 + }, + { + "auxiliary_loss_clip": 0.01091633, + "auxiliary_loss_mlp": 0.00771425, + "balance_loss_clip": 1.04053104, + "balance_loss_mlp": 1.00017834, + "epoch": 0.6971892379377724, + "flos": 14610508387200.0, + "grad_norm": 3.031219644590871, + "language_loss": 0.71711326, + "learning_rate": 8.870589065239793e-07, + "loss": 0.73574388, + "num_input_tokens_seen": 250275640, + "step": 11596, + "time_per_iteration": 4.301288843154907 + }, + { + "auxiliary_loss_clip": 0.01114621, + "auxiliary_loss_mlp": 0.0103195, + "balance_loss_clip": 1.04142773, + "balance_loss_mlp": 1.0187242, + "epoch": 0.6972493611904405, + "flos": 22307457415680.0, + "grad_norm": 1.6942281967354775, + "language_loss": 0.76373446, + "learning_rate": 8.867353366375492e-07, + "loss": 0.78520012, + "num_input_tokens_seen": 250296435, + "step": 11597, + "time_per_iteration": 2.6642062664031982 + }, + { + "auxiliary_loss_clip": 0.0110086, + "auxiliary_loss_mlp": 0.01034036, + "balance_loss_clip": 1.03868103, + "balance_loss_mlp": 1.02113771, + "epoch": 0.6973094844431084, + "flos": 17420374632960.0, + "grad_norm": 1.9087026852292102, + "language_loss": 0.74286294, + "learning_rate": 8.864118089662267e-07, + "loss": 0.76421189, + "num_input_tokens_seen": 250314035, + "step": 11598, + "time_per_iteration": 2.6227927207946777 + }, + { + "auxiliary_loss_clip": 0.01097599, + "auxiliary_loss_mlp": 0.01035152, + "balance_loss_clip": 1.04099619, + "balance_loss_mlp": 1.02143049, + "epoch": 0.6973696076957764, + "flos": 27235370983680.0, + "grad_norm": 1.7329159354231054, + "language_loss": 0.89613545, + "learning_rate": 8.860883235222791e-07, + "loss": 0.91746294, + "num_input_tokens_seen": 250332995, + "step": 11599, + "time_per_iteration": 2.6820266246795654 + }, + { + "auxiliary_loss_clip": 0.0111129, + "auxiliary_loss_mlp": 0.01041078, + "balance_loss_clip": 1.04336691, + "balance_loss_mlp": 1.02599812, + "epoch": 0.6974297309484443, + "flos": 22018089450240.0, + "grad_norm": 2.876326408269763, + "language_loss": 0.69646597, + "learning_rate": 8.85764880317974e-07, + "loss": 0.71798968, + "num_input_tokens_seen": 250352120, + "step": 11600, + "time_per_iteration": 2.6357643604278564 + }, + { + "auxiliary_loss_clip": 0.01071835, + "auxiliary_loss_mlp": 0.01034549, + "balance_loss_clip": 1.03591609, + "balance_loss_mlp": 1.02162719, + "epoch": 0.6974898542011123, + "flos": 28366449327360.0, + "grad_norm": 1.8870788782898071, + "language_loss": 0.77037942, + "learning_rate": 8.854414793655771e-07, + "loss": 0.79144335, + "num_input_tokens_seen": 250371705, + "step": 11601, + "time_per_iteration": 2.767747402191162 + }, + { + "auxiliary_loss_clip": 0.01095268, + "auxiliary_loss_mlp": 0.00769859, + "balance_loss_clip": 1.03727877, + "balance_loss_mlp": 1.0001725, + "epoch": 0.6975499774537802, + "flos": 15232050351360.0, + "grad_norm": 1.9951550875439237, + "language_loss": 0.7223537, + "learning_rate": 8.851181206773508e-07, + "loss": 0.74100494, + "num_input_tokens_seen": 250390485, + "step": 11602, + "time_per_iteration": 2.7312278747558594 + }, + { + "auxiliary_loss_clip": 0.0109282, + "auxiliary_loss_mlp": 0.00770776, + "balance_loss_clip": 1.03932607, + "balance_loss_mlp": 1.00030899, + "epoch": 0.6976101007064482, + "flos": 22157422306560.0, + "grad_norm": 2.0827075826583115, + "language_loss": 0.76365876, + "learning_rate": 8.847948042655567e-07, + "loss": 0.78229469, + "num_input_tokens_seen": 250407020, + "step": 11603, + "time_per_iteration": 2.689286231994629 + }, + { + "auxiliary_loss_clip": 0.01062872, + "auxiliary_loss_mlp": 0.01031896, + "balance_loss_clip": 1.03511834, + "balance_loss_mlp": 1.01951063, + "epoch": 0.6976702239591162, + "flos": 22273522041600.0, + "grad_norm": 1.585564011892126, + "language_loss": 0.62287712, + "learning_rate": 8.844715301424557e-07, + "loss": 0.64382482, + "num_input_tokens_seen": 250425880, + "step": 11604, + "time_per_iteration": 2.7053442001342773 + }, + { + "auxiliary_loss_clip": 0.01097384, + "auxiliary_loss_mlp": 0.01033963, + "balance_loss_clip": 1.0394032, + "balance_loss_mlp": 1.01989651, + "epoch": 0.6977303472117842, + "flos": 25848608653440.0, + "grad_norm": 2.5387493951629954, + "language_loss": 0.82072401, + "learning_rate": 8.841482983203057e-07, + "loss": 0.8420375, + "num_input_tokens_seen": 250442925, + "step": 11605, + "time_per_iteration": 2.62129545211792 + }, + { + "auxiliary_loss_clip": 0.01101547, + "auxiliary_loss_mlp": 0.01035684, + "balance_loss_clip": 1.03945065, + "balance_loss_mlp": 1.02364397, + "epoch": 0.6977904704644521, + "flos": 20959586536320.0, + "grad_norm": 1.5446115393296036, + "language_loss": 0.70200372, + "learning_rate": 8.838251088113638e-07, + "loss": 0.72337604, + "num_input_tokens_seen": 250461220, + "step": 11606, + "time_per_iteration": 2.5922529697418213 + }, + { + "auxiliary_loss_clip": 0.01092847, + "auxiliary_loss_mlp": 0.01030242, + "balance_loss_clip": 1.03967309, + "balance_loss_mlp": 1.01759934, + "epoch": 0.6978505937171201, + "flos": 22055041566720.0, + "grad_norm": 2.256488814952284, + "language_loss": 0.82503331, + "learning_rate": 8.835019616278856e-07, + "loss": 0.84626418, + "num_input_tokens_seen": 250480975, + "step": 11607, + "time_per_iteration": 2.6494314670562744 + }, + { + "auxiliary_loss_clip": 0.0109393, + "auxiliary_loss_mlp": 0.01035205, + "balance_loss_clip": 1.04016328, + "balance_loss_mlp": 1.02121639, + "epoch": 0.697910716969788, + "flos": 20043720529920.0, + "grad_norm": 1.8306370219101196, + "language_loss": 0.78975695, + "learning_rate": 8.831788567821265e-07, + "loss": 0.81104833, + "num_input_tokens_seen": 250497980, + "step": 11608, + "time_per_iteration": 2.6512763500213623 + }, + { + "auxiliary_loss_clip": 0.01095127, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.03882265, + "balance_loss_mlp": 1.02155733, + "epoch": 0.697970840222456, + "flos": 15888245961600.0, + "grad_norm": 2.0581522063930104, + "language_loss": 0.89782465, + "learning_rate": 8.828557942863357e-07, + "loss": 0.91912538, + "num_input_tokens_seen": 250511910, + "step": 11609, + "time_per_iteration": 2.608104944229126 + }, + { + "auxiliary_loss_clip": 0.01078996, + "auxiliary_loss_mlp": 0.01028801, + "balance_loss_clip": 1.03936923, + "balance_loss_mlp": 1.0155983, + "epoch": 0.698030963475124, + "flos": 21215629658880.0, + "grad_norm": 1.5744545790773263, + "language_loss": 0.63836277, + "learning_rate": 8.82532774152765e-07, + "loss": 0.65944076, + "num_input_tokens_seen": 250531090, + "step": 11610, + "time_per_iteration": 2.743638038635254 + }, + { + "auxiliary_loss_clip": 0.0108087, + "auxiliary_loss_mlp": 0.01031692, + "balance_loss_clip": 1.03968239, + "balance_loss_mlp": 1.01942515, + "epoch": 0.698091086727792, + "flos": 33759728524800.0, + "grad_norm": 1.9401424134223804, + "language_loss": 0.84452772, + "learning_rate": 8.822097963936643e-07, + "loss": 0.86565328, + "num_input_tokens_seen": 250551565, + "step": 11611, + "time_per_iteration": 2.840013265609741 + }, + { + "auxiliary_loss_clip": 0.01102996, + "auxiliary_loss_mlp": 0.01033816, + "balance_loss_clip": 1.03944659, + "balance_loss_mlp": 1.02075076, + "epoch": 0.69815120998046, + "flos": 15887850912000.0, + "grad_norm": 1.998962318660509, + "language_loss": 0.70772141, + "learning_rate": 8.818868610212793e-07, + "loss": 0.7290895, + "num_input_tokens_seen": 250569625, + "step": 11612, + "time_per_iteration": 2.625783681869507 + }, + { + "auxiliary_loss_clip": 0.01094811, + "auxiliary_loss_mlp": 0.01031465, + "balance_loss_clip": 1.03740323, + "balance_loss_mlp": 1.01831591, + "epoch": 0.6982113332331279, + "flos": 18947044437120.0, + "grad_norm": 1.6349871835857621, + "language_loss": 0.80866611, + "learning_rate": 8.815639680478573e-07, + "loss": 0.82992887, + "num_input_tokens_seen": 250586960, + "step": 11613, + "time_per_iteration": 2.601461887359619 + }, + { + "auxiliary_loss_clip": 0.0110142, + "auxiliary_loss_mlp": 0.01035984, + "balance_loss_clip": 1.03952324, + "balance_loss_mlp": 1.02403307, + "epoch": 0.6982714564857959, + "flos": 24389594115840.0, + "grad_norm": 1.8644816544131795, + "language_loss": 0.75648409, + "learning_rate": 8.812411174856411e-07, + "loss": 0.77785814, + "num_input_tokens_seen": 250605080, + "step": 11614, + "time_per_iteration": 2.5961225032806396 + }, + { + "auxiliary_loss_clip": 0.01054504, + "auxiliary_loss_mlp": 0.01034441, + "balance_loss_clip": 1.04426599, + "balance_loss_mlp": 1.0214231, + "epoch": 0.6983315797384638, + "flos": 20083725302400.0, + "grad_norm": 2.0397367451391033, + "language_loss": 0.77191758, + "learning_rate": 8.809183093468746e-07, + "loss": 0.79280698, + "num_input_tokens_seen": 250623965, + "step": 11615, + "time_per_iteration": 2.9072482585906982 + }, + { + "auxiliary_loss_clip": 0.01083429, + "auxiliary_loss_mlp": 0.01032942, + "balance_loss_clip": 1.03927791, + "balance_loss_mlp": 1.0201925, + "epoch": 0.6983917029911318, + "flos": 13512431664000.0, + "grad_norm": 1.8809513154033768, + "language_loss": 0.73199165, + "learning_rate": 8.80595543643797e-07, + "loss": 0.75315541, + "num_input_tokens_seen": 250640675, + "step": 11616, + "time_per_iteration": 2.961540937423706 + }, + { + "auxiliary_loss_clip": 0.01114861, + "auxiliary_loss_mlp": 0.01037909, + "balance_loss_clip": 1.04299331, + "balance_loss_mlp": 1.02498102, + "epoch": 0.6984518262437998, + "flos": 22018412672640.0, + "grad_norm": 1.6387107954550246, + "language_loss": 0.84313893, + "learning_rate": 8.802728203886487e-07, + "loss": 0.86466658, + "num_input_tokens_seen": 250660295, + "step": 11617, + "time_per_iteration": 2.586671829223633 + }, + { + "auxiliary_loss_clip": 0.01074632, + "auxiliary_loss_mlp": 0.01043262, + "balance_loss_clip": 1.03665471, + "balance_loss_mlp": 1.02901638, + "epoch": 0.6985119494964678, + "flos": 18770615809920.0, + "grad_norm": 2.7737807419222102, + "language_loss": 0.59687322, + "learning_rate": 8.799501395936682e-07, + "loss": 0.61805212, + "num_input_tokens_seen": 250678155, + "step": 11618, + "time_per_iteration": 2.6617705821990967 + }, + { + "auxiliary_loss_clip": 0.01090766, + "auxiliary_loss_mlp": 0.01037543, + "balance_loss_clip": 1.04022956, + "balance_loss_mlp": 1.02521658, + "epoch": 0.6985720727491357, + "flos": 22382834106240.0, + "grad_norm": 1.6558560034629248, + "language_loss": 0.83071142, + "learning_rate": 8.796275012710903e-07, + "loss": 0.85199451, + "num_input_tokens_seen": 250697230, + "step": 11619, + "time_per_iteration": 2.6859943866729736 + }, + { + "auxiliary_loss_clip": 0.0109875, + "auxiliary_loss_mlp": 0.01029593, + "balance_loss_clip": 1.04005098, + "balance_loss_mlp": 1.01863778, + "epoch": 0.6986321960018037, + "flos": 39567884785920.0, + "grad_norm": 1.8289485555178766, + "language_loss": 0.67044997, + "learning_rate": 8.793049054331494e-07, + "loss": 0.69173336, + "num_input_tokens_seen": 250719865, + "step": 11620, + "time_per_iteration": 2.7397263050079346 + }, + { + "auxiliary_loss_clip": 0.01062849, + "auxiliary_loss_mlp": 0.01030294, + "balance_loss_clip": 1.03714263, + "balance_loss_mlp": 1.01732397, + "epoch": 0.6986923192544716, + "flos": 17967725055360.0, + "grad_norm": 2.0135534332411353, + "language_loss": 0.72860134, + "learning_rate": 8.789823520920794e-07, + "loss": 0.7495327, + "num_input_tokens_seen": 250736565, + "step": 11621, + "time_per_iteration": 2.731579303741455 + }, + { + "auxiliary_loss_clip": 0.01060219, + "auxiliary_loss_mlp": 0.01043603, + "balance_loss_clip": 1.03684866, + "balance_loss_mlp": 1.02971494, + "epoch": 0.6987524425071396, + "flos": 25594325297280.0, + "grad_norm": 2.5508212623422573, + "language_loss": 0.68065464, + "learning_rate": 8.7865984126011e-07, + "loss": 0.70169282, + "num_input_tokens_seen": 250757235, + "step": 11622, + "time_per_iteration": 2.7399957180023193 + }, + { + "auxiliary_loss_clip": 0.01044503, + "auxiliary_loss_mlp": 0.01029486, + "balance_loss_clip": 1.03589344, + "balance_loss_mlp": 1.01729143, + "epoch": 0.6988125657598077, + "flos": 17530081747200.0, + "grad_norm": 4.399012607705736, + "language_loss": 0.62462044, + "learning_rate": 8.783373729494721e-07, + "loss": 0.64536035, + "num_input_tokens_seen": 250775585, + "step": 11623, + "time_per_iteration": 2.712000846862793 + }, + { + "auxiliary_loss_clip": 0.01115272, + "auxiliary_loss_mlp": 0.01029213, + "balance_loss_clip": 1.03893709, + "balance_loss_mlp": 1.01590955, + "epoch": 0.6988726890124756, + "flos": 39165721136640.0, + "grad_norm": 1.8283272495674747, + "language_loss": 0.6077522, + "learning_rate": 8.780149471723932e-07, + "loss": 0.62919706, + "num_input_tokens_seen": 250795725, + "step": 11624, + "time_per_iteration": 2.765336275100708 + }, + { + "auxiliary_loss_clip": 0.01103178, + "auxiliary_loss_mlp": 0.01043279, + "balance_loss_clip": 1.0379796, + "balance_loss_mlp": 1.02959347, + "epoch": 0.6989328122651436, + "flos": 20193468330240.0, + "grad_norm": 3.187254618002262, + "language_loss": 0.78135574, + "learning_rate": 8.776925639411017e-07, + "loss": 0.80282032, + "num_input_tokens_seen": 250814555, + "step": 11625, + "time_per_iteration": 2.673025608062744 + }, + { + "auxiliary_loss_clip": 0.01074244, + "auxiliary_loss_mlp": 0.01033485, + "balance_loss_clip": 1.03602779, + "balance_loss_mlp": 1.0217849, + "epoch": 0.6989929355178115, + "flos": 21834873152640.0, + "grad_norm": 2.0895075471451903, + "language_loss": 0.65869164, + "learning_rate": 8.773702232678188e-07, + "loss": 0.67976898, + "num_input_tokens_seen": 250833105, + "step": 11626, + "time_per_iteration": 4.503946542739868 + }, + { + "auxiliary_loss_clip": 0.01092456, + "auxiliary_loss_mlp": 0.00770949, + "balance_loss_clip": 1.03971887, + "balance_loss_mlp": 1.000265, + "epoch": 0.6990530587704795, + "flos": 26322880855680.0, + "grad_norm": 1.5700426870038287, + "language_loss": 0.70198143, + "learning_rate": 8.770479251647697e-07, + "loss": 0.72061551, + "num_input_tokens_seen": 250852570, + "step": 11627, + "time_per_iteration": 2.7615082263946533 + }, + { + "auxiliary_loss_clip": 0.01110072, + "auxiliary_loss_mlp": 0.01029931, + "balance_loss_clip": 1.04070234, + "balance_loss_mlp": 1.01854658, + "epoch": 0.6991131820231474, + "flos": 19828975069440.0, + "grad_norm": 1.7298548393631112, + "language_loss": 0.6256994, + "learning_rate": 8.767256696441768e-07, + "loss": 0.64709944, + "num_input_tokens_seen": 250870500, + "step": 11628, + "time_per_iteration": 2.5734152793884277 + }, + { + "auxiliary_loss_clip": 0.01103325, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.03856647, + "balance_loss_mlp": 1.02237749, + "epoch": 0.6991733052758154, + "flos": 33984817102080.0, + "grad_norm": 2.290690432267753, + "language_loss": 0.67708141, + "learning_rate": 8.764034567182581e-07, + "loss": 0.69847167, + "num_input_tokens_seen": 250892745, + "step": 11629, + "time_per_iteration": 5.866469621658325 + }, + { + "auxiliary_loss_clip": 0.01112912, + "auxiliary_loss_mlp": 0.01036564, + "balance_loss_clip": 1.04074121, + "balance_loss_mlp": 1.02318311, + "epoch": 0.6992334285284834, + "flos": 15633136592640.0, + "grad_norm": 1.543023133812331, + "language_loss": 0.72312945, + "learning_rate": 8.760812863992337e-07, + "loss": 0.74462426, + "num_input_tokens_seen": 250910225, + "step": 11630, + "time_per_iteration": 2.657487392425537 + }, + { + "auxiliary_loss_clip": 0.01113352, + "auxiliary_loss_mlp": 0.01034612, + "balance_loss_clip": 1.04170883, + "balance_loss_mlp": 1.02198827, + "epoch": 0.6992935517811514, + "flos": 21726279360000.0, + "grad_norm": 1.59875756731347, + "language_loss": 0.73932934, + "learning_rate": 8.757591586993196e-07, + "loss": 0.76080894, + "num_input_tokens_seen": 250929715, + "step": 11631, + "time_per_iteration": 2.5861480236053467 + }, + { + "auxiliary_loss_clip": 0.01104832, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.0415566, + "balance_loss_mlp": 1.02083039, + "epoch": 0.6993536750338193, + "flos": 20115254465280.0, + "grad_norm": 2.102391376159487, + "language_loss": 0.89547968, + "learning_rate": 8.7543707363073e-07, + "loss": 0.91687244, + "num_input_tokens_seen": 250944230, + "step": 11632, + "time_per_iteration": 2.590348482131958 + }, + { + "auxiliary_loss_clip": 0.01094827, + "auxiliary_loss_mlp": 0.01040482, + "balance_loss_clip": 1.04256976, + "balance_loss_mlp": 1.02795899, + "epoch": 0.6994137982864873, + "flos": 22010547594240.0, + "grad_norm": 1.8034038956889087, + "language_loss": 0.80041152, + "learning_rate": 8.751150312056792e-07, + "loss": 0.82176459, + "num_input_tokens_seen": 250961865, + "step": 11633, + "time_per_iteration": 2.681643486022949 + }, + { + "auxiliary_loss_clip": 0.011161, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.04005051, + "balance_loss_mlp": 1.02334523, + "epoch": 0.6994739215391552, + "flos": 25519020433920.0, + "grad_norm": 1.9182761585422314, + "language_loss": 0.67464936, + "learning_rate": 8.747930314363794e-07, + "loss": 0.69618672, + "num_input_tokens_seen": 250982025, + "step": 11634, + "time_per_iteration": 2.604487419128418 + }, + { + "auxiliary_loss_clip": 0.01010044, + "auxiliary_loss_mlp": 0.01002813, + "balance_loss_clip": 1.01407039, + "balance_loss_mlp": 1.00143051, + "epoch": 0.6995340447918232, + "flos": 59128357691520.0, + "grad_norm": 0.6847666166760457, + "language_loss": 0.53152555, + "learning_rate": 8.744710743350412e-07, + "loss": 0.5516541, + "num_input_tokens_seen": 251046900, + "step": 11635, + "time_per_iteration": 4.9191200733184814 + }, + { + "auxiliary_loss_clip": 0.01086524, + "auxiliary_loss_mlp": 0.01034637, + "balance_loss_clip": 1.03990149, + "balance_loss_mlp": 1.02167284, + "epoch": 0.6995941680444913, + "flos": 17967832796160.0, + "grad_norm": 1.5113357083257617, + "language_loss": 0.81950343, + "learning_rate": 8.741491599138726e-07, + "loss": 0.84071505, + "num_input_tokens_seen": 251065050, + "step": 11636, + "time_per_iteration": 2.6814749240875244 + }, + { + "auxiliary_loss_clip": 0.01114034, + "auxiliary_loss_mlp": 0.01030856, + "balance_loss_clip": 1.04003048, + "balance_loss_mlp": 1.01799953, + "epoch": 0.6996542912971592, + "flos": 21980095839360.0, + "grad_norm": 4.210307970710786, + "language_loss": 0.83255941, + "learning_rate": 8.738272881850801e-07, + "loss": 0.85400826, + "num_input_tokens_seen": 251083355, + "step": 11637, + "time_per_iteration": 2.6101019382476807 + }, + { + "auxiliary_loss_clip": 0.01063351, + "auxiliary_loss_mlp": 0.01039835, + "balance_loss_clip": 1.0357244, + "balance_loss_mlp": 1.0266149, + "epoch": 0.6997144145498272, + "flos": 11686158518400.0, + "grad_norm": 1.88143316282286, + "language_loss": 0.68318653, + "learning_rate": 8.735054591608704e-07, + "loss": 0.70421839, + "num_input_tokens_seen": 251096420, + "step": 11638, + "time_per_iteration": 2.757967233657837 + }, + { + "auxiliary_loss_clip": 0.0110744, + "auxiliary_loss_mlp": 0.01034716, + "balance_loss_clip": 1.04054809, + "balance_loss_mlp": 1.02038121, + "epoch": 0.6997745378024951, + "flos": 29607162958080.0, + "grad_norm": 2.0225103573047334, + "language_loss": 0.77908248, + "learning_rate": 8.731836728534459e-07, + "loss": 0.80050403, + "num_input_tokens_seen": 251115410, + "step": 11639, + "time_per_iteration": 2.7171573638916016 + }, + { + "auxiliary_loss_clip": 0.01088431, + "auxiliary_loss_mlp": 0.01044388, + "balance_loss_clip": 1.03905129, + "balance_loss_mlp": 1.03095889, + "epoch": 0.6998346610551631, + "flos": 20886616056960.0, + "grad_norm": 2.0145862528244542, + "language_loss": 0.82033116, + "learning_rate": 8.728619292750093e-07, + "loss": 0.84165937, + "num_input_tokens_seen": 251133530, + "step": 11640, + "time_per_iteration": 2.746412515640259 + }, + { + "auxiliary_loss_clip": 0.01079412, + "auxiliary_loss_mlp": 0.01033499, + "balance_loss_clip": 1.03800678, + "balance_loss_mlp": 1.02089286, + "epoch": 0.699894784307831, + "flos": 27163046949120.0, + "grad_norm": 1.988818239892088, + "language_loss": 0.75212121, + "learning_rate": 8.725402284377619e-07, + "loss": 0.77325034, + "num_input_tokens_seen": 251153985, + "step": 11641, + "time_per_iteration": 2.789306879043579 + }, + { + "auxiliary_loss_clip": 0.01089337, + "auxiliary_loss_mlp": 0.01024848, + "balance_loss_clip": 1.03791595, + "balance_loss_mlp": 1.01126993, + "epoch": 0.699954907560499, + "flos": 20923640000640.0, + "grad_norm": 1.9133228013475547, + "language_loss": 0.77589947, + "learning_rate": 8.722185703539022e-07, + "loss": 0.79704136, + "num_input_tokens_seen": 251173225, + "step": 11642, + "time_per_iteration": 2.6469504833221436 + }, + { + "auxiliary_loss_clip": 0.01110134, + "auxiliary_loss_mlp": 0.01039612, + "balance_loss_clip": 1.04202175, + "balance_loss_mlp": 1.02436519, + "epoch": 0.700015030813167, + "flos": 28657792540800.0, + "grad_norm": 1.9777967243613577, + "language_loss": 0.74846154, + "learning_rate": 8.718969550356266e-07, + "loss": 0.76995897, + "num_input_tokens_seen": 251192485, + "step": 11643, + "time_per_iteration": 2.6794352531433105 + }, + { + "auxiliary_loss_clip": 0.01079698, + "auxiliary_loss_mlp": 0.01030119, + "balance_loss_clip": 1.03809059, + "balance_loss_mlp": 1.01665401, + "epoch": 0.700075154065835, + "flos": 29205286617600.0, + "grad_norm": 1.5319096526083835, + "language_loss": 0.60467082, + "learning_rate": 8.715753824951315e-07, + "loss": 0.62576902, + "num_input_tokens_seen": 251214965, + "step": 11644, + "time_per_iteration": 2.7573509216308594 + }, + { + "auxiliary_loss_clip": 0.01098759, + "auxiliary_loss_mlp": 0.0103069, + "balance_loss_clip": 1.03691936, + "balance_loss_mlp": 1.01848316, + "epoch": 0.7001352773185029, + "flos": 23112431159040.0, + "grad_norm": 2.3431210800913203, + "language_loss": 0.81582069, + "learning_rate": 8.712538527446119e-07, + "loss": 0.83711517, + "num_input_tokens_seen": 251234500, + "step": 11645, + "time_per_iteration": 2.6731204986572266 + }, + { + "auxiliary_loss_clip": 0.01102676, + "auxiliary_loss_mlp": 0.01031812, + "balance_loss_clip": 1.03974915, + "balance_loss_mlp": 1.01880574, + "epoch": 0.7001954005711709, + "flos": 21322858734720.0, + "grad_norm": 2.1357575492399143, + "language_loss": 0.68504727, + "learning_rate": 8.709323657962584e-07, + "loss": 0.70639217, + "num_input_tokens_seen": 251254360, + "step": 11646, + "time_per_iteration": 2.622621774673462 + }, + { + "auxiliary_loss_clip": 0.01096745, + "auxiliary_loss_mlp": 0.01045056, + "balance_loss_clip": 1.03817129, + "balance_loss_mlp": 1.03125119, + "epoch": 0.7002555238238388, + "flos": 24535822383360.0, + "grad_norm": 1.6406688140332686, + "language_loss": 0.71264708, + "learning_rate": 8.706109216622635e-07, + "loss": 0.73406506, + "num_input_tokens_seen": 251274790, + "step": 11647, + "time_per_iteration": 2.627837896347046 + }, + { + "auxiliary_loss_clip": 0.01105019, + "auxiliary_loss_mlp": 0.01036652, + "balance_loss_clip": 1.041466, + "balance_loss_mlp": 1.02333069, + "epoch": 0.7003156470765068, + "flos": 39056552726400.0, + "grad_norm": 1.5459607229268986, + "language_loss": 0.71446347, + "learning_rate": 8.702895203548155e-07, + "loss": 0.7358802, + "num_input_tokens_seen": 251296275, + "step": 11648, + "time_per_iteration": 2.753802537918091 + }, + { + "auxiliary_loss_clip": 0.01057301, + "auxiliary_loss_mlp": 0.01037599, + "balance_loss_clip": 1.03298402, + "balance_loss_mlp": 1.02418768, + "epoch": 0.7003757703291749, + "flos": 28804092635520.0, + "grad_norm": 1.667578697289853, + "language_loss": 0.77163005, + "learning_rate": 8.699681618861014e-07, + "loss": 0.79257905, + "num_input_tokens_seen": 251317375, + "step": 11649, + "time_per_iteration": 2.7761147022247314 + }, + { + "auxiliary_loss_clip": 0.01090081, + "auxiliary_loss_mlp": 0.01033039, + "balance_loss_clip": 1.03838015, + "balance_loss_mlp": 1.02049243, + "epoch": 0.7004358935818428, + "flos": 15953854152960.0, + "grad_norm": 2.229815779289175, + "language_loss": 0.787054, + "learning_rate": 8.69646846268308e-07, + "loss": 0.80828524, + "num_input_tokens_seen": 251333570, + "step": 11650, + "time_per_iteration": 2.651338815689087 + }, + { + "auxiliary_loss_clip": 0.01087246, + "auxiliary_loss_mlp": 0.01026359, + "balance_loss_clip": 1.03717887, + "balance_loss_mlp": 1.01436639, + "epoch": 0.7004960168345108, + "flos": 20411984718720.0, + "grad_norm": 2.805466583174802, + "language_loss": 0.78653586, + "learning_rate": 8.693255735136194e-07, + "loss": 0.8076719, + "num_input_tokens_seen": 251351070, + "step": 11651, + "time_per_iteration": 2.650684118270874 + }, + { + "auxiliary_loss_clip": 0.01078764, + "auxiliary_loss_mlp": 0.01048293, + "balance_loss_clip": 1.03785074, + "balance_loss_mlp": 1.03431594, + "epoch": 0.7005561400871787, + "flos": 17347547808000.0, + "grad_norm": 1.6343941081799256, + "language_loss": 0.69484842, + "learning_rate": 8.690043436342198e-07, + "loss": 0.71611905, + "num_input_tokens_seen": 251370005, + "step": 11652, + "time_per_iteration": 2.807304859161377 + }, + { + "auxiliary_loss_clip": 0.01104104, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.04046094, + "balance_loss_mlp": 1.01811981, + "epoch": 0.7006162633398467, + "flos": 25302120157440.0, + "grad_norm": 1.3561275324415532, + "language_loss": 0.74232221, + "learning_rate": 8.686831566422874e-07, + "loss": 0.7636776, + "num_input_tokens_seen": 251391210, + "step": 11653, + "time_per_iteration": 2.696967601776123 + }, + { + "auxiliary_loss_clip": 0.01087115, + "auxiliary_loss_mlp": 0.01035907, + "balance_loss_clip": 1.03951633, + "balance_loss_mlp": 1.02182245, + "epoch": 0.7006763865925146, + "flos": 20668997508480.0, + "grad_norm": 2.1100512473261808, + "language_loss": 0.70994234, + "learning_rate": 8.68362012550003e-07, + "loss": 0.73117256, + "num_input_tokens_seen": 251411505, + "step": 11654, + "time_per_iteration": 2.6838555335998535 + }, + { + "auxiliary_loss_clip": 0.01066217, + "auxiliary_loss_mlp": 0.01033137, + "balance_loss_clip": 1.03629136, + "balance_loss_mlp": 1.0182364, + "epoch": 0.7007365098451827, + "flos": 20046449963520.0, + "grad_norm": 2.5073875946500093, + "language_loss": 0.73771894, + "learning_rate": 8.680409113695453e-07, + "loss": 0.75871241, + "num_input_tokens_seen": 251428975, + "step": 11655, + "time_per_iteration": 2.7410359382629395 + }, + { + "auxiliary_loss_clip": 0.01111257, + "auxiliary_loss_mlp": 0.01039302, + "balance_loss_clip": 1.04240656, + "balance_loss_mlp": 1.02404356, + "epoch": 0.7007966330978506, + "flos": 20777375819520.0, + "grad_norm": 1.9005607339243875, + "language_loss": 0.70418394, + "learning_rate": 8.677198531130889e-07, + "loss": 0.72568953, + "num_input_tokens_seen": 251446940, + "step": 11656, + "time_per_iteration": 2.731491804122925 + }, + { + "auxiliary_loss_clip": 0.01066256, + "auxiliary_loss_mlp": 0.01032163, + "balance_loss_clip": 1.03605461, + "balance_loss_mlp": 1.0202893, + "epoch": 0.7008567563505186, + "flos": 29638189330560.0, + "grad_norm": 1.6373792531081708, + "language_loss": 0.7814554, + "learning_rate": 8.673988377928092e-07, + "loss": 0.80243957, + "num_input_tokens_seen": 251466205, + "step": 11657, + "time_per_iteration": 2.77717924118042 + }, + { + "auxiliary_loss_clip": 0.01118749, + "auxiliary_loss_mlp": 0.010372, + "balance_loss_clip": 1.04163647, + "balance_loss_mlp": 1.02257895, + "epoch": 0.7009168796031865, + "flos": 17092007475840.0, + "grad_norm": 1.9768682726826163, + "language_loss": 0.78330362, + "learning_rate": 8.670778654208797e-07, + "loss": 0.8048631, + "num_input_tokens_seen": 251484820, + "step": 11658, + "time_per_iteration": 2.6049365997314453 + }, + { + "auxiliary_loss_clip": 0.01086248, + "auxiliary_loss_mlp": 0.0103209, + "balance_loss_clip": 1.03589261, + "balance_loss_mlp": 1.01928139, + "epoch": 0.7009770028558545, + "flos": 20448972748800.0, + "grad_norm": 5.565674692031433, + "language_loss": 0.82623971, + "learning_rate": 8.667569360094713e-07, + "loss": 0.84742308, + "num_input_tokens_seen": 251502670, + "step": 11659, + "time_per_iteration": 2.686923027038574 + }, + { + "auxiliary_loss_clip": 0.01069607, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.03661668, + "balance_loss_mlp": 1.0180831, + "epoch": 0.7010371261085224, + "flos": 19245139407360.0, + "grad_norm": 1.865164954565192, + "language_loss": 0.6914413, + "learning_rate": 8.664360495707526e-07, + "loss": 0.71244586, + "num_input_tokens_seen": 251521630, + "step": 11660, + "time_per_iteration": 2.6798696517944336 + }, + { + "auxiliary_loss_clip": 0.01114876, + "auxiliary_loss_mlp": 0.01039123, + "balance_loss_clip": 1.03920841, + "balance_loss_mlp": 1.02499104, + "epoch": 0.7010972493611904, + "flos": 22127581082880.0, + "grad_norm": 1.6974567931874, + "language_loss": 0.81309623, + "learning_rate": 8.661152061168924e-07, + "loss": 0.83463621, + "num_input_tokens_seen": 251540105, + "step": 11661, + "time_per_iteration": 2.665506601333618 + }, + { + "auxiliary_loss_clip": 0.01100544, + "auxiliary_loss_mlp": 0.01036779, + "balance_loss_clip": 1.03771257, + "balance_loss_mlp": 1.02428651, + "epoch": 0.7011573726138585, + "flos": 31391132860800.0, + "grad_norm": 1.6549167062780274, + "language_loss": 0.79250038, + "learning_rate": 8.657944056600579e-07, + "loss": 0.81387359, + "num_input_tokens_seen": 251560530, + "step": 11662, + "time_per_iteration": 2.747738838195801 + }, + { + "auxiliary_loss_clip": 0.01099278, + "auxiliary_loss_mlp": 0.01034891, + "balance_loss_clip": 1.03757107, + "balance_loss_mlp": 1.02009749, + "epoch": 0.7012174958665264, + "flos": 18150582216960.0, + "grad_norm": 1.8518490996849224, + "language_loss": 0.83547205, + "learning_rate": 8.654736482124134e-07, + "loss": 0.85681379, + "num_input_tokens_seen": 251577930, + "step": 11663, + "time_per_iteration": 2.631399631500244 + }, + { + "auxiliary_loss_clip": 0.01021926, + "auxiliary_loss_mlp": 0.00999736, + "balance_loss_clip": 1.00981212, + "balance_loss_mlp": 0.99871653, + "epoch": 0.7012776191191944, + "flos": 60651256567680.0, + "grad_norm": 0.8199034033651936, + "language_loss": 0.5377062, + "learning_rate": 8.651529337861209e-07, + "loss": 0.55792284, + "num_input_tokens_seen": 251638820, + "step": 11664, + "time_per_iteration": 3.219939708709717 + }, + { + "auxiliary_loss_clip": 0.01091352, + "auxiliary_loss_mlp": 0.01036957, + "balance_loss_clip": 1.03675961, + "balance_loss_mlp": 1.02283645, + "epoch": 0.7013377423718623, + "flos": 27198598435200.0, + "grad_norm": 25.234897895477353, + "language_loss": 0.78593969, + "learning_rate": 8.64832262393344e-07, + "loss": 0.80722272, + "num_input_tokens_seen": 251658070, + "step": 11665, + "time_per_iteration": 2.7333061695098877 + }, + { + "auxiliary_loss_clip": 0.01097626, + "auxiliary_loss_mlp": 0.01033659, + "balance_loss_clip": 1.03675759, + "balance_loss_mlp": 1.02039695, + "epoch": 0.7013978656245303, + "flos": 16543543731840.0, + "grad_norm": 3.3099112213098576, + "language_loss": 0.76706922, + "learning_rate": 8.645116340462404e-07, + "loss": 0.78838205, + "num_input_tokens_seen": 251671575, + "step": 11666, + "time_per_iteration": 4.164456844329834 + }, + { + "auxiliary_loss_clip": 0.0109964, + "auxiliary_loss_mlp": 0.01033563, + "balance_loss_clip": 1.03881526, + "balance_loss_mlp": 1.02059937, + "epoch": 0.7014579888771982, + "flos": 23143780753920.0, + "grad_norm": 1.9172711089554313, + "language_loss": 0.81507015, + "learning_rate": 8.641910487569695e-07, + "loss": 0.83640218, + "num_input_tokens_seen": 251689350, + "step": 11667, + "time_per_iteration": 2.6507012844085693 + }, + { + "auxiliary_loss_clip": 0.01080493, + "auxiliary_loss_mlp": 0.01039617, + "balance_loss_clip": 1.03758526, + "balance_loss_mlp": 1.02586019, + "epoch": 0.7015181121298663, + "flos": 25082095397760.0, + "grad_norm": 4.617945846615331, + "language_loss": 0.65072989, + "learning_rate": 8.638705065376879e-07, + "loss": 0.67193091, + "num_input_tokens_seen": 251704635, + "step": 11668, + "time_per_iteration": 4.238234758377075 + }, + { + "auxiliary_loss_clip": 0.01094365, + "auxiliary_loss_mlp": 0.0102871, + "balance_loss_clip": 1.03865385, + "balance_loss_mlp": 1.01505494, + "epoch": 0.7015782353825342, + "flos": 23327894891520.0, + "grad_norm": 2.259598520998094, + "language_loss": 0.7661069, + "learning_rate": 8.635500074005519e-07, + "loss": 0.78733766, + "num_input_tokens_seen": 251723035, + "step": 11669, + "time_per_iteration": 4.344635248184204 + }, + { + "auxiliary_loss_clip": 0.01013949, + "auxiliary_loss_mlp": 0.0100684, + "balance_loss_clip": 1.00989032, + "balance_loss_mlp": 1.00561166, + "epoch": 0.7016383586352022, + "flos": 70397161107840.0, + "grad_norm": 0.6970325216386312, + "language_loss": 0.54508567, + "learning_rate": 8.632295513577122e-07, + "loss": 0.56529355, + "num_input_tokens_seen": 251791630, + "step": 11670, + "time_per_iteration": 3.3269011974334717 + }, + { + "auxiliary_loss_clip": 0.01088398, + "auxiliary_loss_mlp": 0.01044417, + "balance_loss_clip": 1.04069412, + "balance_loss_mlp": 1.03119707, + "epoch": 0.7016984818878701, + "flos": 19792274348160.0, + "grad_norm": 1.7508124659386841, + "language_loss": 0.81738812, + "learning_rate": 8.629091384213218e-07, + "loss": 0.83871627, + "num_input_tokens_seen": 251809840, + "step": 11671, + "time_per_iteration": 2.622065544128418 + }, + { + "auxiliary_loss_clip": 0.0110729, + "auxiliary_loss_mlp": 0.01033835, + "balance_loss_clip": 1.04194474, + "balance_loss_mlp": 1.02070975, + "epoch": 0.7017586051405381, + "flos": 12896923184640.0, + "grad_norm": 2.274862917429984, + "language_loss": 0.75504148, + "learning_rate": 8.625887686035313e-07, + "loss": 0.77645272, + "num_input_tokens_seen": 251827550, + "step": 11672, + "time_per_iteration": 2.6540980339050293 + }, + { + "auxiliary_loss_clip": 0.01096652, + "auxiliary_loss_mlp": 0.01034604, + "balance_loss_clip": 1.03700793, + "balance_loss_mlp": 1.02045953, + "epoch": 0.701818728393206, + "flos": 18332828847360.0, + "grad_norm": 1.5989851774558104, + "language_loss": 0.87145984, + "learning_rate": 8.622684419164883e-07, + "loss": 0.8927725, + "num_input_tokens_seen": 251844880, + "step": 11673, + "time_per_iteration": 2.8490025997161865 + }, + { + "auxiliary_loss_clip": 0.01096229, + "auxiliary_loss_mlp": 0.01029311, + "balance_loss_clip": 1.0356524, + "balance_loss_mlp": 1.01583493, + "epoch": 0.701878851645874, + "flos": 17384212615680.0, + "grad_norm": 1.8214270702877817, + "language_loss": 0.73174304, + "learning_rate": 8.619481583723399e-07, + "loss": 0.75299847, + "num_input_tokens_seen": 251861025, + "step": 11674, + "time_per_iteration": 2.679823160171509 + }, + { + "auxiliary_loss_clip": 0.01096759, + "auxiliary_loss_mlp": 0.00769911, + "balance_loss_clip": 1.04201114, + "balance_loss_mlp": 1.00022173, + "epoch": 0.701938974898542, + "flos": 23915501481600.0, + "grad_norm": 1.622926679018359, + "language_loss": 0.72171724, + "learning_rate": 8.616279179832329e-07, + "loss": 0.74038392, + "num_input_tokens_seen": 251880175, + "step": 11675, + "time_per_iteration": 4.312408447265625 + }, + { + "auxiliary_loss_clip": 0.01074264, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.03631043, + "balance_loss_mlp": 1.01713443, + "epoch": 0.70199909815121, + "flos": 21795586652160.0, + "grad_norm": 2.2219041729549143, + "language_loss": 0.51501888, + "learning_rate": 8.613077207613078e-07, + "loss": 0.53606999, + "num_input_tokens_seen": 251899005, + "step": 11676, + "time_per_iteration": 2.6962332725524902 + }, + { + "auxiliary_loss_clip": 0.01010504, + "auxiliary_loss_mlp": 0.0075156, + "balance_loss_clip": 1.00856614, + "balance_loss_mlp": 0.99960405, + "epoch": 0.702059221403878, + "flos": 71715047109120.0, + "grad_norm": 0.7294989296672769, + "language_loss": 0.59194738, + "learning_rate": 8.609875667187079e-07, + "loss": 0.60956806, + "num_input_tokens_seen": 251966790, + "step": 11677, + "time_per_iteration": 3.283904552459717 + }, + { + "auxiliary_loss_clip": 0.01100162, + "auxiliary_loss_mlp": 0.01038032, + "balance_loss_clip": 1.03780138, + "balance_loss_mlp": 1.02260053, + "epoch": 0.7021193446565459, + "flos": 28111052649600.0, + "grad_norm": 2.3043069619356333, + "language_loss": 0.62869537, + "learning_rate": 8.606674558675737e-07, + "loss": 0.65007722, + "num_input_tokens_seen": 251989315, + "step": 11678, + "time_per_iteration": 2.683986186981201 + }, + { + "auxiliary_loss_clip": 0.01114626, + "auxiliary_loss_mlp": 0.01034461, + "balance_loss_clip": 1.04092705, + "balance_loss_mlp": 1.02130055, + "epoch": 0.7021794679092139, + "flos": 22924905229440.0, + "grad_norm": 1.616922175472628, + "language_loss": 0.79195565, + "learning_rate": 8.603473882200444e-07, + "loss": 0.81344652, + "num_input_tokens_seen": 252006620, + "step": 11679, + "time_per_iteration": 2.6574866771698 + }, + { + "auxiliary_loss_clip": 0.01084266, + "auxiliary_loss_mlp": 0.01048047, + "balance_loss_clip": 1.03683782, + "balance_loss_mlp": 1.03429675, + "epoch": 0.7022395911618818, + "flos": 18077827219200.0, + "grad_norm": 2.0679583940680746, + "language_loss": 0.70934772, + "learning_rate": 8.600273637882567e-07, + "loss": 0.73067081, + "num_input_tokens_seen": 252024570, + "step": 11680, + "time_per_iteration": 2.7358908653259277 + }, + { + "auxiliary_loss_clip": 0.01074807, + "auxiliary_loss_mlp": 0.010398, + "balance_loss_clip": 1.03587687, + "balance_loss_mlp": 1.02517891, + "epoch": 0.7022997144145499, + "flos": 16034294661120.0, + "grad_norm": 1.6247051825758914, + "language_loss": 0.74976349, + "learning_rate": 8.597073825843446e-07, + "loss": 0.77090955, + "num_input_tokens_seen": 252042775, + "step": 11681, + "time_per_iteration": 2.774574041366577 + }, + { + "auxiliary_loss_clip": 0.01094616, + "auxiliary_loss_mlp": 0.0103624, + "balance_loss_clip": 1.03856039, + "balance_loss_mlp": 1.0238483, + "epoch": 0.7023598376672178, + "flos": 26468678160000.0, + "grad_norm": 1.575109913537797, + "language_loss": 0.76865822, + "learning_rate": 8.593874446204434e-07, + "loss": 0.78996682, + "num_input_tokens_seen": 252063690, + "step": 11682, + "time_per_iteration": 2.7486395835876465 + }, + { + "auxiliary_loss_clip": 0.01082555, + "auxiliary_loss_mlp": 0.00772032, + "balance_loss_clip": 1.03884804, + "balance_loss_mlp": 1.00019991, + "epoch": 0.7024199609198858, + "flos": 17055917285760.0, + "grad_norm": 2.008069790408466, + "language_loss": 0.737746, + "learning_rate": 8.590675499086841e-07, + "loss": 0.75629187, + "num_input_tokens_seen": 252080335, + "step": 11683, + "time_per_iteration": 2.744171142578125 + }, + { + "auxiliary_loss_clip": 0.01079915, + "auxiliary_loss_mlp": 0.01035548, + "balance_loss_clip": 1.03894246, + "balance_loss_mlp": 1.02157617, + "epoch": 0.7024800841725537, + "flos": 25849039616640.0, + "grad_norm": 1.8760496906578064, + "language_loss": 0.71592307, + "learning_rate": 8.587476984611976e-07, + "loss": 0.73707771, + "num_input_tokens_seen": 252101075, + "step": 11684, + "time_per_iteration": 2.88992977142334 + }, + { + "auxiliary_loss_clip": 0.01104368, + "auxiliary_loss_mlp": 0.01036296, + "balance_loss_clip": 1.03960109, + "balance_loss_mlp": 1.02242017, + "epoch": 0.7025402074252217, + "flos": 23513014609920.0, + "grad_norm": 1.7667874173043265, + "language_loss": 0.71676773, + "learning_rate": 8.584278902901128e-07, + "loss": 0.73817438, + "num_input_tokens_seen": 252120510, + "step": 11685, + "time_per_iteration": 2.7373046875 + }, + { + "auxiliary_loss_clip": 0.01101099, + "auxiliary_loss_mlp": 0.01033615, + "balance_loss_clip": 1.03761411, + "balance_loss_mlp": 1.02074599, + "epoch": 0.7026003306778896, + "flos": 20150985519360.0, + "grad_norm": 2.3305068980612695, + "language_loss": 0.84660101, + "learning_rate": 8.581081254075582e-07, + "loss": 0.86794817, + "num_input_tokens_seen": 252137590, + "step": 11686, + "time_per_iteration": 2.6728014945983887 + }, + { + "auxiliary_loss_clip": 0.0101853, + "auxiliary_loss_mlp": 0.01001761, + "balance_loss_clip": 1.00980115, + "balance_loss_mlp": 1.00045574, + "epoch": 0.7026604539305576, + "flos": 64772400712320.0, + "grad_norm": 1.2905405547920359, + "language_loss": 0.69901091, + "learning_rate": 8.577884038256566e-07, + "loss": 0.71921384, + "num_input_tokens_seen": 252199830, + "step": 11687, + "time_per_iteration": 3.3107638359069824 + }, + { + "auxiliary_loss_clip": 0.01076554, + "auxiliary_loss_mlp": 0.01032291, + "balance_loss_clip": 1.03496408, + "balance_loss_mlp": 1.01832569, + "epoch": 0.7027205771832256, + "flos": 21871466133120.0, + "grad_norm": 1.932037995437553, + "language_loss": 0.7684707, + "learning_rate": 8.574687255565329e-07, + "loss": 0.78955913, + "num_input_tokens_seen": 252217200, + "step": 11688, + "time_per_iteration": 2.7459444999694824 + }, + { + "auxiliary_loss_clip": 0.0111428, + "auxiliary_loss_mlp": 0.0103517, + "balance_loss_clip": 1.04030085, + "balance_loss_mlp": 1.02199149, + "epoch": 0.7027807004358936, + "flos": 23367791923200.0, + "grad_norm": 2.3717928399741117, + "language_loss": 0.68631124, + "learning_rate": 8.571490906123107e-07, + "loss": 0.70780575, + "num_input_tokens_seen": 252236105, + "step": 11689, + "time_per_iteration": 2.615769624710083 + }, + { + "auxiliary_loss_clip": 0.01092147, + "auxiliary_loss_mlp": 0.01039659, + "balance_loss_clip": 1.03881717, + "balance_loss_mlp": 1.02628398, + "epoch": 0.7028408236885616, + "flos": 15304266645120.0, + "grad_norm": 2.2678896966391293, + "language_loss": 0.79724276, + "learning_rate": 8.568294990051086e-07, + "loss": 0.81856084, + "num_input_tokens_seen": 252253315, + "step": 11690, + "time_per_iteration": 2.752448081970215 + }, + { + "auxiliary_loss_clip": 0.01114987, + "auxiliary_loss_mlp": 0.01035613, + "balance_loss_clip": 1.04160511, + "balance_loss_mlp": 1.02232075, + "epoch": 0.7029009469412295, + "flos": 22018197191040.0, + "grad_norm": 1.8737745525801948, + "language_loss": 0.76049984, + "learning_rate": 8.56509950747047e-07, + "loss": 0.78200579, + "num_input_tokens_seen": 252272765, + "step": 11691, + "time_per_iteration": 2.6119184494018555 + }, + { + "auxiliary_loss_clip": 0.0108875, + "auxiliary_loss_mlp": 0.01032492, + "balance_loss_clip": 1.03811002, + "balance_loss_mlp": 1.01972437, + "epoch": 0.7029610701938975, + "flos": 21835519597440.0, + "grad_norm": 9.903733322151682, + "language_loss": 0.81749791, + "learning_rate": 8.561904458502429e-07, + "loss": 0.83871031, + "num_input_tokens_seen": 252290510, + "step": 11692, + "time_per_iteration": 2.69521427154541 + }, + { + "auxiliary_loss_clip": 0.0108957, + "auxiliary_loss_mlp": 0.01032565, + "balance_loss_clip": 1.03853154, + "balance_loss_mlp": 1.01875424, + "epoch": 0.7030211934465654, + "flos": 19135647774720.0, + "grad_norm": 1.5579267825971022, + "language_loss": 0.76325333, + "learning_rate": 8.558709843268111e-07, + "loss": 0.78447467, + "num_input_tokens_seen": 252309365, + "step": 11693, + "time_per_iteration": 2.6727678775787354 + }, + { + "auxiliary_loss_clip": 0.01089511, + "auxiliary_loss_mlp": 0.01037963, + "balance_loss_clip": 1.04170704, + "balance_loss_mlp": 1.02457595, + "epoch": 0.7030813166992335, + "flos": 38546010766080.0, + "grad_norm": 1.4223920880081815, + "language_loss": 0.68617809, + "learning_rate": 8.55551566188866e-07, + "loss": 0.70745289, + "num_input_tokens_seen": 252333010, + "step": 11694, + "time_per_iteration": 2.858931541442871 + }, + { + "auxiliary_loss_clip": 0.0111374, + "auxiliary_loss_mlp": 0.01034494, + "balance_loss_clip": 1.03941369, + "balance_loss_mlp": 1.02133918, + "epoch": 0.7031414399519014, + "flos": 14720897859840.0, + "grad_norm": 2.09445306191262, + "language_loss": 0.75264633, + "learning_rate": 8.552321914485203e-07, + "loss": 0.77412868, + "num_input_tokens_seen": 252351330, + "step": 11695, + "time_per_iteration": 2.631002902984619 + }, + { + "auxiliary_loss_clip": 0.01092725, + "auxiliary_loss_mlp": 0.01042661, + "balance_loss_clip": 1.04262757, + "balance_loss_mlp": 1.02838016, + "epoch": 0.7032015632045694, + "flos": 14027247342720.0, + "grad_norm": 2.095793582645169, + "language_loss": 0.73874116, + "learning_rate": 8.549128601178852e-07, + "loss": 0.760095, + "num_input_tokens_seen": 252369580, + "step": 11696, + "time_per_iteration": 2.7669694423675537 + }, + { + "auxiliary_loss_clip": 0.01097157, + "auxiliary_loss_mlp": 0.01032154, + "balance_loss_clip": 1.0388881, + "balance_loss_mlp": 1.01825988, + "epoch": 0.7032616864572373, + "flos": 27637175496960.0, + "grad_norm": 1.7096974428239413, + "language_loss": 0.75290072, + "learning_rate": 8.545935722090693e-07, + "loss": 0.77419376, + "num_input_tokens_seen": 252390525, + "step": 11697, + "time_per_iteration": 2.763500928878784 + }, + { + "auxiliary_loss_clip": 0.01063183, + "auxiliary_loss_mlp": 0.01044698, + "balance_loss_clip": 1.03909528, + "balance_loss_mlp": 1.02815211, + "epoch": 0.7033218097099053, + "flos": 17967294092160.0, + "grad_norm": 1.8043330597848055, + "language_loss": 0.81064868, + "learning_rate": 8.542743277341793e-07, + "loss": 0.8317275, + "num_input_tokens_seen": 252407470, + "step": 11698, + "time_per_iteration": 2.869485378265381 + }, + { + "auxiliary_loss_clip": 0.01087007, + "auxiliary_loss_mlp": 0.01041792, + "balance_loss_clip": 1.03696036, + "balance_loss_mlp": 1.02600873, + "epoch": 0.7033819329625732, + "flos": 19501721233920.0, + "grad_norm": 1.788545707110732, + "language_loss": 0.84702611, + "learning_rate": 8.539551267053222e-07, + "loss": 0.86831409, + "num_input_tokens_seen": 252427025, + "step": 11699, + "time_per_iteration": 2.664696216583252 + }, + { + "auxiliary_loss_clip": 0.01097664, + "auxiliary_loss_mlp": 0.01034973, + "balance_loss_clip": 1.03910065, + "balance_loss_mlp": 1.02029788, + "epoch": 0.7034420562152413, + "flos": 23987645948160.0, + "grad_norm": 2.037813318278341, + "language_loss": 0.78878331, + "learning_rate": 8.53635969134601e-07, + "loss": 0.81010973, + "num_input_tokens_seen": 252445410, + "step": 11700, + "time_per_iteration": 2.6491341590881348 + }, + { + "auxiliary_loss_clip": 0.01104199, + "auxiliary_loss_mlp": 0.01030571, + "balance_loss_clip": 1.04006886, + "balance_loss_mlp": 1.01655197, + "epoch": 0.7035021794679092, + "flos": 35043427756800.0, + "grad_norm": 1.9483737724471917, + "language_loss": 0.74603212, + "learning_rate": 8.533168550341186e-07, + "loss": 0.76737982, + "num_input_tokens_seen": 252463905, + "step": 11701, + "time_per_iteration": 2.75663423538208 + }, + { + "auxiliary_loss_clip": 0.0110842, + "auxiliary_loss_mlp": 0.01031935, + "balance_loss_clip": 1.04136193, + "balance_loss_mlp": 1.01701057, + "epoch": 0.7035623027205772, + "flos": 10997428164480.0, + "grad_norm": 2.4684106998326913, + "language_loss": 0.84602612, + "learning_rate": 8.529977844159769e-07, + "loss": 0.86742967, + "num_input_tokens_seen": 252478655, + "step": 11702, + "time_per_iteration": 2.691843032836914 + }, + { + "auxiliary_loss_clip": 0.01114954, + "auxiliary_loss_mlp": 0.01040813, + "balance_loss_clip": 1.03995621, + "balance_loss_mlp": 1.02679968, + "epoch": 0.7036224259732452, + "flos": 23623727304960.0, + "grad_norm": 2.401456792119983, + "language_loss": 0.61207104, + "learning_rate": 8.526787572922738e-07, + "loss": 0.63362873, + "num_input_tokens_seen": 252498740, + "step": 11703, + "time_per_iteration": 2.6257216930389404 + }, + { + "auxiliary_loss_clip": 0.01112246, + "auxiliary_loss_mlp": 0.01030363, + "balance_loss_clip": 1.03811026, + "balance_loss_mlp": 1.01622462, + "epoch": 0.7036825492259131, + "flos": 31686175175040.0, + "grad_norm": 1.8586997056929888, + "language_loss": 0.61509585, + "learning_rate": 8.523597736751067e-07, + "loss": 0.63652194, + "num_input_tokens_seen": 252517800, + "step": 11704, + "time_per_iteration": 2.6846559047698975 + }, + { + "auxiliary_loss_clip": 0.01096047, + "auxiliary_loss_mlp": 0.01032429, + "balance_loss_clip": 1.0392369, + "balance_loss_mlp": 1.02010882, + "epoch": 0.7037426724785811, + "flos": 30192866127360.0, + "grad_norm": 1.6136715073341366, + "language_loss": 0.70614809, + "learning_rate": 8.520408335765719e-07, + "loss": 0.72743285, + "num_input_tokens_seen": 252539620, + "step": 11705, + "time_per_iteration": 4.335879564285278 + }, + { + "auxiliary_loss_clip": 0.01103119, + "auxiliary_loss_mlp": 0.01036985, + "balance_loss_clip": 1.04067218, + "balance_loss_mlp": 1.02324617, + "epoch": 0.703802795731249, + "flos": 24311523905280.0, + "grad_norm": 1.8826981498859905, + "language_loss": 0.61822981, + "learning_rate": 8.517219370087645e-07, + "loss": 0.63963085, + "num_input_tokens_seen": 252557300, + "step": 11706, + "time_per_iteration": 2.593494176864624 + }, + { + "auxiliary_loss_clip": 0.01106671, + "auxiliary_loss_mlp": 0.01030439, + "balance_loss_clip": 1.04123783, + "balance_loss_mlp": 1.01777911, + "epoch": 0.7038629189839171, + "flos": 22528954632960.0, + "grad_norm": 2.04643987571859, + "language_loss": 0.67915642, + "learning_rate": 8.514030839837756e-07, + "loss": 0.70052749, + "num_input_tokens_seen": 252576715, + "step": 11707, + "time_per_iteration": 2.7215147018432617 + }, + { + "auxiliary_loss_clip": 0.01112969, + "auxiliary_loss_mlp": 0.01030922, + "balance_loss_clip": 1.04011774, + "balance_loss_mlp": 1.01814246, + "epoch": 0.703923042236585, + "flos": 26250484993920.0, + "grad_norm": 1.7862705599659854, + "language_loss": 0.76583481, + "learning_rate": 8.510842745136974e-07, + "loss": 0.78727371, + "num_input_tokens_seen": 252596190, + "step": 11708, + "time_per_iteration": 4.144139051437378 + }, + { + "auxiliary_loss_clip": 0.01090944, + "auxiliary_loss_mlp": 0.01034476, + "balance_loss_clip": 1.03818655, + "balance_loss_mlp": 1.02149391, + "epoch": 0.703983165489253, + "flos": 19390254353280.0, + "grad_norm": 1.9126700939118615, + "language_loss": 0.72069716, + "learning_rate": 8.50765508610619e-07, + "loss": 0.74195135, + "num_input_tokens_seen": 252613410, + "step": 11709, + "time_per_iteration": 4.317174911499023 + }, + { + "auxiliary_loss_clip": 0.01103216, + "auxiliary_loss_mlp": 0.01032072, + "balance_loss_clip": 1.04039192, + "balance_loss_mlp": 1.01939988, + "epoch": 0.7040432887419209, + "flos": 16683630773760.0, + "grad_norm": 2.4201158088388337, + "language_loss": 0.78757358, + "learning_rate": 8.504467862866267e-07, + "loss": 0.80892646, + "num_input_tokens_seen": 252629150, + "step": 11710, + "time_per_iteration": 2.6521589756011963 + }, + { + "auxiliary_loss_clip": 0.01106607, + "auxiliary_loss_mlp": 0.01035061, + "balance_loss_clip": 1.04086101, + "balance_loss_mlp": 1.02094674, + "epoch": 0.7041034119945889, + "flos": 21141402203520.0, + "grad_norm": 1.5478201961623483, + "language_loss": 0.77396274, + "learning_rate": 8.501281075538076e-07, + "loss": 0.7953794, + "num_input_tokens_seen": 252648225, + "step": 11711, + "time_per_iteration": 2.673774242401123 + }, + { + "auxiliary_loss_clip": 0.01077655, + "auxiliary_loss_mlp": 0.01031566, + "balance_loss_clip": 1.03709733, + "balance_loss_mlp": 1.01935935, + "epoch": 0.7041635352472568, + "flos": 16910299549440.0, + "grad_norm": 2.4253841205918945, + "language_loss": 0.74240053, + "learning_rate": 8.498094724242457e-07, + "loss": 0.7634927, + "num_input_tokens_seen": 252665380, + "step": 11712, + "time_per_iteration": 2.7232208251953125 + }, + { + "auxiliary_loss_clip": 0.00994093, + "auxiliary_loss_mlp": 0.01000365, + "balance_loss_clip": 1.0117116, + "balance_loss_mlp": 0.99926871, + "epoch": 0.7042236584999249, + "flos": 71681219475840.0, + "grad_norm": 0.8868310854542714, + "language_loss": 0.64613295, + "learning_rate": 8.494908809100247e-07, + "loss": 0.66607749, + "num_input_tokens_seen": 252727950, + "step": 11713, + "time_per_iteration": 3.285946846008301 + }, + { + "auxiliary_loss_clip": 0.01098435, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.03652644, + "balance_loss_mlp": 1.01949096, + "epoch": 0.7042837817525928, + "flos": 28658187590400.0, + "grad_norm": 2.013493705916732, + "language_loss": 0.73046267, + "learning_rate": 8.49172333023225e-07, + "loss": 0.75177109, + "num_input_tokens_seen": 252746770, + "step": 11714, + "time_per_iteration": 4.236090898513794 + }, + { + "auxiliary_loss_clip": 0.01087938, + "auxiliary_loss_mlp": 0.0077181, + "balance_loss_clip": 1.03839374, + "balance_loss_mlp": 1.00026381, + "epoch": 0.7043439050052608, + "flos": 19753562465280.0, + "grad_norm": 1.7093963248736088, + "language_loss": 0.79507661, + "learning_rate": 8.488538287759248e-07, + "loss": 0.81367409, + "num_input_tokens_seen": 252765610, + "step": 11715, + "time_per_iteration": 2.6581244468688965 + }, + { + "auxiliary_loss_clip": 0.01084772, + "auxiliary_loss_mlp": 0.0104235, + "balance_loss_clip": 1.03780401, + "balance_loss_mlp": 1.02811062, + "epoch": 0.7044040282579288, + "flos": 11538529620480.0, + "grad_norm": 2.525582703515887, + "language_loss": 0.71628869, + "learning_rate": 8.485353681802037e-07, + "loss": 0.73755985, + "num_input_tokens_seen": 252781610, + "step": 11716, + "time_per_iteration": 2.6553633213043213 + }, + { + "auxiliary_loss_clip": 0.0108292, + "auxiliary_loss_mlp": 0.01035705, + "balance_loss_clip": 1.04328799, + "balance_loss_mlp": 1.02210903, + "epoch": 0.7044641515105967, + "flos": 33656126722560.0, + "grad_norm": 1.9767167849127631, + "language_loss": 0.66507739, + "learning_rate": 8.482169512481358e-07, + "loss": 0.68626368, + "num_input_tokens_seen": 252800600, + "step": 11717, + "time_per_iteration": 2.8526103496551514 + }, + { + "auxiliary_loss_clip": 0.01115695, + "auxiliary_loss_mlp": 0.01028921, + "balance_loss_clip": 1.04154098, + "balance_loss_mlp": 1.01575446, + "epoch": 0.7045242747632647, + "flos": 26723859356160.0, + "grad_norm": 1.3833751412057287, + "language_loss": 0.74381793, + "learning_rate": 8.478985779917967e-07, + "loss": 0.76526403, + "num_input_tokens_seen": 252822310, + "step": 11718, + "time_per_iteration": 2.6526429653167725 + }, + { + "auxiliary_loss_clip": 0.0110132, + "auxiliary_loss_mlp": 0.01033293, + "balance_loss_clip": 1.039011, + "balance_loss_mlp": 1.02069247, + "epoch": 0.7045843980159326, + "flos": 26797655848320.0, + "grad_norm": 2.395703855617911, + "language_loss": 0.79719883, + "learning_rate": 8.475802484232606e-07, + "loss": 0.81854498, + "num_input_tokens_seen": 252842355, + "step": 11719, + "time_per_iteration": 2.66690731048584 + }, + { + "auxiliary_loss_clip": 0.01105187, + "auxiliary_loss_mlp": 0.01041375, + "balance_loss_clip": 1.04220223, + "balance_loss_mlp": 1.02782118, + "epoch": 0.7046445212686007, + "flos": 41574824363520.0, + "grad_norm": 1.733178571450649, + "language_loss": 0.65760505, + "learning_rate": 8.472619625545951e-07, + "loss": 0.67907059, + "num_input_tokens_seen": 252866785, + "step": 11720, + "time_per_iteration": 2.808574914932251 + }, + { + "auxiliary_loss_clip": 0.0109618, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.04084325, + "balance_loss_mlp": 1.01776671, + "epoch": 0.7047046445212686, + "flos": 15560166113280.0, + "grad_norm": 2.168655366214879, + "language_loss": 0.80443633, + "learning_rate": 8.46943720397872e-07, + "loss": 0.8257128, + "num_input_tokens_seen": 252881870, + "step": 11721, + "time_per_iteration": 2.7525858879089355 + }, + { + "auxiliary_loss_clip": 0.01001442, + "auxiliary_loss_mlp": 0.00998843, + "balance_loss_clip": 1.0093838, + "balance_loss_mlp": 0.99760932, + "epoch": 0.7047647677739366, + "flos": 70410269571840.0, + "grad_norm": 0.7632832348458642, + "language_loss": 0.64800274, + "learning_rate": 8.466255219651582e-07, + "loss": 0.66800559, + "num_input_tokens_seen": 252951300, + "step": 11722, + "time_per_iteration": 3.413194179534912 + }, + { + "auxiliary_loss_clip": 0.010923, + "auxiliary_loss_mlp": 0.01034777, + "balance_loss_clip": 1.03915524, + "balance_loss_mlp": 1.02249277, + "epoch": 0.7048248910266045, + "flos": 23660032976640.0, + "grad_norm": 1.7290381066238394, + "language_loss": 0.65823722, + "learning_rate": 8.463073672685211e-07, + "loss": 0.67950797, + "num_input_tokens_seen": 252971400, + "step": 11723, + "time_per_iteration": 2.668208360671997 + }, + { + "auxiliary_loss_clip": 0.01083668, + "auxiliary_loss_mlp": 0.01031266, + "balance_loss_clip": 1.03798199, + "balance_loss_mlp": 1.01790833, + "epoch": 0.7048850142792725, + "flos": 21397158017280.0, + "grad_norm": 1.6832847250880896, + "language_loss": 0.80916411, + "learning_rate": 8.459892563200235e-07, + "loss": 0.8303135, + "num_input_tokens_seen": 252989475, + "step": 11724, + "time_per_iteration": 2.7347311973571777 + }, + { + "auxiliary_loss_clip": 0.01104162, + "auxiliary_loss_mlp": 0.01035842, + "balance_loss_clip": 1.03983295, + "balance_loss_mlp": 1.02229953, + "epoch": 0.7049451375319404, + "flos": 21648101408640.0, + "grad_norm": 1.7664791855809618, + "language_loss": 0.7323097, + "learning_rate": 8.456711891317296e-07, + "loss": 0.75370979, + "num_input_tokens_seen": 253007220, + "step": 11725, + "time_per_iteration": 2.654641628265381 + }, + { + "auxiliary_loss_clip": 0.01066947, + "auxiliary_loss_mlp": 0.01039139, + "balance_loss_clip": 1.03276384, + "balance_loss_mlp": 1.02378523, + "epoch": 0.7050052607846085, + "flos": 14866802904960.0, + "grad_norm": 2.572506179811176, + "language_loss": 0.78501201, + "learning_rate": 8.453531657156998e-07, + "loss": 0.80607283, + "num_input_tokens_seen": 253025410, + "step": 11726, + "time_per_iteration": 2.7266314029693604 + }, + { + "auxiliary_loss_clip": 0.01093418, + "auxiliary_loss_mlp": 0.01038851, + "balance_loss_clip": 1.03780484, + "balance_loss_mlp": 1.02567255, + "epoch": 0.7050653840372764, + "flos": 19241763528960.0, + "grad_norm": 2.180783221878792, + "language_loss": 0.70615113, + "learning_rate": 8.450351860839931e-07, + "loss": 0.72747386, + "num_input_tokens_seen": 253043305, + "step": 11727, + "time_per_iteration": 2.6545214653015137 + }, + { + "auxiliary_loss_clip": 0.0110651, + "auxiliary_loss_mlp": 0.00770675, + "balance_loss_clip": 1.03787398, + "balance_loss_mlp": 1.00010693, + "epoch": 0.7051255072899444, + "flos": 27780422935680.0, + "grad_norm": 1.6658028000538345, + "language_loss": 0.68843293, + "learning_rate": 8.44717250248668e-07, + "loss": 0.7072047, + "num_input_tokens_seen": 253062790, + "step": 11728, + "time_per_iteration": 2.7480993270874023 + }, + { + "auxiliary_loss_clip": 0.01080875, + "auxiliary_loss_mlp": 0.00771073, + "balance_loss_clip": 1.03778076, + "balance_loss_mlp": 1.00025976, + "epoch": 0.7051856305426124, + "flos": 27892033470720.0, + "grad_norm": 1.6755629992737011, + "language_loss": 0.73434365, + "learning_rate": 8.443993582217803e-07, + "loss": 0.75286305, + "num_input_tokens_seen": 253082055, + "step": 11729, + "time_per_iteration": 2.762924909591675 + }, + { + "auxiliary_loss_clip": 0.01101913, + "auxiliary_loss_mlp": 0.01033478, + "balance_loss_clip": 1.04192829, + "balance_loss_mlp": 1.01929784, + "epoch": 0.7052457537952803, + "flos": 25043563082880.0, + "grad_norm": 1.5899274462099697, + "language_loss": 0.77899098, + "learning_rate": 8.440815100153862e-07, + "loss": 0.80034494, + "num_input_tokens_seen": 253102575, + "step": 11730, + "time_per_iteration": 2.6950738430023193 + }, + { + "auxiliary_loss_clip": 0.01113225, + "auxiliary_loss_mlp": 0.01040637, + "balance_loss_clip": 1.0385406, + "balance_loss_mlp": 1.02747023, + "epoch": 0.7053058770479483, + "flos": 21871717528320.0, + "grad_norm": 1.9830603646297307, + "language_loss": 0.62745416, + "learning_rate": 8.437637056415359e-07, + "loss": 0.64899278, + "num_input_tokens_seen": 253121290, + "step": 11731, + "time_per_iteration": 2.588109016418457 + }, + { + "auxiliary_loss_clip": 0.01058245, + "auxiliary_loss_mlp": 0.01032885, + "balance_loss_clip": 1.0364511, + "balance_loss_mlp": 1.01818633, + "epoch": 0.7053660003006162, + "flos": 16398716094720.0, + "grad_norm": 1.9950553391193862, + "language_loss": 0.74299359, + "learning_rate": 8.434459451122815e-07, + "loss": 0.76390493, + "num_input_tokens_seen": 253139720, + "step": 11732, + "time_per_iteration": 2.6930272579193115 + }, + { + "auxiliary_loss_clip": 0.01102907, + "auxiliary_loss_mlp": 0.01034149, + "balance_loss_clip": 1.04072869, + "balance_loss_mlp": 1.0211252, + "epoch": 0.7054261235532843, + "flos": 22711560399360.0, + "grad_norm": 1.5877435619523543, + "language_loss": 0.71181738, + "learning_rate": 8.431282284396735e-07, + "loss": 0.73318791, + "num_input_tokens_seen": 253160250, + "step": 11733, + "time_per_iteration": 2.6677498817443848 + }, + { + "auxiliary_loss_clip": 0.01077175, + "auxiliary_loss_mlp": 0.01034316, + "balance_loss_clip": 1.03793108, + "balance_loss_mlp": 1.02100611, + "epoch": 0.7054862468059522, + "flos": 13589711775360.0, + "grad_norm": 1.9036570704847118, + "language_loss": 0.73538595, + "learning_rate": 8.428105556357583e-07, + "loss": 0.75650084, + "num_input_tokens_seen": 253178710, + "step": 11734, + "time_per_iteration": 2.660600185394287 + }, + { + "auxiliary_loss_clip": 0.01080202, + "auxiliary_loss_mlp": 0.01045919, + "balance_loss_clip": 1.03852844, + "balance_loss_mlp": 1.02982593, + "epoch": 0.7055463700586202, + "flos": 15880704105600.0, + "grad_norm": 2.4268970887811685, + "language_loss": 0.6969564, + "learning_rate": 8.424929267125829e-07, + "loss": 0.71821761, + "num_input_tokens_seen": 253194805, + "step": 11735, + "time_per_iteration": 2.6809842586517334 + }, + { + "auxiliary_loss_clip": 0.01084684, + "auxiliary_loss_mlp": 0.01039756, + "balance_loss_clip": 1.03542256, + "balance_loss_mlp": 1.02454448, + "epoch": 0.7056064933112881, + "flos": 23076161400960.0, + "grad_norm": 2.1216259487349918, + "language_loss": 0.72249383, + "learning_rate": 8.421753416821933e-07, + "loss": 0.74373823, + "num_input_tokens_seen": 253213895, + "step": 11736, + "time_per_iteration": 2.7173984050750732 + }, + { + "auxiliary_loss_clip": 0.01093397, + "auxiliary_loss_mlp": 0.01028697, + "balance_loss_clip": 1.03913403, + "balance_loss_mlp": 1.01618576, + "epoch": 0.7056666165639561, + "flos": 24057168721920.0, + "grad_norm": 1.8303036629192204, + "language_loss": 0.68867785, + "learning_rate": 8.41857800556629e-07, + "loss": 0.70989877, + "num_input_tokens_seen": 253231620, + "step": 11737, + "time_per_iteration": 2.7358338832855225 + }, + { + "auxiliary_loss_clip": 0.01082807, + "auxiliary_loss_mlp": 0.01039951, + "balance_loss_clip": 1.04143405, + "balance_loss_mlp": 1.02608716, + "epoch": 0.705726739816624, + "flos": 17493237371520.0, + "grad_norm": 2.0769803991045848, + "language_loss": 0.67978764, + "learning_rate": 8.415403033479332e-07, + "loss": 0.70101517, + "num_input_tokens_seen": 253249590, + "step": 11738, + "time_per_iteration": 2.7042016983032227 + }, + { + "auxiliary_loss_clip": 0.01114904, + "auxiliary_loss_mlp": 0.01037563, + "balance_loss_clip": 1.04074073, + "balance_loss_mlp": 1.02349627, + "epoch": 0.7057868630692921, + "flos": 51350426472960.0, + "grad_norm": 1.923264037015028, + "language_loss": 0.75011027, + "learning_rate": 8.41222850068145e-07, + "loss": 0.77163494, + "num_input_tokens_seen": 253273870, + "step": 11739, + "time_per_iteration": 2.9135007858276367 + }, + { + "auxiliary_loss_clip": 0.01084303, + "auxiliary_loss_mlp": 0.00770885, + "balance_loss_clip": 1.03611875, + "balance_loss_mlp": 1.00016105, + "epoch": 0.70584698632196, + "flos": 26102963836800.0, + "grad_norm": 1.6688083494293096, + "language_loss": 0.71504521, + "learning_rate": 8.409054407293032e-07, + "loss": 0.7335971, + "num_input_tokens_seen": 253293720, + "step": 11740, + "time_per_iteration": 2.7146854400634766 + }, + { + "auxiliary_loss_clip": 0.01081608, + "auxiliary_loss_mlp": 0.01029234, + "balance_loss_clip": 1.03897929, + "balance_loss_mlp": 1.01712787, + "epoch": 0.705907109574628, + "flos": 21543134889600.0, + "grad_norm": 1.6427960474243053, + "language_loss": 0.81782758, + "learning_rate": 8.405880753434434e-07, + "loss": 0.83893597, + "num_input_tokens_seen": 253313700, + "step": 11741, + "time_per_iteration": 2.7265563011169434 + }, + { + "auxiliary_loss_clip": 0.01091272, + "auxiliary_loss_mlp": 0.01033057, + "balance_loss_clip": 1.03843784, + "balance_loss_mlp": 1.01918101, + "epoch": 0.705967232827296, + "flos": 22710842127360.0, + "grad_norm": 1.7910600093045685, + "language_loss": 0.77970088, + "learning_rate": 8.402707539225993e-07, + "loss": 0.80094415, + "num_input_tokens_seen": 253332425, + "step": 11742, + "time_per_iteration": 2.744617462158203 + }, + { + "auxiliary_loss_clip": 0.01119104, + "auxiliary_loss_mlp": 0.01034721, + "balance_loss_clip": 1.04196119, + "balance_loss_mlp": 1.02049327, + "epoch": 0.7060273560799639, + "flos": 28691225124480.0, + "grad_norm": 1.5236916078434313, + "language_loss": 0.64199877, + "learning_rate": 8.39953476478805e-07, + "loss": 0.66353697, + "num_input_tokens_seen": 253353620, + "step": 11743, + "time_per_iteration": 2.6587469577789307 + }, + { + "auxiliary_loss_clip": 0.01087403, + "auxiliary_loss_mlp": 0.01037922, + "balance_loss_clip": 1.03589988, + "balance_loss_mlp": 1.02340233, + "epoch": 0.7060874793326319, + "flos": 15706178899200.0, + "grad_norm": 1.87431643891437, + "language_loss": 0.65725398, + "learning_rate": 8.396362430240902e-07, + "loss": 0.67850721, + "num_input_tokens_seen": 253370930, + "step": 11744, + "time_per_iteration": 2.651118278503418 + }, + { + "auxiliary_loss_clip": 0.01100616, + "auxiliary_loss_mlp": 0.01034598, + "balance_loss_clip": 1.03792346, + "balance_loss_mlp": 1.02068591, + "epoch": 0.7061476025852998, + "flos": 21506757390720.0, + "grad_norm": 1.7030797660453072, + "language_loss": 0.63694406, + "learning_rate": 8.393190535704857e-07, + "loss": 0.65829617, + "num_input_tokens_seen": 253389810, + "step": 11745, + "time_per_iteration": 4.299595832824707 + }, + { + "auxiliary_loss_clip": 0.01077796, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.03395259, + "balance_loss_mlp": 1.02075148, + "epoch": 0.7062077258379679, + "flos": 28181832399360.0, + "grad_norm": 1.8209890328260383, + "language_loss": 0.71859854, + "learning_rate": 8.390019081300188e-07, + "loss": 0.73971808, + "num_input_tokens_seen": 253408685, + "step": 11746, + "time_per_iteration": 2.736166000366211 + }, + { + "auxiliary_loss_clip": 0.01057236, + "auxiliary_loss_mlp": 0.01033707, + "balance_loss_clip": 1.03863013, + "balance_loss_mlp": 1.02068353, + "epoch": 0.7062678490906358, + "flos": 27853680723840.0, + "grad_norm": 1.467695639044188, + "language_loss": 0.79042476, + "learning_rate": 8.386848067147175e-07, + "loss": 0.81133419, + "num_input_tokens_seen": 253429685, + "step": 11747, + "time_per_iteration": 4.4075751304626465 + }, + { + "auxiliary_loss_clip": 0.01099667, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.03842044, + "balance_loss_mlp": 1.02088141, + "epoch": 0.7063279723433038, + "flos": 23184862934400.0, + "grad_norm": 1.8618400783249733, + "language_loss": 0.65024734, + "learning_rate": 8.383677493366031e-07, + "loss": 0.67157471, + "num_input_tokens_seen": 253448260, + "step": 11748, + "time_per_iteration": 4.207107305526733 + }, + { + "auxiliary_loss_clip": 0.01067203, + "auxiliary_loss_mlp": 0.01036794, + "balance_loss_clip": 1.03578413, + "balance_loss_mlp": 1.02373433, + "epoch": 0.7063880955959717, + "flos": 20188655907840.0, + "grad_norm": 2.0013045839241337, + "language_loss": 0.79624116, + "learning_rate": 8.380507360077003e-07, + "loss": 0.81728113, + "num_input_tokens_seen": 253467725, + "step": 11749, + "time_per_iteration": 2.8175888061523438 + }, + { + "auxiliary_loss_clip": 0.01033129, + "auxiliary_loss_mlp": 0.01002303, + "balance_loss_clip": 1.01007652, + "balance_loss_mlp": 1.001194, + "epoch": 0.7064482188486397, + "flos": 63668182763520.0, + "grad_norm": 0.7992789353290359, + "language_loss": 0.54014421, + "learning_rate": 8.377337667400304e-07, + "loss": 0.56049848, + "num_input_tokens_seen": 253526940, + "step": 11750, + "time_per_iteration": 3.154975175857544 + }, + { + "auxiliary_loss_clip": 0.01092158, + "auxiliary_loss_mlp": 0.01037282, + "balance_loss_clip": 1.03940368, + "balance_loss_mlp": 1.0240972, + "epoch": 0.7065083421013076, + "flos": 25191227894400.0, + "grad_norm": 1.6870361976430077, + "language_loss": 0.78464556, + "learning_rate": 8.37416841545612e-07, + "loss": 0.80593991, + "num_input_tokens_seen": 253546160, + "step": 11751, + "time_per_iteration": 2.732318878173828 + }, + { + "auxiliary_loss_clip": 0.01074241, + "auxiliary_loss_mlp": 0.01032324, + "balance_loss_clip": 1.03658986, + "balance_loss_mlp": 1.02024841, + "epoch": 0.7065684653539757, + "flos": 22893699288960.0, + "grad_norm": 2.1027740219742928, + "language_loss": 0.67992324, + "learning_rate": 8.370999604364634e-07, + "loss": 0.70098889, + "num_input_tokens_seen": 253565505, + "step": 11752, + "time_per_iteration": 2.810976505279541 + }, + { + "auxiliary_loss_clip": 0.01058629, + "auxiliary_loss_mlp": 0.00771253, + "balance_loss_clip": 1.03738487, + "balance_loss_mlp": 1.00034094, + "epoch": 0.7066285886066436, + "flos": 23550254035200.0, + "grad_norm": 2.005268773121489, + "language_loss": 0.7630111, + "learning_rate": 8.367831234246025e-07, + "loss": 0.7813099, + "num_input_tokens_seen": 253585125, + "step": 11753, + "time_per_iteration": 2.7646682262420654 + }, + { + "auxiliary_loss_clip": 0.01082791, + "auxiliary_loss_mlp": 0.00770025, + "balance_loss_clip": 1.0389396, + "balance_loss_mlp": 1.00026608, + "epoch": 0.7066887118593116, + "flos": 21069293650560.0, + "grad_norm": 1.5279654497487496, + "language_loss": 0.70828259, + "learning_rate": 8.364663305220405e-07, + "loss": 0.72681069, + "num_input_tokens_seen": 253604815, + "step": 11754, + "time_per_iteration": 4.359536170959473 + }, + { + "auxiliary_loss_clip": 0.01072435, + "auxiliary_loss_mlp": 0.01043933, + "balance_loss_clip": 1.03650284, + "balance_loss_mlp": 1.02949619, + "epoch": 0.7067488351119796, + "flos": 21176307244800.0, + "grad_norm": 2.2420199206717104, + "language_loss": 0.89593709, + "learning_rate": 8.361495817407919e-07, + "loss": 0.91710079, + "num_input_tokens_seen": 253622855, + "step": 11755, + "time_per_iteration": 2.682011365890503 + }, + { + "auxiliary_loss_clip": 0.01088944, + "auxiliary_loss_mlp": 0.00770375, + "balance_loss_clip": 1.03828812, + "balance_loss_mlp": 1.00012851, + "epoch": 0.7068089583646475, + "flos": 20449224144000.0, + "grad_norm": 1.7866584888812729, + "language_loss": 0.79776525, + "learning_rate": 8.358328770928678e-07, + "loss": 0.81635845, + "num_input_tokens_seen": 253642760, + "step": 11756, + "time_per_iteration": 2.7065372467041016 + }, + { + "auxiliary_loss_clip": 0.00998647, + "auxiliary_loss_mlp": 0.01001672, + "balance_loss_clip": 1.01305819, + "balance_loss_mlp": 1.00066495, + "epoch": 0.7068690816173155, + "flos": 59109179829120.0, + "grad_norm": 1.037725439626294, + "language_loss": 0.60347986, + "learning_rate": 8.355162165902785e-07, + "loss": 0.62348306, + "num_input_tokens_seen": 253695685, + "step": 11757, + "time_per_iteration": 3.031812906265259 + }, + { + "auxiliary_loss_clip": 0.0107753, + "auxiliary_loss_mlp": 0.01034781, + "balance_loss_clip": 1.03763008, + "balance_loss_mlp": 1.02182341, + "epoch": 0.7069292048699835, + "flos": 16251554073600.0, + "grad_norm": 1.6437193466007092, + "language_loss": 0.80430943, + "learning_rate": 8.351996002450307e-07, + "loss": 0.82543254, + "num_input_tokens_seen": 253713305, + "step": 11758, + "time_per_iteration": 2.655449628829956 + }, + { + "auxiliary_loss_clip": 0.01071031, + "auxiliary_loss_mlp": 0.00770922, + "balance_loss_clip": 1.03758502, + "balance_loss_mlp": 1.000157, + "epoch": 0.7069893281226515, + "flos": 41172768455040.0, + "grad_norm": 2.774739921576683, + "language_loss": 0.77785885, + "learning_rate": 8.348830280691304e-07, + "loss": 0.79627836, + "num_input_tokens_seen": 253736100, + "step": 11759, + "time_per_iteration": 2.8444526195526123 + }, + { + "auxiliary_loss_clip": 0.01101914, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.03790534, + "balance_loss_mlp": 1.01810324, + "epoch": 0.7070494513753194, + "flos": 24207275658240.0, + "grad_norm": 1.9103417618984617, + "language_loss": 0.67560184, + "learning_rate": 8.34566500074583e-07, + "loss": 0.69693744, + "num_input_tokens_seen": 253757350, + "step": 11760, + "time_per_iteration": 2.715236186981201 + }, + { + "auxiliary_loss_clip": 0.01076213, + "auxiliary_loss_mlp": 0.01033273, + "balance_loss_clip": 1.03878856, + "balance_loss_mlp": 1.02084506, + "epoch": 0.7071095746279874, + "flos": 20185675079040.0, + "grad_norm": 1.9224750289593278, + "language_loss": 0.80442196, + "learning_rate": 8.342500162733899e-07, + "loss": 0.82551688, + "num_input_tokens_seen": 253772855, + "step": 11761, + "time_per_iteration": 2.6564581394195557 + }, + { + "auxiliary_loss_clip": 0.01086413, + "auxiliary_loss_mlp": 0.01043874, + "balance_loss_clip": 1.03556442, + "balance_loss_mlp": 1.02777457, + "epoch": 0.7071696978806553, + "flos": 18183045133440.0, + "grad_norm": 2.457600250400544, + "language_loss": 0.75026697, + "learning_rate": 8.33933576677553e-07, + "loss": 0.77156985, + "num_input_tokens_seen": 253790360, + "step": 11762, + "time_per_iteration": 2.6615617275238037 + }, + { + "auxiliary_loss_clip": 0.01087856, + "auxiliary_loss_mlp": 0.01032971, + "balance_loss_clip": 1.0366205, + "balance_loss_mlp": 1.0203824, + "epoch": 0.7072298211333233, + "flos": 24131719399680.0, + "grad_norm": 1.9130475183821334, + "language_loss": 0.76827163, + "learning_rate": 8.336171812990724e-07, + "loss": 0.78947991, + "num_input_tokens_seen": 253810585, + "step": 11763, + "time_per_iteration": 2.7182300090789795 + }, + { + "auxiliary_loss_clip": 0.01083565, + "auxiliary_loss_mlp": 0.00771257, + "balance_loss_clip": 1.03937089, + "balance_loss_mlp": 1.0002048, + "epoch": 0.7072899443859912, + "flos": 27198418867200.0, + "grad_norm": 2.255368812760523, + "language_loss": 0.78756404, + "learning_rate": 8.333008301499453e-07, + "loss": 0.80611229, + "num_input_tokens_seen": 253829080, + "step": 11764, + "time_per_iteration": 2.7578113079071045 + }, + { + "auxiliary_loss_clip": 0.01064836, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.03533673, + "balance_loss_mlp": 1.02406919, + "epoch": 0.7073500676386593, + "flos": 16435596384000.0, + "grad_norm": 1.727630740916822, + "language_loss": 0.79465842, + "learning_rate": 8.32984523242167e-07, + "loss": 0.81568348, + "num_input_tokens_seen": 253846780, + "step": 11765, + "time_per_iteration": 2.7866904735565186 + }, + { + "auxiliary_loss_clip": 0.01109005, + "auxiliary_loss_mlp": 0.01028153, + "balance_loss_clip": 1.03910017, + "balance_loss_mlp": 1.017102, + "epoch": 0.7074101908913272, + "flos": 27673732563840.0, + "grad_norm": 1.7516856530265033, + "language_loss": 0.68398869, + "learning_rate": 8.326682605877324e-07, + "loss": 0.70536023, + "num_input_tokens_seen": 253867075, + "step": 11766, + "time_per_iteration": 2.701338768005371 + }, + { + "auxiliary_loss_clip": 0.01090324, + "auxiliary_loss_mlp": 0.01038357, + "balance_loss_clip": 1.03629494, + "balance_loss_mlp": 1.02530944, + "epoch": 0.7074703141439952, + "flos": 22238078296320.0, + "grad_norm": 2.1562679089059245, + "language_loss": 0.63844228, + "learning_rate": 8.323520421986352e-07, + "loss": 0.65972912, + "num_input_tokens_seen": 253885790, + "step": 11767, + "time_per_iteration": 2.682774543762207 + }, + { + "auxiliary_loss_clip": 0.01101727, + "auxiliary_loss_mlp": 0.01027229, + "balance_loss_clip": 1.03870296, + "balance_loss_mlp": 1.01452112, + "epoch": 0.7075304373966632, + "flos": 29643217234560.0, + "grad_norm": 1.942959612416706, + "language_loss": 0.5247106, + "learning_rate": 8.320358680868646e-07, + "loss": 0.54600012, + "num_input_tokens_seen": 253907070, + "step": 11768, + "time_per_iteration": 2.753188133239746 + }, + { + "auxiliary_loss_clip": 0.01088433, + "auxiliary_loss_mlp": 0.00770686, + "balance_loss_clip": 1.03836966, + "balance_loss_mlp": 1.00015306, + "epoch": 0.7075905606493311, + "flos": 19755214490880.0, + "grad_norm": 1.6006404938341818, + "language_loss": 0.75532562, + "learning_rate": 8.317197382644119e-07, + "loss": 0.77391684, + "num_input_tokens_seen": 253927290, + "step": 11769, + "time_per_iteration": 2.7288544178009033 + }, + { + "auxiliary_loss_clip": 0.01013903, + "auxiliary_loss_mlp": 0.01015517, + "balance_loss_clip": 1.00881553, + "balance_loss_mlp": 1.01409209, + "epoch": 0.7076506839019991, + "flos": 65716132694400.0, + "grad_norm": 0.8537079866047984, + "language_loss": 0.61981726, + "learning_rate": 8.314036527432637e-07, + "loss": 0.64011145, + "num_input_tokens_seen": 253983440, + "step": 11770, + "time_per_iteration": 3.14176607131958 + }, + { + "auxiliary_loss_clip": 0.01078902, + "auxiliary_loss_mlp": 0.01036173, + "balance_loss_clip": 1.03566778, + "balance_loss_mlp": 1.02311409, + "epoch": 0.707710807154667, + "flos": 23765286804480.0, + "grad_norm": 1.7412518327218227, + "language_loss": 0.76630473, + "learning_rate": 8.310876115354055e-07, + "loss": 0.7874555, + "num_input_tokens_seen": 254003825, + "step": 11771, + "time_per_iteration": 2.8118982315063477 + }, + { + "auxiliary_loss_clip": 0.01097524, + "auxiliary_loss_mlp": 0.01028735, + "balance_loss_clip": 1.03753102, + "balance_loss_mlp": 1.01656938, + "epoch": 0.7077709304073351, + "flos": 21251360712960.0, + "grad_norm": 1.6484169639122062, + "language_loss": 0.70931244, + "learning_rate": 8.307716146528221e-07, + "loss": 0.73057497, + "num_input_tokens_seen": 254023345, + "step": 11772, + "time_per_iteration": 2.6268296241760254 + }, + { + "auxiliary_loss_clip": 0.01063148, + "auxiliary_loss_mlp": 0.01033223, + "balance_loss_clip": 1.03564882, + "balance_loss_mlp": 1.01975262, + "epoch": 0.707831053660003, + "flos": 20740746925440.0, + "grad_norm": 1.8398984713825175, + "language_loss": 0.69417477, + "learning_rate": 8.30455662107496e-07, + "loss": 0.71513855, + "num_input_tokens_seen": 254041815, + "step": 11773, + "time_per_iteration": 2.7778313159942627 + }, + { + "auxiliary_loss_clip": 0.01104178, + "auxiliary_loss_mlp": 0.01034694, + "balance_loss_clip": 1.03965759, + "balance_loss_mlp": 1.02178395, + "epoch": 0.707891176912671, + "flos": 21980993679360.0, + "grad_norm": 1.4556965212861144, + "language_loss": 0.7014932, + "learning_rate": 8.301397539114095e-07, + "loss": 0.72288191, + "num_input_tokens_seen": 254062065, + "step": 11774, + "time_per_iteration": 2.68330979347229 + }, + { + "auxiliary_loss_clip": 0.01081938, + "auxiliary_loss_mlp": 0.01028977, + "balance_loss_clip": 1.04048491, + "balance_loss_mlp": 1.01713347, + "epoch": 0.7079513001653389, + "flos": 21068970428160.0, + "grad_norm": 1.5670559751490778, + "language_loss": 0.74400485, + "learning_rate": 8.298238900765407e-07, + "loss": 0.76511401, + "num_input_tokens_seen": 254080605, + "step": 11775, + "time_per_iteration": 2.672057628631592 + }, + { + "auxiliary_loss_clip": 0.01074662, + "auxiliary_loss_mlp": 0.00770567, + "balance_loss_clip": 1.03893805, + "balance_loss_mlp": 1.00032187, + "epoch": 0.7080114234180069, + "flos": 18040659621120.0, + "grad_norm": 1.9150110614912736, + "language_loss": 0.86714977, + "learning_rate": 8.295080706148665e-07, + "loss": 0.88560206, + "num_input_tokens_seen": 254098710, + "step": 11776, + "time_per_iteration": 2.68167781829834 + }, + { + "auxiliary_loss_clip": 0.01093973, + "auxiliary_loss_mlp": 0.01034049, + "balance_loss_clip": 1.03666544, + "balance_loss_mlp": 1.02201438, + "epoch": 0.7080715466706748, + "flos": 15122271409920.0, + "grad_norm": 1.4933339123942304, + "language_loss": 0.75204122, + "learning_rate": 8.291922955383641e-07, + "loss": 0.77332139, + "num_input_tokens_seen": 254117200, + "step": 11777, + "time_per_iteration": 2.617124319076538 + }, + { + "auxiliary_loss_clip": 0.0109467, + "auxiliary_loss_mlp": 0.01033372, + "balance_loss_clip": 1.04061341, + "balance_loss_mlp": 1.02046156, + "epoch": 0.7081316699233429, + "flos": 14422802889600.0, + "grad_norm": 2.468930422112918, + "language_loss": 0.8228538, + "learning_rate": 8.288765648590066e-07, + "loss": 0.84413421, + "num_input_tokens_seen": 254132115, + "step": 11778, + "time_per_iteration": 2.7087488174438477 + }, + { + "auxiliary_loss_clip": 0.01082719, + "auxiliary_loss_mlp": 0.0103363, + "balance_loss_clip": 1.03583169, + "balance_loss_mlp": 1.02246594, + "epoch": 0.7081917931760108, + "flos": 23222389668480.0, + "grad_norm": 1.6098616985666978, + "language_loss": 0.85021019, + "learning_rate": 8.285608785887673e-07, + "loss": 0.87137371, + "num_input_tokens_seen": 254152285, + "step": 11779, + "time_per_iteration": 2.6744067668914795 + }, + { + "auxiliary_loss_clip": 0.0108855, + "auxiliary_loss_mlp": 0.01032676, + "balance_loss_clip": 1.03944063, + "balance_loss_mlp": 1.01993263, + "epoch": 0.7082519164286788, + "flos": 39308429871360.0, + "grad_norm": 2.2882732237177326, + "language_loss": 0.72005677, + "learning_rate": 8.28245236739618e-07, + "loss": 0.74126905, + "num_input_tokens_seen": 254172805, + "step": 11780, + "time_per_iteration": 2.8406546115875244 + }, + { + "auxiliary_loss_clip": 0.0105972, + "auxiliary_loss_mlp": 0.010316, + "balance_loss_clip": 1.03477693, + "balance_loss_mlp": 1.01896429, + "epoch": 0.7083120396813467, + "flos": 21651154064640.0, + "grad_norm": 1.4192183070754045, + "language_loss": 0.73349321, + "learning_rate": 8.279296393235256e-07, + "loss": 0.75440645, + "num_input_tokens_seen": 254191890, + "step": 11781, + "time_per_iteration": 2.8251590728759766 + }, + { + "auxiliary_loss_clip": 0.01099337, + "auxiliary_loss_mlp": 0.01032456, + "balance_loss_clip": 1.0398531, + "balance_loss_mlp": 1.02066612, + "epoch": 0.7083721629340147, + "flos": 17567033863680.0, + "grad_norm": 1.678957129171523, + "language_loss": 0.77408248, + "learning_rate": 8.276140863524585e-07, + "loss": 0.79540044, + "num_input_tokens_seen": 254210150, + "step": 11782, + "time_per_iteration": 2.6758499145507812 + }, + { + "auxiliary_loss_clip": 0.0108554, + "auxiliary_loss_mlp": 0.01029552, + "balance_loss_clip": 1.03717136, + "balance_loss_mlp": 1.01893663, + "epoch": 0.7084322861866827, + "flos": 29350509304320.0, + "grad_norm": 1.5187238607788938, + "language_loss": 0.69871926, + "learning_rate": 8.272985778383828e-07, + "loss": 0.71987015, + "num_input_tokens_seen": 254233015, + "step": 11783, + "time_per_iteration": 2.8378536701202393 + }, + { + "auxiliary_loss_clip": 0.01073688, + "auxiliary_loss_mlp": 0.01028822, + "balance_loss_clip": 1.03804398, + "balance_loss_mlp": 1.01622784, + "epoch": 0.7084924094393507, + "flos": 20194294343040.0, + "grad_norm": 1.5952065311243613, + "language_loss": 0.78930736, + "learning_rate": 8.269831137932632e-07, + "loss": 0.81033248, + "num_input_tokens_seen": 254251345, + "step": 11784, + "time_per_iteration": 4.362036943435669 + }, + { + "auxiliary_loss_clip": 0.01111276, + "auxiliary_loss_mlp": 0.0103335, + "balance_loss_clip": 1.04004228, + "balance_loss_mlp": 1.02080894, + "epoch": 0.7085525326920187, + "flos": 23477211728640.0, + "grad_norm": 2.08544088937736, + "language_loss": 0.77696943, + "learning_rate": 8.266676942290609e-07, + "loss": 0.79841572, + "num_input_tokens_seen": 254269905, + "step": 11785, + "time_per_iteration": 2.5937209129333496 + }, + { + "auxiliary_loss_clip": 0.01085039, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.0364778, + "balance_loss_mlp": 1.02091932, + "epoch": 0.7086126559446866, + "flos": 25958818558080.0, + "grad_norm": 1.9412182789537995, + "language_loss": 0.78004217, + "learning_rate": 8.26352319157738e-07, + "loss": 0.80123466, + "num_input_tokens_seen": 254289990, + "step": 11786, + "time_per_iteration": 4.211290121078491 + }, + { + "auxiliary_loss_clip": 0.01113302, + "auxiliary_loss_mlp": 0.01030804, + "balance_loss_clip": 1.03969085, + "balance_loss_mlp": 1.01798928, + "epoch": 0.7086727791973546, + "flos": 26724793109760.0, + "grad_norm": 1.9518945204498503, + "language_loss": 0.78987539, + "learning_rate": 8.260369885912526e-07, + "loss": 0.81131643, + "num_input_tokens_seen": 254309085, + "step": 11787, + "time_per_iteration": 4.215709447860718 + }, + { + "auxiliary_loss_clip": 0.01100936, + "auxiliary_loss_mlp": 0.01032335, + "balance_loss_clip": 1.0393877, + "balance_loss_mlp": 1.02004457, + "epoch": 0.7087329024500225, + "flos": 21683365585920.0, + "grad_norm": 1.963463516719362, + "language_loss": 0.76586342, + "learning_rate": 8.257217025415615e-07, + "loss": 0.7871961, + "num_input_tokens_seen": 254327045, + "step": 11788, + "time_per_iteration": 2.6296236515045166 + }, + { + "auxiliary_loss_clip": 0.0107305, + "auxiliary_loss_mlp": 0.01037785, + "balance_loss_clip": 1.03411317, + "balance_loss_mlp": 1.02229357, + "epoch": 0.7087930257026905, + "flos": 17931060247680.0, + "grad_norm": 1.8171493958934544, + "language_loss": 0.67838019, + "learning_rate": 8.254064610206212e-07, + "loss": 0.69948852, + "num_input_tokens_seen": 254344585, + "step": 11789, + "time_per_iteration": 2.664304733276367 + }, + { + "auxiliary_loss_clip": 0.0105779, + "auxiliary_loss_mlp": 0.01034389, + "balance_loss_clip": 1.03825188, + "balance_loss_mlp": 1.02094257, + "epoch": 0.7088531489553584, + "flos": 18911528864640.0, + "grad_norm": 1.6253389079610685, + "language_loss": 0.77915251, + "learning_rate": 8.250912640403858e-07, + "loss": 0.80007434, + "num_input_tokens_seen": 254362470, + "step": 11790, + "time_per_iteration": 2.745398759841919 + }, + { + "auxiliary_loss_clip": 0.01093327, + "auxiliary_loss_mlp": 0.01033055, + "balance_loss_clip": 1.03802967, + "balance_loss_mlp": 1.01917279, + "epoch": 0.7089132722080265, + "flos": 27380880979200.0, + "grad_norm": 2.6743877386072046, + "language_loss": 0.70789683, + "learning_rate": 8.247761116128085e-07, + "loss": 0.72916067, + "num_input_tokens_seen": 254383190, + "step": 11791, + "time_per_iteration": 2.7536044120788574 + }, + { + "auxiliary_loss_clip": 0.0110278, + "auxiliary_loss_mlp": 0.01035298, + "balance_loss_clip": 1.04042172, + "balance_loss_mlp": 1.02178025, + "epoch": 0.7089733954606944, + "flos": 22162917087360.0, + "grad_norm": 1.574032400084743, + "language_loss": 0.82329011, + "learning_rate": 8.244610037498376e-07, + "loss": 0.84467089, + "num_input_tokens_seen": 254403115, + "step": 11792, + "time_per_iteration": 2.658579111099243 + }, + { + "auxiliary_loss_clip": 0.01071076, + "auxiliary_loss_mlp": 0.01032048, + "balance_loss_clip": 1.03814042, + "balance_loss_mlp": 1.01898229, + "epoch": 0.7090335187133624, + "flos": 24425827960320.0, + "grad_norm": 2.4251661207172406, + "language_loss": 0.64878172, + "learning_rate": 8.241459404634232e-07, + "loss": 0.66981292, + "num_input_tokens_seen": 254421875, + "step": 11793, + "time_per_iteration": 4.261074066162109 + }, + { + "auxiliary_loss_clip": 0.01097375, + "auxiliary_loss_mlp": 0.01035971, + "balance_loss_clip": 1.03896296, + "balance_loss_mlp": 1.02244079, + "epoch": 0.7090936419660303, + "flos": 21835232288640.0, + "grad_norm": 5.81708329493613, + "language_loss": 0.70618987, + "learning_rate": 8.238309217655133e-07, + "loss": 0.72752333, + "num_input_tokens_seen": 254440765, + "step": 11794, + "time_per_iteration": 2.6876423358917236 + }, + { + "auxiliary_loss_clip": 0.01091573, + "auxiliary_loss_mlp": 0.01037583, + "balance_loss_clip": 1.04035616, + "balance_loss_mlp": 1.02522707, + "epoch": 0.7091537652186983, + "flos": 20082360585600.0, + "grad_norm": 1.8634208156139904, + "language_loss": 0.76080108, + "learning_rate": 8.23515947668052e-07, + "loss": 0.78209263, + "num_input_tokens_seen": 254459480, + "step": 11795, + "time_per_iteration": 2.63566255569458 + }, + { + "auxiliary_loss_clip": 0.01075226, + "auxiliary_loss_mlp": 0.01033753, + "balance_loss_clip": 1.03935122, + "balance_loss_mlp": 1.02176011, + "epoch": 0.7092138884713663, + "flos": 13151565676800.0, + "grad_norm": 2.3261568456734816, + "language_loss": 0.75312549, + "learning_rate": 8.232010181829838e-07, + "loss": 0.77421528, + "num_input_tokens_seen": 254473985, + "step": 11796, + "time_per_iteration": 2.716097116470337 + }, + { + "auxiliary_loss_clip": 0.01103014, + "auxiliary_loss_mlp": 0.01042156, + "balance_loss_clip": 1.03999233, + "balance_loss_mlp": 1.02640212, + "epoch": 0.7092740117240343, + "flos": 21645982506240.0, + "grad_norm": 1.559472378141648, + "language_loss": 0.74076355, + "learning_rate": 8.228861333222523e-07, + "loss": 0.76221526, + "num_input_tokens_seen": 254492135, + "step": 11797, + "time_per_iteration": 2.6320409774780273 + }, + { + "auxiliary_loss_clip": 0.0106907, + "auxiliary_loss_mlp": 0.01035432, + "balance_loss_clip": 1.03879786, + "balance_loss_mlp": 1.02290332, + "epoch": 0.7093341349767023, + "flos": 21032521102080.0, + "grad_norm": 1.5130058981356065, + "language_loss": 0.79285604, + "learning_rate": 8.225712930977953e-07, + "loss": 0.81390107, + "num_input_tokens_seen": 254512865, + "step": 11798, + "time_per_iteration": 2.7878127098083496 + }, + { + "auxiliary_loss_clip": 0.01079128, + "auxiliary_loss_mlp": 0.01040993, + "balance_loss_clip": 1.03444886, + "balance_loss_mlp": 1.02752233, + "epoch": 0.7093942582293702, + "flos": 22017658487040.0, + "grad_norm": 1.8215442061382334, + "language_loss": 0.6698848, + "learning_rate": 8.222564975215529e-07, + "loss": 0.69108605, + "num_input_tokens_seen": 254532605, + "step": 11799, + "time_per_iteration": 2.6869001388549805 + }, + { + "auxiliary_loss_clip": 0.01112483, + "auxiliary_loss_mlp": 0.01028449, + "balance_loss_clip": 1.03966284, + "balance_loss_mlp": 1.01535368, + "epoch": 0.7094543814820382, + "flos": 27235586465280.0, + "grad_norm": 1.576526567313424, + "language_loss": 0.81716406, + "learning_rate": 8.219417466054622e-07, + "loss": 0.8385734, + "num_input_tokens_seen": 254553780, + "step": 11800, + "time_per_iteration": 2.658963680267334 + }, + { + "auxiliary_loss_clip": 0.01088302, + "auxiliary_loss_mlp": 0.01033764, + "balance_loss_clip": 1.03797638, + "balance_loss_mlp": 1.02189112, + "epoch": 0.7095145047347061, + "flos": 12089148180480.0, + "grad_norm": 1.9061714801555572, + "language_loss": 0.86517024, + "learning_rate": 8.21627040361459e-07, + "loss": 0.88639092, + "num_input_tokens_seen": 254567510, + "step": 11801, + "time_per_iteration": 2.6748046875 + }, + { + "auxiliary_loss_clip": 0.0111264, + "auxiliary_loss_mlp": 0.0103384, + "balance_loss_clip": 1.03984725, + "balance_loss_mlp": 1.02127492, + "epoch": 0.7095746279873741, + "flos": 19383789905280.0, + "grad_norm": 1.9051932445720021, + "language_loss": 0.7623291, + "learning_rate": 8.213123788014758e-07, + "loss": 0.78379387, + "num_input_tokens_seen": 254585565, + "step": 11802, + "time_per_iteration": 2.618805170059204 + }, + { + "auxiliary_loss_clip": 0.01097308, + "auxiliary_loss_mlp": 0.01046076, + "balance_loss_clip": 1.03798604, + "balance_loss_mlp": 1.03312433, + "epoch": 0.709634751240042, + "flos": 21360600950400.0, + "grad_norm": 1.6390429241877804, + "language_loss": 0.81943619, + "learning_rate": 8.209977619374462e-07, + "loss": 0.84087008, + "num_input_tokens_seen": 254603465, + "step": 11803, + "time_per_iteration": 2.68537974357605 + }, + { + "auxiliary_loss_clip": 0.01112366, + "auxiliary_loss_mlp": 0.01034568, + "balance_loss_clip": 1.03814209, + "balance_loss_mlp": 1.02085912, + "epoch": 0.7096948744927101, + "flos": 13917037438080.0, + "grad_norm": 2.293538036404514, + "language_loss": 0.67322147, + "learning_rate": 8.206831897812995e-07, + "loss": 0.69469082, + "num_input_tokens_seen": 254620500, + "step": 11804, + "time_per_iteration": 2.642585277557373 + }, + { + "auxiliary_loss_clip": 0.01097953, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.03816247, + "balance_loss_mlp": 1.01730967, + "epoch": 0.709754997745378, + "flos": 30298335436800.0, + "grad_norm": 1.964223724359439, + "language_loss": 0.78081644, + "learning_rate": 8.203686623449637e-07, + "loss": 0.80208147, + "num_input_tokens_seen": 254638565, + "step": 11805, + "time_per_iteration": 2.720667600631714 + }, + { + "auxiliary_loss_clip": 0.01091353, + "auxiliary_loss_mlp": 0.00771338, + "balance_loss_clip": 1.03825855, + "balance_loss_mlp": 1.00015116, + "epoch": 0.709815120998046, + "flos": 18515147304960.0, + "grad_norm": 9.192055527956402, + "language_loss": 0.79064679, + "learning_rate": 8.200541796403667e-07, + "loss": 0.80927366, + "num_input_tokens_seen": 254657505, + "step": 11806, + "time_per_iteration": 2.681230306625366 + }, + { + "auxiliary_loss_clip": 0.01083674, + "auxiliary_loss_mlp": 0.01041523, + "balance_loss_clip": 1.03755128, + "balance_loss_mlp": 1.02857089, + "epoch": 0.7098752442507139, + "flos": 22272588288000.0, + "grad_norm": 2.0898066634684573, + "language_loss": 0.56422603, + "learning_rate": 8.197397416794332e-07, + "loss": 0.58547801, + "num_input_tokens_seen": 254674730, + "step": 11807, + "time_per_iteration": 2.734550714492798 + }, + { + "auxiliary_loss_clip": 0.01114828, + "auxiliary_loss_mlp": 0.01043868, + "balance_loss_clip": 1.03833497, + "balance_loss_mlp": 1.03099334, + "epoch": 0.7099353675033819, + "flos": 19275447507840.0, + "grad_norm": 2.0105184465729464, + "language_loss": 0.68802261, + "learning_rate": 8.194253484740882e-07, + "loss": 0.70960963, + "num_input_tokens_seen": 254691665, + "step": 11808, + "time_per_iteration": 2.6423966884613037 + }, + { + "auxiliary_loss_clip": 0.01098171, + "auxiliary_loss_mlp": 0.01032542, + "balance_loss_clip": 1.03855026, + "balance_loss_mlp": 1.02025712, + "epoch": 0.70999549075605, + "flos": 21908525990400.0, + "grad_norm": 2.1402280143316834, + "language_loss": 0.71625412, + "learning_rate": 8.191110000362513e-07, + "loss": 0.73756123, + "num_input_tokens_seen": 254711610, + "step": 11809, + "time_per_iteration": 2.627044200897217 + }, + { + "auxiliary_loss_clip": 0.01031591, + "auxiliary_loss_mlp": 0.0100231, + "balance_loss_clip": 1.00862455, + "balance_loss_mlp": 1.00124347, + "epoch": 0.7100556140087179, + "flos": 70456053456000.0, + "grad_norm": 0.7494928075068129, + "language_loss": 0.5943656, + "learning_rate": 8.187966963778435e-07, + "loss": 0.61470461, + "num_input_tokens_seen": 254772615, + "step": 11810, + "time_per_iteration": 3.2029061317443848 + }, + { + "auxiliary_loss_clip": 0.01033991, + "auxiliary_loss_mlp": 0.01048321, + "balance_loss_clip": 1.03268588, + "balance_loss_mlp": 1.03488612, + "epoch": 0.7101157372613859, + "flos": 23039568420480.0, + "grad_norm": 2.72885983888825, + "language_loss": 0.74159658, + "learning_rate": 8.18482437510784e-07, + "loss": 0.7624197, + "num_input_tokens_seen": 254791375, + "step": 11811, + "time_per_iteration": 2.8374974727630615 + }, + { + "auxiliary_loss_clip": 0.01073985, + "auxiliary_loss_mlp": 0.01027518, + "balance_loss_clip": 1.03985, + "balance_loss_mlp": 1.015275, + "epoch": 0.7101758605140538, + "flos": 23185329811200.0, + "grad_norm": 2.0242190076468223, + "language_loss": 0.83632278, + "learning_rate": 8.181682234469882e-07, + "loss": 0.85733783, + "num_input_tokens_seen": 254809300, + "step": 11812, + "time_per_iteration": 2.757760763168335 + }, + { + "auxiliary_loss_clip": 0.01114938, + "auxiliary_loss_mlp": 0.01031671, + "balance_loss_clip": 1.04106176, + "balance_loss_mlp": 1.01833153, + "epoch": 0.7102359837667218, + "flos": 23696123166720.0, + "grad_norm": 1.5968911597601785, + "language_loss": 0.6982094, + "learning_rate": 8.178540541983716e-07, + "loss": 0.71967542, + "num_input_tokens_seen": 254829325, + "step": 11813, + "time_per_iteration": 2.593907594680786 + }, + { + "auxiliary_loss_clip": 0.01109186, + "auxiliary_loss_mlp": 0.01028954, + "balance_loss_clip": 1.03841877, + "balance_loss_mlp": 1.01689565, + "epoch": 0.7102961070193897, + "flos": 19391116279680.0, + "grad_norm": 1.9874956852039145, + "language_loss": 0.81565011, + "learning_rate": 8.175399297768495e-07, + "loss": 0.83703148, + "num_input_tokens_seen": 254847690, + "step": 11814, + "time_per_iteration": 2.5443472862243652 + }, + { + "auxiliary_loss_clip": 0.01112342, + "auxiliary_loss_mlp": 0.01033274, + "balance_loss_clip": 1.04030275, + "balance_loss_mlp": 1.02032149, + "epoch": 0.7103562302720577, + "flos": 21507511576320.0, + "grad_norm": 2.7298772348158074, + "language_loss": 0.75506926, + "learning_rate": 8.172258501943301e-07, + "loss": 0.77652538, + "num_input_tokens_seen": 254865960, + "step": 11815, + "time_per_iteration": 2.5428481101989746 + }, + { + "auxiliary_loss_clip": 0.01067291, + "auxiliary_loss_mlp": 0.01031233, + "balance_loss_clip": 1.03749645, + "balance_loss_mlp": 1.01881731, + "epoch": 0.7104163535247257, + "flos": 14535059869440.0, + "grad_norm": 1.7974303130693923, + "language_loss": 0.78488684, + "learning_rate": 8.16911815462725e-07, + "loss": 0.80587208, + "num_input_tokens_seen": 254882815, + "step": 11816, + "time_per_iteration": 2.7543234825134277 + }, + { + "auxiliary_loss_clip": 0.01085859, + "auxiliary_loss_mlp": 0.01038436, + "balance_loss_clip": 1.03845906, + "balance_loss_mlp": 1.02579379, + "epoch": 0.7104764767773937, + "flos": 11400310085760.0, + "grad_norm": 1.72092645420432, + "language_loss": 0.86654431, + "learning_rate": 8.165978255939426e-07, + "loss": 0.88778722, + "num_input_tokens_seen": 254898705, + "step": 11817, + "time_per_iteration": 2.6052346229553223 + }, + { + "auxiliary_loss_clip": 0.01064818, + "auxiliary_loss_mlp": 0.01029492, + "balance_loss_clip": 1.0393579, + "balance_loss_mlp": 1.01749921, + "epoch": 0.7105366000300616, + "flos": 11690432236800.0, + "grad_norm": 2.3052427315849964, + "language_loss": 0.848396, + "learning_rate": 8.162838805998897e-07, + "loss": 0.86933911, + "num_input_tokens_seen": 254913665, + "step": 11818, + "time_per_iteration": 2.664659023284912 + }, + { + "auxiliary_loss_clip": 0.01111214, + "auxiliary_loss_mlp": 0.01029578, + "balance_loss_clip": 1.03756952, + "balance_loss_mlp": 1.01640534, + "epoch": 0.7105967232827296, + "flos": 19354020508800.0, + "grad_norm": 2.1251714303337006, + "language_loss": 0.76013577, + "learning_rate": 8.159699804924709e-07, + "loss": 0.78154367, + "num_input_tokens_seen": 254932140, + "step": 11819, + "time_per_iteration": 2.5721442699432373 + }, + { + "auxiliary_loss_clip": 0.01069448, + "auxiliary_loss_mlp": 0.01034158, + "balance_loss_clip": 1.0366652, + "balance_loss_mlp": 1.01895833, + "epoch": 0.7106568465353975, + "flos": 22930400010240.0, + "grad_norm": 1.554661416155005, + "language_loss": 0.70843577, + "learning_rate": 8.156561252835883e-07, + "loss": 0.7294718, + "num_input_tokens_seen": 254951580, + "step": 11820, + "time_per_iteration": 2.7395031452178955 + }, + { + "auxiliary_loss_clip": 0.01101119, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.03955543, + "balance_loss_mlp": 1.01675773, + "epoch": 0.7107169697880655, + "flos": 19099665325440.0, + "grad_norm": 1.8332126356579708, + "language_loss": 0.75666863, + "learning_rate": 8.153423149851449e-07, + "loss": 0.7779721, + "num_input_tokens_seen": 254969425, + "step": 11821, + "time_per_iteration": 2.6001696586608887 + }, + { + "auxiliary_loss_clip": 0.00987426, + "auxiliary_loss_mlp": 0.00999944, + "balance_loss_clip": 1.01348448, + "balance_loss_mlp": 0.99880552, + "epoch": 0.7107770930407336, + "flos": 63638054231040.0, + "grad_norm": 0.7717757980179868, + "language_loss": 0.5505957, + "learning_rate": 8.150285496090388e-07, + "loss": 0.57046944, + "num_input_tokens_seen": 255032680, + "step": 11822, + "time_per_iteration": 3.295065402984619 + }, + { + "auxiliary_loss_clip": 0.0109566, + "auxiliary_loss_mlp": 0.01026855, + "balance_loss_clip": 1.03837609, + "balance_loss_mlp": 1.01429629, + "epoch": 0.7108372162934015, + "flos": 22054466949120.0, + "grad_norm": 1.928380251227047, + "language_loss": 0.60496062, + "learning_rate": 8.147148291671688e-07, + "loss": 0.62618577, + "num_input_tokens_seen": 255054400, + "step": 11823, + "time_per_iteration": 2.6415092945098877 + }, + { + "auxiliary_loss_clip": 0.01099793, + "auxiliary_loss_mlp": 0.01032476, + "balance_loss_clip": 1.03883433, + "balance_loss_mlp": 1.02019811, + "epoch": 0.7108973395460695, + "flos": 19135144984320.0, + "grad_norm": 2.0421558606422434, + "language_loss": 0.71593511, + "learning_rate": 8.144011536714322e-07, + "loss": 0.73725778, + "num_input_tokens_seen": 255072785, + "step": 11824, + "time_per_iteration": 4.298635244369507 + }, + { + "auxiliary_loss_clip": 0.01077795, + "auxiliary_loss_mlp": 0.00772624, + "balance_loss_clip": 1.03366399, + "balance_loss_mlp": 1.00021195, + "epoch": 0.7109574627987374, + "flos": 17894431353600.0, + "grad_norm": 2.7239449344013322, + "language_loss": 0.72674167, + "learning_rate": 8.140875231337223e-07, + "loss": 0.74524581, + "num_input_tokens_seen": 255091820, + "step": 11825, + "time_per_iteration": 2.652414083480835 + }, + { + "auxiliary_loss_clip": 0.01081872, + "auxiliary_loss_mlp": 0.01031095, + "balance_loss_clip": 1.03761208, + "balance_loss_mlp": 1.01838112, + "epoch": 0.7110175860514054, + "flos": 28979623422720.0, + "grad_norm": 1.6201501744547915, + "language_loss": 0.79405123, + "learning_rate": 8.137739375659321e-07, + "loss": 0.8151809, + "num_input_tokens_seen": 255111720, + "step": 11826, + "time_per_iteration": 4.22081995010376 + }, + { + "auxiliary_loss_clip": 0.01098598, + "auxiliary_loss_mlp": 0.01034594, + "balance_loss_clip": 1.03932214, + "balance_loss_mlp": 1.02239263, + "epoch": 0.7110777093040733, + "flos": 26173312623360.0, + "grad_norm": 1.462780118765175, + "language_loss": 0.8310101, + "learning_rate": 8.134603969799527e-07, + "loss": 0.85234201, + "num_input_tokens_seen": 255133495, + "step": 11827, + "time_per_iteration": 4.226747512817383 + }, + { + "auxiliary_loss_clip": 0.01079454, + "auxiliary_loss_mlp": 0.01032813, + "balance_loss_clip": 1.03688717, + "balance_loss_mlp": 1.01972437, + "epoch": 0.7111378325567413, + "flos": 26869943969280.0, + "grad_norm": 1.4792451308451542, + "language_loss": 0.6237936, + "learning_rate": 8.131469013876748e-07, + "loss": 0.6449163, + "num_input_tokens_seen": 255156880, + "step": 11828, + "time_per_iteration": 2.7983498573303223 + }, + { + "auxiliary_loss_clip": 0.01111659, + "auxiliary_loss_mlp": 0.01034434, + "balance_loss_clip": 1.03956318, + "balance_loss_mlp": 1.02155936, + "epoch": 0.7111979558094093, + "flos": 27271820309760.0, + "grad_norm": 1.434450077194213, + "language_loss": 0.72024685, + "learning_rate": 8.128334508009846e-07, + "loss": 0.7417078, + "num_input_tokens_seen": 255178920, + "step": 11829, + "time_per_iteration": 2.6990365982055664 + }, + { + "auxiliary_loss_clip": 0.01111652, + "auxiliary_loss_mlp": 0.01034605, + "balance_loss_clip": 1.04012764, + "balance_loss_mlp": 1.02254748, + "epoch": 0.7112580790620773, + "flos": 25046938961280.0, + "grad_norm": 1.7220593364674301, + "language_loss": 0.80250454, + "learning_rate": 8.125200452317697e-07, + "loss": 0.8239671, + "num_input_tokens_seen": 255198095, + "step": 11830, + "time_per_iteration": 2.573199987411499 + }, + { + "auxiliary_loss_clip": 0.01099477, + "auxiliary_loss_mlp": 0.0103532, + "balance_loss_clip": 1.0376153, + "balance_loss_mlp": 1.02277327, + "epoch": 0.7113182023147452, + "flos": 21646628951040.0, + "grad_norm": 1.7248457668834243, + "language_loss": 0.84357107, + "learning_rate": 8.122066846919138e-07, + "loss": 0.86491901, + "num_input_tokens_seen": 255215860, + "step": 11831, + "time_per_iteration": 2.6142139434814453 + }, + { + "auxiliary_loss_clip": 0.01088822, + "auxiliary_loss_mlp": 0.01032858, + "balance_loss_clip": 1.0360502, + "balance_loss_mlp": 1.02048969, + "epoch": 0.7113783255674132, + "flos": 20996287257600.0, + "grad_norm": 3.6930845590637986, + "language_loss": 0.77417958, + "learning_rate": 8.118933691932985e-07, + "loss": 0.79539645, + "num_input_tokens_seen": 255235425, + "step": 11832, + "time_per_iteration": 2.6712517738342285 + }, + { + "auxiliary_loss_clip": 0.01020951, + "auxiliary_loss_mlp": 0.01006539, + "balance_loss_clip": 1.00784588, + "balance_loss_mlp": 1.00549638, + "epoch": 0.7114384488200811, + "flos": 66771080161920.0, + "grad_norm": 0.7440594945981316, + "language_loss": 0.56577992, + "learning_rate": 8.115800987478059e-07, + "loss": 0.5860548, + "num_input_tokens_seen": 255291680, + "step": 11833, + "time_per_iteration": 4.970557689666748 + }, + { + "auxiliary_loss_clip": 0.01063684, + "auxiliary_loss_mlp": 0.01035215, + "balance_loss_clip": 1.03803515, + "balance_loss_mlp": 1.02324665, + "epoch": 0.7114985720727491, + "flos": 25010058672000.0, + "grad_norm": 1.530403195160948, + "language_loss": 0.70702851, + "learning_rate": 8.11266873367315e-07, + "loss": 0.72801757, + "num_input_tokens_seen": 255313880, + "step": 11834, + "time_per_iteration": 2.814005136489868 + }, + { + "auxiliary_loss_clip": 0.0111468, + "auxiliary_loss_mlp": 0.01035977, + "balance_loss_clip": 1.04097462, + "balance_loss_mlp": 1.02278614, + "epoch": 0.7115586953254172, + "flos": 21470128496640.0, + "grad_norm": 2.039356893023201, + "language_loss": 0.79006612, + "learning_rate": 8.10953693063704e-07, + "loss": 0.81157267, + "num_input_tokens_seen": 255332390, + "step": 11835, + "time_per_iteration": 2.6193342208862305 + }, + { + "auxiliary_loss_clip": 0.01098428, + "auxiliary_loss_mlp": 0.01030957, + "balance_loss_clip": 1.0383265, + "balance_loss_mlp": 1.01929832, + "epoch": 0.7116188185780851, + "flos": 28622600190720.0, + "grad_norm": 1.4832382343509314, + "language_loss": 0.75895661, + "learning_rate": 8.10640557848848e-07, + "loss": 0.78025043, + "num_input_tokens_seen": 255354025, + "step": 11836, + "time_per_iteration": 2.796912670135498 + }, + { + "auxiliary_loss_clip": 0.01041174, + "auxiliary_loss_mlp": 0.01035442, + "balance_loss_clip": 1.03577304, + "balance_loss_mlp": 1.02302051, + "epoch": 0.7116789418307531, + "flos": 25293608634240.0, + "grad_norm": 1.738152097420041, + "language_loss": 0.69952178, + "learning_rate": 8.103274677346208e-07, + "loss": 0.72028792, + "num_input_tokens_seen": 255371400, + "step": 11837, + "time_per_iteration": 2.850287914276123 + }, + { + "auxiliary_loss_clip": 0.01104188, + "auxiliary_loss_mlp": 0.01038013, + "balance_loss_clip": 1.04023147, + "balance_loss_mlp": 1.02389264, + "epoch": 0.711739065083421, + "flos": 25557301353600.0, + "grad_norm": 1.8562944097025111, + "language_loss": 0.61769348, + "learning_rate": 8.100144227328958e-07, + "loss": 0.63911551, + "num_input_tokens_seen": 255390710, + "step": 11838, + "time_per_iteration": 2.6722800731658936 + }, + { + "auxiliary_loss_clip": 0.01103036, + "auxiliary_loss_mlp": 0.01032537, + "balance_loss_clip": 1.04213476, + "balance_loss_mlp": 1.02000856, + "epoch": 0.711799188336089, + "flos": 26140993361280.0, + "grad_norm": 2.198172519021995, + "language_loss": 0.67758644, + "learning_rate": 8.097014228555426e-07, + "loss": 0.69894218, + "num_input_tokens_seen": 255408790, + "step": 11839, + "time_per_iteration": 2.700693130493164 + }, + { + "auxiliary_loss_clip": 0.01113567, + "auxiliary_loss_mlp": 0.01032497, + "balance_loss_clip": 1.04118514, + "balance_loss_mlp": 1.02025414, + "epoch": 0.7118593115887569, + "flos": 21140648017920.0, + "grad_norm": 2.000863214598685, + "language_loss": 0.84081334, + "learning_rate": 8.093884681144305e-07, + "loss": 0.86227405, + "num_input_tokens_seen": 255426280, + "step": 11840, + "time_per_iteration": 2.6260175704956055 + }, + { + "auxiliary_loss_clip": 0.01089291, + "auxiliary_loss_mlp": 0.01032161, + "balance_loss_clip": 1.03793836, + "balance_loss_mlp": 1.01938784, + "epoch": 0.711919434841425, + "flos": 14975684006400.0, + "grad_norm": 1.8693362232508501, + "language_loss": 0.76592988, + "learning_rate": 8.090755585214277e-07, + "loss": 0.78714442, + "num_input_tokens_seen": 255442935, + "step": 11841, + "time_per_iteration": 2.7380130290985107 + }, + { + "auxiliary_loss_clip": 0.01097544, + "auxiliary_loss_mlp": 0.01031388, + "balance_loss_clip": 1.0421263, + "balance_loss_mlp": 1.01840663, + "epoch": 0.7119795580940929, + "flos": 16508997826560.0, + "grad_norm": 2.0423814070424546, + "language_loss": 0.75526315, + "learning_rate": 8.087626940883994e-07, + "loss": 0.77655244, + "num_input_tokens_seen": 255460925, + "step": 11842, + "time_per_iteration": 2.7132010459899902 + }, + { + "auxiliary_loss_clip": 0.01025805, + "auxiliary_loss_mlp": 0.01005384, + "balance_loss_clip": 1.01843143, + "balance_loss_mlp": 1.00406706, + "epoch": 0.7120396813467609, + "flos": 66570736055040.0, + "grad_norm": 0.7903612051800185, + "language_loss": 0.61607522, + "learning_rate": 8.084498748272082e-07, + "loss": 0.63638717, + "num_input_tokens_seen": 255521360, + "step": 11843, + "time_per_iteration": 3.199335813522339 + }, + { + "auxiliary_loss_clip": 0.01110982, + "auxiliary_loss_mlp": 0.01027269, + "balance_loss_clip": 1.04004669, + "balance_loss_mlp": 1.01506245, + "epoch": 0.7120998045994288, + "flos": 26432731624320.0, + "grad_norm": 2.817805590014094, + "language_loss": 0.80302823, + "learning_rate": 8.081371007497171e-07, + "loss": 0.82441074, + "num_input_tokens_seen": 255541435, + "step": 11844, + "time_per_iteration": 2.7244338989257812 + }, + { + "auxiliary_loss_clip": 0.010573, + "auxiliary_loss_mlp": 0.01034033, + "balance_loss_clip": 1.03133631, + "balance_loss_mlp": 1.02053213, + "epoch": 0.7121599278520968, + "flos": 16427982700800.0, + "grad_norm": 2.6971267365188565, + "language_loss": 0.79268605, + "learning_rate": 8.078243718677873e-07, + "loss": 0.81359935, + "num_input_tokens_seen": 255558505, + "step": 11845, + "time_per_iteration": 2.7217719554901123 + }, + { + "auxiliary_loss_clip": 0.01094755, + "auxiliary_loss_mlp": 0.0103426, + "balance_loss_clip": 1.03866315, + "balance_loss_mlp": 1.02122474, + "epoch": 0.7122200511047647, + "flos": 28949889939840.0, + "grad_norm": 2.005574335935101, + "language_loss": 0.77602625, + "learning_rate": 8.075116881932762e-07, + "loss": 0.79731637, + "num_input_tokens_seen": 255577815, + "step": 11846, + "time_per_iteration": 2.64569354057312 + }, + { + "auxiliary_loss_clip": 0.01101916, + "auxiliary_loss_mlp": 0.01033988, + "balance_loss_clip": 1.03883851, + "balance_loss_mlp": 1.0209887, + "epoch": 0.7122801743574327, + "flos": 16471866142080.0, + "grad_norm": 1.8418760265221825, + "language_loss": 0.58981413, + "learning_rate": 8.071990497380421e-07, + "loss": 0.61117315, + "num_input_tokens_seen": 255595885, + "step": 11847, + "time_per_iteration": 2.626909017562866 + }, + { + "auxiliary_loss_clip": 0.01095645, + "auxiliary_loss_mlp": 0.00769201, + "balance_loss_clip": 1.03844142, + "balance_loss_mlp": 1.00012553, + "epoch": 0.7123402976101008, + "flos": 20631039811200.0, + "grad_norm": 2.0944282784493353, + "language_loss": 0.71676862, + "learning_rate": 8.068864565139395e-07, + "loss": 0.73541707, + "num_input_tokens_seen": 255616750, + "step": 11848, + "time_per_iteration": 2.7139625549316406 + }, + { + "auxiliary_loss_clip": 0.01023376, + "auxiliary_loss_mlp": 0.01000864, + "balance_loss_clip": 1.00891995, + "balance_loss_mlp": 0.99969596, + "epoch": 0.7124004208627687, + "flos": 62325734837760.0, + "grad_norm": 0.8463803916761699, + "language_loss": 0.62977934, + "learning_rate": 8.065739085328211e-07, + "loss": 0.65002173, + "num_input_tokens_seen": 255677900, + "step": 11849, + "time_per_iteration": 3.1411380767822266 + }, + { + "auxiliary_loss_clip": 0.01083662, + "auxiliary_loss_mlp": 0.01037172, + "balance_loss_clip": 1.03620315, + "balance_loss_mlp": 1.0243752, + "epoch": 0.7124605441154367, + "flos": 39675975788160.0, + "grad_norm": 2.803971224135637, + "language_loss": 0.63841069, + "learning_rate": 8.0626140580654e-07, + "loss": 0.65961903, + "num_input_tokens_seen": 255699140, + "step": 11850, + "time_per_iteration": 2.923384428024292 + }, + { + "auxiliary_loss_clip": 0.01102405, + "auxiliary_loss_mlp": 0.01032425, + "balance_loss_clip": 1.03889465, + "balance_loss_mlp": 1.01953292, + "epoch": 0.7125206673681046, + "flos": 28181868312960.0, + "grad_norm": 1.6311022275087306, + "language_loss": 0.69985723, + "learning_rate": 8.05948948346946e-07, + "loss": 0.72120547, + "num_input_tokens_seen": 255719640, + "step": 11851, + "time_per_iteration": 2.7820382118225098 + }, + { + "auxiliary_loss_clip": 0.0110311, + "auxiliary_loss_mlp": 0.01034212, + "balance_loss_clip": 1.04154539, + "balance_loss_mlp": 1.02258956, + "epoch": 0.7125807906207726, + "flos": 26176939896960.0, + "grad_norm": 1.8696019158411576, + "language_loss": 0.83187509, + "learning_rate": 8.056365361658882e-07, + "loss": 0.8532483, + "num_input_tokens_seen": 255740450, + "step": 11852, + "time_per_iteration": 2.6444952487945557 + }, + { + "auxiliary_loss_clip": 0.01100225, + "auxiliary_loss_mlp": 0.00771762, + "balance_loss_clip": 1.03667736, + "balance_loss_mlp": 1.00029016, + "epoch": 0.7126409138734405, + "flos": 17157328358400.0, + "grad_norm": 2.349353252161211, + "language_loss": 0.73249555, + "learning_rate": 8.053241692752126e-07, + "loss": 0.75121546, + "num_input_tokens_seen": 255758070, + "step": 11853, + "time_per_iteration": 2.637211799621582 + }, + { + "auxiliary_loss_clip": 0.0107018, + "auxiliary_loss_mlp": 0.0103944, + "balance_loss_clip": 1.03552818, + "balance_loss_mlp": 1.02707744, + "epoch": 0.7127010371261085, + "flos": 18769933451520.0, + "grad_norm": 1.913315807088991, + "language_loss": 0.92358422, + "learning_rate": 8.050118476867635e-07, + "loss": 0.94468045, + "num_input_tokens_seen": 255775685, + "step": 11854, + "time_per_iteration": 2.7072691917419434 + }, + { + "auxiliary_loss_clip": 0.01098797, + "auxiliary_loss_mlp": 0.01033688, + "balance_loss_clip": 1.03843451, + "balance_loss_mlp": 1.02162969, + "epoch": 0.7127611603787765, + "flos": 20376433232640.0, + "grad_norm": 1.737694299359858, + "language_loss": 0.7940923, + "learning_rate": 8.046995714123856e-07, + "loss": 0.81541711, + "num_input_tokens_seen": 255794750, + "step": 11855, + "time_per_iteration": 2.6459240913391113 + }, + { + "auxiliary_loss_clip": 0.01062363, + "auxiliary_loss_mlp": 0.0103668, + "balance_loss_clip": 1.03427172, + "balance_loss_mlp": 1.02277398, + "epoch": 0.7128212836314445, + "flos": 20449008662400.0, + "grad_norm": 1.6847297518773263, + "language_loss": 0.72626299, + "learning_rate": 8.043873404639192e-07, + "loss": 0.74725342, + "num_input_tokens_seen": 255813325, + "step": 11856, + "time_per_iteration": 2.798802614212036 + }, + { + "auxiliary_loss_clip": 0.01105236, + "auxiliary_loss_mlp": 0.01030789, + "balance_loss_clip": 1.0418961, + "balance_loss_mlp": 1.01811683, + "epoch": 0.7128814068841124, + "flos": 23440834229760.0, + "grad_norm": 1.7617515399603183, + "language_loss": 0.70205921, + "learning_rate": 8.040751548532046e-07, + "loss": 0.72341949, + "num_input_tokens_seen": 255832470, + "step": 11857, + "time_per_iteration": 2.7193527221679688 + }, + { + "auxiliary_loss_clip": 0.01097533, + "auxiliary_loss_mlp": 0.01029351, + "balance_loss_clip": 1.03706014, + "balance_loss_mlp": 1.01644111, + "epoch": 0.7129415301367804, + "flos": 18222942165120.0, + "grad_norm": 2.6735250437319684, + "language_loss": 0.85148036, + "learning_rate": 8.03763014592081e-07, + "loss": 0.87274927, + "num_input_tokens_seen": 255849740, + "step": 11858, + "time_per_iteration": 2.640803813934326 + }, + { + "auxiliary_loss_clip": 0.01116792, + "auxiliary_loss_mlp": 0.01033094, + "balance_loss_clip": 1.04105759, + "balance_loss_mlp": 1.020136, + "epoch": 0.7130016533894483, + "flos": 15523896355200.0, + "grad_norm": 1.6211685141896377, + "language_loss": 0.80374736, + "learning_rate": 8.034509196923829e-07, + "loss": 0.82524627, + "num_input_tokens_seen": 255866975, + "step": 11859, + "time_per_iteration": 2.600557565689087 + }, + { + "auxiliary_loss_clip": 0.01088815, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.03991199, + "balance_loss_mlp": 1.01981378, + "epoch": 0.7130617766421163, + "flos": 57115668960000.0, + "grad_norm": 1.670734379003671, + "language_loss": 0.68986422, + "learning_rate": 8.031388701659456e-07, + "loss": 0.71107167, + "num_input_tokens_seen": 255892915, + "step": 11860, + "time_per_iteration": 3.0131988525390625 + }, + { + "auxiliary_loss_clip": 0.01101154, + "auxiliary_loss_mlp": 0.01031024, + "balance_loss_clip": 1.03928113, + "balance_loss_mlp": 1.01791143, + "epoch": 0.7131218998947844, + "flos": 19788252024960.0, + "grad_norm": 1.6914333481475103, + "language_loss": 0.64537835, + "learning_rate": 8.028268660246023e-07, + "loss": 0.66670012, + "num_input_tokens_seen": 255911480, + "step": 11861, + "time_per_iteration": 2.609196424484253 + }, + { + "auxiliary_loss_clip": 0.01095274, + "auxiliary_loss_mlp": 0.01033317, + "balance_loss_clip": 1.04040623, + "balance_loss_mlp": 1.01967335, + "epoch": 0.7131820231474523, + "flos": 26651894457600.0, + "grad_norm": 1.5298656478489163, + "language_loss": 0.66931856, + "learning_rate": 8.025149072801849e-07, + "loss": 0.69060439, + "num_input_tokens_seen": 255931140, + "step": 11862, + "time_per_iteration": 2.7272536754608154 + }, + { + "auxiliary_loss_clip": 0.01084067, + "auxiliary_loss_mlp": 0.01040707, + "balance_loss_clip": 1.03703427, + "balance_loss_mlp": 1.02913761, + "epoch": 0.7132421464001203, + "flos": 29205609840000.0, + "grad_norm": 1.958177792409317, + "language_loss": 0.66627884, + "learning_rate": 8.022029939445214e-07, + "loss": 0.68752658, + "num_input_tokens_seen": 255951665, + "step": 11863, + "time_per_iteration": 4.389364957809448 + }, + { + "auxiliary_loss_clip": 0.01071831, + "auxiliary_loss_mlp": 0.01047442, + "balance_loss_clip": 1.03993106, + "balance_loss_mlp": 1.03224277, + "epoch": 0.7133022696527882, + "flos": 23073611535360.0, + "grad_norm": 1.9615071733998684, + "language_loss": 0.65745306, + "learning_rate": 8.018911260294414e-07, + "loss": 0.67864573, + "num_input_tokens_seen": 255970055, + "step": 11864, + "time_per_iteration": 2.7246596813201904 + }, + { + "auxiliary_loss_clip": 0.01101997, + "auxiliary_loss_mlp": 0.01032961, + "balance_loss_clip": 1.03820133, + "balance_loss_mlp": 1.01960993, + "epoch": 0.7133623929054562, + "flos": 17457111267840.0, + "grad_norm": 1.8809252452747804, + "language_loss": 0.86299706, + "learning_rate": 8.015793035467697e-07, + "loss": 0.8843466, + "num_input_tokens_seen": 255987720, + "step": 11865, + "time_per_iteration": 4.186030149459839 + }, + { + "auxiliary_loss_clip": 0.01071299, + "auxiliary_loss_mlp": 0.01037479, + "balance_loss_clip": 1.0331533, + "balance_loss_mlp": 1.02338219, + "epoch": 0.7134225161581241, + "flos": 19536554448000.0, + "grad_norm": 4.424017900151165, + "language_loss": 0.75215453, + "learning_rate": 8.012675265083304e-07, + "loss": 0.77324229, + "num_input_tokens_seen": 256005490, + "step": 11866, + "time_per_iteration": 4.38300085067749 + }, + { + "auxiliary_loss_clip": 0.01075897, + "auxiliary_loss_mlp": 0.01035618, + "balance_loss_clip": 1.03858542, + "balance_loss_mlp": 1.02196276, + "epoch": 0.7134826394107922, + "flos": 26250089944320.0, + "grad_norm": 1.787273089908781, + "language_loss": 0.70222098, + "learning_rate": 8.009557949259464e-07, + "loss": 0.72333616, + "num_input_tokens_seen": 256026030, + "step": 11867, + "time_per_iteration": 2.7252299785614014 + }, + { + "auxiliary_loss_clip": 0.0109972, + "auxiliary_loss_mlp": 0.01030726, + "balance_loss_clip": 1.03978539, + "balance_loss_mlp": 1.01921654, + "epoch": 0.7135427626634601, + "flos": 15815311395840.0, + "grad_norm": 2.323638504940392, + "language_loss": 0.72056556, + "learning_rate": 8.006441088114397e-07, + "loss": 0.74186999, + "num_input_tokens_seen": 256043680, + "step": 11868, + "time_per_iteration": 2.6166346073150635 + }, + { + "auxiliary_loss_clip": 0.01063174, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.03661656, + "balance_loss_mlp": 1.02014136, + "epoch": 0.7136028859161281, + "flos": 18223409041920.0, + "grad_norm": 2.386444069797043, + "language_loss": 0.66029108, + "learning_rate": 8.003324681766286e-07, + "loss": 0.68127489, + "num_input_tokens_seen": 256059705, + "step": 11869, + "time_per_iteration": 2.6557157039642334 + }, + { + "auxiliary_loss_clip": 0.01086038, + "auxiliary_loss_mlp": 0.01027932, + "balance_loss_clip": 1.03453624, + "balance_loss_mlp": 1.01540327, + "epoch": 0.713663009168796, + "flos": 24314827956480.0, + "grad_norm": 1.5122287108134371, + "language_loss": 0.77901238, + "learning_rate": 8.000208730333298e-07, + "loss": 0.80015206, + "num_input_tokens_seen": 256079785, + "step": 11870, + "time_per_iteration": 2.767284870147705 + }, + { + "auxiliary_loss_clip": 0.01062535, + "auxiliary_loss_mlp": 0.01035444, + "balance_loss_clip": 1.03716147, + "balance_loss_mlp": 1.02176499, + "epoch": 0.713723132421464, + "flos": 26538488242560.0, + "grad_norm": 1.7572988726243002, + "language_loss": 0.81102479, + "learning_rate": 7.997093233933597e-07, + "loss": 0.83200461, + "num_input_tokens_seen": 256099000, + "step": 11871, + "time_per_iteration": 2.7799062728881836 + }, + { + "auxiliary_loss_clip": 0.01081304, + "auxiliary_loss_mlp": 0.01037741, + "balance_loss_clip": 1.03814363, + "balance_loss_mlp": 1.02452111, + "epoch": 0.7137832556741319, + "flos": 19865675790720.0, + "grad_norm": 1.5739267518019031, + "language_loss": 0.78791887, + "learning_rate": 7.993978192685331e-07, + "loss": 0.80910927, + "num_input_tokens_seen": 256117985, + "step": 11872, + "time_per_iteration": 4.27405309677124 + }, + { + "auxiliary_loss_clip": 0.01104458, + "auxiliary_loss_mlp": 0.01029654, + "balance_loss_clip": 1.04009414, + "balance_loss_mlp": 1.0162369, + "epoch": 0.7138433789267999, + "flos": 21688932193920.0, + "grad_norm": 2.3871550053143893, + "language_loss": 0.84496498, + "learning_rate": 7.990863606706606e-07, + "loss": 0.86630619, + "num_input_tokens_seen": 256134350, + "step": 11873, + "time_per_iteration": 2.6260197162628174 + }, + { + "auxiliary_loss_clip": 0.01073276, + "auxiliary_loss_mlp": 0.01032331, + "balance_loss_clip": 1.03462076, + "balance_loss_mlp": 1.02040982, + "epoch": 0.713903502179468, + "flos": 17602729004160.0, + "grad_norm": 2.5229231415013116, + "language_loss": 0.86355793, + "learning_rate": 7.987749476115539e-07, + "loss": 0.88461399, + "num_input_tokens_seen": 256150610, + "step": 11874, + "time_per_iteration": 2.680554151535034 + }, + { + "auxiliary_loss_clip": 0.01103576, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_clip": 1.0389179, + "balance_loss_mlp": 1.01873553, + "epoch": 0.7139636254321359, + "flos": 18040336398720.0, + "grad_norm": 1.760053080674637, + "language_loss": 0.8337326, + "learning_rate": 7.984635801030228e-07, + "loss": 0.85508358, + "num_input_tokens_seen": 256168620, + "step": 11875, + "time_per_iteration": 2.597926616668701 + }, + { + "auxiliary_loss_clip": 0.01091056, + "auxiliary_loss_mlp": 0.01038348, + "balance_loss_clip": 1.03766298, + "balance_loss_mlp": 1.02233863, + "epoch": 0.7140237486848039, + "flos": 23331127115520.0, + "grad_norm": 1.7238625463047035, + "language_loss": 0.69539726, + "learning_rate": 7.981522581568721e-07, + "loss": 0.71669132, + "num_input_tokens_seen": 256186700, + "step": 11876, + "time_per_iteration": 2.7075090408325195 + }, + { + "auxiliary_loss_clip": 0.01115515, + "auxiliary_loss_mlp": 0.01036257, + "balance_loss_clip": 1.04096556, + "balance_loss_mlp": 1.02292919, + "epoch": 0.7140838719374718, + "flos": 16837077674880.0, + "grad_norm": 1.7495986259479304, + "language_loss": 0.78027952, + "learning_rate": 7.978409817849079e-07, + "loss": 0.80179715, + "num_input_tokens_seen": 256205390, + "step": 11877, + "time_per_iteration": 2.579984188079834 + }, + { + "auxiliary_loss_clip": 0.01100542, + "auxiliary_loss_mlp": 0.01039441, + "balance_loss_clip": 1.03897512, + "balance_loss_mlp": 1.02755046, + "epoch": 0.7141439951901398, + "flos": 21142012734720.0, + "grad_norm": 2.00893168794746, + "language_loss": 0.69702816, + "learning_rate": 7.97529750998934e-07, + "loss": 0.71842802, + "num_input_tokens_seen": 256224575, + "step": 11878, + "time_per_iteration": 2.7117369174957275 + }, + { + "auxiliary_loss_clip": 0.01075067, + "auxiliary_loss_mlp": 0.01035543, + "balance_loss_clip": 1.03836131, + "balance_loss_mlp": 1.024194, + "epoch": 0.7142041184428077, + "flos": 24717709877760.0, + "grad_norm": 1.9345471164629369, + "language_loss": 0.67564619, + "learning_rate": 7.972185658107535e-07, + "loss": 0.69675231, + "num_input_tokens_seen": 256242130, + "step": 11879, + "time_per_iteration": 2.781487226486206 + }, + { + "auxiliary_loss_clip": 0.01052886, + "auxiliary_loss_mlp": 0.01039936, + "balance_loss_clip": 1.03587782, + "balance_loss_mlp": 1.02522612, + "epoch": 0.7142642416954758, + "flos": 21908202768000.0, + "grad_norm": 2.4025708755379136, + "language_loss": 0.68949473, + "learning_rate": 7.969074262321646e-07, + "loss": 0.71042299, + "num_input_tokens_seen": 256261920, + "step": 11880, + "time_per_iteration": 2.7956559658050537 + }, + { + "auxiliary_loss_clip": 0.01085326, + "auxiliary_loss_mlp": 0.01038627, + "balance_loss_clip": 1.03614664, + "balance_loss_mlp": 1.02517402, + "epoch": 0.7143243649481437, + "flos": 20805636844800.0, + "grad_norm": 2.7211845383040263, + "language_loss": 0.80758023, + "learning_rate": 7.965963322749674e-07, + "loss": 0.82881975, + "num_input_tokens_seen": 256277970, + "step": 11881, + "time_per_iteration": 2.7760164737701416 + }, + { + "auxiliary_loss_clip": 0.01069489, + "auxiliary_loss_mlp": 0.01031682, + "balance_loss_clip": 1.03435218, + "balance_loss_mlp": 1.01974893, + "epoch": 0.7143844882008117, + "flos": 27235011847680.0, + "grad_norm": 1.9142544481843773, + "language_loss": 0.63496864, + "learning_rate": 7.962852839509579e-07, + "loss": 0.65598035, + "num_input_tokens_seen": 256298205, + "step": 11882, + "time_per_iteration": 2.8055615425109863 + }, + { + "auxiliary_loss_clip": 0.01115484, + "auxiliary_loss_mlp": 0.01033467, + "balance_loss_clip": 1.0405947, + "balance_loss_mlp": 1.02086067, + "epoch": 0.7144446114534796, + "flos": 17929623703680.0, + "grad_norm": 1.6563668139876793, + "language_loss": 0.68799591, + "learning_rate": 7.959742812719304e-07, + "loss": 0.70948541, + "num_input_tokens_seen": 256316685, + "step": 11883, + "time_per_iteration": 2.6891119480133057 + }, + { + "auxiliary_loss_clip": 0.0110208, + "auxiliary_loss_mlp": 0.01037358, + "balance_loss_clip": 1.04018784, + "balance_loss_mlp": 1.02402401, + "epoch": 0.7145047347061476, + "flos": 20740962407040.0, + "grad_norm": 1.7218148321096673, + "language_loss": 0.77569342, + "learning_rate": 7.956633242496788e-07, + "loss": 0.79708779, + "num_input_tokens_seen": 256334205, + "step": 11884, + "time_per_iteration": 2.6530849933624268 + }, + { + "auxiliary_loss_clip": 0.01107156, + "auxiliary_loss_mlp": 0.01036925, + "balance_loss_clip": 1.0385685, + "balance_loss_mlp": 1.02221453, + "epoch": 0.7145648579588155, + "flos": 21178605715200.0, + "grad_norm": 4.109766479944614, + "language_loss": 0.73748314, + "learning_rate": 7.953524128959954e-07, + "loss": 0.75892401, + "num_input_tokens_seen": 256353340, + "step": 11885, + "time_per_iteration": 2.8627066612243652 + }, + { + "auxiliary_loss_clip": 0.01014823, + "auxiliary_loss_mlp": 0.00999083, + "balance_loss_clip": 1.01118517, + "balance_loss_mlp": 0.9980278, + "epoch": 0.7146249812114835, + "flos": 64784539509120.0, + "grad_norm": 0.8971094917641942, + "language_loss": 0.66321898, + "learning_rate": 7.95041547222669e-07, + "loss": 0.68335795, + "num_input_tokens_seen": 256411550, + "step": 11886, + "time_per_iteration": 3.2624523639678955 + }, + { + "auxiliary_loss_clip": 0.01068235, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.03730834, + "balance_loss_mlp": 1.01627326, + "epoch": 0.7146851044641516, + "flos": 18113881495680.0, + "grad_norm": 1.637061044438449, + "language_loss": 0.74940675, + "learning_rate": 7.947307272414874e-07, + "loss": 0.77038538, + "num_input_tokens_seen": 256430360, + "step": 11887, + "time_per_iteration": 2.951922655105591 + }, + { + "auxiliary_loss_clip": 0.0110054, + "auxiliary_loss_mlp": 0.01027491, + "balance_loss_clip": 1.03856289, + "balance_loss_mlp": 1.01542068, + "epoch": 0.7147452277168195, + "flos": 19243846517760.0, + "grad_norm": 1.834582654468692, + "language_loss": 0.71475005, + "learning_rate": 7.944199529642372e-07, + "loss": 0.73603028, + "num_input_tokens_seen": 256449750, + "step": 11888, + "time_per_iteration": 2.7142348289489746 + }, + { + "auxiliary_loss_clip": 0.01097744, + "auxiliary_loss_mlp": 0.0103845, + "balance_loss_clip": 1.03603697, + "balance_loss_mlp": 1.02444923, + "epoch": 0.7148053509694875, + "flos": 23764712186880.0, + "grad_norm": 1.9131125822464334, + "language_loss": 0.84173727, + "learning_rate": 7.941092244027041e-07, + "loss": 0.86309922, + "num_input_tokens_seen": 256467330, + "step": 11889, + "time_per_iteration": 2.7939958572387695 + }, + { + "auxiliary_loss_clip": 0.01066177, + "auxiliary_loss_mlp": 0.01028337, + "balance_loss_clip": 1.04017806, + "balance_loss_mlp": 1.01598644, + "epoch": 0.7148654742221554, + "flos": 22485322586880.0, + "grad_norm": 1.7213621841277236, + "language_loss": 0.76025808, + "learning_rate": 7.937985415686695e-07, + "loss": 0.78120321, + "num_input_tokens_seen": 256485705, + "step": 11890, + "time_per_iteration": 2.909778594970703 + }, + { + "auxiliary_loss_clip": 0.0106853, + "auxiliary_loss_mlp": 0.01036534, + "balance_loss_clip": 1.03322911, + "balance_loss_mlp": 1.0240227, + "epoch": 0.7149255974748234, + "flos": 24679213476480.0, + "grad_norm": 1.510956653160521, + "language_loss": 0.74061215, + "learning_rate": 7.934879044739147e-07, + "loss": 0.76166284, + "num_input_tokens_seen": 256504755, + "step": 11891, + "time_per_iteration": 2.870742082595825 + }, + { + "auxiliary_loss_clip": 0.01069165, + "auxiliary_loss_mlp": 0.01036831, + "balance_loss_clip": 1.03776526, + "balance_loss_mlp": 1.0234617, + "epoch": 0.7149857207274913, + "flos": 18405583845120.0, + "grad_norm": 2.1855656268859565, + "language_loss": 0.67586207, + "learning_rate": 7.931773131302211e-07, + "loss": 0.69692206, + "num_input_tokens_seen": 256523670, + "step": 11892, + "time_per_iteration": 2.879074811935425 + }, + { + "auxiliary_loss_clip": 0.01078901, + "auxiliary_loss_mlp": 0.01034356, + "balance_loss_clip": 1.03972173, + "balance_loss_mlp": 1.02015805, + "epoch": 0.7150458439801594, + "flos": 24969515195520.0, + "grad_norm": 1.7990304927260297, + "language_loss": 0.737535, + "learning_rate": 7.928667675493632e-07, + "loss": 0.75866759, + "num_input_tokens_seen": 256542225, + "step": 11893, + "time_per_iteration": 2.797793388366699 + }, + { + "auxiliary_loss_clip": 0.01118028, + "auxiliary_loss_mlp": 0.01031131, + "balance_loss_clip": 1.04243374, + "balance_loss_mlp": 1.01739264, + "epoch": 0.7151059672328273, + "flos": 16690777580160.0, + "grad_norm": 2.922265419299714, + "language_loss": 0.67378318, + "learning_rate": 7.925562677431185e-07, + "loss": 0.69527477, + "num_input_tokens_seen": 256560730, + "step": 11894, + "time_per_iteration": 2.6411194801330566 + }, + { + "auxiliary_loss_clip": 0.01079135, + "auxiliary_loss_mlp": 0.01032012, + "balance_loss_clip": 1.04023933, + "balance_loss_mlp": 1.01957238, + "epoch": 0.7151660904854953, + "flos": 27271820309760.0, + "grad_norm": 1.6674046722753406, + "language_loss": 0.77498591, + "learning_rate": 7.922458137232613e-07, + "loss": 0.7960974, + "num_input_tokens_seen": 256580505, + "step": 11895, + "time_per_iteration": 2.9311444759368896 + }, + { + "auxiliary_loss_clip": 0.01102223, + "auxiliary_loss_mlp": 0.01031828, + "balance_loss_clip": 1.03921759, + "balance_loss_mlp": 1.0176903, + "epoch": 0.7152262137381632, + "flos": 18332254229760.0, + "grad_norm": 1.8566798780150704, + "language_loss": 0.69233418, + "learning_rate": 7.919354055015643e-07, + "loss": 0.71367466, + "num_input_tokens_seen": 256597330, + "step": 11896, + "time_per_iteration": 2.708909034729004 + }, + { + "auxiliary_loss_clip": 0.010908, + "auxiliary_loss_mlp": 0.01041603, + "balance_loss_clip": 1.03788733, + "balance_loss_mlp": 1.02761424, + "epoch": 0.7152863369908312, + "flos": 21799285752960.0, + "grad_norm": 2.0196702259188952, + "language_loss": 0.86874604, + "learning_rate": 7.91625043089798e-07, + "loss": 0.89007008, + "num_input_tokens_seen": 256616030, + "step": 11897, + "time_per_iteration": 2.8452200889587402 + }, + { + "auxiliary_loss_clip": 0.01091656, + "auxiliary_loss_mlp": 0.01035463, + "balance_loss_clip": 1.03988373, + "balance_loss_mlp": 1.0220046, + "epoch": 0.7153464602434991, + "flos": 22158427887360.0, + "grad_norm": 3.4189922155736965, + "language_loss": 0.7799052, + "learning_rate": 7.913147264997304e-07, + "loss": 0.80117643, + "num_input_tokens_seen": 256635570, + "step": 11898, + "time_per_iteration": 2.73362398147583 + }, + { + "auxiliary_loss_clip": 0.01089871, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.03692102, + "balance_loss_mlp": 1.01879835, + "epoch": 0.7154065834961671, + "flos": 24716057852160.0, + "grad_norm": 2.2668196785220895, + "language_loss": 0.73072803, + "learning_rate": 7.910044557431302e-07, + "loss": 0.7519573, + "num_input_tokens_seen": 256655290, + "step": 11899, + "time_per_iteration": 2.7390663623809814 + }, + { + "auxiliary_loss_clip": 0.01101493, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.03773189, + "balance_loss_mlp": 1.02130437, + "epoch": 0.7154667067488351, + "flos": 22601494149120.0, + "grad_norm": 5.969579255187867, + "language_loss": 0.75829309, + "learning_rate": 7.906942308317614e-07, + "loss": 0.77965403, + "num_input_tokens_seen": 256671605, + "step": 11900, + "time_per_iteration": 2.6649601459503174 + }, + { + "auxiliary_loss_clip": 0.01103632, + "auxiliary_loss_mlp": 0.01030943, + "balance_loss_clip": 1.04124916, + "balance_loss_mlp": 1.01839614, + "epoch": 0.7155268300015031, + "flos": 18771154513920.0, + "grad_norm": 1.8695849033514778, + "language_loss": 0.80723226, + "learning_rate": 7.903840517773886e-07, + "loss": 0.828578, + "num_input_tokens_seen": 256689680, + "step": 11901, + "time_per_iteration": 2.7060022354125977 + }, + { + "auxiliary_loss_clip": 0.01080211, + "auxiliary_loss_mlp": 0.01038068, + "balance_loss_clip": 1.03678465, + "balance_loss_mlp": 1.02424598, + "epoch": 0.7155869532541711, + "flos": 18296343607680.0, + "grad_norm": 1.8343268513832525, + "language_loss": 0.81889194, + "learning_rate": 7.900739185917744e-07, + "loss": 0.84007472, + "num_input_tokens_seen": 256707760, + "step": 11902, + "time_per_iteration": 2.7816693782806396 + }, + { + "auxiliary_loss_clip": 0.01069017, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.03530717, + "balance_loss_mlp": 1.01750159, + "epoch": 0.715647076506839, + "flos": 11980805783040.0, + "grad_norm": 1.7267279020747466, + "language_loss": 0.68092871, + "learning_rate": 7.897638312866785e-07, + "loss": 0.70191914, + "num_input_tokens_seen": 256724150, + "step": 11903, + "time_per_iteration": 4.382705926895142 + }, + { + "auxiliary_loss_clip": 0.0106915, + "auxiliary_loss_mlp": 0.01031243, + "balance_loss_clip": 1.03447473, + "balance_loss_mlp": 1.01918483, + "epoch": 0.715707199759507, + "flos": 18951641377920.0, + "grad_norm": 5.343255365048286, + "language_loss": 0.75641096, + "learning_rate": 7.894537898738589e-07, + "loss": 0.77741492, + "num_input_tokens_seen": 256742780, + "step": 11904, + "time_per_iteration": 4.288340330123901 + }, + { + "auxiliary_loss_clip": 0.01091072, + "auxiliary_loss_mlp": 0.01039419, + "balance_loss_clip": 1.03938174, + "balance_loss_mlp": 1.02566779, + "epoch": 0.7157673230121749, + "flos": 15304410299520.0, + "grad_norm": 2.088773074445301, + "language_loss": 0.72025734, + "learning_rate": 7.891437943650727e-07, + "loss": 0.74156225, + "num_input_tokens_seen": 256761355, + "step": 11905, + "time_per_iteration": 4.343631267547607 + }, + { + "auxiliary_loss_clip": 0.01077244, + "auxiliary_loss_mlp": 0.0103248, + "balance_loss_clip": 1.03842819, + "balance_loss_mlp": 1.02001703, + "epoch": 0.715827446264843, + "flos": 23221850964480.0, + "grad_norm": 1.657779728748099, + "language_loss": 0.779338, + "learning_rate": 7.88833844772076e-07, + "loss": 0.8004353, + "num_input_tokens_seen": 256781335, + "step": 11906, + "time_per_iteration": 2.8104159832000732 + }, + { + "auxiliary_loss_clip": 0.01014211, + "auxiliary_loss_mlp": 0.0099711, + "balance_loss_clip": 1.01162815, + "balance_loss_mlp": 0.99602473, + "epoch": 0.7158875695175109, + "flos": 60975421833600.0, + "grad_norm": 0.7366961855147857, + "language_loss": 0.55325353, + "learning_rate": 7.885239411066205e-07, + "loss": 0.57336664, + "num_input_tokens_seen": 256838890, + "step": 11907, + "time_per_iteration": 3.1521129608154297 + }, + { + "auxiliary_loss_clip": 0.01094066, + "auxiliary_loss_mlp": 0.01039845, + "balance_loss_clip": 1.03540492, + "balance_loss_mlp": 1.02677381, + "epoch": 0.7159476927701789, + "flos": 17128780024320.0, + "grad_norm": 1.89939740443007, + "language_loss": 0.69593656, + "learning_rate": 7.882140833804593e-07, + "loss": 0.71727568, + "num_input_tokens_seen": 256858145, + "step": 11908, + "time_per_iteration": 2.6724538803100586 + }, + { + "auxiliary_loss_clip": 0.01059783, + "auxiliary_loss_mlp": 0.01036603, + "balance_loss_clip": 1.03303337, + "balance_loss_mlp": 1.02254832, + "epoch": 0.7160078160228468, + "flos": 22490601886080.0, + "grad_norm": 1.6751841094057447, + "language_loss": 0.71237969, + "learning_rate": 7.879042716053415e-07, + "loss": 0.7333436, + "num_input_tokens_seen": 256878545, + "step": 11909, + "time_per_iteration": 2.779273509979248 + }, + { + "auxiliary_loss_clip": 0.01099917, + "auxiliary_loss_mlp": 0.01030028, + "balance_loss_clip": 1.0387938, + "balance_loss_mlp": 1.01755881, + "epoch": 0.7160679392755148, + "flos": 30590935626240.0, + "grad_norm": 1.4959151522362304, + "language_loss": 0.75010902, + "learning_rate": 7.875945057930144e-07, + "loss": 0.7714085, + "num_input_tokens_seen": 256899920, + "step": 11910, + "time_per_iteration": 2.7424912452697754 + }, + { + "auxiliary_loss_clip": 0.01085268, + "auxiliary_loss_mlp": 0.01034213, + "balance_loss_clip": 1.0382638, + "balance_loss_mlp": 1.02263737, + "epoch": 0.7161280625281827, + "flos": 21323648833920.0, + "grad_norm": 1.5302691845486787, + "language_loss": 0.76587963, + "learning_rate": 7.872847859552251e-07, + "loss": 0.78707445, + "num_input_tokens_seen": 256918460, + "step": 11911, + "time_per_iteration": 4.259274244308472 + }, + { + "auxiliary_loss_clip": 0.01069944, + "auxiliary_loss_mlp": 0.01043229, + "balance_loss_clip": 1.03755224, + "balance_loss_mlp": 1.02831018, + "epoch": 0.7161881857808508, + "flos": 61860078921600.0, + "grad_norm": 1.649161828071685, + "language_loss": 0.58413363, + "learning_rate": 7.869751121037192e-07, + "loss": 0.60526532, + "num_input_tokens_seen": 256942015, + "step": 11912, + "time_per_iteration": 3.1699318885803223 + }, + { + "auxiliary_loss_clip": 0.01101612, + "auxiliary_loss_mlp": 0.01031486, + "balance_loss_clip": 1.04070008, + "balance_loss_mlp": 1.01849806, + "epoch": 0.7162483090335187, + "flos": 20812101292800.0, + "grad_norm": 1.859500164824888, + "language_loss": 0.7810173, + "learning_rate": 7.866654842502376e-07, + "loss": 0.80234826, + "num_input_tokens_seen": 256961065, + "step": 11913, + "time_per_iteration": 2.704882860183716 + }, + { + "auxiliary_loss_clip": 0.01087765, + "auxiliary_loss_mlp": 0.0102754, + "balance_loss_clip": 1.03807175, + "balance_loss_mlp": 1.01646566, + "epoch": 0.7163084322861867, + "flos": 24097532630400.0, + "grad_norm": 1.6076637682641197, + "language_loss": 0.74075729, + "learning_rate": 7.863559024065234e-07, + "loss": 0.76191038, + "num_input_tokens_seen": 256982165, + "step": 11914, + "time_per_iteration": 2.7636988162994385 + }, + { + "auxiliary_loss_clip": 0.01075409, + "auxiliary_loss_mlp": 0.01033103, + "balance_loss_clip": 1.036044, + "balance_loss_mlp": 1.02074111, + "epoch": 0.7163685555388547, + "flos": 20080888128000.0, + "grad_norm": 1.6922973692533387, + "language_loss": 0.74138194, + "learning_rate": 7.860463665843143e-07, + "loss": 0.76246703, + "num_input_tokens_seen": 256999825, + "step": 11915, + "time_per_iteration": 2.816134452819824 + }, + { + "auxiliary_loss_clip": 0.01111475, + "auxiliary_loss_mlp": 0.01032503, + "balance_loss_clip": 1.0383029, + "balance_loss_mlp": 1.02015853, + "epoch": 0.7164286787915226, + "flos": 17456967613440.0, + "grad_norm": 2.8016306362793353, + "language_loss": 0.80886412, + "learning_rate": 7.85736876795349e-07, + "loss": 0.83030391, + "num_input_tokens_seen": 257017450, + "step": 11916, + "time_per_iteration": 2.666930675506592 + }, + { + "auxiliary_loss_clip": 0.01033862, + "auxiliary_loss_mlp": 0.01034435, + "balance_loss_clip": 1.03228307, + "balance_loss_mlp": 1.0218699, + "epoch": 0.7164888020441906, + "flos": 19718908819200.0, + "grad_norm": 1.9058816458994292, + "language_loss": 0.6875428, + "learning_rate": 7.854274330513626e-07, + "loss": 0.70822579, + "num_input_tokens_seen": 257035465, + "step": 11917, + "time_per_iteration": 3.0043599605560303 + }, + { + "auxiliary_loss_clip": 0.0108964, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.03826666, + "balance_loss_mlp": 1.0224905, + "epoch": 0.7165489252968585, + "flos": 21470523546240.0, + "grad_norm": 2.1418903876984614, + "language_loss": 0.75930321, + "learning_rate": 7.851180353640896e-07, + "loss": 0.78055626, + "num_input_tokens_seen": 257053750, + "step": 11918, + "time_per_iteration": 2.8666863441467285 + }, + { + "auxiliary_loss_clip": 0.01012914, + "auxiliary_loss_mlp": 0.0100742, + "balance_loss_clip": 1.00994635, + "balance_loss_mlp": 1.00643027, + "epoch": 0.7166090485495266, + "flos": 69928060464000.0, + "grad_norm": 0.6290817017745445, + "language_loss": 0.53839982, + "learning_rate": 7.848086837452639e-07, + "loss": 0.55860317, + "num_input_tokens_seen": 257121215, + "step": 11919, + "time_per_iteration": 3.3189728260040283 + }, + { + "auxiliary_loss_clip": 0.01090721, + "auxiliary_loss_mlp": 0.0103132, + "balance_loss_clip": 1.03968215, + "balance_loss_mlp": 1.01944053, + "epoch": 0.7166691718021945, + "flos": 27343892949120.0, + "grad_norm": 2.4558245905246188, + "language_loss": 0.68792629, + "learning_rate": 7.844993782066132e-07, + "loss": 0.70914674, + "num_input_tokens_seen": 257143370, + "step": 11920, + "time_per_iteration": 2.7760236263275146 + }, + { + "auxiliary_loss_clip": 0.01093244, + "auxiliary_loss_mlp": 0.01042352, + "balance_loss_clip": 1.03837049, + "balance_loss_mlp": 1.02936387, + "epoch": 0.7167292950548625, + "flos": 30408868563840.0, + "grad_norm": 1.7838996304195904, + "language_loss": 0.75269383, + "learning_rate": 7.841901187598678e-07, + "loss": 0.77404976, + "num_input_tokens_seen": 257162160, + "step": 11921, + "time_per_iteration": 2.775209426879883 + }, + { + "auxiliary_loss_clip": 0.01081729, + "auxiliary_loss_mlp": 0.01036086, + "balance_loss_clip": 1.04076838, + "balance_loss_mlp": 1.02052867, + "epoch": 0.7167894183075304, + "flos": 14571257800320.0, + "grad_norm": 2.2701090477680546, + "language_loss": 0.75837505, + "learning_rate": 7.83880905416755e-07, + "loss": 0.77955317, + "num_input_tokens_seen": 257179300, + "step": 11922, + "time_per_iteration": 2.7607452869415283 + }, + { + "auxiliary_loss_clip": 0.01014406, + "auxiliary_loss_mlp": 0.01014898, + "balance_loss_clip": 1.00970268, + "balance_loss_mlp": 1.01383746, + "epoch": 0.7168495415601984, + "flos": 64110674407680.0, + "grad_norm": 0.7523286809102585, + "language_loss": 0.55089313, + "learning_rate": 7.83571738189001e-07, + "loss": 0.57118618, + "num_input_tokens_seen": 257235470, + "step": 11923, + "time_per_iteration": 3.0676429271698 + }, + { + "auxiliary_loss_clip": 0.01080014, + "auxiliary_loss_mlp": 0.01037915, + "balance_loss_clip": 1.03623641, + "balance_loss_mlp": 1.024611, + "epoch": 0.7169096648128663, + "flos": 24681440119680.0, + "grad_norm": 1.4525334689031153, + "language_loss": 0.7698282, + "learning_rate": 7.832626170883279e-07, + "loss": 0.79100752, + "num_input_tokens_seen": 257255850, + "step": 11924, + "time_per_iteration": 2.823679208755493 + }, + { + "auxiliary_loss_clip": 0.01078337, + "auxiliary_loss_mlp": 0.01034494, + "balance_loss_clip": 1.0381155, + "balance_loss_mlp": 1.02288294, + "epoch": 0.7169697880655344, + "flos": 20667525050880.0, + "grad_norm": 1.7352538037253364, + "language_loss": 0.68109524, + "learning_rate": 7.829535421264588e-07, + "loss": 0.70222354, + "num_input_tokens_seen": 257275425, + "step": 11925, + "time_per_iteration": 2.7586591243743896 + }, + { + "auxiliary_loss_clip": 0.01080533, + "auxiliary_loss_mlp": 0.01032844, + "balance_loss_clip": 1.03722239, + "balance_loss_mlp": 1.02085745, + "epoch": 0.7170299113182023, + "flos": 21032700670080.0, + "grad_norm": 1.565689357795704, + "language_loss": 0.77380347, + "learning_rate": 7.826445133151133e-07, + "loss": 0.79493719, + "num_input_tokens_seen": 257295740, + "step": 11926, + "time_per_iteration": 2.777597188949585 + }, + { + "auxiliary_loss_clip": 0.01099959, + "auxiliary_loss_mlp": 0.00771085, + "balance_loss_clip": 1.03791356, + "balance_loss_mlp": 1.00019264, + "epoch": 0.7170900345708703, + "flos": 22893304239360.0, + "grad_norm": 1.9891447928832446, + "language_loss": 0.77106082, + "learning_rate": 7.823355306660093e-07, + "loss": 0.78977132, + "num_input_tokens_seen": 257315970, + "step": 11927, + "time_per_iteration": 2.722008228302002 + }, + { + "auxiliary_loss_clip": 0.01103176, + "auxiliary_loss_mlp": 0.01032942, + "balance_loss_clip": 1.04161656, + "balance_loss_mlp": 1.01948345, + "epoch": 0.7171501578235383, + "flos": 15518688883200.0, + "grad_norm": 1.5109458575320354, + "language_loss": 0.69240952, + "learning_rate": 7.820265941908642e-07, + "loss": 0.71377075, + "num_input_tokens_seen": 257334230, + "step": 11928, + "time_per_iteration": 2.685173511505127 + }, + { + "auxiliary_loss_clip": 0.01063212, + "auxiliary_loss_mlp": 0.01033233, + "balance_loss_clip": 1.03615737, + "balance_loss_mlp": 1.02093053, + "epoch": 0.7172102810762062, + "flos": 26104292640000.0, + "grad_norm": 1.8437632186543573, + "language_loss": 0.64895999, + "learning_rate": 7.817177039013931e-07, + "loss": 0.66992444, + "num_input_tokens_seen": 257352145, + "step": 11929, + "time_per_iteration": 2.811458110809326 + }, + { + "auxiliary_loss_clip": 0.01084354, + "auxiliary_loss_mlp": 0.0103302, + "balance_loss_clip": 1.03474772, + "balance_loss_mlp": 1.0201571, + "epoch": 0.7172704043288742, + "flos": 21506649649920.0, + "grad_norm": 4.025535473729134, + "language_loss": 0.70036447, + "learning_rate": 7.81408859809308e-07, + "loss": 0.72153819, + "num_input_tokens_seen": 257371460, + "step": 11930, + "time_per_iteration": 2.7018861770629883 + }, + { + "auxiliary_loss_clip": 0.01073615, + "auxiliary_loss_mlp": 0.01032596, + "balance_loss_clip": 1.03261399, + "balance_loss_mlp": 1.01994824, + "epoch": 0.7173305275815421, + "flos": 18770939032320.0, + "grad_norm": 1.865130875534894, + "language_loss": 0.80753005, + "learning_rate": 7.811000619263219e-07, + "loss": 0.82859218, + "num_input_tokens_seen": 257390800, + "step": 11931, + "time_per_iteration": 2.814512252807617 + }, + { + "auxiliary_loss_clip": 0.01099893, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.03860784, + "balance_loss_mlp": 1.02030206, + "epoch": 0.7173906508342102, + "flos": 16179876483840.0, + "grad_norm": 2.1237811167102967, + "language_loss": 0.77989686, + "learning_rate": 7.80791310264143e-07, + "loss": 0.80121714, + "num_input_tokens_seen": 257407495, + "step": 11932, + "time_per_iteration": 2.643590211868286 + }, + { + "auxiliary_loss_clip": 0.01094325, + "auxiliary_loss_mlp": 0.01031086, + "balance_loss_clip": 1.03725207, + "balance_loss_mlp": 1.01856303, + "epoch": 0.7174507740868781, + "flos": 26613864933120.0, + "grad_norm": 1.4329540611911684, + "language_loss": 0.75208265, + "learning_rate": 7.804826048344803e-07, + "loss": 0.77333677, + "num_input_tokens_seen": 257429675, + "step": 11933, + "time_per_iteration": 2.73256254196167 + }, + { + "auxiliary_loss_clip": 0.01118631, + "auxiliary_loss_mlp": 0.01038608, + "balance_loss_clip": 1.04044771, + "balance_loss_mlp": 1.02359951, + "epoch": 0.7175108973395461, + "flos": 18432911116800.0, + "grad_norm": 2.5273912434143537, + "language_loss": 0.69165599, + "learning_rate": 7.801739456490388e-07, + "loss": 0.71322834, + "num_input_tokens_seen": 257442765, + "step": 11934, + "time_per_iteration": 2.63053822517395 + }, + { + "auxiliary_loss_clip": 0.01101966, + "auxiliary_loss_mlp": 0.01034522, + "balance_loss_clip": 1.03851914, + "balance_loss_mlp": 1.02134395, + "epoch": 0.717571020592214, + "flos": 23914962777600.0, + "grad_norm": 2.3786346670781886, + "language_loss": 0.86663944, + "learning_rate": 7.798653327195237e-07, + "loss": 0.88800436, + "num_input_tokens_seen": 257459310, + "step": 11935, + "time_per_iteration": 2.7059433460235596 + }, + { + "auxiliary_loss_clip": 0.01068502, + "auxiliary_loss_mlp": 0.01030899, + "balance_loss_clip": 1.03335261, + "balance_loss_mlp": 1.01750588, + "epoch": 0.717631143844882, + "flos": 38256930109440.0, + "grad_norm": 1.5650811923001593, + "language_loss": 0.73900878, + "learning_rate": 7.795567660576388e-07, + "loss": 0.76000285, + "num_input_tokens_seen": 257484750, + "step": 11936, + "time_per_iteration": 2.8850317001342773 + }, + { + "auxiliary_loss_clip": 0.01029429, + "auxiliary_loss_mlp": 0.01001743, + "balance_loss_clip": 1.00656271, + "balance_loss_mlp": 1.00076544, + "epoch": 0.7176912670975499, + "flos": 65515896328320.0, + "grad_norm": 0.7545285974494826, + "language_loss": 0.55848956, + "learning_rate": 7.79248245675082e-07, + "loss": 0.57880127, + "num_input_tokens_seen": 257543110, + "step": 11937, + "time_per_iteration": 3.1446144580841064 + }, + { + "auxiliary_loss_clip": 0.01104456, + "auxiliary_loss_mlp": 0.01037308, + "balance_loss_clip": 1.03975892, + "balance_loss_mlp": 1.02318776, + "epoch": 0.717751390350218, + "flos": 31281066610560.0, + "grad_norm": 1.8325127153814165, + "language_loss": 0.54673332, + "learning_rate": 7.789397715835542e-07, + "loss": 0.568151, + "num_input_tokens_seen": 257567410, + "step": 11938, + "time_per_iteration": 2.7281179428100586 + }, + { + "auxiliary_loss_clip": 0.01098499, + "auxiliary_loss_mlp": 0.01031442, + "balance_loss_clip": 1.03886163, + "balance_loss_mlp": 1.01891303, + "epoch": 0.7178115136028859, + "flos": 19859031774720.0, + "grad_norm": 1.5418999350026745, + "language_loss": 0.76693535, + "learning_rate": 7.786313437947527e-07, + "loss": 0.78823477, + "num_input_tokens_seen": 257586270, + "step": 11939, + "time_per_iteration": 2.681007146835327 + }, + { + "auxiliary_loss_clip": 0.01013928, + "auxiliary_loss_mlp": 0.01000101, + "balance_loss_clip": 1.01088846, + "balance_loss_mlp": 0.99894488, + "epoch": 0.7178716368555539, + "flos": 64348655967360.0, + "grad_norm": 0.7513743107787466, + "language_loss": 0.61356354, + "learning_rate": 7.783229623203738e-07, + "loss": 0.63370389, + "num_input_tokens_seen": 257647415, + "step": 11940, + "time_per_iteration": 3.202899694442749 + }, + { + "auxiliary_loss_clip": 0.01071936, + "auxiliary_loss_mlp": 0.01033376, + "balance_loss_clip": 1.03435445, + "balance_loss_mlp": 1.02100182, + "epoch": 0.7179317601082219, + "flos": 26762607152640.0, + "grad_norm": 1.8000940083228283, + "language_loss": 0.58835107, + "learning_rate": 7.780146271721097e-07, + "loss": 0.60940421, + "num_input_tokens_seen": 257669795, + "step": 11941, + "time_per_iteration": 2.8157269954681396 + }, + { + "auxiliary_loss_clip": 0.01090967, + "auxiliary_loss_mlp": 0.01035406, + "balance_loss_clip": 1.03997254, + "balance_loss_mlp": 1.02213192, + "epoch": 0.7179918833608898, + "flos": 23513804709120.0, + "grad_norm": 1.9761608738591345, + "language_loss": 0.79027683, + "learning_rate": 7.777063383616543e-07, + "loss": 0.8115406, + "num_input_tokens_seen": 257687415, + "step": 11942, + "time_per_iteration": 4.7641441822052 + }, + { + "auxiliary_loss_clip": 0.01101717, + "auxiliary_loss_mlp": 0.01043851, + "balance_loss_clip": 1.03940737, + "balance_loss_mlp": 1.03082132, + "epoch": 0.7180520066135578, + "flos": 17165588486400.0, + "grad_norm": 2.14920903348502, + "language_loss": 0.66369361, + "learning_rate": 7.773980959006968e-07, + "loss": 0.68514931, + "num_input_tokens_seen": 257706215, + "step": 11943, + "time_per_iteration": 4.182480335235596 + }, + { + "auxiliary_loss_clip": 0.01111064, + "auxiliary_loss_mlp": 0.01032443, + "balance_loss_clip": 1.03972828, + "balance_loss_mlp": 1.01943135, + "epoch": 0.7181121298662257, + "flos": 17566638814080.0, + "grad_norm": 1.806449010910671, + "language_loss": 0.79078984, + "learning_rate": 7.770898998009254e-07, + "loss": 0.81222498, + "num_input_tokens_seen": 257724740, + "step": 11944, + "time_per_iteration": 2.5949878692626953 + }, + { + "auxiliary_loss_clip": 0.01088381, + "auxiliary_loss_mlp": 0.00771584, + "balance_loss_clip": 1.0390811, + "balance_loss_mlp": 1.00018096, + "epoch": 0.7181722531188938, + "flos": 11947660508160.0, + "grad_norm": 2.453862625605413, + "language_loss": 0.63021427, + "learning_rate": 7.767817500740277e-07, + "loss": 0.64881396, + "num_input_tokens_seen": 257742060, + "step": 11945, + "time_per_iteration": 4.4570722579956055 + }, + { + "auxiliary_loss_clip": 0.01016433, + "auxiliary_loss_mlp": 0.01004566, + "balance_loss_clip": 1.00740266, + "balance_loss_mlp": 1.00340927, + "epoch": 0.7182323763715617, + "flos": 65503649790720.0, + "grad_norm": 0.7009639524775984, + "language_loss": 0.51063281, + "learning_rate": 7.76473646731689e-07, + "loss": 0.53084278, + "num_input_tokens_seen": 257802250, + "step": 11946, + "time_per_iteration": 3.083326816558838 + }, + { + "auxiliary_loss_clip": 0.01082274, + "auxiliary_loss_mlp": 0.01035503, + "balance_loss_clip": 1.03858232, + "balance_loss_mlp": 1.02061403, + "epoch": 0.7182924996242297, + "flos": 20630932070400.0, + "grad_norm": 1.6221308546961208, + "language_loss": 0.74305403, + "learning_rate": 7.761655897855925e-07, + "loss": 0.7642318, + "num_input_tokens_seen": 257821155, + "step": 11947, + "time_per_iteration": 2.690142869949341 + }, + { + "auxiliary_loss_clip": 0.01063215, + "auxiliary_loss_mlp": 0.00770856, + "balance_loss_clip": 1.03264832, + "balance_loss_mlp": 1.0000999, + "epoch": 0.7183526228768976, + "flos": 16216433550720.0, + "grad_norm": 1.4641702489475559, + "language_loss": 0.72301382, + "learning_rate": 7.758575792474187e-07, + "loss": 0.74135453, + "num_input_tokens_seen": 257839905, + "step": 11948, + "time_per_iteration": 2.722843647003174 + }, + { + "auxiliary_loss_clip": 0.01090958, + "auxiliary_loss_mlp": 0.01044843, + "balance_loss_clip": 1.03650224, + "balance_loss_mlp": 1.0302515, + "epoch": 0.7184127461295656, + "flos": 22232655342720.0, + "grad_norm": 1.5800605567869153, + "language_loss": 0.71426845, + "learning_rate": 7.755496151288483e-07, + "loss": 0.73562646, + "num_input_tokens_seen": 257860055, + "step": 11949, + "time_per_iteration": 2.6724255084991455 + }, + { + "auxiliary_loss_clip": 0.01110775, + "auxiliary_loss_mlp": 0.00770919, + "balance_loss_clip": 1.03964746, + "balance_loss_mlp": 1.00022686, + "epoch": 0.7184728693822335, + "flos": 27344503480320.0, + "grad_norm": 2.2408917866135116, + "language_loss": 0.76207352, + "learning_rate": 7.752416974415598e-07, + "loss": 0.78089041, + "num_input_tokens_seen": 257879315, + "step": 11950, + "time_per_iteration": 4.192263603210449 + }, + { + "auxiliary_loss_clip": 0.011156, + "auxiliary_loss_mlp": 0.01034636, + "balance_loss_clip": 1.04076946, + "balance_loss_mlp": 1.02039647, + "epoch": 0.7185329926349016, + "flos": 16508530949760.0, + "grad_norm": 2.236939541243443, + "language_loss": 0.67911047, + "learning_rate": 7.749338261972282e-07, + "loss": 0.70061278, + "num_input_tokens_seen": 257896570, + "step": 11951, + "time_per_iteration": 2.506354808807373 + }, + { + "auxiliary_loss_clip": 0.01093328, + "auxiliary_loss_mlp": 0.01038497, + "balance_loss_clip": 1.03931642, + "balance_loss_mlp": 1.02329814, + "epoch": 0.7185931158875695, + "flos": 23951052967680.0, + "grad_norm": 1.74286410335133, + "language_loss": 0.78158391, + "learning_rate": 7.746260014075286e-07, + "loss": 0.8029021, + "num_input_tokens_seen": 257916855, + "step": 11952, + "time_per_iteration": 2.660937547683716 + }, + { + "auxiliary_loss_clip": 0.01106031, + "auxiliary_loss_mlp": 0.01036025, + "balance_loss_clip": 1.03961015, + "balance_loss_mlp": 1.02241182, + "epoch": 0.7186532391402375, + "flos": 26542007775360.0, + "grad_norm": 1.8142092778297234, + "language_loss": 0.74966663, + "learning_rate": 7.743182230841352e-07, + "loss": 0.77108717, + "num_input_tokens_seen": 257937140, + "step": 11953, + "time_per_iteration": 2.64990234375 + }, + { + "auxiliary_loss_clip": 0.01104406, + "auxiliary_loss_mlp": 0.010347, + "balance_loss_clip": 1.03859532, + "balance_loss_mlp": 1.0209074, + "epoch": 0.7187133623929055, + "flos": 22383049587840.0, + "grad_norm": 1.8633986860843366, + "language_loss": 0.73231012, + "learning_rate": 7.740104912387164e-07, + "loss": 0.75370121, + "num_input_tokens_seen": 257956785, + "step": 11954, + "time_per_iteration": 2.667728900909424 + }, + { + "auxiliary_loss_clip": 0.01092336, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.04056668, + "balance_loss_mlp": 1.02468944, + "epoch": 0.7187734856455734, + "flos": 15779580341760.0, + "grad_norm": 1.6371467452088548, + "language_loss": 0.7436921, + "learning_rate": 7.737028058829425e-07, + "loss": 0.76499295, + "num_input_tokens_seen": 257975455, + "step": 11955, + "time_per_iteration": 2.750943660736084 + }, + { + "auxiliary_loss_clip": 0.01077053, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.03667569, + "balance_loss_mlp": 1.02145171, + "epoch": 0.7188336088982414, + "flos": 31759612531200.0, + "grad_norm": 1.63362456002572, + "language_loss": 0.73112231, + "learning_rate": 7.733951670284817e-07, + "loss": 0.75224108, + "num_input_tokens_seen": 257996850, + "step": 11956, + "time_per_iteration": 2.7964000701904297 + }, + { + "auxiliary_loss_clip": 0.01027108, + "auxiliary_loss_mlp": 0.01054242, + "balance_loss_clip": 1.0295012, + "balance_loss_mlp": 1.0388875, + "epoch": 0.7188937321509093, + "flos": 21465208333440.0, + "grad_norm": 1.634055582279059, + "language_loss": 0.71066529, + "learning_rate": 7.730875746869987e-07, + "loss": 0.73147881, + "num_input_tokens_seen": 258016145, + "step": 11957, + "time_per_iteration": 2.920449733734131 + }, + { + "auxiliary_loss_clip": 0.01066083, + "auxiliary_loss_mlp": 0.01046033, + "balance_loss_clip": 1.03746307, + "balance_loss_mlp": 1.03144193, + "epoch": 0.7189538554035774, + "flos": 27271497087360.0, + "grad_norm": 1.9298649142974575, + "language_loss": 0.73817873, + "learning_rate": 7.727800288701582e-07, + "loss": 0.75929987, + "num_input_tokens_seen": 258035420, + "step": 11958, + "time_per_iteration": 2.8204050064086914 + }, + { + "auxiliary_loss_clip": 0.01097894, + "auxiliary_loss_mlp": 0.01043657, + "balance_loss_clip": 1.03673959, + "balance_loss_mlp": 1.03006124, + "epoch": 0.7190139786562453, + "flos": 21580625710080.0, + "grad_norm": 1.5794968369614186, + "language_loss": 0.83998394, + "learning_rate": 7.724725295896215e-07, + "loss": 0.86139941, + "num_input_tokens_seen": 258053520, + "step": 11959, + "time_per_iteration": 2.7135143280029297 + }, + { + "auxiliary_loss_clip": 0.01118944, + "auxiliary_loss_mlp": 0.01033809, + "balance_loss_clip": 1.04263496, + "balance_loss_mlp": 1.0193491, + "epoch": 0.7190741019089133, + "flos": 26721237663360.0, + "grad_norm": 1.6672676962556263, + "language_loss": 0.81917083, + "learning_rate": 7.7216507685705e-07, + "loss": 0.84069836, + "num_input_tokens_seen": 258073020, + "step": 11960, + "time_per_iteration": 2.6510887145996094 + }, + { + "auxiliary_loss_clip": 0.01085237, + "auxiliary_loss_mlp": 0.01040267, + "balance_loss_clip": 1.03664184, + "balance_loss_mlp": 1.02624774, + "epoch": 0.7191342251615812, + "flos": 26104759516800.0, + "grad_norm": 1.541177269995967, + "language_loss": 0.77309084, + "learning_rate": 7.718576706841013e-07, + "loss": 0.79434586, + "num_input_tokens_seen": 258093155, + "step": 11961, + "time_per_iteration": 2.720644950866699 + }, + { + "auxiliary_loss_clip": 0.01093865, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.03698349, + "balance_loss_mlp": 1.02280951, + "epoch": 0.7191943484142492, + "flos": 22967028904320.0, + "grad_norm": 1.422930099710146, + "language_loss": 0.75150669, + "learning_rate": 7.715503110824326e-07, + "loss": 0.7727946, + "num_input_tokens_seen": 258113905, + "step": 11962, + "time_per_iteration": 2.602642774581909 + }, + { + "auxiliary_loss_clip": 0.01101563, + "auxiliary_loss_mlp": 0.01033205, + "balance_loss_clip": 1.03852582, + "balance_loss_mlp": 1.01952553, + "epoch": 0.7192544716669171, + "flos": 22565332131840.0, + "grad_norm": 1.6830971031616218, + "language_loss": 0.74998534, + "learning_rate": 7.712429980637001e-07, + "loss": 0.77133304, + "num_input_tokens_seen": 258132820, + "step": 11963, + "time_per_iteration": 2.6065595149993896 + }, + { + "auxiliary_loss_clip": 0.01076507, + "auxiliary_loss_mlp": 0.0103598, + "balance_loss_clip": 1.03903389, + "balance_loss_mlp": 1.02130532, + "epoch": 0.7193145949195852, + "flos": 18982200873600.0, + "grad_norm": 2.2290722742706253, + "language_loss": 0.80742419, + "learning_rate": 7.709357316395564e-07, + "loss": 0.82854903, + "num_input_tokens_seen": 258148055, + "step": 11964, + "time_per_iteration": 2.623037338256836 + }, + { + "auxiliary_loss_clip": 0.0110166, + "auxiliary_loss_mlp": 0.01035653, + "balance_loss_clip": 1.03931797, + "balance_loss_mlp": 1.02267718, + "epoch": 0.7193747181722531, + "flos": 18004246208640.0, + "grad_norm": 1.8511533341084931, + "language_loss": 0.74847329, + "learning_rate": 7.70628511821652e-07, + "loss": 0.76984644, + "num_input_tokens_seen": 258165995, + "step": 11965, + "time_per_iteration": 2.6308131217956543 + }, + { + "auxiliary_loss_clip": 0.01088669, + "auxiliary_loss_mlp": 0.0103597, + "balance_loss_clip": 1.04116011, + "balance_loss_mlp": 1.02225494, + "epoch": 0.7194348414249211, + "flos": 24389414547840.0, + "grad_norm": 1.5072398598153138, + "language_loss": 0.77484959, + "learning_rate": 7.703213386216377e-07, + "loss": 0.79609603, + "num_input_tokens_seen": 258186165, + "step": 11966, + "time_per_iteration": 2.7064943313598633 + }, + { + "auxiliary_loss_clip": 0.0108693, + "auxiliary_loss_mlp": 0.01040354, + "balance_loss_clip": 1.03570664, + "balance_loss_mlp": 1.02598929, + "epoch": 0.7194949646775891, + "flos": 22163455791360.0, + "grad_norm": 2.094523780207328, + "language_loss": 0.72974217, + "learning_rate": 7.700142120511619e-07, + "loss": 0.75101507, + "num_input_tokens_seen": 258204595, + "step": 11967, + "time_per_iteration": 2.6798341274261475 + }, + { + "auxiliary_loss_clip": 0.01084414, + "auxiliary_loss_mlp": 0.01030462, + "balance_loss_clip": 1.03810835, + "balance_loss_mlp": 1.01876187, + "epoch": 0.719555087930257, + "flos": 20266366982400.0, + "grad_norm": 1.6400995747939784, + "language_loss": 0.81876254, + "learning_rate": 7.6970713212187e-07, + "loss": 0.83991134, + "num_input_tokens_seen": 258223110, + "step": 11968, + "time_per_iteration": 2.5945241451263428 + }, + { + "auxiliary_loss_clip": 0.01090809, + "auxiliary_loss_mlp": 0.01030819, + "balance_loss_clip": 1.03921008, + "balance_loss_mlp": 1.01730037, + "epoch": 0.719615211182925, + "flos": 24716309247360.0, + "grad_norm": 6.059293757732166, + "language_loss": 0.76039946, + "learning_rate": 7.69400098845407e-07, + "loss": 0.78161573, + "num_input_tokens_seen": 258242660, + "step": 11969, + "time_per_iteration": 2.669769763946533 + }, + { + "auxiliary_loss_clip": 0.01071764, + "auxiliary_loss_mlp": 0.01035075, + "balance_loss_clip": 1.03422332, + "balance_loss_mlp": 1.02085924, + "epoch": 0.719675334435593, + "flos": 20009641501440.0, + "grad_norm": 1.7540280095141672, + "language_loss": 0.71060121, + "learning_rate": 7.69093112233417e-07, + "loss": 0.7316696, + "num_input_tokens_seen": 258261850, + "step": 11970, + "time_per_iteration": 2.679556131362915 + }, + { + "auxiliary_loss_clip": 0.01013659, + "auxiliary_loss_mlp": 0.01009131, + "balance_loss_clip": 1.00968122, + "balance_loss_mlp": 1.00800419, + "epoch": 0.719735457688261, + "flos": 44199861177600.0, + "grad_norm": 0.9164669258671052, + "language_loss": 0.60825729, + "learning_rate": 7.68786172297538e-07, + "loss": 0.6284852, + "num_input_tokens_seen": 258312570, + "step": 11971, + "time_per_iteration": 3.07918381690979 + }, + { + "auxiliary_loss_clip": 0.01119878, + "auxiliary_loss_mlp": 0.0103657, + "balance_loss_clip": 1.04122591, + "balance_loss_mlp": 1.02223504, + "epoch": 0.7197955809409289, + "flos": 16802890905600.0, + "grad_norm": 2.0890119632055772, + "language_loss": 0.80200607, + "learning_rate": 7.684792790494105e-07, + "loss": 0.82357055, + "num_input_tokens_seen": 258331600, + "step": 11972, + "time_per_iteration": 2.6157615184783936 + }, + { + "auxiliary_loss_clip": 0.01094231, + "auxiliary_loss_mlp": 0.01036827, + "balance_loss_clip": 1.03909624, + "balance_loss_mlp": 1.02286744, + "epoch": 0.7198557041935969, + "flos": 24535391420160.0, + "grad_norm": 1.4459296159534718, + "language_loss": 0.75361621, + "learning_rate": 7.681724325006733e-07, + "loss": 0.77492678, + "num_input_tokens_seen": 258351785, + "step": 11973, + "time_per_iteration": 2.7092697620391846 + }, + { + "auxiliary_loss_clip": 0.00998126, + "auxiliary_loss_mlp": 0.01000341, + "balance_loss_clip": 1.01353586, + "balance_loss_mlp": 0.99922049, + "epoch": 0.7199158274462648, + "flos": 70710839602560.0, + "grad_norm": 0.8513128948563679, + "language_loss": 0.5708431, + "learning_rate": 7.6786563266296e-07, + "loss": 0.5908277, + "num_input_tokens_seen": 258404035, + "step": 11974, + "time_per_iteration": 3.085857391357422 + }, + { + "auxiliary_loss_clip": 0.01087282, + "auxiliary_loss_mlp": 0.01034258, + "balance_loss_clip": 1.03747392, + "balance_loss_mlp": 1.02043021, + "epoch": 0.7199759506989328, + "flos": 29347995352320.0, + "grad_norm": 2.3096725812803225, + "language_loss": 0.61059892, + "learning_rate": 7.675588795479062e-07, + "loss": 0.6318143, + "num_input_tokens_seen": 258424850, + "step": 11975, + "time_per_iteration": 2.7332818508148193 + }, + { + "auxiliary_loss_clip": 0.01100807, + "auxiliary_loss_mlp": 0.01033871, + "balance_loss_clip": 1.03652167, + "balance_loss_mlp": 1.02041817, + "epoch": 0.7200360739516007, + "flos": 24640465680000.0, + "grad_norm": 2.671508087455807, + "language_loss": 0.67916059, + "learning_rate": 7.672521731671425e-07, + "loss": 0.7005074, + "num_input_tokens_seen": 258445485, + "step": 11976, + "time_per_iteration": 2.6940202713012695 + }, + { + "auxiliary_loss_clip": 0.0108397, + "auxiliary_loss_mlp": 0.01030323, + "balance_loss_clip": 1.03955865, + "balance_loss_mlp": 1.0175494, + "epoch": 0.7200961972042688, + "flos": 20812855478400.0, + "grad_norm": 1.8443077153848637, + "language_loss": 0.67261469, + "learning_rate": 7.669455135323004e-07, + "loss": 0.69375765, + "num_input_tokens_seen": 258464505, + "step": 11977, + "time_per_iteration": 2.6581647396087646 + }, + { + "auxiliary_loss_clip": 0.01091707, + "auxiliary_loss_mlp": 0.01036372, + "balance_loss_clip": 1.03710294, + "balance_loss_mlp": 1.02315187, + "epoch": 0.7201563204569367, + "flos": 31245910174080.0, + "grad_norm": 1.5443170627433962, + "language_loss": 0.75495118, + "learning_rate": 7.666389006550074e-07, + "loss": 0.776232, + "num_input_tokens_seen": 258487190, + "step": 11978, + "time_per_iteration": 2.8164350986480713 + }, + { + "auxiliary_loss_clip": 0.0111045, + "auxiliary_loss_mlp": 0.01033615, + "balance_loss_clip": 1.03798056, + "balance_loss_mlp": 1.02009642, + "epoch": 0.7202164437096047, + "flos": 26651391667200.0, + "grad_norm": 2.011628151794158, + "language_loss": 0.78906727, + "learning_rate": 7.663323345468908e-07, + "loss": 0.81050789, + "num_input_tokens_seen": 258503790, + "step": 11979, + "time_per_iteration": 2.603609323501587 + }, + { + "auxiliary_loss_clip": 0.01100805, + "auxiliary_loss_mlp": 0.01032476, + "balance_loss_clip": 1.03782308, + "balance_loss_mlp": 1.01863027, + "epoch": 0.7202765669622727, + "flos": 25959608657280.0, + "grad_norm": 1.489458439869756, + "language_loss": 0.64516908, + "learning_rate": 7.660258152195767e-07, + "loss": 0.66650194, + "num_input_tokens_seen": 258527335, + "step": 11980, + "time_per_iteration": 2.6712260246276855 + }, + { + "auxiliary_loss_clip": 0.01106474, + "auxiliary_loss_mlp": 0.01037898, + "balance_loss_clip": 1.04096806, + "balance_loss_mlp": 1.02322936, + "epoch": 0.7203366902149406, + "flos": 28512354372480.0, + "grad_norm": 3.283132344520263, + "language_loss": 0.67034644, + "learning_rate": 7.657193426846871e-07, + "loss": 0.69179016, + "num_input_tokens_seen": 258546690, + "step": 11981, + "time_per_iteration": 4.248534202575684 + }, + { + "auxiliary_loss_clip": 0.01080413, + "auxiliary_loss_mlp": 0.01035174, + "balance_loss_clip": 1.03540182, + "balance_loss_mlp": 1.02077293, + "epoch": 0.7203968134676086, + "flos": 21106030285440.0, + "grad_norm": 1.9200957279106055, + "language_loss": 0.74228042, + "learning_rate": 7.65412916953843e-07, + "loss": 0.76343632, + "num_input_tokens_seen": 258566340, + "step": 11982, + "time_per_iteration": 2.6612656116485596 + }, + { + "auxiliary_loss_clip": 0.01082612, + "auxiliary_loss_mlp": 0.00771666, + "balance_loss_clip": 1.03610659, + "balance_loss_mlp": 1.00010824, + "epoch": 0.7204569367202766, + "flos": 18332146488960.0, + "grad_norm": 1.9444187102114145, + "language_loss": 0.65890288, + "learning_rate": 7.65106538038665e-07, + "loss": 0.67744565, + "num_input_tokens_seen": 258584455, + "step": 11983, + "time_per_iteration": 5.959589004516602 + }, + { + "auxiliary_loss_clip": 0.01084437, + "auxiliary_loss_mlp": 0.01035638, + "balance_loss_clip": 1.04208398, + "balance_loss_mlp": 1.02224469, + "epoch": 0.7205170599729446, + "flos": 23255103980160.0, + "grad_norm": 1.5232420204646802, + "language_loss": 0.66515326, + "learning_rate": 7.648002059507715e-07, + "loss": 0.68635398, + "num_input_tokens_seen": 258604725, + "step": 11984, + "time_per_iteration": 2.6606063842773438 + }, + { + "auxiliary_loss_clip": 0.01102672, + "auxiliary_loss_mlp": 0.01035615, + "balance_loss_clip": 1.03870726, + "balance_loss_mlp": 1.02119064, + "epoch": 0.7205771832256125, + "flos": 20120892900480.0, + "grad_norm": 1.688320312491579, + "language_loss": 0.74081761, + "learning_rate": 7.644939207017771e-07, + "loss": 0.76220047, + "num_input_tokens_seen": 258622885, + "step": 11985, + "time_per_iteration": 2.6758813858032227 + }, + { + "auxiliary_loss_clip": 0.01100706, + "auxiliary_loss_mlp": 0.01032026, + "balance_loss_clip": 1.03882444, + "balance_loss_mlp": 1.01896691, + "epoch": 0.7206373064782805, + "flos": 27703250565120.0, + "grad_norm": 2.1824579845147287, + "language_loss": 0.62681192, + "learning_rate": 7.641876823032977e-07, + "loss": 0.64813924, + "num_input_tokens_seen": 258644305, + "step": 11986, + "time_per_iteration": 2.6787214279174805 + }, + { + "auxiliary_loss_clip": 0.01094506, + "auxiliary_loss_mlp": 0.01035959, + "balance_loss_clip": 1.0400337, + "balance_loss_mlp": 1.02129614, + "epoch": 0.7206974297309484, + "flos": 17968156018560.0, + "grad_norm": 1.6774381581209574, + "language_loss": 0.72387213, + "learning_rate": 7.638814907669455e-07, + "loss": 0.74517679, + "num_input_tokens_seen": 258661775, + "step": 11987, + "time_per_iteration": 2.6494300365448 + }, + { + "auxiliary_loss_clip": 0.01091554, + "auxiliary_loss_mlp": 0.01036778, + "balance_loss_clip": 1.03807402, + "balance_loss_mlp": 1.0230689, + "epoch": 0.7207575529836164, + "flos": 16983162288000.0, + "grad_norm": 2.0154158747708886, + "language_loss": 0.78542352, + "learning_rate": 7.635753461043301e-07, + "loss": 0.80670691, + "num_input_tokens_seen": 258679830, + "step": 11988, + "time_per_iteration": 2.7818825244903564 + }, + { + "auxiliary_loss_clip": 0.01112006, + "auxiliary_loss_mlp": 0.0103674, + "balance_loss_clip": 1.03854907, + "balance_loss_mlp": 1.02319229, + "epoch": 0.7208176762362843, + "flos": 18727594295040.0, + "grad_norm": 2.5683487455576013, + "language_loss": 0.78912222, + "learning_rate": 7.632692483270618e-07, + "loss": 0.8106097, + "num_input_tokens_seen": 258697415, + "step": 11989, + "time_per_iteration": 4.105331659317017 + }, + { + "auxiliary_loss_clip": 0.01110244, + "auxiliary_loss_mlp": 0.01035931, + "balance_loss_clip": 1.03845143, + "balance_loss_mlp": 1.02281189, + "epoch": 0.7208777994889524, + "flos": 18734489706240.0, + "grad_norm": 1.667538370245498, + "language_loss": 0.8218925, + "learning_rate": 7.629631974467481e-07, + "loss": 0.84335428, + "num_input_tokens_seen": 258716755, + "step": 11990, + "time_per_iteration": 2.59250545501709 + }, + { + "auxiliary_loss_clip": 0.01084798, + "auxiliary_loss_mlp": 0.01039501, + "balance_loss_clip": 1.03765297, + "balance_loss_mlp": 1.0263406, + "epoch": 0.7209379227416203, + "flos": 14793437376000.0, + "grad_norm": 2.0017944237848146, + "language_loss": 0.76018798, + "learning_rate": 7.626571934749931e-07, + "loss": 0.78143102, + "num_input_tokens_seen": 258733270, + "step": 11991, + "time_per_iteration": 2.6581742763519287 + }, + { + "auxiliary_loss_clip": 0.01069068, + "auxiliary_loss_mlp": 0.01036802, + "balance_loss_clip": 1.03637481, + "balance_loss_mlp": 1.02277708, + "epoch": 0.7209980459942883, + "flos": 29636860527360.0, + "grad_norm": 1.4417836781723634, + "language_loss": 0.7278806, + "learning_rate": 7.623512364234022e-07, + "loss": 0.74893934, + "num_input_tokens_seen": 258755270, + "step": 11992, + "time_per_iteration": 2.762066602706909 + }, + { + "auxiliary_loss_clip": 0.01101853, + "auxiliary_loss_mlp": 0.01035181, + "balance_loss_clip": 1.03684831, + "balance_loss_mlp": 1.0217396, + "epoch": 0.7210581692469563, + "flos": 23477175815040.0, + "grad_norm": 1.590664380995942, + "language_loss": 0.66213107, + "learning_rate": 7.620453263035755e-07, + "loss": 0.68350136, + "num_input_tokens_seen": 258775340, + "step": 11993, + "time_per_iteration": 2.669746160507202 + }, + { + "auxiliary_loss_clip": 0.01103083, + "auxiliary_loss_mlp": 0.01034792, + "balance_loss_clip": 1.03803623, + "balance_loss_mlp": 1.02193534, + "epoch": 0.7211182924996242, + "flos": 26099839353600.0, + "grad_norm": 3.884072112544962, + "language_loss": 0.65876019, + "learning_rate": 7.61739463127115e-07, + "loss": 0.68013895, + "num_input_tokens_seen": 258794580, + "step": 11994, + "time_per_iteration": 2.6249778270721436 + }, + { + "auxiliary_loss_clip": 0.01103021, + "auxiliary_loss_mlp": 0.01036805, + "balance_loss_clip": 1.03799295, + "balance_loss_mlp": 1.02208841, + "epoch": 0.7211784157522922, + "flos": 17712076982400.0, + "grad_norm": 2.8589170011893006, + "language_loss": 0.67324853, + "learning_rate": 7.614336469056172e-07, + "loss": 0.69464678, + "num_input_tokens_seen": 258812330, + "step": 11995, + "time_per_iteration": 2.5577452182769775 + }, + { + "auxiliary_loss_clip": 0.01084316, + "auxiliary_loss_mlp": 0.01033821, + "balance_loss_clip": 1.03543901, + "balance_loss_mlp": 1.01986206, + "epoch": 0.7212385390049602, + "flos": 24423637230720.0, + "grad_norm": 2.2331481184505537, + "language_loss": 0.79888833, + "learning_rate": 7.6112787765068e-07, + "loss": 0.82006973, + "num_input_tokens_seen": 258831770, + "step": 11996, + "time_per_iteration": 2.6798765659332275 + }, + { + "auxiliary_loss_clip": 0.01112754, + "auxiliary_loss_mlp": 0.01038784, + "balance_loss_clip": 1.03908491, + "balance_loss_mlp": 1.02556992, + "epoch": 0.7212986622576282, + "flos": 28147250580480.0, + "grad_norm": 1.9315796052948224, + "language_loss": 0.81023175, + "learning_rate": 7.60822155373899e-07, + "loss": 0.83174717, + "num_input_tokens_seen": 258849090, + "step": 11997, + "time_per_iteration": 2.656759023666382 + }, + { + "auxiliary_loss_clip": 0.01114647, + "auxiliary_loss_mlp": 0.0103518, + "balance_loss_clip": 1.03915894, + "balance_loss_mlp": 1.02126861, + "epoch": 0.7213587855102961, + "flos": 21835770992640.0, + "grad_norm": 1.8930751745760046, + "language_loss": 0.67190164, + "learning_rate": 7.605164800868646e-07, + "loss": 0.69339991, + "num_input_tokens_seen": 258868230, + "step": 11998, + "time_per_iteration": 2.6269752979278564 + }, + { + "auxiliary_loss_clip": 0.01113247, + "auxiliary_loss_mlp": 0.01032004, + "balance_loss_clip": 1.0402633, + "balance_loss_mlp": 1.01999927, + "epoch": 0.7214189087629641, + "flos": 14611549881600.0, + "grad_norm": 2.2123816168992287, + "language_loss": 0.72197175, + "learning_rate": 7.602108518011696e-07, + "loss": 0.74342418, + "num_input_tokens_seen": 258885525, + "step": 11999, + "time_per_iteration": 2.7030436992645264 + }, + { + "auxiliary_loss_clip": 0.01095225, + "auxiliary_loss_mlp": 0.01030017, + "balance_loss_clip": 1.03975248, + "balance_loss_mlp": 1.01632595, + "epoch": 0.721479032015632, + "flos": 19390864884480.0, + "grad_norm": 2.1896556782870986, + "language_loss": 0.82891619, + "learning_rate": 7.599052705284039e-07, + "loss": 0.85016865, + "num_input_tokens_seen": 258903245, + "step": 12000, + "time_per_iteration": 2.72419810295105 + }, + { + "auxiliary_loss_clip": 0.0110488, + "auxiliary_loss_mlp": 0.01036877, + "balance_loss_clip": 1.04077649, + "balance_loss_mlp": 1.02337074, + "epoch": 0.7215391552683, + "flos": 18512884748160.0, + "grad_norm": 2.238210081957985, + "language_loss": 0.77015889, + "learning_rate": 7.59599736280154e-07, + "loss": 0.79157639, + "num_input_tokens_seen": 258921245, + "step": 12001, + "time_per_iteration": 2.6786983013153076 + }, + { + "auxiliary_loss_clip": 0.01096613, + "auxiliary_loss_mlp": 0.01041613, + "balance_loss_clip": 1.03922153, + "balance_loss_mlp": 1.02826142, + "epoch": 0.721599278520968, + "flos": 23258731253760.0, + "grad_norm": 1.7647688561278618, + "language_loss": 0.81434, + "learning_rate": 7.592942490680066e-07, + "loss": 0.83572221, + "num_input_tokens_seen": 258939425, + "step": 12002, + "time_per_iteration": 2.766787052154541 + }, + { + "auxiliary_loss_clip": 0.01103657, + "auxiliary_loss_mlp": 0.0102914, + "balance_loss_clip": 1.03956521, + "balance_loss_mlp": 1.01506686, + "epoch": 0.721659401773636, + "flos": 39199045979520.0, + "grad_norm": 1.90156490746599, + "language_loss": 0.62442046, + "learning_rate": 7.589888089035462e-07, + "loss": 0.64574844, + "num_input_tokens_seen": 258960710, + "step": 12003, + "time_per_iteration": 2.7572412490844727 + }, + { + "auxiliary_loss_clip": 0.01114647, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.0397718, + "balance_loss_mlp": 1.019418, + "epoch": 0.7217195250263039, + "flos": 14939917038720.0, + "grad_norm": 2.6609118210523146, + "language_loss": 0.6843828, + "learning_rate": 7.586834157983544e-07, + "loss": 0.70586002, + "num_input_tokens_seen": 258978475, + "step": 12004, + "time_per_iteration": 2.553619623184204 + }, + { + "auxiliary_loss_clip": 0.01013578, + "auxiliary_loss_mlp": 0.01003303, + "balance_loss_clip": 1.01591694, + "balance_loss_mlp": 1.0020926, + "epoch": 0.7217796482789719, + "flos": 70869206666880.0, + "grad_norm": 0.858251890961465, + "language_loss": 0.54091179, + "learning_rate": 7.583780697640112e-07, + "loss": 0.56108057, + "num_input_tokens_seen": 259037520, + "step": 12005, + "time_per_iteration": 3.186676502227783 + }, + { + "auxiliary_loss_clip": 0.0107998, + "auxiliary_loss_mlp": 0.01033092, + "balance_loss_clip": 1.03859079, + "balance_loss_mlp": 1.0192821, + "epoch": 0.7218397715316398, + "flos": 37451525402880.0, + "grad_norm": 1.66711169237072, + "language_loss": 0.63384253, + "learning_rate": 7.580727708120962e-07, + "loss": 0.65497327, + "num_input_tokens_seen": 259061325, + "step": 12006, + "time_per_iteration": 2.8096885681152344 + }, + { + "auxiliary_loss_clip": 0.01084341, + "auxiliary_loss_mlp": 0.01034652, + "balance_loss_clip": 1.03541422, + "balance_loss_mlp": 1.02141964, + "epoch": 0.7218998947843078, + "flos": 22710662559360.0, + "grad_norm": 1.8415091001444905, + "language_loss": 0.91831303, + "learning_rate": 7.577675189541865e-07, + "loss": 0.93950289, + "num_input_tokens_seen": 259078135, + "step": 12007, + "time_per_iteration": 2.636061668395996 + }, + { + "auxiliary_loss_clip": 0.01074819, + "auxiliary_loss_mlp": 0.01038291, + "balance_loss_clip": 1.03386235, + "balance_loss_mlp": 1.02249599, + "epoch": 0.7219600180369758, + "flos": 12167182477440.0, + "grad_norm": 1.9560042300828953, + "language_loss": 0.64139968, + "learning_rate": 7.574623142018568e-07, + "loss": 0.66253078, + "num_input_tokens_seen": 259095910, + "step": 12008, + "time_per_iteration": 2.6658670902252197 + }, + { + "auxiliary_loss_clip": 0.0110234, + "auxiliary_loss_mlp": 0.0103902, + "balance_loss_clip": 1.03860354, + "balance_loss_mlp": 1.02491176, + "epoch": 0.7220201412896438, + "flos": 22596573985920.0, + "grad_norm": 1.9949931171952824, + "language_loss": 0.78768408, + "learning_rate": 7.57157156566681e-07, + "loss": 0.80909771, + "num_input_tokens_seen": 259114225, + "step": 12009, + "time_per_iteration": 2.6496176719665527 + }, + { + "auxiliary_loss_clip": 0.01103715, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.04009509, + "balance_loss_mlp": 1.02490854, + "epoch": 0.7220802645423118, + "flos": 26718651884160.0, + "grad_norm": 1.8397913257632763, + "language_loss": 0.64088428, + "learning_rate": 7.568520460602297e-07, + "loss": 0.66232234, + "num_input_tokens_seen": 259134660, + "step": 12010, + "time_per_iteration": 2.7039434909820557 + }, + { + "auxiliary_loss_clip": 0.01112341, + "auxiliary_loss_mlp": 0.01028267, + "balance_loss_clip": 1.0384059, + "balance_loss_mlp": 1.01517224, + "epoch": 0.7221403877949797, + "flos": 24420548661120.0, + "grad_norm": 2.031062192481546, + "language_loss": 0.7745133, + "learning_rate": 7.565469826940742e-07, + "loss": 0.79591942, + "num_input_tokens_seen": 259153300, + "step": 12011, + "time_per_iteration": 2.6566684246063232 + }, + { + "auxiliary_loss_clip": 0.01095954, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.03788853, + "balance_loss_mlp": 1.02336133, + "epoch": 0.7222005110476477, + "flos": 23514379326720.0, + "grad_norm": 2.0943143808042617, + "language_loss": 0.78936207, + "learning_rate": 7.56241966479781e-07, + "loss": 0.81068206, + "num_input_tokens_seen": 259172115, + "step": 12012, + "time_per_iteration": 2.6651875972747803 + }, + { + "auxiliary_loss_clip": 0.0109279, + "auxiliary_loss_mlp": 0.01031271, + "balance_loss_clip": 1.03982329, + "balance_loss_mlp": 1.01809883, + "epoch": 0.7222606343003156, + "flos": 23112538899840.0, + "grad_norm": 1.7259096547472548, + "language_loss": 0.75816202, + "learning_rate": 7.559369974289171e-07, + "loss": 0.77940267, + "num_input_tokens_seen": 259191345, + "step": 12013, + "time_per_iteration": 2.6666300296783447 + }, + { + "auxiliary_loss_clip": 0.01112282, + "auxiliary_loss_mlp": 0.01027778, + "balance_loss_clip": 1.03951406, + "balance_loss_mlp": 1.01493895, + "epoch": 0.7223207575529836, + "flos": 24351169541760.0, + "grad_norm": 1.5900887482073394, + "language_loss": 0.76009625, + "learning_rate": 7.556320755530484e-07, + "loss": 0.78149676, + "num_input_tokens_seen": 259211700, + "step": 12014, + "time_per_iteration": 2.8077309131622314 + }, + { + "auxiliary_loss_clip": 0.01103939, + "auxiliary_loss_mlp": 0.01031964, + "balance_loss_clip": 1.03792763, + "balance_loss_mlp": 1.01870835, + "epoch": 0.7223808808056515, + "flos": 28330179569280.0, + "grad_norm": 1.5772479389327612, + "language_loss": 0.86851835, + "learning_rate": 7.553272008637346e-07, + "loss": 0.88987738, + "num_input_tokens_seen": 259233825, + "step": 12015, + "time_per_iteration": 2.658083915710449 + }, + { + "auxiliary_loss_clip": 0.01099282, + "auxiliary_loss_mlp": 0.01033999, + "balance_loss_clip": 1.0388813, + "balance_loss_mlp": 1.02105308, + "epoch": 0.7224410040583196, + "flos": 21069437304960.0, + "grad_norm": 1.834690814791336, + "language_loss": 0.7801137, + "learning_rate": 7.55022373372538e-07, + "loss": 0.80144656, + "num_input_tokens_seen": 259253055, + "step": 12016, + "time_per_iteration": 2.623483180999756 + }, + { + "auxiliary_loss_clip": 0.01067391, + "auxiliary_loss_mlp": 0.0105171, + "balance_loss_clip": 1.03403831, + "balance_loss_mlp": 1.03612971, + "epoch": 0.7225011273109875, + "flos": 26795429205120.0, + "grad_norm": 1.3753282936745013, + "language_loss": 0.77807558, + "learning_rate": 7.547175930910186e-07, + "loss": 0.79926664, + "num_input_tokens_seen": 259273420, + "step": 12017, + "time_per_iteration": 2.7652459144592285 + }, + { + "auxiliary_loss_clip": 0.01109706, + "auxiliary_loss_mlp": 0.01031666, + "balance_loss_clip": 1.03881669, + "balance_loss_mlp": 1.01943493, + "epoch": 0.7225612505636555, + "flos": 23583578878080.0, + "grad_norm": 1.9142448581528158, + "language_loss": 0.73780286, + "learning_rate": 7.54412860030732e-07, + "loss": 0.75921661, + "num_input_tokens_seen": 259291000, + "step": 12018, + "time_per_iteration": 2.640007495880127 + }, + { + "auxiliary_loss_clip": 0.01084854, + "auxiliary_loss_mlp": 0.01034783, + "balance_loss_clip": 1.04522383, + "balance_loss_mlp": 1.02281451, + "epoch": 0.7226213738163234, + "flos": 20777627214720.0, + "grad_norm": 4.152096025533445, + "language_loss": 0.77579439, + "learning_rate": 7.541081742032347e-07, + "loss": 0.79699075, + "num_input_tokens_seen": 259312390, + "step": 12019, + "time_per_iteration": 2.6887192726135254 + }, + { + "auxiliary_loss_clip": 0.01087897, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.03979766, + "balance_loss_mlp": 1.01615798, + "epoch": 0.7226814970689914, + "flos": 32635832901120.0, + "grad_norm": 1.8249624922907017, + "language_loss": 0.73749167, + "learning_rate": 7.53803535620081e-07, + "loss": 0.75866961, + "num_input_tokens_seen": 259332645, + "step": 12020, + "time_per_iteration": 2.714838743209839 + }, + { + "auxiliary_loss_clip": 0.01096548, + "auxiliary_loss_mlp": 0.01033021, + "balance_loss_clip": 1.03796768, + "balance_loss_mlp": 1.0203011, + "epoch": 0.7227416203216595, + "flos": 22454368041600.0, + "grad_norm": 1.8291980950612234, + "language_loss": 0.77410042, + "learning_rate": 7.534989442928219e-07, + "loss": 0.79539609, + "num_input_tokens_seen": 259353810, + "step": 12021, + "time_per_iteration": 4.313388347625732 + }, + { + "auxiliary_loss_clip": 0.01074387, + "auxiliary_loss_mlp": 0.0103505, + "balance_loss_clip": 1.03570378, + "balance_loss_mlp": 1.02155018, + "epoch": 0.7228017435743274, + "flos": 21652303299840.0, + "grad_norm": 1.8872518659613802, + "language_loss": 0.68324184, + "learning_rate": 7.531944002330073e-07, + "loss": 0.70433629, + "num_input_tokens_seen": 259372460, + "step": 12022, + "time_per_iteration": 2.7648468017578125 + }, + { + "auxiliary_loss_clip": 0.01102722, + "auxiliary_loss_mlp": 0.0103106, + "balance_loss_clip": 1.03769839, + "balance_loss_mlp": 1.01741076, + "epoch": 0.7228618668269954, + "flos": 29533474206720.0, + "grad_norm": 1.7890580535020497, + "language_loss": 0.69560903, + "learning_rate": 7.528899034521858e-07, + "loss": 0.71694684, + "num_input_tokens_seen": 259393275, + "step": 12023, + "time_per_iteration": 5.942451000213623 + }, + { + "auxiliary_loss_clip": 0.01082247, + "auxiliary_loss_mlp": 0.01030033, + "balance_loss_clip": 1.03305829, + "balance_loss_mlp": 1.0162704, + "epoch": 0.7229219900796633, + "flos": 27453815544960.0, + "grad_norm": 1.630981256405689, + "language_loss": 0.71236169, + "learning_rate": 7.525854539619052e-07, + "loss": 0.73348451, + "num_input_tokens_seen": 259416205, + "step": 12024, + "time_per_iteration": 2.673879861831665 + }, + { + "auxiliary_loss_clip": 0.01079579, + "auxiliary_loss_mlp": 0.01035111, + "balance_loss_clip": 1.0382725, + "balance_loss_mlp": 1.02249229, + "epoch": 0.7229821133323313, + "flos": 16289368116480.0, + "grad_norm": 2.2051730456809655, + "language_loss": 0.75628078, + "learning_rate": 7.522810517737089e-07, + "loss": 0.77742761, + "num_input_tokens_seen": 259433115, + "step": 12025, + "time_per_iteration": 2.7355802059173584 + }, + { + "auxiliary_loss_clip": 0.01099666, + "auxiliary_loss_mlp": 0.01030116, + "balance_loss_clip": 1.03707576, + "balance_loss_mlp": 1.01740193, + "epoch": 0.7230422365849992, + "flos": 20412343854720.0, + "grad_norm": 2.068852797373043, + "language_loss": 0.76397157, + "learning_rate": 7.519766968991395e-07, + "loss": 0.78526938, + "num_input_tokens_seen": 259450475, + "step": 12026, + "time_per_iteration": 2.6082088947296143 + }, + { + "auxiliary_loss_clip": 0.01102144, + "auxiliary_loss_mlp": 0.01042375, + "balance_loss_clip": 1.0383482, + "balance_loss_mlp": 1.02952373, + "epoch": 0.7231023598376672, + "flos": 25593499284480.0, + "grad_norm": 1.9477752433448912, + "language_loss": 0.6773926, + "learning_rate": 7.516723893497388e-07, + "loss": 0.69883776, + "num_input_tokens_seen": 259469355, + "step": 12027, + "time_per_iteration": 2.6620283126831055 + }, + { + "auxiliary_loss_clip": 0.01062411, + "auxiliary_loss_mlp": 0.01030772, + "balance_loss_clip": 1.0401032, + "balance_loss_mlp": 1.0175457, + "epoch": 0.7231624830903352, + "flos": 25149607009920.0, + "grad_norm": 2.2693920109033403, + "language_loss": 0.79310131, + "learning_rate": 7.513681291370469e-07, + "loss": 0.81403315, + "num_input_tokens_seen": 259486565, + "step": 12028, + "time_per_iteration": 4.312790870666504 + }, + { + "auxiliary_loss_clip": 0.01071831, + "auxiliary_loss_mlp": 0.01030546, + "balance_loss_clip": 1.03564012, + "balance_loss_mlp": 1.01683056, + "epoch": 0.7232226063430032, + "flos": 21725740656000.0, + "grad_norm": 1.7649088716190047, + "language_loss": 0.8226198, + "learning_rate": 7.510639162726e-07, + "loss": 0.84364355, + "num_input_tokens_seen": 259505070, + "step": 12029, + "time_per_iteration": 2.6882169246673584 + }, + { + "auxiliary_loss_clip": 0.01012512, + "auxiliary_loss_mlp": 0.01001695, + "balance_loss_clip": 1.01107883, + "balance_loss_mlp": 1.00054455, + "epoch": 0.7232827295956711, + "flos": 68436798491520.0, + "grad_norm": 0.8099058839034723, + "language_loss": 0.61733758, + "learning_rate": 7.507597507679347e-07, + "loss": 0.63747966, + "num_input_tokens_seen": 259569135, + "step": 12030, + "time_per_iteration": 3.252488136291504 + }, + { + "auxiliary_loss_clip": 0.01094272, + "auxiliary_loss_mlp": 0.01037532, + "balance_loss_clip": 1.03575993, + "balance_loss_mlp": 1.02277446, + "epoch": 0.7233428528483391, + "flos": 20192642317440.0, + "grad_norm": 1.6655467134622794, + "language_loss": 0.77807963, + "learning_rate": 7.504556326345859e-07, + "loss": 0.79939759, + "num_input_tokens_seen": 259587035, + "step": 12031, + "time_per_iteration": 2.6133508682250977 + }, + { + "auxiliary_loss_clip": 0.01102197, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.0374577, + "balance_loss_mlp": 1.01696777, + "epoch": 0.723402976101007, + "flos": 23949472769280.0, + "grad_norm": 1.9738785195921462, + "language_loss": 0.81575108, + "learning_rate": 7.501515618840834e-07, + "loss": 0.83707643, + "num_input_tokens_seen": 259606140, + "step": 12032, + "time_per_iteration": 2.7112133502960205 + }, + { + "auxiliary_loss_clip": 0.01075376, + "auxiliary_loss_mlp": 0.01037925, + "balance_loss_clip": 1.03567076, + "balance_loss_mlp": 1.02435255, + "epoch": 0.723463099353675, + "flos": 20813394182400.0, + "grad_norm": 1.776312475495692, + "language_loss": 0.75339031, + "learning_rate": 7.498475385279592e-07, + "loss": 0.77452338, + "num_input_tokens_seen": 259624275, + "step": 12033, + "time_per_iteration": 2.718799114227295 + }, + { + "auxiliary_loss_clip": 0.01077923, + "auxiliary_loss_mlp": 0.01029541, + "balance_loss_clip": 1.03677177, + "balance_loss_mlp": 1.01704192, + "epoch": 0.723523222606343, + "flos": 19098013299840.0, + "grad_norm": 1.7129862588080287, + "language_loss": 0.75157291, + "learning_rate": 7.495435625777423e-07, + "loss": 0.7726475, + "num_input_tokens_seen": 259643465, + "step": 12034, + "time_per_iteration": 2.6831793785095215 + }, + { + "auxiliary_loss_clip": 0.01089243, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.03759241, + "balance_loss_mlp": 1.01996899, + "epoch": 0.723583345859011, + "flos": 26506994993280.0, + "grad_norm": 1.842898991016843, + "language_loss": 0.80809641, + "learning_rate": 7.492396340449578e-07, + "loss": 0.82931113, + "num_input_tokens_seen": 259662500, + "step": 12035, + "time_per_iteration": 2.695371627807617 + }, + { + "auxiliary_loss_clip": 0.01050925, + "auxiliary_loss_mlp": 0.01037786, + "balance_loss_clip": 1.03530586, + "balance_loss_mlp": 1.0243392, + "epoch": 0.723643469111679, + "flos": 16033863697920.0, + "grad_norm": 2.241481195422046, + "language_loss": 0.61241198, + "learning_rate": 7.489357529411326e-07, + "loss": 0.63329911, + "num_input_tokens_seen": 259680140, + "step": 12036, + "time_per_iteration": 2.809441566467285 + }, + { + "auxiliary_loss_clip": 0.01095223, + "auxiliary_loss_mlp": 0.01037212, + "balance_loss_clip": 1.03603697, + "balance_loss_mlp": 1.02554715, + "epoch": 0.7237035923643469, + "flos": 21945549934080.0, + "grad_norm": 1.6262385259954, + "language_loss": 0.67594683, + "learning_rate": 7.486319192777883e-07, + "loss": 0.69727111, + "num_input_tokens_seen": 259700160, + "step": 12037, + "time_per_iteration": 2.7354328632354736 + }, + { + "auxiliary_loss_clip": 0.0111287, + "auxiliary_loss_mlp": 0.01037592, + "balance_loss_clip": 1.03997326, + "balance_loss_mlp": 1.02422309, + "epoch": 0.7237637156170149, + "flos": 23583112001280.0, + "grad_norm": 2.066772048559837, + "language_loss": 0.72353923, + "learning_rate": 7.483281330664479e-07, + "loss": 0.74504387, + "num_input_tokens_seen": 259720525, + "step": 12038, + "time_per_iteration": 2.704622983932495 + }, + { + "auxiliary_loss_clip": 0.01111581, + "auxiliary_loss_mlp": 0.01034396, + "balance_loss_clip": 1.0390476, + "balance_loss_mlp": 1.02059746, + "epoch": 0.7238238388696828, + "flos": 20594698225920.0, + "grad_norm": 1.734011651040034, + "language_loss": 0.72293609, + "learning_rate": 7.480243943186293e-07, + "loss": 0.74439585, + "num_input_tokens_seen": 259738680, + "step": 12039, + "time_per_iteration": 2.6200029850006104 + }, + { + "auxiliary_loss_clip": 0.01112988, + "auxiliary_loss_mlp": 0.01033651, + "balance_loss_clip": 1.03924608, + "balance_loss_mlp": 1.02135432, + "epoch": 0.7238839621223508, + "flos": 24207024263040.0, + "grad_norm": 1.7505923285041294, + "language_loss": 0.76183081, + "learning_rate": 7.477207030458513e-07, + "loss": 0.78329718, + "num_input_tokens_seen": 259758790, + "step": 12040, + "time_per_iteration": 2.560269832611084 + }, + { + "auxiliary_loss_clip": 0.01079576, + "auxiliary_loss_mlp": 0.01035383, + "balance_loss_clip": 1.03573811, + "balance_loss_mlp": 1.0221684, + "epoch": 0.7239440853750188, + "flos": 14209745368320.0, + "grad_norm": 2.0682435916617075, + "language_loss": 0.7625649, + "learning_rate": 7.474170592596301e-07, + "loss": 0.78371453, + "num_input_tokens_seen": 259777370, + "step": 12041, + "time_per_iteration": 2.714940309524536 + }, + { + "auxiliary_loss_clip": 0.01102621, + "auxiliary_loss_mlp": 0.01029545, + "balance_loss_clip": 1.0374378, + "balance_loss_mlp": 1.01699817, + "epoch": 0.7240042086276868, + "flos": 21614812479360.0, + "grad_norm": 2.6117170122590636, + "language_loss": 0.63805127, + "learning_rate": 7.471134629714797e-07, + "loss": 0.65937293, + "num_input_tokens_seen": 259794665, + "step": 12042, + "time_per_iteration": 2.6314237117767334 + }, + { + "auxiliary_loss_clip": 0.01075777, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.03741169, + "balance_loss_mlp": 1.02075338, + "epoch": 0.7240643318803547, + "flos": 23331450337920.0, + "grad_norm": 1.8128616053031077, + "language_loss": 0.83376384, + "learning_rate": 7.468099141929116e-07, + "loss": 0.85486257, + "num_input_tokens_seen": 259811110, + "step": 12043, + "time_per_iteration": 2.676255226135254 + }, + { + "auxiliary_loss_clip": 0.01079486, + "auxiliary_loss_mlp": 0.01030427, + "balance_loss_clip": 1.03760707, + "balance_loss_mlp": 1.01697433, + "epoch": 0.7241244551330227, + "flos": 24024849459840.0, + "grad_norm": 1.7443833351104767, + "language_loss": 0.64167023, + "learning_rate": 7.465064129354379e-07, + "loss": 0.66276932, + "num_input_tokens_seen": 259831080, + "step": 12044, + "time_per_iteration": 2.7761828899383545 + }, + { + "auxiliary_loss_clip": 0.0111317, + "auxiliary_loss_mlp": 0.01032715, + "balance_loss_clip": 1.04010856, + "balance_loss_mlp": 1.01904798, + "epoch": 0.7241845783856906, + "flos": 18730323728640.0, + "grad_norm": 1.9383242043113957, + "language_loss": 0.81468868, + "learning_rate": 7.462029592105658e-07, + "loss": 0.83614755, + "num_input_tokens_seen": 259850135, + "step": 12045, + "time_per_iteration": 2.5996835231781006 + }, + { + "auxiliary_loss_clip": 0.01108154, + "auxiliary_loss_mlp": 0.01032292, + "balance_loss_clip": 1.03746927, + "balance_loss_mlp": 1.01956022, + "epoch": 0.7242447016383586, + "flos": 19498668577920.0, + "grad_norm": 1.5954644621567537, + "language_loss": 0.71763444, + "learning_rate": 7.458995530298034e-07, + "loss": 0.73903888, + "num_input_tokens_seen": 259868185, + "step": 12046, + "time_per_iteration": 2.5615580081939697 + }, + { + "auxiliary_loss_clip": 0.01075175, + "auxiliary_loss_mlp": 0.01033472, + "balance_loss_clip": 1.03313971, + "balance_loss_mlp": 1.01897645, + "epoch": 0.7243048248910267, + "flos": 22163491704960.0, + "grad_norm": 2.0910154490498125, + "language_loss": 0.71177173, + "learning_rate": 7.455961944046553e-07, + "loss": 0.73285818, + "num_input_tokens_seen": 259887055, + "step": 12047, + "time_per_iteration": 2.700878381729126 + }, + { + "auxiliary_loss_clip": 0.01086391, + "auxiliary_loss_mlp": 0.01041787, + "balance_loss_clip": 1.03794575, + "balance_loss_mlp": 1.02800667, + "epoch": 0.7243649481436946, + "flos": 27672762896640.0, + "grad_norm": 1.5839782384796177, + "language_loss": 0.70204568, + "learning_rate": 7.45292883346627e-07, + "loss": 0.72332752, + "num_input_tokens_seen": 259908295, + "step": 12048, + "time_per_iteration": 2.690060615539551 + }, + { + "auxiliary_loss_clip": 0.01011684, + "auxiliary_loss_mlp": 0.01004259, + "balance_loss_clip": 1.00705278, + "balance_loss_mlp": 1.0028162, + "epoch": 0.7244250713963626, + "flos": 63244545759360.0, + "grad_norm": 0.8298796504425336, + "language_loss": 0.53679693, + "learning_rate": 7.449896198672168e-07, + "loss": 0.55695641, + "num_input_tokens_seen": 259968475, + "step": 12049, + "time_per_iteration": 3.2119057178497314 + }, + { + "auxiliary_loss_clip": 0.01088982, + "auxiliary_loss_mlp": 0.01032865, + "balance_loss_clip": 1.03676033, + "balance_loss_mlp": 1.01766598, + "epoch": 0.7244851946490305, + "flos": 17967114524160.0, + "grad_norm": 2.0687483221381897, + "language_loss": 0.59396434, + "learning_rate": 7.446864039779258e-07, + "loss": 0.61518282, + "num_input_tokens_seen": 259984865, + "step": 12050, + "time_per_iteration": 2.632354736328125 + }, + { + "auxiliary_loss_clip": 0.0099629, + "auxiliary_loss_mlp": 0.01011839, + "balance_loss_clip": 1.0111258, + "balance_loss_mlp": 1.01062906, + "epoch": 0.7245453179016985, + "flos": 70943649603840.0, + "grad_norm": 0.7230865999860119, + "language_loss": 0.53218287, + "learning_rate": 7.443832356902528e-07, + "loss": 0.55226415, + "num_input_tokens_seen": 260046735, + "step": 12051, + "time_per_iteration": 3.2180604934692383 + }, + { + "auxiliary_loss_clip": 0.01097618, + "auxiliary_loss_mlp": 0.01032159, + "balance_loss_clip": 1.03679287, + "balance_loss_mlp": 1.02010143, + "epoch": 0.7246054411543664, + "flos": 24568464867840.0, + "grad_norm": 1.7070120237628115, + "language_loss": 0.72170782, + "learning_rate": 7.440801150156927e-07, + "loss": 0.74300563, + "num_input_tokens_seen": 260067950, + "step": 12052, + "time_per_iteration": 2.6380202770233154 + }, + { + "auxiliary_loss_clip": 0.01099407, + "auxiliary_loss_mlp": 0.01034919, + "balance_loss_clip": 1.03736925, + "balance_loss_mlp": 1.01992285, + "epoch": 0.7246655644070344, + "flos": 32338312548480.0, + "grad_norm": 1.8571757187229716, + "language_loss": 0.74080825, + "learning_rate": 7.437770419657415e-07, + "loss": 0.76215148, + "num_input_tokens_seen": 260087730, + "step": 12053, + "time_per_iteration": 2.691523790359497 + }, + { + "auxiliary_loss_clip": 0.01072566, + "auxiliary_loss_mlp": 0.0103532, + "balance_loss_clip": 1.03622317, + "balance_loss_mlp": 1.02119958, + "epoch": 0.7247256876597024, + "flos": 21872471713920.0, + "grad_norm": 1.7294141781477532, + "language_loss": 0.78110063, + "learning_rate": 7.434740165518898e-07, + "loss": 0.80217946, + "num_input_tokens_seen": 260107760, + "step": 12054, + "time_per_iteration": 2.658952236175537 + }, + { + "auxiliary_loss_clip": 0.01077648, + "auxiliary_loss_mlp": 0.01035486, + "balance_loss_clip": 1.03661764, + "balance_loss_mlp": 1.02215791, + "epoch": 0.7247858109123704, + "flos": 16213093585920.0, + "grad_norm": 2.4200013582642437, + "language_loss": 0.67830694, + "learning_rate": 7.431710387856301e-07, + "loss": 0.69943827, + "num_input_tokens_seen": 260123660, + "step": 12055, + "time_per_iteration": 2.646244525909424 + }, + { + "auxiliary_loss_clip": 0.01080369, + "auxiliary_loss_mlp": 0.01036731, + "balance_loss_clip": 1.03789568, + "balance_loss_mlp": 1.02451193, + "epoch": 0.7248459341650383, + "flos": 20850705434880.0, + "grad_norm": 1.6702264045613682, + "language_loss": 0.74097568, + "learning_rate": 7.428681086784496e-07, + "loss": 0.76214665, + "num_input_tokens_seen": 260142690, + "step": 12056, + "time_per_iteration": 2.7628982067108154 + }, + { + "auxiliary_loss_clip": 0.01108663, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.03835511, + "balance_loss_mlp": 1.01454699, + "epoch": 0.7249060574177063, + "flos": 25921794614400.0, + "grad_norm": 1.66863868022831, + "language_loss": 0.70870286, + "learning_rate": 7.425652262418368e-07, + "loss": 0.73006552, + "num_input_tokens_seen": 260162590, + "step": 12057, + "time_per_iteration": 2.71063232421875 + }, + { + "auxiliary_loss_clip": 0.01058179, + "auxiliary_loss_mlp": 0.01044744, + "balance_loss_clip": 1.03556621, + "balance_loss_mlp": 1.03009939, + "epoch": 0.7249661806703742, + "flos": 17345536646400.0, + "grad_norm": 1.8439836916669041, + "language_loss": 0.6237672, + "learning_rate": 7.42262391487277e-07, + "loss": 0.64479643, + "num_input_tokens_seen": 260181065, + "step": 12058, + "time_per_iteration": 2.8430051803588867 + }, + { + "auxiliary_loss_clip": 0.01070122, + "auxiliary_loss_mlp": 0.01031873, + "balance_loss_clip": 1.03506172, + "balance_loss_mlp": 1.01852131, + "epoch": 0.7250263039230422, + "flos": 19574153009280.0, + "grad_norm": 1.8897334856820058, + "language_loss": 0.74905157, + "learning_rate": 7.419596044262535e-07, + "loss": 0.77007163, + "num_input_tokens_seen": 260200330, + "step": 12059, + "time_per_iteration": 2.832826614379883 + }, + { + "auxiliary_loss_clip": 0.01098356, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.03746486, + "balance_loss_mlp": 1.02145672, + "epoch": 0.7250864271757103, + "flos": 21976648133760.0, + "grad_norm": 1.8419617438371911, + "language_loss": 0.79300022, + "learning_rate": 7.416568650702472e-07, + "loss": 0.81431836, + "num_input_tokens_seen": 260219975, + "step": 12060, + "time_per_iteration": 4.281320095062256 + }, + { + "auxiliary_loss_clip": 0.01100606, + "auxiliary_loss_mlp": 0.01026628, + "balance_loss_clip": 1.03860307, + "balance_loss_mlp": 1.01334846, + "epoch": 0.7251465504283782, + "flos": 25012608537600.0, + "grad_norm": 1.7927785216248016, + "language_loss": 0.76260906, + "learning_rate": 7.413541734307393e-07, + "loss": 0.78388143, + "num_input_tokens_seen": 260242025, + "step": 12061, + "time_per_iteration": 2.748656749725342 + }, + { + "auxiliary_loss_clip": 0.01108857, + "auxiliary_loss_mlp": 0.00769754, + "balance_loss_clip": 1.03873777, + "balance_loss_mlp": 1.00011206, + "epoch": 0.7252066736810462, + "flos": 16690131135360.0, + "grad_norm": 1.7879167066361221, + "language_loss": 0.81589133, + "learning_rate": 7.410515295192068e-07, + "loss": 0.83467746, + "num_input_tokens_seen": 260260015, + "step": 12062, + "time_per_iteration": 4.1720802783966064 + }, + { + "auxiliary_loss_clip": 0.0106197, + "auxiliary_loss_mlp": 0.01035204, + "balance_loss_clip": 1.03478372, + "balance_loss_mlp": 1.0198977, + "epoch": 0.7252667969337141, + "flos": 25703026830720.0, + "grad_norm": 2.017910234455411, + "language_loss": 0.69402146, + "learning_rate": 7.407489333471262e-07, + "loss": 0.71499324, + "num_input_tokens_seen": 260278635, + "step": 12063, + "time_per_iteration": 4.450777769088745 + }, + { + "auxiliary_loss_clip": 0.01076449, + "auxiliary_loss_mlp": 0.01034693, + "balance_loss_clip": 1.03741121, + "balance_loss_mlp": 1.02178848, + "epoch": 0.7253269201863821, + "flos": 18259930195200.0, + "grad_norm": 1.4900878050946833, + "language_loss": 0.69918656, + "learning_rate": 7.40446384925973e-07, + "loss": 0.72029793, + "num_input_tokens_seen": 260298510, + "step": 12064, + "time_per_iteration": 2.7114603519439697 + }, + { + "auxiliary_loss_clip": 0.01091634, + "auxiliary_loss_mlp": 0.01035443, + "balance_loss_clip": 1.03896451, + "balance_loss_mlp": 1.02210331, + "epoch": 0.72538704343905, + "flos": 20411805150720.0, + "grad_norm": 1.7705588276559046, + "language_loss": 0.90465009, + "learning_rate": 7.401438842672192e-07, + "loss": 0.92592084, + "num_input_tokens_seen": 260317405, + "step": 12065, + "time_per_iteration": 2.723996877670288 + }, + { + "auxiliary_loss_clip": 0.01020643, + "auxiliary_loss_mlp": 0.01001515, + "balance_loss_clip": 1.00699556, + "balance_loss_mlp": 1.00026369, + "epoch": 0.725447166691718, + "flos": 70151209706880.0, + "grad_norm": 0.6554583314348987, + "language_loss": 0.56083691, + "learning_rate": 7.398414313823349e-07, + "loss": 0.58105844, + "num_input_tokens_seen": 260388085, + "step": 12066, + "time_per_iteration": 3.332350254058838 + }, + { + "auxiliary_loss_clip": 0.01062291, + "auxiliary_loss_mlp": 0.01030469, + "balance_loss_clip": 1.03549218, + "balance_loss_mlp": 1.01799369, + "epoch": 0.725507289944386, + "flos": 27052334254080.0, + "grad_norm": 1.7495752784177439, + "language_loss": 0.76740146, + "learning_rate": 7.395390262827897e-07, + "loss": 0.78832901, + "num_input_tokens_seen": 260406165, + "step": 12067, + "time_per_iteration": 2.815978765487671 + }, + { + "auxiliary_loss_clip": 0.0101369, + "auxiliary_loss_mlp": 0.01006237, + "balance_loss_clip": 1.01036, + "balance_loss_mlp": 1.0050863, + "epoch": 0.725567413197054, + "flos": 62921924778240.0, + "grad_norm": 0.722755917983848, + "language_loss": 0.56971467, + "learning_rate": 7.392366689800515e-07, + "loss": 0.58991396, + "num_input_tokens_seen": 260461365, + "step": 12068, + "time_per_iteration": 4.744567394256592 + }, + { + "auxiliary_loss_clip": 0.0099354, + "auxiliary_loss_mlp": 0.01007822, + "balance_loss_clip": 1.00846553, + "balance_loss_mlp": 1.00654685, + "epoch": 0.7256275364497219, + "flos": 60295957188480.0, + "grad_norm": 0.663737486882956, + "language_loss": 0.55370045, + "learning_rate": 7.389343594855848e-07, + "loss": 0.57371408, + "num_input_tokens_seen": 260523795, + "step": 12069, + "time_per_iteration": 3.275995969772339 + }, + { + "auxiliary_loss_clip": 0.01077438, + "auxiliary_loss_mlp": 0.01027102, + "balance_loss_clip": 1.03855562, + "balance_loss_mlp": 1.01507938, + "epoch": 0.7256876597023899, + "flos": 24498511130880.0, + "grad_norm": 1.6562852272905184, + "language_loss": 0.79984176, + "learning_rate": 7.38632097810854e-07, + "loss": 0.82088709, + "num_input_tokens_seen": 260544765, + "step": 12070, + "time_per_iteration": 2.806398391723633 + }, + { + "auxiliary_loss_clip": 0.01083416, + "auxiliary_loss_mlp": 0.01036302, + "balance_loss_clip": 1.03607607, + "balance_loss_mlp": 1.02395165, + "epoch": 0.7257477829550578, + "flos": 24352749740160.0, + "grad_norm": 1.8683198427961691, + "language_loss": 0.71817708, + "learning_rate": 7.383298839673197e-07, + "loss": 0.73937428, + "num_input_tokens_seen": 260564340, + "step": 12071, + "time_per_iteration": 2.7380881309509277 + }, + { + "auxiliary_loss_clip": 0.01108781, + "auxiliary_loss_mlp": 0.01039283, + "balance_loss_clip": 1.03857553, + "balance_loss_mlp": 1.02693939, + "epoch": 0.7258079062077258, + "flos": 17202217380480.0, + "grad_norm": 2.1132155235444183, + "language_loss": 0.70214903, + "learning_rate": 7.380277179664436e-07, + "loss": 0.72362965, + "num_input_tokens_seen": 260582565, + "step": 12072, + "time_per_iteration": 2.639300584793091 + }, + { + "auxiliary_loss_clip": 0.01075383, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.03398466, + "balance_loss_mlp": 1.01966858, + "epoch": 0.7258680294603939, + "flos": 21580338401280.0, + "grad_norm": 1.7211132025466964, + "language_loss": 0.78522944, + "learning_rate": 7.377255998196821e-07, + "loss": 0.80631953, + "num_input_tokens_seen": 260601700, + "step": 12073, + "time_per_iteration": 2.707505226135254 + }, + { + "auxiliary_loss_clip": 0.01089188, + "auxiliary_loss_mlp": 0.01031416, + "balance_loss_clip": 1.03761029, + "balance_loss_mlp": 1.0188278, + "epoch": 0.7259281527130618, + "flos": 34855399036800.0, + "grad_norm": 1.5601813308837964, + "language_loss": 0.70586532, + "learning_rate": 7.374235295384923e-07, + "loss": 0.72707134, + "num_input_tokens_seen": 260623040, + "step": 12074, + "time_per_iteration": 2.7605321407318115 + }, + { + "auxiliary_loss_clip": 0.01089374, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.03541577, + "balance_loss_mlp": 1.01786137, + "epoch": 0.7259882759657298, + "flos": 25404644551680.0, + "grad_norm": 1.7787306902519031, + "language_loss": 0.74126077, + "learning_rate": 7.371215071343302e-07, + "loss": 0.76246876, + "num_input_tokens_seen": 260642735, + "step": 12075, + "time_per_iteration": 2.809924840927124 + }, + { + "auxiliary_loss_clip": 0.01102235, + "auxiliary_loss_mlp": 0.01037145, + "balance_loss_clip": 1.03854585, + "balance_loss_mlp": 1.02345967, + "epoch": 0.7260483992183977, + "flos": 62953630531200.0, + "grad_norm": 2.761502875821282, + "language_loss": 0.63991046, + "learning_rate": 7.368195326186458e-07, + "loss": 0.6613043, + "num_input_tokens_seen": 260669935, + "step": 12076, + "time_per_iteration": 3.073396921157837 + }, + { + "auxiliary_loss_clip": 0.01073377, + "auxiliary_loss_mlp": 0.01030745, + "balance_loss_clip": 1.03426909, + "balance_loss_mlp": 1.01711977, + "epoch": 0.7261085224710657, + "flos": 26467528924800.0, + "grad_norm": 1.967529180708395, + "language_loss": 0.78661555, + "learning_rate": 7.365176060028912e-07, + "loss": 0.80765676, + "num_input_tokens_seen": 260689605, + "step": 12077, + "time_per_iteration": 2.748734712600708 + }, + { + "auxiliary_loss_clip": 0.01030217, + "auxiliary_loss_mlp": 0.00751512, + "balance_loss_clip": 1.00731969, + "balance_loss_mlp": 0.99968779, + "epoch": 0.7261686457237336, + "flos": 66772732187520.0, + "grad_norm": 0.8834354289567558, + "language_loss": 0.64973843, + "learning_rate": 7.362157272985163e-07, + "loss": 0.66755569, + "num_input_tokens_seen": 260748265, + "step": 12078, + "time_per_iteration": 3.1502130031585693 + }, + { + "auxiliary_loss_clip": 0.01023011, + "auxiliary_loss_mlp": 0.01002876, + "balance_loss_clip": 1.00983262, + "balance_loss_mlp": 1.00162983, + "epoch": 0.7262287689764017, + "flos": 69999594399360.0, + "grad_norm": 0.7148369654201937, + "language_loss": 0.59227604, + "learning_rate": 7.359138965169671e-07, + "loss": 0.61253494, + "num_input_tokens_seen": 260816715, + "step": 12079, + "time_per_iteration": 3.2680857181549072 + }, + { + "auxiliary_loss_clip": 0.01064199, + "auxiliary_loss_mlp": 0.01033019, + "balance_loss_clip": 1.03485882, + "balance_loss_mlp": 1.01984644, + "epoch": 0.7262888922290696, + "flos": 23805435231360.0, + "grad_norm": 2.2028126662157383, + "language_loss": 0.64762789, + "learning_rate": 7.356121136696895e-07, + "loss": 0.66860008, + "num_input_tokens_seen": 260836765, + "step": 12080, + "time_per_iteration": 2.718738317489624 + }, + { + "auxiliary_loss_clip": 0.01064639, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.03282523, + "balance_loss_mlp": 1.01555538, + "epoch": 0.7263490154817376, + "flos": 19500320603520.0, + "grad_norm": 2.4686396191281235, + "language_loss": 0.69309068, + "learning_rate": 7.35310378768128e-07, + "loss": 0.71402919, + "num_input_tokens_seen": 260854610, + "step": 12081, + "time_per_iteration": 2.869288444519043 + }, + { + "auxiliary_loss_clip": 0.01114886, + "auxiliary_loss_mlp": 0.01031024, + "balance_loss_clip": 1.04031432, + "balance_loss_mlp": 1.01794684, + "epoch": 0.7264091387344055, + "flos": 16286243633280.0, + "grad_norm": 1.842145300936274, + "language_loss": 0.81440926, + "learning_rate": 7.350086918237237e-07, + "loss": 0.83586842, + "num_input_tokens_seen": 260871620, + "step": 12082, + "time_per_iteration": 2.558000087738037 + }, + { + "auxiliary_loss_clip": 0.01104122, + "auxiliary_loss_mlp": 0.01037212, + "balance_loss_clip": 1.0367763, + "balance_loss_mlp": 1.02259684, + "epoch": 0.7264692619870735, + "flos": 24352031468160.0, + "grad_norm": 1.7329186007952004, + "language_loss": 0.77324694, + "learning_rate": 7.347070528479158e-07, + "loss": 0.79466033, + "num_input_tokens_seen": 260890490, + "step": 12083, + "time_per_iteration": 2.707674741744995 + }, + { + "auxiliary_loss_clip": 0.01114141, + "auxiliary_loss_mlp": 0.01032066, + "balance_loss_clip": 1.04018736, + "balance_loss_mlp": 1.01889968, + "epoch": 0.7265293852397414, + "flos": 25119478477440.0, + "grad_norm": 1.8409046940193436, + "language_loss": 0.73034543, + "learning_rate": 7.344054618521433e-07, + "loss": 0.75180745, + "num_input_tokens_seen": 260909700, + "step": 12084, + "time_per_iteration": 2.656688928604126 + }, + { + "auxiliary_loss_clip": 0.01114376, + "auxiliary_loss_mlp": 0.01036848, + "balance_loss_clip": 1.03960419, + "balance_loss_mlp": 1.02362156, + "epoch": 0.7265895084924094, + "flos": 22638230784000.0, + "grad_norm": 3.047460171891373, + "language_loss": 0.7778368, + "learning_rate": 7.34103918847843e-07, + "loss": 0.79934901, + "num_input_tokens_seen": 260929090, + "step": 12085, + "time_per_iteration": 2.645911693572998 + }, + { + "auxiliary_loss_clip": 0.01099641, + "auxiliary_loss_mlp": 0.01034439, + "balance_loss_clip": 1.03661323, + "balance_loss_mlp": 1.02154636, + "epoch": 0.7266496317450775, + "flos": 23368222886400.0, + "grad_norm": 1.5977221637963412, + "language_loss": 0.72068805, + "learning_rate": 7.338024238464493e-07, + "loss": 0.74202883, + "num_input_tokens_seen": 260946615, + "step": 12086, + "time_per_iteration": 2.6855533123016357 + }, + { + "auxiliary_loss_clip": 0.01073096, + "auxiliary_loss_mlp": 0.01041748, + "balance_loss_clip": 1.03401077, + "balance_loss_mlp": 1.02729964, + "epoch": 0.7267097549977454, + "flos": 28074603323520.0, + "grad_norm": 1.6297510133590103, + "language_loss": 0.6963405, + "learning_rate": 7.335009768593938e-07, + "loss": 0.71748894, + "num_input_tokens_seen": 260968515, + "step": 12087, + "time_per_iteration": 2.8121585845947266 + }, + { + "auxiliary_loss_clip": 0.01115392, + "auxiliary_loss_mlp": 0.01035484, + "balance_loss_clip": 1.04074097, + "balance_loss_mlp": 1.02153099, + "epoch": 0.7267698782504134, + "flos": 22195523658240.0, + "grad_norm": 5.160414648565969, + "language_loss": 0.79164052, + "learning_rate": 7.331995778981088e-07, + "loss": 0.81314927, + "num_input_tokens_seen": 260986790, + "step": 12088, + "time_per_iteration": 2.563143491744995 + }, + { + "auxiliary_loss_clip": 0.01097059, + "auxiliary_loss_mlp": 0.01037688, + "balance_loss_clip": 1.03751171, + "balance_loss_mlp": 1.02490282, + "epoch": 0.7268300015030813, + "flos": 18514859996160.0, + "grad_norm": 1.723527946831352, + "language_loss": 0.73941064, + "learning_rate": 7.328982269740221e-07, + "loss": 0.76075816, + "num_input_tokens_seen": 261004925, + "step": 12089, + "time_per_iteration": 2.6264712810516357 + }, + { + "auxiliary_loss_clip": 0.01088906, + "auxiliary_loss_mlp": 0.01035753, + "balance_loss_clip": 1.03559196, + "balance_loss_mlp": 1.02308106, + "epoch": 0.7268901247557493, + "flos": 23986029836160.0, + "grad_norm": 1.6147699540484286, + "language_loss": 0.70883548, + "learning_rate": 7.325969240985616e-07, + "loss": 0.73008209, + "num_input_tokens_seen": 261023895, + "step": 12090, + "time_per_iteration": 2.674154281616211 + }, + { + "auxiliary_loss_clip": 0.01057949, + "auxiliary_loss_mlp": 0.01033382, + "balance_loss_clip": 1.03447902, + "balance_loss_mlp": 1.01989388, + "epoch": 0.7269502480084172, + "flos": 32088087429120.0, + "grad_norm": 1.7900263733785062, + "language_loss": 0.7724641, + "learning_rate": 7.322956692831528e-07, + "loss": 0.7933774, + "num_input_tokens_seen": 261045445, + "step": 12091, + "time_per_iteration": 2.837162494659424 + }, + { + "auxiliary_loss_clip": 0.0109404, + "auxiliary_loss_mlp": 0.00771553, + "balance_loss_clip": 1.03523159, + "balance_loss_mlp": 1.00019574, + "epoch": 0.7270103712610853, + "flos": 19062785036160.0, + "grad_norm": 2.0691872442271415, + "language_loss": 0.71682477, + "learning_rate": 7.319944625392205e-07, + "loss": 0.73548067, + "num_input_tokens_seen": 261064275, + "step": 12092, + "time_per_iteration": 2.6305599212646484 + }, + { + "auxiliary_loss_clip": 0.01101746, + "auxiliary_loss_mlp": 0.01033427, + "balance_loss_clip": 1.03929043, + "balance_loss_mlp": 1.02035582, + "epoch": 0.7270704945137532, + "flos": 34532921710080.0, + "grad_norm": 2.2398684774576156, + "language_loss": 0.61100423, + "learning_rate": 7.31693303878184e-07, + "loss": 0.63235605, + "num_input_tokens_seen": 261083310, + "step": 12093, + "time_per_iteration": 2.750157117843628 + }, + { + "auxiliary_loss_clip": 0.01090608, + "auxiliary_loss_mlp": 0.01037448, + "balance_loss_clip": 1.03955996, + "balance_loss_mlp": 1.02412009, + "epoch": 0.7271306177664212, + "flos": 21507583403520.0, + "grad_norm": 1.6663796185948798, + "language_loss": 0.75200593, + "learning_rate": 7.313921933114644e-07, + "loss": 0.77328646, + "num_input_tokens_seen": 261103460, + "step": 12094, + "time_per_iteration": 2.63088059425354 + }, + { + "auxiliary_loss_clip": 0.01076646, + "auxiliary_loss_mlp": 0.01031624, + "balance_loss_clip": 1.03417659, + "balance_loss_mlp": 1.01941681, + "epoch": 0.7271907410190891, + "flos": 22272444633600.0, + "grad_norm": 1.8443350683921131, + "language_loss": 0.84625936, + "learning_rate": 7.310911308504808e-07, + "loss": 0.867342, + "num_input_tokens_seen": 261121375, + "step": 12095, + "time_per_iteration": 2.7300918102264404 + }, + { + "auxiliary_loss_clip": 0.010978, + "auxiliary_loss_mlp": 0.01037417, + "balance_loss_clip": 1.03561294, + "balance_loss_mlp": 1.02383316, + "epoch": 0.7272508642717571, + "flos": 22893124671360.0, + "grad_norm": 2.27024179817087, + "language_loss": 0.77610254, + "learning_rate": 7.307901165066479e-07, + "loss": 0.79745466, + "num_input_tokens_seen": 261141105, + "step": 12096, + "time_per_iteration": 2.754016399383545 + }, + { + "auxiliary_loss_clip": 0.01113914, + "auxiliary_loss_mlp": 0.01037152, + "balance_loss_clip": 1.04082382, + "balance_loss_mlp": 1.02434897, + "epoch": 0.727310987524425, + "flos": 11655886331520.0, + "grad_norm": 1.96001611615308, + "language_loss": 0.72508037, + "learning_rate": 7.30489150291381e-07, + "loss": 0.74659109, + "num_input_tokens_seen": 261159255, + "step": 12097, + "time_per_iteration": 2.57547664642334 + }, + { + "auxiliary_loss_clip": 0.01101296, + "auxiliary_loss_mlp": 0.00771623, + "balance_loss_clip": 1.03833079, + "balance_loss_mlp": 1.00024211, + "epoch": 0.727371110777093, + "flos": 24535319592960.0, + "grad_norm": 1.744636039852928, + "language_loss": 0.77178752, + "learning_rate": 7.301882322160935e-07, + "loss": 0.79051673, + "num_input_tokens_seen": 261177960, + "step": 12098, + "time_per_iteration": 2.697739601135254 + }, + { + "auxiliary_loss_clip": 0.01090376, + "auxiliary_loss_mlp": 0.01033826, + "balance_loss_clip": 1.03530288, + "balance_loss_mlp": 1.02023625, + "epoch": 0.7274312340297611, + "flos": 74739835405440.0, + "grad_norm": 1.6470814614885703, + "language_loss": 0.67452812, + "learning_rate": 7.298873622921952e-07, + "loss": 0.69577014, + "num_input_tokens_seen": 261205660, + "step": 12099, + "time_per_iteration": 4.734724283218384 + }, + { + "auxiliary_loss_clip": 0.01100384, + "auxiliary_loss_mlp": 0.01040178, + "balance_loss_clip": 1.0354315, + "balance_loss_mlp": 1.02401924, + "epoch": 0.727491357282429, + "flos": 22342865247360.0, + "grad_norm": 1.6470477467852347, + "language_loss": 0.72511584, + "learning_rate": 7.29586540531095e-07, + "loss": 0.74652147, + "num_input_tokens_seen": 261225185, + "step": 12100, + "time_per_iteration": 2.6307733058929443 + }, + { + "auxiliary_loss_clip": 0.01101803, + "auxiliary_loss_mlp": 0.01038394, + "balance_loss_clip": 1.03856468, + "balance_loss_mlp": 1.02577031, + "epoch": 0.727551480535097, + "flos": 23297550877440.0, + "grad_norm": 1.4604095726641635, + "language_loss": 0.74780536, + "learning_rate": 7.292857669442005e-07, + "loss": 0.76920736, + "num_input_tokens_seen": 261247965, + "step": 12101, + "time_per_iteration": 2.6731035709381104 + }, + { + "auxiliary_loss_clip": 0.01070063, + "auxiliary_loss_mlp": 0.01029767, + "balance_loss_clip": 1.03622627, + "balance_loss_mlp": 1.01775718, + "epoch": 0.7276116037877649, + "flos": 21470559459840.0, + "grad_norm": 1.7882931756264577, + "language_loss": 0.82550085, + "learning_rate": 7.289850415429177e-07, + "loss": 0.8464992, + "num_input_tokens_seen": 261267585, + "step": 12102, + "time_per_iteration": 5.8568243980407715 + }, + { + "auxiliary_loss_clip": 0.01100092, + "auxiliary_loss_mlp": 0.01035417, + "balance_loss_clip": 1.03823566, + "balance_loss_mlp": 1.02270937, + "epoch": 0.7276717270404329, + "flos": 21464059098240.0, + "grad_norm": 2.5021196197746396, + "language_loss": 0.81821334, + "learning_rate": 7.286843643386495e-07, + "loss": 0.83956838, + "num_input_tokens_seen": 261285200, + "step": 12103, + "time_per_iteration": 2.619070291519165 + }, + { + "auxiliary_loss_clip": 0.0109026, + "auxiliary_loss_mlp": 0.01027412, + "balance_loss_clip": 1.03774977, + "balance_loss_mlp": 1.01372027, + "epoch": 0.7277318502931008, + "flos": 16837221329280.0, + "grad_norm": 1.6323298348226507, + "language_loss": 0.66439486, + "learning_rate": 7.283837353427968e-07, + "loss": 0.68557155, + "num_input_tokens_seen": 261303645, + "step": 12104, + "time_per_iteration": 2.7373523712158203 + }, + { + "auxiliary_loss_clip": 0.01079506, + "auxiliary_loss_mlp": 0.01033638, + "balance_loss_clip": 1.03706837, + "balance_loss_mlp": 1.02034616, + "epoch": 0.7277919735457689, + "flos": 33400550476800.0, + "grad_norm": 3.4364169718839324, + "language_loss": 0.66114849, + "learning_rate": 7.280831545667611e-07, + "loss": 0.68227994, + "num_input_tokens_seen": 261323265, + "step": 12105, + "time_per_iteration": 2.767533302307129 + }, + { + "auxiliary_loss_clip": 0.01115684, + "auxiliary_loss_mlp": 0.01034833, + "balance_loss_clip": 1.04181576, + "balance_loss_mlp": 1.02132106, + "epoch": 0.7278520967984368, + "flos": 19206499351680.0, + "grad_norm": 3.014598256639034, + "language_loss": 0.75495023, + "learning_rate": 7.27782622021939e-07, + "loss": 0.7764554, + "num_input_tokens_seen": 261339745, + "step": 12106, + "time_per_iteration": 2.595414161682129 + }, + { + "auxiliary_loss_clip": 0.01103034, + "auxiliary_loss_mlp": 0.01033288, + "balance_loss_clip": 1.03735209, + "balance_loss_mlp": 1.01898909, + "epoch": 0.7279122200511048, + "flos": 34094667870720.0, + "grad_norm": 2.1351092676673162, + "language_loss": 0.70326072, + "learning_rate": 7.274821377197273e-07, + "loss": 0.72462392, + "num_input_tokens_seen": 261359310, + "step": 12107, + "time_per_iteration": 4.187346935272217 + }, + { + "auxiliary_loss_clip": 0.01094591, + "auxiliary_loss_mlp": 0.0103929, + "balance_loss_clip": 1.03660846, + "balance_loss_mlp": 1.02583683, + "epoch": 0.7279723433037727, + "flos": 54599049348480.0, + "grad_norm": 1.7543215604249431, + "language_loss": 0.75391257, + "learning_rate": 7.271817016715205e-07, + "loss": 0.77525139, + "num_input_tokens_seen": 261384640, + "step": 12108, + "time_per_iteration": 2.922069549560547 + }, + { + "auxiliary_loss_clip": 0.01111137, + "auxiliary_loss_mlp": 0.01031166, + "balance_loss_clip": 1.03809679, + "balance_loss_mlp": 1.01802313, + "epoch": 0.7280324665564407, + "flos": 36137482156800.0, + "grad_norm": 1.5176447474285724, + "language_loss": 0.67057818, + "learning_rate": 7.268813138887124e-07, + "loss": 0.69200122, + "num_input_tokens_seen": 261405290, + "step": 12109, + "time_per_iteration": 2.691226005554199 + }, + { + "auxiliary_loss_clip": 0.01073593, + "auxiliary_loss_mlp": 0.01033469, + "balance_loss_clip": 1.03573251, + "balance_loss_mlp": 1.01958656, + "epoch": 0.7280925898091086, + "flos": 11618539165440.0, + "grad_norm": 2.3584964062920646, + "language_loss": 0.63489443, + "learning_rate": 7.265809743826912e-07, + "loss": 0.65596509, + "num_input_tokens_seen": 261419710, + "step": 12110, + "time_per_iteration": 2.7957284450531006 + }, + { + "auxiliary_loss_clip": 0.01079859, + "auxiliary_loss_mlp": 0.01029854, + "balance_loss_clip": 1.03503799, + "balance_loss_mlp": 1.01581717, + "epoch": 0.7281527130617766, + "flos": 34277094069120.0, + "grad_norm": 2.403450287181842, + "language_loss": 0.58412719, + "learning_rate": 7.26280683164847e-07, + "loss": 0.60522431, + "num_input_tokens_seen": 261442385, + "step": 12111, + "time_per_iteration": 2.8229284286499023 + }, + { + "auxiliary_loss_clip": 0.01063232, + "auxiliary_loss_mlp": 0.0103291, + "balance_loss_clip": 1.03874135, + "balance_loss_mlp": 1.01915908, + "epoch": 0.7282128363144446, + "flos": 13918043018880.0, + "grad_norm": 2.038144887813222, + "language_loss": 0.73754865, + "learning_rate": 7.259804402465677e-07, + "loss": 0.75851005, + "num_input_tokens_seen": 261459805, + "step": 12112, + "time_per_iteration": 2.7780139446258545 + }, + { + "auxiliary_loss_clip": 0.01098263, + "auxiliary_loss_mlp": 0.01031686, + "balance_loss_clip": 1.03572726, + "balance_loss_mlp": 1.01943743, + "epoch": 0.7282729595671126, + "flos": 20777627214720.0, + "grad_norm": 2.316952642046255, + "language_loss": 0.66911846, + "learning_rate": 7.25680245639237e-07, + "loss": 0.69041795, + "num_input_tokens_seen": 261477175, + "step": 12113, + "time_per_iteration": 2.6054317951202393 + }, + { + "auxiliary_loss_clip": 0.01073794, + "auxiliary_loss_mlp": 0.01034736, + "balance_loss_clip": 1.03603506, + "balance_loss_mlp": 1.02081203, + "epoch": 0.7283330828197806, + "flos": 16325422392960.0, + "grad_norm": 2.2071094228181716, + "language_loss": 0.73312247, + "learning_rate": 7.253800993542399e-07, + "loss": 0.75420773, + "num_input_tokens_seen": 261494990, + "step": 12114, + "time_per_iteration": 2.779949188232422 + }, + { + "auxiliary_loss_clip": 0.0108015, + "auxiliary_loss_mlp": 0.01031985, + "balance_loss_clip": 1.03596735, + "balance_loss_mlp": 1.01860976, + "epoch": 0.7283932060724485, + "flos": 27490193043840.0, + "grad_norm": 2.0284186088728604, + "language_loss": 0.68312764, + "learning_rate": 7.250800014029564e-07, + "loss": 0.70424896, + "num_input_tokens_seen": 261514445, + "step": 12115, + "time_per_iteration": 2.7396066188812256 + }, + { + "auxiliary_loss_clip": 0.01112838, + "auxiliary_loss_mlp": 0.01035969, + "balance_loss_clip": 1.03786767, + "balance_loss_mlp": 1.02284992, + "epoch": 0.7284533293251165, + "flos": 18367877543040.0, + "grad_norm": 1.7392304859469863, + "language_loss": 0.60055017, + "learning_rate": 7.247799517967674e-07, + "loss": 0.62203836, + "num_input_tokens_seen": 261533565, + "step": 12116, + "time_per_iteration": 2.6416893005371094 + }, + { + "auxiliary_loss_clip": 0.01101571, + "auxiliary_loss_mlp": 0.010328, + "balance_loss_clip": 1.03989601, + "balance_loss_mlp": 1.01943648, + "epoch": 0.7285134525777844, + "flos": 21725525174400.0, + "grad_norm": 1.8456050280461243, + "language_loss": 0.73165786, + "learning_rate": 7.2447995054705e-07, + "loss": 0.75300157, + "num_input_tokens_seen": 261553795, + "step": 12117, + "time_per_iteration": 2.680856704711914 + }, + { + "auxiliary_loss_clip": 0.01096697, + "auxiliary_loss_mlp": 0.01032842, + "balance_loss_clip": 1.03561711, + "balance_loss_mlp": 1.01907897, + "epoch": 0.7285735758304525, + "flos": 20741357456640.0, + "grad_norm": 1.892233976782661, + "language_loss": 0.69420332, + "learning_rate": 7.241799976651807e-07, + "loss": 0.71549869, + "num_input_tokens_seen": 261572565, + "step": 12118, + "time_per_iteration": 2.689328908920288 + }, + { + "auxiliary_loss_clip": 0.01054191, + "auxiliary_loss_mlp": 0.01039747, + "balance_loss_clip": 1.03333414, + "balance_loss_mlp": 1.026968, + "epoch": 0.7286336990831204, + "flos": 17310954827520.0, + "grad_norm": 6.128645472594502, + "language_loss": 0.84134108, + "learning_rate": 7.238800931625346e-07, + "loss": 0.86228043, + "num_input_tokens_seen": 261590910, + "step": 12119, + "time_per_iteration": 2.811901330947876 + }, + { + "auxiliary_loss_clip": 0.01112084, + "auxiliary_loss_mlp": 0.01029087, + "balance_loss_clip": 1.03825903, + "balance_loss_mlp": 1.01655173, + "epoch": 0.7286938223357884, + "flos": 19787390098560.0, + "grad_norm": 2.0681771064873544, + "language_loss": 0.81878972, + "learning_rate": 7.235802370504831e-07, + "loss": 0.84020138, + "num_input_tokens_seen": 261606005, + "step": 12120, + "time_per_iteration": 2.6672909259796143 + }, + { + "auxiliary_loss_clip": 0.01072804, + "auxiliary_loss_mlp": 0.01040557, + "balance_loss_clip": 1.03617036, + "balance_loss_mlp": 1.02706861, + "epoch": 0.7287539455884563, + "flos": 15340859625600.0, + "grad_norm": 1.933953511288546, + "language_loss": 0.7878201, + "learning_rate": 7.232804293403963e-07, + "loss": 0.8089537, + "num_input_tokens_seen": 261622305, + "step": 12121, + "time_per_iteration": 2.6193950176239014 + }, + { + "auxiliary_loss_clip": 0.01111609, + "auxiliary_loss_mlp": 0.01036655, + "balance_loss_clip": 1.0360496, + "balance_loss_mlp": 1.02327943, + "epoch": 0.7288140688411243, + "flos": 25192484870400.0, + "grad_norm": 1.533681893436525, + "language_loss": 0.69097638, + "learning_rate": 7.229806700436441e-07, + "loss": 0.71245903, + "num_input_tokens_seen": 261642465, + "step": 12122, + "time_per_iteration": 2.650777578353882 + }, + { + "auxiliary_loss_clip": 0.01064636, + "auxiliary_loss_mlp": 0.01033566, + "balance_loss_clip": 1.03321254, + "balance_loss_mlp": 1.02150214, + "epoch": 0.7288741920937922, + "flos": 23984162328960.0, + "grad_norm": 1.9841747121514857, + "language_loss": 0.87224233, + "learning_rate": 7.226809591715923e-07, + "loss": 0.89322436, + "num_input_tokens_seen": 261661420, + "step": 12123, + "time_per_iteration": 2.767803907394409 + }, + { + "auxiliary_loss_clip": 0.01077874, + "auxiliary_loss_mlp": 0.01035309, + "balance_loss_clip": 1.0370611, + "balance_loss_mlp": 1.02279758, + "epoch": 0.7289343153464602, + "flos": 22744921155840.0, + "grad_norm": 19.006549121525065, + "language_loss": 0.8255595, + "learning_rate": 7.223812967356065e-07, + "loss": 0.84669125, + "num_input_tokens_seen": 261680865, + "step": 12124, + "time_per_iteration": 2.7401580810546875 + }, + { + "auxiliary_loss_clip": 0.01082733, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.03729665, + "balance_loss_mlp": 1.01955354, + "epoch": 0.7289944385991282, + "flos": 24900028335360.0, + "grad_norm": 2.2469511782017726, + "language_loss": 0.67069578, + "learning_rate": 7.220816827470499e-07, + "loss": 0.69184899, + "num_input_tokens_seen": 261701455, + "step": 12125, + "time_per_iteration": 2.681535243988037 + }, + { + "auxiliary_loss_clip": 0.01104267, + "auxiliary_loss_mlp": 0.01038084, + "balance_loss_clip": 1.03742492, + "balance_loss_mlp": 1.02412462, + "epoch": 0.7290545618517962, + "flos": 22967064817920.0, + "grad_norm": 2.039401823737763, + "language_loss": 0.74920547, + "learning_rate": 7.217821172172855e-07, + "loss": 0.77062899, + "num_input_tokens_seen": 261721260, + "step": 12126, + "time_per_iteration": 2.6920571327209473 + }, + { + "auxiliary_loss_clip": 0.01016131, + "auxiliary_loss_mlp": 0.00997812, + "balance_loss_clip": 1.01327682, + "balance_loss_mlp": 0.99669784, + "epoch": 0.7291146851044642, + "flos": 61901523216000.0, + "grad_norm": 0.8366377087030958, + "language_loss": 0.5864383, + "learning_rate": 7.2148260015767e-07, + "loss": 0.60657775, + "num_input_tokens_seen": 261779370, + "step": 12127, + "time_per_iteration": 3.1948511600494385 + }, + { + "auxiliary_loss_clip": 0.01076598, + "auxiliary_loss_mlp": 0.01031063, + "balance_loss_clip": 1.03621507, + "balance_loss_mlp": 1.01911807, + "epoch": 0.7291748083571321, + "flos": 23330947547520.0, + "grad_norm": 2.1989684199567376, + "language_loss": 0.68995476, + "learning_rate": 7.21183131579562e-07, + "loss": 0.71103132, + "num_input_tokens_seen": 261798050, + "step": 12128, + "time_per_iteration": 2.761828899383545 + }, + { + "auxiliary_loss_clip": 0.01085147, + "auxiliary_loss_mlp": 0.01035559, + "balance_loss_clip": 1.03663111, + "balance_loss_mlp": 1.02137899, + "epoch": 0.7292349316098001, + "flos": 28330000001280.0, + "grad_norm": 1.8229974773388113, + "language_loss": 0.65319067, + "learning_rate": 7.20883711494319e-07, + "loss": 0.67439777, + "num_input_tokens_seen": 261817660, + "step": 12129, + "time_per_iteration": 2.7223851680755615 + }, + { + "auxiliary_loss_clip": 0.01108826, + "auxiliary_loss_mlp": 0.01030564, + "balance_loss_clip": 1.03813577, + "balance_loss_mlp": 1.01728415, + "epoch": 0.729295054862468, + "flos": 24132222190080.0, + "grad_norm": 1.987746290779436, + "language_loss": 0.74474001, + "learning_rate": 7.205843399132927e-07, + "loss": 0.7661339, + "num_input_tokens_seen": 261837935, + "step": 12130, + "time_per_iteration": 2.624861001968384 + }, + { + "auxiliary_loss_clip": 0.01084684, + "auxiliary_loss_mlp": 0.01036003, + "balance_loss_clip": 1.03371596, + "balance_loss_mlp": 1.02260351, + "epoch": 0.7293551781151361, + "flos": 22816239609600.0, + "grad_norm": 1.9230016702733295, + "language_loss": 0.69777483, + "learning_rate": 7.202850168478374e-07, + "loss": 0.71898174, + "num_input_tokens_seen": 261857575, + "step": 12131, + "time_per_iteration": 2.686483383178711 + }, + { + "auxiliary_loss_clip": 0.01075038, + "auxiliary_loss_mlp": 0.01032777, + "balance_loss_clip": 1.03706694, + "balance_loss_mlp": 1.02072525, + "epoch": 0.729415301367804, + "flos": 22126683242880.0, + "grad_norm": 1.5997534699121376, + "language_loss": 0.77348047, + "learning_rate": 7.199857423093025e-07, + "loss": 0.79455858, + "num_input_tokens_seen": 261877265, + "step": 12132, + "time_per_iteration": 2.7391042709350586 + }, + { + "auxiliary_loss_clip": 0.0110301, + "auxiliary_loss_mlp": 0.01038259, + "balance_loss_clip": 1.03978968, + "balance_loss_mlp": 1.02559876, + "epoch": 0.729475424620472, + "flos": 12349608675840.0, + "grad_norm": 2.2281458510507797, + "language_loss": 0.78860861, + "learning_rate": 7.196865163090358e-07, + "loss": 0.81002128, + "num_input_tokens_seen": 261893695, + "step": 12133, + "time_per_iteration": 2.5943353176116943 + }, + { + "auxiliary_loss_clip": 0.01060968, + "auxiliary_loss_mlp": 0.01032725, + "balance_loss_clip": 1.03212547, + "balance_loss_mlp": 1.01933742, + "epoch": 0.7295355478731399, + "flos": 22195308176640.0, + "grad_norm": 2.7553273898402333, + "language_loss": 0.72054434, + "learning_rate": 7.193873388583846e-07, + "loss": 0.7414813, + "num_input_tokens_seen": 261911825, + "step": 12134, + "time_per_iteration": 2.764251470565796 + }, + { + "auxiliary_loss_clip": 0.01091285, + "auxiliary_loss_mlp": 0.01040465, + "balance_loss_clip": 1.03840399, + "balance_loss_mlp": 1.02753675, + "epoch": 0.7295956711258079, + "flos": 23222030532480.0, + "grad_norm": 2.1447336349614203, + "language_loss": 0.71251649, + "learning_rate": 7.190882099686939e-07, + "loss": 0.73383397, + "num_input_tokens_seen": 261931190, + "step": 12135, + "time_per_iteration": 2.7322559356689453 + }, + { + "auxiliary_loss_clip": 0.01077251, + "auxiliary_loss_mlp": 0.01035683, + "balance_loss_clip": 1.03486896, + "balance_loss_mlp": 1.02259374, + "epoch": 0.7296557943784758, + "flos": 31869104163840.0, + "grad_norm": 2.309450763309982, + "language_loss": 0.61924529, + "learning_rate": 7.187891296513075e-07, + "loss": 0.64037454, + "num_input_tokens_seen": 261951240, + "step": 12136, + "time_per_iteration": 2.7608072757720947 + }, + { + "auxiliary_loss_clip": 0.01094465, + "auxiliary_loss_mlp": 0.00770512, + "balance_loss_clip": 1.03708506, + "balance_loss_mlp": 1.00022686, + "epoch": 0.7297159176311439, + "flos": 26651714889600.0, + "grad_norm": 1.8756317332834676, + "language_loss": 0.74414635, + "learning_rate": 7.184900979175654e-07, + "loss": 0.76279616, + "num_input_tokens_seen": 261971605, + "step": 12137, + "time_per_iteration": 2.6699535846710205 + }, + { + "auxiliary_loss_clip": 0.01104052, + "auxiliary_loss_mlp": 0.00771068, + "balance_loss_clip": 1.04109406, + "balance_loss_mlp": 1.00024545, + "epoch": 0.7297760408838118, + "flos": 24749562263040.0, + "grad_norm": 1.6416252206910797, + "language_loss": 0.74556518, + "learning_rate": 7.181911147788069e-07, + "loss": 0.76431638, + "num_input_tokens_seen": 261990830, + "step": 12138, + "time_per_iteration": 2.6462252140045166 + }, + { + "auxiliary_loss_clip": 0.01073993, + "auxiliary_loss_mlp": 0.01030576, + "balance_loss_clip": 1.03440869, + "balance_loss_mlp": 1.01832712, + "epoch": 0.7298361641364798, + "flos": 18073768982400.0, + "grad_norm": 2.2048130444672527, + "language_loss": 0.71792364, + "learning_rate": 7.178921802463702e-07, + "loss": 0.73896933, + "num_input_tokens_seen": 262008190, + "step": 12139, + "time_per_iteration": 2.637579917907715 + }, + { + "auxiliary_loss_clip": 0.01094798, + "auxiliary_loss_mlp": 0.01029337, + "balance_loss_clip": 1.03654766, + "balance_loss_mlp": 1.01727343, + "epoch": 0.7298962873891478, + "flos": 29895597169920.0, + "grad_norm": 1.5727231241692394, + "language_loss": 0.73340857, + "learning_rate": 7.175932943315898e-07, + "loss": 0.75464988, + "num_input_tokens_seen": 262030460, + "step": 12140, + "time_per_iteration": 4.322738170623779 + }, + { + "auxiliary_loss_clip": 0.01086242, + "auxiliary_loss_mlp": 0.01033553, + "balance_loss_clip": 1.03733993, + "balance_loss_mlp": 1.02028465, + "epoch": 0.7299564106418157, + "flos": 32266096254720.0, + "grad_norm": 2.108634462016176, + "language_loss": 0.55439997, + "learning_rate": 7.172944570458003e-07, + "loss": 0.57559788, + "num_input_tokens_seen": 262050830, + "step": 12141, + "time_per_iteration": 4.280510425567627 + }, + { + "auxiliary_loss_clip": 0.01072661, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.03414416, + "balance_loss_mlp": 1.0185132, + "epoch": 0.7300165338944837, + "flos": 22930292269440.0, + "grad_norm": 1.6200088413354243, + "language_loss": 0.72661757, + "learning_rate": 7.169956684003342e-07, + "loss": 0.74765337, + "num_input_tokens_seen": 262071245, + "step": 12142, + "time_per_iteration": 4.36347508430481 + }, + { + "auxiliary_loss_clip": 0.01109011, + "auxiliary_loss_mlp": 0.01039998, + "balance_loss_clip": 1.03754866, + "balance_loss_mlp": 1.02798176, + "epoch": 0.7300766571471516, + "flos": 19828795501440.0, + "grad_norm": 1.8395683964833187, + "language_loss": 0.73354667, + "learning_rate": 7.16696928406521e-07, + "loss": 0.75503671, + "num_input_tokens_seen": 262087525, + "step": 12143, + "time_per_iteration": 2.562661647796631 + }, + { + "auxiliary_loss_clip": 0.01072117, + "auxiliary_loss_mlp": 0.01036251, + "balance_loss_clip": 1.0354147, + "balance_loss_mlp": 1.02270293, + "epoch": 0.7301367803998197, + "flos": 24347829576960.0, + "grad_norm": 11.58693755368333, + "language_loss": 0.67069697, + "learning_rate": 7.163982370756882e-07, + "loss": 0.69178069, + "num_input_tokens_seen": 262107355, + "step": 12144, + "time_per_iteration": 2.7019169330596924 + }, + { + "auxiliary_loss_clip": 0.01087218, + "auxiliary_loss_mlp": 0.01031157, + "balance_loss_clip": 1.03756452, + "balance_loss_mlp": 1.01808596, + "epoch": 0.7301969036524876, + "flos": 15304518040320.0, + "grad_norm": 2.004686825288867, + "language_loss": 0.79088622, + "learning_rate": 7.160995944191627e-07, + "loss": 0.81206995, + "num_input_tokens_seen": 262125645, + "step": 12145, + "time_per_iteration": 2.609962224960327 + }, + { + "auxiliary_loss_clip": 0.01071068, + "auxiliary_loss_mlp": 0.01038463, + "balance_loss_clip": 1.03582478, + "balance_loss_mlp": 1.02542722, + "epoch": 0.7302570269051556, + "flos": 23507268433920.0, + "grad_norm": 2.189602190838667, + "language_loss": 0.91191077, + "learning_rate": 7.158010004482702e-07, + "loss": 0.93300605, + "num_input_tokens_seen": 262144075, + "step": 12146, + "time_per_iteration": 4.17360258102417 + }, + { + "auxiliary_loss_clip": 0.01107983, + "auxiliary_loss_mlp": 0.01027437, + "balance_loss_clip": 1.03820586, + "balance_loss_mlp": 1.01547432, + "epoch": 0.7303171501578235, + "flos": 20523056549760.0, + "grad_norm": 1.801228566583195, + "language_loss": 0.62361127, + "learning_rate": 7.155024551743316e-07, + "loss": 0.64496547, + "num_input_tokens_seen": 262165940, + "step": 12147, + "time_per_iteration": 2.7316384315490723 + }, + { + "auxiliary_loss_clip": 0.01113892, + "auxiliary_loss_mlp": 0.01039081, + "balance_loss_clip": 1.0402323, + "balance_loss_mlp": 1.02578294, + "epoch": 0.7303772734104915, + "flos": 18332613365760.0, + "grad_norm": 1.9466892860385239, + "language_loss": 0.75526571, + "learning_rate": 7.152039586086693e-07, + "loss": 0.77679539, + "num_input_tokens_seen": 262184520, + "step": 12148, + "time_per_iteration": 2.55757999420166 + }, + { + "auxiliary_loss_clip": 0.01010613, + "auxiliary_loss_mlp": 0.0075184, + "balance_loss_clip": 1.00818348, + "balance_loss_mlp": 0.99964029, + "epoch": 0.7304373966631594, + "flos": 60654776100480.0, + "grad_norm": 0.6918687189528673, + "language_loss": 0.56630087, + "learning_rate": 7.149055107626017e-07, + "loss": 0.58392537, + "num_input_tokens_seen": 262247070, + "step": 12149, + "time_per_iteration": 3.1780648231506348 + }, + { + "auxiliary_loss_clip": 0.01090981, + "auxiliary_loss_mlp": 0.01036437, + "balance_loss_clip": 1.03665161, + "balance_loss_mlp": 1.02352667, + "epoch": 0.7304975199158275, + "flos": 19828077229440.0, + "grad_norm": 1.6617515713368272, + "language_loss": 0.73949683, + "learning_rate": 7.146071116474451e-07, + "loss": 0.76077104, + "num_input_tokens_seen": 262266605, + "step": 12150, + "time_per_iteration": 2.6775600910186768 + }, + { + "auxiliary_loss_clip": 0.0111323, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.03854418, + "balance_loss_mlp": 1.02156699, + "epoch": 0.7305576431684954, + "flos": 13223997452160.0, + "grad_norm": 2.052406638174018, + "language_loss": 0.84060204, + "learning_rate": 7.143087612745158e-07, + "loss": 0.86207914, + "num_input_tokens_seen": 262283880, + "step": 12151, + "time_per_iteration": 2.589292049407959 + }, + { + "auxiliary_loss_clip": 0.01072466, + "auxiliary_loss_mlp": 0.01040374, + "balance_loss_clip": 1.03497267, + "balance_loss_mlp": 1.02686191, + "epoch": 0.7306177664211634, + "flos": 24060472773120.0, + "grad_norm": 1.844893248025129, + "language_loss": 0.78079808, + "learning_rate": 7.14010459655127e-07, + "loss": 0.80192649, + "num_input_tokens_seen": 262304155, + "step": 12152, + "time_per_iteration": 2.7783727645874023 + }, + { + "auxiliary_loss_clip": 0.01075382, + "auxiliary_loss_mlp": 0.01032051, + "balance_loss_clip": 1.03711772, + "balance_loss_mlp": 1.01889646, + "epoch": 0.7306778896738314, + "flos": 27089106802560.0, + "grad_norm": 2.295487047202377, + "language_loss": 0.79554176, + "learning_rate": 7.137122068005919e-07, + "loss": 0.81661606, + "num_input_tokens_seen": 262325660, + "step": 12153, + "time_per_iteration": 2.773252010345459 + }, + { + "auxiliary_loss_clip": 0.01100913, + "auxiliary_loss_mlp": 0.01037363, + "balance_loss_clip": 1.03726029, + "balance_loss_mlp": 1.02455413, + "epoch": 0.7307380129264993, + "flos": 16690669839360.0, + "grad_norm": 1.708854446027603, + "language_loss": 0.67438841, + "learning_rate": 7.134140027222173e-07, + "loss": 0.69577122, + "num_input_tokens_seen": 262344075, + "step": 12154, + "time_per_iteration": 2.657804489135742 + }, + { + "auxiliary_loss_clip": 0.01064569, + "auxiliary_loss_mlp": 0.01032574, + "balance_loss_clip": 1.03754902, + "balance_loss_mlp": 1.01900196, + "epoch": 0.7307981361791673, + "flos": 21725740656000.0, + "grad_norm": 1.7409892720521978, + "language_loss": 0.6598506, + "learning_rate": 7.131158474313128e-07, + "loss": 0.68082201, + "num_input_tokens_seen": 262363305, + "step": 12155, + "time_per_iteration": 2.727818012237549 + }, + { + "auxiliary_loss_clip": 0.01090955, + "auxiliary_loss_mlp": 0.01028633, + "balance_loss_clip": 1.03944302, + "balance_loss_mlp": 1.01606798, + "epoch": 0.7308582594318352, + "flos": 18040659621120.0, + "grad_norm": 2.059846064937341, + "language_loss": 0.81401372, + "learning_rate": 7.128177409391851e-07, + "loss": 0.83520961, + "num_input_tokens_seen": 262380730, + "step": 12156, + "time_per_iteration": 2.6713905334472656 + }, + { + "auxiliary_loss_clip": 0.01069178, + "auxiliary_loss_mlp": 0.01038604, + "balance_loss_clip": 1.03357935, + "balance_loss_mlp": 1.02677894, + "epoch": 0.7309183826845033, + "flos": 13844964798720.0, + "grad_norm": 2.368813587947745, + "language_loss": 0.7572211, + "learning_rate": 7.125196832571367e-07, + "loss": 0.77829891, + "num_input_tokens_seen": 262395480, + "step": 12157, + "time_per_iteration": 2.6478710174560547 + }, + { + "auxiliary_loss_clip": 0.01097661, + "auxiliary_loss_mlp": 0.01029375, + "balance_loss_clip": 1.03817534, + "balance_loss_mlp": 1.01818156, + "epoch": 0.7309785059371712, + "flos": 17019216564480.0, + "grad_norm": 2.20999557197409, + "language_loss": 0.72660947, + "learning_rate": 7.122216743964713e-07, + "loss": 0.74787986, + "num_input_tokens_seen": 262413340, + "step": 12158, + "time_per_iteration": 2.6752305030822754 + }, + { + "auxiliary_loss_clip": 0.01090002, + "auxiliary_loss_mlp": 0.01036269, + "balance_loss_clip": 1.03874135, + "balance_loss_mlp": 1.02343071, + "epoch": 0.7310386291898392, + "flos": 26502398052480.0, + "grad_norm": 1.5980086656224926, + "language_loss": 0.85433125, + "learning_rate": 7.119237143684896e-07, + "loss": 0.87559396, + "num_input_tokens_seen": 262433455, + "step": 12159, + "time_per_iteration": 2.722282886505127 + }, + { + "auxiliary_loss_clip": 0.01090808, + "auxiliary_loss_mlp": 0.01033782, + "balance_loss_clip": 1.0357151, + "balance_loss_mlp": 1.01996553, + "epoch": 0.7310987524425071, + "flos": 16945922862720.0, + "grad_norm": 2.240373926166887, + "language_loss": 0.73471999, + "learning_rate": 7.116258031844895e-07, + "loss": 0.75596595, + "num_input_tokens_seen": 262450335, + "step": 12160, + "time_per_iteration": 2.6522862911224365 + }, + { + "auxiliary_loss_clip": 0.01103069, + "auxiliary_loss_mlp": 0.01035667, + "balance_loss_clip": 1.0388577, + "balance_loss_mlp": 1.0220058, + "epoch": 0.7311588756951751, + "flos": 13845288021120.0, + "grad_norm": 1.9039689153632533, + "language_loss": 0.72493577, + "learning_rate": 7.113279408557675e-07, + "loss": 0.74632311, + "num_input_tokens_seen": 262468240, + "step": 12161, + "time_per_iteration": 2.5589683055877686 + }, + { + "auxiliary_loss_clip": 0.01083193, + "auxiliary_loss_mlp": 0.00772186, + "balance_loss_clip": 1.03667367, + "balance_loss_mlp": 1.00028253, + "epoch": 0.731218998947843, + "flos": 28767894704640.0, + "grad_norm": 1.765712961659712, + "language_loss": 0.69565916, + "learning_rate": 7.110301273936192e-07, + "loss": 0.71421289, + "num_input_tokens_seen": 262487045, + "step": 12162, + "time_per_iteration": 2.8083322048187256 + }, + { + "auxiliary_loss_clip": 0.01102238, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.03934407, + "balance_loss_mlp": 1.01765895, + "epoch": 0.7312791222005111, + "flos": 27088783580160.0, + "grad_norm": 1.79396916880486, + "language_loss": 0.66982478, + "learning_rate": 7.107323628093382e-07, + "loss": 0.69115686, + "num_input_tokens_seen": 262504855, + "step": 12163, + "time_per_iteration": 2.664005756378174 + }, + { + "auxiliary_loss_clip": 0.01088818, + "auxiliary_loss_mlp": 0.01029215, + "balance_loss_clip": 1.03657246, + "balance_loss_mlp": 1.01618505, + "epoch": 0.731339245453179, + "flos": 20924035050240.0, + "grad_norm": 1.4858782021210455, + "language_loss": 0.68422931, + "learning_rate": 7.104346471142153e-07, + "loss": 0.70540965, + "num_input_tokens_seen": 262524920, + "step": 12164, + "time_per_iteration": 2.730407953262329 + }, + { + "auxiliary_loss_clip": 0.01064444, + "auxiliary_loss_mlp": 0.01035496, + "balance_loss_clip": 1.03925169, + "balance_loss_mlp": 1.02344418, + "epoch": 0.731399368705847, + "flos": 23075694524160.0, + "grad_norm": 1.621904213104564, + "language_loss": 0.73121232, + "learning_rate": 7.101369803195391e-07, + "loss": 0.75221169, + "num_input_tokens_seen": 262545725, + "step": 12165, + "time_per_iteration": 2.745304584503174 + }, + { + "auxiliary_loss_clip": 0.01104061, + "auxiliary_loss_mlp": 0.0103506, + "balance_loss_clip": 1.03919411, + "balance_loss_mlp": 1.02191114, + "epoch": 0.731459491958515, + "flos": 23582681038080.0, + "grad_norm": 1.959130136013477, + "language_loss": 0.7631768, + "learning_rate": 7.098393624365988e-07, + "loss": 0.78456795, + "num_input_tokens_seen": 262565480, + "step": 12166, + "time_per_iteration": 2.655210256576538 + }, + { + "auxiliary_loss_clip": 0.01083193, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.03837287, + "balance_loss_mlp": 1.01877546, + "epoch": 0.7315196152111829, + "flos": 22379278659840.0, + "grad_norm": 1.7735251016583573, + "language_loss": 0.79791737, + "learning_rate": 7.095417934766781e-07, + "loss": 0.81906146, + "num_input_tokens_seen": 262584145, + "step": 12167, + "time_per_iteration": 2.686013698577881 + }, + { + "auxiliary_loss_clip": 0.01099781, + "auxiliary_loss_mlp": 0.01043597, + "balance_loss_clip": 1.03856659, + "balance_loss_mlp": 1.03108573, + "epoch": 0.7315797384638509, + "flos": 26177047637760.0, + "grad_norm": 1.6689116898679521, + "language_loss": 0.76710904, + "learning_rate": 7.092442734510622e-07, + "loss": 0.78854281, + "num_input_tokens_seen": 262604045, + "step": 12168, + "time_per_iteration": 2.6875557899475098 + }, + { + "auxiliary_loss_clip": 0.0109665, + "auxiliary_loss_mlp": 0.01043712, + "balance_loss_clip": 1.03574252, + "balance_loss_mlp": 1.02774954, + "epoch": 0.7316398617165188, + "flos": 21506326427520.0, + "grad_norm": 2.5442709815389684, + "language_loss": 0.81822222, + "learning_rate": 7.089468023710326e-07, + "loss": 0.83962584, + "num_input_tokens_seen": 262624540, + "step": 12169, + "time_per_iteration": 2.592453718185425 + }, + { + "auxiliary_loss_clip": 0.01097824, + "auxiliary_loss_mlp": 0.01039563, + "balance_loss_clip": 1.03882432, + "balance_loss_mlp": 1.0264802, + "epoch": 0.7316999849691869, + "flos": 30482557315200.0, + "grad_norm": 1.9915594425883627, + "language_loss": 0.69992799, + "learning_rate": 7.08649380247871e-07, + "loss": 0.72130191, + "num_input_tokens_seen": 262644545, + "step": 12170, + "time_per_iteration": 2.7040326595306396 + }, + { + "auxiliary_loss_clip": 0.01109905, + "auxiliary_loss_mlp": 0.01032057, + "balance_loss_clip": 1.03831005, + "balance_loss_mlp": 1.01799059, + "epoch": 0.7317601082218548, + "flos": 21543781334400.0, + "grad_norm": 15.0863481947429, + "language_loss": 0.69820881, + "learning_rate": 7.083520070928533e-07, + "loss": 0.71962845, + "num_input_tokens_seen": 262662570, + "step": 12171, + "time_per_iteration": 2.5760347843170166 + }, + { + "auxiliary_loss_clip": 0.01111903, + "auxiliary_loss_mlp": 0.0104052, + "balance_loss_clip": 1.03991163, + "balance_loss_mlp": 1.0280571, + "epoch": 0.7318202314745228, + "flos": 33251592775680.0, + "grad_norm": 4.139375107953077, + "language_loss": 0.65600061, + "learning_rate": 7.080546829172564e-07, + "loss": 0.67752481, + "num_input_tokens_seen": 262683245, + "step": 12172, + "time_per_iteration": 2.629512071609497 + }, + { + "auxiliary_loss_clip": 0.01112155, + "auxiliary_loss_mlp": 0.01027678, + "balance_loss_clip": 1.03968287, + "balance_loss_mlp": 1.01504803, + "epoch": 0.7318803547271907, + "flos": 20157054917760.0, + "grad_norm": 2.4544456450965577, + "language_loss": 0.6181004, + "learning_rate": 7.077574077323564e-07, + "loss": 0.63949871, + "num_input_tokens_seen": 262701585, + "step": 12173, + "time_per_iteration": 2.714617967605591 + }, + { + "auxiliary_loss_clip": 0.01056565, + "auxiliary_loss_mlp": 0.01030506, + "balance_loss_clip": 1.03468084, + "balance_loss_mlp": 1.01789331, + "epoch": 0.7319404779798587, + "flos": 20558536208640.0, + "grad_norm": 3.4474002403228714, + "language_loss": 0.74141943, + "learning_rate": 7.074601815494243e-07, + "loss": 0.76229018, + "num_input_tokens_seen": 262719295, + "step": 12174, + "time_per_iteration": 2.691361427307129 + }, + { + "auxiliary_loss_clip": 0.0110738, + "auxiliary_loss_mlp": 0.01029138, + "balance_loss_clip": 1.03786492, + "balance_loss_mlp": 1.01689529, + "epoch": 0.7320006012325266, + "flos": 28695391102080.0, + "grad_norm": 1.70169272855857, + "language_loss": 0.80771077, + "learning_rate": 7.071630043797317e-07, + "loss": 0.82907599, + "num_input_tokens_seen": 262739995, + "step": 12175, + "time_per_iteration": 2.6333701610565186 + }, + { + "auxiliary_loss_clip": 0.01091186, + "auxiliary_loss_mlp": 0.01029927, + "balance_loss_clip": 1.03785181, + "balance_loss_mlp": 1.01719511, + "epoch": 0.7320607244851947, + "flos": 16362697731840.0, + "grad_norm": 2.2994636661960777, + "language_loss": 0.76175666, + "learning_rate": 7.068658762345488e-07, + "loss": 0.78296781, + "num_input_tokens_seen": 262757680, + "step": 12176, + "time_per_iteration": 2.6684181690216064 + }, + { + "auxiliary_loss_clip": 0.01099222, + "auxiliary_loss_mlp": 0.01033517, + "balance_loss_clip": 1.03950393, + "balance_loss_mlp": 1.02143455, + "epoch": 0.7321208477378626, + "flos": 20955097336320.0, + "grad_norm": 1.7266339084119442, + "language_loss": 0.76393938, + "learning_rate": 7.065687971251399e-07, + "loss": 0.78526676, + "num_input_tokens_seen": 262776990, + "step": 12177, + "time_per_iteration": 2.5895602703094482 + }, + { + "auxiliary_loss_clip": 0.01076316, + "auxiliary_loss_mlp": 0.0103859, + "balance_loss_clip": 1.03529096, + "balance_loss_mlp": 1.02638888, + "epoch": 0.7321809709905306, + "flos": 13845072539520.0, + "grad_norm": 2.2196900974647003, + "language_loss": 0.74673522, + "learning_rate": 7.06271767062772e-07, + "loss": 0.76788431, + "num_input_tokens_seen": 262795440, + "step": 12178, + "time_per_iteration": 2.6741504669189453 + }, + { + "auxiliary_loss_clip": 0.01091987, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.03604901, + "balance_loss_mlp": 1.02187705, + "epoch": 0.7322410942431986, + "flos": 26979938392320.0, + "grad_norm": 2.2839200958654584, + "language_loss": 0.82424951, + "learning_rate": 7.059747860587084e-07, + "loss": 0.84551692, + "num_input_tokens_seen": 262816385, + "step": 12179, + "time_per_iteration": 4.333508253097534 + }, + { + "auxiliary_loss_clip": 0.01073556, + "auxiliary_loss_mlp": 0.01040091, + "balance_loss_clip": 1.03531742, + "balance_loss_mlp": 1.02663827, + "epoch": 0.7323012174958665, + "flos": 17639717034240.0, + "grad_norm": 4.252835567274656, + "language_loss": 0.74462938, + "learning_rate": 7.056778541242115e-07, + "loss": 0.76576585, + "num_input_tokens_seen": 262834955, + "step": 12180, + "time_per_iteration": 2.64694881439209 + }, + { + "auxiliary_loss_clip": 0.01100626, + "auxiliary_loss_mlp": 0.00770628, + "balance_loss_clip": 1.03525329, + "balance_loss_mlp": 1.00013947, + "epoch": 0.7323613407485345, + "flos": 32342765834880.0, + "grad_norm": 2.118039690946721, + "language_loss": 0.79425126, + "learning_rate": 7.053809712705396e-07, + "loss": 0.81296378, + "num_input_tokens_seen": 262853555, + "step": 12181, + "time_per_iteration": 5.950862407684326 + }, + { + "auxiliary_loss_clip": 0.01104749, + "auxiliary_loss_mlp": 0.00770994, + "balance_loss_clip": 1.0405333, + "balance_loss_mlp": 1.00015044, + "epoch": 0.7324214640012024, + "flos": 18362777811840.0, + "grad_norm": 3.5037562339731343, + "language_loss": 0.72006238, + "learning_rate": 7.050841375089506e-07, + "loss": 0.73881984, + "num_input_tokens_seen": 262870975, + "step": 12182, + "time_per_iteration": 2.60955810546875 + }, + { + "auxiliary_loss_clip": 0.01113664, + "auxiliary_loss_mlp": 0.01031542, + "balance_loss_clip": 1.04023218, + "balance_loss_mlp": 1.01922774, + "epoch": 0.7324815872538705, + "flos": 30812289189120.0, + "grad_norm": 1.455017822583619, + "language_loss": 0.7080251, + "learning_rate": 7.047873528507015e-07, + "loss": 0.72947717, + "num_input_tokens_seen": 262892635, + "step": 12183, + "time_per_iteration": 2.651121139526367 + }, + { + "auxiliary_loss_clip": 0.01100782, + "auxiliary_loss_mlp": 0.01035961, + "balance_loss_clip": 1.04088736, + "balance_loss_mlp": 1.02230549, + "epoch": 0.7325417105065384, + "flos": 21505069451520.0, + "grad_norm": 1.9960836350213491, + "language_loss": 0.73006004, + "learning_rate": 7.04490617307045e-07, + "loss": 0.75142741, + "num_input_tokens_seen": 262910725, + "step": 12184, + "time_per_iteration": 4.158590078353882 + }, + { + "auxiliary_loss_clip": 0.01011352, + "auxiliary_loss_mlp": 0.01007926, + "balance_loss_clip": 1.00717974, + "balance_loss_mlp": 1.0068059, + "epoch": 0.7326018337592064, + "flos": 67257742556160.0, + "grad_norm": 0.7629811613061157, + "language_loss": 0.65181279, + "learning_rate": 7.041939308892344e-07, + "loss": 0.67200553, + "num_input_tokens_seen": 262974150, + "step": 12185, + "time_per_iteration": 3.1753084659576416 + }, + { + "auxiliary_loss_clip": 0.01110902, + "auxiliary_loss_mlp": 0.01027525, + "balance_loss_clip": 1.03751791, + "balance_loss_mlp": 1.01419187, + "epoch": 0.7326619570118743, + "flos": 22857070394880.0, + "grad_norm": 1.8466605492768327, + "language_loss": 0.80407894, + "learning_rate": 7.038972936085197e-07, + "loss": 0.82546324, + "num_input_tokens_seen": 262993370, + "step": 12186, + "time_per_iteration": 2.7113280296325684 + }, + { + "auxiliary_loss_clip": 0.01095897, + "auxiliary_loss_mlp": 0.01035902, + "balance_loss_clip": 1.03822923, + "balance_loss_mlp": 1.02185869, + "epoch": 0.7327220802645423, + "flos": 23327499841920.0, + "grad_norm": 1.6891374777680592, + "language_loss": 0.73376352, + "learning_rate": 7.036007054761508e-07, + "loss": 0.75508153, + "num_input_tokens_seen": 263012665, + "step": 12187, + "time_per_iteration": 2.6341447830200195 + }, + { + "auxiliary_loss_clip": 0.01113144, + "auxiliary_loss_mlp": 0.01032976, + "balance_loss_clip": 1.03975987, + "balance_loss_mlp": 1.020298, + "epoch": 0.7327822035172102, + "flos": 23180661043200.0, + "grad_norm": 1.849813706667638, + "language_loss": 0.88717717, + "learning_rate": 7.033041665033716e-07, + "loss": 0.90863836, + "num_input_tokens_seen": 263031475, + "step": 12188, + "time_per_iteration": 2.5466268062591553 + }, + { + "auxiliary_loss_clip": 0.01068599, + "auxiliary_loss_mlp": 0.01036205, + "balance_loss_clip": 1.03427935, + "balance_loss_mlp": 1.02241302, + "epoch": 0.7328423267698783, + "flos": 21066600130560.0, + "grad_norm": 2.0499334322207856, + "language_loss": 0.74851215, + "learning_rate": 7.030076767014284e-07, + "loss": 0.76956022, + "num_input_tokens_seen": 263051445, + "step": 12189, + "time_per_iteration": 2.7621939182281494 + }, + { + "auxiliary_loss_clip": 0.01078663, + "auxiliary_loss_mlp": 0.0103229, + "balance_loss_clip": 1.03718972, + "balance_loss_mlp": 1.01898003, + "epoch": 0.7329024500225462, + "flos": 21689578638720.0, + "grad_norm": 1.96321719925377, + "language_loss": 0.82236755, + "learning_rate": 7.027112360815648e-07, + "loss": 0.84347707, + "num_input_tokens_seen": 263070835, + "step": 12190, + "time_per_iteration": 2.701537609100342 + }, + { + "auxiliary_loss_clip": 0.01073099, + "auxiliary_loss_mlp": 0.01036113, + "balance_loss_clip": 1.03755641, + "balance_loss_mlp": 1.02225447, + "epoch": 0.7329625732752142, + "flos": 24164038661760.0, + "grad_norm": 1.6849977085368404, + "language_loss": 0.71588874, + "learning_rate": 7.024148446550204e-07, + "loss": 0.73698092, + "num_input_tokens_seen": 263090070, + "step": 12191, + "time_per_iteration": 2.72813081741333 + }, + { + "auxiliary_loss_clip": 0.01112512, + "auxiliary_loss_mlp": 0.01035784, + "balance_loss_clip": 1.03892088, + "balance_loss_mlp": 1.02245009, + "epoch": 0.7330226965278822, + "flos": 30077915627520.0, + "grad_norm": 1.5354384218805013, + "language_loss": 0.69254857, + "learning_rate": 7.021185024330361e-07, + "loss": 0.71403152, + "num_input_tokens_seen": 263110030, + "step": 12192, + "time_per_iteration": 2.6177656650543213 + }, + { + "auxiliary_loss_clip": 0.01099104, + "auxiliary_loss_mlp": 0.01030904, + "balance_loss_clip": 1.0388236, + "balance_loss_mlp": 1.01836967, + "epoch": 0.7330828197805501, + "flos": 23368294713600.0, + "grad_norm": 1.627423362173816, + "language_loss": 0.73143125, + "learning_rate": 7.01822209426848e-07, + "loss": 0.75273132, + "num_input_tokens_seen": 263129735, + "step": 12193, + "time_per_iteration": 2.6829118728637695 + }, + { + "auxiliary_loss_clip": 0.01094199, + "auxiliary_loss_mlp": 0.01034877, + "balance_loss_clip": 1.03632629, + "balance_loss_mlp": 1.02171612, + "epoch": 0.7331429430332181, + "flos": 21032808410880.0, + "grad_norm": 2.400736232898333, + "language_loss": 0.76939815, + "learning_rate": 7.015259656476911e-07, + "loss": 0.79068899, + "num_input_tokens_seen": 263149100, + "step": 12194, + "time_per_iteration": 2.589165687561035 + }, + { + "auxiliary_loss_clip": 0.01100113, + "auxiliary_loss_mlp": 0.01029986, + "balance_loss_clip": 1.03972054, + "balance_loss_mlp": 1.01695681, + "epoch": 0.733203066285886, + "flos": 14647891466880.0, + "grad_norm": 1.9190061960430176, + "language_loss": 0.70403659, + "learning_rate": 7.012297711067998e-07, + "loss": 0.72533756, + "num_input_tokens_seen": 263166620, + "step": 12195, + "time_per_iteration": 2.550752639770508 + }, + { + "auxiliary_loss_clip": 0.01111325, + "auxiliary_loss_mlp": 0.01036105, + "balance_loss_clip": 1.03835511, + "balance_loss_mlp": 1.02386189, + "epoch": 0.7332631895385541, + "flos": 17165301177600.0, + "grad_norm": 1.958340476490106, + "language_loss": 0.72090805, + "learning_rate": 7.009336258154057e-07, + "loss": 0.74238235, + "num_input_tokens_seen": 263184780, + "step": 12196, + "time_per_iteration": 2.540836811065674 + }, + { + "auxiliary_loss_clip": 0.01111546, + "auxiliary_loss_mlp": 0.01030796, + "balance_loss_clip": 1.04016924, + "balance_loss_mlp": 1.01791, + "epoch": 0.733323312791222, + "flos": 28658151676800.0, + "grad_norm": 1.92503318264866, + "language_loss": 0.71952534, + "learning_rate": 7.006375297847394e-07, + "loss": 0.7409488, + "num_input_tokens_seen": 263204625, + "step": 12197, + "time_per_iteration": 2.6192398071289062 + }, + { + "auxiliary_loss_clip": 0.01058905, + "auxiliary_loss_mlp": 0.00771452, + "balance_loss_clip": 1.03431988, + "balance_loss_mlp": 1.00020027, + "epoch": 0.73338343604389, + "flos": 16618417632000.0, + "grad_norm": 3.2701178801425983, + "language_loss": 0.77824599, + "learning_rate": 7.003414830260282e-07, + "loss": 0.79654956, + "num_input_tokens_seen": 263221565, + "step": 12198, + "time_per_iteration": 2.751495599746704 + }, + { + "auxiliary_loss_clip": 0.0105527, + "auxiliary_loss_mlp": 0.01033337, + "balance_loss_clip": 1.0351963, + "balance_loss_mlp": 1.02071261, + "epoch": 0.7334435592965579, + "flos": 21142084561920.0, + "grad_norm": 1.9440363866172514, + "language_loss": 0.74263847, + "learning_rate": 7.000454855504974e-07, + "loss": 0.76352453, + "num_input_tokens_seen": 263240620, + "step": 12199, + "time_per_iteration": 2.767896890640259 + }, + { + "auxiliary_loss_clip": 0.01094013, + "auxiliary_loss_mlp": 0.01032416, + "balance_loss_clip": 1.03940797, + "balance_loss_mlp": 1.01919568, + "epoch": 0.7335036825492259, + "flos": 17125332318720.0, + "grad_norm": 2.5044351330443377, + "language_loss": 0.76926482, + "learning_rate": 6.997495373693729e-07, + "loss": 0.79052913, + "num_input_tokens_seen": 263254365, + "step": 12200, + "time_per_iteration": 2.6367027759552 + }, + { + "auxiliary_loss_clip": 0.01074082, + "auxiliary_loss_mlp": 0.01027226, + "balance_loss_clip": 1.03776014, + "balance_loss_mlp": 1.01524007, + "epoch": 0.7335638058018938, + "flos": 23731818307200.0, + "grad_norm": 2.389152390847936, + "language_loss": 0.61618876, + "learning_rate": 6.994536384938754e-07, + "loss": 0.63720185, + "num_input_tokens_seen": 263275880, + "step": 12201, + "time_per_iteration": 2.6798954010009766 + }, + { + "auxiliary_loss_clip": 0.0107342, + "auxiliary_loss_mlp": 0.00770019, + "balance_loss_clip": 1.03417397, + "balance_loss_mlp": 1.00014138, + "epoch": 0.7336239290545619, + "flos": 34933289679360.0, + "grad_norm": 2.0307356501592526, + "language_loss": 0.52253979, + "learning_rate": 6.991577889352264e-07, + "loss": 0.5409742, + "num_input_tokens_seen": 263298315, + "step": 12202, + "time_per_iteration": 2.8340702056884766 + }, + { + "auxiliary_loss_clip": 0.01087087, + "auxiliary_loss_mlp": 0.01030166, + "balance_loss_clip": 1.03677177, + "balance_loss_mlp": 1.017923, + "epoch": 0.7336840523072298, + "flos": 21103049456640.0, + "grad_norm": 1.7212231979753123, + "language_loss": 0.68485624, + "learning_rate": 6.98861988704645e-07, + "loss": 0.70602876, + "num_input_tokens_seen": 263318615, + "step": 12203, + "time_per_iteration": 2.642812967300415 + }, + { + "auxiliary_loss_clip": 0.01088423, + "auxiliary_loss_mlp": 0.01037493, + "balance_loss_clip": 1.03938603, + "balance_loss_mlp": 1.02476776, + "epoch": 0.7337441755598978, + "flos": 24024418496640.0, + "grad_norm": 2.034834601717817, + "language_loss": 0.6607222, + "learning_rate": 6.985662378133474e-07, + "loss": 0.68198133, + "num_input_tokens_seen": 263336705, + "step": 12204, + "time_per_iteration": 2.74241042137146 + }, + { + "auxiliary_loss_clip": 0.01089625, + "auxiliary_loss_mlp": 0.01034455, + "balance_loss_clip": 1.04081655, + "balance_loss_mlp": 1.02211094, + "epoch": 0.7338042988125658, + "flos": 22711309004160.0, + "grad_norm": 1.8580582529828333, + "language_loss": 0.77225935, + "learning_rate": 6.982705362725479e-07, + "loss": 0.79350007, + "num_input_tokens_seen": 263355065, + "step": 12205, + "time_per_iteration": 2.6422648429870605 + }, + { + "auxiliary_loss_clip": 0.01058875, + "auxiliary_loss_mlp": 0.01032513, + "balance_loss_clip": 1.03662992, + "balance_loss_mlp": 1.02064013, + "epoch": 0.7338644220652337, + "flos": 21360996000000.0, + "grad_norm": 2.159301504218906, + "language_loss": 0.79434526, + "learning_rate": 6.979748840934601e-07, + "loss": 0.8152591, + "num_input_tokens_seen": 263374460, + "step": 12206, + "time_per_iteration": 2.722921848297119 + }, + { + "auxiliary_loss_clip": 0.01071317, + "auxiliary_loss_mlp": 0.01031287, + "balance_loss_clip": 1.03451514, + "balance_loss_mlp": 1.01825154, + "epoch": 0.7339245453179017, + "flos": 30920236536960.0, + "grad_norm": 2.0535884600804817, + "language_loss": 0.71176481, + "learning_rate": 6.976792812872958e-07, + "loss": 0.73279089, + "num_input_tokens_seen": 263393610, + "step": 12207, + "time_per_iteration": 2.9302005767822266 + }, + { + "auxiliary_loss_clip": 0.01014266, + "auxiliary_loss_mlp": 0.01003684, + "balance_loss_clip": 1.01024389, + "balance_loss_mlp": 1.00252759, + "epoch": 0.7339846685705697, + "flos": 67899429072000.0, + "grad_norm": 0.7780632600453249, + "language_loss": 0.54746544, + "learning_rate": 6.97383727865263e-07, + "loss": 0.56764495, + "num_input_tokens_seen": 263450340, + "step": 12208, + "time_per_iteration": 3.267242431640625 + }, + { + "auxiliary_loss_clip": 0.01111313, + "auxiliary_loss_mlp": 0.01029971, + "balance_loss_clip": 1.03991294, + "balance_loss_mlp": 1.01901555, + "epoch": 0.7340447918232377, + "flos": 22236749493120.0, + "grad_norm": 1.4520136816915177, + "language_loss": 0.8051306, + "learning_rate": 6.970882238385703e-07, + "loss": 0.82654339, + "num_input_tokens_seen": 263471735, + "step": 12209, + "time_per_iteration": 2.6250216960906982 + }, + { + "auxiliary_loss_clip": 0.01108587, + "auxiliary_loss_mlp": 0.01033248, + "balance_loss_clip": 1.0370816, + "balance_loss_mlp": 1.02134514, + "epoch": 0.7341049150759056, + "flos": 23764784014080.0, + "grad_norm": 1.461722216284673, + "language_loss": 0.79026657, + "learning_rate": 6.96792769218423e-07, + "loss": 0.81168497, + "num_input_tokens_seen": 263493245, + "step": 12210, + "time_per_iteration": 2.5592970848083496 + }, + { + "auxiliary_loss_clip": 0.01108387, + "auxiliary_loss_mlp": 0.01029679, + "balance_loss_clip": 1.03799284, + "balance_loss_mlp": 1.01709008, + "epoch": 0.7341650383285736, + "flos": 17236547804160.0, + "grad_norm": 1.73695170749579, + "language_loss": 0.76122808, + "learning_rate": 6.964973640160236e-07, + "loss": 0.78260869, + "num_input_tokens_seen": 263511660, + "step": 12211, + "time_per_iteration": 2.571751117706299 + }, + { + "auxiliary_loss_clip": 0.01087498, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.03891158, + "balance_loss_mlp": 1.018592, + "epoch": 0.7342251615812415, + "flos": 23403953940480.0, + "grad_norm": 6.531715121329498, + "language_loss": 0.71997905, + "learning_rate": 6.962020082425748e-07, + "loss": 0.74116814, + "num_input_tokens_seen": 263530875, + "step": 12212, + "time_per_iteration": 2.6509475708007812 + }, + { + "auxiliary_loss_clip": 0.01112722, + "auxiliary_loss_mlp": 0.01033281, + "balance_loss_clip": 1.04100943, + "balance_loss_mlp": 1.02054381, + "epoch": 0.7342852848339095, + "flos": 22747183712640.0, + "grad_norm": 1.5833725401172443, + "language_loss": 0.68744397, + "learning_rate": 6.959067019092766e-07, + "loss": 0.70890403, + "num_input_tokens_seen": 263551585, + "step": 12213, + "time_per_iteration": 2.5494189262390137 + }, + { + "auxiliary_loss_clip": 0.010305, + "auxiliary_loss_mlp": 0.01005419, + "balance_loss_clip": 1.00768566, + "balance_loss_mlp": 1.004251, + "epoch": 0.7343454080865774, + "flos": 53942353925760.0, + "grad_norm": 0.7305513742092771, + "language_loss": 0.54231656, + "learning_rate": 6.956114450273276e-07, + "loss": 0.56267571, + "num_input_tokens_seen": 263609545, + "step": 12214, + "time_per_iteration": 3.0239064693450928 + }, + { + "auxiliary_loss_clip": 0.01112827, + "auxiliary_loss_mlp": 0.01031447, + "balance_loss_clip": 1.03797483, + "balance_loss_mlp": 1.01904964, + "epoch": 0.7344055313392455, + "flos": 12166859255040.0, + "grad_norm": 1.9946109817082227, + "language_loss": 0.70621991, + "learning_rate": 6.953162376079233e-07, + "loss": 0.72766268, + "num_input_tokens_seen": 263627880, + "step": 12215, + "time_per_iteration": 2.5570547580718994 + }, + { + "auxiliary_loss_clip": 0.01082063, + "auxiliary_loss_mlp": 0.01033308, + "balance_loss_clip": 1.03650701, + "balance_loss_mlp": 1.02130389, + "epoch": 0.7344656545919134, + "flos": 18550052346240.0, + "grad_norm": 1.5883175175393598, + "language_loss": 0.72867477, + "learning_rate": 6.950210796622573e-07, + "loss": 0.74982846, + "num_input_tokens_seen": 263645665, + "step": 12216, + "time_per_iteration": 2.621229887008667 + }, + { + "auxiliary_loss_clip": 0.0111704, + "auxiliary_loss_mlp": 0.01039831, + "balance_loss_clip": 1.0392859, + "balance_loss_mlp": 1.02483487, + "epoch": 0.7345257778445814, + "flos": 23661649088640.0, + "grad_norm": 1.6902289453280186, + "language_loss": 0.78386879, + "learning_rate": 6.947259712015236e-07, + "loss": 0.80543745, + "num_input_tokens_seen": 263668170, + "step": 12217, + "time_per_iteration": 2.594928503036499 + }, + { + "auxiliary_loss_clip": 0.01072057, + "auxiliary_loss_mlp": 0.01027279, + "balance_loss_clip": 1.03669691, + "balance_loss_mlp": 1.01602566, + "epoch": 0.7345859010972494, + "flos": 13808659127040.0, + "grad_norm": 1.9223508730662753, + "language_loss": 0.77991557, + "learning_rate": 6.94430912236911e-07, + "loss": 0.80090904, + "num_input_tokens_seen": 263684190, + "step": 12218, + "time_per_iteration": 4.173985958099365 + }, + { + "auxiliary_loss_clip": 0.01060122, + "auxiliary_loss_mlp": 0.01038245, + "balance_loss_clip": 1.03246057, + "balance_loss_mlp": 1.02410722, + "epoch": 0.7346460243499173, + "flos": 22272731942400.0, + "grad_norm": 1.7300149246142222, + "language_loss": 0.71998847, + "learning_rate": 6.941359027796092e-07, + "loss": 0.74097216, + "num_input_tokens_seen": 263702095, + "step": 12219, + "time_per_iteration": 2.7360141277313232 + }, + { + "auxiliary_loss_clip": 0.01084965, + "auxiliary_loss_mlp": 0.01031146, + "balance_loss_clip": 1.03496408, + "balance_loss_mlp": 1.01936817, + "epoch": 0.7347061476025853, + "flos": 23255247634560.0, + "grad_norm": 6.086208044404794, + "language_loss": 0.74677491, + "learning_rate": 6.938409428408061e-07, + "loss": 0.76793599, + "num_input_tokens_seen": 263721385, + "step": 12220, + "time_per_iteration": 4.237574577331543 + }, + { + "auxiliary_loss_clip": 0.01101059, + "auxiliary_loss_mlp": 0.01032621, + "balance_loss_clip": 1.03634357, + "balance_loss_mlp": 1.02002692, + "epoch": 0.7347662708552533, + "flos": 15267565923840.0, + "grad_norm": 1.7582091320116324, + "language_loss": 0.65720487, + "learning_rate": 6.93546032431684e-07, + "loss": 0.67854166, + "num_input_tokens_seen": 263737835, + "step": 12221, + "time_per_iteration": 4.174748182296753 + }, + { + "auxiliary_loss_clip": 0.0108489, + "auxiliary_loss_mlp": 0.01039185, + "balance_loss_clip": 1.0352186, + "balance_loss_mlp": 1.02567315, + "epoch": 0.7348263941079213, + "flos": 24859987649280.0, + "grad_norm": 1.907694939441604, + "language_loss": 0.69323444, + "learning_rate": 6.932511715634273e-07, + "loss": 0.71447521, + "num_input_tokens_seen": 263756480, + "step": 12222, + "time_per_iteration": 2.704784393310547 + }, + { + "auxiliary_loss_clip": 0.01063424, + "auxiliary_loss_mlp": 0.01030995, + "balance_loss_clip": 1.03514957, + "balance_loss_mlp": 1.01988506, + "epoch": 0.7348865173605892, + "flos": 24352103295360.0, + "grad_norm": 1.9184398882939155, + "language_loss": 0.66062474, + "learning_rate": 6.92956360247217e-07, + "loss": 0.68156886, + "num_input_tokens_seen": 263776440, + "step": 12223, + "time_per_iteration": 2.8198130130767822 + }, + { + "auxiliary_loss_clip": 0.01094086, + "auxiliary_loss_mlp": 0.01029505, + "balance_loss_clip": 1.03635502, + "balance_loss_mlp": 1.01708925, + "epoch": 0.7349466406132572, + "flos": 20004613597440.0, + "grad_norm": 1.6947927626477597, + "language_loss": 0.72573948, + "learning_rate": 6.926615984942332e-07, + "loss": 0.7469753, + "num_input_tokens_seen": 263793700, + "step": 12224, + "time_per_iteration": 4.08525276184082 + }, + { + "auxiliary_loss_clip": 0.01085057, + "auxiliary_loss_mlp": 0.01029564, + "balance_loss_clip": 1.04095888, + "balance_loss_mlp": 1.01713049, + "epoch": 0.7350067638659251, + "flos": 29825068815360.0, + "grad_norm": 1.830057292997908, + "language_loss": 0.72199714, + "learning_rate": 6.92366886315652e-07, + "loss": 0.74314332, + "num_input_tokens_seen": 263814620, + "step": 12225, + "time_per_iteration": 2.736055850982666 + }, + { + "auxiliary_loss_clip": 0.0111514, + "auxiliary_loss_mlp": 0.01035041, + "balance_loss_clip": 1.03917527, + "balance_loss_mlp": 1.02134943, + "epoch": 0.7350668871185931, + "flos": 21866150920320.0, + "grad_norm": 1.7365051701265057, + "language_loss": 0.76401973, + "learning_rate": 6.920722237226501e-07, + "loss": 0.78552151, + "num_input_tokens_seen": 263832725, + "step": 12226, + "time_per_iteration": 2.578805446624756 + }, + { + "auxiliary_loss_clip": 0.01085278, + "auxiliary_loss_mlp": 0.0103433, + "balance_loss_clip": 1.03646374, + "balance_loss_mlp": 1.01977456, + "epoch": 0.735127010371261, + "flos": 22566122231040.0, + "grad_norm": 1.442598448518307, + "language_loss": 0.6717149, + "learning_rate": 6.917776107264008e-07, + "loss": 0.69291103, + "num_input_tokens_seen": 263853850, + "step": 12227, + "time_per_iteration": 2.638720989227295 + }, + { + "auxiliary_loss_clip": 0.01101144, + "auxiliary_loss_mlp": 0.0103552, + "balance_loss_clip": 1.03755474, + "balance_loss_mlp": 1.02331293, + "epoch": 0.7351871336239291, + "flos": 25884339707520.0, + "grad_norm": 2.1955172179062536, + "language_loss": 0.63554502, + "learning_rate": 6.914830473380749e-07, + "loss": 0.65691161, + "num_input_tokens_seen": 263874760, + "step": 12228, + "time_per_iteration": 2.646679162979126 + }, + { + "auxiliary_loss_clip": 0.0109047, + "auxiliary_loss_mlp": 0.01036115, + "balance_loss_clip": 1.03838301, + "balance_loss_mlp": 1.02450967, + "epoch": 0.735247256876597, + "flos": 17932173569280.0, + "grad_norm": 1.6447533892101769, + "language_loss": 0.63384873, + "learning_rate": 6.911885335688427e-07, + "loss": 0.65511459, + "num_input_tokens_seen": 263893390, + "step": 12229, + "time_per_iteration": 2.626433849334717 + }, + { + "auxiliary_loss_clip": 0.01087319, + "auxiliary_loss_mlp": 0.01037821, + "balance_loss_clip": 1.03916466, + "balance_loss_mlp": 1.02470779, + "epoch": 0.735307380129265, + "flos": 28875159694080.0, + "grad_norm": 1.6569871387550634, + "language_loss": 0.73374206, + "learning_rate": 6.908940694298726e-07, + "loss": 0.75499344, + "num_input_tokens_seen": 263911180, + "step": 12230, + "time_per_iteration": 2.719008207321167 + }, + { + "auxiliary_loss_clip": 0.01058297, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.03558922, + "balance_loss_mlp": 1.0192132, + "epoch": 0.7353675033819329, + "flos": 13625658311040.0, + "grad_norm": 2.410798964065256, + "language_loss": 0.72446096, + "learning_rate": 6.90599654932332e-07, + "loss": 0.74536955, + "num_input_tokens_seen": 263928975, + "step": 12231, + "time_per_iteration": 2.7233800888061523 + }, + { + "auxiliary_loss_clip": 0.01102609, + "auxiliary_loss_mlp": 0.01037553, + "balance_loss_clip": 1.0392592, + "balance_loss_mlp": 1.0230689, + "epoch": 0.7354276266346009, + "flos": 19463081178240.0, + "grad_norm": 2.5985105749536332, + "language_loss": 0.63813508, + "learning_rate": 6.903052900873823e-07, + "loss": 0.65953672, + "num_input_tokens_seen": 263944495, + "step": 12232, + "time_per_iteration": 2.626089334487915 + }, + { + "auxiliary_loss_clip": 0.0109166, + "auxiliary_loss_mlp": 0.01032177, + "balance_loss_clip": 1.03764665, + "balance_loss_mlp": 1.01987481, + "epoch": 0.735487749887269, + "flos": 15771858917760.0, + "grad_norm": 1.7852756816189446, + "language_loss": 0.75511599, + "learning_rate": 6.900109749061874e-07, + "loss": 0.77635431, + "num_input_tokens_seen": 263961325, + "step": 12233, + "time_per_iteration": 2.614691972732544 + }, + { + "auxiliary_loss_clip": 0.01112187, + "auxiliary_loss_mlp": 0.01028949, + "balance_loss_clip": 1.03919733, + "balance_loss_mlp": 1.01619315, + "epoch": 0.7355478731399369, + "flos": 18260648467200.0, + "grad_norm": 4.244761548872676, + "language_loss": 0.73351365, + "learning_rate": 6.897167093999079e-07, + "loss": 0.75492501, + "num_input_tokens_seen": 263980445, + "step": 12234, + "time_per_iteration": 2.5742101669311523 + }, + { + "auxiliary_loss_clip": 0.01099473, + "auxiliary_loss_mlp": 0.0103086, + "balance_loss_clip": 1.03804564, + "balance_loss_mlp": 1.01721096, + "epoch": 0.7356079963926049, + "flos": 26542043688960.0, + "grad_norm": 2.1824026453384078, + "language_loss": 0.59852672, + "learning_rate": 6.894224935797017e-07, + "loss": 0.61983013, + "num_input_tokens_seen": 263999330, + "step": 12235, + "time_per_iteration": 2.661247730255127 + }, + { + "auxiliary_loss_clip": 0.01088694, + "auxiliary_loss_mlp": 0.01027233, + "balance_loss_clip": 1.03844726, + "balance_loss_mlp": 1.01487708, + "epoch": 0.7356681196452728, + "flos": 10778624467200.0, + "grad_norm": 2.763935396627176, + "language_loss": 0.85834122, + "learning_rate": 6.891283274567259e-07, + "loss": 0.87950051, + "num_input_tokens_seen": 264014150, + "step": 12236, + "time_per_iteration": 2.589035749435425 + }, + { + "auxiliary_loss_clip": 0.0110083, + "auxiliary_loss_mlp": 0.00769741, + "balance_loss_clip": 1.03816271, + "balance_loss_mlp": 1.00019503, + "epoch": 0.7357282428979408, + "flos": 19718693337600.0, + "grad_norm": 5.1654234015242215, + "language_loss": 0.69555867, + "learning_rate": 6.888342110421364e-07, + "loss": 0.71426433, + "num_input_tokens_seen": 264033140, + "step": 12237, + "time_per_iteration": 2.652851104736328 + }, + { + "auxiliary_loss_clip": 0.01022711, + "auxiliary_loss_mlp": 0.01031396, + "balance_loss_clip": 1.02870941, + "balance_loss_mlp": 1.01868236, + "epoch": 0.7357883661506087, + "flos": 19464014931840.0, + "grad_norm": 1.6842160267600648, + "language_loss": 0.72287041, + "learning_rate": 6.885401443470839e-07, + "loss": 0.74341154, + "num_input_tokens_seen": 264052105, + "step": 12238, + "time_per_iteration": 2.887967586517334 + }, + { + "auxiliary_loss_clip": 0.0108237, + "auxiliary_loss_mlp": 0.01030077, + "balance_loss_clip": 1.03519797, + "balance_loss_mlp": 1.01672542, + "epoch": 0.7358484894032767, + "flos": 27123006263040.0, + "grad_norm": 2.119394608491001, + "language_loss": 0.72818553, + "learning_rate": 6.882461273827205e-07, + "loss": 0.74930996, + "num_input_tokens_seen": 264070690, + "step": 12239, + "time_per_iteration": 3.308215618133545 + }, + { + "auxiliary_loss_clip": 0.01079481, + "auxiliary_loss_mlp": 0.0103047, + "balance_loss_clip": 1.03759682, + "balance_loss_mlp": 1.01827478, + "epoch": 0.7359086126559446, + "flos": 24502282058880.0, + "grad_norm": 1.656407411551667, + "language_loss": 0.78889048, + "learning_rate": 6.879521601601954e-07, + "loss": 0.80998993, + "num_input_tokens_seen": 264094225, + "step": 12240, + "time_per_iteration": 2.6716065406799316 + }, + { + "auxiliary_loss_clip": 0.01101629, + "auxiliary_loss_mlp": 0.01037535, + "balance_loss_clip": 1.03955805, + "balance_loss_mlp": 1.02480888, + "epoch": 0.7359687359086127, + "flos": 23331270769920.0, + "grad_norm": 1.888852774104125, + "language_loss": 0.82579136, + "learning_rate": 6.876582426906565e-07, + "loss": 0.84718299, + "num_input_tokens_seen": 264113190, + "step": 12241, + "time_per_iteration": 2.687603712081909 + }, + { + "auxiliary_loss_clip": 0.01097273, + "auxiliary_loss_mlp": 0.01025951, + "balance_loss_clip": 1.03536153, + "balance_loss_mlp": 1.01373816, + "epoch": 0.7360288591612806, + "flos": 20193396503040.0, + "grad_norm": 1.823724311239111, + "language_loss": 0.78747702, + "learning_rate": 6.873643749852484e-07, + "loss": 0.80870926, + "num_input_tokens_seen": 264132050, + "step": 12242, + "time_per_iteration": 2.6332826614379883 + }, + { + "auxiliary_loss_clip": 0.01062855, + "auxiliary_loss_mlp": 0.01032129, + "balance_loss_clip": 1.03485787, + "balance_loss_mlp": 1.01942182, + "epoch": 0.7360889824139486, + "flos": 24972783333120.0, + "grad_norm": 1.7248872165867588, + "language_loss": 0.79574555, + "learning_rate": 6.870705570551145e-07, + "loss": 0.81669545, + "num_input_tokens_seen": 264152800, + "step": 12243, + "time_per_iteration": 2.6513876914978027 + }, + { + "auxiliary_loss_clip": 0.01101249, + "auxiliary_loss_mlp": 0.01032749, + "balance_loss_clip": 1.03733206, + "balance_loss_mlp": 1.01998186, + "epoch": 0.7361491056666165, + "flos": 15012312900480.0, + "grad_norm": 2.291279589424139, + "language_loss": 0.74445826, + "learning_rate": 6.867767889113969e-07, + "loss": 0.76579821, + "num_input_tokens_seen": 264169650, + "step": 12244, + "time_per_iteration": 2.4683594703674316 + }, + { + "auxiliary_loss_clip": 0.01094664, + "auxiliary_loss_mlp": 0.01032807, + "balance_loss_clip": 1.03583598, + "balance_loss_mlp": 1.02007556, + "epoch": 0.7362092289192845, + "flos": 22930400010240.0, + "grad_norm": 1.867590406442262, + "language_loss": 0.69203222, + "learning_rate": 6.864830705652347e-07, + "loss": 0.7133069, + "num_input_tokens_seen": 264190530, + "step": 12245, + "time_per_iteration": 2.687621831893921 + }, + { + "auxiliary_loss_clip": 0.01072242, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.03500962, + "balance_loss_mlp": 1.02093101, + "epoch": 0.7362693521719526, + "flos": 20702681487360.0, + "grad_norm": 1.5504904420549481, + "language_loss": 0.73484623, + "learning_rate": 6.861894020277658e-07, + "loss": 0.75591099, + "num_input_tokens_seen": 264210820, + "step": 12246, + "time_per_iteration": 2.73628568649292 + }, + { + "auxiliary_loss_clip": 0.01084679, + "auxiliary_loss_mlp": 0.01025875, + "balance_loss_clip": 1.03512716, + "balance_loss_mlp": 1.01378119, + "epoch": 0.7363294754246205, + "flos": 13111381336320.0, + "grad_norm": 2.1569575321455163, + "language_loss": 0.73685145, + "learning_rate": 6.858957833101266e-07, + "loss": 0.75795692, + "num_input_tokens_seen": 264227430, + "step": 12247, + "time_per_iteration": 2.5930237770080566 + }, + { + "auxiliary_loss_clip": 0.01101325, + "auxiliary_loss_mlp": 0.01032427, + "balance_loss_clip": 1.04162931, + "balance_loss_mlp": 1.02031505, + "epoch": 0.7363895986772885, + "flos": 14027426910720.0, + "grad_norm": 1.6102027523975817, + "language_loss": 0.7423265, + "learning_rate": 6.856022144234526e-07, + "loss": 0.76366401, + "num_input_tokens_seen": 264245230, + "step": 12248, + "time_per_iteration": 2.5792789459228516 + }, + { + "auxiliary_loss_clip": 0.0109033, + "auxiliary_loss_mlp": 0.01033502, + "balance_loss_clip": 1.03816319, + "balance_loss_mlp": 1.02057934, + "epoch": 0.7364497219299564, + "flos": 19719986227200.0, + "grad_norm": 1.8750204418443517, + "language_loss": 0.72477007, + "learning_rate": 6.853086953788727e-07, + "loss": 0.7460084, + "num_input_tokens_seen": 264263945, + "step": 12249, + "time_per_iteration": 2.624386787414551 + }, + { + "auxiliary_loss_clip": 0.01089724, + "auxiliary_loss_mlp": 0.01033559, + "balance_loss_clip": 1.03801394, + "balance_loss_mlp": 1.02015996, + "epoch": 0.7365098451826244, + "flos": 21361391049600.0, + "grad_norm": 2.586847113789983, + "language_loss": 0.77382159, + "learning_rate": 6.850152261875189e-07, + "loss": 0.7950545, + "num_input_tokens_seen": 264281500, + "step": 12250, + "time_per_iteration": 2.6388142108917236 + }, + { + "auxiliary_loss_clip": 0.01066882, + "auxiliary_loss_mlp": 0.01031238, + "balance_loss_clip": 1.03667164, + "balance_loss_mlp": 1.01857233, + "epoch": 0.7365699684352923, + "flos": 23368222886400.0, + "grad_norm": 1.6519467305081468, + "language_loss": 0.71352232, + "learning_rate": 6.8472180686052e-07, + "loss": 0.73450345, + "num_input_tokens_seen": 264301625, + "step": 12251, + "time_per_iteration": 2.7391629219055176 + }, + { + "auxiliary_loss_clip": 0.01095208, + "auxiliary_loss_mlp": 0.01035371, + "balance_loss_clip": 1.03801441, + "balance_loss_mlp": 1.0229789, + "epoch": 0.7366300916879603, + "flos": 59524879927680.0, + "grad_norm": 1.575545988693255, + "language_loss": 0.65908438, + "learning_rate": 6.844284374090015e-07, + "loss": 0.68039018, + "num_input_tokens_seen": 264323975, + "step": 12252, + "time_per_iteration": 2.9795963764190674 + }, + { + "auxiliary_loss_clip": 0.0106263, + "auxiliary_loss_mlp": 0.01035896, + "balance_loss_clip": 1.03544736, + "balance_loss_mlp": 1.02261591, + "epoch": 0.7366902149406283, + "flos": 20923137210240.0, + "grad_norm": 1.669933486125426, + "language_loss": 0.79418141, + "learning_rate": 6.841351178440884e-07, + "loss": 0.81516671, + "num_input_tokens_seen": 264343785, + "step": 12253, + "time_per_iteration": 2.762692451477051 + }, + { + "auxiliary_loss_clip": 0.01107479, + "auxiliary_loss_mlp": 0.00769571, + "balance_loss_clip": 1.03836572, + "balance_loss_mlp": 1.00025702, + "epoch": 0.7367503381932963, + "flos": 17348158339200.0, + "grad_norm": 2.0410258772790604, + "language_loss": 0.76204622, + "learning_rate": 6.83841848176905e-07, + "loss": 0.78081673, + "num_input_tokens_seen": 264361130, + "step": 12254, + "time_per_iteration": 2.518159866333008 + }, + { + "auxiliary_loss_clip": 0.01085242, + "auxiliary_loss_mlp": 0.01042117, + "balance_loss_clip": 1.03690898, + "balance_loss_mlp": 1.02805638, + "epoch": 0.7368104614459642, + "flos": 17821317219840.0, + "grad_norm": 4.287032087933439, + "language_loss": 0.7025637, + "learning_rate": 6.835486284185692e-07, + "loss": 0.72383738, + "num_input_tokens_seen": 264376965, + "step": 12255, + "time_per_iteration": 2.589442729949951 + }, + { + "auxiliary_loss_clip": 0.0110157, + "auxiliary_loss_mlp": 0.01029844, + "balance_loss_clip": 1.03971469, + "balance_loss_mlp": 1.01649857, + "epoch": 0.7368705846986322, + "flos": 24606099342720.0, + "grad_norm": 1.8002690456311732, + "language_loss": 0.75496477, + "learning_rate": 6.832554585802012e-07, + "loss": 0.77627891, + "num_input_tokens_seen": 264396310, + "step": 12256, + "time_per_iteration": 2.6408097743988037 + }, + { + "auxiliary_loss_clip": 0.0110194, + "auxiliary_loss_mlp": 0.01031829, + "balance_loss_clip": 1.03902447, + "balance_loss_mlp": 1.01861525, + "epoch": 0.7369307079513001, + "flos": 34970169968640.0, + "grad_norm": 1.8159152177837306, + "language_loss": 0.73517919, + "learning_rate": 6.829623386729182e-07, + "loss": 0.75651693, + "num_input_tokens_seen": 264418085, + "step": 12257, + "time_per_iteration": 2.6984493732452393 + }, + { + "auxiliary_loss_clip": 0.01092873, + "auxiliary_loss_mlp": 0.01038875, + "balance_loss_clip": 1.03521228, + "balance_loss_mlp": 1.02668011, + "epoch": 0.7369908312039681, + "flos": 21214588164480.0, + "grad_norm": 1.793311215899037, + "language_loss": 0.78370535, + "learning_rate": 6.826692687078362e-07, + "loss": 0.80502284, + "num_input_tokens_seen": 264437595, + "step": 12258, + "time_per_iteration": 4.2666544914245605 + }, + { + "auxiliary_loss_clip": 0.01103154, + "auxiliary_loss_mlp": 0.0103457, + "balance_loss_clip": 1.03888559, + "balance_loss_mlp": 1.02195156, + "epoch": 0.7370509544566362, + "flos": 23623655477760.0, + "grad_norm": 1.4256743681063133, + "language_loss": 0.66447318, + "learning_rate": 6.823762486960674e-07, + "loss": 0.68585044, + "num_input_tokens_seen": 264457385, + "step": 12259, + "time_per_iteration": 2.6215436458587646 + }, + { + "auxiliary_loss_clip": 0.01101635, + "auxiliary_loss_mlp": 0.01036273, + "balance_loss_clip": 1.0403527, + "balance_loss_mlp": 1.02288604, + "epoch": 0.7371110777093041, + "flos": 24827704300800.0, + "grad_norm": 1.885600567170779, + "language_loss": 0.73500818, + "learning_rate": 6.820832786487225e-07, + "loss": 0.75638729, + "num_input_tokens_seen": 264477205, + "step": 12260, + "time_per_iteration": 5.883468866348267 + }, + { + "auxiliary_loss_clip": 0.01096844, + "auxiliary_loss_mlp": 0.010343, + "balance_loss_clip": 1.0374378, + "balance_loss_mlp": 1.02105618, + "epoch": 0.7371712009619721, + "flos": 23149491016320.0, + "grad_norm": 1.6200420783650578, + "language_loss": 0.73566377, + "learning_rate": 6.817903585769125e-07, + "loss": 0.75697523, + "num_input_tokens_seen": 264497195, + "step": 12261, + "time_per_iteration": 2.611388683319092 + }, + { + "auxiliary_loss_clip": 0.01091123, + "auxiliary_loss_mlp": 0.01034498, + "balance_loss_clip": 1.03705454, + "balance_loss_mlp": 1.02096152, + "epoch": 0.73723132421464, + "flos": 23112898035840.0, + "grad_norm": 1.9187106646052445, + "language_loss": 0.66943705, + "learning_rate": 6.814974884917438e-07, + "loss": 0.69069326, + "num_input_tokens_seen": 264516950, + "step": 12262, + "time_per_iteration": 2.605332374572754 + }, + { + "auxiliary_loss_clip": 0.01112628, + "auxiliary_loss_mlp": 0.01032891, + "balance_loss_clip": 1.03917944, + "balance_loss_mlp": 1.01943254, + "epoch": 0.737291447467308, + "flos": 19273328605440.0, + "grad_norm": 2.61578609371499, + "language_loss": 0.88660431, + "learning_rate": 6.81204668404322e-07, + "loss": 0.90805948, + "num_input_tokens_seen": 264532675, + "step": 12263, + "time_per_iteration": 4.228296279907227 + }, + { + "auxiliary_loss_clip": 0.01107513, + "auxiliary_loss_mlp": 0.01028636, + "balance_loss_clip": 1.03926718, + "balance_loss_mlp": 1.01731133, + "epoch": 0.7373515707199759, + "flos": 25118257415040.0, + "grad_norm": 1.6036669439356246, + "language_loss": 0.67279935, + "learning_rate": 6.809118983257522e-07, + "loss": 0.69416088, + "num_input_tokens_seen": 264555635, + "step": 12264, + "time_per_iteration": 2.6264944076538086 + }, + { + "auxiliary_loss_clip": 0.01107424, + "auxiliary_loss_mlp": 0.01032446, + "balance_loss_clip": 1.0380187, + "balance_loss_mlp": 1.02020919, + "epoch": 0.737411693972644, + "flos": 32408481767040.0, + "grad_norm": 5.628920745941572, + "language_loss": 0.80262679, + "learning_rate": 6.806191782671356e-07, + "loss": 0.82402551, + "num_input_tokens_seen": 264573140, + "step": 12265, + "time_per_iteration": 2.6175074577331543 + }, + { + "auxiliary_loss_clip": 0.01104877, + "auxiliary_loss_mlp": 0.01031656, + "balance_loss_clip": 1.03860068, + "balance_loss_mlp": 1.01912761, + "epoch": 0.7374718172253119, + "flos": 24315797623680.0, + "grad_norm": 2.6431361651655094, + "language_loss": 0.74271613, + "learning_rate": 6.803265082395711e-07, + "loss": 0.76408148, + "num_input_tokens_seen": 264591610, + "step": 12266, + "time_per_iteration": 2.6342427730560303 + }, + { + "auxiliary_loss_clip": 0.01102733, + "auxiliary_loss_mlp": 0.01039895, + "balance_loss_clip": 1.03989673, + "balance_loss_mlp": 1.02624547, + "epoch": 0.7375319404779799, + "flos": 27156115624320.0, + "grad_norm": 1.6143075154919249, + "language_loss": 0.72911859, + "learning_rate": 6.800338882541576e-07, + "loss": 0.75054485, + "num_input_tokens_seen": 264611170, + "step": 12267, + "time_per_iteration": 2.638545036315918 + }, + { + "auxiliary_loss_clip": 0.01075616, + "auxiliary_loss_mlp": 0.01036972, + "balance_loss_clip": 1.03733301, + "balance_loss_mlp": 1.02528942, + "epoch": 0.7375920637306478, + "flos": 18879999701760.0, + "grad_norm": 2.114502804369275, + "language_loss": 0.83173954, + "learning_rate": 6.797413183219923e-07, + "loss": 0.85286546, + "num_input_tokens_seen": 264629365, + "step": 12268, + "time_per_iteration": 2.6624231338500977 + }, + { + "auxiliary_loss_clip": 0.0111022, + "auxiliary_loss_mlp": 0.01043154, + "balance_loss_clip": 1.03934455, + "balance_loss_mlp": 1.03039253, + "epoch": 0.7376521869833158, + "flos": 15669765486720.0, + "grad_norm": 1.8306850804928718, + "language_loss": 0.73056579, + "learning_rate": 6.794487984541677e-07, + "loss": 0.75209951, + "num_input_tokens_seen": 264647915, + "step": 12269, + "time_per_iteration": 2.5542378425598145 + }, + { + "auxiliary_loss_clip": 0.01086703, + "auxiliary_loss_mlp": 0.01036517, + "balance_loss_clip": 1.03575897, + "balance_loss_mlp": 1.02278399, + "epoch": 0.7377123102359837, + "flos": 36971973901440.0, + "grad_norm": 2.033998429253707, + "language_loss": 0.70437771, + "learning_rate": 6.791563286617776e-07, + "loss": 0.72560984, + "num_input_tokens_seen": 264669620, + "step": 12270, + "time_per_iteration": 2.738266706466675 + }, + { + "auxiliary_loss_clip": 0.01096302, + "auxiliary_loss_mlp": 0.01032958, + "balance_loss_clip": 1.03592134, + "balance_loss_mlp": 1.02121568, + "epoch": 0.7377724334886517, + "flos": 24496284487680.0, + "grad_norm": 1.5966861797114758, + "language_loss": 0.69652647, + "learning_rate": 6.788639089559119e-07, + "loss": 0.71781904, + "num_input_tokens_seen": 264689345, + "step": 12271, + "time_per_iteration": 2.664652109146118 + }, + { + "auxiliary_loss_clip": 0.01080906, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.03928661, + "balance_loss_mlp": 1.02066565, + "epoch": 0.7378325567413198, + "flos": 24390025079040.0, + "grad_norm": 2.652639550639501, + "language_loss": 0.67802662, + "learning_rate": 6.785715393476586e-07, + "loss": 0.69917965, + "num_input_tokens_seen": 264707625, + "step": 12272, + "time_per_iteration": 2.6848604679107666 + }, + { + "auxiliary_loss_clip": 0.01086013, + "auxiliary_loss_mlp": 0.01030527, + "balance_loss_clip": 1.03750646, + "balance_loss_mlp": 1.01848674, + "epoch": 0.7378926799939877, + "flos": 17416388223360.0, + "grad_norm": 2.2309811346874655, + "language_loss": 0.780334, + "learning_rate": 6.782792198481049e-07, + "loss": 0.80149937, + "num_input_tokens_seen": 264725575, + "step": 12273, + "time_per_iteration": 2.635556936264038 + }, + { + "auxiliary_loss_clip": 0.01109904, + "auxiliary_loss_mlp": 0.01030975, + "balance_loss_clip": 1.03768742, + "balance_loss_mlp": 1.01857686, + "epoch": 0.7379528032466557, + "flos": 18474208778880.0, + "grad_norm": 1.8331912360811773, + "language_loss": 0.83564162, + "learning_rate": 6.779869504683355e-07, + "loss": 0.85705042, + "num_input_tokens_seen": 264742855, + "step": 12274, + "time_per_iteration": 2.5715138912200928 + }, + { + "auxiliary_loss_clip": 0.01091523, + "auxiliary_loss_mlp": 0.00771783, + "balance_loss_clip": 1.03963578, + "balance_loss_mlp": 1.00021505, + "epoch": 0.7380129264993236, + "flos": 17821999578240.0, + "grad_norm": 2.3015182106996237, + "language_loss": 0.73600042, + "learning_rate": 6.776947312194341e-07, + "loss": 0.75463349, + "num_input_tokens_seen": 264761155, + "step": 12275, + "time_per_iteration": 2.715363025665283 + }, + { + "auxiliary_loss_clip": 0.01078211, + "auxiliary_loss_mlp": 0.01054085, + "balance_loss_clip": 1.03664327, + "balance_loss_mlp": 1.03894567, + "epoch": 0.7380730497519916, + "flos": 22997372918400.0, + "grad_norm": 1.6539392854769155, + "language_loss": 0.73462373, + "learning_rate": 6.774025621124813e-07, + "loss": 0.75594664, + "num_input_tokens_seen": 264780660, + "step": 12276, + "time_per_iteration": 2.7231481075286865 + }, + { + "auxiliary_loss_clip": 0.01112925, + "auxiliary_loss_mlp": 0.01031524, + "balance_loss_clip": 1.03907084, + "balance_loss_mlp": 1.01874495, + "epoch": 0.7381331730046595, + "flos": 20266259241600.0, + "grad_norm": 1.9864441113033549, + "language_loss": 0.7796191, + "learning_rate": 6.771104431585551e-07, + "loss": 0.80106354, + "num_input_tokens_seen": 264798850, + "step": 12277, + "time_per_iteration": 2.5575850009918213 + }, + { + "auxiliary_loss_clip": 0.01110863, + "auxiliary_loss_mlp": 0.01038626, + "balance_loss_clip": 1.03995776, + "balance_loss_mlp": 1.0259068, + "epoch": 0.7381932962573275, + "flos": 19754532132480.0, + "grad_norm": 2.416998693757566, + "language_loss": 0.78511059, + "learning_rate": 6.768183743687338e-07, + "loss": 0.80660546, + "num_input_tokens_seen": 264816795, + "step": 12278, + "time_per_iteration": 2.542168617248535 + }, + { + "auxiliary_loss_clip": 0.01102779, + "auxiliary_loss_mlp": 0.00771237, + "balance_loss_clip": 1.03840756, + "balance_loss_mlp": 1.00024569, + "epoch": 0.7382534195099955, + "flos": 17305316392320.0, + "grad_norm": 2.0236332127409, + "language_loss": 0.72539043, + "learning_rate": 6.765263557540921e-07, + "loss": 0.74413061, + "num_input_tokens_seen": 264834105, + "step": 12279, + "time_per_iteration": 2.612534761428833 + }, + { + "auxiliary_loss_clip": 0.01103104, + "auxiliary_loss_mlp": 0.01035418, + "balance_loss_clip": 1.03738606, + "balance_loss_mlp": 1.02173257, + "epoch": 0.7383135427626635, + "flos": 18697358021760.0, + "grad_norm": 2.394018024730235, + "language_loss": 0.86069536, + "learning_rate": 6.762343873257034e-07, + "loss": 0.88208055, + "num_input_tokens_seen": 264850895, + "step": 12280, + "time_per_iteration": 2.611475944519043 + }, + { + "auxiliary_loss_clip": 0.01073789, + "auxiliary_loss_mlp": 0.01032032, + "balance_loss_clip": 1.03634775, + "balance_loss_mlp": 1.01885295, + "epoch": 0.7383736660153314, + "flos": 20881300844160.0, + "grad_norm": 1.8693617932134328, + "language_loss": 0.72391272, + "learning_rate": 6.759424690946408e-07, + "loss": 0.74497092, + "num_input_tokens_seen": 264869505, + "step": 12281, + "time_per_iteration": 2.718876361846924 + }, + { + "auxiliary_loss_clip": 0.0106943, + "auxiliary_loss_mlp": 0.01035051, + "balance_loss_clip": 1.0354619, + "balance_loss_mlp": 1.02190232, + "epoch": 0.7384337892679994, + "flos": 20663215418880.0, + "grad_norm": 1.705222549149129, + "language_loss": 0.60742152, + "learning_rate": 6.756506010719711e-07, + "loss": 0.62846637, + "num_input_tokens_seen": 264886915, + "step": 12282, + "time_per_iteration": 2.70023775100708 + }, + { + "auxiliary_loss_clip": 0.01077848, + "auxiliary_loss_mlp": 0.01030119, + "balance_loss_clip": 1.03686452, + "balance_loss_mlp": 1.01697028, + "epoch": 0.7384939125206673, + "flos": 29169627390720.0, + "grad_norm": 1.8611774916735326, + "language_loss": 0.6824851, + "learning_rate": 6.753587832687632e-07, + "loss": 0.70356476, + "num_input_tokens_seen": 264910350, + "step": 12283, + "time_per_iteration": 2.758152484893799 + }, + { + "auxiliary_loss_clip": 0.01112935, + "auxiliary_loss_mlp": 0.00771245, + "balance_loss_clip": 1.040452, + "balance_loss_mlp": 1.00015855, + "epoch": 0.7385540357733353, + "flos": 36312833376000.0, + "grad_norm": 1.7271477850401677, + "language_loss": 0.76260293, + "learning_rate": 6.750670156960832e-07, + "loss": 0.78144467, + "num_input_tokens_seen": 264930705, + "step": 12284, + "time_per_iteration": 2.7076218128204346 + }, + { + "auxiliary_loss_clip": 0.01094916, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.03557301, + "balance_loss_mlp": 1.02121985, + "epoch": 0.7386141590260034, + "flos": 20302600826880.0, + "grad_norm": 1.9358750531249929, + "language_loss": 0.68962932, + "learning_rate": 6.747752983649954e-07, + "loss": 0.7109322, + "num_input_tokens_seen": 264946975, + "step": 12285, + "time_per_iteration": 2.572366714477539 + }, + { + "auxiliary_loss_clip": 0.01095815, + "auxiliary_loss_mlp": 0.01038084, + "balance_loss_clip": 1.03904641, + "balance_loss_mlp": 1.02421951, + "epoch": 0.7386742822786713, + "flos": 25483792170240.0, + "grad_norm": 1.9975794318154387, + "language_loss": 0.79803824, + "learning_rate": 6.744836312865602e-07, + "loss": 0.81937724, + "num_input_tokens_seen": 264967665, + "step": 12286, + "time_per_iteration": 2.6924288272857666 + }, + { + "auxiliary_loss_clip": 0.01062201, + "auxiliary_loss_mlp": 0.01027877, + "balance_loss_clip": 1.03638017, + "balance_loss_mlp": 1.01515102, + "epoch": 0.7387344055313393, + "flos": 13771958405760.0, + "grad_norm": 2.075219582835579, + "language_loss": 0.65311086, + "learning_rate": 6.741920144718396e-07, + "loss": 0.67401159, + "num_input_tokens_seen": 264985480, + "step": 12287, + "time_per_iteration": 2.7654411792755127 + }, + { + "auxiliary_loss_clip": 0.010848, + "auxiliary_loss_mlp": 0.01026868, + "balance_loss_clip": 1.03562939, + "balance_loss_mlp": 1.01483417, + "epoch": 0.7387945287840072, + "flos": 27855189095040.0, + "grad_norm": 2.1085520874155046, + "language_loss": 0.76855958, + "learning_rate": 6.739004479318903e-07, + "loss": 0.78967619, + "num_input_tokens_seen": 265004790, + "step": 12288, + "time_per_iteration": 2.6597485542297363 + }, + { + "auxiliary_loss_clip": 0.01104274, + "auxiliary_loss_mlp": 0.00771655, + "balance_loss_clip": 1.04053795, + "balance_loss_mlp": 1.00024295, + "epoch": 0.7388546520366752, + "flos": 44233039388160.0, + "grad_norm": 1.5714095328418676, + "language_loss": 0.58359075, + "learning_rate": 6.736089316777684e-07, + "loss": 0.60235, + "num_input_tokens_seen": 265028790, + "step": 12289, + "time_per_iteration": 2.790731906890869 + }, + { + "auxiliary_loss_clip": 0.01031232, + "auxiliary_loss_mlp": 0.00751213, + "balance_loss_clip": 1.00846362, + "balance_loss_mlp": 0.99965459, + "epoch": 0.7389147752893431, + "flos": 70680890638080.0, + "grad_norm": 0.6357735365195177, + "language_loss": 0.49246126, + "learning_rate": 6.733174657205287e-07, + "loss": 0.51028574, + "num_input_tokens_seen": 265096660, + "step": 12290, + "time_per_iteration": 3.243767261505127 + }, + { + "auxiliary_loss_clip": 0.01096247, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_clip": 1.03841698, + "balance_loss_mlp": 1.02171409, + "epoch": 0.7389748985420111, + "flos": 25994980575360.0, + "grad_norm": 3.780514148170796, + "language_loss": 0.67435575, + "learning_rate": 6.730260500712237e-07, + "loss": 0.69567037, + "num_input_tokens_seen": 265116375, + "step": 12291, + "time_per_iteration": 2.605470895767212 + }, + { + "auxiliary_loss_clip": 0.0099264, + "auxiliary_loss_mlp": 0.01000802, + "balance_loss_clip": 1.00994468, + "balance_loss_mlp": 0.99969369, + "epoch": 0.7390350217946791, + "flos": 54403661318400.0, + "grad_norm": 0.9871071197765896, + "language_loss": 0.60852838, + "learning_rate": 6.727346847409052e-07, + "loss": 0.62846279, + "num_input_tokens_seen": 265161230, + "step": 12292, + "time_per_iteration": 2.888421058654785 + }, + { + "auxiliary_loss_clip": 0.0106381, + "auxiliary_loss_mlp": 0.01034194, + "balance_loss_clip": 1.03513324, + "balance_loss_mlp": 1.0222311, + "epoch": 0.7390951450473471, + "flos": 32196968530560.0, + "grad_norm": 2.192815626746647, + "language_loss": 0.66975296, + "learning_rate": 6.724433697406191e-07, + "loss": 0.69073296, + "num_input_tokens_seen": 265182515, + "step": 12293, + "time_per_iteration": 2.8275856971740723 + }, + { + "auxiliary_loss_clip": 0.01100034, + "auxiliary_loss_mlp": 0.0103067, + "balance_loss_clip": 1.03730226, + "balance_loss_mlp": 1.01779556, + "epoch": 0.739155268300015, + "flos": 16684241304960.0, + "grad_norm": 1.9827271257615733, + "language_loss": 0.83464789, + "learning_rate": 6.721521050814134e-07, + "loss": 0.85595489, + "num_input_tokens_seen": 265198160, + "step": 12294, + "time_per_iteration": 2.597766160964966 + }, + { + "auxiliary_loss_clip": 0.01077206, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.03740942, + "balance_loss_mlp": 1.0197401, + "epoch": 0.739215391552683, + "flos": 31649761762560.0, + "grad_norm": 1.5365825507794162, + "language_loss": 0.72879148, + "learning_rate": 6.718608907743337e-07, + "loss": 0.74989408, + "num_input_tokens_seen": 265218480, + "step": 12295, + "time_per_iteration": 2.7728140354156494 + }, + { + "auxiliary_loss_clip": 0.0109979, + "auxiliary_loss_mlp": 0.01037539, + "balance_loss_clip": 1.03960156, + "balance_loss_mlp": 1.02521241, + "epoch": 0.7392755148053509, + "flos": 29718522097920.0, + "grad_norm": 2.087551297048025, + "language_loss": 0.7901718, + "learning_rate": 6.715697268304215e-07, + "loss": 0.81154513, + "num_input_tokens_seen": 265240165, + "step": 12296, + "time_per_iteration": 2.7069132328033447 + }, + { + "auxiliary_loss_clip": 0.01112194, + "auxiliary_loss_mlp": 0.01031879, + "balance_loss_clip": 1.03957283, + "balance_loss_mlp": 1.01797891, + "epoch": 0.7393356380580189, + "flos": 37050475075200.0, + "grad_norm": 2.421267182668315, + "language_loss": 0.66443473, + "learning_rate": 6.712786132607182e-07, + "loss": 0.68587548, + "num_input_tokens_seen": 265263295, + "step": 12297, + "time_per_iteration": 4.15710186958313 + }, + { + "auxiliary_loss_clip": 0.01086243, + "auxiliary_loss_mlp": 0.01038586, + "balance_loss_clip": 1.03743219, + "balance_loss_mlp": 1.02521062, + "epoch": 0.739395761310687, + "flos": 19719627091200.0, + "grad_norm": 2.031169028874948, + "language_loss": 0.68639588, + "learning_rate": 6.709875500762645e-07, + "loss": 0.70764422, + "num_input_tokens_seen": 265282740, + "step": 12298, + "time_per_iteration": 2.6803133487701416 + }, + { + "auxiliary_loss_clip": 0.01083526, + "auxiliary_loss_mlp": 0.0103472, + "balance_loss_clip": 1.03630257, + "balance_loss_mlp": 1.02177382, + "epoch": 0.7394558845633549, + "flos": 11801504067840.0, + "grad_norm": 1.810073219689882, + "language_loss": 0.7460804, + "learning_rate": 6.706965372880946e-07, + "loss": 0.76726282, + "num_input_tokens_seen": 265300175, + "step": 12299, + "time_per_iteration": 4.1317057609558105 + }, + { + "auxiliary_loss_clip": 0.01013835, + "auxiliary_loss_mlp": 0.00999495, + "balance_loss_clip": 1.0160886, + "balance_loss_mlp": 0.99818373, + "epoch": 0.7395160078160229, + "flos": 66195827850240.0, + "grad_norm": 0.7191377980528004, + "language_loss": 0.60850734, + "learning_rate": 6.704055749072455e-07, + "loss": 0.62864065, + "num_input_tokens_seen": 265363275, + "step": 12300, + "time_per_iteration": 4.986863136291504 + }, + { + "auxiliary_loss_clip": 0.01084534, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.03953075, + "balance_loss_mlp": 1.01876962, + "epoch": 0.7395761310686908, + "flos": 21249708687360.0, + "grad_norm": 1.6608612377328966, + "language_loss": 0.80444926, + "learning_rate": 6.7011466294475e-07, + "loss": 0.82561237, + "num_input_tokens_seen": 265382935, + "step": 12301, + "time_per_iteration": 2.635004997253418 + }, + { + "auxiliary_loss_clip": 0.01109746, + "auxiliary_loss_mlp": 0.01029708, + "balance_loss_clip": 1.03857565, + "balance_loss_mlp": 1.01823974, + "epoch": 0.7396362543213588, + "flos": 25955299025280.0, + "grad_norm": 1.5135415761232773, + "language_loss": 0.73152131, + "learning_rate": 6.698238014116406e-07, + "loss": 0.75291586, + "num_input_tokens_seen": 265403245, + "step": 12302, + "time_per_iteration": 2.612121105194092 + }, + { + "auxiliary_loss_clip": 0.01113143, + "auxiliary_loss_mlp": 0.01041216, + "balance_loss_clip": 1.03972757, + "balance_loss_mlp": 1.02819264, + "epoch": 0.7396963775740267, + "flos": 27377936064000.0, + "grad_norm": 6.478228728649492, + "language_loss": 0.73720932, + "learning_rate": 6.695329903189451e-07, + "loss": 0.75875294, + "num_input_tokens_seen": 265423105, + "step": 12303, + "time_per_iteration": 4.152388334274292 + }, + { + "auxiliary_loss_clip": 0.01109918, + "auxiliary_loss_mlp": 0.01030651, + "balance_loss_clip": 1.03906059, + "balance_loss_mlp": 1.01861048, + "epoch": 0.7397565008266948, + "flos": 25520133755520.0, + "grad_norm": 1.665147368260365, + "language_loss": 0.53981858, + "learning_rate": 6.692422296776927e-07, + "loss": 0.56122428, + "num_input_tokens_seen": 265443445, + "step": 12304, + "time_per_iteration": 2.6007986068725586 + }, + { + "auxiliary_loss_clip": 0.01088478, + "auxiliary_loss_mlp": 0.01038752, + "balance_loss_clip": 1.03643012, + "balance_loss_mlp": 1.02587104, + "epoch": 0.7398166240793627, + "flos": 23727760070400.0, + "grad_norm": 4.218553993502621, + "language_loss": 0.84787995, + "learning_rate": 6.689515194989084e-07, + "loss": 0.86915219, + "num_input_tokens_seen": 265462085, + "step": 12305, + "time_per_iteration": 2.7033863067626953 + }, + { + "auxiliary_loss_clip": 0.01007992, + "auxiliary_loss_mlp": 0.01002097, + "balance_loss_clip": 1.00802636, + "balance_loss_mlp": 1.00075579, + "epoch": 0.7398767473320307, + "flos": 67267582882560.0, + "grad_norm": 0.8984474927660691, + "language_loss": 0.57649475, + "learning_rate": 6.68660859793615e-07, + "loss": 0.59659564, + "num_input_tokens_seen": 265521190, + "step": 12306, + "time_per_iteration": 3.190584421157837 + }, + { + "auxiliary_loss_clip": 0.01091647, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.03991795, + "balance_loss_mlp": 1.02137649, + "epoch": 0.7399368705846986, + "flos": 22018699981440.0, + "grad_norm": 1.9564303795331826, + "language_loss": 0.81826288, + "learning_rate": 6.683702505728355e-07, + "loss": 0.83952522, + "num_input_tokens_seen": 265539705, + "step": 12307, + "time_per_iteration": 2.760991096496582 + }, + { + "auxiliary_loss_clip": 0.01094355, + "auxiliary_loss_mlp": 0.01035489, + "balance_loss_clip": 1.04020476, + "balance_loss_mlp": 1.0237112, + "epoch": 0.7399969938373666, + "flos": 14173870659840.0, + "grad_norm": 1.7875471048417528, + "language_loss": 0.69662929, + "learning_rate": 6.680796918475893e-07, + "loss": 0.71792769, + "num_input_tokens_seen": 265555855, + "step": 12308, + "time_per_iteration": 2.786059617996216 + }, + { + "auxiliary_loss_clip": 0.01080019, + "auxiliary_loss_mlp": 0.01030655, + "balance_loss_clip": 1.03736496, + "balance_loss_mlp": 1.01869845, + "epoch": 0.7400571170900345, + "flos": 25301473712640.0, + "grad_norm": 1.9234846760439523, + "language_loss": 0.81795132, + "learning_rate": 6.67789183628896e-07, + "loss": 0.83905804, + "num_input_tokens_seen": 265575455, + "step": 12309, + "time_per_iteration": 2.6756904125213623 + }, + { + "auxiliary_loss_clip": 0.01100831, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.03873348, + "balance_loss_mlp": 1.02133226, + "epoch": 0.7401172403427025, + "flos": 22711344917760.0, + "grad_norm": 3.264420183038049, + "language_loss": 0.72705656, + "learning_rate": 6.674987259277692e-07, + "loss": 0.74841309, + "num_input_tokens_seen": 265595250, + "step": 12310, + "time_per_iteration": 2.7013933658599854 + }, + { + "auxiliary_loss_clip": 0.01075917, + "auxiliary_loss_mlp": 0.01042964, + "balance_loss_clip": 1.0368607, + "balance_loss_mlp": 1.02921319, + "epoch": 0.7401773635953706, + "flos": 18067448188800.0, + "grad_norm": 2.4013054691194915, + "language_loss": 0.88485903, + "learning_rate": 6.672083187552239e-07, + "loss": 0.90604782, + "num_input_tokens_seen": 265606945, + "step": 12311, + "time_per_iteration": 2.6424548625946045 + }, + { + "auxiliary_loss_clip": 0.01046645, + "auxiliary_loss_mlp": 0.01029353, + "balance_loss_clip": 1.0324477, + "balance_loss_mlp": 1.01692545, + "epoch": 0.7402374868480385, + "flos": 22712135016960.0, + "grad_norm": 1.58737852842035, + "language_loss": 0.80510384, + "learning_rate": 6.669179621222738e-07, + "loss": 0.82586384, + "num_input_tokens_seen": 265626115, + "step": 12312, + "time_per_iteration": 2.820053815841675 + }, + { + "auxiliary_loss_clip": 0.01060693, + "auxiliary_loss_mlp": 0.01035735, + "balance_loss_clip": 1.03197908, + "balance_loss_mlp": 1.02264023, + "epoch": 0.7402976101007065, + "flos": 22856675345280.0, + "grad_norm": 1.990612245665929, + "language_loss": 0.78425479, + "learning_rate": 6.666276560399273e-07, + "loss": 0.80521905, + "num_input_tokens_seen": 265646520, + "step": 12313, + "time_per_iteration": 2.756864547729492 + }, + { + "auxiliary_loss_clip": 0.01059901, + "auxiliary_loss_mlp": 0.01038311, + "balance_loss_clip": 1.03464198, + "balance_loss_mlp": 1.02487016, + "epoch": 0.7403577333533744, + "flos": 12345801834240.0, + "grad_norm": 2.1312329589300947, + "language_loss": 0.78784394, + "learning_rate": 6.663374005191937e-07, + "loss": 0.80882609, + "num_input_tokens_seen": 265661875, + "step": 12314, + "time_per_iteration": 2.7299044132232666 + }, + { + "auxiliary_loss_clip": 0.01020285, + "auxiliary_loss_mlp": 0.01000472, + "balance_loss_clip": 1.00777555, + "balance_loss_mlp": 0.99948281, + "epoch": 0.7404178566060424, + "flos": 60327270869760.0, + "grad_norm": 0.9319847439120421, + "language_loss": 0.55094397, + "learning_rate": 6.660471955710809e-07, + "loss": 0.57115149, + "num_input_tokens_seen": 265721255, + "step": 12315, + "time_per_iteration": 3.201897382736206 + }, + { + "auxiliary_loss_clip": 0.01093771, + "auxiliary_loss_mlp": 0.01036367, + "balance_loss_clip": 1.03759921, + "balance_loss_mlp": 1.02371287, + "epoch": 0.7404779798587103, + "flos": 32014650072960.0, + "grad_norm": 1.5030342819067668, + "language_loss": 0.79353088, + "learning_rate": 6.65757041206591e-07, + "loss": 0.81483227, + "num_input_tokens_seen": 265743970, + "step": 12316, + "time_per_iteration": 2.705349922180176 + }, + { + "auxiliary_loss_clip": 0.01098009, + "auxiliary_loss_mlp": 0.01031964, + "balance_loss_clip": 1.03624582, + "balance_loss_mlp": 1.01957273, + "epoch": 0.7405381031113784, + "flos": 12889704551040.0, + "grad_norm": 1.7371134770990158, + "language_loss": 0.7492671, + "learning_rate": 6.654669374367275e-07, + "loss": 0.77056682, + "num_input_tokens_seen": 265760890, + "step": 12317, + "time_per_iteration": 2.637202024459839 + }, + { + "auxiliary_loss_clip": 0.01078909, + "auxiliary_loss_mlp": 0.01035186, + "balance_loss_clip": 1.03754401, + "balance_loss_mlp": 1.02296102, + "epoch": 0.7405982263640463, + "flos": 20229127557120.0, + "grad_norm": 1.520938817583414, + "language_loss": 0.81343406, + "learning_rate": 6.651768842724917e-07, + "loss": 0.834575, + "num_input_tokens_seen": 265779600, + "step": 12318, + "time_per_iteration": 2.7076103687286377 + }, + { + "auxiliary_loss_clip": 0.01084776, + "auxiliary_loss_mlp": 0.0103153, + "balance_loss_clip": 1.03475654, + "balance_loss_mlp": 1.0187031, + "epoch": 0.7406583496167143, + "flos": 17567213431680.0, + "grad_norm": 1.9057934883865575, + "language_loss": 0.76502925, + "learning_rate": 6.648868817248827e-07, + "loss": 0.7861923, + "num_input_tokens_seen": 265797030, + "step": 12319, + "time_per_iteration": 2.6530611515045166 + }, + { + "auxiliary_loss_clip": 0.01080701, + "auxiliary_loss_mlp": 0.01032886, + "balance_loss_clip": 1.0368222, + "balance_loss_mlp": 1.0211314, + "epoch": 0.7407184728693822, + "flos": 18295733076480.0, + "grad_norm": 2.7907820586254064, + "language_loss": 0.64157581, + "learning_rate": 6.64596929804897e-07, + "loss": 0.66271174, + "num_input_tokens_seen": 265815055, + "step": 12320, + "time_per_iteration": 2.7634599208831787 + }, + { + "auxiliary_loss_clip": 0.0110264, + "auxiliary_loss_mlp": 0.01041469, + "balance_loss_clip": 1.03931427, + "balance_loss_mlp": 1.02880883, + "epoch": 0.7407785961220502, + "flos": 16690562098560.0, + "grad_norm": 2.6669296111663168, + "language_loss": 0.8214829, + "learning_rate": 6.643070285235288e-07, + "loss": 0.842924, + "num_input_tokens_seen": 265828480, + "step": 12321, + "time_per_iteration": 2.603889226913452 + }, + { + "auxiliary_loss_clip": 0.01091833, + "auxiliary_loss_mlp": 0.01048957, + "balance_loss_clip": 1.03682292, + "balance_loss_mlp": 1.03459191, + "epoch": 0.7408387193747181, + "flos": 22088330496000.0, + "grad_norm": 2.755383259535151, + "language_loss": 0.72079754, + "learning_rate": 6.640171778917727e-07, + "loss": 0.74220538, + "num_input_tokens_seen": 265845825, + "step": 12322, + "time_per_iteration": 2.5962164402008057 + }, + { + "auxiliary_loss_clip": 0.01100778, + "auxiliary_loss_mlp": 0.00770917, + "balance_loss_clip": 1.03753436, + "balance_loss_mlp": 1.0002656, + "epoch": 0.7408988426273861, + "flos": 24236721832320.0, + "grad_norm": 1.859375439746312, + "language_loss": 0.64215767, + "learning_rate": 6.637273779206183e-07, + "loss": 0.66087461, + "num_input_tokens_seen": 265866335, + "step": 12323, + "time_per_iteration": 2.650984525680542 + }, + { + "auxiliary_loss_clip": 0.01074935, + "auxiliary_loss_mlp": 0.01032883, + "balance_loss_clip": 1.03454328, + "balance_loss_mlp": 1.01984739, + "epoch": 0.7409589658800542, + "flos": 29023004073600.0, + "grad_norm": 1.364972718978451, + "language_loss": 0.75983679, + "learning_rate": 6.634376286210559e-07, + "loss": 0.78091496, + "num_input_tokens_seen": 265888945, + "step": 12324, + "time_per_iteration": 2.758053779602051 + }, + { + "auxiliary_loss_clip": 0.01079211, + "auxiliary_loss_mlp": 0.01027292, + "balance_loss_clip": 1.03694987, + "balance_loss_mlp": 1.01489401, + "epoch": 0.7410190891327221, + "flos": 19351362902400.0, + "grad_norm": 1.7409894929083622, + "language_loss": 0.74638963, + "learning_rate": 6.63147930004073e-07, + "loss": 0.76745468, + "num_input_tokens_seen": 265908030, + "step": 12325, + "time_per_iteration": 2.6512198448181152 + }, + { + "auxiliary_loss_clip": 0.01070767, + "auxiliary_loss_mlp": 0.01038532, + "balance_loss_clip": 1.03589809, + "balance_loss_mlp": 1.02524054, + "epoch": 0.7410792123853901, + "flos": 22747650589440.0, + "grad_norm": 1.8899213582685095, + "language_loss": 0.68341279, + "learning_rate": 6.628582820806545e-07, + "loss": 0.7045058, + "num_input_tokens_seen": 265927030, + "step": 12326, + "time_per_iteration": 2.760312557220459 + }, + { + "auxiliary_loss_clip": 0.01072406, + "auxiliary_loss_mlp": 0.01028918, + "balance_loss_clip": 1.03731251, + "balance_loss_mlp": 1.01672876, + "epoch": 0.741139335638058, + "flos": 25372433030400.0, + "grad_norm": 1.6031079526338634, + "language_loss": 0.89560592, + "learning_rate": 6.625686848617835e-07, + "loss": 0.91661912, + "num_input_tokens_seen": 265945490, + "step": 12327, + "time_per_iteration": 2.753051519393921 + }, + { + "auxiliary_loss_clip": 0.01110031, + "auxiliary_loss_mlp": 0.01032575, + "balance_loss_clip": 1.03885567, + "balance_loss_mlp": 1.0198555, + "epoch": 0.741199458890726, + "flos": 18585639745920.0, + "grad_norm": 1.7237905370438114, + "language_loss": 0.85383123, + "learning_rate": 6.62279138358442e-07, + "loss": 0.87525725, + "num_input_tokens_seen": 265963265, + "step": 12328, + "time_per_iteration": 2.5977120399475098 + }, + { + "auxiliary_loss_clip": 0.01098285, + "auxiliary_loss_mlp": 0.01032958, + "balance_loss_clip": 1.0383029, + "balance_loss_mlp": 1.01909983, + "epoch": 0.7412595821433939, + "flos": 22127078292480.0, + "grad_norm": 1.669888281499519, + "language_loss": 0.66867191, + "learning_rate": 6.619896425816103e-07, + "loss": 0.68998432, + "num_input_tokens_seen": 265982270, + "step": 12329, + "time_per_iteration": 2.63157057762146 + }, + { + "auxiliary_loss_clip": 0.01078104, + "auxiliary_loss_mlp": 0.01042687, + "balance_loss_clip": 1.03691041, + "balance_loss_mlp": 1.02878761, + "epoch": 0.741319705396062, + "flos": 29169699217920.0, + "grad_norm": 1.6151090072025307, + "language_loss": 0.66697407, + "learning_rate": 6.617001975422647e-07, + "loss": 0.688182, + "num_input_tokens_seen": 266003835, + "step": 12330, + "time_per_iteration": 2.8134610652923584 + }, + { + "auxiliary_loss_clip": 0.01078521, + "auxiliary_loss_mlp": 0.01036152, + "balance_loss_clip": 1.04112339, + "balance_loss_mlp": 1.02134609, + "epoch": 0.7413798286487299, + "flos": 20667489137280.0, + "grad_norm": 2.0428405490816837, + "language_loss": 0.85805637, + "learning_rate": 6.614108032513823e-07, + "loss": 0.87920308, + "num_input_tokens_seen": 266021595, + "step": 12331, + "time_per_iteration": 2.812793493270874 + }, + { + "auxiliary_loss_clip": 0.01048375, + "auxiliary_loss_mlp": 0.01034839, + "balance_loss_clip": 1.0381304, + "balance_loss_mlp": 1.02189922, + "epoch": 0.7414399519013979, + "flos": 16398895662720.0, + "grad_norm": 1.9478476477957887, + "language_loss": 0.6967262, + "learning_rate": 6.611214597199364e-07, + "loss": 0.71755838, + "num_input_tokens_seen": 266039860, + "step": 12332, + "time_per_iteration": 3.0447654724121094 + }, + { + "auxiliary_loss_clip": 0.01112852, + "auxiliary_loss_mlp": 0.01040645, + "balance_loss_clip": 1.03986526, + "balance_loss_mlp": 1.02710271, + "epoch": 0.7415000751540658, + "flos": 25630235919360.0, + "grad_norm": 1.894199070779257, + "language_loss": 0.63652647, + "learning_rate": 6.608321669588984e-07, + "loss": 0.65806139, + "num_input_tokens_seen": 266058050, + "step": 12333, + "time_per_iteration": 2.8000104427337646 + }, + { + "auxiliary_loss_clip": 0.010897, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.04135418, + "balance_loss_mlp": 1.02300525, + "epoch": 0.7415601984067338, + "flos": 24499732193280.0, + "grad_norm": 1.6946502841165116, + "language_loss": 0.71084702, + "learning_rate": 6.605429249792387e-07, + "loss": 0.73209548, + "num_input_tokens_seen": 266078060, + "step": 12334, + "time_per_iteration": 2.7801129817962646 + }, + { + "auxiliary_loss_clip": 0.01065371, + "auxiliary_loss_mlp": 0.01027933, + "balance_loss_clip": 1.0374248, + "balance_loss_mlp": 1.01558292, + "epoch": 0.7416203216594017, + "flos": 20887154760960.0, + "grad_norm": 1.6662744969405867, + "language_loss": 0.82556254, + "learning_rate": 6.602537337919257e-07, + "loss": 0.84649551, + "num_input_tokens_seen": 266097110, + "step": 12335, + "time_per_iteration": 2.7619669437408447 + }, + { + "auxiliary_loss_clip": 0.01111608, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.03896701, + "balance_loss_mlp": 1.01763475, + "epoch": 0.7416804449120697, + "flos": 15624265933440.0, + "grad_norm": 2.6708776221620134, + "language_loss": 0.74853325, + "learning_rate": 6.599645934079259e-07, + "loss": 0.76996362, + "num_input_tokens_seen": 266110870, + "step": 12336, + "time_per_iteration": 4.294764518737793 + }, + { + "auxiliary_loss_clip": 0.01068313, + "auxiliary_loss_mlp": 0.01036465, + "balance_loss_clip": 1.03603351, + "balance_loss_mlp": 1.02284563, + "epoch": 0.7417405681647377, + "flos": 17120483982720.0, + "grad_norm": 1.9180906175997412, + "language_loss": 0.73796511, + "learning_rate": 6.596755038382029e-07, + "loss": 0.75901294, + "num_input_tokens_seen": 266127845, + "step": 12337, + "time_per_iteration": 2.8595807552337646 + }, + { + "auxiliary_loss_clip": 0.01083057, + "auxiliary_loss_mlp": 0.01039105, + "balance_loss_clip": 1.03681028, + "balance_loss_mlp": 1.0262543, + "epoch": 0.7418006914174057, + "flos": 18880322924160.0, + "grad_norm": 1.6574383205520367, + "language_loss": 0.76809967, + "learning_rate": 6.593864650937186e-07, + "loss": 0.78932124, + "num_input_tokens_seen": 266145400, + "step": 12338, + "time_per_iteration": 4.203794240951538 + }, + { + "auxiliary_loss_clip": 0.01099752, + "auxiliary_loss_mlp": 0.01031615, + "balance_loss_clip": 1.03882122, + "balance_loss_mlp": 1.02033818, + "epoch": 0.7418608146700737, + "flos": 21580733450880.0, + "grad_norm": 1.7161166457507804, + "language_loss": 0.73070621, + "learning_rate": 6.590974771854345e-07, + "loss": 0.75201988, + "num_input_tokens_seen": 266164430, + "step": 12339, + "time_per_iteration": 4.210087776184082 + }, + { + "auxiliary_loss_clip": 0.01092405, + "auxiliary_loss_mlp": 0.01031981, + "balance_loss_clip": 1.0387336, + "balance_loss_mlp": 1.01890945, + "epoch": 0.7419209379227416, + "flos": 22340459036160.0, + "grad_norm": 2.0219989421818276, + "language_loss": 0.79605651, + "learning_rate": 6.588085401243077e-07, + "loss": 0.81730038, + "num_input_tokens_seen": 266183855, + "step": 12340, + "time_per_iteration": 2.670774221420288 + }, + { + "auxiliary_loss_clip": 0.01069023, + "auxiliary_loss_mlp": 0.01036356, + "balance_loss_clip": 1.03491449, + "balance_loss_mlp": 1.02310038, + "epoch": 0.7419810611754096, + "flos": 16762275601920.0, + "grad_norm": 2.432257860237773, + "language_loss": 0.75854677, + "learning_rate": 6.585196539212958e-07, + "loss": 0.77960056, + "num_input_tokens_seen": 266202085, + "step": 12341, + "time_per_iteration": 2.686434268951416 + }, + { + "auxiliary_loss_clip": 0.0107769, + "auxiliary_loss_mlp": 0.01041186, + "balance_loss_clip": 1.03510964, + "balance_loss_mlp": 1.02783489, + "epoch": 0.7420411844280775, + "flos": 26212958259840.0, + "grad_norm": 1.4473494294427032, + "language_loss": 0.8024286, + "learning_rate": 6.582308185873535e-07, + "loss": 0.8236174, + "num_input_tokens_seen": 266223445, + "step": 12342, + "time_per_iteration": 4.343433380126953 + }, + { + "auxiliary_loss_clip": 0.01075896, + "auxiliary_loss_mlp": 0.01027447, + "balance_loss_clip": 1.03609908, + "balance_loss_mlp": 1.01511443, + "epoch": 0.7421013076807456, + "flos": 68529371840640.0, + "grad_norm": 1.749760257309467, + "language_loss": 0.77626014, + "learning_rate": 6.57942034133433e-07, + "loss": 0.79729354, + "num_input_tokens_seen": 266246575, + "step": 12343, + "time_per_iteration": 3.107714891433716 + }, + { + "auxiliary_loss_clip": 0.01082874, + "auxiliary_loss_mlp": 0.01034526, + "balance_loss_clip": 1.03323293, + "balance_loss_mlp": 1.02221727, + "epoch": 0.7421614309334135, + "flos": 24425325169920.0, + "grad_norm": 1.6706510034937676, + "language_loss": 0.67636979, + "learning_rate": 6.576533005704843e-07, + "loss": 0.69754374, + "num_input_tokens_seen": 266266055, + "step": 12344, + "time_per_iteration": 2.7599802017211914 + }, + { + "auxiliary_loss_clip": 0.01065258, + "auxiliary_loss_mlp": 0.01037206, + "balance_loss_clip": 1.03660572, + "balance_loss_mlp": 1.02291846, + "epoch": 0.7422215541860815, + "flos": 12311076360960.0, + "grad_norm": 2.3156123925604692, + "language_loss": 0.81109858, + "learning_rate": 6.573646179094572e-07, + "loss": 0.83212328, + "num_input_tokens_seen": 266282240, + "step": 12345, + "time_per_iteration": 2.7414791584014893 + }, + { + "auxiliary_loss_clip": 0.01072147, + "auxiliary_loss_mlp": 0.0103856, + "balance_loss_clip": 1.03549957, + "balance_loss_mlp": 1.02523887, + "epoch": 0.7422816774387494, + "flos": 19645579203840.0, + "grad_norm": 1.9382183588535902, + "language_loss": 0.70441389, + "learning_rate": 6.570759861612988e-07, + "loss": 0.72552097, + "num_input_tokens_seen": 266300980, + "step": 12346, + "time_per_iteration": 2.728034734725952 + }, + { + "auxiliary_loss_clip": 0.01102385, + "auxiliary_loss_mlp": 0.0103363, + "balance_loss_clip": 1.03974307, + "balance_loss_mlp": 1.02126789, + "epoch": 0.7423418006914174, + "flos": 32015978876160.0, + "grad_norm": 2.081189833506492, + "language_loss": 0.73518687, + "learning_rate": 6.56787405336953e-07, + "loss": 0.75654697, + "num_input_tokens_seen": 266322215, + "step": 12347, + "time_per_iteration": 2.691364049911499 + }, + { + "auxiliary_loss_clip": 0.01090637, + "auxiliary_loss_mlp": 0.01034498, + "balance_loss_clip": 1.03648269, + "balance_loss_mlp": 1.02162337, + "epoch": 0.7424019239440853, + "flos": 18916951818240.0, + "grad_norm": 1.681708315108595, + "language_loss": 0.80881745, + "learning_rate": 6.564988754473642e-07, + "loss": 0.83006883, + "num_input_tokens_seen": 266341600, + "step": 12348, + "time_per_iteration": 2.719554901123047 + }, + { + "auxiliary_loss_clip": 0.01110126, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.03918421, + "balance_loss_mlp": 1.02082634, + "epoch": 0.7424620471967533, + "flos": 35876518871040.0, + "grad_norm": 1.8616740684019923, + "language_loss": 0.73023462, + "learning_rate": 6.562103965034724e-07, + "loss": 0.7516675, + "num_input_tokens_seen": 266362895, + "step": 12349, + "time_per_iteration": 2.762857437133789 + }, + { + "auxiliary_loss_clip": 0.01091582, + "auxiliary_loss_mlp": 0.01035038, + "balance_loss_clip": 1.03577137, + "balance_loss_mlp": 1.02081633, + "epoch": 0.7425221704494213, + "flos": 27016603200000.0, + "grad_norm": 2.2070987228261427, + "language_loss": 0.78727913, + "learning_rate": 6.559219685162165e-07, + "loss": 0.80854535, + "num_input_tokens_seen": 266384015, + "step": 12350, + "time_per_iteration": 2.67797589302063 + }, + { + "auxiliary_loss_clip": 0.01067839, + "auxiliary_loss_mlp": 0.01035914, + "balance_loss_clip": 1.03754306, + "balance_loss_mlp": 1.0233134, + "epoch": 0.7425822937020893, + "flos": 34167135559680.0, + "grad_norm": 1.5216618153297856, + "language_loss": 0.74963629, + "learning_rate": 6.556335914965343e-07, + "loss": 0.77067381, + "num_input_tokens_seen": 266405990, + "step": 12351, + "time_per_iteration": 2.8214800357818604 + }, + { + "auxiliary_loss_clip": 0.01055755, + "auxiliary_loss_mlp": 0.01030345, + "balance_loss_clip": 1.0381254, + "balance_loss_mlp": 1.01733303, + "epoch": 0.7426424169547573, + "flos": 21283572234240.0, + "grad_norm": 2.67642082180286, + "language_loss": 0.81345606, + "learning_rate": 6.553452654553611e-07, + "loss": 0.83431703, + "num_input_tokens_seen": 266424260, + "step": 12352, + "time_per_iteration": 2.8043935298919678 + }, + { + "auxiliary_loss_clip": 0.01103554, + "auxiliary_loss_mlp": 0.01039938, + "balance_loss_clip": 1.0413506, + "balance_loss_mlp": 1.02751637, + "epoch": 0.7427025402074252, + "flos": 22448442297600.0, + "grad_norm": 1.8427124307905225, + "language_loss": 0.72003049, + "learning_rate": 6.550569904036307e-07, + "loss": 0.74146539, + "num_input_tokens_seen": 266444580, + "step": 12353, + "time_per_iteration": 2.726813793182373 + }, + { + "auxiliary_loss_clip": 0.0110208, + "auxiliary_loss_mlp": 0.01030474, + "balance_loss_clip": 1.04067636, + "balance_loss_mlp": 1.01913714, + "epoch": 0.7427626634600932, + "flos": 22524609087360.0, + "grad_norm": 2.0628021124051275, + "language_loss": 0.72218555, + "learning_rate": 6.547687663522739e-07, + "loss": 0.74351114, + "num_input_tokens_seen": 266465640, + "step": 12354, + "time_per_iteration": 2.6648378372192383 + }, + { + "auxiliary_loss_clip": 0.01020848, + "auxiliary_loss_mlp": 0.01006019, + "balance_loss_clip": 1.00787544, + "balance_loss_mlp": 1.00489271, + "epoch": 0.7428227867127611, + "flos": 67209477655680.0, + "grad_norm": 0.694826107122343, + "language_loss": 0.59537125, + "learning_rate": 6.544805933122199e-07, + "loss": 0.61563993, + "num_input_tokens_seen": 266531950, + "step": 12355, + "time_per_iteration": 3.3000428676605225 + }, + { + "auxiliary_loss_clip": 0.01111904, + "auxiliary_loss_mlp": 0.01030428, + "balance_loss_clip": 1.03898406, + "balance_loss_mlp": 1.01765478, + "epoch": 0.7428829099654292, + "flos": 14721221082240.0, + "grad_norm": 1.7387842003260185, + "language_loss": 0.677315, + "learning_rate": 6.541924712943971e-07, + "loss": 0.69873834, + "num_input_tokens_seen": 266550665, + "step": 12356, + "time_per_iteration": 2.577047824859619 + }, + { + "auxiliary_loss_clip": 0.01100444, + "auxiliary_loss_mlp": 0.00771382, + "balance_loss_clip": 1.03524387, + "balance_loss_mlp": 1.00019741, + "epoch": 0.7429430332180971, + "flos": 48646496413440.0, + "grad_norm": 1.7685989280794623, + "language_loss": 0.72208947, + "learning_rate": 6.539044003097301e-07, + "loss": 0.74080771, + "num_input_tokens_seen": 266572455, + "step": 12357, + "time_per_iteration": 2.9096696376800537 + }, + { + "auxiliary_loss_clip": 0.01088209, + "auxiliary_loss_mlp": 0.01029654, + "balance_loss_clip": 1.03906703, + "balance_loss_mlp": 1.01782274, + "epoch": 0.7430031564707651, + "flos": 16764071281920.0, + "grad_norm": 1.8287713858548653, + "language_loss": 0.65631384, + "learning_rate": 6.53616380369143e-07, + "loss": 0.6774925, + "num_input_tokens_seen": 266590895, + "step": 12358, + "time_per_iteration": 2.668260097503662 + }, + { + "auxiliary_loss_clip": 0.01073582, + "auxiliary_loss_mlp": 0.0103526, + "balance_loss_clip": 1.0399549, + "balance_loss_mlp": 1.02100861, + "epoch": 0.743063279723433, + "flos": 23870576545920.0, + "grad_norm": 1.7940637938845212, + "language_loss": 0.81230819, + "learning_rate": 6.533284114835591e-07, + "loss": 0.83339661, + "num_input_tokens_seen": 266607660, + "step": 12359, + "time_per_iteration": 2.750425338745117 + }, + { + "auxiliary_loss_clip": 0.01100028, + "auxiliary_loss_mlp": 0.01032725, + "balance_loss_clip": 1.03793418, + "balance_loss_mlp": 1.01983833, + "epoch": 0.743123402976101, + "flos": 14391704689920.0, + "grad_norm": 2.122041383037816, + "language_loss": 0.67954987, + "learning_rate": 6.530404936638956e-07, + "loss": 0.70087737, + "num_input_tokens_seen": 266624260, + "step": 12360, + "time_per_iteration": 2.638991355895996 + }, + { + "auxiliary_loss_clip": 0.01099874, + "auxiliary_loss_mlp": 0.00770722, + "balance_loss_clip": 1.03788424, + "balance_loss_mlp": 1.00024271, + "epoch": 0.7431835262287689, + "flos": 27454318335360.0, + "grad_norm": 1.6135955801091852, + "language_loss": 0.72960168, + "learning_rate": 6.527526269210715e-07, + "loss": 0.74830765, + "num_input_tokens_seen": 266644210, + "step": 12361, + "time_per_iteration": 2.6851212978363037 + }, + { + "auxiliary_loss_clip": 0.01061783, + "auxiliary_loss_mlp": 0.01043643, + "balance_loss_clip": 1.03427052, + "balance_loss_mlp": 1.02964807, + "epoch": 0.743243649481437, + "flos": 20959514709120.0, + "grad_norm": 1.8538295437323902, + "language_loss": 0.55904317, + "learning_rate": 6.524648112660027e-07, + "loss": 0.58009744, + "num_input_tokens_seen": 266664230, + "step": 12362, + "time_per_iteration": 2.6957335472106934 + }, + { + "auxiliary_loss_clip": 0.01075259, + "auxiliary_loss_mlp": 0.01030795, + "balance_loss_clip": 1.03825688, + "balance_loss_mlp": 1.01771164, + "epoch": 0.7433037727341049, + "flos": 22783166161920.0, + "grad_norm": 1.5750012237947109, + "language_loss": 0.77069867, + "learning_rate": 6.521770467096039e-07, + "loss": 0.79175913, + "num_input_tokens_seen": 266683270, + "step": 12363, + "time_per_iteration": 2.7211437225341797 + }, + { + "auxiliary_loss_clip": 0.01082709, + "auxiliary_loss_mlp": 0.01036524, + "balance_loss_clip": 1.03588808, + "balance_loss_mlp": 1.02383995, + "epoch": 0.7433638959867729, + "flos": 22196708807040.0, + "grad_norm": 1.6083671142844838, + "language_loss": 0.78007239, + "learning_rate": 6.518893332627862e-07, + "loss": 0.8012647, + "num_input_tokens_seen": 266701235, + "step": 12364, + "time_per_iteration": 2.6894009113311768 + }, + { + "auxiliary_loss_clip": 0.01098885, + "auxiliary_loss_mlp": 0.01037373, + "balance_loss_clip": 1.03761303, + "balance_loss_mlp": 1.025172, + "epoch": 0.7434240192394409, + "flos": 23296760778240.0, + "grad_norm": 1.5760163793718025, + "language_loss": 0.78754139, + "learning_rate": 6.516016709364604e-07, + "loss": 0.80890405, + "num_input_tokens_seen": 266721495, + "step": 12365, + "time_per_iteration": 2.625281572341919 + }, + { + "auxiliary_loss_clip": 0.01087609, + "auxiliary_loss_mlp": 0.01033626, + "balance_loss_clip": 1.03624249, + "balance_loss_mlp": 1.02065635, + "epoch": 0.7434841424921088, + "flos": 54009575251200.0, + "grad_norm": 1.5814760031444242, + "language_loss": 0.76864719, + "learning_rate": 6.513140597415346e-07, + "loss": 0.78985953, + "num_input_tokens_seen": 266747400, + "step": 12366, + "time_per_iteration": 2.9688045978546143 + }, + { + "auxiliary_loss_clip": 0.01099866, + "auxiliary_loss_mlp": 0.01028311, + "balance_loss_clip": 1.04013896, + "balance_loss_mlp": 1.01761758, + "epoch": 0.7435442657447768, + "flos": 21433966479360.0, + "grad_norm": 1.3642058865548359, + "language_loss": 0.71373397, + "learning_rate": 6.510264996889141e-07, + "loss": 0.73501575, + "num_input_tokens_seen": 266767630, + "step": 12367, + "time_per_iteration": 2.661372184753418 + }, + { + "auxiliary_loss_clip": 0.01084148, + "auxiliary_loss_mlp": 0.01036563, + "balance_loss_clip": 1.0383482, + "balance_loss_mlp": 1.02371848, + "epoch": 0.7436043889974447, + "flos": 24499408970880.0, + "grad_norm": 1.5961683932504214, + "language_loss": 0.74215865, + "learning_rate": 6.507389907895038e-07, + "loss": 0.76336575, + "num_input_tokens_seen": 266788015, + "step": 12368, + "time_per_iteration": 2.712043285369873 + }, + { + "auxiliary_loss_clip": 0.01097444, + "auxiliary_loss_mlp": 0.01031837, + "balance_loss_clip": 1.03949308, + "balance_loss_mlp": 1.02042866, + "epoch": 0.7436645122501128, + "flos": 40698388512000.0, + "grad_norm": 2.9422959785728757, + "language_loss": 0.69383776, + "learning_rate": 6.50451533054207e-07, + "loss": 0.71513051, + "num_input_tokens_seen": 266809010, + "step": 12369, + "time_per_iteration": 2.7961301803588867 + }, + { + "auxiliary_loss_clip": 0.01088683, + "auxiliary_loss_mlp": 0.00770011, + "balance_loss_clip": 1.03793979, + "balance_loss_mlp": 1.00026274, + "epoch": 0.7437246355027807, + "flos": 18908835344640.0, + "grad_norm": 1.8064840643083067, + "language_loss": 0.75919938, + "learning_rate": 6.501641264939233e-07, + "loss": 0.77778637, + "num_input_tokens_seen": 266825390, + "step": 12370, + "time_per_iteration": 2.7155323028564453 + }, + { + "auxiliary_loss_clip": 0.01111903, + "auxiliary_loss_mlp": 0.01036048, + "balance_loss_clip": 1.04072666, + "balance_loss_mlp": 1.02287519, + "epoch": 0.7437847587554487, + "flos": 21543817248000.0, + "grad_norm": 2.0269448614883863, + "language_loss": 0.78193456, + "learning_rate": 6.498767711195503e-07, + "loss": 0.80341411, + "num_input_tokens_seen": 266844675, + "step": 12371, + "time_per_iteration": 2.6484358310699463 + }, + { + "auxiliary_loss_clip": 0.01091423, + "auxiliary_loss_mlp": 0.01029857, + "balance_loss_clip": 1.03848553, + "balance_loss_mlp": 1.01723862, + "epoch": 0.7438448820081166, + "flos": 27782470010880.0, + "grad_norm": 1.6126638712287897, + "language_loss": 0.69267446, + "learning_rate": 6.495894669419857e-07, + "loss": 0.71388721, + "num_input_tokens_seen": 266865160, + "step": 12372, + "time_per_iteration": 2.7042236328125 + }, + { + "auxiliary_loss_clip": 0.01079002, + "auxiliary_loss_mlp": 0.01037244, + "balance_loss_clip": 1.03700709, + "balance_loss_mlp": 1.02461922, + "epoch": 0.7439050052607846, + "flos": 17967832796160.0, + "grad_norm": 1.9384549362985082, + "language_loss": 0.75196183, + "learning_rate": 6.493022139721245e-07, + "loss": 0.77312428, + "num_input_tokens_seen": 266883285, + "step": 12373, + "time_per_iteration": 2.6364054679870605 + }, + { + "auxiliary_loss_clip": 0.01057413, + "auxiliary_loss_mlp": 0.01039492, + "balance_loss_clip": 1.03332591, + "balance_loss_mlp": 1.02528191, + "epoch": 0.7439651285134525, + "flos": 22958696949120.0, + "grad_norm": 1.7332911073866848, + "language_loss": 0.7709462, + "learning_rate": 6.49015012220858e-07, + "loss": 0.7919153, + "num_input_tokens_seen": 266900960, + "step": 12374, + "time_per_iteration": 2.7238872051239014 + }, + { + "auxiliary_loss_clip": 0.01048312, + "auxiliary_loss_mlp": 0.01037947, + "balance_loss_clip": 1.03520083, + "balance_loss_mlp": 1.02472675, + "epoch": 0.7440252517661206, + "flos": 18806777827200.0, + "grad_norm": 2.3876563861488496, + "language_loss": 0.76403177, + "learning_rate": 6.487278616990774e-07, + "loss": 0.78489435, + "num_input_tokens_seen": 266917710, + "step": 12375, + "time_per_iteration": 2.8014628887176514 + }, + { + "auxiliary_loss_clip": 0.01098112, + "auxiliary_loss_mlp": 0.01032005, + "balance_loss_clip": 1.03817892, + "balance_loss_mlp": 1.02062082, + "epoch": 0.7440853750187885, + "flos": 20266295155200.0, + "grad_norm": 1.9311839942562836, + "language_loss": 0.77011836, + "learning_rate": 6.484407624176733e-07, + "loss": 0.79141957, + "num_input_tokens_seen": 266934220, + "step": 12376, + "time_per_iteration": 4.12352442741394 + }, + { + "auxiliary_loss_clip": 0.01071601, + "auxiliary_loss_mlp": 0.01038876, + "balance_loss_clip": 1.03379536, + "balance_loss_mlp": 1.02320004, + "epoch": 0.7441454982714565, + "flos": 25337276593920.0, + "grad_norm": 1.692291173938847, + "language_loss": 0.79398865, + "learning_rate": 6.481537143875296e-07, + "loss": 0.8150934, + "num_input_tokens_seen": 266955210, + "step": 12377, + "time_per_iteration": 4.235915184020996 + }, + { + "auxiliary_loss_clip": 0.010991, + "auxiliary_loss_mlp": 0.01030466, + "balance_loss_clip": 1.03905261, + "balance_loss_mlp": 1.01754928, + "epoch": 0.7442056215241245, + "flos": 64480910866560.0, + "grad_norm": 1.9747138110607607, + "language_loss": 0.67284125, + "learning_rate": 6.478667176195322e-07, + "loss": 0.69413698, + "num_input_tokens_seen": 266976555, + "step": 12378, + "time_per_iteration": 4.622121572494507 + }, + { + "auxiliary_loss_clip": 0.010776, + "auxiliary_loss_mlp": 0.01037137, + "balance_loss_clip": 1.03861165, + "balance_loss_mlp": 1.02326727, + "epoch": 0.7442657447767924, + "flos": 31285376242560.0, + "grad_norm": 1.7913513654463005, + "language_loss": 0.71687776, + "learning_rate": 6.475797721245648e-07, + "loss": 0.73802519, + "num_input_tokens_seen": 266997640, + "step": 12379, + "time_per_iteration": 2.7747161388397217 + }, + { + "auxiliary_loss_clip": 0.01072089, + "auxiliary_loss_mlp": 0.00772364, + "balance_loss_clip": 1.0351454, + "balance_loss_mlp": 1.00025105, + "epoch": 0.7443258680294604, + "flos": 20807899401600.0, + "grad_norm": 2.0210704518096523, + "language_loss": 0.65216178, + "learning_rate": 6.472928779135085e-07, + "loss": 0.67060632, + "num_input_tokens_seen": 267016165, + "step": 12380, + "time_per_iteration": 2.7074787616729736 + }, + { + "auxiliary_loss_clip": 0.01101589, + "auxiliary_loss_mlp": 0.01034892, + "balance_loss_clip": 1.03957582, + "balance_loss_mlp": 1.0219394, + "epoch": 0.7443859912821283, + "flos": 22199833290240.0, + "grad_norm": 2.7838482793597388, + "language_loss": 0.78674221, + "learning_rate": 6.470060349972411e-07, + "loss": 0.80810702, + "num_input_tokens_seen": 267034075, + "step": 12381, + "time_per_iteration": 2.6567366123199463 + }, + { + "auxiliary_loss_clip": 0.01072016, + "auxiliary_loss_mlp": 0.01045243, + "balance_loss_clip": 1.03785646, + "balance_loss_mlp": 1.02981174, + "epoch": 0.7444461145347964, + "flos": 22017838055040.0, + "grad_norm": 2.878241445403415, + "language_loss": 0.72793961, + "learning_rate": 6.467192433866411e-07, + "loss": 0.74911219, + "num_input_tokens_seen": 267053645, + "step": 12382, + "time_per_iteration": 4.307409763336182 + }, + { + "auxiliary_loss_clip": 0.01005043, + "auxiliary_loss_mlp": 0.01004958, + "balance_loss_clip": 1.01257348, + "balance_loss_mlp": 1.00380802, + "epoch": 0.7445062377874643, + "flos": 70559047704960.0, + "grad_norm": 0.6531954820349142, + "language_loss": 0.54669428, + "learning_rate": 6.464325030925831e-07, + "loss": 0.56679428, + "num_input_tokens_seen": 267121830, + "step": 12383, + "time_per_iteration": 3.4219913482666016 + }, + { + "auxiliary_loss_clip": 0.01085875, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_clip": 1.03667879, + "balance_loss_mlp": 1.01498723, + "epoch": 0.7445663610401323, + "flos": 22164425458560.0, + "grad_norm": 5.0246719589759365, + "language_loss": 0.76023626, + "learning_rate": 6.461458141259395e-07, + "loss": 0.78137243, + "num_input_tokens_seen": 267141145, + "step": 12384, + "time_per_iteration": 2.6512553691864014 + }, + { + "auxiliary_loss_clip": 0.01098981, + "auxiliary_loss_mlp": 0.01029554, + "balance_loss_clip": 1.03816116, + "balance_loss_mlp": 1.01680422, + "epoch": 0.7446264842928002, + "flos": 24170251714560.0, + "grad_norm": 1.9156504433833408, + "language_loss": 0.78836381, + "learning_rate": 6.458591764975823e-07, + "loss": 0.80964911, + "num_input_tokens_seen": 267159280, + "step": 12385, + "time_per_iteration": 2.6723034381866455 + }, + { + "auxiliary_loss_clip": 0.01078718, + "auxiliary_loss_mlp": 0.01032784, + "balance_loss_clip": 1.03725076, + "balance_loss_mlp": 1.01855612, + "epoch": 0.7446866075454682, + "flos": 24134556574080.0, + "grad_norm": 1.6864726540587271, + "language_loss": 0.81386524, + "learning_rate": 6.455725902183813e-07, + "loss": 0.83498025, + "num_input_tokens_seen": 267179390, + "step": 12386, + "time_per_iteration": 2.724527359008789 + }, + { + "auxiliary_loss_clip": 0.01097105, + "auxiliary_loss_mlp": 0.01034795, + "balance_loss_clip": 1.03846228, + "balance_loss_mlp": 1.02235591, + "epoch": 0.7447467307981361, + "flos": 23548063305600.0, + "grad_norm": 1.6785981407963104, + "language_loss": 0.71043932, + "learning_rate": 6.452860552992037e-07, + "loss": 0.73175836, + "num_input_tokens_seen": 267198165, + "step": 12387, + "time_per_iteration": 2.7917346954345703 + }, + { + "auxiliary_loss_clip": 0.0107995, + "auxiliary_loss_mlp": 0.01031199, + "balance_loss_clip": 1.03891492, + "balance_loss_mlp": 1.01899815, + "epoch": 0.7448068540508042, + "flos": 19567832215680.0, + "grad_norm": 2.0106336394947597, + "language_loss": 0.70168763, + "learning_rate": 6.449995717509138e-07, + "loss": 0.72279912, + "num_input_tokens_seen": 267214520, + "step": 12388, + "time_per_iteration": 2.831563949584961 + }, + { + "auxiliary_loss_clip": 0.01099712, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.03740311, + "balance_loss_mlp": 1.01846361, + "epoch": 0.7448669773034721, + "flos": 21839721488640.0, + "grad_norm": 2.075908043210206, + "language_loss": 0.84796858, + "learning_rate": 6.447131395843761e-07, + "loss": 0.86927676, + "num_input_tokens_seen": 267236555, + "step": 12389, + "time_per_iteration": 2.6563961505889893 + }, + { + "auxiliary_loss_clip": 0.01069109, + "auxiliary_loss_mlp": 0.01035071, + "balance_loss_clip": 1.03659904, + "balance_loss_mlp": 1.02245224, + "epoch": 0.7449271005561401, + "flos": 25155389099520.0, + "grad_norm": 2.0263392027511298, + "language_loss": 0.79228258, + "learning_rate": 6.444267588104526e-07, + "loss": 0.81332433, + "num_input_tokens_seen": 267254800, + "step": 12390, + "time_per_iteration": 2.756574869155884 + }, + { + "auxiliary_loss_clip": 0.01089478, + "auxiliary_loss_mlp": 0.0103054, + "balance_loss_clip": 1.03688502, + "balance_loss_mlp": 1.01727843, + "epoch": 0.7449872238088081, + "flos": 22273342473600.0, + "grad_norm": 1.8599579606909906, + "language_loss": 0.851529, + "learning_rate": 6.441404294400014e-07, + "loss": 0.87272918, + "num_input_tokens_seen": 267274610, + "step": 12391, + "time_per_iteration": 2.6953816413879395 + }, + { + "auxiliary_loss_clip": 0.01111566, + "auxiliary_loss_mlp": 0.01028434, + "balance_loss_clip": 1.03942573, + "balance_loss_mlp": 1.0161674, + "epoch": 0.745047347061476, + "flos": 20594805966720.0, + "grad_norm": 1.7091676728035188, + "language_loss": 0.73478818, + "learning_rate": 6.438541514838811e-07, + "loss": 0.75618815, + "num_input_tokens_seen": 267292600, + "step": 12392, + "time_per_iteration": 2.566464424133301 + }, + { + "auxiliary_loss_clip": 0.010973, + "auxiliary_loss_mlp": 0.01035758, + "balance_loss_clip": 1.03854799, + "balance_loss_mlp": 1.02366483, + "epoch": 0.745107470314144, + "flos": 22127545169280.0, + "grad_norm": 3.0948074405421617, + "language_loss": 0.76522237, + "learning_rate": 6.435679249529487e-07, + "loss": 0.78655297, + "num_input_tokens_seen": 267311295, + "step": 12393, + "time_per_iteration": 2.614400625228882 + }, + { + "auxiliary_loss_clip": 0.01100705, + "auxiliary_loss_mlp": 0.0103966, + "balance_loss_clip": 1.03918004, + "balance_loss_mlp": 1.02523553, + "epoch": 0.745167593566812, + "flos": 22236498097920.0, + "grad_norm": 1.8734262060070255, + "language_loss": 0.72774941, + "learning_rate": 6.432817498580552e-07, + "loss": 0.74915308, + "num_input_tokens_seen": 267328390, + "step": 12394, + "time_per_iteration": 2.6467761993408203 + }, + { + "auxiliary_loss_clip": 0.01058489, + "auxiliary_loss_mlp": 0.00770508, + "balance_loss_clip": 1.04145324, + "balance_loss_mlp": 1.0001545, + "epoch": 0.74522771681948, + "flos": 20666232161280.0, + "grad_norm": 1.9226493220785308, + "language_loss": 0.81523216, + "learning_rate": 6.429956262100535e-07, + "loss": 0.83352214, + "num_input_tokens_seen": 267348185, + "step": 12395, + "time_per_iteration": 2.772284984588623 + }, + { + "auxiliary_loss_clip": 0.0110524, + "auxiliary_loss_mlp": 0.01037708, + "balance_loss_clip": 1.03964758, + "balance_loss_mlp": 1.0240705, + "epoch": 0.7452878400721479, + "flos": 21106999952640.0, + "grad_norm": 1.9270177813162948, + "language_loss": 0.7149328, + "learning_rate": 6.427095540197937e-07, + "loss": 0.73636222, + "num_input_tokens_seen": 267367010, + "step": 12396, + "time_per_iteration": 2.6198830604553223 + }, + { + "auxiliary_loss_clip": 0.0107235, + "auxiliary_loss_mlp": 0.01033046, + "balance_loss_clip": 1.03889275, + "balance_loss_mlp": 1.02018356, + "epoch": 0.7453479633248159, + "flos": 26688056474880.0, + "grad_norm": 1.7432262055203618, + "language_loss": 0.68239546, + "learning_rate": 6.424235332981245e-07, + "loss": 0.70344937, + "num_input_tokens_seen": 267386605, + "step": 12397, + "time_per_iteration": 2.8147408962249756 + }, + { + "auxiliary_loss_clip": 0.01111263, + "auxiliary_loss_mlp": 0.01038637, + "balance_loss_clip": 1.03894281, + "balance_loss_mlp": 1.02567935, + "epoch": 0.7454080865774838, + "flos": 17016056167680.0, + "grad_norm": 1.7819734884556382, + "language_loss": 0.77117336, + "learning_rate": 6.421375640558908e-07, + "loss": 0.79267234, + "num_input_tokens_seen": 267404135, + "step": 12398, + "time_per_iteration": 2.561169385910034 + }, + { + "auxiliary_loss_clip": 0.01100902, + "auxiliary_loss_mlp": 0.01029647, + "balance_loss_clip": 1.04031086, + "balance_loss_mlp": 1.01657581, + "epoch": 0.7454682098301518, + "flos": 21323900229120.0, + "grad_norm": 1.713165335415303, + "language_loss": 0.779158, + "learning_rate": 6.418516463039363e-07, + "loss": 0.80046344, + "num_input_tokens_seen": 267423120, + "step": 12399, + "time_per_iteration": 2.6413347721099854 + }, + { + "auxiliary_loss_clip": 0.010824, + "auxiliary_loss_mlp": 0.01035168, + "balance_loss_clip": 1.03334904, + "balance_loss_mlp": 1.02338409, + "epoch": 0.7455283330828197, + "flos": 17858341163520.0, + "grad_norm": 2.1285775991405482, + "language_loss": 0.73999, + "learning_rate": 6.415657800531038e-07, + "loss": 0.76116568, + "num_input_tokens_seen": 267441250, + "step": 12400, + "time_per_iteration": 2.696606159210205 + }, + { + "auxiliary_loss_clip": 0.01096917, + "auxiliary_loss_mlp": 0.01030276, + "balance_loss_clip": 1.03786886, + "balance_loss_mlp": 1.01809835, + "epoch": 0.7455884563354878, + "flos": 30774259664640.0, + "grad_norm": 2.4044760151763174, + "language_loss": 0.82103872, + "learning_rate": 6.412799653142327e-07, + "loss": 0.84231067, + "num_input_tokens_seen": 267462820, + "step": 12401, + "time_per_iteration": 2.700671434402466 + }, + { + "auxiliary_loss_clip": 0.01078431, + "auxiliary_loss_mlp": 0.01035329, + "balance_loss_clip": 1.03934383, + "balance_loss_mlp": 1.02312756, + "epoch": 0.7456485795881557, + "flos": 23185545292800.0, + "grad_norm": 2.1019998768613326, + "language_loss": 0.64676833, + "learning_rate": 6.409942020981611e-07, + "loss": 0.66790593, + "num_input_tokens_seen": 267483065, + "step": 12402, + "time_per_iteration": 2.775984287261963 + }, + { + "auxiliary_loss_clip": 0.01077021, + "auxiliary_loss_mlp": 0.01033791, + "balance_loss_clip": 1.03509498, + "balance_loss_mlp": 1.02227569, + "epoch": 0.7457087028408237, + "flos": 38727144074880.0, + "grad_norm": 1.560080300868097, + "language_loss": 0.73373783, + "learning_rate": 6.407084904157265e-07, + "loss": 0.75484598, + "num_input_tokens_seen": 267504825, + "step": 12403, + "time_per_iteration": 2.8398375511169434 + }, + { + "auxiliary_loss_clip": 0.01002548, + "auxiliary_loss_mlp": 0.01008627, + "balance_loss_clip": 1.01085329, + "balance_loss_mlp": 1.00753641, + "epoch": 0.7457688260934917, + "flos": 56043737337600.0, + "grad_norm": 0.830633313503113, + "language_loss": 0.58735222, + "learning_rate": 6.404228302777621e-07, + "loss": 0.60746402, + "num_input_tokens_seen": 267559260, + "step": 12404, + "time_per_iteration": 3.018889904022217 + }, + { + "auxiliary_loss_clip": 0.01110759, + "auxiliary_loss_mlp": 0.01032429, + "balance_loss_clip": 1.03871632, + "balance_loss_mlp": 1.020383, + "epoch": 0.7458289493461596, + "flos": 20116152305280.0, + "grad_norm": 1.8575983002348149, + "language_loss": 0.77702922, + "learning_rate": 6.401372216950995e-07, + "loss": 0.79846108, + "num_input_tokens_seen": 267578720, + "step": 12405, + "time_per_iteration": 2.607694625854492 + }, + { + "auxiliary_loss_clip": 0.01083469, + "auxiliary_loss_mlp": 0.01036873, + "balance_loss_clip": 1.03548229, + "balance_loss_mlp": 1.02420723, + "epoch": 0.7458890725988276, + "flos": 20193073280640.0, + "grad_norm": 1.6476155913625474, + "language_loss": 0.69351685, + "learning_rate": 6.398516646785698e-07, + "loss": 0.71472031, + "num_input_tokens_seen": 267598250, + "step": 12406, + "time_per_iteration": 2.651949882507324 + }, + { + "auxiliary_loss_clip": 0.01047021, + "auxiliary_loss_mlp": 0.01036186, + "balance_loss_clip": 1.03744388, + "balance_loss_mlp": 1.02236354, + "epoch": 0.7459491958514956, + "flos": 17018749687680.0, + "grad_norm": 2.2152803431091685, + "language_loss": 0.65254861, + "learning_rate": 6.39566159239002e-07, + "loss": 0.67338073, + "num_input_tokens_seen": 267615430, + "step": 12407, + "time_per_iteration": 2.761862277984619 + }, + { + "auxiliary_loss_clip": 0.01070552, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.03763545, + "balance_loss_mlp": 1.02068424, + "epoch": 0.7460093191041636, + "flos": 25078719519360.0, + "grad_norm": 2.453425787686552, + "language_loss": 0.72200561, + "learning_rate": 6.392807053872212e-07, + "loss": 0.74305862, + "num_input_tokens_seen": 267635075, + "step": 12408, + "time_per_iteration": 2.7553751468658447 + }, + { + "auxiliary_loss_clip": 0.01105957, + "auxiliary_loss_mlp": 0.0103312, + "balance_loss_clip": 1.03999674, + "balance_loss_mlp": 1.01942849, + "epoch": 0.7460694423568315, + "flos": 21908525990400.0, + "grad_norm": 2.19086035143854, + "language_loss": 0.72995472, + "learning_rate": 6.38995303134053e-07, + "loss": 0.7513454, + "num_input_tokens_seen": 267654105, + "step": 12409, + "time_per_iteration": 2.6748335361480713 + }, + { + "auxiliary_loss_clip": 0.01097314, + "auxiliary_loss_mlp": 0.01031591, + "balance_loss_clip": 1.03749943, + "balance_loss_mlp": 1.02024233, + "epoch": 0.7461295656094995, + "flos": 21215737399680.0, + "grad_norm": 2.015553074030815, + "language_loss": 0.65646017, + "learning_rate": 6.38709952490319e-07, + "loss": 0.67774916, + "num_input_tokens_seen": 267673090, + "step": 12410, + "time_per_iteration": 2.599883794784546 + }, + { + "auxiliary_loss_clip": 0.01094288, + "auxiliary_loss_mlp": 0.00770134, + "balance_loss_clip": 1.0380162, + "balance_loss_mlp": 1.00011945, + "epoch": 0.7461896888621674, + "flos": 22346851656960.0, + "grad_norm": 1.8387948527336508, + "language_loss": 0.84203392, + "learning_rate": 6.384246534668396e-07, + "loss": 0.86067814, + "num_input_tokens_seen": 267690605, + "step": 12411, + "time_per_iteration": 2.7593939304351807 + }, + { + "auxiliary_loss_clip": 0.01076302, + "auxiliary_loss_mlp": 0.01030521, + "balance_loss_clip": 1.0369643, + "balance_loss_mlp": 1.01747966, + "epoch": 0.7462498121148354, + "flos": 25482930243840.0, + "grad_norm": 2.2444375236075578, + "language_loss": 0.77899462, + "learning_rate": 6.381394060744339e-07, + "loss": 0.80006284, + "num_input_tokens_seen": 267710540, + "step": 12412, + "time_per_iteration": 2.880466938018799 + }, + { + "auxiliary_loss_clip": 0.01069141, + "auxiliary_loss_mlp": 0.01041632, + "balance_loss_clip": 1.03378701, + "balance_loss_mlp": 1.02820313, + "epoch": 0.7463099353675033, + "flos": 33947936812800.0, + "grad_norm": 2.442333824498856, + "language_loss": 0.62740505, + "learning_rate": 6.378542103239188e-07, + "loss": 0.64851284, + "num_input_tokens_seen": 267730780, + "step": 12413, + "time_per_iteration": 2.8031466007232666 + }, + { + "auxiliary_loss_clip": 0.01023176, + "auxiliary_loss_mlp": 0.00751261, + "balance_loss_clip": 1.00943136, + "balance_loss_mlp": 0.99959147, + "epoch": 0.7463700586201714, + "flos": 62767723691520.0, + "grad_norm": 0.7172744197889728, + "language_loss": 0.54801792, + "learning_rate": 6.375690662261082e-07, + "loss": 0.56576228, + "num_input_tokens_seen": 267794240, + "step": 12414, + "time_per_iteration": 3.2076735496520996 + }, + { + "auxiliary_loss_clip": 0.01081911, + "auxiliary_loss_mlp": 0.01031661, + "balance_loss_clip": 1.03365874, + "balance_loss_mlp": 1.01806545, + "epoch": 0.7464301818728393, + "flos": 33432654257280.0, + "grad_norm": 1.4875618615685628, + "language_loss": 0.5517059, + "learning_rate": 6.372839737918154e-07, + "loss": 0.57284164, + "num_input_tokens_seen": 267817190, + "step": 12415, + "time_per_iteration": 4.414318084716797 + }, + { + "auxiliary_loss_clip": 0.0104777, + "auxiliary_loss_mlp": 0.01036648, + "balance_loss_clip": 1.03617668, + "balance_loss_mlp": 1.02174664, + "epoch": 0.7464903051255073, + "flos": 26869872142080.0, + "grad_norm": 1.6764979613333528, + "language_loss": 0.75015157, + "learning_rate": 6.369989330318506e-07, + "loss": 0.77099568, + "num_input_tokens_seen": 267836245, + "step": 12416, + "time_per_iteration": 2.831061840057373 + }, + { + "auxiliary_loss_clip": 0.01060971, + "auxiliary_loss_mlp": 0.01042536, + "balance_loss_clip": 1.03266478, + "balance_loss_mlp": 1.02845144, + "epoch": 0.7465504283781753, + "flos": 44086954775040.0, + "grad_norm": 5.110704099754697, + "language_loss": 0.69582009, + "learning_rate": 6.367139439570233e-07, + "loss": 0.71685511, + "num_input_tokens_seen": 267858310, + "step": 12417, + "time_per_iteration": 6.061137676239014 + }, + { + "auxiliary_loss_clip": 0.01087135, + "auxiliary_loss_mlp": 0.010359, + "balance_loss_clip": 1.04298329, + "balance_loss_mlp": 1.02211332, + "epoch": 0.7466105516308432, + "flos": 19676102785920.0, + "grad_norm": 1.7520602773189389, + "language_loss": 0.73654354, + "learning_rate": 6.364290065781392e-07, + "loss": 0.75777388, + "num_input_tokens_seen": 267876345, + "step": 12418, + "time_per_iteration": 2.719461441040039 + }, + { + "auxiliary_loss_clip": 0.01101371, + "auxiliary_loss_mlp": 0.01032166, + "balance_loss_clip": 1.03970969, + "balance_loss_mlp": 1.01958394, + "epoch": 0.7466706748835112, + "flos": 20520722165760.0, + "grad_norm": 1.5723677716415394, + "language_loss": 0.68733931, + "learning_rate": 6.361441209060039e-07, + "loss": 0.70867467, + "num_input_tokens_seen": 267896740, + "step": 12419, + "time_per_iteration": 2.658419370651245 + }, + { + "auxiliary_loss_clip": 0.01106886, + "auxiliary_loss_mlp": 0.01034487, + "balance_loss_clip": 1.03877735, + "balance_loss_mlp": 1.0225246, + "epoch": 0.7467307981361792, + "flos": 21690260997120.0, + "grad_norm": 2.325148718588452, + "language_loss": 0.74999017, + "learning_rate": 6.358592869514216e-07, + "loss": 0.77140391, + "num_input_tokens_seen": 267914765, + "step": 12420, + "time_per_iteration": 2.6232640743255615 + }, + { + "auxiliary_loss_clip": 0.01105813, + "auxiliary_loss_mlp": 0.0103159, + "balance_loss_clip": 1.04157043, + "balance_loss_mlp": 1.01868558, + "epoch": 0.7467909213888472, + "flos": 19573686132480.0, + "grad_norm": 1.5853276507887042, + "language_loss": 0.6715399, + "learning_rate": 6.355745047251904e-07, + "loss": 0.69291389, + "num_input_tokens_seen": 267934085, + "step": 12421, + "time_per_iteration": 4.228281021118164 + }, + { + "auxiliary_loss_clip": 0.01087742, + "auxiliary_loss_mlp": 0.01034402, + "balance_loss_clip": 1.03845739, + "balance_loss_mlp": 1.02044845, + "epoch": 0.7468510446415151, + "flos": 23695225326720.0, + "grad_norm": 1.7891201641771508, + "language_loss": 0.72700393, + "learning_rate": 6.352897742381107e-07, + "loss": 0.74822545, + "num_input_tokens_seen": 267955170, + "step": 12422, + "time_per_iteration": 2.678581953048706 + }, + { + "auxiliary_loss_clip": 0.0107257, + "auxiliary_loss_mlp": 0.01034325, + "balance_loss_clip": 1.03739822, + "balance_loss_mlp": 1.02140832, + "epoch": 0.7469111678941831, + "flos": 29315783831040.0, + "grad_norm": 1.7729815610764, + "language_loss": 0.7519784, + "learning_rate": 6.350050955009796e-07, + "loss": 0.77304733, + "num_input_tokens_seen": 267974980, + "step": 12423, + "time_per_iteration": 2.884932518005371 + }, + { + "auxiliary_loss_clip": 0.01097508, + "auxiliary_loss_mlp": 0.01026494, + "balance_loss_clip": 1.03815055, + "balance_loss_mlp": 1.01491261, + "epoch": 0.746971291146851, + "flos": 21798639308160.0, + "grad_norm": 1.3102627766091752, + "language_loss": 0.67454731, + "learning_rate": 6.347204685245929e-07, + "loss": 0.69578731, + "num_input_tokens_seen": 267994985, + "step": 12424, + "time_per_iteration": 2.665360927581787 + }, + { + "auxiliary_loss_clip": 0.01106731, + "auxiliary_loss_mlp": 0.0103674, + "balance_loss_clip": 1.04188585, + "balance_loss_mlp": 1.02385378, + "epoch": 0.747031414399519, + "flos": 36245070368640.0, + "grad_norm": 1.8168677421099624, + "language_loss": 0.7413224, + "learning_rate": 6.344358933197418e-07, + "loss": 0.76275706, + "num_input_tokens_seen": 268014985, + "step": 12425, + "time_per_iteration": 2.684622049331665 + }, + { + "auxiliary_loss_clip": 0.01071034, + "auxiliary_loss_mlp": 0.01034399, + "balance_loss_clip": 1.03520596, + "balance_loss_mlp": 1.0205828, + "epoch": 0.7470915376521869, + "flos": 19974916028160.0, + "grad_norm": 8.361913341794455, + "language_loss": 0.69433403, + "learning_rate": 6.341513698972194e-07, + "loss": 0.71538836, + "num_input_tokens_seen": 268034395, + "step": 12426, + "time_per_iteration": 2.686992645263672 + }, + { + "auxiliary_loss_clip": 0.01070297, + "auxiliary_loss_mlp": 0.01035278, + "balance_loss_clip": 1.03655338, + "balance_loss_mlp": 1.02329111, + "epoch": 0.747151660904855, + "flos": 20084299920000.0, + "grad_norm": 1.4050021872275102, + "language_loss": 0.65497875, + "learning_rate": 6.338668982678139e-07, + "loss": 0.67603451, + "num_input_tokens_seen": 268054485, + "step": 12427, + "time_per_iteration": 2.8737995624542236 + }, + { + "auxiliary_loss_clip": 0.0111177, + "auxiliary_loss_mlp": 0.01030082, + "balance_loss_clip": 1.03934562, + "balance_loss_mlp": 1.01686215, + "epoch": 0.7472117841575229, + "flos": 16290373697280.0, + "grad_norm": 1.6443370194470839, + "language_loss": 0.74700832, + "learning_rate": 6.335824784423118e-07, + "loss": 0.7684269, + "num_input_tokens_seen": 268072250, + "step": 12428, + "time_per_iteration": 2.5923843383789062 + }, + { + "auxiliary_loss_clip": 0.01105561, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.03948128, + "balance_loss_mlp": 1.01726604, + "epoch": 0.7472719074101909, + "flos": 21389939383680.0, + "grad_norm": 1.8997644217403626, + "language_loss": 0.5859766, + "learning_rate": 6.33298110431499e-07, + "loss": 0.60734349, + "num_input_tokens_seen": 268089840, + "step": 12429, + "time_per_iteration": 2.673205614089966 + }, + { + "auxiliary_loss_clip": 0.01100742, + "auxiliary_loss_mlp": 0.01035285, + "balance_loss_clip": 1.0397048, + "balance_loss_mlp": 1.02210021, + "epoch": 0.7473320306628589, + "flos": 29643289061760.0, + "grad_norm": 2.191924076091365, + "language_loss": 0.60676718, + "learning_rate": 6.330137942461595e-07, + "loss": 0.62812746, + "num_input_tokens_seen": 268109360, + "step": 12430, + "time_per_iteration": 2.695838212966919 + }, + { + "auxiliary_loss_clip": 0.01089402, + "auxiliary_loss_mlp": 0.01035646, + "balance_loss_clip": 1.0370059, + "balance_loss_mlp": 1.02266431, + "epoch": 0.7473921539155268, + "flos": 24136100858880.0, + "grad_norm": 1.60839761436318, + "language_loss": 0.75666201, + "learning_rate": 6.327295298970734e-07, + "loss": 0.7779125, + "num_input_tokens_seen": 268131840, + "step": 12431, + "time_per_iteration": 2.7131593227386475 + }, + { + "auxiliary_loss_clip": 0.01098694, + "auxiliary_loss_mlp": 0.01031285, + "balance_loss_clip": 1.03696167, + "balance_loss_mlp": 1.01853514, + "epoch": 0.7474522771681948, + "flos": 17487958072320.0, + "grad_norm": 1.8735643765316532, + "language_loss": 0.75119841, + "learning_rate": 6.32445317395021e-07, + "loss": 0.77249819, + "num_input_tokens_seen": 268148300, + "step": 12432, + "time_per_iteration": 2.596440315246582 + }, + { + "auxiliary_loss_clip": 0.01088473, + "auxiliary_loss_mlp": 0.01036339, + "balance_loss_clip": 1.03782606, + "balance_loss_mlp": 1.02223635, + "epoch": 0.7475124004208628, + "flos": 16727298733440.0, + "grad_norm": 2.50552734802935, + "language_loss": 0.69950736, + "learning_rate": 6.321611567507787e-07, + "loss": 0.72075546, + "num_input_tokens_seen": 268166450, + "step": 12433, + "time_per_iteration": 2.606110095977783 + }, + { + "auxiliary_loss_clip": 0.01063022, + "auxiliary_loss_mlp": 0.01031886, + "balance_loss_clip": 1.03389204, + "balance_loss_mlp": 1.01835036, + "epoch": 0.7475725236735308, + "flos": 19720237622400.0, + "grad_norm": 2.703159081845411, + "language_loss": 0.67130244, + "learning_rate": 6.318770479751232e-07, + "loss": 0.6922515, + "num_input_tokens_seen": 268186165, + "step": 12434, + "time_per_iteration": 2.751291513442993 + }, + { + "auxiliary_loss_clip": 0.01105439, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.03803849, + "balance_loss_mlp": 1.02368116, + "epoch": 0.7476326469261987, + "flos": 26286000566400.0, + "grad_norm": 3.4930601864100224, + "language_loss": 0.7979542, + "learning_rate": 6.315929910788263e-07, + "loss": 0.8193624, + "num_input_tokens_seen": 268208145, + "step": 12435, + "time_per_iteration": 2.6472816467285156 + }, + { + "auxiliary_loss_clip": 0.01083734, + "auxiliary_loss_mlp": 0.01027736, + "balance_loss_clip": 1.03813887, + "balance_loss_mlp": 1.01502252, + "epoch": 0.7476927701788667, + "flos": 31831828824960.0, + "grad_norm": 1.8861832027521432, + "language_loss": 0.68124855, + "learning_rate": 6.313089860726604e-07, + "loss": 0.70236325, + "num_input_tokens_seen": 268228345, + "step": 12436, + "time_per_iteration": 2.813854694366455 + }, + { + "auxiliary_loss_clip": 0.0108534, + "auxiliary_loss_mlp": 0.01034815, + "balance_loss_clip": 1.0374372, + "balance_loss_mlp": 1.02242923, + "epoch": 0.7477528934315346, + "flos": 31795487239680.0, + "grad_norm": 1.9570276627413858, + "language_loss": 0.70576406, + "learning_rate": 6.31025032967396e-07, + "loss": 0.72696555, + "num_input_tokens_seen": 268250260, + "step": 12437, + "time_per_iteration": 2.7825896739959717 + }, + { + "auxiliary_loss_clip": 0.01071415, + "auxiliary_loss_mlp": 0.01028505, + "balance_loss_clip": 1.03356171, + "balance_loss_mlp": 1.01697707, + "epoch": 0.7478130166842026, + "flos": 20371979946240.0, + "grad_norm": 2.395892152897482, + "language_loss": 0.67251343, + "learning_rate": 6.307411317737986e-07, + "loss": 0.69351262, + "num_input_tokens_seen": 268268440, + "step": 12438, + "time_per_iteration": 2.706458568572998 + }, + { + "auxiliary_loss_clip": 0.01087999, + "auxiliary_loss_mlp": 0.01035267, + "balance_loss_clip": 1.03646779, + "balance_loss_mlp": 1.0229404, + "epoch": 0.7478731399368705, + "flos": 18148930191360.0, + "grad_norm": 1.593097914021623, + "language_loss": 0.8085202, + "learning_rate": 6.304572825026344e-07, + "loss": 0.8297528, + "num_input_tokens_seen": 268285765, + "step": 12439, + "time_per_iteration": 2.665294647216797 + }, + { + "auxiliary_loss_clip": 0.01074236, + "auxiliary_loss_mlp": 0.01040046, + "balance_loss_clip": 1.03548503, + "balance_loss_mlp": 1.02805412, + "epoch": 0.7479332631895386, + "flos": 15267889146240.0, + "grad_norm": 2.6477676249196334, + "language_loss": 0.70738852, + "learning_rate": 6.301734851646674e-07, + "loss": 0.72853136, + "num_input_tokens_seen": 268304015, + "step": 12440, + "time_per_iteration": 2.7106735706329346 + }, + { + "auxiliary_loss_clip": 0.01088049, + "auxiliary_loss_mlp": 0.01026861, + "balance_loss_clip": 1.04011965, + "balance_loss_mlp": 1.01467144, + "epoch": 0.7479933864422065, + "flos": 21142515525120.0, + "grad_norm": 1.6270418049825819, + "language_loss": 0.74380887, + "learning_rate": 6.298897397706597e-07, + "loss": 0.7649579, + "num_input_tokens_seen": 268323290, + "step": 12441, + "time_per_iteration": 2.7022409439086914 + }, + { + "auxiliary_loss_clip": 0.01105099, + "auxiliary_loss_mlp": 0.00770813, + "balance_loss_clip": 1.04095459, + "balance_loss_mlp": 1.00020576, + "epoch": 0.7480535096948745, + "flos": 14392027912320.0, + "grad_norm": 2.187499472876037, + "language_loss": 0.82711899, + "learning_rate": 6.296060463313698e-07, + "loss": 0.84587812, + "num_input_tokens_seen": 268339490, + "step": 12442, + "time_per_iteration": 2.7588963508605957 + }, + { + "auxiliary_loss_clip": 0.0105579, + "auxiliary_loss_mlp": 0.01031459, + "balance_loss_clip": 1.03666043, + "balance_loss_mlp": 1.01823914, + "epoch": 0.7481136329475425, + "flos": 27344683048320.0, + "grad_norm": 2.073136454951009, + "language_loss": 0.63220263, + "learning_rate": 6.293224048575565e-07, + "loss": 0.65307516, + "num_input_tokens_seen": 268359865, + "step": 12443, + "time_per_iteration": 2.874648094177246 + }, + { + "auxiliary_loss_clip": 0.01067932, + "auxiliary_loss_mlp": 0.0102658, + "balance_loss_clip": 1.03455901, + "balance_loss_mlp": 1.01451015, + "epoch": 0.7481737562002104, + "flos": 19531454716800.0, + "grad_norm": 2.062388953360283, + "language_loss": 0.7137714, + "learning_rate": 6.29038815359975e-07, + "loss": 0.73471653, + "num_input_tokens_seen": 268377065, + "step": 12444, + "time_per_iteration": 2.703878402709961 + }, + { + "auxiliary_loss_clip": 0.01059747, + "auxiliary_loss_mlp": 0.01031938, + "balance_loss_clip": 1.03627777, + "balance_loss_mlp": 1.01890206, + "epoch": 0.7482338794528784, + "flos": 21760035166080.0, + "grad_norm": 1.378277825499583, + "language_loss": 0.69101679, + "learning_rate": 6.287552778493786e-07, + "loss": 0.71193373, + "num_input_tokens_seen": 268396935, + "step": 12445, + "time_per_iteration": 2.757577657699585 + }, + { + "auxiliary_loss_clip": 0.01098864, + "auxiliary_loss_mlp": 0.0102548, + "balance_loss_clip": 1.03871107, + "balance_loss_mlp": 1.01329112, + "epoch": 0.7482940027055464, + "flos": 18697358021760.0, + "grad_norm": 1.944144924482792, + "language_loss": 0.74288422, + "learning_rate": 6.28471792336519e-07, + "loss": 0.76412767, + "num_input_tokens_seen": 268414460, + "step": 12446, + "time_per_iteration": 2.69356107711792 + }, + { + "auxiliary_loss_clip": 0.01094765, + "auxiliary_loss_mlp": 0.00771514, + "balance_loss_clip": 1.04004169, + "balance_loss_mlp": 1.0002172, + "epoch": 0.7483541259582144, + "flos": 15998024903040.0, + "grad_norm": 2.4465560126403245, + "language_loss": 0.7326262, + "learning_rate": 6.281883588321475e-07, + "loss": 0.75128901, + "num_input_tokens_seen": 268432225, + "step": 12447, + "time_per_iteration": 2.662238597869873 + }, + { + "auxiliary_loss_clip": 0.01068097, + "auxiliary_loss_mlp": 0.01031231, + "balance_loss_clip": 1.03563976, + "balance_loss_mlp": 1.0193516, + "epoch": 0.7484142492108823, + "flos": 25556295772800.0, + "grad_norm": 2.4715537752348906, + "language_loss": 0.7231704, + "learning_rate": 6.279049773470109e-07, + "loss": 0.74416363, + "num_input_tokens_seen": 268449270, + "step": 12448, + "time_per_iteration": 2.7589666843414307 + }, + { + "auxiliary_loss_clip": 0.01113987, + "auxiliary_loss_mlp": 0.01037666, + "balance_loss_clip": 1.04052019, + "balance_loss_mlp": 1.02560151, + "epoch": 0.7484743724635503, + "flos": 22887737631360.0, + "grad_norm": 1.8427048424278483, + "language_loss": 0.73759341, + "learning_rate": 6.276216478918543e-07, + "loss": 0.75910997, + "num_input_tokens_seen": 268467250, + "step": 12449, + "time_per_iteration": 2.6071417331695557 + }, + { + "auxiliary_loss_clip": 0.01076255, + "auxiliary_loss_mlp": 0.01037131, + "balance_loss_clip": 1.03802109, + "balance_loss_mlp": 1.02391624, + "epoch": 0.7485344957162182, + "flos": 25300288563840.0, + "grad_norm": 2.0043420955718716, + "language_loss": 0.6146363, + "learning_rate": 6.273383704774225e-07, + "loss": 0.6357702, + "num_input_tokens_seen": 268487270, + "step": 12450, + "time_per_iteration": 2.7463302612304688 + }, + { + "auxiliary_loss_clip": 0.01106441, + "auxiliary_loss_mlp": 0.01026536, + "balance_loss_clip": 1.03821647, + "balance_loss_mlp": 1.01458502, + "epoch": 0.7485946189688862, + "flos": 27053016612480.0, + "grad_norm": 1.9632558902155064, + "language_loss": 0.70478344, + "learning_rate": 6.270551451144577e-07, + "loss": 0.7261132, + "num_input_tokens_seen": 268508020, + "step": 12451, + "time_per_iteration": 2.632495641708374 + }, + { + "auxiliary_loss_clip": 0.01103126, + "auxiliary_loss_mlp": 0.01029478, + "balance_loss_clip": 1.03716731, + "balance_loss_mlp": 1.0168184, + "epoch": 0.7486547422215541, + "flos": 26906752431360.0, + "grad_norm": 2.915106727246987, + "language_loss": 0.80665791, + "learning_rate": 6.267719718136988e-07, + "loss": 0.82798392, + "num_input_tokens_seen": 268527375, + "step": 12452, + "time_per_iteration": 2.6505486965179443 + }, + { + "auxiliary_loss_clip": 0.01119519, + "auxiliary_loss_mlp": 0.0103336, + "balance_loss_clip": 1.04324985, + "balance_loss_mlp": 1.02005577, + "epoch": 0.7487148654742222, + "flos": 22346277039360.0, + "grad_norm": 2.8493444529110215, + "language_loss": 0.71248496, + "learning_rate": 6.264888505858843e-07, + "loss": 0.73401374, + "num_input_tokens_seen": 268544870, + "step": 12453, + "time_per_iteration": 2.6861732006073 + }, + { + "auxiliary_loss_clip": 0.01091229, + "auxiliary_loss_mlp": 0.01034252, + "balance_loss_clip": 1.03970766, + "balance_loss_mlp": 1.02196777, + "epoch": 0.7487749887268901, + "flos": 23038814234880.0, + "grad_norm": 1.5893693791461498, + "language_loss": 0.73979241, + "learning_rate": 6.262057814417517e-07, + "loss": 0.76104718, + "num_input_tokens_seen": 268564580, + "step": 12454, + "time_per_iteration": 2.716642379760742 + }, + { + "auxiliary_loss_clip": 0.0100113, + "auxiliary_loss_mlp": 0.01001978, + "balance_loss_clip": 1.00717449, + "balance_loss_mlp": 1.00067317, + "epoch": 0.7488351119795581, + "flos": 71525294536320.0, + "grad_norm": 0.7358432419267441, + "language_loss": 0.59396183, + "learning_rate": 6.259227643920322e-07, + "loss": 0.61399293, + "num_input_tokens_seen": 268629550, + "step": 12455, + "time_per_iteration": 4.886117935180664 + }, + { + "auxiliary_loss_clip": 0.01072127, + "auxiliary_loss_mlp": 0.01029798, + "balance_loss_clip": 1.0343852, + "balance_loss_mlp": 1.01737666, + "epoch": 0.748895235232226, + "flos": 17196255722880.0, + "grad_norm": 2.489880729520255, + "language_loss": 0.79817784, + "learning_rate": 6.256397994474592e-07, + "loss": 0.81919706, + "num_input_tokens_seen": 268646645, + "step": 12456, + "time_per_iteration": 5.9515721797943115 + }, + { + "auxiliary_loss_clip": 0.01020316, + "auxiliary_loss_mlp": 0.01001663, + "balance_loss_clip": 1.00686383, + "balance_loss_mlp": 1.00054216, + "epoch": 0.748955358484894, + "flos": 58979256336000.0, + "grad_norm": 0.849157440562182, + "language_loss": 0.61421359, + "learning_rate": 6.25356886618763e-07, + "loss": 0.63443339, + "num_input_tokens_seen": 268702275, + "step": 12457, + "time_per_iteration": 3.1303980350494385 + }, + { + "auxiliary_loss_clip": 0.01098576, + "auxiliary_loss_mlp": 0.01035418, + "balance_loss_clip": 1.04226291, + "balance_loss_mlp": 1.02326477, + "epoch": 0.749015481737562, + "flos": 11360413054080.0, + "grad_norm": 1.9444047716710122, + "language_loss": 0.6761775, + "learning_rate": 6.250740259166711e-07, + "loss": 0.6975174, + "num_input_tokens_seen": 268716265, + "step": 12458, + "time_per_iteration": 2.665384292602539 + }, + { + "auxiliary_loss_clip": 0.0105583, + "auxiliary_loss_mlp": 0.01032729, + "balance_loss_clip": 1.03316355, + "balance_loss_mlp": 1.02080858, + "epoch": 0.74907560499023, + "flos": 21106497162240.0, + "grad_norm": 2.619057127646577, + "language_loss": 0.79952264, + "learning_rate": 6.247912173519106e-07, + "loss": 0.82040823, + "num_input_tokens_seen": 268734330, + "step": 12459, + "time_per_iteration": 2.754957675933838 + }, + { + "auxiliary_loss_clip": 0.01072944, + "auxiliary_loss_mlp": 0.01036735, + "balance_loss_clip": 1.0369221, + "balance_loss_mlp": 1.02394927, + "epoch": 0.749135728242898, + "flos": 22268027260800.0, + "grad_norm": 1.4984057584596764, + "language_loss": 0.80603898, + "learning_rate": 6.245084609352043e-07, + "loss": 0.82713568, + "num_input_tokens_seen": 268753500, + "step": 12460, + "time_per_iteration": 4.2594664096832275 + }, + { + "auxiliary_loss_clip": 0.01082271, + "auxiliary_loss_mlp": 0.01031927, + "balance_loss_clip": 1.03578806, + "balance_loss_mlp": 1.01876581, + "epoch": 0.7491958514955659, + "flos": 24057527857920.0, + "grad_norm": 1.80824785320189, + "language_loss": 0.85877681, + "learning_rate": 6.242257566772755e-07, + "loss": 0.87991881, + "num_input_tokens_seen": 268772055, + "step": 12461, + "time_per_iteration": 2.6852405071258545 + }, + { + "auxiliary_loss_clip": 0.01093212, + "auxiliary_loss_mlp": 0.01035506, + "balance_loss_clip": 1.03965092, + "balance_loss_mlp": 1.02309084, + "epoch": 0.7492559747482339, + "flos": 24492118510080.0, + "grad_norm": 1.8962735896690046, + "language_loss": 0.69416398, + "learning_rate": 6.239431045888435e-07, + "loss": 0.71545118, + "num_input_tokens_seen": 268792265, + "step": 12462, + "time_per_iteration": 2.768845319747925 + }, + { + "auxiliary_loss_clip": 0.01110765, + "auxiliary_loss_mlp": 0.01033779, + "balance_loss_clip": 1.03923655, + "balance_loss_mlp": 1.02101731, + "epoch": 0.7493160980009018, + "flos": 27745338326400.0, + "grad_norm": 2.365885457635203, + "language_loss": 0.7031799, + "learning_rate": 6.236605046806267e-07, + "loss": 0.72462535, + "num_input_tokens_seen": 268812735, + "step": 12463, + "time_per_iteration": 2.6340458393096924 + }, + { + "auxiliary_loss_clip": 0.01074204, + "auxiliary_loss_mlp": 0.01032497, + "balance_loss_clip": 1.03618455, + "balance_loss_mlp": 1.02071965, + "epoch": 0.7493762212535698, + "flos": 30226190970240.0, + "grad_norm": 2.141316726058728, + "language_loss": 0.77804828, + "learning_rate": 6.233779569633419e-07, + "loss": 0.7991153, + "num_input_tokens_seen": 268833090, + "step": 12464, + "time_per_iteration": 2.751758098602295 + }, + { + "auxiliary_loss_clip": 0.0108502, + "auxiliary_loss_mlp": 0.01026607, + "balance_loss_clip": 1.03515768, + "balance_loss_mlp": 1.01502621, + "epoch": 0.7494363445062378, + "flos": 21944472526080.0, + "grad_norm": 1.8114572432161449, + "language_loss": 0.78449178, + "learning_rate": 6.230954614477034e-07, + "loss": 0.80560803, + "num_input_tokens_seen": 268851880, + "step": 12465, + "time_per_iteration": 2.6739721298217773 + }, + { + "auxiliary_loss_clip": 0.0108024, + "auxiliary_loss_mlp": 0.01039302, + "balance_loss_clip": 1.03586817, + "balance_loss_mlp": 1.02480614, + "epoch": 0.7494964677589058, + "flos": 12490342162560.0, + "grad_norm": 2.3217514076277697, + "language_loss": 0.74179816, + "learning_rate": 6.22813018144422e-07, + "loss": 0.76299357, + "num_input_tokens_seen": 268867910, + "step": 12466, + "time_per_iteration": 2.63236665725708 + }, + { + "auxiliary_loss_clip": 0.01098476, + "auxiliary_loss_mlp": 0.01036506, + "balance_loss_clip": 1.03608537, + "balance_loss_mlp": 1.02381599, + "epoch": 0.7495565910115737, + "flos": 21653057485440.0, + "grad_norm": 2.1977964760321362, + "language_loss": 0.66625774, + "learning_rate": 6.22530627064209e-07, + "loss": 0.68760759, + "num_input_tokens_seen": 268887260, + "step": 12467, + "time_per_iteration": 2.6381313800811768 + }, + { + "auxiliary_loss_clip": 0.01062241, + "auxiliary_loss_mlp": 0.00773108, + "balance_loss_clip": 1.03538942, + "balance_loss_mlp": 1.00025678, + "epoch": 0.7496167142642417, + "flos": 15268535591040.0, + "grad_norm": 2.2660950859773425, + "language_loss": 0.76690638, + "learning_rate": 6.222482882177735e-07, + "loss": 0.7852599, + "num_input_tokens_seen": 268902520, + "step": 12468, + "time_per_iteration": 2.717893123626709 + }, + { + "auxiliary_loss_clip": 0.01071579, + "auxiliary_loss_mlp": 0.01029806, + "balance_loss_clip": 1.03752029, + "balance_loss_mlp": 1.0167706, + "epoch": 0.7496768375169096, + "flos": 22054933825920.0, + "grad_norm": 2.258197229303168, + "language_loss": 0.69274288, + "learning_rate": 6.219660016158201e-07, + "loss": 0.7137568, + "num_input_tokens_seen": 268920970, + "step": 12469, + "time_per_iteration": 2.7141220569610596 + }, + { + "auxiliary_loss_clip": 0.01089029, + "auxiliary_loss_mlp": 0.01032283, + "balance_loss_clip": 1.03684139, + "balance_loss_mlp": 1.01970625, + "epoch": 0.7497369607695776, + "flos": 19057038860160.0, + "grad_norm": 1.9749809581101754, + "language_loss": 0.69305575, + "learning_rate": 6.216837672690543e-07, + "loss": 0.71426892, + "num_input_tokens_seen": 268936600, + "step": 12470, + "time_per_iteration": 2.736288547515869 + }, + { + "auxiliary_loss_clip": 0.01082647, + "auxiliary_loss_mlp": 0.01033832, + "balance_loss_clip": 1.03593028, + "balance_loss_mlp": 1.01967597, + "epoch": 0.7497970840222457, + "flos": 21617434172160.0, + "grad_norm": 1.8937148851838135, + "language_loss": 0.75178516, + "learning_rate": 6.214015851881793e-07, + "loss": 0.77294993, + "num_input_tokens_seen": 268956560, + "step": 12471, + "time_per_iteration": 2.664313554763794 + }, + { + "auxiliary_loss_clip": 0.01084709, + "auxiliary_loss_mlp": 0.01035791, + "balance_loss_clip": 1.0353353, + "balance_loss_mlp": 1.02159286, + "epoch": 0.7498572072749136, + "flos": 13735580906880.0, + "grad_norm": 2.796464416827846, + "language_loss": 0.77233744, + "learning_rate": 6.211194553838929e-07, + "loss": 0.79354239, + "num_input_tokens_seen": 268973945, + "step": 12472, + "time_per_iteration": 2.657557487487793 + }, + { + "auxiliary_loss_clip": 0.01094543, + "auxiliary_loss_mlp": 0.00769819, + "balance_loss_clip": 1.03535211, + "balance_loss_mlp": 1.00018263, + "epoch": 0.7499173305275816, + "flos": 22966526113920.0, + "grad_norm": 1.5448300611730317, + "language_loss": 0.84419262, + "learning_rate": 6.208373778668951e-07, + "loss": 0.8628363, + "num_input_tokens_seen": 268993245, + "step": 12473, + "time_per_iteration": 2.7043027877807617 + }, + { + "auxiliary_loss_clip": 0.01079095, + "auxiliary_loss_mlp": 0.01031863, + "balance_loss_clip": 1.03500473, + "balance_loss_mlp": 1.01823711, + "epoch": 0.7499774537802495, + "flos": 22740467869440.0, + "grad_norm": 2.038219260751869, + "language_loss": 0.7402907, + "learning_rate": 6.205553526478829e-07, + "loss": 0.76140028, + "num_input_tokens_seen": 269012125, + "step": 12474, + "time_per_iteration": 2.74438214302063 + }, + { + "auxiliary_loss_clip": 0.01088373, + "auxiliary_loss_mlp": 0.01038948, + "balance_loss_clip": 1.03736258, + "balance_loss_mlp": 1.02587676, + "epoch": 0.7500375770329175, + "flos": 18296559089280.0, + "grad_norm": 2.2001386818620263, + "language_loss": 0.74208605, + "learning_rate": 6.202733797375492e-07, + "loss": 0.76335931, + "num_input_tokens_seen": 269030545, + "step": 12475, + "time_per_iteration": 2.6366353034973145 + }, + { + "auxiliary_loss_clip": 0.0110606, + "auxiliary_loss_mlp": 0.01035347, + "balance_loss_clip": 1.03846169, + "balance_loss_mlp": 1.02150083, + "epoch": 0.7500977002855854, + "flos": 19169978198400.0, + "grad_norm": 1.7274221168077015, + "language_loss": 0.80403024, + "learning_rate": 6.199914591465878e-07, + "loss": 0.82544434, + "num_input_tokens_seen": 269048180, + "step": 12476, + "time_per_iteration": 2.622103691101074 + }, + { + "auxiliary_loss_clip": 0.01076959, + "auxiliary_loss_mlp": 0.01034735, + "balance_loss_clip": 1.0369035, + "balance_loss_mlp": 1.02214074, + "epoch": 0.7501578235382534, + "flos": 22163886754560.0, + "grad_norm": 1.9425018569967707, + "language_loss": 0.77756828, + "learning_rate": 6.19709590885688e-07, + "loss": 0.79868519, + "num_input_tokens_seen": 269068600, + "step": 12477, + "time_per_iteration": 2.6923439502716064 + }, + { + "auxiliary_loss_clip": 0.01010213, + "auxiliary_loss_mlp": 0.01001269, + "balance_loss_clip": 1.00773573, + "balance_loss_mlp": 1.00022018, + "epoch": 0.7502179467909214, + "flos": 64465040033280.0, + "grad_norm": 0.8187484606770943, + "language_loss": 0.54458755, + "learning_rate": 6.194277749655394e-07, + "loss": 0.56470239, + "num_input_tokens_seen": 269119045, + "step": 12478, + "time_per_iteration": 3.204738140106201 + }, + { + "auxiliary_loss_clip": 0.0108167, + "auxiliary_loss_mlp": 0.01032285, + "balance_loss_clip": 1.03592229, + "balance_loss_mlp": 1.02035236, + "epoch": 0.7502780700435894, + "flos": 20478275268480.0, + "grad_norm": 1.6024244799309337, + "language_loss": 0.80039358, + "learning_rate": 6.191460113968272e-07, + "loss": 0.82153314, + "num_input_tokens_seen": 269136755, + "step": 12479, + "time_per_iteration": 2.690080165863037 + }, + { + "auxiliary_loss_clip": 0.01104663, + "auxiliary_loss_mlp": 0.01038768, + "balance_loss_clip": 1.03951621, + "balance_loss_mlp": 1.02505875, + "epoch": 0.7503381932962573, + "flos": 20445273648000.0, + "grad_norm": 2.9599564657820805, + "language_loss": 0.62753713, + "learning_rate": 6.188643001902369e-07, + "loss": 0.64897144, + "num_input_tokens_seen": 269156120, + "step": 12480, + "time_per_iteration": 2.6162097454071045 + }, + { + "auxiliary_loss_clip": 0.0108428, + "auxiliary_loss_mlp": 0.01034909, + "balance_loss_clip": 1.03689194, + "balance_loss_mlp": 1.02272034, + "epoch": 0.7503983165489253, + "flos": 22381936266240.0, + "grad_norm": 2.3943314671981955, + "language_loss": 0.78243744, + "learning_rate": 6.185826413564512e-07, + "loss": 0.80362934, + "num_input_tokens_seen": 269175650, + "step": 12481, + "time_per_iteration": 2.669548988342285 + }, + { + "auxiliary_loss_clip": 0.0106997, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.0354079, + "balance_loss_mlp": 1.02479172, + "epoch": 0.7504584398015932, + "flos": 24899453717760.0, + "grad_norm": 1.8872543755880817, + "language_loss": 0.71297598, + "learning_rate": 6.183010349061501e-07, + "loss": 0.73405796, + "num_input_tokens_seen": 269197080, + "step": 12482, + "time_per_iteration": 2.7567055225372314 + }, + { + "auxiliary_loss_clip": 0.01111149, + "auxiliary_loss_mlp": 0.01035657, + "balance_loss_clip": 1.03868306, + "balance_loss_mlp": 1.02335453, + "epoch": 0.7505185630542612, + "flos": 25885237547520.0, + "grad_norm": 1.839701731381712, + "language_loss": 0.6994698, + "learning_rate": 6.180194808500118e-07, + "loss": 0.72093785, + "num_input_tokens_seen": 269218600, + "step": 12483, + "time_per_iteration": 2.606757402420044 + }, + { + "auxiliary_loss_clip": 0.01110582, + "auxiliary_loss_mlp": 0.01027036, + "balance_loss_clip": 1.03916931, + "balance_loss_mlp": 1.01574111, + "epoch": 0.7505786863069293, + "flos": 23143852581120.0, + "grad_norm": 2.0560449071537943, + "language_loss": 0.74602097, + "learning_rate": 6.177379791987131e-07, + "loss": 0.76739717, + "num_input_tokens_seen": 269239245, + "step": 12484, + "time_per_iteration": 2.618504285812378 + }, + { + "auxiliary_loss_clip": 0.01087809, + "auxiliary_loss_mlp": 0.01029947, + "balance_loss_clip": 1.03753555, + "balance_loss_mlp": 1.01745975, + "epoch": 0.7506388095595972, + "flos": 16983377769600.0, + "grad_norm": 1.9415131613647365, + "language_loss": 0.84624791, + "learning_rate": 6.174565299629295e-07, + "loss": 0.86742544, + "num_input_tokens_seen": 269258520, + "step": 12485, + "time_per_iteration": 2.697805404663086 + }, + { + "auxiliary_loss_clip": 0.01072795, + "auxiliary_loss_mlp": 0.0103091, + "balance_loss_clip": 1.03648996, + "balance_loss_mlp": 1.01851201, + "epoch": 0.7506989328122652, + "flos": 22344984149760.0, + "grad_norm": 1.6745365119179365, + "language_loss": 0.78448224, + "learning_rate": 6.171751331533323e-07, + "loss": 0.80551928, + "num_input_tokens_seen": 269278320, + "step": 12486, + "time_per_iteration": 2.714510202407837 + }, + { + "auxiliary_loss_clip": 0.01099772, + "auxiliary_loss_mlp": 0.01033013, + "balance_loss_clip": 1.03659987, + "balance_loss_mlp": 1.01920259, + "epoch": 0.7507590560649331, + "flos": 25776069137280.0, + "grad_norm": 2.4012807743392477, + "language_loss": 0.72792411, + "learning_rate": 6.168937887805932e-07, + "loss": 0.74925202, + "num_input_tokens_seen": 269298025, + "step": 12487, + "time_per_iteration": 2.7071502208709717 + }, + { + "auxiliary_loss_clip": 0.01085256, + "auxiliary_loss_mlp": 0.0103171, + "balance_loss_clip": 1.0348568, + "balance_loss_mlp": 1.01866841, + "epoch": 0.7508191793176011, + "flos": 24279420124800.0, + "grad_norm": 4.846564325155201, + "language_loss": 0.67752981, + "learning_rate": 6.166124968553801e-07, + "loss": 0.69869953, + "num_input_tokens_seen": 269316770, + "step": 12488, + "time_per_iteration": 2.644109010696411 + }, + { + "auxiliary_loss_clip": 0.01045289, + "auxiliary_loss_mlp": 0.01033728, + "balance_loss_clip": 1.0341351, + "balance_loss_mlp": 1.02041197, + "epoch": 0.750879302570269, + "flos": 19899575251200.0, + "grad_norm": 1.8778545582321347, + "language_loss": 0.77185404, + "learning_rate": 6.163312573883592e-07, + "loss": 0.7926442, + "num_input_tokens_seen": 269334755, + "step": 12489, + "time_per_iteration": 2.73962664604187 + }, + { + "auxiliary_loss_clip": 0.01096988, + "auxiliary_loss_mlp": 0.01031266, + "balance_loss_clip": 1.03820062, + "balance_loss_mlp": 1.01943493, + "epoch": 0.750939425822937, + "flos": 29205681667200.0, + "grad_norm": 2.4146735284189393, + "language_loss": 0.75405651, + "learning_rate": 6.160500703901956e-07, + "loss": 0.77533901, + "num_input_tokens_seen": 269353810, + "step": 12490, + "time_per_iteration": 2.6824519634246826 + }, + { + "auxiliary_loss_clip": 0.01109505, + "auxiliary_loss_mlp": 0.0103058, + "balance_loss_clip": 1.03855062, + "balance_loss_mlp": 1.01803946, + "epoch": 0.750999549075605, + "flos": 21142300043520.0, + "grad_norm": 1.5627078953093116, + "language_loss": 0.78168178, + "learning_rate": 6.157689358715527e-07, + "loss": 0.80308264, + "num_input_tokens_seen": 269372910, + "step": 12491, + "time_per_iteration": 2.6018178462982178 + }, + { + "auxiliary_loss_clip": 0.01097672, + "auxiliary_loss_mlp": 0.01031858, + "balance_loss_clip": 1.03719735, + "balance_loss_mlp": 1.02034187, + "epoch": 0.751059672328273, + "flos": 23547740083200.0, + "grad_norm": 1.6628642916222176, + "language_loss": 0.76332009, + "learning_rate": 6.154878538430899e-07, + "loss": 0.7846154, + "num_input_tokens_seen": 269391545, + "step": 12492, + "time_per_iteration": 2.691298484802246 + }, + { + "auxiliary_loss_clip": 0.01078534, + "auxiliary_loss_mlp": 0.01032513, + "balance_loss_clip": 1.03569448, + "balance_loss_mlp": 1.02058053, + "epoch": 0.7511197955809409, + "flos": 18989742729600.0, + "grad_norm": 1.9903305425404954, + "language_loss": 0.71488953, + "learning_rate": 6.152068243154671e-07, + "loss": 0.736, + "num_input_tokens_seen": 269408530, + "step": 12493, + "time_per_iteration": 2.718707323074341 + }, + { + "auxiliary_loss_clip": 0.01099033, + "auxiliary_loss_mlp": 0.00770094, + "balance_loss_clip": 1.03731656, + "balance_loss_mlp": 1.00024784, + "epoch": 0.7511799188336089, + "flos": 22046961006720.0, + "grad_norm": 4.285406665827556, + "language_loss": 0.80753833, + "learning_rate": 6.149258472993395e-07, + "loss": 0.82622963, + "num_input_tokens_seen": 269425930, + "step": 12494, + "time_per_iteration": 4.076538562774658 + }, + { + "auxiliary_loss_clip": 0.01111429, + "auxiliary_loss_mlp": 0.01029891, + "balance_loss_clip": 1.03875446, + "balance_loss_mlp": 1.01716495, + "epoch": 0.7512400420862768, + "flos": 16467125546880.0, + "grad_norm": 3.1005011583642084, + "language_loss": 0.78857327, + "learning_rate": 6.146449228053634e-07, + "loss": 0.80998647, + "num_input_tokens_seen": 269443945, + "step": 12495, + "time_per_iteration": 2.608964204788208 + }, + { + "auxiliary_loss_clip": 0.01110172, + "auxiliary_loss_mlp": 0.00769806, + "balance_loss_clip": 1.03854084, + "balance_loss_mlp": 1.0001905, + "epoch": 0.7513001653389448, + "flos": 20448326304000.0, + "grad_norm": 2.1655519437431967, + "language_loss": 0.7114259, + "learning_rate": 6.143640508441898e-07, + "loss": 0.73022562, + "num_input_tokens_seen": 269463625, + "step": 12496, + "time_per_iteration": 5.996880769729614 + }, + { + "auxiliary_loss_clip": 0.01065225, + "auxiliary_loss_mlp": 0.01035626, + "balance_loss_clip": 1.03378069, + "balance_loss_mlp": 1.02353823, + "epoch": 0.7513602885916129, + "flos": 23476816679040.0, + "grad_norm": 1.61396701477023, + "language_loss": 0.78199899, + "learning_rate": 6.140832314264705e-07, + "loss": 0.80300748, + "num_input_tokens_seen": 269483415, + "step": 12497, + "time_per_iteration": 2.9391214847564697 + }, + { + "auxiliary_loss_clip": 0.01100389, + "auxiliary_loss_mlp": 0.0103612, + "balance_loss_clip": 1.03779829, + "balance_loss_mlp": 1.02334642, + "epoch": 0.7514204118442808, + "flos": 26797224885120.0, + "grad_norm": 1.6137991944491499, + "language_loss": 0.76816785, + "learning_rate": 6.13802464562855e-07, + "loss": 0.7895329, + "num_input_tokens_seen": 269504635, + "step": 12498, + "time_per_iteration": 2.6544244289398193 + }, + { + "auxiliary_loss_clip": 0.0108807, + "auxiliary_loss_mlp": 0.01033331, + "balance_loss_clip": 1.03969288, + "balance_loss_mlp": 1.02200651, + "epoch": 0.7514805350969488, + "flos": 19865639877120.0, + "grad_norm": 1.7444376873678542, + "language_loss": 0.74047679, + "learning_rate": 6.135217502639878e-07, + "loss": 0.7616908, + "num_input_tokens_seen": 269523955, + "step": 12499, + "time_per_iteration": 4.209566831588745 + }, + { + "auxiliary_loss_clip": 0.01096501, + "auxiliary_loss_mlp": 0.01028936, + "balance_loss_clip": 1.03525448, + "balance_loss_mlp": 1.01752162, + "epoch": 0.7515406583496167, + "flos": 24571553437440.0, + "grad_norm": 2.0366798192363698, + "language_loss": 0.79610258, + "learning_rate": 6.132410885405148e-07, + "loss": 0.81735694, + "num_input_tokens_seen": 269544410, + "step": 12500, + "time_per_iteration": 2.6563799381256104 + }, + { + "auxiliary_loss_clip": 0.01108205, + "auxiliary_loss_mlp": 0.01037277, + "balance_loss_clip": 1.03992486, + "balance_loss_mlp": 1.02259684, + "epoch": 0.7516007816022847, + "flos": 20120246455680.0, + "grad_norm": 3.0425120159741588, + "language_loss": 0.73648608, + "learning_rate": 6.129604794030794e-07, + "loss": 0.75794089, + "num_input_tokens_seen": 269563315, + "step": 12501, + "time_per_iteration": 2.744978666305542 + }, + { + "auxiliary_loss_clip": 0.01086633, + "auxiliary_loss_mlp": 0.01027091, + "balance_loss_clip": 1.03513741, + "balance_loss_mlp": 1.01484871, + "epoch": 0.7516609048549526, + "flos": 22784638619520.0, + "grad_norm": 1.7898399637161078, + "language_loss": 0.78497088, + "learning_rate": 6.126799228623207e-07, + "loss": 0.80610812, + "num_input_tokens_seen": 269583950, + "step": 12502, + "time_per_iteration": 2.738304615020752 + }, + { + "auxiliary_loss_clip": 0.01089762, + "auxiliary_loss_mlp": 0.01036729, + "balance_loss_clip": 1.03781581, + "balance_loss_mlp": 1.02402735, + "epoch": 0.7517210281076206, + "flos": 10634012311680.0, + "grad_norm": 2.4706577261656277, + "language_loss": 0.70263046, + "learning_rate": 6.123994189288786e-07, + "loss": 0.72389537, + "num_input_tokens_seen": 269600120, + "step": 12503, + "time_per_iteration": 2.647141695022583 + }, + { + "auxiliary_loss_clip": 0.0102855, + "auxiliary_loss_mlp": 0.00998893, + "balance_loss_clip": 1.00588393, + "balance_loss_mlp": 0.99776667, + "epoch": 0.7517811513602886, + "flos": 66052221275520.0, + "grad_norm": 0.9994462005888404, + "language_loss": 0.63930368, + "learning_rate": 6.121189676133903e-07, + "loss": 0.65957808, + "num_input_tokens_seen": 269659815, + "step": 12504, + "time_per_iteration": 3.0780868530273438 + }, + { + "auxiliary_loss_clip": 0.01067894, + "auxiliary_loss_mlp": 0.01035688, + "balance_loss_clip": 1.03288054, + "balance_loss_mlp": 1.02317679, + "epoch": 0.7518412746129566, + "flos": 37268345018880.0, + "grad_norm": 1.4012015647577118, + "language_loss": 0.68983722, + "learning_rate": 6.118385689264896e-07, + "loss": 0.71087301, + "num_input_tokens_seen": 269684565, + "step": 12505, + "time_per_iteration": 2.979429244995117 + }, + { + "auxiliary_loss_clip": 0.01018848, + "auxiliary_loss_mlp": 0.00750909, + "balance_loss_clip": 1.00648499, + "balance_loss_mlp": 0.9996025, + "epoch": 0.7519013978656245, + "flos": 60518567727360.0, + "grad_norm": 1.3160178950136667, + "language_loss": 0.55058348, + "learning_rate": 6.11558222878809e-07, + "loss": 0.56828105, + "num_input_tokens_seen": 269752325, + "step": 12506, + "time_per_iteration": 3.3165297508239746 + }, + { + "auxiliary_loss_clip": 0.01099755, + "auxiliary_loss_mlp": 0.0103953, + "balance_loss_clip": 1.03766441, + "balance_loss_mlp": 1.02648234, + "epoch": 0.7519615211182925, + "flos": 18806885568000.0, + "grad_norm": 2.1648531082865, + "language_loss": 0.78275704, + "learning_rate": 6.112779294809796e-07, + "loss": 0.80414987, + "num_input_tokens_seen": 269770630, + "step": 12507, + "time_per_iteration": 2.608264923095703 + }, + { + "auxiliary_loss_clip": 0.01083834, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.03923869, + "balance_loss_mlp": 1.02056146, + "epoch": 0.7520216443709604, + "flos": 14575244209920.0, + "grad_norm": 1.7931867783569413, + "language_loss": 0.71366513, + "learning_rate": 6.10997688743631e-07, + "loss": 0.73482585, + "num_input_tokens_seen": 269787280, + "step": 12508, + "time_per_iteration": 2.695327043533325 + }, + { + "auxiliary_loss_clip": 0.01095204, + "auxiliary_loss_mlp": 0.01031492, + "balance_loss_clip": 1.03605807, + "balance_loss_mlp": 1.01884961, + "epoch": 0.7520817676236284, + "flos": 17056599644160.0, + "grad_norm": 1.7777789026239683, + "language_loss": 0.71897995, + "learning_rate": 6.107175006773885e-07, + "loss": 0.74024695, + "num_input_tokens_seen": 269805205, + "step": 12509, + "time_per_iteration": 2.649292230606079 + }, + { + "auxiliary_loss_clip": 0.01116565, + "auxiliary_loss_mlp": 0.01036188, + "balance_loss_clip": 1.04018068, + "balance_loss_mlp": 1.02252054, + "epoch": 0.7521418908762965, + "flos": 25666397936640.0, + "grad_norm": 1.6131110543422247, + "language_loss": 0.62129647, + "learning_rate": 6.104373652928785e-07, + "loss": 0.64282399, + "num_input_tokens_seen": 269824820, + "step": 12510, + "time_per_iteration": 2.5948257446289062 + }, + { + "auxiliary_loss_clip": 0.01097666, + "auxiliary_loss_mlp": 0.01031956, + "balance_loss_clip": 1.03866065, + "balance_loss_mlp": 1.01975513, + "epoch": 0.7522020141289644, + "flos": 20886759711360.0, + "grad_norm": 3.9854305674745474, + "language_loss": 0.81469762, + "learning_rate": 6.10157282600722e-07, + "loss": 0.83599389, + "num_input_tokens_seen": 269842825, + "step": 12511, + "time_per_iteration": 2.6505610942840576 + }, + { + "auxiliary_loss_clip": 0.01087038, + "auxiliary_loss_mlp": 0.01038744, + "balance_loss_clip": 1.03666866, + "balance_loss_mlp": 1.02523184, + "epoch": 0.7522621373816324, + "flos": 12640305444480.0, + "grad_norm": 1.8827619698116422, + "language_loss": 0.75637031, + "learning_rate": 6.098772526115412e-07, + "loss": 0.77762812, + "num_input_tokens_seen": 269859000, + "step": 12512, + "time_per_iteration": 2.647817850112915 + }, + { + "auxiliary_loss_clip": 0.01093893, + "auxiliary_loss_mlp": 0.01030376, + "balance_loss_clip": 1.03682494, + "balance_loss_mlp": 1.01915812, + "epoch": 0.7523222606343003, + "flos": 25626141768960.0, + "grad_norm": 1.9631337780364373, + "language_loss": 0.82056046, + "learning_rate": 6.095972753359537e-07, + "loss": 0.84180307, + "num_input_tokens_seen": 269878895, + "step": 12513, + "time_per_iteration": 2.6645336151123047 + }, + { + "auxiliary_loss_clip": 0.01097529, + "auxiliary_loss_mlp": 0.01037828, + "balance_loss_clip": 1.03800118, + "balance_loss_mlp": 1.02469754, + "epoch": 0.7523823838869683, + "flos": 20448900921600.0, + "grad_norm": 2.0970776368608846, + "language_loss": 0.74827564, + "learning_rate": 6.093173507845771e-07, + "loss": 0.76962924, + "num_input_tokens_seen": 269897280, + "step": 12514, + "time_per_iteration": 2.617037057876587 + }, + { + "auxiliary_loss_clip": 0.01090674, + "auxiliary_loss_mlp": 0.01031924, + "balance_loss_clip": 1.03809762, + "balance_loss_mlp": 1.02052188, + "epoch": 0.7524425071396362, + "flos": 14720610551040.0, + "grad_norm": 3.0939358054724146, + "language_loss": 0.68892944, + "learning_rate": 6.090374789680271e-07, + "loss": 0.71015543, + "num_input_tokens_seen": 269914640, + "step": 12515, + "time_per_iteration": 2.59306001663208 + }, + { + "auxiliary_loss_clip": 0.01100231, + "auxiliary_loss_mlp": 0.0103411, + "balance_loss_clip": 1.03811383, + "balance_loss_mlp": 1.02207565, + "epoch": 0.7525026303923043, + "flos": 30592048947840.0, + "grad_norm": 2.040446314697398, + "language_loss": 0.69929761, + "learning_rate": 6.087576598969137e-07, + "loss": 0.72064102, + "num_input_tokens_seen": 269934960, + "step": 12516, + "time_per_iteration": 2.6960794925689697 + }, + { + "auxiliary_loss_clip": 0.0106292, + "auxiliary_loss_mlp": 0.01032821, + "balance_loss_clip": 1.03736663, + "balance_loss_mlp": 1.020751, + "epoch": 0.7525627536449722, + "flos": 24791757765120.0, + "grad_norm": 1.5503440947431564, + "language_loss": 0.89659667, + "learning_rate": 6.084778935818495e-07, + "loss": 0.91755402, + "num_input_tokens_seen": 269956655, + "step": 12517, + "time_per_iteration": 2.9062299728393555 + }, + { + "auxiliary_loss_clip": 0.0108799, + "auxiliary_loss_mlp": 0.01032916, + "balance_loss_clip": 1.036955, + "balance_loss_mlp": 1.02054834, + "epoch": 0.7526228768976402, + "flos": 20779782030720.0, + "grad_norm": 1.60812028776888, + "language_loss": 0.74420178, + "learning_rate": 6.081981800334437e-07, + "loss": 0.76541078, + "num_input_tokens_seen": 269976835, + "step": 12518, + "time_per_iteration": 2.9830613136291504 + }, + { + "auxiliary_loss_clip": 0.00997374, + "auxiliary_loss_mlp": 0.01010959, + "balance_loss_clip": 1.01711154, + "balance_loss_mlp": 1.0097965, + "epoch": 0.7526830001503081, + "flos": 66559243703040.0, + "grad_norm": 0.708684039109314, + "language_loss": 0.55700099, + "learning_rate": 6.079185192623017e-07, + "loss": 0.5770843, + "num_input_tokens_seen": 270040630, + "step": 12519, + "time_per_iteration": 3.3146941661834717 + }, + { + "auxiliary_loss_clip": 0.01093149, + "auxiliary_loss_mlp": 0.01034827, + "balance_loss_clip": 1.0377202, + "balance_loss_mlp": 1.0233829, + "epoch": 0.7527431234029761, + "flos": 23477894087040.0, + "grad_norm": 1.471289032229335, + "language_loss": 0.77771223, + "learning_rate": 6.07638911279029e-07, + "loss": 0.79899204, + "num_input_tokens_seen": 270059695, + "step": 12520, + "time_per_iteration": 2.6884288787841797 + }, + { + "auxiliary_loss_clip": 0.01092157, + "auxiliary_loss_mlp": 0.01040045, + "balance_loss_clip": 1.0348748, + "balance_loss_mlp": 1.02787995, + "epoch": 0.752803246655644, + "flos": 22049546785920.0, + "grad_norm": 1.9875940404305874, + "language_loss": 0.73850435, + "learning_rate": 6.07359356094229e-07, + "loss": 0.75982636, + "num_input_tokens_seen": 270078420, + "step": 12521, + "time_per_iteration": 2.6750216484069824 + }, + { + "auxiliary_loss_clip": 0.01088057, + "auxiliary_loss_mlp": 0.01037596, + "balance_loss_clip": 1.03908432, + "balance_loss_mlp": 1.02416718, + "epoch": 0.752863369908312, + "flos": 30153795108480.0, + "grad_norm": 2.5051463409980634, + "language_loss": 0.67080051, + "learning_rate": 6.070798537185016e-07, + "loss": 0.69205701, + "num_input_tokens_seen": 270097040, + "step": 12522, + "time_per_iteration": 2.772545576095581 + }, + { + "auxiliary_loss_clip": 0.01101858, + "auxiliary_loss_mlp": 0.01042231, + "balance_loss_clip": 1.0390172, + "balance_loss_mlp": 1.02954745, + "epoch": 0.7529234931609801, + "flos": 24567638855040.0, + "grad_norm": 1.9325900284520732, + "language_loss": 0.78271604, + "learning_rate": 6.068004041624453e-07, + "loss": 0.8041569, + "num_input_tokens_seen": 270116365, + "step": 12523, + "time_per_iteration": 2.5928404331207275 + }, + { + "auxiliary_loss_clip": 0.01107861, + "auxiliary_loss_mlp": 0.01033051, + "balance_loss_clip": 1.03753757, + "balance_loss_mlp": 1.02056384, + "epoch": 0.752983616413648, + "flos": 23112395245440.0, + "grad_norm": 1.9643840273203326, + "language_loss": 0.80548674, + "learning_rate": 6.065210074366571e-07, + "loss": 0.82689583, + "num_input_tokens_seen": 270135395, + "step": 12524, + "time_per_iteration": 2.5654656887054443 + }, + { + "auxiliary_loss_clip": 0.01100021, + "auxiliary_loss_mlp": 0.00769024, + "balance_loss_clip": 1.03862953, + "balance_loss_mlp": 1.00022125, + "epoch": 0.753043739666316, + "flos": 24316946858880.0, + "grad_norm": 1.7390985823733704, + "language_loss": 0.74004686, + "learning_rate": 6.062416635517326e-07, + "loss": 0.75873733, + "num_input_tokens_seen": 270156425, + "step": 12525, + "time_per_iteration": 2.629235029220581 + }, + { + "auxiliary_loss_clip": 0.01076975, + "auxiliary_loss_mlp": 0.01030405, + "balance_loss_clip": 1.03678906, + "balance_loss_mlp": 1.01844287, + "epoch": 0.7531038629189839, + "flos": 24243294021120.0, + "grad_norm": 1.8793498338294334, + "language_loss": 0.72428775, + "learning_rate": 6.059623725182641e-07, + "loss": 0.74536157, + "num_input_tokens_seen": 270176905, + "step": 12526, + "time_per_iteration": 2.7281389236450195 + }, + { + "auxiliary_loss_clip": 0.01088063, + "auxiliary_loss_mlp": 0.01028205, + "balance_loss_clip": 1.03752398, + "balance_loss_mlp": 1.01674342, + "epoch": 0.7531639861716519, + "flos": 30188807890560.0, + "grad_norm": 1.5986018665355572, + "language_loss": 0.72446048, + "learning_rate": 6.056831343468414e-07, + "loss": 0.74562311, + "num_input_tokens_seen": 270196640, + "step": 12527, + "time_per_iteration": 2.765742301940918 + }, + { + "auxiliary_loss_clip": 0.01077327, + "auxiliary_loss_mlp": 0.01027333, + "balance_loss_clip": 1.03892338, + "balance_loss_mlp": 1.01588261, + "epoch": 0.7532241094243198, + "flos": 18223193560320.0, + "grad_norm": 1.7490164070315937, + "language_loss": 0.81002724, + "learning_rate": 6.054039490480539e-07, + "loss": 0.83107388, + "num_input_tokens_seen": 270213905, + "step": 12528, + "time_per_iteration": 2.8258893489837646 + }, + { + "auxiliary_loss_clip": 0.01062731, + "auxiliary_loss_mlp": 0.01037431, + "balance_loss_clip": 1.04194808, + "balance_loss_mlp": 1.02391267, + "epoch": 0.7532842326769879, + "flos": 20881049448960.0, + "grad_norm": 2.0737257705998084, + "language_loss": 0.84989285, + "learning_rate": 6.051248166324892e-07, + "loss": 0.87089443, + "num_input_tokens_seen": 270231995, + "step": 12529, + "time_per_iteration": 2.8930623531341553 + }, + { + "auxiliary_loss_clip": 0.01084647, + "auxiliary_loss_mlp": 0.01037761, + "balance_loss_clip": 1.04127479, + "balance_loss_mlp": 1.02479124, + "epoch": 0.7533443559296558, + "flos": 18078689145600.0, + "grad_norm": 1.9050070504159882, + "language_loss": 0.73877907, + "learning_rate": 6.048457371107303e-07, + "loss": 0.76000321, + "num_input_tokens_seen": 270251480, + "step": 12530, + "time_per_iteration": 2.765109062194824 + }, + { + "auxiliary_loss_clip": 0.0098332, + "auxiliary_loss_mlp": 0.01008335, + "balance_loss_clip": 1.01471329, + "balance_loss_mlp": 1.00720811, + "epoch": 0.7534044791823238, + "flos": 50254830766080.0, + "grad_norm": 0.8264532859334547, + "language_loss": 0.63601089, + "learning_rate": 6.045667104933612e-07, + "loss": 0.65592742, + "num_input_tokens_seen": 270306480, + "step": 12531, + "time_per_iteration": 3.203054428100586 + }, + { + "auxiliary_loss_clip": 0.01090436, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.03936112, + "balance_loss_mlp": 1.01770902, + "epoch": 0.7534646024349917, + "flos": 20850274471680.0, + "grad_norm": 2.3022787240399087, + "language_loss": 0.69915926, + "learning_rate": 6.042877367909633e-07, + "loss": 0.72037345, + "num_input_tokens_seen": 270324595, + "step": 12532, + "time_per_iteration": 2.8513519763946533 + }, + { + "auxiliary_loss_clip": 0.01080734, + "auxiliary_loss_mlp": 0.010295, + "balance_loss_clip": 1.0378058, + "balance_loss_mlp": 1.01846147, + "epoch": 0.7535247256876597, + "flos": 23071779941760.0, + "grad_norm": 1.6087653356009437, + "language_loss": 0.77676719, + "learning_rate": 6.040088160141132e-07, + "loss": 0.79786956, + "num_input_tokens_seen": 270344375, + "step": 12533, + "time_per_iteration": 5.849594831466675 + }, + { + "auxiliary_loss_clip": 0.01019649, + "auxiliary_loss_mlp": 0.01000792, + "balance_loss_clip": 1.00604045, + "balance_loss_mlp": 0.99969578, + "epoch": 0.7535848489403276, + "flos": 58623418252800.0, + "grad_norm": 0.7831538235922403, + "language_loss": 0.57278597, + "learning_rate": 6.037299481733886e-07, + "loss": 0.5929904, + "num_input_tokens_seen": 270405235, + "step": 12534, + "time_per_iteration": 4.745975494384766 + }, + { + "auxiliary_loss_clip": 0.01087528, + "auxiliary_loss_mlp": 0.01028431, + "balance_loss_clip": 1.03641176, + "balance_loss_mlp": 1.01590824, + "epoch": 0.7536449721929956, + "flos": 26577882483840.0, + "grad_norm": 1.758420059943407, + "language_loss": 0.71251357, + "learning_rate": 6.03451133279365e-07, + "loss": 0.73367316, + "num_input_tokens_seen": 270425820, + "step": 12535, + "time_per_iteration": 4.465839862823486 + }, + { + "auxiliary_loss_clip": 0.01084954, + "auxiliary_loss_mlp": 0.01032093, + "balance_loss_clip": 1.0328145, + "balance_loss_mlp": 1.01900923, + "epoch": 0.7537050954456637, + "flos": 25735992537600.0, + "grad_norm": 1.6235192895129946, + "language_loss": 0.80976534, + "learning_rate": 6.031723713426135e-07, + "loss": 0.83093584, + "num_input_tokens_seen": 270447120, + "step": 12536, + "time_per_iteration": 2.8644282817840576 + }, + { + "auxiliary_loss_clip": 0.01075788, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.03380179, + "balance_loss_mlp": 1.02025628, + "epoch": 0.7537652186983316, + "flos": 30224431203840.0, + "grad_norm": 2.1219622248663095, + "language_loss": 0.74480766, + "learning_rate": 6.028936623737067e-07, + "loss": 0.76588988, + "num_input_tokens_seen": 270468680, + "step": 12537, + "time_per_iteration": 2.8825693130493164 + }, + { + "auxiliary_loss_clip": 0.01110837, + "auxiliary_loss_mlp": 0.01034517, + "balance_loss_clip": 1.03765774, + "balance_loss_mlp": 1.0218091, + "epoch": 0.7538253419509996, + "flos": 12641239198080.0, + "grad_norm": 1.7916470224859762, + "language_loss": 0.74127239, + "learning_rate": 6.026150063832111e-07, + "loss": 0.76272595, + "num_input_tokens_seen": 270486310, + "step": 12538, + "time_per_iteration": 2.6497671604156494 + }, + { + "auxiliary_loss_clip": 0.0107304, + "auxiliary_loss_mlp": 0.01037868, + "balance_loss_clip": 1.03775454, + "balance_loss_mlp": 1.02487969, + "epoch": 0.7538854652036675, + "flos": 23185976256000.0, + "grad_norm": 1.6497097895252697, + "language_loss": 0.67839807, + "learning_rate": 6.023364033816956e-07, + "loss": 0.69950712, + "num_input_tokens_seen": 270507210, + "step": 12539, + "time_per_iteration": 4.390820503234863 + }, + { + "auxiliary_loss_clip": 0.01109728, + "auxiliary_loss_mlp": 0.01030908, + "balance_loss_clip": 1.03869367, + "balance_loss_mlp": 1.01831353, + "epoch": 0.7539455884563355, + "flos": 23186227651200.0, + "grad_norm": 1.7887923247322153, + "language_loss": 0.74677789, + "learning_rate": 6.020578533797229e-07, + "loss": 0.76818419, + "num_input_tokens_seen": 270525250, + "step": 12540, + "time_per_iteration": 2.6644110679626465 + }, + { + "auxiliary_loss_clip": 0.01112068, + "auxiliary_loss_mlp": 0.01031348, + "balance_loss_clip": 1.03821325, + "balance_loss_mlp": 1.01833093, + "epoch": 0.7540057117090034, + "flos": 13181155505280.0, + "grad_norm": 2.2413467917064844, + "language_loss": 0.72496325, + "learning_rate": 6.017793563878566e-07, + "loss": 0.74639738, + "num_input_tokens_seen": 270539295, + "step": 12541, + "time_per_iteration": 2.6159961223602295 + }, + { + "auxiliary_loss_clip": 0.0110964, + "auxiliary_loss_mlp": 0.01031618, + "balance_loss_clip": 1.03806591, + "balance_loss_mlp": 1.0190115, + "epoch": 0.7540658349616715, + "flos": 45478134478080.0, + "grad_norm": 1.701926826906392, + "language_loss": 0.72403926, + "learning_rate": 6.015009124166576e-07, + "loss": 0.74545187, + "num_input_tokens_seen": 270562815, + "step": 12542, + "time_per_iteration": 2.8387362957000732 + }, + { + "auxiliary_loss_clip": 0.01085175, + "auxiliary_loss_mlp": 0.0102804, + "balance_loss_clip": 1.03588843, + "balance_loss_mlp": 1.01508224, + "epoch": 0.7541259582143394, + "flos": 19930817105280.0, + "grad_norm": 2.526006786337045, + "language_loss": 0.8460182, + "learning_rate": 6.012225214766844e-07, + "loss": 0.86715031, + "num_input_tokens_seen": 270579055, + "step": 12543, + "time_per_iteration": 2.6901259422302246 + }, + { + "auxiliary_loss_clip": 0.01077755, + "auxiliary_loss_mlp": 0.01034905, + "balance_loss_clip": 1.04070735, + "balance_loss_mlp": 1.02253056, + "epoch": 0.7541860814670074, + "flos": 27198239299200.0, + "grad_norm": 2.1653550809548587, + "language_loss": 0.73906153, + "learning_rate": 6.009441835784927e-07, + "loss": 0.7601881, + "num_input_tokens_seen": 270599080, + "step": 12544, + "time_per_iteration": 2.729667901992798 + }, + { + "auxiliary_loss_clip": 0.0109777, + "auxiliary_loss_mlp": 0.01030811, + "balance_loss_clip": 1.03749204, + "balance_loss_mlp": 1.01909888, + "epoch": 0.7542462047196753, + "flos": 21324151624320.0, + "grad_norm": 1.9325798259203488, + "language_loss": 0.6805954, + "learning_rate": 6.006658987326383e-07, + "loss": 0.70188129, + "num_input_tokens_seen": 270618715, + "step": 12545, + "time_per_iteration": 2.6119935512542725 + }, + { + "auxiliary_loss_clip": 0.01085784, + "auxiliary_loss_mlp": 0.01033147, + "balance_loss_clip": 1.03426456, + "balance_loss_mlp": 1.0204457, + "epoch": 0.7543063279723433, + "flos": 11940944664960.0, + "grad_norm": 1.8867589100270292, + "language_loss": 0.68448645, + "learning_rate": 6.003876669496728e-07, + "loss": 0.70567578, + "num_input_tokens_seen": 270635695, + "step": 12546, + "time_per_iteration": 2.644932270050049 + }, + { + "auxiliary_loss_clip": 0.01096622, + "auxiliary_loss_mlp": 0.01036368, + "balance_loss_clip": 1.03690362, + "balance_loss_mlp": 1.02293336, + "epoch": 0.7543664512250112, + "flos": 22819974624000.0, + "grad_norm": 2.226922026836887, + "language_loss": 0.73148012, + "learning_rate": 6.00109488240147e-07, + "loss": 0.75281006, + "num_input_tokens_seen": 270654325, + "step": 12547, + "time_per_iteration": 2.592843770980835 + }, + { + "auxiliary_loss_clip": 0.0110976, + "auxiliary_loss_mlp": 0.01029553, + "balance_loss_clip": 1.037709, + "balance_loss_mlp": 1.01641619, + "epoch": 0.7544265744776792, + "flos": 20923855482240.0, + "grad_norm": 2.152338960632508, + "language_loss": 0.67440069, + "learning_rate": 5.998313626146099e-07, + "loss": 0.69579387, + "num_input_tokens_seen": 270674260, + "step": 12548, + "time_per_iteration": 2.646831750869751 + }, + { + "auxiliary_loss_clip": 0.01090643, + "auxiliary_loss_mlp": 0.01034531, + "balance_loss_clip": 1.03753376, + "balance_loss_mlp": 1.02168059, + "epoch": 0.7544866977303473, + "flos": 15195493284480.0, + "grad_norm": 1.8439150079595696, + "language_loss": 0.87032682, + "learning_rate": 5.995532900836088e-07, + "loss": 0.89157856, + "num_input_tokens_seen": 270692200, + "step": 12549, + "time_per_iteration": 2.73703932762146 + }, + { + "auxiliary_loss_clip": 0.01062401, + "auxiliary_loss_mlp": 0.0103417, + "balance_loss_clip": 1.03635311, + "balance_loss_mlp": 1.02223134, + "epoch": 0.7545468209830152, + "flos": 27083683848960.0, + "grad_norm": 1.964347561010599, + "language_loss": 0.77038085, + "learning_rate": 5.992752706576865e-07, + "loss": 0.79134655, + "num_input_tokens_seen": 270709675, + "step": 12550, + "time_per_iteration": 2.7760634422302246 + }, + { + "auxiliary_loss_clip": 0.01110423, + "auxiliary_loss_mlp": 0.01024865, + "balance_loss_clip": 1.03772533, + "balance_loss_mlp": 1.01295626, + "epoch": 0.7546069442356832, + "flos": 26871703735680.0, + "grad_norm": 1.48969324659374, + "language_loss": 0.69521177, + "learning_rate": 5.98997304347386e-07, + "loss": 0.71656471, + "num_input_tokens_seen": 270733055, + "step": 12551, + "time_per_iteration": 2.612513303756714 + }, + { + "auxiliary_loss_clip": 0.0108872, + "auxiliary_loss_mlp": 0.01029008, + "balance_loss_clip": 1.03803182, + "balance_loss_mlp": 1.01590717, + "epoch": 0.7546670674883511, + "flos": 15743131015680.0, + "grad_norm": 1.9528134557769512, + "language_loss": 0.86114484, + "learning_rate": 5.987193911632487e-07, + "loss": 0.88232207, + "num_input_tokens_seen": 270749275, + "step": 12552, + "time_per_iteration": 2.7746293544769287 + }, + { + "auxiliary_loss_clip": 0.0110308, + "auxiliary_loss_mlp": 0.01032898, + "balance_loss_clip": 1.03883934, + "balance_loss_mlp": 1.02059603, + "epoch": 0.7547271907410191, + "flos": 23477714519040.0, + "grad_norm": 1.7307464295257877, + "language_loss": 0.78382206, + "learning_rate": 5.98441531115812e-07, + "loss": 0.8051818, + "num_input_tokens_seen": 270768230, + "step": 12553, + "time_per_iteration": 2.7080695629119873 + }, + { + "auxiliary_loss_clip": 0.01099832, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.0393219, + "balance_loss_mlp": 1.0227654, + "epoch": 0.754787313993687, + "flos": 31722804069120.0, + "grad_norm": 2.043637353991968, + "language_loss": 0.62419349, + "learning_rate": 5.981637242156135e-07, + "loss": 0.64554828, + "num_input_tokens_seen": 270786285, + "step": 12554, + "time_per_iteration": 2.6828603744506836 + }, + { + "auxiliary_loss_clip": 0.01087132, + "auxiliary_loss_mlp": 0.01036641, + "balance_loss_clip": 1.03482223, + "balance_loss_mlp": 1.02456522, + "epoch": 0.7548474372463551, + "flos": 27563055782400.0, + "grad_norm": 1.8726381797429124, + "language_loss": 0.73138636, + "learning_rate": 5.978859704731864e-07, + "loss": 0.75262409, + "num_input_tokens_seen": 270805505, + "step": 12555, + "time_per_iteration": 2.765606164932251 + }, + { + "auxiliary_loss_clip": 0.01089159, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.04132962, + "balance_loss_mlp": 1.0199343, + "epoch": 0.754907560499023, + "flos": 19318576763520.0, + "grad_norm": 1.687430506523416, + "language_loss": 0.78570682, + "learning_rate": 5.976082698990645e-07, + "loss": 0.80692875, + "num_input_tokens_seen": 270824610, + "step": 12556, + "time_per_iteration": 2.7887120246887207 + }, + { + "auxiliary_loss_clip": 0.0102254, + "auxiliary_loss_mlp": 0.01000624, + "balance_loss_clip": 1.00953579, + "balance_loss_mlp": 0.99957508, + "epoch": 0.754967683751691, + "flos": 69744628684800.0, + "grad_norm": 0.7056309097064257, + "language_loss": 0.50379604, + "learning_rate": 5.973306225037769e-07, + "loss": 0.52402771, + "num_input_tokens_seen": 270886155, + "step": 12557, + "time_per_iteration": 3.15433931350708 + }, + { + "auxiliary_loss_clip": 0.01101663, + "auxiliary_loss_mlp": 0.0103538, + "balance_loss_clip": 1.0402422, + "balance_loss_mlp": 1.02214742, + "epoch": 0.7550278070043589, + "flos": 24421913377920.0, + "grad_norm": 1.864770698097121, + "language_loss": 0.71454239, + "learning_rate": 5.970530282978525e-07, + "loss": 0.7359128, + "num_input_tokens_seen": 270905325, + "step": 12558, + "time_per_iteration": 2.6398966312408447 + }, + { + "auxiliary_loss_clip": 0.01086077, + "auxiliary_loss_mlp": 0.01039687, + "balance_loss_clip": 1.03605294, + "balance_loss_mlp": 1.02564383, + "epoch": 0.7550879302570269, + "flos": 32634611838720.0, + "grad_norm": 1.9214211385606932, + "language_loss": 0.80440283, + "learning_rate": 5.967754872918187e-07, + "loss": 0.82566047, + "num_input_tokens_seen": 270927535, + "step": 12559, + "time_per_iteration": 2.774087905883789 + }, + { + "auxiliary_loss_clip": 0.01064062, + "auxiliary_loss_mlp": 0.01030442, + "balance_loss_clip": 1.03735518, + "balance_loss_mlp": 1.01727533, + "epoch": 0.7551480535096948, + "flos": 21795550738560.0, + "grad_norm": 1.681888372687875, + "language_loss": 0.78732002, + "learning_rate": 5.96497999496199e-07, + "loss": 0.80826509, + "num_input_tokens_seen": 270946920, + "step": 12560, + "time_per_iteration": 2.773224115371704 + }, + { + "auxiliary_loss_clip": 0.01059602, + "auxiliary_loss_mlp": 0.01042887, + "balance_loss_clip": 1.03382421, + "balance_loss_mlp": 1.0288794, + "epoch": 0.7552081767623628, + "flos": 18515111391360.0, + "grad_norm": 2.5084045238772625, + "language_loss": 0.70966113, + "learning_rate": 5.96220564921515e-07, + "loss": 0.73068601, + "num_input_tokens_seen": 270965705, + "step": 12561, + "time_per_iteration": 2.7290430068969727 + }, + { + "auxiliary_loss_clip": 0.01084123, + "auxiliary_loss_mlp": 0.0077333, + "balance_loss_clip": 1.03486896, + "balance_loss_mlp": 1.00013804, + "epoch": 0.7552683000150308, + "flos": 27634805199360.0, + "grad_norm": 1.645858172778927, + "language_loss": 0.7574439, + "learning_rate": 5.959431835782889e-07, + "loss": 0.7760185, + "num_input_tokens_seen": 270986550, + "step": 12562, + "time_per_iteration": 2.766808032989502 + }, + { + "auxiliary_loss_clip": 0.01084916, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.03713727, + "balance_loss_mlp": 1.01534379, + "epoch": 0.7553284232676988, + "flos": 20302924049280.0, + "grad_norm": 1.8387284199108043, + "language_loss": 0.76086068, + "learning_rate": 5.956658554770371e-07, + "loss": 0.78199327, + "num_input_tokens_seen": 271006250, + "step": 12563, + "time_per_iteration": 2.6442339420318604 + }, + { + "auxiliary_loss_clip": 0.01082317, + "auxiliary_loss_mlp": 0.01032838, + "balance_loss_clip": 1.03697193, + "balance_loss_mlp": 1.01750755, + "epoch": 0.7553885465203668, + "flos": 33255471444480.0, + "grad_norm": 2.643775015065329, + "language_loss": 0.67393947, + "learning_rate": 5.953885806282768e-07, + "loss": 0.69509107, + "num_input_tokens_seen": 271025575, + "step": 12564, + "time_per_iteration": 2.780668020248413 + }, + { + "auxiliary_loss_clip": 0.01084523, + "auxiliary_loss_mlp": 0.01036081, + "balance_loss_clip": 1.03688002, + "balance_loss_mlp": 1.02188349, + "epoch": 0.7554486697730347, + "flos": 21616249023360.0, + "grad_norm": 2.407953823392175, + "language_loss": 0.69013596, + "learning_rate": 5.951113590425228e-07, + "loss": 0.71134198, + "num_input_tokens_seen": 271045805, + "step": 12565, + "time_per_iteration": 2.665789842605591 + }, + { + "auxiliary_loss_clip": 0.01091959, + "auxiliary_loss_mlp": 0.01032729, + "balance_loss_clip": 1.03703809, + "balance_loss_mlp": 1.01887071, + "epoch": 0.7555087930257027, + "flos": 27632973605760.0, + "grad_norm": 1.874045640971064, + "language_loss": 0.75261271, + "learning_rate": 5.94834190730287e-07, + "loss": 0.77385962, + "num_input_tokens_seen": 271066065, + "step": 12566, + "time_per_iteration": 2.6897921562194824 + }, + { + "auxiliary_loss_clip": 0.01105994, + "auxiliary_loss_mlp": 0.01036283, + "balance_loss_clip": 1.03961587, + "balance_loss_mlp": 1.02240658, + "epoch": 0.7555689162783706, + "flos": 23621644316160.0, + "grad_norm": 2.19029922676856, + "language_loss": 0.73804015, + "learning_rate": 5.945570757020789e-07, + "loss": 0.75946295, + "num_input_tokens_seen": 271085870, + "step": 12567, + "time_per_iteration": 2.681082248687744 + }, + { + "auxiliary_loss_clip": 0.01112381, + "auxiliary_loss_mlp": 0.01028784, + "balance_loss_clip": 1.03940594, + "balance_loss_mlp": 1.01680374, + "epoch": 0.7556290395310387, + "flos": 24863076218880.0, + "grad_norm": 2.047451974712634, + "language_loss": 0.62868547, + "learning_rate": 5.942800139684073e-07, + "loss": 0.65009713, + "num_input_tokens_seen": 271104260, + "step": 12568, + "time_per_iteration": 2.663501739501953 + }, + { + "auxiliary_loss_clip": 0.0102291, + "auxiliary_loss_mlp": 0.01041785, + "balance_loss_clip": 1.03343916, + "balance_loss_mlp": 1.02825463, + "epoch": 0.7556891627837066, + "flos": 43543770330240.0, + "grad_norm": 1.8712587826927434, + "language_loss": 0.66730088, + "learning_rate": 5.940030055397789e-07, + "loss": 0.68794787, + "num_input_tokens_seen": 271125745, + "step": 12569, + "time_per_iteration": 3.4009649753570557 + }, + { + "auxiliary_loss_clip": 0.01104803, + "auxiliary_loss_mlp": 0.01036972, + "balance_loss_clip": 1.03995872, + "balance_loss_mlp": 1.02256608, + "epoch": 0.7557492860363746, + "flos": 26650924790400.0, + "grad_norm": 1.7459864458479233, + "language_loss": 0.67298895, + "learning_rate": 5.93726050426697e-07, + "loss": 0.69440669, + "num_input_tokens_seen": 271147145, + "step": 12570, + "time_per_iteration": 3.006865978240967 + }, + { + "auxiliary_loss_clip": 0.01112467, + "auxiliary_loss_mlp": 0.01035865, + "balance_loss_clip": 1.03923225, + "balance_loss_mlp": 1.02238834, + "epoch": 0.7558094092890425, + "flos": 55182885010560.0, + "grad_norm": 1.8543133954373656, + "language_loss": 0.71857494, + "learning_rate": 5.934491486396647e-07, + "loss": 0.74005824, + "num_input_tokens_seen": 271170865, + "step": 12571, + "time_per_iteration": 2.9743287563323975 + }, + { + "auxiliary_loss_clip": 0.01066938, + "auxiliary_loss_mlp": 0.01037876, + "balance_loss_clip": 1.03525424, + "balance_loss_mlp": 1.02339244, + "epoch": 0.7558695325417105, + "flos": 23988292392960.0, + "grad_norm": 1.8208269811999866, + "language_loss": 0.73415917, + "learning_rate": 5.931723001891811e-07, + "loss": 0.7552073, + "num_input_tokens_seen": 271191450, + "step": 12572, + "time_per_iteration": 2.819380044937134 + }, + { + "auxiliary_loss_clip": 0.0109252, + "auxiliary_loss_mlp": 0.01033068, + "balance_loss_clip": 1.04051542, + "balance_loss_mlp": 1.02049112, + "epoch": 0.7559296557943784, + "flos": 14611262572800.0, + "grad_norm": 2.0177969949137577, + "language_loss": 0.76612377, + "learning_rate": 5.928955050857456e-07, + "loss": 0.78737968, + "num_input_tokens_seen": 271207335, + "step": 12573, + "time_per_iteration": 4.514675617218018 + }, + { + "auxiliary_loss_clip": 0.01087889, + "auxiliary_loss_mlp": 0.01036063, + "balance_loss_clip": 1.04069138, + "balance_loss_mlp": 1.02323067, + "epoch": 0.7559897790470465, + "flos": 18550483309440.0, + "grad_norm": 1.5618514080613375, + "language_loss": 0.69153476, + "learning_rate": 5.926187633398527e-07, + "loss": 0.71277434, + "num_input_tokens_seen": 271226895, + "step": 12574, + "time_per_iteration": 4.325180530548096 + }, + { + "auxiliary_loss_clip": 0.0107305, + "auxiliary_loss_mlp": 0.01033848, + "balance_loss_clip": 1.03176165, + "balance_loss_mlp": 1.01988304, + "epoch": 0.7560499022997144, + "flos": 17967868709760.0, + "grad_norm": 2.3174142994510065, + "language_loss": 0.71567178, + "learning_rate": 5.923420749619974e-07, + "loss": 0.73674083, + "num_input_tokens_seen": 271244375, + "step": 12575, + "time_per_iteration": 4.343465805053711 + }, + { + "auxiliary_loss_clip": 0.0110949, + "auxiliary_loss_mlp": 0.00770549, + "balance_loss_clip": 1.03735065, + "balance_loss_mlp": 1.00018251, + "epoch": 0.7561100255523824, + "flos": 15737815802880.0, + "grad_norm": 2.1045282969153125, + "language_loss": 0.71783686, + "learning_rate": 5.92065439962673e-07, + "loss": 0.73663723, + "num_input_tokens_seen": 271259530, + "step": 12576, + "time_per_iteration": 2.6967074871063232 + }, + { + "auxiliary_loss_clip": 0.01076617, + "auxiliary_loss_mlp": 0.01031643, + "balance_loss_clip": 1.03790188, + "balance_loss_mlp": 1.01866078, + "epoch": 0.7561701488050504, + "flos": 15888102307200.0, + "grad_norm": 2.0401468166974857, + "language_loss": 0.67187804, + "learning_rate": 5.917888583523669e-07, + "loss": 0.69296062, + "num_input_tokens_seen": 271276835, + "step": 12577, + "time_per_iteration": 2.6873996257781982 + }, + { + "auxiliary_loss_clip": 0.01088122, + "auxiliary_loss_mlp": 0.01037075, + "balance_loss_clip": 1.03602171, + "balance_loss_mlp": 1.02463531, + "epoch": 0.7562302720577183, + "flos": 20339157893760.0, + "grad_norm": 1.8873015547804852, + "language_loss": 0.78041267, + "learning_rate": 5.915123301415685e-07, + "loss": 0.80166459, + "num_input_tokens_seen": 271296275, + "step": 12578, + "time_per_iteration": 4.377631664276123 + }, + { + "auxiliary_loss_clip": 0.01100787, + "auxiliary_loss_mlp": 0.01032965, + "balance_loss_clip": 1.03763413, + "balance_loss_mlp": 1.02016759, + "epoch": 0.7562903953103863, + "flos": 20812209033600.0, + "grad_norm": 1.6803508279416246, + "language_loss": 0.75802839, + "learning_rate": 5.912358553407641e-07, + "loss": 0.7793659, + "num_input_tokens_seen": 271315685, + "step": 12579, + "time_per_iteration": 2.778723955154419 + }, + { + "auxiliary_loss_clip": 0.01070667, + "auxiliary_loss_mlp": 0.01036269, + "balance_loss_clip": 1.03752732, + "balance_loss_mlp": 1.02198792, + "epoch": 0.7563505185630542, + "flos": 37596999484800.0, + "grad_norm": 2.5693429830085397, + "language_loss": 0.627738, + "learning_rate": 5.90959433960437e-07, + "loss": 0.64880729, + "num_input_tokens_seen": 271336790, + "step": 12580, + "time_per_iteration": 2.996838331222534 + }, + { + "auxiliary_loss_clip": 0.01067496, + "auxiliary_loss_mlp": 0.01033758, + "balance_loss_clip": 1.03585196, + "balance_loss_mlp": 1.02117586, + "epoch": 0.7564106418157223, + "flos": 20230995064320.0, + "grad_norm": 1.6306554766999415, + "language_loss": 0.74993187, + "learning_rate": 5.906830660110691e-07, + "loss": 0.77094436, + "num_input_tokens_seen": 271355470, + "step": 12581, + "time_per_iteration": 2.8892579078674316 + }, + { + "auxiliary_loss_clip": 0.01071961, + "auxiliary_loss_mlp": 0.01033537, + "balance_loss_clip": 1.03673053, + "balance_loss_mlp": 1.02031684, + "epoch": 0.7564707650683902, + "flos": 24754877475840.0, + "grad_norm": 1.6534906098525708, + "language_loss": 0.62473053, + "learning_rate": 5.904067515031412e-07, + "loss": 0.64578557, + "num_input_tokens_seen": 271375810, + "step": 12582, + "time_per_iteration": 2.78520131111145 + }, + { + "auxiliary_loss_clip": 0.01031417, + "auxiliary_loss_mlp": 0.0099978, + "balance_loss_clip": 1.00870466, + "balance_loss_mlp": 0.99880236, + "epoch": 0.7565308883210582, + "flos": 48530076433920.0, + "grad_norm": 0.9397092341612294, + "language_loss": 0.6060046, + "learning_rate": 5.901304904471307e-07, + "loss": 0.62631655, + "num_input_tokens_seen": 271424775, + "step": 12583, + "time_per_iteration": 2.9951171875 + }, + { + "auxiliary_loss_clip": 0.01084102, + "auxiliary_loss_mlp": 0.01041483, + "balance_loss_clip": 1.03840542, + "balance_loss_mlp": 1.02859008, + "epoch": 0.7565910115737261, + "flos": 12495082757760.0, + "grad_norm": 2.173716211625989, + "language_loss": 0.7912221, + "learning_rate": 5.898542828535125e-07, + "loss": 0.81247795, + "num_input_tokens_seen": 271440500, + "step": 12584, + "time_per_iteration": 2.724681854248047 + }, + { + "auxiliary_loss_clip": 0.01079444, + "auxiliary_loss_mlp": 0.01038295, + "balance_loss_clip": 1.03406775, + "balance_loss_mlp": 1.02354908, + "epoch": 0.7566511348263941, + "flos": 21173003193600.0, + "grad_norm": 2.334504412939606, + "language_loss": 0.77645278, + "learning_rate": 5.895781287327612e-07, + "loss": 0.79763019, + "num_input_tokens_seen": 271458180, + "step": 12585, + "time_per_iteration": 2.7006850242614746 + }, + { + "auxiliary_loss_clip": 0.01116119, + "auxiliary_loss_mlp": 0.01038399, + "balance_loss_clip": 1.04165065, + "balance_loss_mlp": 1.0249517, + "epoch": 0.756711258079062, + "flos": 21754827694080.0, + "grad_norm": 1.6643260913798816, + "language_loss": 0.83146328, + "learning_rate": 5.893020280953493e-07, + "loss": 0.85300845, + "num_input_tokens_seen": 271475730, + "step": 12586, + "time_per_iteration": 2.7549026012420654 + }, + { + "auxiliary_loss_clip": 0.01115138, + "auxiliary_loss_mlp": 0.01030771, + "balance_loss_clip": 1.04039466, + "balance_loss_mlp": 1.01833797, + "epoch": 0.75677138133173, + "flos": 22382905933440.0, + "grad_norm": 2.0582325962784207, + "language_loss": 0.83617753, + "learning_rate": 5.890259809517459e-07, + "loss": 0.85763657, + "num_input_tokens_seen": 271495030, + "step": 12587, + "time_per_iteration": 2.6982500553131104 + }, + { + "auxiliary_loss_clip": 0.01076996, + "auxiliary_loss_mlp": 0.01027662, + "balance_loss_clip": 1.03665411, + "balance_loss_mlp": 1.01509786, + "epoch": 0.756831504584398, + "flos": 22708974620160.0, + "grad_norm": 1.4789161114631317, + "language_loss": 0.71016109, + "learning_rate": 5.88749987312418e-07, + "loss": 0.73120773, + "num_input_tokens_seen": 271515355, + "step": 12588, + "time_per_iteration": 2.811058282852173 + }, + { + "auxiliary_loss_clip": 0.01113651, + "auxiliary_loss_mlp": 0.00770901, + "balance_loss_clip": 1.03982472, + "balance_loss_mlp": 1.00019073, + "epoch": 0.756891627837066, + "flos": 24098358643200.0, + "grad_norm": 1.7170735982642948, + "language_loss": 0.68827093, + "learning_rate": 5.884740471878327e-07, + "loss": 0.70711648, + "num_input_tokens_seen": 271535090, + "step": 12589, + "time_per_iteration": 2.668159008026123 + }, + { + "auxiliary_loss_clip": 0.01100202, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.03817892, + "balance_loss_mlp": 1.01629877, + "epoch": 0.756951751089734, + "flos": 19749001438080.0, + "grad_norm": 1.693382160425306, + "language_loss": 0.92356181, + "learning_rate": 5.881981605884522e-07, + "loss": 0.9448548, + "num_input_tokens_seen": 271551075, + "step": 12590, + "time_per_iteration": 2.737993001937866 + }, + { + "auxiliary_loss_clip": 0.01081772, + "auxiliary_loss_mlp": 0.01030735, + "balance_loss_clip": 1.03454733, + "balance_loss_mlp": 1.01852822, + "epoch": 0.7570118743424019, + "flos": 35079266551680.0, + "grad_norm": 2.1087448505355364, + "language_loss": 0.6530177, + "learning_rate": 5.879223275247391e-07, + "loss": 0.67414272, + "num_input_tokens_seen": 271571035, + "step": 12591, + "time_per_iteration": 2.836533308029175 + }, + { + "auxiliary_loss_clip": 0.01099676, + "auxiliary_loss_mlp": 0.01029683, + "balance_loss_clip": 1.03951907, + "balance_loss_mlp": 1.01828074, + "epoch": 0.7570719975950699, + "flos": 25594540778880.0, + "grad_norm": 10.362773010711903, + "language_loss": 0.73889554, + "learning_rate": 5.876465480071528e-07, + "loss": 0.76018918, + "num_input_tokens_seen": 271592950, + "step": 12592, + "time_per_iteration": 2.729740619659424 + }, + { + "auxiliary_loss_clip": 0.01100337, + "auxiliary_loss_mlp": 0.01036287, + "balance_loss_clip": 1.03738928, + "balance_loss_mlp": 1.02323985, + "epoch": 0.7571321208477378, + "flos": 10816223028480.0, + "grad_norm": 2.217401018900874, + "language_loss": 0.71442747, + "learning_rate": 5.873708220461522e-07, + "loss": 0.73579371, + "num_input_tokens_seen": 271608835, + "step": 12593, + "time_per_iteration": 2.684826135635376 + }, + { + "auxiliary_loss_clip": 0.01112155, + "auxiliary_loss_mlp": 0.01031949, + "balance_loss_clip": 1.03883767, + "balance_loss_mlp": 1.01900887, + "epoch": 0.7571922441004059, + "flos": 18260109763200.0, + "grad_norm": 2.277271762211562, + "language_loss": 0.66408104, + "learning_rate": 5.870951496521903e-07, + "loss": 0.68552208, + "num_input_tokens_seen": 271627730, + "step": 12594, + "time_per_iteration": 2.66044545173645 + }, + { + "auxiliary_loss_clip": 0.01081064, + "auxiliary_loss_mlp": 0.01034067, + "balance_loss_clip": 1.03765464, + "balance_loss_mlp": 1.02116287, + "epoch": 0.7572523673530738, + "flos": 22890502978560.0, + "grad_norm": 1.5512103327237567, + "language_loss": 0.80722225, + "learning_rate": 5.86819530835722e-07, + "loss": 0.82837361, + "num_input_tokens_seen": 271646415, + "step": 12595, + "time_per_iteration": 2.75352144241333 + }, + { + "auxiliary_loss_clip": 0.01078291, + "auxiliary_loss_mlp": 0.01034396, + "balance_loss_clip": 1.03972101, + "balance_loss_mlp": 1.02266574, + "epoch": 0.7573124906057418, + "flos": 20996323171200.0, + "grad_norm": 1.9894880322297872, + "language_loss": 0.71883428, + "learning_rate": 5.865439656071993e-07, + "loss": 0.73996115, + "num_input_tokens_seen": 271666240, + "step": 12596, + "time_per_iteration": 2.830939531326294 + }, + { + "auxiliary_loss_clip": 0.01013568, + "auxiliary_loss_mlp": 0.01033181, + "balance_loss_clip": 1.03547406, + "balance_loss_mlp": 1.02174306, + "epoch": 0.7573726138584097, + "flos": 20886292834560.0, + "grad_norm": 1.6646538422679886, + "language_loss": 0.80251002, + "learning_rate": 5.862684539770706e-07, + "loss": 0.82297754, + "num_input_tokens_seen": 271686370, + "step": 12597, + "time_per_iteration": 3.2770867347717285 + }, + { + "auxiliary_loss_clip": 0.01084183, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.04002273, + "balance_loss_mlp": 1.01700711, + "epoch": 0.7574327371110777, + "flos": 24530507170560.0, + "grad_norm": 2.8794945787689477, + "language_loss": 0.83217478, + "learning_rate": 5.859929959557835e-07, + "loss": 0.8533262, + "num_input_tokens_seen": 271705050, + "step": 12598, + "time_per_iteration": 3.5696053504943848 + }, + { + "auxiliary_loss_clip": 0.01083032, + "auxiliary_loss_mlp": 0.0102703, + "balance_loss_clip": 1.03725743, + "balance_loss_mlp": 1.01568758, + "epoch": 0.7574928603637456, + "flos": 23364523785600.0, + "grad_norm": 1.9599324451937288, + "language_loss": 0.62513769, + "learning_rate": 5.857175915537845e-07, + "loss": 0.64623827, + "num_input_tokens_seen": 271724915, + "step": 12599, + "time_per_iteration": 2.9659054279327393 + }, + { + "auxiliary_loss_clip": 0.01087639, + "auxiliary_loss_mlp": 0.00772119, + "balance_loss_clip": 1.03743839, + "balance_loss_mlp": 1.00022399, + "epoch": 0.7575529836164137, + "flos": 13516274419200.0, + "grad_norm": 2.6514435576773767, + "language_loss": 0.63275111, + "learning_rate": 5.854422407815161e-07, + "loss": 0.65134871, + "num_input_tokens_seen": 271742410, + "step": 12600, + "time_per_iteration": 2.761671304702759 + }, + { + "auxiliary_loss_clip": 0.01081508, + "auxiliary_loss_mlp": 0.010341, + "balance_loss_clip": 1.03465056, + "balance_loss_mlp": 1.01968765, + "epoch": 0.7576131068690816, + "flos": 19646584784640.0, + "grad_norm": 1.9759732214873023, + "language_loss": 0.66604817, + "learning_rate": 5.851669436494191e-07, + "loss": 0.68720412, + "num_input_tokens_seen": 271761425, + "step": 12601, + "time_per_iteration": 2.8752126693725586 + }, + { + "auxiliary_loss_clip": 0.01081767, + "auxiliary_loss_mlp": 0.01030188, + "balance_loss_clip": 1.03683853, + "balance_loss_mlp": 1.01856518, + "epoch": 0.7576732301217496, + "flos": 20048245643520.0, + "grad_norm": 1.862908723746181, + "language_loss": 0.6777848, + "learning_rate": 5.848917001679335e-07, + "loss": 0.69890434, + "num_input_tokens_seen": 271780875, + "step": 12602, + "time_per_iteration": 2.7810614109039307 + }, + { + "auxiliary_loss_clip": 0.01102089, + "auxiliary_loss_mlp": 0.01035949, + "balance_loss_clip": 1.03889537, + "balance_loss_mlp": 1.02206695, + "epoch": 0.7577333533744176, + "flos": 15377093470080.0, + "grad_norm": 3.3133966859560138, + "language_loss": 0.67229289, + "learning_rate": 5.846165103474967e-07, + "loss": 0.69367325, + "num_input_tokens_seen": 271799490, + "step": 12603, + "time_per_iteration": 2.677644968032837 + }, + { + "auxiliary_loss_clip": 0.01086121, + "auxiliary_loss_mlp": 0.01033317, + "balance_loss_clip": 1.03466463, + "balance_loss_mlp": 1.02153969, + "epoch": 0.7577934766270855, + "flos": 17894862316800.0, + "grad_norm": 2.091164920728678, + "language_loss": 0.61460161, + "learning_rate": 5.843413741985439e-07, + "loss": 0.63579607, + "num_input_tokens_seen": 271817040, + "step": 12604, + "time_per_iteration": 2.690556287765503 + }, + { + "auxiliary_loss_clip": 0.01113132, + "auxiliary_loss_mlp": 0.01037248, + "balance_loss_clip": 1.04157591, + "balance_loss_mlp": 1.0240519, + "epoch": 0.7578535998797535, + "flos": 21613770984960.0, + "grad_norm": 1.860643993925951, + "language_loss": 0.79847634, + "learning_rate": 5.840662917315076e-07, + "loss": 0.81998014, + "num_input_tokens_seen": 271835480, + "step": 12605, + "time_per_iteration": 2.650987148284912 + }, + { + "auxiliary_loss_clip": 0.01114865, + "auxiliary_loss_mlp": 0.01030793, + "balance_loss_clip": 1.03956521, + "balance_loss_mlp": 1.01750159, + "epoch": 0.7579137231324214, + "flos": 18478374756480.0, + "grad_norm": 2.6305225547150286, + "language_loss": 0.79649675, + "learning_rate": 5.837912629568198e-07, + "loss": 0.81795335, + "num_input_tokens_seen": 271849835, + "step": 12606, + "time_per_iteration": 2.6179733276367188 + }, + { + "auxiliary_loss_clip": 0.01094397, + "auxiliary_loss_mlp": 0.01030813, + "balance_loss_clip": 1.03664708, + "balance_loss_mlp": 1.01947641, + "epoch": 0.7579738463850895, + "flos": 23255032152960.0, + "grad_norm": 1.422559911510894, + "language_loss": 0.73040879, + "learning_rate": 5.835162878849087e-07, + "loss": 0.75166082, + "num_input_tokens_seen": 271869560, + "step": 12607, + "time_per_iteration": 2.660883903503418 + }, + { + "auxiliary_loss_clip": 0.01085893, + "auxiliary_loss_mlp": 0.01032003, + "balance_loss_clip": 1.03795099, + "balance_loss_mlp": 1.01872361, + "epoch": 0.7580339696377574, + "flos": 14027031861120.0, + "grad_norm": 1.8402667548029668, + "language_loss": 0.75154114, + "learning_rate": 5.83241366526202e-07, + "loss": 0.7727201, + "num_input_tokens_seen": 271887950, + "step": 12608, + "time_per_iteration": 2.7164134979248047 + }, + { + "auxiliary_loss_clip": 0.01074571, + "auxiliary_loss_mlp": 0.00770045, + "balance_loss_clip": 1.0365268, + "balance_loss_mlp": 1.00018573, + "epoch": 0.7580940928904254, + "flos": 25082777756160.0, + "grad_norm": 1.7434049205366062, + "language_loss": 0.71609342, + "learning_rate": 5.829664988911245e-07, + "loss": 0.73453957, + "num_input_tokens_seen": 271907700, + "step": 12609, + "time_per_iteration": 2.788742780685425 + }, + { + "auxiliary_loss_clip": 0.0111318, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.0384202, + "balance_loss_mlp": 1.02005589, + "epoch": 0.7581542161430933, + "flos": 23836425690240.0, + "grad_norm": 1.6307456106692844, + "language_loss": 0.81648767, + "learning_rate": 5.826916849901007e-07, + "loss": 0.83795345, + "num_input_tokens_seen": 271926840, + "step": 12610, + "time_per_iteration": 2.6684138774871826 + }, + { + "auxiliary_loss_clip": 0.01096074, + "auxiliary_loss_mlp": 0.01034645, + "balance_loss_clip": 1.03990412, + "balance_loss_mlp": 1.0215261, + "epoch": 0.7582143393957613, + "flos": 22237000888320.0, + "grad_norm": 1.7108192328279062, + "language_loss": 0.70459145, + "learning_rate": 5.824169248335488e-07, + "loss": 0.72589862, + "num_input_tokens_seen": 271946465, + "step": 12611, + "time_per_iteration": 2.7615582942962646 + }, + { + "auxiliary_loss_clip": 0.01111911, + "auxiliary_loss_mlp": 0.01031837, + "balance_loss_clip": 1.03971386, + "balance_loss_mlp": 1.01939797, + "epoch": 0.7582744626484292, + "flos": 21106389421440.0, + "grad_norm": 1.4842490716025172, + "language_loss": 0.70994782, + "learning_rate": 5.821422184318893e-07, + "loss": 0.73138535, + "num_input_tokens_seen": 271967295, + "step": 12612, + "time_per_iteration": 4.388495445251465 + }, + { + "auxiliary_loss_clip": 0.01051139, + "auxiliary_loss_mlp": 0.01043129, + "balance_loss_clip": 1.03555894, + "balance_loss_mlp": 1.03022408, + "epoch": 0.7583345859010973, + "flos": 24604770539520.0, + "grad_norm": 1.3817563743236791, + "language_loss": 0.59341693, + "learning_rate": 5.818675657955397e-07, + "loss": 0.61435962, + "num_input_tokens_seen": 271987960, + "step": 12613, + "time_per_iteration": 4.679025411605835 + }, + { + "auxiliary_loss_clip": 0.01085628, + "auxiliary_loss_mlp": 0.01039806, + "balance_loss_clip": 1.0359726, + "balance_loss_mlp": 1.02548921, + "epoch": 0.7583947091537652, + "flos": 33546814657920.0, + "grad_norm": 1.5041496360989353, + "language_loss": 0.59715927, + "learning_rate": 5.815929669349135e-07, + "loss": 0.61841357, + "num_input_tokens_seen": 272011780, + "step": 12614, + "time_per_iteration": 4.3984222412109375 + }, + { + "auxiliary_loss_clip": 0.01075793, + "auxiliary_loss_mlp": 0.01029168, + "balance_loss_clip": 1.03387702, + "balance_loss_mlp": 1.01572776, + "epoch": 0.7584548324064332, + "flos": 20121000641280.0, + "grad_norm": 1.921615771870116, + "language_loss": 0.73268729, + "learning_rate": 5.813184218604246e-07, + "loss": 0.75373691, + "num_input_tokens_seen": 272030825, + "step": 12615, + "time_per_iteration": 2.8314290046691895 + }, + { + "auxiliary_loss_clip": 0.01011548, + "auxiliary_loss_mlp": 0.00999712, + "balance_loss_clip": 1.01067567, + "balance_loss_mlp": 0.99848443, + "epoch": 0.7585149556591012, + "flos": 70402584061440.0, + "grad_norm": 0.8045882645133534, + "language_loss": 0.67647672, + "learning_rate": 5.810439305824828e-07, + "loss": 0.69658935, + "num_input_tokens_seen": 272095825, + "step": 12616, + "time_per_iteration": 3.260563850402832 + }, + { + "auxiliary_loss_clip": 0.0108171, + "auxiliary_loss_mlp": 0.01039897, + "balance_loss_clip": 1.03870976, + "balance_loss_mlp": 1.02642608, + "epoch": 0.7585750789117691, + "flos": 16143786293760.0, + "grad_norm": 1.736965809635246, + "language_loss": 0.84524256, + "learning_rate": 5.807694931114979e-07, + "loss": 0.86645865, + "num_input_tokens_seen": 272113950, + "step": 12617, + "time_per_iteration": 2.8263378143310547 + }, + { + "auxiliary_loss_clip": 0.01078721, + "auxiliary_loss_mlp": 0.01039251, + "balance_loss_clip": 1.0390234, + "balance_loss_mlp": 1.02730036, + "epoch": 0.7586352021644371, + "flos": 17493165544320.0, + "grad_norm": 2.328657460169902, + "language_loss": 0.74700725, + "learning_rate": 5.804951094578757e-07, + "loss": 0.76818699, + "num_input_tokens_seen": 272130315, + "step": 12618, + "time_per_iteration": 4.2552900314331055 + }, + { + "auxiliary_loss_clip": 0.0109138, + "auxiliary_loss_mlp": 0.01032269, + "balance_loss_clip": 1.03749752, + "balance_loss_mlp": 1.01850069, + "epoch": 0.758695325417105, + "flos": 17275187859840.0, + "grad_norm": 1.9133972925292233, + "language_loss": 0.77211189, + "learning_rate": 5.802207796320209e-07, + "loss": 0.79334843, + "num_input_tokens_seen": 272149080, + "step": 12619, + "time_per_iteration": 2.7758803367614746 + }, + { + "auxiliary_loss_clip": 0.0107017, + "auxiliary_loss_mlp": 0.01037442, + "balance_loss_clip": 1.03423786, + "balance_loss_mlp": 1.02421534, + "epoch": 0.7587554486697731, + "flos": 29495660163840.0, + "grad_norm": 1.9790425844010804, + "language_loss": 0.82581639, + "learning_rate": 5.79946503644337e-07, + "loss": 0.84689248, + "num_input_tokens_seen": 272168285, + "step": 12620, + "time_per_iteration": 2.860680341720581 + }, + { + "auxiliary_loss_clip": 0.01086979, + "auxiliary_loss_mlp": 0.01039531, + "balance_loss_clip": 1.03506887, + "balance_loss_mlp": 1.02535069, + "epoch": 0.758815571922441, + "flos": 16100800692480.0, + "grad_norm": 2.7719237542052335, + "language_loss": 0.82916582, + "learning_rate": 5.796722815052242e-07, + "loss": 0.85043091, + "num_input_tokens_seen": 272184585, + "step": 12621, + "time_per_iteration": 2.6819448471069336 + }, + { + "auxiliary_loss_clip": 0.01090396, + "auxiliary_loss_mlp": 0.01032874, + "balance_loss_clip": 1.03831124, + "balance_loss_mlp": 1.02035689, + "epoch": 0.758875695175109, + "flos": 16143714466560.0, + "grad_norm": 2.331369198169253, + "language_loss": 0.73694873, + "learning_rate": 5.7939811322508e-07, + "loss": 0.75818145, + "num_input_tokens_seen": 272200205, + "step": 12622, + "time_per_iteration": 2.867482900619507 + }, + { + "auxiliary_loss_clip": 0.01020627, + "auxiliary_loss_mlp": 0.00999479, + "balance_loss_clip": 1.00808787, + "balance_loss_mlp": 0.99837667, + "epoch": 0.7589358184277769, + "flos": 68462006860800.0, + "grad_norm": 0.8939637430208361, + "language_loss": 0.60890412, + "learning_rate": 5.791239988143024e-07, + "loss": 0.62910521, + "num_input_tokens_seen": 272259670, + "step": 12623, + "time_per_iteration": 3.399125814437866 + }, + { + "auxiliary_loss_clip": 0.01108354, + "auxiliary_loss_mlp": 0.01032352, + "balance_loss_clip": 1.0389595, + "balance_loss_mlp": 1.0204668, + "epoch": 0.7589959416804449, + "flos": 20047311889920.0, + "grad_norm": 2.1862107817163374, + "language_loss": 0.67437398, + "learning_rate": 5.788499382832847e-07, + "loss": 0.69578105, + "num_input_tokens_seen": 272277925, + "step": 12624, + "time_per_iteration": 2.7711076736450195 + }, + { + "auxiliary_loss_clip": 0.01108684, + "auxiliary_loss_mlp": 0.01029908, + "balance_loss_clip": 1.03826535, + "balance_loss_mlp": 1.01691461, + "epoch": 0.7590560649331128, + "flos": 18771800958720.0, + "grad_norm": 11.908372705872578, + "language_loss": 0.76173502, + "learning_rate": 5.785759316424196e-07, + "loss": 0.78312099, + "num_input_tokens_seen": 272296010, + "step": 12625, + "time_per_iteration": 2.695136308670044 + }, + { + "auxiliary_loss_clip": 0.0108337, + "auxiliary_loss_mlp": 0.01043075, + "balance_loss_clip": 1.03519034, + "balance_loss_mlp": 1.02824545, + "epoch": 0.7591161881857809, + "flos": 29825284296960.0, + "grad_norm": 1.865247644851499, + "language_loss": 0.63104314, + "learning_rate": 5.783019789020977e-07, + "loss": 0.65230757, + "num_input_tokens_seen": 272318330, + "step": 12626, + "time_per_iteration": 2.815093517303467 + }, + { + "auxiliary_loss_clip": 0.01080043, + "auxiliary_loss_mlp": 0.00771292, + "balance_loss_clip": 1.04494154, + "balance_loss_mlp": 1.00028062, + "epoch": 0.7591763114384488, + "flos": 20302708567680.0, + "grad_norm": 2.0523273844402605, + "language_loss": 0.74221742, + "learning_rate": 5.780280800727084e-07, + "loss": 0.76073074, + "num_input_tokens_seen": 272335265, + "step": 12627, + "time_per_iteration": 3.018779993057251 + }, + { + "auxiliary_loss_clip": 0.01100814, + "auxiliary_loss_mlp": 0.0103172, + "balance_loss_clip": 1.03871465, + "balance_loss_mlp": 1.0191313, + "epoch": 0.7592364346911168, + "flos": 20813609664000.0, + "grad_norm": 2.9039370071826145, + "language_loss": 0.6930986, + "learning_rate": 5.777542351646356e-07, + "loss": 0.71442395, + "num_input_tokens_seen": 272354795, + "step": 12628, + "time_per_iteration": 2.717823028564453 + }, + { + "auxiliary_loss_clip": 0.01102671, + "auxiliary_loss_mlp": 0.01034671, + "balance_loss_clip": 1.04052353, + "balance_loss_mlp": 1.02078366, + "epoch": 0.7592965579437848, + "flos": 21251504367360.0, + "grad_norm": 1.7338759935871468, + "language_loss": 0.63148701, + "learning_rate": 5.774804441882648e-07, + "loss": 0.6528604, + "num_input_tokens_seen": 272372875, + "step": 12629, + "time_per_iteration": 2.6770267486572266 + }, + { + "auxiliary_loss_clip": 0.01084801, + "auxiliary_loss_mlp": 0.01032088, + "balance_loss_clip": 1.03561509, + "balance_loss_mlp": 1.02010107, + "epoch": 0.7593566811964527, + "flos": 26213604704640.0, + "grad_norm": 1.4746577606675504, + "language_loss": 0.77671874, + "learning_rate": 5.772067071539786e-07, + "loss": 0.79788756, + "num_input_tokens_seen": 272394715, + "step": 12630, + "time_per_iteration": 2.9122629165649414 + }, + { + "auxiliary_loss_clip": 0.01029746, + "auxiliary_loss_mlp": 0.01002357, + "balance_loss_clip": 1.00722373, + "balance_loss_mlp": 1.00131977, + "epoch": 0.7594168044491207, + "flos": 71237255374080.0, + "grad_norm": 0.8115073267704523, + "language_loss": 0.61498612, + "learning_rate": 5.769330240721562e-07, + "loss": 0.63530719, + "num_input_tokens_seen": 272458775, + "step": 12631, + "time_per_iteration": 3.267413377761841 + }, + { + "auxiliary_loss_clip": 0.01084169, + "auxiliary_loss_mlp": 0.00772349, + "balance_loss_clip": 1.03867972, + "balance_loss_mlp": 1.00034893, + "epoch": 0.7594769277017887, + "flos": 26613326229120.0, + "grad_norm": 1.5722858256300303, + "language_loss": 0.73812342, + "learning_rate": 5.766593949531767e-07, + "loss": 0.75668871, + "num_input_tokens_seen": 272479355, + "step": 12632, + "time_per_iteration": 2.9674253463745117 + }, + { + "auxiliary_loss_clip": 0.01089012, + "auxiliary_loss_mlp": 0.01030973, + "balance_loss_clip": 1.0375607, + "balance_loss_mlp": 1.01855755, + "epoch": 0.7595370509544567, + "flos": 17595941333760.0, + "grad_norm": 2.3123827403326005, + "language_loss": 0.75472778, + "learning_rate": 5.763858198074154e-07, + "loss": 0.77592766, + "num_input_tokens_seen": 272493555, + "step": 12633, + "time_per_iteration": 2.733344078063965 + }, + { + "auxiliary_loss_clip": 0.01087192, + "auxiliary_loss_mlp": 0.01028663, + "balance_loss_clip": 1.03815973, + "balance_loss_mlp": 1.017272, + "epoch": 0.7595971742071246, + "flos": 18002953319040.0, + "grad_norm": 2.016293205622038, + "language_loss": 0.73391056, + "learning_rate": 5.76112298645246e-07, + "loss": 0.75506908, + "num_input_tokens_seen": 272508925, + "step": 12634, + "time_per_iteration": 2.7296500205993652 + }, + { + "auxiliary_loss_clip": 0.01111487, + "auxiliary_loss_mlp": 0.01034168, + "balance_loss_clip": 1.03916657, + "balance_loss_mlp": 1.02143645, + "epoch": 0.7596572974597926, + "flos": 28840326480000.0, + "grad_norm": 2.834994861255327, + "language_loss": 0.64522898, + "learning_rate": 5.758388314770408e-07, + "loss": 0.66668558, + "num_input_tokens_seen": 272528805, + "step": 12635, + "time_per_iteration": 2.79398512840271 + }, + { + "auxiliary_loss_clip": 0.01054416, + "auxiliary_loss_mlp": 0.01048736, + "balance_loss_clip": 1.03525424, + "balance_loss_mlp": 1.03316736, + "epoch": 0.7597174207124605, + "flos": 14282823588480.0, + "grad_norm": 1.8350185732096174, + "language_loss": 0.69167364, + "learning_rate": 5.7556541831317e-07, + "loss": 0.71270514, + "num_input_tokens_seen": 272546655, + "step": 12636, + "time_per_iteration": 2.827582836151123 + }, + { + "auxiliary_loss_clip": 0.01094213, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.03955829, + "balance_loss_mlp": 1.02246487, + "epoch": 0.7597775439651285, + "flos": 21688932193920.0, + "grad_norm": 2.1812107272877665, + "language_loss": 0.81070203, + "learning_rate": 5.752920591640018e-07, + "loss": 0.83199233, + "num_input_tokens_seen": 272564010, + "step": 12637, + "time_per_iteration": 2.805816650390625 + }, + { + "auxiliary_loss_clip": 0.01098118, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.03679478, + "balance_loss_mlp": 1.02025676, + "epoch": 0.7598376672177964, + "flos": 36101248312320.0, + "grad_norm": 1.701654856542883, + "language_loss": 0.66547924, + "learning_rate": 5.750187540399017e-07, + "loss": 0.68678635, + "num_input_tokens_seen": 272585840, + "step": 12638, + "time_per_iteration": 2.8566620349884033 + }, + { + "auxiliary_loss_clip": 0.01114657, + "auxiliary_loss_mlp": 0.01040373, + "balance_loss_clip": 1.04063082, + "balance_loss_mlp": 1.02584124, + "epoch": 0.7598977904704645, + "flos": 18332326056960.0, + "grad_norm": 2.2747225954566193, + "language_loss": 0.6550855, + "learning_rate": 5.747455029512323e-07, + "loss": 0.6766358, + "num_input_tokens_seen": 272602300, + "step": 12639, + "time_per_iteration": 2.6449224948883057 + }, + { + "auxiliary_loss_clip": 0.01097983, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.03591299, + "balance_loss_mlp": 1.01951265, + "epoch": 0.7599579137231324, + "flos": 20192642317440.0, + "grad_norm": 2.3376509636382057, + "language_loss": 0.70271343, + "learning_rate": 5.744723059083572e-07, + "loss": 0.72401774, + "num_input_tokens_seen": 272619595, + "step": 12640, + "time_per_iteration": 2.813169240951538 + }, + { + "auxiliary_loss_clip": 0.01091857, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.03943181, + "balance_loss_mlp": 1.0203433, + "epoch": 0.7600180369758004, + "flos": 24024849459840.0, + "grad_norm": 2.141253081598676, + "language_loss": 0.66953784, + "learning_rate": 5.741991629216343e-07, + "loss": 0.69079602, + "num_input_tokens_seen": 272638825, + "step": 12641, + "time_per_iteration": 2.8210034370422363 + }, + { + "auxiliary_loss_clip": 0.01098494, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.03655171, + "balance_loss_mlp": 1.01808345, + "epoch": 0.7600781602284684, + "flos": 18989527248000.0, + "grad_norm": 3.210818856983626, + "language_loss": 0.66875279, + "learning_rate": 5.73926074001422e-07, + "loss": 0.6900543, + "num_input_tokens_seen": 272657240, + "step": 12642, + "time_per_iteration": 2.6761434078216553 + }, + { + "auxiliary_loss_clip": 0.01092897, + "auxiliary_loss_mlp": 0.01031582, + "balance_loss_clip": 1.04070377, + "balance_loss_mlp": 1.01937461, + "epoch": 0.7601382834811363, + "flos": 26067520091520.0, + "grad_norm": 1.951124783740963, + "language_loss": 0.75605899, + "learning_rate": 5.736530391580765e-07, + "loss": 0.77730376, + "num_input_tokens_seen": 272677520, + "step": 12643, + "time_per_iteration": 2.858407497406006 + }, + { + "auxiliary_loss_clip": 0.01076624, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.03911448, + "balance_loss_mlp": 1.02123976, + "epoch": 0.7601984067338043, + "flos": 18844232734080.0, + "grad_norm": 1.8455815779990985, + "language_loss": 0.78802508, + "learning_rate": 5.733800584019508e-07, + "loss": 0.80914557, + "num_input_tokens_seen": 272696770, + "step": 12644, + "time_per_iteration": 2.820368766784668 + }, + { + "auxiliary_loss_clip": 0.01084265, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.0353601, + "balance_loss_mlp": 1.01994061, + "epoch": 0.7602585299864723, + "flos": 24646391424000.0, + "grad_norm": 1.5239807064585273, + "language_loss": 0.80124938, + "learning_rate": 5.731071317433957e-07, + "loss": 0.82241637, + "num_input_tokens_seen": 272718340, + "step": 12645, + "time_per_iteration": 2.8698811531066895 + }, + { + "auxiliary_loss_clip": 0.01087859, + "auxiliary_loss_mlp": 0.01033405, + "balance_loss_clip": 1.0394851, + "balance_loss_mlp": 1.02041101, + "epoch": 0.7603186532391403, + "flos": 23842100039040.0, + "grad_norm": 1.4661810849316874, + "language_loss": 0.72646892, + "learning_rate": 5.728342591927611e-07, + "loss": 0.74768156, + "num_input_tokens_seen": 272739575, + "step": 12646, + "time_per_iteration": 2.8227429389953613 + }, + { + "auxiliary_loss_clip": 0.01098686, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.03704524, + "balance_loss_mlp": 1.02336717, + "epoch": 0.7603787764918082, + "flos": 22199905117440.0, + "grad_norm": 2.4220316312811025, + "language_loss": 0.67611617, + "learning_rate": 5.725614407603949e-07, + "loss": 0.69745797, + "num_input_tokens_seen": 272758710, + "step": 12647, + "time_per_iteration": 2.8083581924438477 + }, + { + "auxiliary_loss_clip": 0.01019336, + "auxiliary_loss_mlp": 0.01006453, + "balance_loss_clip": 1.00592494, + "balance_loss_mlp": 1.00521874, + "epoch": 0.7604388997444762, + "flos": 54086894254080.0, + "grad_norm": 0.6752663503199356, + "language_loss": 0.48949182, + "learning_rate": 5.722886764566415e-07, + "loss": 0.50974971, + "num_input_tokens_seen": 272814855, + "step": 12648, + "time_per_iteration": 3.211672782897949 + }, + { + "auxiliary_loss_clip": 0.0109722, + "auxiliary_loss_mlp": 0.01036106, + "balance_loss_clip": 1.03749037, + "balance_loss_mlp": 1.02400017, + "epoch": 0.7604990229971441, + "flos": 19681920789120.0, + "grad_norm": 2.4521174104078467, + "language_loss": 0.76747489, + "learning_rate": 5.720159662918451e-07, + "loss": 0.78880811, + "num_input_tokens_seen": 272834400, + "step": 12649, + "time_per_iteration": 2.72628116607666 + }, + { + "auxiliary_loss_clip": 0.0106851, + "auxiliary_loss_mlp": 0.01035355, + "balance_loss_clip": 1.03517592, + "balance_loss_mlp": 1.02242661, + "epoch": 0.7605591462498121, + "flos": 25228036356480.0, + "grad_norm": 1.7702335478327527, + "language_loss": 0.68660265, + "learning_rate": 5.717433102763462e-07, + "loss": 0.7076413, + "num_input_tokens_seen": 272854760, + "step": 12650, + "time_per_iteration": 2.8096909523010254 + }, + { + "auxiliary_loss_clip": 0.01020249, + "auxiliary_loss_mlp": 0.01003738, + "balance_loss_clip": 1.00758457, + "balance_loss_mlp": 1.00275445, + "epoch": 0.76061926950248, + "flos": 66783757662720.0, + "grad_norm": 0.8336646667507255, + "language_loss": 0.62671125, + "learning_rate": 5.714707084204838e-07, + "loss": 0.64695108, + "num_input_tokens_seen": 272919030, + "step": 12651, + "time_per_iteration": 4.8483662605285645 + }, + { + "auxiliary_loss_clip": 0.01076594, + "auxiliary_loss_mlp": 0.01036234, + "balance_loss_clip": 1.0368011, + "balance_loss_mlp": 1.02373505, + "epoch": 0.7606793927551481, + "flos": 25338354001920.0, + "grad_norm": 1.4829724837753475, + "language_loss": 0.71288872, + "learning_rate": 5.711981607345951e-07, + "loss": 0.73401701, + "num_input_tokens_seen": 272938925, + "step": 12652, + "time_per_iteration": 2.85551118850708 + }, + { + "auxiliary_loss_clip": 0.01059292, + "auxiliary_loss_mlp": 0.01037324, + "balance_loss_clip": 1.0363282, + "balance_loss_mlp": 1.02425838, + "epoch": 0.760739516007816, + "flos": 18223624523520.0, + "grad_norm": 2.085886887216812, + "language_loss": 0.80261797, + "learning_rate": 5.709256672290152e-07, + "loss": 0.82358414, + "num_input_tokens_seen": 272954945, + "step": 12653, + "time_per_iteration": 6.101754665374756 + }, + { + "auxiliary_loss_clip": 0.01116949, + "auxiliary_loss_mlp": 0.01032648, + "balance_loss_clip": 1.04151583, + "balance_loss_mlp": 1.01946926, + "epoch": 0.760799639260484, + "flos": 22559119079040.0, + "grad_norm": 1.7273806090867658, + "language_loss": 0.79977405, + "learning_rate": 5.706532279140785e-07, + "loss": 0.82127005, + "num_input_tokens_seen": 272972855, + "step": 12654, + "time_per_iteration": 2.7514119148254395 + }, + { + "auxiliary_loss_clip": 0.01074955, + "auxiliary_loss_mlp": 0.0103594, + "balance_loss_clip": 1.03562033, + "balance_loss_mlp": 1.02221942, + "epoch": 0.760859762513152, + "flos": 22309324922880.0, + "grad_norm": 2.0189360189402357, + "language_loss": 0.79458809, + "learning_rate": 5.703808428001136e-07, + "loss": 0.81569707, + "num_input_tokens_seen": 272989895, + "step": 12655, + "time_per_iteration": 2.78948712348938 + }, + { + "auxiliary_loss_clip": 0.01094485, + "auxiliary_loss_mlp": 0.01028246, + "balance_loss_clip": 1.03769636, + "balance_loss_mlp": 1.01743925, + "epoch": 0.7609198857658199, + "flos": 24863902231680.0, + "grad_norm": 1.6124233768982144, + "language_loss": 0.68051422, + "learning_rate": 5.701085118974505e-07, + "loss": 0.70174152, + "num_input_tokens_seen": 273011695, + "step": 12656, + "time_per_iteration": 2.795375347137451 + }, + { + "auxiliary_loss_clip": 0.01101665, + "auxiliary_loss_mlp": 0.01030913, + "balance_loss_clip": 1.03489399, + "balance_loss_mlp": 1.01786578, + "epoch": 0.760980009018488, + "flos": 16836790366080.0, + "grad_norm": 2.7645541741379347, + "language_loss": 0.73798579, + "learning_rate": 5.698362352164164e-07, + "loss": 0.75931156, + "num_input_tokens_seen": 273028815, + "step": 12657, + "time_per_iteration": 4.21469521522522 + }, + { + "auxiliary_loss_clip": 0.01012936, + "auxiliary_loss_mlp": 0.01000637, + "balance_loss_clip": 1.00884259, + "balance_loss_mlp": 0.99950486, + "epoch": 0.7610401322711559, + "flos": 61230603029760.0, + "grad_norm": 0.85360954009419, + "language_loss": 0.64932978, + "learning_rate": 5.695640127673347e-07, + "loss": 0.66946548, + "num_input_tokens_seen": 273084080, + "step": 12658, + "time_per_iteration": 3.2157864570617676 + }, + { + "auxiliary_loss_clip": 0.01092364, + "auxiliary_loss_mlp": 0.01034541, + "balance_loss_clip": 1.03773546, + "balance_loss_mlp": 1.02238202, + "epoch": 0.7611002555238239, + "flos": 19640730867840.0, + "grad_norm": 2.0304267981282353, + "language_loss": 0.79544449, + "learning_rate": 5.692918445605293e-07, + "loss": 0.81671351, + "num_input_tokens_seen": 273102295, + "step": 12659, + "time_per_iteration": 2.6572675704956055 + }, + { + "auxiliary_loss_clip": 0.01097791, + "auxiliary_loss_mlp": 0.01028001, + "balance_loss_clip": 1.03714883, + "balance_loss_mlp": 1.015746, + "epoch": 0.7611603787764918, + "flos": 26872206526080.0, + "grad_norm": 1.589308819258603, + "language_loss": 0.68846476, + "learning_rate": 5.690197306063209e-07, + "loss": 0.7097227, + "num_input_tokens_seen": 273123400, + "step": 12660, + "time_per_iteration": 2.815166473388672 + }, + { + "auxiliary_loss_clip": 0.01111243, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.03771544, + "balance_loss_mlp": 1.02017736, + "epoch": 0.7612205020291598, + "flos": 27344252085120.0, + "grad_norm": 2.023337576106856, + "language_loss": 0.70192313, + "learning_rate": 5.687476709150281e-07, + "loss": 0.7233631, + "num_input_tokens_seen": 273145150, + "step": 12661, + "time_per_iteration": 2.7765588760375977 + }, + { + "auxiliary_loss_clip": 0.01099752, + "auxiliary_loss_mlp": 0.01034729, + "balance_loss_clip": 1.03683341, + "balance_loss_mlp": 1.02217579, + "epoch": 0.7612806252818277, + "flos": 29314598682240.0, + "grad_norm": 1.6042830797514005, + "language_loss": 0.83190757, + "learning_rate": 5.68475665496966e-07, + "loss": 0.85325241, + "num_input_tokens_seen": 273165180, + "step": 12662, + "time_per_iteration": 2.7277190685272217 + }, + { + "auxiliary_loss_clip": 0.01088049, + "auxiliary_loss_mlp": 0.0104358, + "balance_loss_clip": 1.03722358, + "balance_loss_mlp": 1.03130126, + "epoch": 0.7613407485344957, + "flos": 19026048401280.0, + "grad_norm": 1.7304537436557308, + "language_loss": 0.68582624, + "learning_rate": 5.682037143624505e-07, + "loss": 0.70714259, + "num_input_tokens_seen": 273184005, + "step": 12663, + "time_per_iteration": 2.770902156829834 + }, + { + "auxiliary_loss_clip": 0.01100065, + "auxiliary_loss_mlp": 0.01026036, + "balance_loss_clip": 1.03998029, + "balance_loss_mlp": 1.0138464, + "epoch": 0.7614008717871636, + "flos": 23256037733760.0, + "grad_norm": 2.1194736525192357, + "language_loss": 0.70156157, + "learning_rate": 5.67931817521794e-07, + "loss": 0.72282255, + "num_input_tokens_seen": 273203565, + "step": 12664, + "time_per_iteration": 2.7074570655822754 + }, + { + "auxiliary_loss_clip": 0.01105735, + "auxiliary_loss_mlp": 0.01039411, + "balance_loss_clip": 1.04057264, + "balance_loss_mlp": 1.02536225, + "epoch": 0.7614609950398317, + "flos": 21579907438080.0, + "grad_norm": 1.8390360170720685, + "language_loss": 0.79482293, + "learning_rate": 5.676599749853066e-07, + "loss": 0.8162744, + "num_input_tokens_seen": 273221645, + "step": 12665, + "time_per_iteration": 2.7299532890319824 + }, + { + "auxiliary_loss_clip": 0.0111148, + "auxiliary_loss_mlp": 0.00769447, + "balance_loss_clip": 1.04143631, + "balance_loss_mlp": 1.00019884, + "epoch": 0.7615211182924996, + "flos": 29277897960960.0, + "grad_norm": 1.9892685132164005, + "language_loss": 0.87823522, + "learning_rate": 5.673881867632959e-07, + "loss": 0.89704448, + "num_input_tokens_seen": 273242040, + "step": 12666, + "time_per_iteration": 2.7689883708953857 + }, + { + "auxiliary_loss_clip": 0.01055194, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.03460038, + "balance_loss_mlp": 1.02016783, + "epoch": 0.7615812415451676, + "flos": 13261129136640.0, + "grad_norm": 2.3749707608693513, + "language_loss": 0.8353771, + "learning_rate": 5.671164528660693e-07, + "loss": 0.85626853, + "num_input_tokens_seen": 273257365, + "step": 12667, + "time_per_iteration": 2.920854091644287 + }, + { + "auxiliary_loss_clip": 0.01089109, + "auxiliary_loss_mlp": 0.01037332, + "balance_loss_clip": 1.03897429, + "balance_loss_mlp": 1.02510726, + "epoch": 0.7616413647978356, + "flos": 18584741905920.0, + "grad_norm": 1.7012297272780605, + "language_loss": 0.78357065, + "learning_rate": 5.668447733039296e-07, + "loss": 0.80483508, + "num_input_tokens_seen": 273274710, + "step": 12668, + "time_per_iteration": 2.694464683532715 + }, + { + "auxiliary_loss_clip": 0.01075984, + "auxiliary_loss_mlp": 0.01034063, + "balance_loss_clip": 1.03536892, + "balance_loss_mlp": 1.02142668, + "epoch": 0.7617014880505035, + "flos": 18516188799360.0, + "grad_norm": 1.900278924462426, + "language_loss": 0.64169192, + "learning_rate": 5.6657314808718e-07, + "loss": 0.66279244, + "num_input_tokens_seen": 273292870, + "step": 12669, + "time_per_iteration": 2.793607234954834 + }, + { + "auxiliary_loss_clip": 0.01086136, + "auxiliary_loss_mlp": 0.01037174, + "balance_loss_clip": 1.03618228, + "balance_loss_mlp": 1.02251148, + "epoch": 0.7617616113031715, + "flos": 24973178382720.0, + "grad_norm": 2.3594416048527886, + "language_loss": 0.66683328, + "learning_rate": 5.663015772261202e-07, + "loss": 0.68806642, + "num_input_tokens_seen": 273312375, + "step": 12670, + "time_per_iteration": 2.784454584121704 + }, + { + "auxiliary_loss_clip": 0.01101863, + "auxiliary_loss_mlp": 0.01036176, + "balance_loss_clip": 1.03805709, + "balance_loss_mlp": 1.02371264, + "epoch": 0.7618217345558395, + "flos": 23295036925440.0, + "grad_norm": 1.6675852646548754, + "language_loss": 0.73051012, + "learning_rate": 5.660300607310493e-07, + "loss": 0.75189054, + "num_input_tokens_seen": 273332590, + "step": 12671, + "time_per_iteration": 2.7376444339752197 + }, + { + "auxiliary_loss_clip": 0.01072018, + "auxiliary_loss_mlp": 0.01036198, + "balance_loss_clip": 1.03299487, + "balance_loss_mlp": 1.02336478, + "epoch": 0.7618818578085075, + "flos": 25482894330240.0, + "grad_norm": 1.6810616532176517, + "language_loss": 0.73379242, + "learning_rate": 5.657585986122613e-07, + "loss": 0.75487459, + "num_input_tokens_seen": 273352885, + "step": 12672, + "time_per_iteration": 2.839824914932251 + }, + { + "auxiliary_loss_clip": 0.01001779, + "auxiliary_loss_mlp": 0.01000945, + "balance_loss_clip": 1.00902843, + "balance_loss_mlp": 0.99994415, + "epoch": 0.7619419810611754, + "flos": 61151994115200.0, + "grad_norm": 0.7605802796728681, + "language_loss": 0.56720763, + "learning_rate": 5.654871908800506e-07, + "loss": 0.58723491, + "num_input_tokens_seen": 273411730, + "step": 12673, + "time_per_iteration": 3.201223850250244 + }, + { + "auxiliary_loss_clip": 0.01100506, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.03872013, + "balance_loss_mlp": 1.02017713, + "epoch": 0.7620021043138434, + "flos": 23258659426560.0, + "grad_norm": 1.740985764323004, + "language_loss": 0.74985719, + "learning_rate": 5.652158375447102e-07, + "loss": 0.77119827, + "num_input_tokens_seen": 273430020, + "step": 12674, + "time_per_iteration": 2.7523674964904785 + }, + { + "auxiliary_loss_clip": 0.01078335, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.03280282, + "balance_loss_mlp": 1.02115917, + "epoch": 0.7620622275665113, + "flos": 25082490447360.0, + "grad_norm": 2.016968785159948, + "language_loss": 0.7202276, + "learning_rate": 5.649445386165286e-07, + "loss": 0.74135315, + "num_input_tokens_seen": 273448690, + "step": 12675, + "time_per_iteration": 2.796057939529419 + }, + { + "auxiliary_loss_clip": 0.01095004, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.03599072, + "balance_loss_mlp": 1.01886785, + "epoch": 0.7621223508191793, + "flos": 20155007842560.0, + "grad_norm": 2.355672138276969, + "language_loss": 0.73052359, + "learning_rate": 5.646732941057936e-07, + "loss": 0.7517817, + "num_input_tokens_seen": 273465190, + "step": 12676, + "time_per_iteration": 2.734591484069824 + }, + { + "auxiliary_loss_clip": 0.01081109, + "auxiliary_loss_mlp": 0.00771918, + "balance_loss_clip": 1.03906035, + "balance_loss_mlp": 1.00022256, + "epoch": 0.7621824740718472, + "flos": 18000187971840.0, + "grad_norm": 2.93709923203383, + "language_loss": 0.54046768, + "learning_rate": 5.644021040227927e-07, + "loss": 0.55899793, + "num_input_tokens_seen": 273478620, + "step": 12677, + "time_per_iteration": 2.8109676837921143 + }, + { + "auxiliary_loss_clip": 0.01052826, + "auxiliary_loss_mlp": 0.01035963, + "balance_loss_clip": 1.0335747, + "balance_loss_mlp": 1.02283812, + "epoch": 0.7622425973245153, + "flos": 21725668828800.0, + "grad_norm": 2.0762274911054184, + "language_loss": 0.78760284, + "learning_rate": 5.641309683778064e-07, + "loss": 0.80849069, + "num_input_tokens_seen": 273497635, + "step": 12678, + "time_per_iteration": 2.860340118408203 + }, + { + "auxiliary_loss_clip": 0.01073918, + "auxiliary_loss_mlp": 0.01036673, + "balance_loss_clip": 1.0344305, + "balance_loss_mlp": 1.02257085, + "epoch": 0.7623027205771832, + "flos": 19718549683200.0, + "grad_norm": 3.9645067236030114, + "language_loss": 0.77204514, + "learning_rate": 5.638598871811175e-07, + "loss": 0.79315102, + "num_input_tokens_seen": 273513955, + "step": 12679, + "time_per_iteration": 2.772916793823242 + }, + { + "auxiliary_loss_clip": 0.01100617, + "auxiliary_loss_mlp": 0.01027269, + "balance_loss_clip": 1.03917551, + "balance_loss_mlp": 1.01456678, + "epoch": 0.7623628438298512, + "flos": 23988831096960.0, + "grad_norm": 1.434526846534088, + "language_loss": 0.80099726, + "learning_rate": 5.635888604430059e-07, + "loss": 0.82227612, + "num_input_tokens_seen": 273533970, + "step": 12680, + "time_per_iteration": 2.7801437377929688 + }, + { + "auxiliary_loss_clip": 0.01089966, + "auxiliary_loss_mlp": 0.01032719, + "balance_loss_clip": 1.03768969, + "balance_loss_mlp": 1.01880169, + "epoch": 0.7624229670825191, + "flos": 22345702421760.0, + "grad_norm": 1.9046696360663191, + "language_loss": 0.62818468, + "learning_rate": 5.633178881737493e-07, + "loss": 0.64941156, + "num_input_tokens_seen": 273553090, + "step": 12681, + "time_per_iteration": 2.8114664554595947 + }, + { + "auxiliary_loss_clip": 0.01076613, + "auxiliary_loss_mlp": 0.01031955, + "balance_loss_clip": 1.03848743, + "balance_loss_mlp": 1.01964092, + "epoch": 0.7624830903351871, + "flos": 22711775880960.0, + "grad_norm": 2.2465025277457755, + "language_loss": 0.76199776, + "learning_rate": 5.63046970383622e-07, + "loss": 0.78308344, + "num_input_tokens_seen": 273572460, + "step": 12682, + "time_per_iteration": 2.8621296882629395 + }, + { + "auxiliary_loss_clip": 0.01085809, + "auxiliary_loss_mlp": 0.01032162, + "balance_loss_clip": 1.03555107, + "balance_loss_mlp": 1.02058053, + "epoch": 0.7625432135878552, + "flos": 25593714766080.0, + "grad_norm": 1.5266925893040741, + "language_loss": 0.68380392, + "learning_rate": 5.627761070828974e-07, + "loss": 0.70498371, + "num_input_tokens_seen": 273592815, + "step": 12683, + "time_per_iteration": 2.804927349090576 + }, + { + "auxiliary_loss_clip": 0.01067779, + "auxiliary_loss_mlp": 0.00772982, + "balance_loss_clip": 1.03292143, + "balance_loss_mlp": 1.00020671, + "epoch": 0.7626033368405231, + "flos": 23987645948160.0, + "grad_norm": 2.022263962104647, + "language_loss": 0.83156735, + "learning_rate": 5.625052982818472e-07, + "loss": 0.84997493, + "num_input_tokens_seen": 273611790, + "step": 12684, + "time_per_iteration": 2.7787985801696777 + }, + { + "auxiliary_loss_clip": 0.0108949, + "auxiliary_loss_mlp": 0.01041206, + "balance_loss_clip": 1.03807712, + "balance_loss_mlp": 1.02769983, + "epoch": 0.7626634600931911, + "flos": 12599115523200.0, + "grad_norm": 2.242764424782362, + "language_loss": 0.82618159, + "learning_rate": 5.622345439907396e-07, + "loss": 0.84748858, + "num_input_tokens_seen": 273628340, + "step": 12685, + "time_per_iteration": 2.735823631286621 + }, + { + "auxiliary_loss_clip": 0.0107975, + "auxiliary_loss_mlp": 0.00770301, + "balance_loss_clip": 1.03726244, + "balance_loss_mlp": 1.00022709, + "epoch": 0.762723583345859, + "flos": 26322593546880.0, + "grad_norm": 2.461504636054881, + "language_loss": 0.77635926, + "learning_rate": 5.619638442198422e-07, + "loss": 0.79485977, + "num_input_tokens_seen": 273646585, + "step": 12686, + "time_per_iteration": 2.906090021133423 + }, + { + "auxiliary_loss_clip": 0.01052651, + "auxiliary_loss_mlp": 0.01057311, + "balance_loss_clip": 1.03302336, + "balance_loss_mlp": 1.0405736, + "epoch": 0.762783706598527, + "flos": 21907053532800.0, + "grad_norm": 1.7909742891247455, + "language_loss": 0.72059739, + "learning_rate": 5.616931989794198e-07, + "loss": 0.74169701, + "num_input_tokens_seen": 273665410, + "step": 12687, + "time_per_iteration": 2.736345052719116 + }, + { + "auxiliary_loss_clip": 0.01084081, + "auxiliary_loss_mlp": 0.01042723, + "balance_loss_clip": 1.03387547, + "balance_loss_mlp": 1.02746391, + "epoch": 0.7628438298511949, + "flos": 15339782217600.0, + "grad_norm": 1.8904556177994511, + "language_loss": 0.65018427, + "learning_rate": 5.614226082797369e-07, + "loss": 0.67145234, + "num_input_tokens_seen": 273683035, + "step": 12688, + "time_per_iteration": 2.7697956562042236 + }, + { + "auxiliary_loss_clip": 0.01101479, + "auxiliary_loss_mlp": 0.01027744, + "balance_loss_clip": 1.03997755, + "balance_loss_mlp": 1.01574564, + "epoch": 0.7629039531038629, + "flos": 13006307076480.0, + "grad_norm": 3.084065426087135, + "language_loss": 0.70538044, + "learning_rate": 5.611520721310515e-07, + "loss": 0.72667265, + "num_input_tokens_seen": 273700130, + "step": 12689, + "time_per_iteration": 2.9508743286132812 + }, + { + "auxiliary_loss_clip": 0.01081126, + "auxiliary_loss_mlp": 0.01040898, + "balance_loss_clip": 1.03614342, + "balance_loss_mlp": 1.0274868, + "epoch": 0.7629640763565309, + "flos": 26171660597760.0, + "grad_norm": 1.827453823319608, + "language_loss": 0.69980061, + "learning_rate": 5.608815905436238e-07, + "loss": 0.72102082, + "num_input_tokens_seen": 273720310, + "step": 12690, + "time_per_iteration": 2.8916642665863037 + }, + { + "auxiliary_loss_clip": 0.01084164, + "auxiliary_loss_mlp": 0.01040929, + "balance_loss_clip": 1.03480482, + "balance_loss_mlp": 1.02747643, + "epoch": 0.7630241996091989, + "flos": 36793713680640.0, + "grad_norm": 1.455347798519734, + "language_loss": 0.69115114, + "learning_rate": 5.606111635277109e-07, + "loss": 0.71240205, + "num_input_tokens_seen": 273744475, + "step": 12691, + "time_per_iteration": 4.387454032897949 + }, + { + "auxiliary_loss_clip": 0.01093867, + "auxiliary_loss_mlp": 0.01037257, + "balance_loss_clip": 1.03709197, + "balance_loss_mlp": 1.02576542, + "epoch": 0.7630843228618668, + "flos": 21835160461440.0, + "grad_norm": 1.950930402576883, + "language_loss": 0.81791067, + "learning_rate": 5.603407910935662e-07, + "loss": 0.83922184, + "num_input_tokens_seen": 273764635, + "step": 12692, + "time_per_iteration": 5.863187551498413 + }, + { + "auxiliary_loss_clip": 0.01078564, + "auxiliary_loss_mlp": 0.010271, + "balance_loss_clip": 1.04068136, + "balance_loss_mlp": 1.01536989, + "epoch": 0.7631444461145348, + "flos": 12640520926080.0, + "grad_norm": 2.677454083590648, + "language_loss": 0.77390575, + "learning_rate": 5.600704732514438e-07, + "loss": 0.79496241, + "num_input_tokens_seen": 273780115, + "step": 12693, + "time_per_iteration": 2.8327314853668213 + }, + { + "auxiliary_loss_clip": 0.0107301, + "auxiliary_loss_mlp": 0.01034355, + "balance_loss_clip": 1.03885221, + "balance_loss_mlp": 1.02155745, + "epoch": 0.7632045693672027, + "flos": 16836610798080.0, + "grad_norm": 3.1941491202097523, + "language_loss": 0.72766727, + "learning_rate": 5.598002100115933e-07, + "loss": 0.74874091, + "num_input_tokens_seen": 273796605, + "step": 12694, + "time_per_iteration": 2.771289587020874 + }, + { + "auxiliary_loss_clip": 0.01096742, + "auxiliary_loss_mlp": 0.01029277, + "balance_loss_clip": 1.03683043, + "balance_loss_mlp": 1.01703393, + "epoch": 0.7632646926198707, + "flos": 22017335264640.0, + "grad_norm": 1.9917055644917767, + "language_loss": 0.70419681, + "learning_rate": 5.595300013842625e-07, + "loss": 0.72545701, + "num_input_tokens_seen": 273816515, + "step": 12695, + "time_per_iteration": 2.616629123687744 + }, + { + "auxiliary_loss_clip": 0.01109838, + "auxiliary_loss_mlp": 0.01031794, + "balance_loss_clip": 1.03797019, + "balance_loss_mlp": 1.0198853, + "epoch": 0.7633248158725388, + "flos": 23114011357440.0, + "grad_norm": 1.5503240571511046, + "language_loss": 0.72249472, + "learning_rate": 5.592598473796985e-07, + "loss": 0.74391103, + "num_input_tokens_seen": 273837060, + "step": 12696, + "time_per_iteration": 2.7050669193267822 + }, + { + "auxiliary_loss_clip": 0.01051627, + "auxiliary_loss_mlp": 0.01040707, + "balance_loss_clip": 1.03538561, + "balance_loss_mlp": 1.02642596, + "epoch": 0.7633849391252067, + "flos": 10889839952640.0, + "grad_norm": 2.077421826663572, + "language_loss": 0.71310508, + "learning_rate": 5.589897480081453e-07, + "loss": 0.73402846, + "num_input_tokens_seen": 273853365, + "step": 12697, + "time_per_iteration": 4.246352672576904 + }, + { + "auxiliary_loss_clip": 0.01077141, + "auxiliary_loss_mlp": 0.01034421, + "balance_loss_clip": 1.0388602, + "balance_loss_mlp": 1.02219009, + "epoch": 0.7634450623778747, + "flos": 20994168355200.0, + "grad_norm": 3.071082049112887, + "language_loss": 0.66922784, + "learning_rate": 5.587197032798461e-07, + "loss": 0.69034344, + "num_input_tokens_seen": 273870750, + "step": 12698, + "time_per_iteration": 2.7623679637908936 + }, + { + "auxiliary_loss_clip": 0.01097288, + "auxiliary_loss_mlp": 0.01029151, + "balance_loss_clip": 1.03538871, + "balance_loss_mlp": 1.01636612, + "epoch": 0.7635051856305426, + "flos": 18882046776960.0, + "grad_norm": 1.6894035015942557, + "language_loss": 0.72252488, + "learning_rate": 5.5844971320504e-07, + "loss": 0.74378926, + "num_input_tokens_seen": 273890890, + "step": 12699, + "time_per_iteration": 2.681185483932495 + }, + { + "auxiliary_loss_clip": 0.01088089, + "auxiliary_loss_mlp": 0.01032373, + "balance_loss_clip": 1.03612185, + "balance_loss_mlp": 1.02065527, + "epoch": 0.7635653088832106, + "flos": 34786989584640.0, + "grad_norm": 1.7379546952285325, + "language_loss": 0.73000193, + "learning_rate": 5.581797777939648e-07, + "loss": 0.75120658, + "num_input_tokens_seen": 273914015, + "step": 12700, + "time_per_iteration": 2.788801908493042 + }, + { + "auxiliary_loss_clip": 0.01109919, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.03708696, + "balance_loss_mlp": 1.01822746, + "epoch": 0.7636254321358785, + "flos": 23178434400000.0, + "grad_norm": 2.5171117546055717, + "language_loss": 0.69465768, + "learning_rate": 5.579098970568574e-07, + "loss": 0.71606004, + "num_input_tokens_seen": 273927415, + "step": 12701, + "time_per_iteration": 2.6201059818267822 + }, + { + "auxiliary_loss_clip": 0.01083521, + "auxiliary_loss_mlp": 0.01031087, + "balance_loss_clip": 1.03899217, + "balance_loss_mlp": 1.01891518, + "epoch": 0.7636855553885465, + "flos": 21325229032320.0, + "grad_norm": 2.215440552723354, + "language_loss": 0.64664185, + "learning_rate": 5.576400710039508e-07, + "loss": 0.66778791, + "num_input_tokens_seen": 273946690, + "step": 12702, + "time_per_iteration": 2.7970054149627686 + }, + { + "auxiliary_loss_clip": 0.01079185, + "auxiliary_loss_mlp": 0.01033415, + "balance_loss_clip": 1.03836131, + "balance_loss_mlp": 1.02095747, + "epoch": 0.7637456786412145, + "flos": 28658079849600.0, + "grad_norm": 1.9784000831539899, + "language_loss": 0.66083431, + "learning_rate": 5.57370299645477e-07, + "loss": 0.68196028, + "num_input_tokens_seen": 273966870, + "step": 12703, + "time_per_iteration": 2.822849750518799 + }, + { + "auxiliary_loss_clip": 0.01087834, + "auxiliary_loss_mlp": 0.01026937, + "balance_loss_clip": 1.03842688, + "balance_loss_mlp": 1.01438999, + "epoch": 0.7638058018938825, + "flos": 21907269014400.0, + "grad_norm": 2.027090239685764, + "language_loss": 0.83859146, + "learning_rate": 5.571005829916668e-07, + "loss": 0.85973918, + "num_input_tokens_seen": 273986360, + "step": 12704, + "time_per_iteration": 2.728527784347534 + }, + { + "auxiliary_loss_clip": 0.01088663, + "auxiliary_loss_mlp": 0.01032796, + "balance_loss_clip": 1.03736877, + "balance_loss_mlp": 1.02039814, + "epoch": 0.7638659251465504, + "flos": 29643899592960.0, + "grad_norm": 1.895547997363001, + "language_loss": 0.67812586, + "learning_rate": 5.568309210527469e-07, + "loss": 0.69934046, + "num_input_tokens_seen": 274009745, + "step": 12705, + "time_per_iteration": 2.818378448486328 + }, + { + "auxiliary_loss_clip": 0.01083042, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.03550816, + "balance_loss_mlp": 1.01972699, + "epoch": 0.7639260483992184, + "flos": 26141172929280.0, + "grad_norm": 1.7310921121136604, + "language_loss": 0.73945439, + "learning_rate": 5.565613138389427e-07, + "loss": 0.76060611, + "num_input_tokens_seen": 274028775, + "step": 12706, + "time_per_iteration": 2.7738003730773926 + }, + { + "auxiliary_loss_clip": 0.0109458, + "auxiliary_loss_mlp": 0.01037611, + "balance_loss_clip": 1.03670621, + "balance_loss_mlp": 1.02431297, + "epoch": 0.7639861716518863, + "flos": 20156695781760.0, + "grad_norm": 2.5805411754522396, + "language_loss": 0.78420258, + "learning_rate": 5.562917613604781e-07, + "loss": 0.80552453, + "num_input_tokens_seen": 274047520, + "step": 12707, + "time_per_iteration": 2.785919666290283 + }, + { + "auxiliary_loss_clip": 0.01083532, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.03674436, + "balance_loss_mlp": 1.01594281, + "epoch": 0.7640462949045543, + "flos": 18583125793920.0, + "grad_norm": 1.8763992467573365, + "language_loss": 0.79923272, + "learning_rate": 5.560222636275751e-07, + "loss": 0.82035094, + "num_input_tokens_seen": 274065350, + "step": 12708, + "time_per_iteration": 2.7112326622009277 + }, + { + "auxiliary_loss_clip": 0.0102089, + "auxiliary_loss_mlp": 0.00999756, + "balance_loss_clip": 1.0106082, + "balance_loss_mlp": 0.99848616, + "epoch": 0.7641064181572224, + "flos": 68321991646080.0, + "grad_norm": 0.8077298698173723, + "language_loss": 0.56427336, + "learning_rate": 5.557528206504521e-07, + "loss": 0.58447981, + "num_input_tokens_seen": 274122315, + "step": 12709, + "time_per_iteration": 3.2111401557922363 + }, + { + "auxiliary_loss_clip": 0.01098648, + "auxiliary_loss_mlp": 0.01040257, + "balance_loss_clip": 1.03582978, + "balance_loss_mlp": 1.02636278, + "epoch": 0.7641665414098903, + "flos": 17968982031360.0, + "grad_norm": 1.9630774322237245, + "language_loss": 0.63484347, + "learning_rate": 5.554834324393271e-07, + "loss": 0.65623254, + "num_input_tokens_seen": 274140555, + "step": 12710, + "time_per_iteration": 2.685795545578003 + }, + { + "auxiliary_loss_clip": 0.01062185, + "auxiliary_loss_mlp": 0.00771699, + "balance_loss_clip": 1.03377032, + "balance_loss_mlp": 1.00016749, + "epoch": 0.7642266646625583, + "flos": 21252078984960.0, + "grad_norm": 2.5143918151768867, + "language_loss": 0.64498585, + "learning_rate": 5.552140990044154e-07, + "loss": 0.66332471, + "num_input_tokens_seen": 274161125, + "step": 12711, + "time_per_iteration": 2.845017671585083 + }, + { + "auxiliary_loss_clip": 0.01088311, + "auxiliary_loss_mlp": 0.01037404, + "balance_loss_clip": 1.03707993, + "balance_loss_mlp": 1.02514362, + "epoch": 0.7642867879152262, + "flos": 22747794243840.0, + "grad_norm": 1.7149688745487186, + "language_loss": 0.72759664, + "learning_rate": 5.549448203559293e-07, + "loss": 0.7488538, + "num_input_tokens_seen": 274180835, + "step": 12712, + "time_per_iteration": 2.7211430072784424 + }, + { + "auxiliary_loss_clip": 0.01077131, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.03835392, + "balance_loss_mlp": 1.02084625, + "epoch": 0.7643469111678942, + "flos": 23332132696320.0, + "grad_norm": 2.218446959632987, + "language_loss": 0.80380988, + "learning_rate": 5.546755965040804e-07, + "loss": 0.82490551, + "num_input_tokens_seen": 274201190, + "step": 12713, + "time_per_iteration": 2.822138786315918 + }, + { + "auxiliary_loss_clip": 0.01102023, + "auxiliary_loss_mlp": 0.00771212, + "balance_loss_clip": 1.03739047, + "balance_loss_mlp": 1.00028956, + "epoch": 0.7644070344205621, + "flos": 19857092440320.0, + "grad_norm": 2.084525894573783, + "language_loss": 0.83132589, + "learning_rate": 5.544064274590776e-07, + "loss": 0.85005832, + "num_input_tokens_seen": 274217595, + "step": 12714, + "time_per_iteration": 2.67500638961792 + }, + { + "auxiliary_loss_clip": 0.01104132, + "auxiliary_loss_mlp": 0.01037809, + "balance_loss_clip": 1.0384692, + "balance_loss_mlp": 1.02498782, + "epoch": 0.7644671576732301, + "flos": 22090628966400.0, + "grad_norm": 1.7447994690858495, + "language_loss": 0.73020244, + "learning_rate": 5.541373132311287e-07, + "loss": 0.75162184, + "num_input_tokens_seen": 274237885, + "step": 12715, + "time_per_iteration": 2.705496072769165 + }, + { + "auxiliary_loss_clip": 0.0106908, + "auxiliary_loss_mlp": 0.01029403, + "balance_loss_clip": 1.03376102, + "balance_loss_mlp": 1.01651025, + "epoch": 0.7645272809258981, + "flos": 25481421872640.0, + "grad_norm": 1.9750549289299242, + "language_loss": 0.63063681, + "learning_rate": 5.538682538304376e-07, + "loss": 0.65162164, + "num_input_tokens_seen": 274258820, + "step": 12716, + "time_per_iteration": 2.7983617782592773 + }, + { + "auxiliary_loss_clip": 0.01115577, + "auxiliary_loss_mlp": 0.01037115, + "balance_loss_clip": 1.03981853, + "balance_loss_mlp": 1.02357841, + "epoch": 0.7645874041785661, + "flos": 21541877913600.0, + "grad_norm": 1.536427490036212, + "language_loss": 0.79740059, + "learning_rate": 5.535992492672068e-07, + "loss": 0.81892753, + "num_input_tokens_seen": 274278835, + "step": 12717, + "time_per_iteration": 2.595195770263672 + }, + { + "auxiliary_loss_clip": 0.01110878, + "auxiliary_loss_mlp": 0.01037171, + "balance_loss_clip": 1.03890347, + "balance_loss_mlp": 1.02481461, + "epoch": 0.764647527431234, + "flos": 20630896156800.0, + "grad_norm": 2.30472579589713, + "language_loss": 0.66033196, + "learning_rate": 5.53330299551638e-07, + "loss": 0.68181252, + "num_input_tokens_seen": 274297110, + "step": 12718, + "time_per_iteration": 2.673990488052368 + }, + { + "auxiliary_loss_clip": 0.01063441, + "auxiliary_loss_mlp": 0.01036496, + "balance_loss_clip": 1.03585815, + "balance_loss_mlp": 1.02499259, + "epoch": 0.764707650683902, + "flos": 21434074220160.0, + "grad_norm": 2.1613863310626287, + "language_loss": 0.77098262, + "learning_rate": 5.530614046939286e-07, + "loss": 0.791982, + "num_input_tokens_seen": 274315610, + "step": 12719, + "time_per_iteration": 2.6510918140411377 + }, + { + "auxiliary_loss_clip": 0.01112525, + "auxiliary_loss_mlp": 0.01029144, + "balance_loss_clip": 1.03881288, + "balance_loss_mlp": 1.01615012, + "epoch": 0.7647677739365699, + "flos": 22711201263360.0, + "grad_norm": 2.267731943336326, + "language_loss": 0.7029593, + "learning_rate": 5.527925647042754e-07, + "loss": 0.72437602, + "num_input_tokens_seen": 274333975, + "step": 12720, + "time_per_iteration": 2.5991692543029785 + }, + { + "auxiliary_loss_clip": 0.01079824, + "auxiliary_loss_mlp": 0.01040879, + "balance_loss_clip": 1.03855467, + "balance_loss_mlp": 1.02823687, + "epoch": 0.7648278971892379, + "flos": 21324115710720.0, + "grad_norm": 1.5967062450845435, + "language_loss": 0.73703921, + "learning_rate": 5.52523779592875e-07, + "loss": 0.7582463, + "num_input_tokens_seen": 274353695, + "step": 12721, + "time_per_iteration": 2.764606237411499 + }, + { + "auxiliary_loss_clip": 0.01070414, + "auxiliary_loss_mlp": 0.01030705, + "balance_loss_clip": 1.03494334, + "balance_loss_mlp": 1.01805067, + "epoch": 0.764888020441906, + "flos": 20667345482880.0, + "grad_norm": 1.6944622449827433, + "language_loss": 0.73529649, + "learning_rate": 5.522550493699163e-07, + "loss": 0.75630772, + "num_input_tokens_seen": 274371120, + "step": 12722, + "time_per_iteration": 2.7863218784332275 + }, + { + "auxiliary_loss_clip": 0.01099467, + "auxiliary_loss_mlp": 0.01038218, + "balance_loss_clip": 1.03691196, + "balance_loss_mlp": 1.02573085, + "epoch": 0.7649481436945739, + "flos": 25082526360960.0, + "grad_norm": 1.8664873966014532, + "language_loss": 0.74043649, + "learning_rate": 5.519863740455912e-07, + "loss": 0.76181328, + "num_input_tokens_seen": 274389665, + "step": 12723, + "time_per_iteration": 2.6984498500823975 + }, + { + "auxiliary_loss_clip": 0.01111926, + "auxiliary_loss_mlp": 0.0103197, + "balance_loss_clip": 1.03712893, + "balance_loss_mlp": 1.01897049, + "epoch": 0.7650082669472419, + "flos": 24900890261760.0, + "grad_norm": 1.9718177009092177, + "language_loss": 0.73098785, + "learning_rate": 5.517177536300881e-07, + "loss": 0.75242674, + "num_input_tokens_seen": 274408750, + "step": 12724, + "time_per_iteration": 2.723292112350464 + }, + { + "auxiliary_loss_clip": 0.0109622, + "auxiliary_loss_mlp": 0.01027237, + "balance_loss_clip": 1.03798413, + "balance_loss_mlp": 1.01521456, + "epoch": 0.7650683901999098, + "flos": 14647388676480.0, + "grad_norm": 1.8049167073820385, + "language_loss": 0.83982503, + "learning_rate": 5.514491881335935e-07, + "loss": 0.86105955, + "num_input_tokens_seen": 274424600, + "step": 12725, + "time_per_iteration": 2.6900579929351807 + }, + { + "auxiliary_loss_clip": 0.01071599, + "auxiliary_loss_mlp": 0.01033335, + "balance_loss_clip": 1.03815186, + "balance_loss_mlp": 1.01962614, + "epoch": 0.7651285134525778, + "flos": 26352434770560.0, + "grad_norm": 1.764771346840138, + "language_loss": 0.77535796, + "learning_rate": 5.511806775662901e-07, + "loss": 0.79640734, + "num_input_tokens_seen": 274443075, + "step": 12726, + "time_per_iteration": 2.7554757595062256 + }, + { + "auxiliary_loss_clip": 0.01098675, + "auxiliary_loss_mlp": 0.0103653, + "balance_loss_clip": 1.03659284, + "balance_loss_mlp": 1.0239116, + "epoch": 0.7651886367052457, + "flos": 26646866553600.0, + "grad_norm": 1.727505900288767, + "language_loss": 0.70817876, + "learning_rate": 5.509122219383615e-07, + "loss": 0.72953087, + "num_input_tokens_seen": 274463240, + "step": 12727, + "time_per_iteration": 2.679713249206543 + }, + { + "auxiliary_loss_clip": 0.0110535, + "auxiliary_loss_mlp": 0.01031096, + "balance_loss_clip": 1.03530371, + "balance_loss_mlp": 1.01887083, + "epoch": 0.7652487599579137, + "flos": 25702847262720.0, + "grad_norm": 1.645567589950576, + "language_loss": 0.79781538, + "learning_rate": 5.506438212599864e-07, + "loss": 0.81917983, + "num_input_tokens_seen": 274482750, + "step": 12728, + "time_per_iteration": 2.6556482315063477 + }, + { + "auxiliary_loss_clip": 0.01112141, + "auxiliary_loss_mlp": 0.01029917, + "balance_loss_clip": 1.03871763, + "balance_loss_mlp": 1.01615465, + "epoch": 0.7653088832105817, + "flos": 28585576247040.0, + "grad_norm": 2.018168225354916, + "language_loss": 0.55207121, + "learning_rate": 5.503754755413424e-07, + "loss": 0.57349181, + "num_input_tokens_seen": 274503545, + "step": 12729, + "time_per_iteration": 2.656604290008545 + }, + { + "auxiliary_loss_clip": 0.01087792, + "auxiliary_loss_mlp": 0.00770692, + "balance_loss_clip": 1.03700304, + "balance_loss_mlp": 1.00016689, + "epoch": 0.7653690064632497, + "flos": 23366750428800.0, + "grad_norm": 2.0285553204704914, + "language_loss": 0.78009534, + "learning_rate": 5.501071847926055e-07, + "loss": 0.79868019, + "num_input_tokens_seen": 274523825, + "step": 12730, + "time_per_iteration": 4.308157920837402 + }, + { + "auxiliary_loss_clip": 0.01104921, + "auxiliary_loss_mlp": 0.01038983, + "balance_loss_clip": 1.04124045, + "balance_loss_mlp": 1.02547646, + "epoch": 0.7654291297159176, + "flos": 15773905992960.0, + "grad_norm": 1.8100841028281673, + "language_loss": 0.69162709, + "learning_rate": 5.498389490239495e-07, + "loss": 0.7130661, + "num_input_tokens_seen": 274541625, + "step": 12731, + "time_per_iteration": 5.375198841094971 + }, + { + "auxiliary_loss_clip": 0.0111224, + "auxiliary_loss_mlp": 0.01032378, + "balance_loss_clip": 1.03824425, + "balance_loss_mlp": 1.0195576, + "epoch": 0.7654892529685856, + "flos": 18033800123520.0, + "grad_norm": 2.185341705177071, + "language_loss": 0.70105004, + "learning_rate": 5.495707682455471e-07, + "loss": 0.72249627, + "num_input_tokens_seen": 274557580, + "step": 12732, + "time_per_iteration": 4.1254401206970215 + }, + { + "auxiliary_loss_clip": 0.01092112, + "auxiliary_loss_mlp": 0.01027482, + "balance_loss_clip": 1.0373385, + "balance_loss_mlp": 1.01429737, + "epoch": 0.7655493762212535, + "flos": 27236017428480.0, + "grad_norm": 1.4842742362274353, + "language_loss": 0.78410125, + "learning_rate": 5.493026424675653e-07, + "loss": 0.8052972, + "num_input_tokens_seen": 274578135, + "step": 12733, + "time_per_iteration": 2.7428579330444336 + }, + { + "auxiliary_loss_clip": 0.0109795, + "auxiliary_loss_mlp": 0.01031014, + "balance_loss_clip": 1.03692389, + "balance_loss_mlp": 1.0184319, + "epoch": 0.7656094994739215, + "flos": 20773964027520.0, + "grad_norm": 1.7566510390792163, + "language_loss": 0.7753557, + "learning_rate": 5.490345717001726e-07, + "loss": 0.79664528, + "num_input_tokens_seen": 274595655, + "step": 12734, + "time_per_iteration": 2.7525999546051025 + }, + { + "auxiliary_loss_clip": 0.01085843, + "auxiliary_loss_mlp": 0.01034242, + "balance_loss_clip": 1.03541505, + "balance_loss_mlp": 1.01981783, + "epoch": 0.7656696227265896, + "flos": 23039245198080.0, + "grad_norm": 1.5677045475604683, + "language_loss": 0.73221684, + "learning_rate": 5.48766555953535e-07, + "loss": 0.75341773, + "num_input_tokens_seen": 274616305, + "step": 12735, + "time_per_iteration": 2.7425713539123535 + }, + { + "auxiliary_loss_clip": 0.01081818, + "auxiliary_loss_mlp": 0.01035075, + "balance_loss_clip": 1.03768682, + "balance_loss_mlp": 1.02273107, + "epoch": 0.7657297459792575, + "flos": 27525636789120.0, + "grad_norm": 1.7042118812882554, + "language_loss": 0.72533989, + "learning_rate": 5.484985952378145e-07, + "loss": 0.74650872, + "num_input_tokens_seen": 274638110, + "step": 12736, + "time_per_iteration": 4.268921852111816 + }, + { + "auxiliary_loss_clip": 0.01100818, + "auxiliary_loss_mlp": 0.00771184, + "balance_loss_clip": 1.0399543, + "balance_loss_mlp": 1.00027192, + "epoch": 0.7657898692319255, + "flos": 17128456801920.0, + "grad_norm": 2.232664044830526, + "language_loss": 0.77698004, + "learning_rate": 5.482306895631728e-07, + "loss": 0.79570007, + "num_input_tokens_seen": 274656565, + "step": 12737, + "time_per_iteration": 2.751887321472168 + }, + { + "auxiliary_loss_clip": 0.0108412, + "auxiliary_loss_mlp": 0.01034529, + "balance_loss_clip": 1.03502047, + "balance_loss_mlp": 1.02128458, + "epoch": 0.7658499924845934, + "flos": 21465747037440.0, + "grad_norm": 1.8163284528378292, + "language_loss": 0.76455462, + "learning_rate": 5.479628389397699e-07, + "loss": 0.78574109, + "num_input_tokens_seen": 274674215, + "step": 12738, + "time_per_iteration": 2.7251851558685303 + }, + { + "auxiliary_loss_clip": 0.01092339, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.03848684, + "balance_loss_mlp": 1.01825941, + "epoch": 0.7659101157372614, + "flos": 29496665744640.0, + "grad_norm": 1.9441100679422159, + "language_loss": 0.62250507, + "learning_rate": 5.476950433777603e-07, + "loss": 0.64373976, + "num_input_tokens_seen": 274693445, + "step": 12739, + "time_per_iteration": 2.858171224594116 + }, + { + "auxiliary_loss_clip": 0.01112469, + "auxiliary_loss_mlp": 0.01035738, + "balance_loss_clip": 1.03928363, + "balance_loss_mlp": 1.02203465, + "epoch": 0.7659702389899293, + "flos": 18551812112640.0, + "grad_norm": 2.47113097275276, + "language_loss": 0.79031968, + "learning_rate": 5.474273028873004e-07, + "loss": 0.81180167, + "num_input_tokens_seen": 274712815, + "step": 12740, + "time_per_iteration": 2.624732732772827 + }, + { + "auxiliary_loss_clip": 0.01100888, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.03686976, + "balance_loss_mlp": 1.01987791, + "epoch": 0.7660303622425974, + "flos": 23549176627200.0, + "grad_norm": 1.653199646827083, + "language_loss": 0.65173864, + "learning_rate": 5.471596174785429e-07, + "loss": 0.67307615, + "num_input_tokens_seen": 274732690, + "step": 12741, + "time_per_iteration": 2.716336488723755 + }, + { + "auxiliary_loss_clip": 0.01083513, + "auxiliary_loss_mlp": 0.01030711, + "balance_loss_clip": 1.03482628, + "balance_loss_mlp": 1.0174545, + "epoch": 0.7660904854952653, + "flos": 18916736336640.0, + "grad_norm": 1.544754015503659, + "language_loss": 0.75767601, + "learning_rate": 5.468919871616386e-07, + "loss": 0.77881825, + "num_input_tokens_seen": 274752460, + "step": 12742, + "time_per_iteration": 2.7747738361358643 + }, + { + "auxiliary_loss_clip": 0.01086511, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.03885317, + "balance_loss_mlp": 1.01983905, + "epoch": 0.7661506087479333, + "flos": 23147515768320.0, + "grad_norm": 1.4566796365103731, + "language_loss": 0.76655585, + "learning_rate": 5.46624411946736e-07, + "loss": 0.78773808, + "num_input_tokens_seen": 274773070, + "step": 12743, + "time_per_iteration": 2.780097484588623 + }, + { + "auxiliary_loss_clip": 0.01085441, + "auxiliary_loss_mlp": 0.01034742, + "balance_loss_clip": 1.03478014, + "balance_loss_mlp": 1.02236819, + "epoch": 0.7662107320006012, + "flos": 17565776887680.0, + "grad_norm": 1.917782267543357, + "language_loss": 0.74838865, + "learning_rate": 5.463568918439805e-07, + "loss": 0.76959044, + "num_input_tokens_seen": 274790220, + "step": 12744, + "time_per_iteration": 2.8596222400665283 + }, + { + "auxiliary_loss_clip": 0.01099606, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.03648257, + "balance_loss_mlp": 1.02051127, + "epoch": 0.7662708552532692, + "flos": 22303075956480.0, + "grad_norm": 2.0417086586666424, + "language_loss": 0.71049422, + "learning_rate": 5.460894268635181e-07, + "loss": 0.73182726, + "num_input_tokens_seen": 274805095, + "step": 12745, + "time_per_iteration": 2.7712717056274414 + }, + { + "auxiliary_loss_clip": 0.01095184, + "auxiliary_loss_mlp": 0.01038801, + "balance_loss_clip": 1.03534567, + "balance_loss_mlp": 1.0241797, + "epoch": 0.7663309785059371, + "flos": 15742053607680.0, + "grad_norm": 2.301646519557661, + "language_loss": 0.77083957, + "learning_rate": 5.458220170154896e-07, + "loss": 0.79217947, + "num_input_tokens_seen": 274821800, + "step": 12746, + "time_per_iteration": 2.6804726123809814 + }, + { + "auxiliary_loss_clip": 0.0100528, + "auxiliary_loss_mlp": 0.01001059, + "balance_loss_clip": 1.01132298, + "balance_loss_mlp": 0.99997419, + "epoch": 0.7663911017586051, + "flos": 62163312514560.0, + "grad_norm": 0.6620577659541201, + "language_loss": 0.56773937, + "learning_rate": 5.455546623100362e-07, + "loss": 0.58780277, + "num_input_tokens_seen": 274886970, + "step": 12747, + "time_per_iteration": 3.3290786743164062 + }, + { + "auxiliary_loss_clip": 0.01108005, + "auxiliary_loss_mlp": 0.01035791, + "balance_loss_clip": 1.03717351, + "balance_loss_mlp": 1.02456689, + "epoch": 0.7664512250112732, + "flos": 26506025326080.0, + "grad_norm": 1.9151583390314333, + "language_loss": 0.72503966, + "learning_rate": 5.452873627572956e-07, + "loss": 0.7464776, + "num_input_tokens_seen": 274907240, + "step": 12748, + "time_per_iteration": 2.730177640914917 + }, + { + "auxiliary_loss_clip": 0.01074476, + "auxiliary_loss_mlp": 0.01028824, + "balance_loss_clip": 1.03368735, + "balance_loss_mlp": 1.01592588, + "epoch": 0.7665113482639411, + "flos": 16249542912000.0, + "grad_norm": 1.8433426874848031, + "language_loss": 0.69247651, + "learning_rate": 5.450201183674052e-07, + "loss": 0.7135095, + "num_input_tokens_seen": 274924650, + "step": 12749, + "time_per_iteration": 2.755204439163208 + }, + { + "auxiliary_loss_clip": 0.01101353, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.03804362, + "balance_loss_mlp": 1.01727748, + "epoch": 0.7665714715166091, + "flos": 27197880163200.0, + "grad_norm": 1.535641047844791, + "language_loss": 0.73516762, + "learning_rate": 5.447529291504967e-07, + "loss": 0.75648522, + "num_input_tokens_seen": 274944550, + "step": 12750, + "time_per_iteration": 2.7742607593536377 + }, + { + "auxiliary_loss_clip": 0.01097021, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.0379684, + "balance_loss_mlp": 1.02008176, + "epoch": 0.766631594769277, + "flos": 21067785279360.0, + "grad_norm": 2.3156427112447147, + "language_loss": 0.76064527, + "learning_rate": 5.444857951167026e-07, + "loss": 0.78193521, + "num_input_tokens_seen": 274961330, + "step": 12751, + "time_per_iteration": 2.730836868286133 + }, + { + "auxiliary_loss_clip": 0.01077429, + "auxiliary_loss_mlp": 0.01037666, + "balance_loss_clip": 1.03694248, + "balance_loss_mlp": 1.02451706, + "epoch": 0.766691718021945, + "flos": 24097963593600.0, + "grad_norm": 1.9738925392982969, + "language_loss": 0.6149745, + "learning_rate": 5.442187162761537e-07, + "loss": 0.63612545, + "num_input_tokens_seen": 274981655, + "step": 12752, + "time_per_iteration": 2.869851589202881 + }, + { + "auxiliary_loss_clip": 0.01102451, + "auxiliary_loss_mlp": 0.01036291, + "balance_loss_clip": 1.03904963, + "balance_loss_mlp": 1.02302337, + "epoch": 0.7667518412746129, + "flos": 23440654661760.0, + "grad_norm": 1.931365168470797, + "language_loss": 0.69503748, + "learning_rate": 5.439516926389767e-07, + "loss": 0.71642488, + "num_input_tokens_seen": 274999970, + "step": 12753, + "time_per_iteration": 2.7476491928100586 + }, + { + "auxiliary_loss_clip": 0.01101717, + "auxiliary_loss_mlp": 0.01036587, + "balance_loss_clip": 1.03879189, + "balance_loss_mlp": 1.02405787, + "epoch": 0.766811964527281, + "flos": 18148786536960.0, + "grad_norm": 2.611222297039761, + "language_loss": 0.62583512, + "learning_rate": 5.436847242152971e-07, + "loss": 0.64721823, + "num_input_tokens_seen": 275015805, + "step": 12754, + "time_per_iteration": 2.7371304035186768 + }, + { + "auxiliary_loss_clip": 0.01110914, + "auxiliary_loss_mlp": 0.01030173, + "balance_loss_clip": 1.03996325, + "balance_loss_mlp": 1.01831782, + "epoch": 0.7668720877799489, + "flos": 19536051657600.0, + "grad_norm": 2.549051131454572, + "language_loss": 0.80213803, + "learning_rate": 5.434178110152401e-07, + "loss": 0.82354891, + "num_input_tokens_seen": 275031810, + "step": 12755, + "time_per_iteration": 2.643878936767578 + }, + { + "auxiliary_loss_clip": 0.01110814, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.03913355, + "balance_loss_mlp": 1.01825666, + "epoch": 0.7669322110326169, + "flos": 22674320974080.0, + "grad_norm": 2.28671666205893, + "language_loss": 0.70240182, + "learning_rate": 5.431509530489242e-07, + "loss": 0.72381282, + "num_input_tokens_seen": 275049325, + "step": 12756, + "time_per_iteration": 2.666398763656616 + }, + { + "auxiliary_loss_clip": 0.01101033, + "auxiliary_loss_mlp": 0.0103684, + "balance_loss_clip": 1.03897476, + "balance_loss_mlp": 1.02491951, + "epoch": 0.7669923342852848, + "flos": 26469396432000.0, + "grad_norm": 1.5126125205867516, + "language_loss": 0.70042777, + "learning_rate": 5.428841503264706e-07, + "loss": 0.72180653, + "num_input_tokens_seen": 275070865, + "step": 12757, + "time_per_iteration": 2.9036061763763428 + }, + { + "auxiliary_loss_clip": 0.01090769, + "auxiliary_loss_mlp": 0.01039374, + "balance_loss_clip": 1.03925812, + "balance_loss_mlp": 1.02609968, + "epoch": 0.7670524575379528, + "flos": 22856136641280.0, + "grad_norm": 1.9623271762553347, + "language_loss": 0.76281571, + "learning_rate": 5.426174028579955e-07, + "loss": 0.7841171, + "num_input_tokens_seen": 275088015, + "step": 12758, + "time_per_iteration": 2.7477500438690186 + }, + { + "auxiliary_loss_clip": 0.0109864, + "auxiliary_loss_mlp": 0.01041128, + "balance_loss_clip": 1.03716195, + "balance_loss_mlp": 1.0282712, + "epoch": 0.7671125807906207, + "flos": 22452141398400.0, + "grad_norm": 1.933344061408033, + "language_loss": 0.76319116, + "learning_rate": 5.423507106536156e-07, + "loss": 0.78458881, + "num_input_tokens_seen": 275106975, + "step": 12759, + "time_per_iteration": 2.714374303817749 + }, + { + "auxiliary_loss_clip": 0.01087695, + "auxiliary_loss_mlp": 0.01028999, + "balance_loss_clip": 1.03469515, + "balance_loss_mlp": 1.0170604, + "epoch": 0.7671727040432887, + "flos": 35371543518720.0, + "grad_norm": 4.630848621895134, + "language_loss": 0.67929637, + "learning_rate": 5.420840737234425e-07, + "loss": 0.70046335, + "num_input_tokens_seen": 275129560, + "step": 12760, + "time_per_iteration": 2.7753570079803467 + }, + { + "auxiliary_loss_clip": 0.01089951, + "auxiliary_loss_mlp": 0.01034392, + "balance_loss_clip": 1.03797793, + "balance_loss_mlp": 1.02147603, + "epoch": 0.7672328272959568, + "flos": 22494947431680.0, + "grad_norm": 1.455109874708046, + "language_loss": 0.79299426, + "learning_rate": 5.418174920775871e-07, + "loss": 0.81423771, + "num_input_tokens_seen": 275151180, + "step": 12761, + "time_per_iteration": 2.7769343852996826 + }, + { + "auxiliary_loss_clip": 0.01085141, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.03607702, + "balance_loss_mlp": 1.022295, + "epoch": 0.7672929505486247, + "flos": 22815557251200.0, + "grad_norm": 18.920071863703896, + "language_loss": 0.66145515, + "learning_rate": 5.415509657261589e-07, + "loss": 0.68265665, + "num_input_tokens_seen": 275170605, + "step": 12762, + "time_per_iteration": 2.8406293392181396 + }, + { + "auxiliary_loss_clip": 0.01101121, + "auxiliary_loss_mlp": 0.01034249, + "balance_loss_clip": 1.03835821, + "balance_loss_mlp": 1.02105296, + "epoch": 0.7673530738012927, + "flos": 20338834671360.0, + "grad_norm": 1.6976408594267334, + "language_loss": 0.74313831, + "learning_rate": 5.412844946792639e-07, + "loss": 0.76449203, + "num_input_tokens_seen": 275188750, + "step": 12763, + "time_per_iteration": 2.6841235160827637 + }, + { + "auxiliary_loss_clip": 0.01088871, + "auxiliary_loss_mlp": 0.01033223, + "balance_loss_clip": 1.03973687, + "balance_loss_mlp": 1.02024698, + "epoch": 0.7674131970539606, + "flos": 34933576988160.0, + "grad_norm": 1.693482308646493, + "language_loss": 0.70655918, + "learning_rate": 5.410180789470067e-07, + "loss": 0.7277801, + "num_input_tokens_seen": 275211365, + "step": 12764, + "time_per_iteration": 2.821410894393921 + }, + { + "auxiliary_loss_clip": 0.01101312, + "auxiliary_loss_mlp": 0.01031147, + "balance_loss_clip": 1.03925323, + "balance_loss_mlp": 1.01875496, + "epoch": 0.7674733203066286, + "flos": 28328850766080.0, + "grad_norm": 1.8643050168393442, + "language_loss": 0.69511282, + "learning_rate": 5.40751718539491e-07, + "loss": 0.7164374, + "num_input_tokens_seen": 275231670, + "step": 12765, + "time_per_iteration": 2.7457258701324463 + }, + { + "auxiliary_loss_clip": 0.01081052, + "auxiliary_loss_mlp": 0.01029756, + "balance_loss_clip": 1.03556418, + "balance_loss_mlp": 1.01865792, + "epoch": 0.7675334435592965, + "flos": 16289727252480.0, + "grad_norm": 3.667092334043392, + "language_loss": 0.60817224, + "learning_rate": 5.404854134668162e-07, + "loss": 0.62928033, + "num_input_tokens_seen": 275249425, + "step": 12766, + "time_per_iteration": 2.6500067710876465 + }, + { + "auxiliary_loss_clip": 0.01001024, + "auxiliary_loss_mlp": 0.01013385, + "balance_loss_clip": 1.01323843, + "balance_loss_mlp": 1.01216352, + "epoch": 0.7675935668119646, + "flos": 64826232220800.0, + "grad_norm": 0.7347382071618644, + "language_loss": 0.60767788, + "learning_rate": 5.402191637390803e-07, + "loss": 0.62782198, + "num_input_tokens_seen": 275312485, + "step": 12767, + "time_per_iteration": 3.39412260055542 + }, + { + "auxiliary_loss_clip": 0.01089304, + "auxiliary_loss_mlp": 0.01027185, + "balance_loss_clip": 1.04006386, + "balance_loss_mlp": 1.01521647, + "epoch": 0.7676536900646325, + "flos": 22675398382080.0, + "grad_norm": 1.6451651301272818, + "language_loss": 0.69793016, + "learning_rate": 5.399529693663801e-07, + "loss": 0.71909499, + "num_input_tokens_seen": 275331680, + "step": 12768, + "time_per_iteration": 2.730433464050293 + }, + { + "auxiliary_loss_clip": 0.01106486, + "auxiliary_loss_mlp": 0.01036773, + "balance_loss_clip": 1.0407027, + "balance_loss_mlp": 1.0239104, + "epoch": 0.7677138133173005, + "flos": 26939682224640.0, + "grad_norm": 1.8343046170579347, + "language_loss": 0.71094149, + "learning_rate": 5.3968683035881e-07, + "loss": 0.73237407, + "num_input_tokens_seen": 275351615, + "step": 12769, + "time_per_iteration": 4.170667409896851 + }, + { + "auxiliary_loss_clip": 0.01103072, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.04003, + "balance_loss_mlp": 1.01668179, + "epoch": 0.7677739365699684, + "flos": 23799545400960.0, + "grad_norm": 1.983209153557694, + "language_loss": 0.80168104, + "learning_rate": 5.394207467264611e-07, + "loss": 0.82300717, + "num_input_tokens_seen": 275368815, + "step": 12770, + "time_per_iteration": 5.3567235469818115 + }, + { + "auxiliary_loss_clip": 0.01073219, + "auxiliary_loss_mlp": 0.01038788, + "balance_loss_clip": 1.03567314, + "balance_loss_mlp": 1.02632451, + "epoch": 0.7678340598226364, + "flos": 34455497944320.0, + "grad_norm": 1.6213929898270116, + "language_loss": 0.78927696, + "learning_rate": 5.391547184794245e-07, + "loss": 0.81039715, + "num_input_tokens_seen": 275389345, + "step": 12771, + "time_per_iteration": 4.329530954360962 + }, + { + "auxiliary_loss_clip": 0.01110874, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.03865027, + "balance_loss_mlp": 1.02205408, + "epoch": 0.7678941830753043, + "flos": 23841740903040.0, + "grad_norm": 1.3882460901064075, + "language_loss": 0.68299866, + "learning_rate": 5.388887456277876e-07, + "loss": 0.70444703, + "num_input_tokens_seen": 275411240, + "step": 12772, + "time_per_iteration": 2.6789863109588623 + }, + { + "auxiliary_loss_clip": 0.01095405, + "auxiliary_loss_mlp": 0.01027019, + "balance_loss_clip": 1.03676343, + "balance_loss_mlp": 1.01512742, + "epoch": 0.7679543063279723, + "flos": 25410929431680.0, + "grad_norm": 1.5084750243321292, + "language_loss": 0.73452669, + "learning_rate": 5.386228281816349e-07, + "loss": 0.75575089, + "num_input_tokens_seen": 275432010, + "step": 12773, + "time_per_iteration": 2.6992523670196533 + }, + { + "auxiliary_loss_clip": 0.01069552, + "auxiliary_loss_mlp": 0.01031097, + "balance_loss_clip": 1.03272963, + "balance_loss_mlp": 1.0193727, + "epoch": 0.7680144295806404, + "flos": 27962382257280.0, + "grad_norm": 1.681002895076516, + "language_loss": 0.81144333, + "learning_rate": 5.383569661510512e-07, + "loss": 0.83244979, + "num_input_tokens_seen": 275453710, + "step": 12774, + "time_per_iteration": 2.8317103385925293 + }, + { + "auxiliary_loss_clip": 0.01102442, + "auxiliary_loss_mlp": 0.00769635, + "balance_loss_clip": 1.04086018, + "balance_loss_mlp": 1.00017095, + "epoch": 0.7680745528333083, + "flos": 20412810731520.0, + "grad_norm": 1.7406217670940616, + "language_loss": 0.69881612, + "learning_rate": 5.380911595461177e-07, + "loss": 0.71753687, + "num_input_tokens_seen": 275472915, + "step": 12775, + "time_per_iteration": 2.6908600330352783 + }, + { + "auxiliary_loss_clip": 0.00994458, + "auxiliary_loss_mlp": 0.01000081, + "balance_loss_clip": 1.01208818, + "balance_loss_mlp": 0.99908555, + "epoch": 0.7681346760859763, + "flos": 68401103351040.0, + "grad_norm": 0.7006055087346096, + "language_loss": 0.5683471, + "learning_rate": 5.378254083769147e-07, + "loss": 0.58829248, + "num_input_tokens_seen": 275534785, + "step": 12776, + "time_per_iteration": 4.903045415878296 + }, + { + "auxiliary_loss_clip": 0.01097484, + "auxiliary_loss_mlp": 0.01038787, + "balance_loss_clip": 1.03686929, + "balance_loss_mlp": 1.02621067, + "epoch": 0.7681947993386442, + "flos": 21251468453760.0, + "grad_norm": 1.9522911810284198, + "language_loss": 0.73814118, + "learning_rate": 5.375597126535188e-07, + "loss": 0.75950396, + "num_input_tokens_seen": 275553205, + "step": 12777, + "time_per_iteration": 2.6122212409973145 + }, + { + "auxiliary_loss_clip": 0.01086003, + "auxiliary_loss_mlp": 0.01032363, + "balance_loss_clip": 1.04298782, + "balance_loss_mlp": 1.02055573, + "epoch": 0.7682549225913122, + "flos": 21397696721280.0, + "grad_norm": 2.745693545308853, + "language_loss": 0.70324051, + "learning_rate": 5.372940723860043e-07, + "loss": 0.72442418, + "num_input_tokens_seen": 275571490, + "step": 12778, + "time_per_iteration": 2.67712664604187 + }, + { + "auxiliary_loss_clip": 0.01097946, + "auxiliary_loss_mlp": 0.01036667, + "balance_loss_clip": 1.0395422, + "balance_loss_mlp": 1.02473378, + "epoch": 0.7683150458439801, + "flos": 23038921975680.0, + "grad_norm": 1.741525859100896, + "language_loss": 0.70140779, + "learning_rate": 5.37028487584446e-07, + "loss": 0.72275388, + "num_input_tokens_seen": 275589665, + "step": 12779, + "time_per_iteration": 2.699604034423828 + }, + { + "auxiliary_loss_clip": 0.01086473, + "auxiliary_loss_mlp": 0.01031094, + "balance_loss_clip": 1.03794789, + "balance_loss_mlp": 1.01829696, + "epoch": 0.7683751690966482, + "flos": 67332397996800.0, + "grad_norm": 9.13576096667177, + "language_loss": 0.58861399, + "learning_rate": 5.367629582589133e-07, + "loss": 0.60978961, + "num_input_tokens_seen": 275615605, + "step": 12780, + "time_per_iteration": 3.0669844150543213 + }, + { + "auxiliary_loss_clip": 0.01104147, + "auxiliary_loss_mlp": 0.01037402, + "balance_loss_clip": 1.03906894, + "balance_loss_mlp": 1.02291799, + "epoch": 0.7684352923493161, + "flos": 21798890703360.0, + "grad_norm": 1.8034337516792285, + "language_loss": 0.67968678, + "learning_rate": 5.364974844194759e-07, + "loss": 0.70110226, + "num_input_tokens_seen": 275634965, + "step": 12781, + "time_per_iteration": 2.651834726333618 + }, + { + "auxiliary_loss_clip": 0.01060523, + "auxiliary_loss_mlp": 0.01036749, + "balance_loss_clip": 1.03551328, + "balance_loss_mlp": 1.02461362, + "epoch": 0.7684954156019841, + "flos": 25847603072640.0, + "grad_norm": 1.4376609198163834, + "language_loss": 0.79309833, + "learning_rate": 5.362320660762016e-07, + "loss": 0.81407106, + "num_input_tokens_seen": 275655785, + "step": 12782, + "time_per_iteration": 2.847486972808838 + }, + { + "auxiliary_loss_clip": 0.01082383, + "auxiliary_loss_mlp": 0.0103233, + "balance_loss_clip": 1.03683078, + "balance_loss_mlp": 1.01938355, + "epoch": 0.768555538854652, + "flos": 25447378757760.0, + "grad_norm": 1.7564402643439623, + "language_loss": 0.67005706, + "learning_rate": 5.35966703239153e-07, + "loss": 0.69120419, + "num_input_tokens_seen": 275676160, + "step": 12783, + "time_per_iteration": 2.703382730484009 + }, + { + "auxiliary_loss_clip": 0.01090024, + "auxiliary_loss_mlp": 0.01032797, + "balance_loss_clip": 1.03791714, + "balance_loss_mlp": 1.01942182, + "epoch": 0.76861566210732, + "flos": 19646369303040.0, + "grad_norm": 1.6469852773745217, + "language_loss": 0.69382596, + "learning_rate": 5.357013959183938e-07, + "loss": 0.71505415, + "num_input_tokens_seen": 275695660, + "step": 12784, + "time_per_iteration": 2.704110860824585 + }, + { + "auxiliary_loss_clip": 0.01069442, + "auxiliary_loss_mlp": 0.01027242, + "balance_loss_clip": 1.03885603, + "balance_loss_mlp": 1.01570261, + "epoch": 0.7686757853599879, + "flos": 22419032037120.0, + "grad_norm": 1.8804976619771494, + "language_loss": 0.80312717, + "learning_rate": 5.354361441239843e-07, + "loss": 0.824094, + "num_input_tokens_seen": 275714025, + "step": 12785, + "time_per_iteration": 2.7998046875 + }, + { + "auxiliary_loss_clip": 0.0109676, + "auxiliary_loss_mlp": 0.01038542, + "balance_loss_clip": 1.03655457, + "balance_loss_mlp": 1.02337885, + "epoch": 0.768735908612656, + "flos": 47774262453120.0, + "grad_norm": 1.5387616772885826, + "language_loss": 0.77432472, + "learning_rate": 5.351709478659836e-07, + "loss": 0.79567772, + "num_input_tokens_seen": 275737300, + "step": 12786, + "time_per_iteration": 2.8903398513793945 + }, + { + "auxiliary_loss_clip": 0.01110354, + "auxiliary_loss_mlp": 0.01035321, + "balance_loss_clip": 1.03830373, + "balance_loss_mlp": 1.02295876, + "epoch": 0.7687960318653239, + "flos": 30263179000320.0, + "grad_norm": 1.918052748759356, + "language_loss": 0.58398765, + "learning_rate": 5.349058071544468e-07, + "loss": 0.60544437, + "num_input_tokens_seen": 275757895, + "step": 12787, + "time_per_iteration": 2.699540376663208 + }, + { + "auxiliary_loss_clip": 0.01082553, + "auxiliary_loss_mlp": 0.01032316, + "balance_loss_clip": 1.03361166, + "balance_loss_mlp": 1.01962042, + "epoch": 0.7688561551179919, + "flos": 19573434737280.0, + "grad_norm": 1.5809067798231773, + "language_loss": 0.76156747, + "learning_rate": 5.346407219994292e-07, + "loss": 0.78271621, + "num_input_tokens_seen": 275776745, + "step": 12788, + "time_per_iteration": 2.81557559967041 + }, + { + "auxiliary_loss_clip": 0.01071579, + "auxiliary_loss_mlp": 0.00770364, + "balance_loss_clip": 1.03880525, + "balance_loss_mlp": 1.00020683, + "epoch": 0.7689162783706599, + "flos": 22783776693120.0, + "grad_norm": 1.957956891358716, + "language_loss": 0.66906554, + "learning_rate": 5.343756924109821e-07, + "loss": 0.68748498, + "num_input_tokens_seen": 275797205, + "step": 12789, + "time_per_iteration": 2.8146092891693115 + }, + { + "auxiliary_loss_clip": 0.01090409, + "auxiliary_loss_mlp": 0.0103625, + "balance_loss_clip": 1.03680754, + "balance_loss_mlp": 1.02214777, + "epoch": 0.7689764016233278, + "flos": 34204195416960.0, + "grad_norm": 1.6643565512884475, + "language_loss": 0.68623877, + "learning_rate": 5.341107183991553e-07, + "loss": 0.70750535, + "num_input_tokens_seen": 275817935, + "step": 12790, + "time_per_iteration": 2.812708854675293 + }, + { + "auxiliary_loss_clip": 0.0108634, + "auxiliary_loss_mlp": 0.01032838, + "balance_loss_clip": 1.03740978, + "balance_loss_mlp": 1.01972485, + "epoch": 0.7690365248759958, + "flos": 17274469587840.0, + "grad_norm": 1.474038307182623, + "language_loss": 0.68689752, + "learning_rate": 5.338457999739969e-07, + "loss": 0.70808923, + "num_input_tokens_seen": 275837145, + "step": 12791, + "time_per_iteration": 2.7558822631835938 + }, + { + "auxiliary_loss_clip": 0.01097751, + "auxiliary_loss_mlp": 0.01036178, + "balance_loss_clip": 1.038535, + "balance_loss_mlp": 1.0244422, + "epoch": 0.7690966481286637, + "flos": 18223157646720.0, + "grad_norm": 2.037350378754986, + "language_loss": 0.79861724, + "learning_rate": 5.335809371455526e-07, + "loss": 0.81995654, + "num_input_tokens_seen": 275855705, + "step": 12792, + "time_per_iteration": 2.6373798847198486 + }, + { + "auxiliary_loss_clip": 0.01086002, + "auxiliary_loss_mlp": 0.00771512, + "balance_loss_clip": 1.04310513, + "balance_loss_mlp": 1.0003171, + "epoch": 0.7691567713813318, + "flos": 21537568281600.0, + "grad_norm": 1.8617627243354054, + "language_loss": 0.72776759, + "learning_rate": 5.333161299238673e-07, + "loss": 0.74634272, + "num_input_tokens_seen": 275873930, + "step": 12793, + "time_per_iteration": 2.8017160892486572 + }, + { + "auxiliary_loss_clip": 0.01074333, + "auxiliary_loss_mlp": 0.01036724, + "balance_loss_clip": 1.03909159, + "balance_loss_mlp": 1.02368283, + "epoch": 0.7692168946339997, + "flos": 39379999720320.0, + "grad_norm": 1.9633300130492255, + "language_loss": 0.63842422, + "learning_rate": 5.330513783189803e-07, + "loss": 0.65953475, + "num_input_tokens_seen": 275895895, + "step": 12794, + "time_per_iteration": 2.8763763904571533 + }, + { + "auxiliary_loss_clip": 0.01088067, + "auxiliary_loss_mlp": 0.01038769, + "balance_loss_clip": 1.03724957, + "balance_loss_mlp": 1.02609682, + "epoch": 0.7692770178866677, + "flos": 25009950931200.0, + "grad_norm": 1.537212991597864, + "language_loss": 0.76528752, + "learning_rate": 5.327866823409319e-07, + "loss": 0.78655589, + "num_input_tokens_seen": 275917825, + "step": 12795, + "time_per_iteration": 2.7116506099700928 + }, + { + "auxiliary_loss_clip": 0.01075556, + "auxiliary_loss_mlp": 0.01025881, + "balance_loss_clip": 1.03575516, + "balance_loss_mlp": 1.01325679, + "epoch": 0.7693371411393356, + "flos": 24716273333760.0, + "grad_norm": 1.8098665948309556, + "language_loss": 0.71871811, + "learning_rate": 5.325220419997601e-07, + "loss": 0.7397325, + "num_input_tokens_seen": 275937890, + "step": 12796, + "time_per_iteration": 2.770573139190674 + }, + { + "auxiliary_loss_clip": 0.01110769, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.03795838, + "balance_loss_mlp": 1.01753139, + "epoch": 0.7693972643920036, + "flos": 15924803028480.0, + "grad_norm": 1.8945883944315456, + "language_loss": 0.64692825, + "learning_rate": 5.32257457305499e-07, + "loss": 0.66833782, + "num_input_tokens_seen": 275954495, + "step": 12797, + "time_per_iteration": 2.597770929336548 + }, + { + "auxiliary_loss_clip": 0.01074194, + "auxiliary_loss_mlp": 0.01036918, + "balance_loss_clip": 1.03441215, + "balance_loss_mlp": 1.02261305, + "epoch": 0.7694573876446715, + "flos": 25405901527680.0, + "grad_norm": 2.104503388065538, + "language_loss": 0.91503501, + "learning_rate": 5.319929282681823e-07, + "loss": 0.93614614, + "num_input_tokens_seen": 275972395, + "step": 12798, + "time_per_iteration": 2.7857353687286377 + }, + { + "auxiliary_loss_clip": 0.01061452, + "auxiliary_loss_mlp": 0.01027349, + "balance_loss_clip": 1.03667367, + "balance_loss_mlp": 1.01509404, + "epoch": 0.7695175108973396, + "flos": 16654220513280.0, + "grad_norm": 1.8305644604969793, + "language_loss": 0.82303166, + "learning_rate": 5.317284548978418e-07, + "loss": 0.84391975, + "num_input_tokens_seen": 275989020, + "step": 12799, + "time_per_iteration": 2.7627201080322266 + }, + { + "auxiliary_loss_clip": 0.01057867, + "auxiliary_loss_mlp": 0.0102915, + "balance_loss_clip": 1.03739285, + "balance_loss_mlp": 1.01601338, + "epoch": 0.7695776341500075, + "flos": 13626520237440.0, + "grad_norm": 1.9375837310730932, + "language_loss": 0.7841835, + "learning_rate": 5.314640372045045e-07, + "loss": 0.80505365, + "num_input_tokens_seen": 276006525, + "step": 12800, + "time_per_iteration": 2.860802173614502 + }, + { + "auxiliary_loss_clip": 0.01094192, + "auxiliary_loss_mlp": 0.01029605, + "balance_loss_clip": 1.03736687, + "balance_loss_mlp": 1.01572347, + "epoch": 0.7696377574026755, + "flos": 24276690691200.0, + "grad_norm": 1.6551183463192032, + "language_loss": 0.83884531, + "learning_rate": 5.31199675198198e-07, + "loss": 0.86008328, + "num_input_tokens_seen": 276027130, + "step": 12801, + "time_per_iteration": 2.8100953102111816 + }, + { + "auxiliary_loss_clip": 0.0108893, + "auxiliary_loss_mlp": 0.01030665, + "balance_loss_clip": 1.03665733, + "balance_loss_mlp": 1.01778448, + "epoch": 0.7696978806553435, + "flos": 20923137210240.0, + "grad_norm": 2.4183621963241357, + "language_loss": 0.72267437, + "learning_rate": 5.30935368888947e-07, + "loss": 0.74387032, + "num_input_tokens_seen": 276045715, + "step": 12802, + "time_per_iteration": 2.716482639312744 + }, + { + "auxiliary_loss_clip": 0.0108354, + "auxiliary_loss_mlp": 0.01034672, + "balance_loss_clip": 1.0340662, + "balance_loss_mlp": 1.022048, + "epoch": 0.7697580039080114, + "flos": 22929609911040.0, + "grad_norm": 1.7224396030439215, + "language_loss": 0.75905406, + "learning_rate": 5.306711182867747e-07, + "loss": 0.78023618, + "num_input_tokens_seen": 276065375, + "step": 12803, + "time_per_iteration": 2.7502260208129883 + }, + { + "auxiliary_loss_clip": 0.01018092, + "auxiliary_loss_mlp": 0.01000358, + "balance_loss_clip": 1.01451325, + "balance_loss_mlp": 0.99920207, + "epoch": 0.7698181271606794, + "flos": 68717654933760.0, + "grad_norm": 0.7330583208910887, + "language_loss": 0.55806667, + "learning_rate": 5.304069234017001e-07, + "loss": 0.57825118, + "num_input_tokens_seen": 276131405, + "step": 12804, + "time_per_iteration": 3.3005380630493164 + }, + { + "auxiliary_loss_clip": 0.0101265, + "auxiliary_loss_mlp": 0.01002009, + "balance_loss_clip": 1.00900471, + "balance_loss_mlp": 1.00096023, + "epoch": 0.7698782504133473, + "flos": 67409716999680.0, + "grad_norm": 0.7614116720269231, + "language_loss": 0.54004955, + "learning_rate": 5.301427842437429e-07, + "loss": 0.56019616, + "num_input_tokens_seen": 276200755, + "step": 12805, + "time_per_iteration": 3.3900198936462402 + }, + { + "auxiliary_loss_clip": 0.0108001, + "auxiliary_loss_mlp": 0.01033208, + "balance_loss_clip": 1.03882051, + "balance_loss_mlp": 1.02053022, + "epoch": 0.7699383736660154, + "flos": 22488842119680.0, + "grad_norm": 1.986233467865104, + "language_loss": 0.73035413, + "learning_rate": 5.298787008229187e-07, + "loss": 0.7514863, + "num_input_tokens_seen": 276217880, + "step": 12806, + "time_per_iteration": 2.7341980934143066 + }, + { + "auxiliary_loss_clip": 0.01086866, + "auxiliary_loss_mlp": 0.01035537, + "balance_loss_clip": 1.03594339, + "balance_loss_mlp": 1.02238786, + "epoch": 0.7699984969186833, + "flos": 21539723097600.0, + "grad_norm": 2.048367090429927, + "language_loss": 0.75222588, + "learning_rate": 5.296146731492408e-07, + "loss": 0.7734499, + "num_input_tokens_seen": 276234810, + "step": 12807, + "time_per_iteration": 2.724539041519165 + }, + { + "auxiliary_loss_clip": 0.01106456, + "auxiliary_loss_mlp": 0.01031388, + "balance_loss_clip": 1.04034483, + "balance_loss_mlp": 1.01792347, + "epoch": 0.7700586201713513, + "flos": 21719096640000.0, + "grad_norm": 2.054947719033548, + "language_loss": 0.80061448, + "learning_rate": 5.293507012327218e-07, + "loss": 0.82199287, + "num_input_tokens_seen": 276252850, + "step": 12808, + "time_per_iteration": 4.215209722518921 + }, + { + "auxiliary_loss_clip": 0.01105023, + "auxiliary_loss_mlp": 0.01039739, + "balance_loss_clip": 1.03983986, + "balance_loss_mlp": 1.02620244, + "epoch": 0.7701187434240192, + "flos": 27856015107840.0, + "grad_norm": 2.2828692902230743, + "language_loss": 0.79191184, + "learning_rate": 5.290867850833718e-07, + "loss": 0.8133595, + "num_input_tokens_seen": 276272525, + "step": 12809, + "time_per_iteration": 4.67883825302124 + }, + { + "auxiliary_loss_clip": 0.01075128, + "auxiliary_loss_mlp": 0.01026317, + "balance_loss_clip": 1.03558159, + "balance_loss_mlp": 1.014974, + "epoch": 0.7701788666766872, + "flos": 28621307301120.0, + "grad_norm": 1.7126957543660224, + "language_loss": 0.70423043, + "learning_rate": 5.288229247111993e-07, + "loss": 0.72524494, + "num_input_tokens_seen": 276294210, + "step": 12810, + "time_per_iteration": 4.299976110458374 + }, + { + "auxiliary_loss_clip": 0.0108663, + "auxiliary_loss_mlp": 0.01043871, + "balance_loss_clip": 1.03548312, + "balance_loss_mlp": 1.02746737, + "epoch": 0.7702389899293551, + "flos": 14246446089600.0, + "grad_norm": 2.84512278280032, + "language_loss": 0.77875537, + "learning_rate": 5.285591201262079e-07, + "loss": 0.80006033, + "num_input_tokens_seen": 276310290, + "step": 12811, + "time_per_iteration": 2.792184352874756 + }, + { + "auxiliary_loss_clip": 0.01001395, + "auxiliary_loss_mlp": 0.01001171, + "balance_loss_clip": 1.00706363, + "balance_loss_mlp": 0.99988317, + "epoch": 0.7702991131820232, + "flos": 70574128439040.0, + "grad_norm": 0.8151907995721069, + "language_loss": 0.56650817, + "learning_rate": 5.28295371338402e-07, + "loss": 0.5865339, + "num_input_tokens_seen": 276371715, + "step": 12812, + "time_per_iteration": 3.301762819290161 + }, + { + "auxiliary_loss_clip": 0.01073584, + "auxiliary_loss_mlp": 0.01035613, + "balance_loss_clip": 1.03664494, + "balance_loss_mlp": 1.02299511, + "epoch": 0.7703592364346911, + "flos": 25480021242240.0, + "grad_norm": 3.4768581734180453, + "language_loss": 0.72098076, + "learning_rate": 5.280316783577836e-07, + "loss": 0.74207264, + "num_input_tokens_seen": 276389895, + "step": 12813, + "time_per_iteration": 2.8251900672912598 + }, + { + "auxiliary_loss_clip": 0.0110181, + "auxiliary_loss_mlp": 0.01030481, + "balance_loss_clip": 1.03734303, + "balance_loss_mlp": 1.01664054, + "epoch": 0.7704193596873591, + "flos": 19280906375040.0, + "grad_norm": 2.0063403023078297, + "language_loss": 0.66324687, + "learning_rate": 5.27768041194351e-07, + "loss": 0.68456984, + "num_input_tokens_seen": 276408990, + "step": 12814, + "time_per_iteration": 2.7897889614105225 + }, + { + "auxiliary_loss_clip": 0.01089036, + "auxiliary_loss_mlp": 0.01038036, + "balance_loss_clip": 1.03707969, + "balance_loss_mlp": 1.02553058, + "epoch": 0.7704794829400271, + "flos": 23658452778240.0, + "grad_norm": 1.8618896845056536, + "language_loss": 0.65574408, + "learning_rate": 5.275044598581018e-07, + "loss": 0.67701477, + "num_input_tokens_seen": 276428190, + "step": 12815, + "time_per_iteration": 2.745948314666748 + }, + { + "auxiliary_loss_clip": 0.0109967, + "auxiliary_loss_mlp": 0.01034412, + "balance_loss_clip": 1.03795624, + "balance_loss_mlp": 1.02119207, + "epoch": 0.770539606192695, + "flos": 18989311766400.0, + "grad_norm": 3.9090080450756703, + "language_loss": 0.65051812, + "learning_rate": 5.272409343590322e-07, + "loss": 0.67185891, + "num_input_tokens_seen": 276446855, + "step": 12816, + "time_per_iteration": 4.193779230117798 + }, + { + "auxiliary_loss_clip": 0.01102885, + "auxiliary_loss_mlp": 0.01034999, + "balance_loss_clip": 1.03968191, + "balance_loss_mlp": 1.02194536, + "epoch": 0.770599729445363, + "flos": 11830160142720.0, + "grad_norm": 2.3027657135701496, + "language_loss": 0.71589029, + "learning_rate": 5.26977464707133e-07, + "loss": 0.73726916, + "num_input_tokens_seen": 276462000, + "step": 12817, + "time_per_iteration": 2.701976776123047 + }, + { + "auxiliary_loss_clip": 0.01067462, + "auxiliary_loss_mlp": 0.01031755, + "balance_loss_clip": 1.03671288, + "balance_loss_mlp": 1.01967907, + "epoch": 0.770659852698031, + "flos": 17822610109440.0, + "grad_norm": 2.117205920773346, + "language_loss": 0.61316186, + "learning_rate": 5.267140509123957e-07, + "loss": 0.63415402, + "num_input_tokens_seen": 276481190, + "step": 12818, + "time_per_iteration": 2.894584894180298 + }, + { + "auxiliary_loss_clip": 0.01098817, + "auxiliary_loss_mlp": 0.01029481, + "balance_loss_clip": 1.03884339, + "balance_loss_mlp": 1.01770937, + "epoch": 0.770719975950699, + "flos": 21871968923520.0, + "grad_norm": 1.8092629622591248, + "language_loss": 0.67272353, + "learning_rate": 5.264506929848093e-07, + "loss": 0.69400644, + "num_input_tokens_seen": 276499520, + "step": 12819, + "time_per_iteration": 2.6729207038879395 + }, + { + "auxiliary_loss_clip": 0.01114198, + "auxiliary_loss_mlp": 0.01031273, + "balance_loss_clip": 1.04036117, + "balance_loss_mlp": 1.0183568, + "epoch": 0.7707800992033669, + "flos": 21325049464320.0, + "grad_norm": 3.8495844407525786, + "language_loss": 0.57512546, + "learning_rate": 5.261873909343608e-07, + "loss": 0.59658015, + "num_input_tokens_seen": 276519110, + "step": 12820, + "time_per_iteration": 2.6065587997436523 + }, + { + "auxiliary_loss_clip": 0.01082909, + "auxiliary_loss_mlp": 0.01030006, + "balance_loss_clip": 1.037233, + "balance_loss_mlp": 1.01698244, + "epoch": 0.7708402224560349, + "flos": 28179426188160.0, + "grad_norm": 2.6946227990391742, + "language_loss": 0.80718732, + "learning_rate": 5.259241447710343e-07, + "loss": 0.82831645, + "num_input_tokens_seen": 276538805, + "step": 12821, + "time_per_iteration": 2.7545745372772217 + }, + { + "auxiliary_loss_clip": 0.01113447, + "auxiliary_loss_mlp": 0.01036131, + "balance_loss_clip": 1.04009652, + "balance_loss_mlp": 1.02311945, + "epoch": 0.7709003457087028, + "flos": 15377057556480.0, + "grad_norm": 3.179311365273749, + "language_loss": 0.68571889, + "learning_rate": 5.256609545048114e-07, + "loss": 0.70721459, + "num_input_tokens_seen": 276554770, + "step": 12822, + "time_per_iteration": 2.6314475536346436 + }, + { + "auxiliary_loss_clip": 0.0108847, + "auxiliary_loss_mlp": 0.01036733, + "balance_loss_clip": 1.03697228, + "balance_loss_mlp": 1.02384686, + "epoch": 0.7709604689613708, + "flos": 30621854257920.0, + "grad_norm": 1.8530631240007662, + "language_loss": 0.72300768, + "learning_rate": 5.253978201456733e-07, + "loss": 0.74425972, + "num_input_tokens_seen": 276574535, + "step": 12823, + "time_per_iteration": 2.7124979496002197 + }, + { + "auxiliary_loss_clip": 0.01107629, + "auxiliary_loss_mlp": 0.01039791, + "balance_loss_clip": 1.04024911, + "balance_loss_mlp": 1.02459168, + "epoch": 0.7710205922140387, + "flos": 20301272023680.0, + "grad_norm": 1.7759548619058283, + "language_loss": 0.76394266, + "learning_rate": 5.251347417035969e-07, + "loss": 0.78541684, + "num_input_tokens_seen": 276592925, + "step": 12824, + "time_per_iteration": 2.7012369632720947 + }, + { + "auxiliary_loss_clip": 0.0108641, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.0379014, + "balance_loss_mlp": 1.01897967, + "epoch": 0.7710807154667068, + "flos": 19644214487040.0, + "grad_norm": 2.5594814083345856, + "language_loss": 0.72377741, + "learning_rate": 5.248717191885592e-07, + "loss": 0.744959, + "num_input_tokens_seen": 276610540, + "step": 12825, + "time_per_iteration": 2.711148977279663 + }, + { + "auxiliary_loss_clip": 0.0110825, + "auxiliary_loss_mlp": 0.01037397, + "balance_loss_clip": 1.03889346, + "balance_loss_mlp": 1.02650094, + "epoch": 0.7711408387193747, + "flos": 20006337450240.0, + "grad_norm": 1.6443549229743277, + "language_loss": 0.73782164, + "learning_rate": 5.246087526105343e-07, + "loss": 0.75927812, + "num_input_tokens_seen": 276629200, + "step": 12826, + "time_per_iteration": 2.6268928050994873 + }, + { + "auxiliary_loss_clip": 0.01112855, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.03778219, + "balance_loss_mlp": 1.02234912, + "epoch": 0.7712009619720427, + "flos": 24971131307520.0, + "grad_norm": 1.6817186914054845, + "language_loss": 0.81052697, + "learning_rate": 5.243458419794933e-07, + "loss": 0.83201313, + "num_input_tokens_seen": 276648655, + "step": 12827, + "time_per_iteration": 2.6236133575439453 + }, + { + "auxiliary_loss_clip": 0.01030504, + "auxiliary_loss_mlp": 0.01001401, + "balance_loss_clip": 1.0079608, + "balance_loss_mlp": 1.0003643, + "epoch": 0.7712610852247107, + "flos": 63249681404160.0, + "grad_norm": 0.8667997379462846, + "language_loss": 0.55184829, + "learning_rate": 5.240829873054051e-07, + "loss": 0.57216728, + "num_input_tokens_seen": 276716500, + "step": 12828, + "time_per_iteration": 3.314025640487671 + }, + { + "auxiliary_loss_clip": 0.01062789, + "auxiliary_loss_mlp": 0.01034135, + "balance_loss_clip": 1.03295088, + "balance_loss_mlp": 1.02165389, + "epoch": 0.7713212084773786, + "flos": 18697860812160.0, + "grad_norm": 1.7251497465168657, + "language_loss": 0.6980052, + "learning_rate": 5.23820188598238e-07, + "loss": 0.71897441, + "num_input_tokens_seen": 276733535, + "step": 12829, + "time_per_iteration": 2.7099921703338623 + }, + { + "auxiliary_loss_clip": 0.01085241, + "auxiliary_loss_mlp": 0.01036187, + "balance_loss_clip": 1.04121757, + "balance_loss_mlp": 1.02271688, + "epoch": 0.7713813317300466, + "flos": 14173367869440.0, + "grad_norm": 2.8210703511982, + "language_loss": 0.79999912, + "learning_rate": 5.235574458679579e-07, + "loss": 0.82121342, + "num_input_tokens_seen": 276749575, + "step": 12830, + "time_per_iteration": 2.754983901977539 + }, + { + "auxiliary_loss_clip": 0.01104042, + "auxiliary_loss_mlp": 0.01037065, + "balance_loss_clip": 1.03856182, + "balance_loss_mlp": 1.02329099, + "epoch": 0.7714414549827145, + "flos": 25703960584320.0, + "grad_norm": 1.661801317561211, + "language_loss": 0.77825183, + "learning_rate": 5.232947591245269e-07, + "loss": 0.79966295, + "num_input_tokens_seen": 276769460, + "step": 12831, + "time_per_iteration": 2.7142996788024902 + }, + { + "auxiliary_loss_clip": 0.01078302, + "auxiliary_loss_mlp": 0.01036061, + "balance_loss_clip": 1.0332458, + "balance_loss_mlp": 1.02210712, + "epoch": 0.7715015782353826, + "flos": 30555312312960.0, + "grad_norm": 1.5151652331679557, + "language_loss": 0.6105473, + "learning_rate": 5.230321283779071e-07, + "loss": 0.63169092, + "num_input_tokens_seen": 276790820, + "step": 12832, + "time_per_iteration": 2.717639684677124 + }, + { + "auxiliary_loss_clip": 0.01085655, + "auxiliary_loss_mlp": 0.01039371, + "balance_loss_clip": 1.03684115, + "balance_loss_mlp": 1.02620983, + "epoch": 0.7715617014880505, + "flos": 20229343038720.0, + "grad_norm": 1.801135841177815, + "language_loss": 0.79230422, + "learning_rate": 5.227695536380572e-07, + "loss": 0.81355441, + "num_input_tokens_seen": 276811345, + "step": 12833, + "time_per_iteration": 2.7320380210876465 + }, + { + "auxiliary_loss_clip": 0.00988976, + "auxiliary_loss_mlp": 0.01003321, + "balance_loss_clip": 1.00962079, + "balance_loss_mlp": 1.00185442, + "epoch": 0.7716218247407185, + "flos": 63664770971520.0, + "grad_norm": 0.8509203865481852, + "language_loss": 0.55384171, + "learning_rate": 5.22507034914933e-07, + "loss": 0.57376468, + "num_input_tokens_seen": 276870950, + "step": 12834, + "time_per_iteration": 3.2906105518341064 + }, + { + "auxiliary_loss_clip": 0.01065317, + "auxiliary_loss_mlp": 0.01033528, + "balance_loss_clip": 1.03433681, + "balance_loss_mlp": 1.019449, + "epoch": 0.7716819479933864, + "flos": 19791807471360.0, + "grad_norm": 2.0007244746658905, + "language_loss": 0.72596645, + "learning_rate": 5.222445722184903e-07, + "loss": 0.74695486, + "num_input_tokens_seen": 276890760, + "step": 12835, + "time_per_iteration": 2.789001941680908 + }, + { + "auxiliary_loss_clip": 0.01078061, + "auxiliary_loss_mlp": 0.00771412, + "balance_loss_clip": 1.03562582, + "balance_loss_mlp": 1.00025511, + "epoch": 0.7717420712460544, + "flos": 18442176825600.0, + "grad_norm": 1.8060607168740586, + "language_loss": 0.70171171, + "learning_rate": 5.219821655586814e-07, + "loss": 0.72020638, + "num_input_tokens_seen": 276909625, + "step": 12836, + "time_per_iteration": 2.728555917739868 + }, + { + "auxiliary_loss_clip": 0.01087588, + "auxiliary_loss_mlp": 0.01031496, + "balance_loss_clip": 1.03710699, + "balance_loss_mlp": 1.01896143, + "epoch": 0.7718021944987223, + "flos": 35189476456320.0, + "grad_norm": 1.7672669175991982, + "language_loss": 0.59498906, + "learning_rate": 5.217198149454575e-07, + "loss": 0.61617988, + "num_input_tokens_seen": 276930760, + "step": 12837, + "time_per_iteration": 2.771662712097168 + }, + { + "auxiliary_loss_clip": 0.01019463, + "auxiliary_loss_mlp": 0.01007255, + "balance_loss_clip": 1.0126214, + "balance_loss_mlp": 1.00599122, + "epoch": 0.7718623177513904, + "flos": 67923167961600.0, + "grad_norm": 0.860607802199013, + "language_loss": 0.55781054, + "learning_rate": 5.214575203887666e-07, + "loss": 0.57807767, + "num_input_tokens_seen": 276989580, + "step": 12838, + "time_per_iteration": 3.17033052444458 + }, + { + "auxiliary_loss_clip": 0.0110038, + "auxiliary_loss_mlp": 0.01028077, + "balance_loss_clip": 1.03804731, + "balance_loss_mlp": 1.01625776, + "epoch": 0.7719224410040583, + "flos": 18581401941120.0, + "grad_norm": 2.316418806228274, + "language_loss": 0.69647658, + "learning_rate": 5.211952818985538e-07, + "loss": 0.71776116, + "num_input_tokens_seen": 277005450, + "step": 12839, + "time_per_iteration": 2.645826578140259 + }, + { + "auxiliary_loss_clip": 0.01099944, + "auxiliary_loss_mlp": 0.01027637, + "balance_loss_clip": 1.03894663, + "balance_loss_mlp": 1.01572192, + "epoch": 0.7719825642567263, + "flos": 23075802264960.0, + "grad_norm": 1.8115476435749553, + "language_loss": 0.79911268, + "learning_rate": 5.209330994847647e-07, + "loss": 0.8203885, + "num_input_tokens_seen": 277023055, + "step": 12840, + "time_per_iteration": 2.706791400909424 + }, + { + "auxiliary_loss_clip": 0.0110078, + "auxiliary_loss_mlp": 0.00770822, + "balance_loss_clip": 1.03851485, + "balance_loss_mlp": 1.00014949, + "epoch": 0.7720426875093943, + "flos": 20339086066560.0, + "grad_norm": 2.545853908868313, + "language_loss": 0.80008757, + "learning_rate": 5.206709731573402e-07, + "loss": 0.81880367, + "num_input_tokens_seen": 277041150, + "step": 12841, + "time_per_iteration": 2.7192368507385254 + }, + { + "auxiliary_loss_clip": 0.01075766, + "auxiliary_loss_mlp": 0.01028708, + "balance_loss_clip": 1.03847384, + "balance_loss_mlp": 1.01574421, + "epoch": 0.7721028107620622, + "flos": 23880704181120.0, + "grad_norm": 1.5305578365970447, + "language_loss": 0.76161742, + "learning_rate": 5.204089029262208e-07, + "loss": 0.78266215, + "num_input_tokens_seen": 277063895, + "step": 12842, + "time_per_iteration": 2.7325236797332764 + }, + { + "auxiliary_loss_clip": 0.01059079, + "auxiliary_loss_mlp": 0.00771703, + "balance_loss_clip": 1.03726017, + "balance_loss_mlp": 1.0002687, + "epoch": 0.7721629340147302, + "flos": 26651571235200.0, + "grad_norm": 4.828495725379175, + "language_loss": 0.68726575, + "learning_rate": 5.201468888013445e-07, + "loss": 0.70557356, + "num_input_tokens_seen": 277084045, + "step": 12843, + "time_per_iteration": 2.81326961517334 + }, + { + "auxiliary_loss_clip": 0.01088182, + "auxiliary_loss_mlp": 0.01032978, + "balance_loss_clip": 1.03403521, + "balance_loss_mlp": 1.02059186, + "epoch": 0.7722230572673981, + "flos": 21178857110400.0, + "grad_norm": 3.7944489397426286, + "language_loss": 0.73675692, + "learning_rate": 5.198849307926465e-07, + "loss": 0.75796854, + "num_input_tokens_seen": 277102625, + "step": 12844, + "time_per_iteration": 2.660747766494751 + }, + { + "auxiliary_loss_clip": 0.0109532, + "auxiliary_loss_mlp": 0.01041057, + "balance_loss_clip": 1.03639054, + "balance_loss_mlp": 1.02721667, + "epoch": 0.7722831805200662, + "flos": 27964644814080.0, + "grad_norm": 1.567829696052933, + "language_loss": 0.71341336, + "learning_rate": 5.196230289100596e-07, + "loss": 0.73477709, + "num_input_tokens_seen": 277123210, + "step": 12845, + "time_per_iteration": 2.720493793487549 + }, + { + "auxiliary_loss_clip": 0.01109647, + "auxiliary_loss_mlp": 0.01032633, + "balance_loss_clip": 1.03851032, + "balance_loss_mlp": 1.02038407, + "epoch": 0.7723433037727341, + "flos": 33875576864640.0, + "grad_norm": 1.7586648256902582, + "language_loss": 0.64064783, + "learning_rate": 5.193611831635159e-07, + "loss": 0.66207063, + "num_input_tokens_seen": 277144895, + "step": 12846, + "time_per_iteration": 2.7434511184692383 + }, + { + "auxiliary_loss_clip": 0.0102204, + "auxiliary_loss_mlp": 0.00751187, + "balance_loss_clip": 1.0084672, + "balance_loss_mlp": 0.99961835, + "epoch": 0.7724034270254021, + "flos": 62848271940480.0, + "grad_norm": 0.7939383469798397, + "language_loss": 0.61696756, + "learning_rate": 5.19099393562945e-07, + "loss": 0.63469982, + "num_input_tokens_seen": 277205160, + "step": 12847, + "time_per_iteration": 3.1408584117889404 + }, + { + "auxiliary_loss_clip": 0.01109979, + "auxiliary_loss_mlp": 0.01027701, + "balance_loss_clip": 1.0360781, + "balance_loss_mlp": 1.01481414, + "epoch": 0.77246355027807, + "flos": 23295467888640.0, + "grad_norm": 2.7620733076627255, + "language_loss": 0.7912066, + "learning_rate": 5.188376601182732e-07, + "loss": 0.81258333, + "num_input_tokens_seen": 277223005, + "step": 12848, + "time_per_iteration": 5.833041191101074 + }, + { + "auxiliary_loss_clip": 0.01073036, + "auxiliary_loss_mlp": 0.01041471, + "balance_loss_clip": 1.03511548, + "balance_loss_mlp": 1.02746367, + "epoch": 0.772523673530738, + "flos": 20121287950080.0, + "grad_norm": 1.5824412187396433, + "language_loss": 0.72673213, + "learning_rate": 5.185759828394261e-07, + "loss": 0.74787724, + "num_input_tokens_seen": 277241785, + "step": 12849, + "time_per_iteration": 2.7188072204589844 + }, + { + "auxiliary_loss_clip": 0.01110027, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.03745866, + "balance_loss_mlp": 1.01899564, + "epoch": 0.7725837967834059, + "flos": 17820096157440.0, + "grad_norm": 2.4780134177178166, + "language_loss": 0.78607786, + "learning_rate": 5.183143617363261e-07, + "loss": 0.80749798, + "num_input_tokens_seen": 277259050, + "step": 12850, + "time_per_iteration": 4.190839529037476 + }, + { + "auxiliary_loss_clip": 0.01054122, + "auxiliary_loss_mlp": 0.00771579, + "balance_loss_clip": 1.03170514, + "balance_loss_mlp": 1.00020933, + "epoch": 0.772643920036074, + "flos": 27198921657600.0, + "grad_norm": 1.5628406207285341, + "language_loss": 0.80081898, + "learning_rate": 5.180527968188935e-07, + "loss": 0.819076, + "num_input_tokens_seen": 277278235, + "step": 12851, + "time_per_iteration": 2.8007707595825195 + }, + { + "auxiliary_loss_clip": 0.01097831, + "auxiliary_loss_mlp": 0.01027911, + "balance_loss_clip": 1.03627992, + "balance_loss_mlp": 1.01439285, + "epoch": 0.7727040432887419, + "flos": 21579512388480.0, + "grad_norm": 1.50165866044674, + "language_loss": 0.73771137, + "learning_rate": 5.177912880970474e-07, + "loss": 0.75896883, + "num_input_tokens_seen": 277298355, + "step": 12852, + "time_per_iteration": 2.640066146850586 + }, + { + "auxiliary_loss_clip": 0.01108862, + "auxiliary_loss_mlp": 0.01036354, + "balance_loss_clip": 1.0370307, + "balance_loss_mlp": 1.02388501, + "epoch": 0.7727641665414099, + "flos": 22236641752320.0, + "grad_norm": 1.9047864889104873, + "language_loss": 0.82604998, + "learning_rate": 5.17529835580704e-07, + "loss": 0.84750211, + "num_input_tokens_seen": 277316095, + "step": 12853, + "time_per_iteration": 2.6782071590423584 + }, + { + "auxiliary_loss_clip": 0.01028971, + "auxiliary_loss_mlp": 0.01000563, + "balance_loss_clip": 1.00643969, + "balance_loss_mlp": 0.99953192, + "epoch": 0.7728242897940779, + "flos": 54832221463680.0, + "grad_norm": 0.7951405489665233, + "language_loss": 0.54508865, + "learning_rate": 5.172684392797786e-07, + "loss": 0.56538397, + "num_input_tokens_seen": 277380130, + "step": 12854, + "time_per_iteration": 3.2313177585601807 + }, + { + "auxiliary_loss_clip": 0.01102068, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.03806114, + "balance_loss_mlp": 1.01808441, + "epoch": 0.7728844130467458, + "flos": 34461962392320.0, + "grad_norm": 1.5042786507697257, + "language_loss": 0.71595842, + "learning_rate": 5.170070992041826e-07, + "loss": 0.73730195, + "num_input_tokens_seen": 277404015, + "step": 12855, + "time_per_iteration": 4.29422926902771 + }, + { + "auxiliary_loss_clip": 0.01111402, + "auxiliary_loss_mlp": 0.01031298, + "balance_loss_clip": 1.03859937, + "balance_loss_mlp": 1.01755357, + "epoch": 0.7729445362994138, + "flos": 18916341287040.0, + "grad_norm": 1.8322894078322527, + "language_loss": 0.68102384, + "learning_rate": 5.167458153638254e-07, + "loss": 0.70245087, + "num_input_tokens_seen": 277421375, + "step": 12856, + "time_per_iteration": 2.6372880935668945 + }, + { + "auxiliary_loss_clip": 0.010814, + "auxiliary_loss_mlp": 0.01035586, + "balance_loss_clip": 1.03660607, + "balance_loss_mlp": 1.02275896, + "epoch": 0.7730046595520818, + "flos": 22200048771840.0, + "grad_norm": 1.6258522353598035, + "language_loss": 0.79057026, + "learning_rate": 5.164845877686162e-07, + "loss": 0.81174016, + "num_input_tokens_seen": 277440170, + "step": 12857, + "time_per_iteration": 2.796715021133423 + }, + { + "auxiliary_loss_clip": 0.01063249, + "auxiliary_loss_mlp": 0.00770001, + "balance_loss_clip": 1.04108429, + "balance_loss_mlp": 1.00020409, + "epoch": 0.7730647828047498, + "flos": 13552328695680.0, + "grad_norm": 1.8401408925492355, + "language_loss": 0.78711581, + "learning_rate": 5.162234164284591e-07, + "loss": 0.80544829, + "num_input_tokens_seen": 277456880, + "step": 12858, + "time_per_iteration": 2.8125572204589844 + }, + { + "auxiliary_loss_clip": 0.01112062, + "auxiliary_loss_mlp": 0.01031889, + "balance_loss_clip": 1.03837538, + "balance_loss_mlp": 1.0190742, + "epoch": 0.7731249060574177, + "flos": 21976037602560.0, + "grad_norm": 1.938091007163787, + "language_loss": 0.77033961, + "learning_rate": 5.159623013532591e-07, + "loss": 0.7917791, + "num_input_tokens_seen": 277475365, + "step": 12859, + "time_per_iteration": 2.659550428390503 + }, + { + "auxiliary_loss_clip": 0.0109902, + "auxiliary_loss_mlp": 0.01030466, + "balance_loss_clip": 1.04030442, + "balance_loss_mlp": 1.01920676, + "epoch": 0.7731850293100857, + "flos": 22601817371520.0, + "grad_norm": 1.3916188047238045, + "language_loss": 0.67878425, + "learning_rate": 5.157012425529186e-07, + "loss": 0.7000792, + "num_input_tokens_seen": 277494975, + "step": 12860, + "time_per_iteration": 2.8458962440490723 + }, + { + "auxiliary_loss_clip": 0.01114237, + "auxiliary_loss_mlp": 0.01038751, + "balance_loss_clip": 1.03815317, + "balance_loss_mlp": 1.02510166, + "epoch": 0.7732451525627536, + "flos": 14098422142080.0, + "grad_norm": 2.3344978091609656, + "language_loss": 0.74838078, + "learning_rate": 5.154402400373343e-07, + "loss": 0.76991069, + "num_input_tokens_seen": 277510520, + "step": 12861, + "time_per_iteration": 2.5893940925598145 + }, + { + "auxiliary_loss_clip": 0.01105983, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.04054725, + "balance_loss_mlp": 1.01798797, + "epoch": 0.7733052758154216, + "flos": 21470020755840.0, + "grad_norm": 2.1487952861807558, + "language_loss": 0.74759662, + "learning_rate": 5.15179293816405e-07, + "loss": 0.7689755, + "num_input_tokens_seen": 277530505, + "step": 12862, + "time_per_iteration": 2.7624194622039795 + }, + { + "auxiliary_loss_clip": 0.01064299, + "auxiliary_loss_mlp": 0.01032266, + "balance_loss_clip": 1.03402948, + "balance_loss_mlp": 1.02048767, + "epoch": 0.7733653990680895, + "flos": 21394284929280.0, + "grad_norm": 1.5250392948978249, + "language_loss": 0.83059877, + "learning_rate": 5.149184039000256e-07, + "loss": 0.85156441, + "num_input_tokens_seen": 277550810, + "step": 12863, + "time_per_iteration": 2.771484851837158 + }, + { + "auxiliary_loss_clip": 0.01110135, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.03735471, + "balance_loss_mlp": 1.02050209, + "epoch": 0.7734255223207576, + "flos": 17676058619520.0, + "grad_norm": 1.7056890510124847, + "language_loss": 0.73495519, + "learning_rate": 5.146575702980898e-07, + "loss": 0.75638908, + "num_input_tokens_seen": 277567680, + "step": 12864, + "time_per_iteration": 2.6594743728637695 + }, + { + "auxiliary_loss_clip": 0.01089331, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.03545022, + "balance_loss_mlp": 1.0199455, + "epoch": 0.7734856455734255, + "flos": 25230837617280.0, + "grad_norm": 1.592544393546876, + "language_loss": 0.8264727, + "learning_rate": 5.143967930204871e-07, + "loss": 0.84768456, + "num_input_tokens_seen": 277588970, + "step": 12865, + "time_per_iteration": 2.7463982105255127 + }, + { + "auxiliary_loss_clip": 0.01116112, + "auxiliary_loss_mlp": 0.01033209, + "balance_loss_clip": 1.04054976, + "balance_loss_mlp": 1.01934528, + "epoch": 0.7735457688260935, + "flos": 23433112805760.0, + "grad_norm": 2.1106851031269365, + "language_loss": 0.72128093, + "learning_rate": 5.141360720771077e-07, + "loss": 0.74277413, + "num_input_tokens_seen": 277605450, + "step": 12866, + "time_per_iteration": 2.574566125869751 + }, + { + "auxiliary_loss_clip": 0.01069034, + "auxiliary_loss_mlp": 0.00770892, + "balance_loss_clip": 1.03813267, + "balance_loss_mlp": 1.00030208, + "epoch": 0.7736058920787615, + "flos": 18729246320640.0, + "grad_norm": 3.2060196397051444, + "language_loss": 0.64442635, + "learning_rate": 5.138754074778371e-07, + "loss": 0.66282552, + "num_input_tokens_seen": 277622530, + "step": 12867, + "time_per_iteration": 2.701490879058838 + }, + { + "auxiliary_loss_clip": 0.01098529, + "auxiliary_loss_mlp": 0.01037441, + "balance_loss_clip": 1.03714955, + "balance_loss_mlp": 1.02506101, + "epoch": 0.7736660153314294, + "flos": 22893304239360.0, + "grad_norm": 1.5193783690331675, + "language_loss": 0.71179724, + "learning_rate": 5.136147992325595e-07, + "loss": 0.73315698, + "num_input_tokens_seen": 277642700, + "step": 12868, + "time_per_iteration": 2.6771240234375 + }, + { + "auxiliary_loss_clip": 0.01105128, + "auxiliary_loss_mlp": 0.01031721, + "balance_loss_clip": 1.04049754, + "balance_loss_mlp": 1.01892424, + "epoch": 0.7737261385840974, + "flos": 13800901789440.0, + "grad_norm": 2.0821303548121284, + "language_loss": 0.77995592, + "learning_rate": 5.133542473511578e-07, + "loss": 0.80132443, + "num_input_tokens_seen": 277660005, + "step": 12869, + "time_per_iteration": 2.6456408500671387 + }, + { + "auxiliary_loss_clip": 0.01097602, + "auxiliary_loss_mlp": 0.01027939, + "balance_loss_clip": 1.03767705, + "balance_loss_mlp": 1.01517785, + "epoch": 0.7737862618367654, + "flos": 28730727106560.0, + "grad_norm": 1.7351593875890767, + "language_loss": 0.73740292, + "learning_rate": 5.130937518435124e-07, + "loss": 0.75865841, + "num_input_tokens_seen": 277682890, + "step": 12870, + "time_per_iteration": 2.670896530151367 + }, + { + "auxiliary_loss_clip": 0.01102985, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.03815126, + "balance_loss_mlp": 1.01947141, + "epoch": 0.7738463850894334, + "flos": 17018570119680.0, + "grad_norm": 1.9332947968793013, + "language_loss": 0.76220596, + "learning_rate": 5.12833312719501e-07, + "loss": 0.78355992, + "num_input_tokens_seen": 277699330, + "step": 12871, + "time_per_iteration": 2.5998897552490234 + }, + { + "auxiliary_loss_clip": 0.0108707, + "auxiliary_loss_mlp": 0.01035167, + "balance_loss_clip": 1.03574061, + "balance_loss_mlp": 1.02281117, + "epoch": 0.7739065083421013, + "flos": 20704010290560.0, + "grad_norm": 2.0007285261409407, + "language_loss": 0.69219184, + "learning_rate": 5.12572929988999e-07, + "loss": 0.71341425, + "num_input_tokens_seen": 277718750, + "step": 12872, + "time_per_iteration": 2.673105478286743 + }, + { + "auxiliary_loss_clip": 0.01111983, + "auxiliary_loss_mlp": 0.01031736, + "balance_loss_clip": 1.03831863, + "balance_loss_mlp": 1.01781273, + "epoch": 0.7739666315947693, + "flos": 20697222620160.0, + "grad_norm": 2.4536948806528502, + "language_loss": 0.85142237, + "learning_rate": 5.123126036618804e-07, + "loss": 0.8728596, + "num_input_tokens_seen": 277734645, + "step": 12873, + "time_per_iteration": 2.590299606323242 + }, + { + "auxiliary_loss_clip": 0.01115241, + "auxiliary_loss_mlp": 0.01037294, + "balance_loss_clip": 1.04048181, + "balance_loss_mlp": 1.02497935, + "epoch": 0.7740267548474372, + "flos": 29570677718400.0, + "grad_norm": 2.480997222503817, + "language_loss": 0.65266359, + "learning_rate": 5.120523337480174e-07, + "loss": 0.67418897, + "num_input_tokens_seen": 277755535, + "step": 12874, + "time_per_iteration": 2.6324357986450195 + }, + { + "auxiliary_loss_clip": 0.01072577, + "auxiliary_loss_mlp": 0.01031243, + "balance_loss_clip": 1.0420754, + "balance_loss_mlp": 1.01826084, + "epoch": 0.7740868781001052, + "flos": 23659099223040.0, + "grad_norm": 1.5630841905142332, + "language_loss": 0.62254053, + "learning_rate": 5.117921202572785e-07, + "loss": 0.64357871, + "num_input_tokens_seen": 277775585, + "step": 12875, + "time_per_iteration": 2.7664403915405273 + }, + { + "auxiliary_loss_clip": 0.0110217, + "auxiliary_loss_mlp": 0.0103118, + "balance_loss_clip": 1.0375613, + "balance_loss_mlp": 1.01843607, + "epoch": 0.7741470013527731, + "flos": 24717314828160.0, + "grad_norm": 2.655709255641646, + "language_loss": 0.65554249, + "learning_rate": 5.115319631995318e-07, + "loss": 0.67687607, + "num_input_tokens_seen": 277794795, + "step": 12876, + "time_per_iteration": 2.696556806564331 + }, + { + "auxiliary_loss_clip": 0.01082571, + "auxiliary_loss_mlp": 0.01036668, + "balance_loss_clip": 1.03536308, + "balance_loss_mlp": 1.02387714, + "epoch": 0.7742071246054412, + "flos": 21871645701120.0, + "grad_norm": 1.869396409074905, + "language_loss": 0.71216834, + "learning_rate": 5.112718625846433e-07, + "loss": 0.73336065, + "num_input_tokens_seen": 277813235, + "step": 12877, + "time_per_iteration": 2.692688465118408 + }, + { + "auxiliary_loss_clip": 0.01073259, + "auxiliary_loss_mlp": 0.01040102, + "balance_loss_clip": 1.03579319, + "balance_loss_mlp": 1.02468836, + "epoch": 0.7742672478581091, + "flos": 22674249146880.0, + "grad_norm": 1.8081756921528234, + "language_loss": 0.82974255, + "learning_rate": 5.110118184224736e-07, + "loss": 0.85087615, + "num_input_tokens_seen": 277832560, + "step": 12878, + "time_per_iteration": 2.7693746089935303 + }, + { + "auxiliary_loss_clip": 0.01091515, + "auxiliary_loss_mlp": 0.01034091, + "balance_loss_clip": 1.03874159, + "balance_loss_mlp": 1.0199523, + "epoch": 0.7743273711107771, + "flos": 18840892769280.0, + "grad_norm": 1.713941118960012, + "language_loss": 0.73144233, + "learning_rate": 5.10751830722885e-07, + "loss": 0.75269836, + "num_input_tokens_seen": 277850120, + "step": 12879, + "time_per_iteration": 2.6757094860076904 + }, + { + "auxiliary_loss_clip": 0.0108601, + "auxiliary_loss_mlp": 0.01027949, + "balance_loss_clip": 1.03621507, + "balance_loss_mlp": 1.01507425, + "epoch": 0.7743874943634451, + "flos": 28729326476160.0, + "grad_norm": 1.9120090925944704, + "language_loss": 0.79831159, + "learning_rate": 5.104918994957364e-07, + "loss": 0.81945121, + "num_input_tokens_seen": 277871020, + "step": 12880, + "time_per_iteration": 2.8304030895233154 + }, + { + "auxiliary_loss_clip": 0.01087192, + "auxiliary_loss_mlp": 0.01037799, + "balance_loss_clip": 1.03749204, + "balance_loss_mlp": 1.02506709, + "epoch": 0.774447617616113, + "flos": 21909639312000.0, + "grad_norm": 1.5834670208699275, + "language_loss": 0.70202577, + "learning_rate": 5.102320247508847e-07, + "loss": 0.72327566, + "num_input_tokens_seen": 277891525, + "step": 12881, + "time_per_iteration": 2.7064766883850098 + }, + { + "auxiliary_loss_clip": 0.01091686, + "auxiliary_loss_mlp": 0.01043391, + "balance_loss_clip": 1.03600717, + "balance_loss_mlp": 1.02921081, + "epoch": 0.774507740868781, + "flos": 19500643825920.0, + "grad_norm": 1.9376715027667266, + "language_loss": 0.84492528, + "learning_rate": 5.099722064981832e-07, + "loss": 0.86627603, + "num_input_tokens_seen": 277910425, + "step": 12882, + "time_per_iteration": 2.704357862472534 + }, + { + "auxiliary_loss_clip": 0.01002891, + "auxiliary_loss_mlp": 0.01007527, + "balance_loss_clip": 1.01538849, + "balance_loss_mlp": 1.00624514, + "epoch": 0.774567864121449, + "flos": 59426560402560.0, + "grad_norm": 0.7677682887225041, + "language_loss": 0.60380936, + "learning_rate": 5.097124447474858e-07, + "loss": 0.62391353, + "num_input_tokens_seen": 277972795, + "step": 12883, + "time_per_iteration": 3.2393903732299805 + }, + { + "auxiliary_loss_clip": 0.01064866, + "auxiliary_loss_mlp": 0.0103875, + "balance_loss_clip": 1.03618407, + "balance_loss_mlp": 1.023646, + "epoch": 0.774627987374117, + "flos": 13225326255360.0, + "grad_norm": 6.057542406739813, + "language_loss": 0.72638834, + "learning_rate": 5.094527395086416e-07, + "loss": 0.7474246, + "num_input_tokens_seen": 277990675, + "step": 12884, + "time_per_iteration": 2.798553705215454 + }, + { + "auxiliary_loss_clip": 0.01100426, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.03860021, + "balance_loss_mlp": 1.0236789, + "epoch": 0.7746881106267849, + "flos": 21394033534080.0, + "grad_norm": 1.4931379605931039, + "language_loss": 0.8105005, + "learning_rate": 5.091930907914986e-07, + "loss": 0.83185869, + "num_input_tokens_seen": 278010050, + "step": 12885, + "time_per_iteration": 2.638674736022949 + }, + { + "auxiliary_loss_clip": 0.01108511, + "auxiliary_loss_mlp": 0.01036987, + "balance_loss_clip": 1.03706241, + "balance_loss_mlp": 1.0250479, + "epoch": 0.7747482338794529, + "flos": 25629338079360.0, + "grad_norm": 1.712628084719396, + "language_loss": 0.63937521, + "learning_rate": 5.089334986059029e-07, + "loss": 0.6608302, + "num_input_tokens_seen": 278030660, + "step": 12886, + "time_per_iteration": 2.65639328956604 + }, + { + "auxiliary_loss_clip": 0.01072173, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.03371465, + "balance_loss_mlp": 1.01826668, + "epoch": 0.7748083571321208, + "flos": 11546933402880.0, + "grad_norm": 1.8883319339128437, + "language_loss": 0.69462442, + "learning_rate": 5.086739629616987e-07, + "loss": 0.71564978, + "num_input_tokens_seen": 278047645, + "step": 12887, + "time_per_iteration": 4.30758261680603 + }, + { + "auxiliary_loss_clip": 0.01100015, + "auxiliary_loss_mlp": 0.0103293, + "balance_loss_clip": 1.03749061, + "balance_loss_mlp": 1.02090144, + "epoch": 0.7748684803847888, + "flos": 19062425900160.0, + "grad_norm": 1.702042840830708, + "language_loss": 0.70615542, + "learning_rate": 5.084144838687275e-07, + "loss": 0.72748482, + "num_input_tokens_seen": 278066170, + "step": 12888, + "time_per_iteration": 2.681607246398926 + }, + { + "auxiliary_loss_clip": 0.01101783, + "auxiliary_loss_mlp": 0.01034032, + "balance_loss_clip": 1.03678536, + "balance_loss_mlp": 1.02094269, + "epoch": 0.7749286036374567, + "flos": 22273162905600.0, + "grad_norm": 1.6866747197007421, + "language_loss": 0.8189441, + "learning_rate": 5.081550613368279e-07, + "loss": 0.84030223, + "num_input_tokens_seen": 278085545, + "step": 12889, + "time_per_iteration": 4.1007890701293945 + }, + { + "auxiliary_loss_clip": 0.0107657, + "auxiliary_loss_mlp": 0.01028595, + "balance_loss_clip": 1.03708053, + "balance_loss_mlp": 1.01628113, + "epoch": 0.7749887268901248, + "flos": 20192462749440.0, + "grad_norm": 2.1944312112845057, + "language_loss": 0.79288089, + "learning_rate": 5.07895695375838e-07, + "loss": 0.81393254, + "num_input_tokens_seen": 278102995, + "step": 12890, + "time_per_iteration": 2.8066084384918213 + }, + { + "auxiliary_loss_clip": 0.01084496, + "auxiliary_loss_mlp": 0.01034255, + "balance_loss_clip": 1.03861511, + "balance_loss_mlp": 1.02098715, + "epoch": 0.7750488501427927, + "flos": 20337541781760.0, + "grad_norm": 1.9334241832657861, + "language_loss": 0.66675818, + "learning_rate": 5.076363859955932e-07, + "loss": 0.68794572, + "num_input_tokens_seen": 278121460, + "step": 12891, + "time_per_iteration": 2.7070491313934326 + }, + { + "auxiliary_loss_clip": 0.01100079, + "auxiliary_loss_mlp": 0.01033227, + "balance_loss_clip": 1.03662086, + "balance_loss_mlp": 1.02079916, + "epoch": 0.7751089733954607, + "flos": 28364043116160.0, + "grad_norm": 1.6084014662033723, + "language_loss": 0.78700238, + "learning_rate": 5.073771332059257e-07, + "loss": 0.80833542, + "num_input_tokens_seen": 278143905, + "step": 12892, + "time_per_iteration": 2.6891307830810547 + }, + { + "auxiliary_loss_clip": 0.01105106, + "auxiliary_loss_mlp": 0.01029124, + "balance_loss_clip": 1.04138756, + "balance_loss_mlp": 1.01607716, + "epoch": 0.7751690966481286, + "flos": 16943803960320.0, + "grad_norm": 2.167077484645157, + "language_loss": 0.67164677, + "learning_rate": 5.071179370166669e-07, + "loss": 0.69298911, + "num_input_tokens_seen": 278160850, + "step": 12893, + "time_per_iteration": 2.6599507331848145 + }, + { + "auxiliary_loss_clip": 0.01022351, + "auxiliary_loss_mlp": 0.01001788, + "balance_loss_clip": 1.00947237, + "balance_loss_mlp": 1.00071514, + "epoch": 0.7752292199007966, + "flos": 65668050339840.0, + "grad_norm": 0.8059900442079823, + "language_loss": 0.58441579, + "learning_rate": 5.068587974376468e-07, + "loss": 0.60465717, + "num_input_tokens_seen": 278219950, + "step": 12894, + "time_per_iteration": 4.591580629348755 + }, + { + "auxiliary_loss_clip": 0.01093145, + "auxiliary_loss_mlp": 0.01033525, + "balance_loss_clip": 1.03960991, + "balance_loss_mlp": 1.02001882, + "epoch": 0.7752893431534646, + "flos": 20594662312320.0, + "grad_norm": 2.026677697607631, + "language_loss": 0.77940953, + "learning_rate": 5.065997144786895e-07, + "loss": 0.80067623, + "num_input_tokens_seen": 278237805, + "step": 12895, + "time_per_iteration": 2.550419807434082 + }, + { + "auxiliary_loss_clip": 0.01070115, + "auxiliary_loss_mlp": 0.01035434, + "balance_loss_clip": 1.03553057, + "balance_loss_mlp": 1.02099133, + "epoch": 0.7753494664061326, + "flos": 20485350247680.0, + "grad_norm": 1.9545538067157624, + "language_loss": 0.67606688, + "learning_rate": 5.063406881496209e-07, + "loss": 0.69712234, + "num_input_tokens_seen": 278257660, + "step": 12896, + "time_per_iteration": 2.573294162750244 + }, + { + "auxiliary_loss_clip": 0.01086749, + "auxiliary_loss_mlp": 0.01040132, + "balance_loss_clip": 1.03621519, + "balance_loss_mlp": 1.02843189, + "epoch": 0.7754095896588006, + "flos": 20265900105600.0, + "grad_norm": 1.6654676924809417, + "language_loss": 0.6842171, + "learning_rate": 5.060817184602629e-07, + "loss": 0.70548594, + "num_input_tokens_seen": 278275110, + "step": 12897, + "time_per_iteration": 2.646030902862549 + }, + { + "auxiliary_loss_clip": 0.0111523, + "auxiliary_loss_mlp": 0.01041854, + "balance_loss_clip": 1.04096043, + "balance_loss_mlp": 1.02774525, + "epoch": 0.7754697129114685, + "flos": 23331091201920.0, + "grad_norm": 1.6795213586563635, + "language_loss": 0.75452977, + "learning_rate": 5.058228054204364e-07, + "loss": 0.77610064, + "num_input_tokens_seen": 278293035, + "step": 12898, + "time_per_iteration": 2.589974880218506 + }, + { + "auxiliary_loss_clip": 0.01101527, + "auxiliary_loss_mlp": 0.00771705, + "balance_loss_clip": 1.0381062, + "balance_loss_mlp": 1.00029922, + "epoch": 0.7755298361641365, + "flos": 17347619635200.0, + "grad_norm": 1.834394412240628, + "language_loss": 0.70020843, + "learning_rate": 5.055639490399588e-07, + "loss": 0.71894073, + "num_input_tokens_seen": 278311010, + "step": 12899, + "time_per_iteration": 2.569342851638794 + }, + { + "auxiliary_loss_clip": 0.01076575, + "auxiliary_loss_mlp": 0.01037603, + "balance_loss_clip": 1.03510606, + "balance_loss_mlp": 1.02406061, + "epoch": 0.7755899594168044, + "flos": 19645866512640.0, + "grad_norm": 2.136951327661946, + "language_loss": 0.7508406, + "learning_rate": 5.053051493286453e-07, + "loss": 0.77198243, + "num_input_tokens_seen": 278329900, + "step": 12900, + "time_per_iteration": 2.6928303241729736 + }, + { + "auxiliary_loss_clip": 0.01093277, + "auxiliary_loss_mlp": 0.01036499, + "balance_loss_clip": 1.03764784, + "balance_loss_mlp": 1.02486384, + "epoch": 0.7756500826694724, + "flos": 27414457217280.0, + "grad_norm": 2.3252412495258867, + "language_loss": 0.77514052, + "learning_rate": 5.050464062963113e-07, + "loss": 0.79643828, + "num_input_tokens_seen": 278349980, + "step": 12901, + "time_per_iteration": 2.7284209728240967 + }, + { + "auxiliary_loss_clip": 0.01102085, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.04122436, + "balance_loss_mlp": 1.01966059, + "epoch": 0.7757102059221404, + "flos": 28730511624960.0, + "grad_norm": 1.6090174147117244, + "language_loss": 0.7720294, + "learning_rate": 5.047877199527666e-07, + "loss": 0.79337895, + "num_input_tokens_seen": 278372485, + "step": 12902, + "time_per_iteration": 2.7194478511810303 + }, + { + "auxiliary_loss_clip": 0.01100702, + "auxiliary_loss_mlp": 0.01031411, + "balance_loss_clip": 1.03726745, + "balance_loss_mlp": 1.01915073, + "epoch": 0.7757703291748084, + "flos": 22486795044480.0, + "grad_norm": 1.743455027715563, + "language_loss": 0.73384994, + "learning_rate": 5.045290903078215e-07, + "loss": 0.75517106, + "num_input_tokens_seen": 278391660, + "step": 12903, + "time_per_iteration": 2.705784797668457 + }, + { + "auxiliary_loss_clip": 0.01089793, + "auxiliary_loss_mlp": 0.01030768, + "balance_loss_clip": 1.03994238, + "balance_loss_mlp": 1.01834655, + "epoch": 0.7758304524274763, + "flos": 21430159637760.0, + "grad_norm": 18.791554059780267, + "language_loss": 0.76102394, + "learning_rate": 5.042705173712835e-07, + "loss": 0.78222954, + "num_input_tokens_seen": 278409125, + "step": 12904, + "time_per_iteration": 2.6935760974884033 + }, + { + "auxiliary_loss_clip": 0.01109136, + "auxiliary_loss_mlp": 0.01027029, + "balance_loss_clip": 1.03901672, + "balance_loss_mlp": 1.01484025, + "epoch": 0.7758905756801443, + "flos": 23659242877440.0, + "grad_norm": 2.2307497011290462, + "language_loss": 0.68197864, + "learning_rate": 5.040120011529576e-07, + "loss": 0.70334029, + "num_input_tokens_seen": 278429450, + "step": 12905, + "time_per_iteration": 2.6777610778808594 + }, + { + "auxiliary_loss_clip": 0.01097117, + "auxiliary_loss_mlp": 0.00770393, + "balance_loss_clip": 1.03989148, + "balance_loss_mlp": 1.00023961, + "epoch": 0.7759506989328122, + "flos": 28365479660160.0, + "grad_norm": 1.6211065141580052, + "language_loss": 0.67231417, + "learning_rate": 5.037535416626459e-07, + "loss": 0.69098926, + "num_input_tokens_seen": 278449925, + "step": 12906, + "time_per_iteration": 2.7337546348571777 + }, + { + "auxiliary_loss_clip": 0.01072574, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.03331351, + "balance_loss_mlp": 1.02119029, + "epoch": 0.7760108221854802, + "flos": 14902785354240.0, + "grad_norm": 1.9856089108583717, + "language_loss": 0.81587309, + "learning_rate": 5.034951389101498e-07, + "loss": 0.83694196, + "num_input_tokens_seen": 278467255, + "step": 12907, + "time_per_iteration": 2.687721014022827 + }, + { + "auxiliary_loss_clip": 0.01096211, + "auxiliary_loss_mlp": 0.01035671, + "balance_loss_clip": 1.03709292, + "balance_loss_mlp": 1.02327871, + "epoch": 0.7760709454381483, + "flos": 14792503622400.0, + "grad_norm": 2.1316068769770213, + "language_loss": 0.6746856, + "learning_rate": 5.032367929052685e-07, + "loss": 0.69600445, + "num_input_tokens_seen": 278484250, + "step": 12908, + "time_per_iteration": 2.6765284538269043 + }, + { + "auxiliary_loss_clip": 0.01079432, + "auxiliary_loss_mlp": 0.01041924, + "balance_loss_clip": 1.03593588, + "balance_loss_mlp": 1.02890027, + "epoch": 0.7761310686908162, + "flos": 17379831156480.0, + "grad_norm": 1.487534860967946, + "language_loss": 0.70260543, + "learning_rate": 5.029785036577976e-07, + "loss": 0.72381896, + "num_input_tokens_seen": 278502740, + "step": 12909, + "time_per_iteration": 2.711395502090454 + }, + { + "auxiliary_loss_clip": 0.01100377, + "auxiliary_loss_mlp": 0.01035688, + "balance_loss_clip": 1.03995848, + "balance_loss_mlp": 1.02347469, + "epoch": 0.7761911919434842, + "flos": 25556547168000.0, + "grad_norm": 1.6219590580384207, + "language_loss": 0.6782195, + "learning_rate": 5.027202711775324e-07, + "loss": 0.69958019, + "num_input_tokens_seen": 278523890, + "step": 12910, + "time_per_iteration": 2.703979969024658 + }, + { + "auxiliary_loss_clip": 0.01064156, + "auxiliary_loss_mlp": 0.01032945, + "balance_loss_clip": 1.03646898, + "balance_loss_mlp": 1.02076757, + "epoch": 0.7762513151961521, + "flos": 23179763203200.0, + "grad_norm": 1.5806807809655474, + "language_loss": 0.71997929, + "learning_rate": 5.024620954742646e-07, + "loss": 0.74095035, + "num_input_tokens_seen": 278543185, + "step": 12911, + "time_per_iteration": 2.8058223724365234 + }, + { + "auxiliary_loss_clip": 0.01114991, + "auxiliary_loss_mlp": 0.00771737, + "balance_loss_clip": 1.04081869, + "balance_loss_mlp": 1.00030136, + "epoch": 0.7763114384488201, + "flos": 21689614552320.0, + "grad_norm": 3.3864854327362592, + "language_loss": 0.63468528, + "learning_rate": 5.022039765577836e-07, + "loss": 0.65355253, + "num_input_tokens_seen": 278559220, + "step": 12912, + "time_per_iteration": 2.641256809234619 + }, + { + "auxiliary_loss_clip": 0.01001929, + "auxiliary_loss_mlp": 0.01001295, + "balance_loss_clip": 1.00920105, + "balance_loss_mlp": 1.00030553, + "epoch": 0.776371561701488, + "flos": 69025554316800.0, + "grad_norm": 0.7664213178657265, + "language_loss": 0.53195411, + "learning_rate": 5.019459144378779e-07, + "loss": 0.55198634, + "num_input_tokens_seen": 278618185, + "step": 12913, + "time_per_iteration": 3.3077611923217773 + }, + { + "auxiliary_loss_clip": 0.01093414, + "auxiliary_loss_mlp": 0.01037157, + "balance_loss_clip": 1.04078877, + "balance_loss_mlp": 1.02395415, + "epoch": 0.776431684954156, + "flos": 22893914770560.0, + "grad_norm": 1.9204798335439963, + "language_loss": 0.62302238, + "learning_rate": 5.016879091243338e-07, + "loss": 0.644328, + "num_input_tokens_seen": 278636210, + "step": 12914, + "time_per_iteration": 2.7050273418426514 + }, + { + "auxiliary_loss_clip": 0.0108926, + "auxiliary_loss_mlp": 0.01032265, + "balance_loss_clip": 1.03807616, + "balance_loss_mlp": 1.01977742, + "epoch": 0.776491808206824, + "flos": 20261554560000.0, + "grad_norm": 1.7420543212332402, + "language_loss": 0.82108057, + "learning_rate": 5.014299606269339e-07, + "loss": 0.84229577, + "num_input_tokens_seen": 278653305, + "step": 12915, + "time_per_iteration": 2.7126035690307617 + }, + { + "auxiliary_loss_clip": 0.01099353, + "auxiliary_loss_mlp": 0.0103825, + "balance_loss_clip": 1.03824329, + "balance_loss_mlp": 1.02410579, + "epoch": 0.776551931459492, + "flos": 26759051706240.0, + "grad_norm": 1.7876763975048962, + "language_loss": 0.74624789, + "learning_rate": 5.011720689554603e-07, + "loss": 0.76762396, + "num_input_tokens_seen": 278671850, + "step": 12916, + "time_per_iteration": 2.6998839378356934 + }, + { + "auxiliary_loss_clip": 0.01056671, + "auxiliary_loss_mlp": 0.01036878, + "balance_loss_clip": 1.03597093, + "balance_loss_mlp": 1.02252531, + "epoch": 0.7766120547121599, + "flos": 52665080250240.0, + "grad_norm": 1.5017921162458647, + "language_loss": 0.65888739, + "learning_rate": 5.009142341196919e-07, + "loss": 0.67982292, + "num_input_tokens_seen": 278697860, + "step": 12917, + "time_per_iteration": 3.097477674484253 + }, + { + "auxiliary_loss_clip": 0.01099882, + "auxiliary_loss_mlp": 0.01033637, + "balance_loss_clip": 1.03583741, + "balance_loss_mlp": 1.02095342, + "epoch": 0.7766721779648279, + "flos": 25156215112320.0, + "grad_norm": 1.458879337938595, + "language_loss": 0.64478171, + "learning_rate": 5.006564561294065e-07, + "loss": 0.66611689, + "num_input_tokens_seen": 278720655, + "step": 12918, + "time_per_iteration": 2.7446439266204834 + }, + { + "auxiliary_loss_clip": 0.01111393, + "auxiliary_loss_mlp": 0.01037511, + "balance_loss_clip": 1.0397799, + "balance_loss_mlp": 1.02533412, + "epoch": 0.7767323012174958, + "flos": 23760761690880.0, + "grad_norm": 2.338899246619233, + "language_loss": 0.72807854, + "learning_rate": 5.003987349943777e-07, + "loss": 0.74956757, + "num_input_tokens_seen": 278737375, + "step": 12919, + "time_per_iteration": 2.631877899169922 + }, + { + "auxiliary_loss_clip": 0.01069782, + "auxiliary_loss_mlp": 0.01029892, + "balance_loss_clip": 1.03774428, + "balance_loss_mlp": 1.01674342, + "epoch": 0.7767924244701638, + "flos": 22086642556800.0, + "grad_norm": 2.3274821948551265, + "language_loss": 0.78924805, + "learning_rate": 5.001410707243792e-07, + "loss": 0.8102448, + "num_input_tokens_seen": 278756510, + "step": 12920, + "time_per_iteration": 2.8133649826049805 + }, + { + "auxiliary_loss_clip": 0.01102553, + "auxiliary_loss_mlp": 0.01033949, + "balance_loss_clip": 1.03963828, + "balance_loss_mlp": 1.0209614, + "epoch": 0.7768525477228319, + "flos": 21981640124160.0, + "grad_norm": 11.784624421403892, + "language_loss": 0.70922899, + "learning_rate": 4.998834633291829e-07, + "loss": 0.73059404, + "num_input_tokens_seen": 278775410, + "step": 12921, + "time_per_iteration": 2.6603341102600098 + }, + { + "auxiliary_loss_clip": 0.01105803, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.04023492, + "balance_loss_mlp": 1.02050102, + "epoch": 0.7769126709754998, + "flos": 21794581071360.0, + "grad_norm": 3.3431959549038885, + "language_loss": 0.76222974, + "learning_rate": 4.996259128185547e-07, + "loss": 0.7836318, + "num_input_tokens_seen": 278794260, + "step": 12922, + "time_per_iteration": 2.7015247344970703 + }, + { + "auxiliary_loss_clip": 0.01063506, + "auxiliary_loss_mlp": 0.0103979, + "balance_loss_clip": 1.03708482, + "balance_loss_mlp": 1.0270822, + "epoch": 0.7769727942281678, + "flos": 20047994248320.0, + "grad_norm": 1.6454971966787777, + "language_loss": 0.80262136, + "learning_rate": 4.993684192022625e-07, + "loss": 0.82365435, + "num_input_tokens_seen": 278813290, + "step": 12923, + "time_per_iteration": 2.7818875312805176 + }, + { + "auxiliary_loss_clip": 0.01076451, + "auxiliary_loss_mlp": 0.01040924, + "balance_loss_clip": 1.04072833, + "balance_loss_mlp": 1.02828157, + "epoch": 0.7770329174808357, + "flos": 21686777377920.0, + "grad_norm": 2.0408616917549067, + "language_loss": 0.92191219, + "learning_rate": 4.991109824900699e-07, + "loss": 0.94308597, + "num_input_tokens_seen": 278830610, + "step": 12924, + "time_per_iteration": 2.8274574279785156 + }, + { + "auxiliary_loss_clip": 0.01099709, + "auxiliary_loss_mlp": 0.01032924, + "balance_loss_clip": 1.03679144, + "balance_loss_mlp": 1.02001929, + "epoch": 0.7770930407335037, + "flos": 25849255098240.0, + "grad_norm": 1.8094451984441313, + "language_loss": 0.66132891, + "learning_rate": 4.988536026917401e-07, + "loss": 0.68265527, + "num_input_tokens_seen": 278849530, + "step": 12925, + "time_per_iteration": 2.69667649269104 + }, + { + "auxiliary_loss_clip": 0.01078276, + "auxiliary_loss_mlp": 0.01032472, + "balance_loss_clip": 1.03612852, + "balance_loss_mlp": 1.01974022, + "epoch": 0.7771531639861716, + "flos": 24347865490560.0, + "grad_norm": 2.0756313412895815, + "language_loss": 0.7192542, + "learning_rate": 4.985962798170314e-07, + "loss": 0.74036169, + "num_input_tokens_seen": 278869005, + "step": 12926, + "time_per_iteration": 4.349314451217651 + }, + { + "auxiliary_loss_clip": 0.01103533, + "auxiliary_loss_mlp": 0.01029731, + "balance_loss_clip": 1.0389967, + "balance_loss_mlp": 1.01636767, + "epoch": 0.7772132872388396, + "flos": 25629948610560.0, + "grad_norm": 1.6573712780681307, + "language_loss": 0.65608656, + "learning_rate": 4.983390138757027e-07, + "loss": 0.67741919, + "num_input_tokens_seen": 278888790, + "step": 12927, + "time_per_iteration": 4.16760778427124 + }, + { + "auxiliary_loss_clip": 0.01089675, + "auxiliary_loss_mlp": 0.01039623, + "balance_loss_clip": 1.03830886, + "balance_loss_mlp": 1.02623534, + "epoch": 0.7772734104915076, + "flos": 26067412350720.0, + "grad_norm": 1.7538632415038142, + "language_loss": 0.72743905, + "learning_rate": 4.980818048775093e-07, + "loss": 0.74873203, + "num_input_tokens_seen": 278908150, + "step": 12928, + "time_per_iteration": 2.755859851837158 + }, + { + "auxiliary_loss_clip": 0.01071134, + "auxiliary_loss_mlp": 0.01033028, + "balance_loss_clip": 1.03876746, + "balance_loss_mlp": 1.02003419, + "epoch": 0.7773335337441756, + "flos": 22925048883840.0, + "grad_norm": 1.8588967363228528, + "language_loss": 0.74152476, + "learning_rate": 4.978246528322036e-07, + "loss": 0.76256645, + "num_input_tokens_seen": 278927425, + "step": 12929, + "time_per_iteration": 4.2707133293151855 + }, + { + "auxiliary_loss_clip": 0.01074549, + "auxiliary_loss_mlp": 0.01031484, + "balance_loss_clip": 1.03665006, + "balance_loss_mlp": 1.01832283, + "epoch": 0.7773936569968435, + "flos": 20776765288320.0, + "grad_norm": 1.9039476143036729, + "language_loss": 0.7758745, + "learning_rate": 4.975675577495377e-07, + "loss": 0.79693484, + "num_input_tokens_seen": 278946475, + "step": 12930, + "time_per_iteration": 2.7537360191345215 + }, + { + "auxiliary_loss_clip": 0.01113583, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.04102445, + "balance_loss_mlp": 1.02152324, + "epoch": 0.7774537802495115, + "flos": 20372267255040.0, + "grad_norm": 1.8280345361242294, + "language_loss": 0.79341066, + "learning_rate": 4.973105196392613e-07, + "loss": 0.81488979, + "num_input_tokens_seen": 278964345, + "step": 12931, + "time_per_iteration": 2.608551502227783 + }, + { + "auxiliary_loss_clip": 0.01003397, + "auxiliary_loss_mlp": 0.01004694, + "balance_loss_clip": 1.02223182, + "balance_loss_mlp": 1.00322199, + "epoch": 0.7775139035021794, + "flos": 53912081738880.0, + "grad_norm": 0.8525982586440103, + "language_loss": 0.59734511, + "learning_rate": 4.970535385111199e-07, + "loss": 0.61742604, + "num_input_tokens_seen": 279022380, + "step": 12932, + "time_per_iteration": 3.19950270652771 + }, + { + "auxiliary_loss_clip": 0.01102586, + "auxiliary_loss_mlp": 0.01034922, + "balance_loss_clip": 1.03881812, + "balance_loss_mlp": 1.02250659, + "epoch": 0.7775740267548474, + "flos": 28842481296000.0, + "grad_norm": 1.5001192410807755, + "language_loss": 0.76264286, + "learning_rate": 4.967966143748595e-07, + "loss": 0.78401792, + "num_input_tokens_seen": 279044275, + "step": 12933, + "time_per_iteration": 2.838245391845703 + }, + { + "auxiliary_loss_clip": 0.01086722, + "auxiliary_loss_mlp": 0.01039418, + "balance_loss_clip": 1.03855717, + "balance_loss_mlp": 1.02625704, + "epoch": 0.7776341500075155, + "flos": 21872471713920.0, + "grad_norm": 1.9749580896078973, + "language_loss": 0.73223925, + "learning_rate": 4.965397472402215e-07, + "loss": 0.75350064, + "num_input_tokens_seen": 279063375, + "step": 12934, + "time_per_iteration": 4.214959621429443 + }, + { + "auxiliary_loss_clip": 0.01069437, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.03676343, + "balance_loss_mlp": 1.01571107, + "epoch": 0.7776942732601834, + "flos": 20229845829120.0, + "grad_norm": 1.8916304823351247, + "language_loss": 0.70821279, + "learning_rate": 4.962829371169475e-07, + "loss": 0.72919655, + "num_input_tokens_seen": 279082680, + "step": 12935, + "time_per_iteration": 2.8492965698242188 + }, + { + "auxiliary_loss_clip": 0.0108792, + "auxiliary_loss_mlp": 0.00771991, + "balance_loss_clip": 1.03933454, + "balance_loss_mlp": 1.0001905, + "epoch": 0.7777543965128514, + "flos": 22231829329920.0, + "grad_norm": 1.8453474181089096, + "language_loss": 0.83784235, + "learning_rate": 4.960261840147746e-07, + "loss": 0.85644144, + "num_input_tokens_seen": 279099805, + "step": 12936, + "time_per_iteration": 2.6989262104034424 + }, + { + "auxiliary_loss_clip": 0.01105595, + "auxiliary_loss_mlp": 0.01032215, + "balance_loss_clip": 1.03868532, + "balance_loss_mlp": 1.01979923, + "epoch": 0.7778145197655193, + "flos": 14501950508160.0, + "grad_norm": 2.021883178321684, + "language_loss": 0.6742574, + "learning_rate": 4.957694879434397e-07, + "loss": 0.69563556, + "num_input_tokens_seen": 279117975, + "step": 12937, + "time_per_iteration": 2.6387362480163574 + }, + { + "auxiliary_loss_clip": 0.01113841, + "auxiliary_loss_mlp": 0.01033827, + "balance_loss_clip": 1.03934264, + "balance_loss_mlp": 1.021245, + "epoch": 0.7778746430181873, + "flos": 21140288881920.0, + "grad_norm": 1.5206574462967066, + "language_loss": 0.87595057, + "learning_rate": 4.955128489126777e-07, + "loss": 0.89742726, + "num_input_tokens_seen": 279137255, + "step": 12938, + "time_per_iteration": 2.699613332748413 + }, + { + "auxiliary_loss_clip": 0.01101775, + "auxiliary_loss_mlp": 0.01034, + "balance_loss_clip": 1.03820324, + "balance_loss_mlp": 1.02050602, + "epoch": 0.7779347662708552, + "flos": 20266366982400.0, + "grad_norm": 2.05617872988158, + "language_loss": 0.8537035, + "learning_rate": 4.95256266932218e-07, + "loss": 0.87506127, + "num_input_tokens_seen": 279154500, + "step": 12939, + "time_per_iteration": 2.648550510406494 + }, + { + "auxiliary_loss_clip": 0.01108461, + "auxiliary_loss_mlp": 0.00770264, + "balance_loss_clip": 1.03820562, + "balance_loss_mlp": 1.00022864, + "epoch": 0.7779948895235232, + "flos": 19209013303680.0, + "grad_norm": 1.778278891076628, + "language_loss": 0.69293523, + "learning_rate": 4.949997420117915e-07, + "loss": 0.71172249, + "num_input_tokens_seen": 279173635, + "step": 12940, + "time_per_iteration": 2.5725789070129395 + }, + { + "auxiliary_loss_clip": 0.01077299, + "auxiliary_loss_mlp": 0.01026745, + "balance_loss_clip": 1.03700173, + "balance_loss_mlp": 1.01481247, + "epoch": 0.7780550127761912, + "flos": 23914711382400.0, + "grad_norm": 2.1657166687563887, + "language_loss": 0.77734792, + "learning_rate": 4.947432741611255e-07, + "loss": 0.7983883, + "num_input_tokens_seen": 279194430, + "step": 12941, + "time_per_iteration": 2.74072265625 + }, + { + "auxiliary_loss_clip": 0.01105122, + "auxiliary_loss_mlp": 0.01039107, + "balance_loss_clip": 1.03774464, + "balance_loss_mlp": 1.02505839, + "epoch": 0.7781151360288592, + "flos": 32415951795840.0, + "grad_norm": 2.6599455867272157, + "language_loss": 0.73127586, + "learning_rate": 4.944868633899462e-07, + "loss": 0.75271809, + "num_input_tokens_seen": 279212920, + "step": 12942, + "time_per_iteration": 2.717205047607422 + }, + { + "auxiliary_loss_clip": 0.0105644, + "auxiliary_loss_mlp": 0.01043958, + "balance_loss_clip": 1.03546214, + "balance_loss_mlp": 1.03034472, + "epoch": 0.7781752592815271, + "flos": 22346384780160.0, + "grad_norm": 2.908887240584156, + "language_loss": 0.67917764, + "learning_rate": 4.942305097079751e-07, + "loss": 0.7001816, + "num_input_tokens_seen": 279232310, + "step": 12943, + "time_per_iteration": 2.7333195209503174 + }, + { + "auxiliary_loss_clip": 0.01002881, + "auxiliary_loss_mlp": 0.01004649, + "balance_loss_clip": 1.00792861, + "balance_loss_mlp": 1.00336123, + "epoch": 0.7782353825341951, + "flos": 70460183520000.0, + "grad_norm": 0.7871784265530566, + "language_loss": 0.5845629, + "learning_rate": 4.939742131249347e-07, + "loss": 0.60463822, + "num_input_tokens_seen": 279295375, + "step": 12944, + "time_per_iteration": 3.390233039855957 + }, + { + "auxiliary_loss_clip": 0.01113922, + "auxiliary_loss_mlp": 0.01036077, + "balance_loss_clip": 1.03909469, + "balance_loss_mlp": 1.02220058, + "epoch": 0.778295505786863, + "flos": 19062569554560.0, + "grad_norm": 1.8086848755411578, + "language_loss": 0.67537427, + "learning_rate": 4.937179736505428e-07, + "loss": 0.69687426, + "num_input_tokens_seen": 279313660, + "step": 12945, + "time_per_iteration": 2.6378118991851807 + }, + { + "auxiliary_loss_clip": 0.01098229, + "auxiliary_loss_mlp": 0.0103623, + "balance_loss_clip": 1.03687143, + "balance_loss_mlp": 1.02295065, + "epoch": 0.778355629039531, + "flos": 20999734963200.0, + "grad_norm": 2.112440511554347, + "language_loss": 0.69157761, + "learning_rate": 4.93461791294516e-07, + "loss": 0.71292222, + "num_input_tokens_seen": 279334495, + "step": 12946, + "time_per_iteration": 2.7236101627349854 + }, + { + "auxiliary_loss_clip": 0.0111324, + "auxiliary_loss_mlp": 0.0102872, + "balance_loss_clip": 1.03970623, + "balance_loss_mlp": 1.01546407, + "epoch": 0.7784157522921991, + "flos": 21398091770880.0, + "grad_norm": 2.366818430498899, + "language_loss": 0.65404934, + "learning_rate": 4.932056660665689e-07, + "loss": 0.67546898, + "num_input_tokens_seen": 279352985, + "step": 12947, + "time_per_iteration": 2.6700103282928467 + }, + { + "auxiliary_loss_clip": 0.01049825, + "auxiliary_loss_mlp": 0.01043003, + "balance_loss_clip": 1.03298378, + "balance_loss_mlp": 1.02796459, + "epoch": 0.778475875544867, + "flos": 20813861059200.0, + "grad_norm": 1.8657083989144876, + "language_loss": 0.64925945, + "learning_rate": 4.929495979764147e-07, + "loss": 0.67018777, + "num_input_tokens_seen": 279371360, + "step": 12948, + "time_per_iteration": 2.8412203788757324 + }, + { + "auxiliary_loss_clip": 0.01112305, + "auxiliary_loss_mlp": 0.01035608, + "balance_loss_clip": 1.03932905, + "balance_loss_mlp": 1.02261424, + "epoch": 0.778535998797535, + "flos": 14355363104640.0, + "grad_norm": 1.8274723515678126, + "language_loss": 0.75157881, + "learning_rate": 4.926935870337625e-07, + "loss": 0.77305794, + "num_input_tokens_seen": 279389400, + "step": 12949, + "time_per_iteration": 2.641893148422241 + }, + { + "auxiliary_loss_clip": 0.01116388, + "auxiliary_loss_mlp": 0.01033844, + "balance_loss_clip": 1.04068756, + "balance_loss_mlp": 1.02045703, + "epoch": 0.7785961220502029, + "flos": 19209552007680.0, + "grad_norm": 2.2581725959312156, + "language_loss": 0.68925655, + "learning_rate": 4.924376332483202e-07, + "loss": 0.71075886, + "num_input_tokens_seen": 279409715, + "step": 12950, + "time_per_iteration": 2.7213573455810547 + }, + { + "auxiliary_loss_clip": 0.01096074, + "auxiliary_loss_mlp": 0.01034434, + "balance_loss_clip": 1.03823721, + "balance_loss_mlp": 1.02142787, + "epoch": 0.7786562453028709, + "flos": 25738757884800.0, + "grad_norm": 1.7372750277816864, + "language_loss": 0.71980989, + "learning_rate": 4.921817366297938e-07, + "loss": 0.74111497, + "num_input_tokens_seen": 279427705, + "step": 12951, + "time_per_iteration": 2.741422414779663 + }, + { + "auxiliary_loss_clip": 0.01087111, + "auxiliary_loss_mlp": 0.01035076, + "balance_loss_clip": 1.03640008, + "balance_loss_mlp": 1.02192152, + "epoch": 0.7787163685555388, + "flos": 25739440243200.0, + "grad_norm": 1.8924083949863153, + "language_loss": 0.65915614, + "learning_rate": 4.919258971878877e-07, + "loss": 0.68037808, + "num_input_tokens_seen": 279448215, + "step": 12952, + "time_per_iteration": 2.770171880722046 + }, + { + "auxiliary_loss_clip": 0.0108209, + "auxiliary_loss_mlp": 0.01031543, + "balance_loss_clip": 1.03549063, + "balance_loss_mlp": 1.01928258, + "epoch": 0.7787764918082068, + "flos": 22747722416640.0, + "grad_norm": 1.528475201753157, + "language_loss": 0.81114817, + "learning_rate": 4.916701149323022e-07, + "loss": 0.83228457, + "num_input_tokens_seen": 279466260, + "step": 12953, + "time_per_iteration": 2.708888530731201 + }, + { + "auxiliary_loss_clip": 0.01118162, + "auxiliary_loss_mlp": 0.01035624, + "balance_loss_clip": 1.04281271, + "balance_loss_mlp": 1.02266002, + "epoch": 0.7788366150608748, + "flos": 15190860430080.0, + "grad_norm": 2.122341354514922, + "language_loss": 0.76798481, + "learning_rate": 4.91414389872737e-07, + "loss": 0.78952265, + "num_input_tokens_seen": 279484520, + "step": 12954, + "time_per_iteration": 2.5349183082580566 + }, + { + "auxiliary_loss_clip": 0.01100423, + "auxiliary_loss_mlp": 0.01030129, + "balance_loss_clip": 1.0369153, + "balance_loss_mlp": 1.01788616, + "epoch": 0.7788967383135428, + "flos": 21210242618880.0, + "grad_norm": 1.5352459629047766, + "language_loss": 0.72880197, + "learning_rate": 4.911587220188905e-07, + "loss": 0.75010741, + "num_input_tokens_seen": 279503130, + "step": 12955, + "time_per_iteration": 2.7405974864959717 + }, + { + "auxiliary_loss_clip": 0.01079595, + "auxiliary_loss_mlp": 0.0104146, + "balance_loss_clip": 1.03563166, + "balance_loss_mlp": 1.02835917, + "epoch": 0.7789568615662107, + "flos": 21682970536320.0, + "grad_norm": 1.6339057488297875, + "language_loss": 0.68833733, + "learning_rate": 4.909031113804551e-07, + "loss": 0.70954788, + "num_input_tokens_seen": 279521930, + "step": 12956, + "time_per_iteration": 2.6949398517608643 + }, + { + "auxiliary_loss_clip": 0.01076197, + "auxiliary_loss_mlp": 0.01034972, + "balance_loss_clip": 1.0365479, + "balance_loss_mlp": 1.0227586, + "epoch": 0.7790169848188787, + "flos": 26360371676160.0, + "grad_norm": 1.6442430086846629, + "language_loss": 0.76081383, + "learning_rate": 4.906475579671252e-07, + "loss": 0.78192556, + "num_input_tokens_seen": 279542375, + "step": 12957, + "time_per_iteration": 2.7577781677246094 + }, + { + "auxiliary_loss_clip": 0.01041804, + "auxiliary_loss_mlp": 0.01027809, + "balance_loss_clip": 1.03647232, + "balance_loss_mlp": 1.01506531, + "epoch": 0.7790771080715466, + "flos": 25516183259520.0, + "grad_norm": 1.5510435056277987, + "language_loss": 0.77168477, + "learning_rate": 4.903920617885917e-07, + "loss": 0.79238093, + "num_input_tokens_seen": 279561885, + "step": 12958, + "time_per_iteration": 2.902573585510254 + }, + { + "auxiliary_loss_clip": 0.01099333, + "auxiliary_loss_mlp": 0.01042234, + "balance_loss_clip": 1.03847003, + "balance_loss_mlp": 1.02687943, + "epoch": 0.7791372313242146, + "flos": 16034186920320.0, + "grad_norm": 2.00916706928726, + "language_loss": 0.71559989, + "learning_rate": 4.901366228545418e-07, + "loss": 0.73701555, + "num_input_tokens_seen": 279579965, + "step": 12959, + "time_per_iteration": 2.6020190715789795 + }, + { + "auxiliary_loss_clip": 0.01094821, + "auxiliary_loss_mlp": 0.00771197, + "balance_loss_clip": 1.03832543, + "balance_loss_mlp": 1.00027037, + "epoch": 0.7791973545768827, + "flos": 23842207779840.0, + "grad_norm": 1.6491836005518046, + "language_loss": 0.78150439, + "learning_rate": 4.898812411746632e-07, + "loss": 0.80016458, + "num_input_tokens_seen": 279599030, + "step": 12960, + "time_per_iteration": 2.6712677478790283 + }, + { + "auxiliary_loss_clip": 0.01104299, + "auxiliary_loss_mlp": 0.01040928, + "balance_loss_clip": 1.03950214, + "balance_loss_mlp": 1.02792239, + "epoch": 0.7792574778295506, + "flos": 24168384207360.0, + "grad_norm": 2.171108267887673, + "language_loss": 0.75204015, + "learning_rate": 4.896259167586385e-07, + "loss": 0.77349246, + "num_input_tokens_seen": 279614400, + "step": 12961, + "time_per_iteration": 2.6742923259735107 + }, + { + "auxiliary_loss_clip": 0.01087433, + "auxiliary_loss_mlp": 0.01038038, + "balance_loss_clip": 1.03869224, + "balance_loss_mlp": 1.02624202, + "epoch": 0.7793176010822186, + "flos": 21464921024640.0, + "grad_norm": 1.7879944844879476, + "language_loss": 0.73984349, + "learning_rate": 4.893706496161511e-07, + "loss": 0.76109815, + "num_input_tokens_seen": 279633745, + "step": 12962, + "time_per_iteration": 2.6879115104675293 + }, + { + "auxiliary_loss_clip": 0.01101036, + "auxiliary_loss_mlp": 0.01030588, + "balance_loss_clip": 1.03875148, + "balance_loss_mlp": 1.01782036, + "epoch": 0.7793777243348865, + "flos": 20666699038080.0, + "grad_norm": 1.7723287922493858, + "language_loss": 0.69576943, + "learning_rate": 4.891154397568795e-07, + "loss": 0.71708572, + "num_input_tokens_seen": 279651165, + "step": 12963, + "time_per_iteration": 2.6385724544525146 + }, + { + "auxiliary_loss_clip": 0.01101416, + "auxiliary_loss_mlp": 0.00770165, + "balance_loss_clip": 1.04028928, + "balance_loss_mlp": 1.00022078, + "epoch": 0.7794378475875545, + "flos": 27125771610240.0, + "grad_norm": 1.6031620431196494, + "language_loss": 0.63797098, + "learning_rate": 4.888602871905019e-07, + "loss": 0.65668678, + "num_input_tokens_seen": 279671175, + "step": 12964, + "time_per_iteration": 2.6388909816741943 + }, + { + "auxiliary_loss_clip": 0.01092497, + "auxiliary_loss_mlp": 0.010352, + "balance_loss_clip": 1.03853321, + "balance_loss_mlp": 1.02259946, + "epoch": 0.7794979708402224, + "flos": 28074136446720.0, + "grad_norm": 1.8780726000065868, + "language_loss": 0.76702619, + "learning_rate": 4.88605191926694e-07, + "loss": 0.7883032, + "num_input_tokens_seen": 279688675, + "step": 12965, + "time_per_iteration": 4.301928758621216 + }, + { + "auxiliary_loss_clip": 0.01089139, + "auxiliary_loss_mlp": 0.01039643, + "balance_loss_clip": 1.03389204, + "balance_loss_mlp": 1.02626801, + "epoch": 0.7795580940928905, + "flos": 26869548919680.0, + "grad_norm": 1.824856626010527, + "language_loss": 0.73063076, + "learning_rate": 4.883501539751289e-07, + "loss": 0.75191855, + "num_input_tokens_seen": 279710245, + "step": 12966, + "time_per_iteration": 4.184988498687744 + }, + { + "auxiliary_loss_clip": 0.01088561, + "auxiliary_loss_mlp": 0.00769043, + "balance_loss_clip": 1.04008389, + "balance_loss_mlp": 1.00027704, + "epoch": 0.7796182173455584, + "flos": 23835384195840.0, + "grad_norm": 1.6189038671897886, + "language_loss": 0.7464664, + "learning_rate": 4.880951733454768e-07, + "loss": 0.76504242, + "num_input_tokens_seen": 279729045, + "step": 12967, + "time_per_iteration": 2.7788522243499756 + }, + { + "auxiliary_loss_clip": 0.0111227, + "auxiliary_loss_mlp": 0.01032605, + "balance_loss_clip": 1.03953099, + "balance_loss_mlp": 1.01915836, + "epoch": 0.7796783405982264, + "flos": 19792238434560.0, + "grad_norm": 3.3219826288937253, + "language_loss": 0.72220939, + "learning_rate": 4.878402500474073e-07, + "loss": 0.74365819, + "num_input_tokens_seen": 279748350, + "step": 12968, + "time_per_iteration": 4.058116436004639 + }, + { + "auxiliary_loss_clip": 0.01085681, + "auxiliary_loss_mlp": 0.01039035, + "balance_loss_clip": 1.03827214, + "balance_loss_mlp": 1.02664959, + "epoch": 0.7797384638508943, + "flos": 15450207603840.0, + "grad_norm": 1.9018596701865034, + "language_loss": 0.61007255, + "learning_rate": 4.875853840905874e-07, + "loss": 0.6313197, + "num_input_tokens_seen": 279765620, + "step": 12969, + "time_per_iteration": 2.6471657752990723 + }, + { + "auxiliary_loss_clip": 0.01090989, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.03693545, + "balance_loss_mlp": 1.0227617, + "epoch": 0.7797985871035623, + "flos": 20922742160640.0, + "grad_norm": 1.800586767958732, + "language_loss": 0.70180488, + "learning_rate": 4.873305754846811e-07, + "loss": 0.72305787, + "num_input_tokens_seen": 279782485, + "step": 12970, + "time_per_iteration": 2.6519546508789062 + }, + { + "auxiliary_loss_clip": 0.01075649, + "auxiliary_loss_mlp": 0.00770753, + "balance_loss_clip": 1.04074073, + "balance_loss_mlp": 1.00021172, + "epoch": 0.7798587103562302, + "flos": 36937212514560.0, + "grad_norm": 1.645308207198137, + "language_loss": 0.72213817, + "learning_rate": 4.870758242393507e-07, + "loss": 0.7406022, + "num_input_tokens_seen": 279804170, + "step": 12971, + "time_per_iteration": 2.818019390106201 + }, + { + "auxiliary_loss_clip": 0.0106953, + "auxiliary_loss_mlp": 0.01037101, + "balance_loss_clip": 1.03421068, + "balance_loss_mlp": 1.02360034, + "epoch": 0.7799188336088982, + "flos": 22419283432320.0, + "grad_norm": 3.5320834107901486, + "language_loss": 0.74761558, + "learning_rate": 4.868211303642578e-07, + "loss": 0.76868188, + "num_input_tokens_seen": 279823730, + "step": 12972, + "time_per_iteration": 2.7724294662475586 + }, + { + "auxiliary_loss_clip": 0.01111753, + "auxiliary_loss_mlp": 0.01025205, + "balance_loss_clip": 1.03887677, + "balance_loss_mlp": 1.01239038, + "epoch": 0.7799789568615663, + "flos": 18880466578560.0, + "grad_norm": 1.8982422603948057, + "language_loss": 0.71497881, + "learning_rate": 4.865664938690584e-07, + "loss": 0.73634839, + "num_input_tokens_seen": 279843035, + "step": 12973, + "time_per_iteration": 4.207505226135254 + }, + { + "auxiliary_loss_clip": 0.01099331, + "auxiliary_loss_mlp": 0.01032267, + "balance_loss_clip": 1.03967643, + "balance_loss_mlp": 1.0208292, + "epoch": 0.7800390801142342, + "flos": 20262272832000.0, + "grad_norm": 1.9924582249119662, + "language_loss": 0.77612895, + "learning_rate": 4.863119147634089e-07, + "loss": 0.79744494, + "num_input_tokens_seen": 279861450, + "step": 12974, + "time_per_iteration": 2.6812784671783447 + }, + { + "auxiliary_loss_clip": 0.01077043, + "auxiliary_loss_mlp": 0.01031844, + "balance_loss_clip": 1.0368197, + "balance_loss_mlp": 1.01885045, + "epoch": 0.7800992033669022, + "flos": 16690310703360.0, + "grad_norm": 1.5902544107071221, + "language_loss": 0.69343281, + "learning_rate": 4.86057393056964e-07, + "loss": 0.71452165, + "num_input_tokens_seen": 279878660, + "step": 12975, + "time_per_iteration": 2.668877124786377 + }, + { + "auxiliary_loss_clip": 0.01074216, + "auxiliary_loss_mlp": 0.01029987, + "balance_loss_clip": 1.03641438, + "balance_loss_mlp": 1.0174228, + "epoch": 0.7801593266195701, + "flos": 18585208782720.0, + "grad_norm": 1.9719657409906464, + "language_loss": 0.82066941, + "learning_rate": 4.858029287593739e-07, + "loss": 0.84171152, + "num_input_tokens_seen": 279895685, + "step": 12976, + "time_per_iteration": 2.760437488555908 + }, + { + "auxiliary_loss_clip": 0.01090901, + "auxiliary_loss_mlp": 0.00770609, + "balance_loss_clip": 1.03640187, + "balance_loss_mlp": 1.00019145, + "epoch": 0.7802194498722381, + "flos": 25484941405440.0, + "grad_norm": 1.5169608974947786, + "language_loss": 0.66052604, + "learning_rate": 4.85548521880289e-07, + "loss": 0.6791411, + "num_input_tokens_seen": 279917240, + "step": 12977, + "time_per_iteration": 2.7686586380004883 + }, + { + "auxiliary_loss_clip": 0.01087933, + "auxiliary_loss_mlp": 0.01028467, + "balance_loss_clip": 1.03792357, + "balance_loss_mlp": 1.01699352, + "epoch": 0.780279573124906, + "flos": 31176315573120.0, + "grad_norm": 1.5120129099478161, + "language_loss": 0.74935395, + "learning_rate": 4.852941724293554e-07, + "loss": 0.77051795, + "num_input_tokens_seen": 279938665, + "step": 12978, + "time_per_iteration": 2.775379180908203 + }, + { + "auxiliary_loss_clip": 0.01087009, + "auxiliary_loss_mlp": 0.01044028, + "balance_loss_clip": 1.03668034, + "balance_loss_mlp": 1.02886498, + "epoch": 0.780339696377574, + "flos": 26944027770240.0, + "grad_norm": 2.430538160229645, + "language_loss": 0.62134832, + "learning_rate": 4.85039880416219e-07, + "loss": 0.64265871, + "num_input_tokens_seen": 279957965, + "step": 12979, + "time_per_iteration": 2.715329170227051 + }, + { + "auxiliary_loss_clip": 0.01111779, + "auxiliary_loss_mlp": 0.01030782, + "balance_loss_clip": 1.03983402, + "balance_loss_mlp": 1.01825941, + "epoch": 0.780399819630242, + "flos": 27957426180480.0, + "grad_norm": 1.9760483655422685, + "language_loss": 0.77112854, + "learning_rate": 4.847856458505217e-07, + "loss": 0.79255414, + "num_input_tokens_seen": 279977490, + "step": 12980, + "time_per_iteration": 2.6605019569396973 + }, + { + "auxiliary_loss_clip": 0.0111412, + "auxiliary_loss_mlp": 0.01033295, + "balance_loss_clip": 1.03999209, + "balance_loss_mlp": 1.02089083, + "epoch": 0.78045994288291, + "flos": 22486795044480.0, + "grad_norm": 1.9592981721345673, + "language_loss": 0.77939653, + "learning_rate": 4.845314687419046e-07, + "loss": 0.80087066, + "num_input_tokens_seen": 279994220, + "step": 12981, + "time_per_iteration": 2.658205032348633 + }, + { + "auxiliary_loss_clip": 0.01069277, + "auxiliary_loss_mlp": 0.01036103, + "balance_loss_clip": 1.0379262, + "balance_loss_mlp": 1.02320492, + "epoch": 0.7805200661355779, + "flos": 20850849089280.0, + "grad_norm": 2.387436806844423, + "language_loss": 0.72364557, + "learning_rate": 4.842773491000067e-07, + "loss": 0.74469936, + "num_input_tokens_seen": 280012590, + "step": 12982, + "time_per_iteration": 2.6541051864624023 + }, + { + "auxiliary_loss_clip": 0.01084276, + "auxiliary_loss_mlp": 0.01030772, + "balance_loss_clip": 1.03608441, + "balance_loss_mlp": 1.01907182, + "epoch": 0.7805801893882459, + "flos": 25665966973440.0, + "grad_norm": 1.5019346121142914, + "language_loss": 0.73412144, + "learning_rate": 4.840232869344636e-07, + "loss": 0.75527191, + "num_input_tokens_seen": 280033700, + "step": 12983, + "time_per_iteration": 2.6957807540893555 + }, + { + "auxiliary_loss_clip": 0.01083908, + "auxiliary_loss_mlp": 0.01031264, + "balance_loss_clip": 1.03741837, + "balance_loss_mlp": 1.0185684, + "epoch": 0.7806403126409138, + "flos": 11327806483200.0, + "grad_norm": 1.8415039374254183, + "language_loss": 0.74685752, + "learning_rate": 4.837692822549086e-07, + "loss": 0.76800919, + "num_input_tokens_seen": 280052215, + "step": 12984, + "time_per_iteration": 2.6339285373687744 + }, + { + "auxiliary_loss_clip": 0.0108127, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.03251958, + "balance_loss_mlp": 1.02092516, + "epoch": 0.7807004358935818, + "flos": 19573362910080.0, + "grad_norm": 2.0272215357184646, + "language_loss": 0.81049699, + "learning_rate": 4.835153350709746e-07, + "loss": 0.83163786, + "num_input_tokens_seen": 280070525, + "step": 12985, + "time_per_iteration": 2.6104180812835693 + }, + { + "auxiliary_loss_clip": 0.01088889, + "auxiliary_loss_mlp": 0.01033801, + "balance_loss_clip": 1.03684783, + "balance_loss_mlp": 1.02074158, + "epoch": 0.7807605591462499, + "flos": 19135827342720.0, + "grad_norm": 2.394678033024852, + "language_loss": 0.76863611, + "learning_rate": 4.832614453922915e-07, + "loss": 0.78986299, + "num_input_tokens_seen": 280089855, + "step": 12986, + "time_per_iteration": 2.6664822101593018 + }, + { + "auxiliary_loss_clip": 0.01100426, + "auxiliary_loss_mlp": 0.01035599, + "balance_loss_clip": 1.037323, + "balance_loss_mlp": 1.02314782, + "epoch": 0.7808206823989178, + "flos": 32374654133760.0, + "grad_norm": 2.6632985579320128, + "language_loss": 0.73982435, + "learning_rate": 4.830076132284859e-07, + "loss": 0.76118457, + "num_input_tokens_seen": 280109960, + "step": 12987, + "time_per_iteration": 2.6844065189361572 + }, + { + "auxiliary_loss_clip": 0.01022794, + "auxiliary_loss_mlp": 0.00999717, + "balance_loss_clip": 1.01035285, + "balance_loss_mlp": 0.99873984, + "epoch": 0.7808808056515858, + "flos": 55050235061760.0, + "grad_norm": 0.7788144710273639, + "language_loss": 0.55080384, + "learning_rate": 4.82753838589184e-07, + "loss": 0.57102895, + "num_input_tokens_seen": 280169805, + "step": 12988, + "time_per_iteration": 3.1616508960723877 + }, + { + "auxiliary_loss_clip": 0.0107797, + "auxiliary_loss_mlp": 0.01031828, + "balance_loss_clip": 1.03549254, + "balance_loss_mlp": 1.01985967, + "epoch": 0.7809409289042537, + "flos": 12859468277760.0, + "grad_norm": 3.638882308233123, + "language_loss": 0.81044191, + "learning_rate": 4.82500121484009e-07, + "loss": 0.83153987, + "num_input_tokens_seen": 280184630, + "step": 12989, + "time_per_iteration": 2.660554885864258 + }, + { + "auxiliary_loss_clip": 0.01077669, + "auxiliary_loss_mlp": 0.01031778, + "balance_loss_clip": 1.03635395, + "balance_loss_mlp": 1.01853991, + "epoch": 0.7810010521569217, + "flos": 21687244254720.0, + "grad_norm": 1.7099876560000518, + "language_loss": 0.70650768, + "learning_rate": 4.822464619225806e-07, + "loss": 0.72760212, + "num_input_tokens_seen": 280203880, + "step": 12990, + "time_per_iteration": 2.7570815086364746 + }, + { + "auxiliary_loss_clip": 0.01087538, + "auxiliary_loss_mlp": 0.01034172, + "balance_loss_clip": 1.03705347, + "balance_loss_mlp": 1.02005148, + "epoch": 0.7810611754095896, + "flos": 16757068129920.0, + "grad_norm": 1.9898673429166094, + "language_loss": 0.77492607, + "learning_rate": 4.819928599145184e-07, + "loss": 0.79614317, + "num_input_tokens_seen": 280220460, + "step": 12991, + "time_per_iteration": 2.655853748321533 + }, + { + "auxiliary_loss_clip": 0.01071528, + "auxiliary_loss_mlp": 0.0103742, + "balance_loss_clip": 1.03491211, + "balance_loss_mlp": 1.02443242, + "epoch": 0.7811212986622577, + "flos": 43507464658560.0, + "grad_norm": 1.7999740041710885, + "language_loss": 0.6594398, + "learning_rate": 4.817393154694398e-07, + "loss": 0.68052924, + "num_input_tokens_seen": 280242680, + "step": 12992, + "time_per_iteration": 2.885798931121826 + }, + { + "auxiliary_loss_clip": 0.01114039, + "auxiliary_loss_mlp": 0.01030727, + "balance_loss_clip": 1.04082417, + "balance_loss_mlp": 1.01861548, + "epoch": 0.7811814219149256, + "flos": 21757700782080.0, + "grad_norm": 1.7673036757148999, + "language_loss": 0.61809367, + "learning_rate": 4.814858285969578e-07, + "loss": 0.63954139, + "num_input_tokens_seen": 280260655, + "step": 12993, + "time_per_iteration": 2.5982654094696045 + }, + { + "auxiliary_loss_clip": 0.01085768, + "auxiliary_loss_mlp": 0.01032115, + "balance_loss_clip": 1.03539443, + "balance_loss_mlp": 1.01902032, + "epoch": 0.7812415451675936, + "flos": 24061514267520.0, + "grad_norm": 1.4534277443177828, + "language_loss": 0.68547273, + "learning_rate": 4.812323993066862e-07, + "loss": 0.70665157, + "num_input_tokens_seen": 280281185, + "step": 12994, + "time_per_iteration": 2.7115039825439453 + }, + { + "auxiliary_loss_clip": 0.01109576, + "auxiliary_loss_mlp": 0.01027659, + "balance_loss_clip": 1.03841043, + "balance_loss_mlp": 1.01556516, + "epoch": 0.7813016684202615, + "flos": 18989706816000.0, + "grad_norm": 1.8869179456101774, + "language_loss": 0.68850774, + "learning_rate": 4.809790276082335e-07, + "loss": 0.70988011, + "num_input_tokens_seen": 280298255, + "step": 12995, + "time_per_iteration": 2.6276211738586426 + }, + { + "auxiliary_loss_clip": 0.01066626, + "auxiliary_loss_mlp": 0.01029556, + "balance_loss_clip": 1.03578615, + "balance_loss_mlp": 1.01815367, + "epoch": 0.7813617916729295, + "flos": 25260786581760.0, + "grad_norm": 1.6581465647867601, + "language_loss": 0.74758989, + "learning_rate": 4.807257135112088e-07, + "loss": 0.76855165, + "num_input_tokens_seen": 280319000, + "step": 12996, + "time_per_iteration": 2.7556345462799072 + }, + { + "auxiliary_loss_clip": 0.01115417, + "auxiliary_loss_mlp": 0.01034278, + "balance_loss_clip": 1.04004622, + "balance_loss_mlp": 1.02160597, + "epoch": 0.7814219149255974, + "flos": 17966037116160.0, + "grad_norm": 2.7982414236779385, + "language_loss": 0.68035823, + "learning_rate": 4.804724570252167e-07, + "loss": 0.70185518, + "num_input_tokens_seen": 280336375, + "step": 12997, + "time_per_iteration": 2.633403778076172 + }, + { + "auxiliary_loss_clip": 0.01115354, + "auxiliary_loss_mlp": 0.01035627, + "balance_loss_clip": 1.03941298, + "balance_loss_mlp": 1.02176905, + "epoch": 0.7814820381782654, + "flos": 25776176878080.0, + "grad_norm": 1.750414047475771, + "language_loss": 0.81803972, + "learning_rate": 4.802192581598614e-07, + "loss": 0.83954954, + "num_input_tokens_seen": 280358760, + "step": 12998, + "time_per_iteration": 2.6855201721191406 + }, + { + "auxiliary_loss_clip": 0.01083435, + "auxiliary_loss_mlp": 0.01038171, + "balance_loss_clip": 1.03414857, + "balance_loss_mlp": 1.02490866, + "epoch": 0.7815421614309335, + "flos": 20519572930560.0, + "grad_norm": 2.2116291760065523, + "language_loss": 0.74893302, + "learning_rate": 4.799661169247453e-07, + "loss": 0.77014905, + "num_input_tokens_seen": 280377085, + "step": 12999, + "time_per_iteration": 2.657938241958618 + }, + { + "auxiliary_loss_clip": 0.01098221, + "auxiliary_loss_mlp": 0.01042598, + "balance_loss_clip": 1.0372951, + "balance_loss_mlp": 1.02817392, + "epoch": 0.7816022846836014, + "flos": 21287666384640.0, + "grad_norm": 3.4549180565502957, + "language_loss": 0.84463656, + "learning_rate": 4.797130333294652e-07, + "loss": 0.8660447, + "num_input_tokens_seen": 280395465, + "step": 13000, + "time_per_iteration": 2.652675151824951 + }, + { + "auxiliary_loss_clip": 0.01102345, + "auxiliary_loss_mlp": 0.01033163, + "balance_loss_clip": 1.03934288, + "balance_loss_mlp": 1.02033567, + "epoch": 0.7816624079362694, + "flos": 19208402772480.0, + "grad_norm": 1.8050152528671168, + "language_loss": 0.66003239, + "learning_rate": 4.794600073836192e-07, + "loss": 0.68138748, + "num_input_tokens_seen": 280412775, + "step": 13001, + "time_per_iteration": 2.650995969772339 + }, + { + "auxiliary_loss_clip": 0.01073705, + "auxiliary_loss_mlp": 0.01037556, + "balance_loss_clip": 1.03569674, + "balance_loss_mlp": 1.02527714, + "epoch": 0.7817225311889373, + "flos": 26104687689600.0, + "grad_norm": 1.5795311212034024, + "language_loss": 0.67058933, + "learning_rate": 4.792070390968027e-07, + "loss": 0.69170189, + "num_input_tokens_seen": 280432905, + "step": 13002, + "time_per_iteration": 2.811582326889038 + }, + { + "auxiliary_loss_clip": 0.01105543, + "auxiliary_loss_mlp": 0.01035107, + "balance_loss_clip": 1.04086781, + "balance_loss_mlp": 1.02175558, + "epoch": 0.7817826544416053, + "flos": 21250929749760.0, + "grad_norm": 2.254654590765684, + "language_loss": 0.73237813, + "learning_rate": 4.78954128478607e-07, + "loss": 0.75378466, + "num_input_tokens_seen": 280450785, + "step": 13003, + "time_per_iteration": 2.6425418853759766 + }, + { + "auxiliary_loss_clip": 0.01101875, + "auxiliary_loss_mlp": 0.01035905, + "balance_loss_clip": 1.03900814, + "balance_loss_mlp": 1.02342987, + "epoch": 0.7818427776942732, + "flos": 19932181822080.0, + "grad_norm": 2.055818402329747, + "language_loss": 0.61984468, + "learning_rate": 4.787012755386233e-07, + "loss": 0.64122242, + "num_input_tokens_seen": 280468400, + "step": 13004, + "time_per_iteration": 2.6202731132507324 + }, + { + "auxiliary_loss_clip": 0.0110586, + "auxiliary_loss_mlp": 0.01032863, + "balance_loss_clip": 1.03744853, + "balance_loss_mlp": 1.02140069, + "epoch": 0.7819029009469413, + "flos": 11363753018880.0, + "grad_norm": 1.8629152700369227, + "language_loss": 0.82870841, + "learning_rate": 4.784484802864403e-07, + "loss": 0.85009563, + "num_input_tokens_seen": 280483930, + "step": 13005, + "time_per_iteration": 4.243497371673584 + }, + { + "auxiliary_loss_clip": 0.01070901, + "auxiliary_loss_mlp": 0.00771151, + "balance_loss_clip": 1.03450751, + "balance_loss_mlp": 1.00017846, + "epoch": 0.7819630241996092, + "flos": 24279276470400.0, + "grad_norm": 1.8867656052793076, + "language_loss": 0.72342831, + "learning_rate": 4.781957427316432e-07, + "loss": 0.74184883, + "num_input_tokens_seen": 280503465, + "step": 13006, + "time_per_iteration": 4.330881357192993 + }, + { + "auxiliary_loss_clip": 0.01101615, + "auxiliary_loss_mlp": 0.00771026, + "balance_loss_clip": 1.03858209, + "balance_loss_mlp": 1.00022697, + "epoch": 0.7820231474522772, + "flos": 22708902792960.0, + "grad_norm": 1.62503166301797, + "language_loss": 0.72343135, + "learning_rate": 4.779430628838157e-07, + "loss": 0.74215776, + "num_input_tokens_seen": 280523375, + "step": 13007, + "time_per_iteration": 4.214543581008911 + }, + { + "auxiliary_loss_clip": 0.01111637, + "auxiliary_loss_mlp": 0.01028444, + "balance_loss_clip": 1.03696549, + "balance_loss_mlp": 1.01505101, + "epoch": 0.7820832707049451, + "flos": 20047419630720.0, + "grad_norm": 1.901361826456498, + "language_loss": 0.68807894, + "learning_rate": 4.776904407525397e-07, + "loss": 0.70947969, + "num_input_tokens_seen": 280542920, + "step": 13008, + "time_per_iteration": 2.6050710678100586 + }, + { + "auxiliary_loss_clip": 0.01082934, + "auxiliary_loss_mlp": 0.01028936, + "balance_loss_clip": 1.03609729, + "balance_loss_mlp": 1.01611555, + "epoch": 0.7821433939576131, + "flos": 27162795553920.0, + "grad_norm": 2.2501417791775036, + "language_loss": 0.69864273, + "learning_rate": 4.774378763473954e-07, + "loss": 0.71976143, + "num_input_tokens_seen": 280561700, + "step": 13009, + "time_per_iteration": 2.7489216327667236 + }, + { + "auxiliary_loss_clip": 0.01069744, + "auxiliary_loss_mlp": 0.01029641, + "balance_loss_clip": 1.03394186, + "balance_loss_mlp": 1.01677287, + "epoch": 0.782203517210281, + "flos": 22602068766720.0, + "grad_norm": 2.6181121023195386, + "language_loss": 0.81756222, + "learning_rate": 4.771853696779586e-07, + "loss": 0.83855605, + "num_input_tokens_seen": 280580605, + "step": 13010, + "time_per_iteration": 2.754182815551758 + }, + { + "auxiliary_loss_clip": 0.01097326, + "auxiliary_loss_mlp": 0.01033843, + "balance_loss_clip": 1.03652465, + "balance_loss_mlp": 1.02199399, + "epoch": 0.782263640462949, + "flos": 29059812535680.0, + "grad_norm": 1.5058057514043965, + "language_loss": 0.61957836, + "learning_rate": 4.76932920753806e-07, + "loss": 0.64089006, + "num_input_tokens_seen": 280601495, + "step": 13011, + "time_per_iteration": 2.676269292831421 + }, + { + "auxiliary_loss_clip": 0.01098762, + "auxiliary_loss_mlp": 0.01028506, + "balance_loss_clip": 1.03798711, + "balance_loss_mlp": 1.01740146, + "epoch": 0.782323763715617, + "flos": 25299498464640.0, + "grad_norm": 1.7834780447298506, + "language_loss": 0.703578, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.72485065, + "num_input_tokens_seen": 280622760, + "step": 13012, + "time_per_iteration": 4.222137451171875 + }, + { + "auxiliary_loss_clip": 0.01030861, + "auxiliary_loss_mlp": 0.00999997, + "balance_loss_clip": 1.00834417, + "balance_loss_mlp": 0.99901354, + "epoch": 0.782383886968285, + "flos": 65194388668800.0, + "grad_norm": 0.7024977302558347, + "language_loss": 0.55065835, + "learning_rate": 4.764281961796395e-07, + "loss": 0.57096696, + "num_input_tokens_seen": 280687115, + "step": 13013, + "time_per_iteration": 3.2604727745056152 + }, + { + "auxiliary_loss_clip": 0.01088673, + "auxiliary_loss_mlp": 0.01038861, + "balance_loss_clip": 1.0394305, + "balance_loss_mlp": 1.02612925, + "epoch": 0.782444010220953, + "flos": 18405440190720.0, + "grad_norm": 1.739605099015053, + "language_loss": 0.65488654, + "learning_rate": 4.76175920548765e-07, + "loss": 0.67616189, + "num_input_tokens_seen": 280705000, + "step": 13014, + "time_per_iteration": 2.702570915222168 + }, + { + "auxiliary_loss_clip": 0.01005007, + "auxiliary_loss_mlp": 0.01000676, + "balance_loss_clip": 1.0074594, + "balance_loss_mlp": 0.99947244, + "epoch": 0.7825041334736209, + "flos": 63955003841280.0, + "grad_norm": 1.5199496836725135, + "language_loss": 0.58456129, + "learning_rate": 4.759237027014524e-07, + "loss": 0.60461813, + "num_input_tokens_seen": 280773525, + "step": 13015, + "time_per_iteration": 3.2708168029785156 + }, + { + "auxiliary_loss_clip": 0.01082509, + "auxiliary_loss_mlp": 0.0103455, + "balance_loss_clip": 1.03745651, + "balance_loss_mlp": 1.02287316, + "epoch": 0.7825642567262889, + "flos": 20339373375360.0, + "grad_norm": 1.6009097708406814, + "language_loss": 0.74550229, + "learning_rate": 4.756715426472666e-07, + "loss": 0.76667285, + "num_input_tokens_seen": 280791915, + "step": 13016, + "time_per_iteration": 2.775660514831543 + }, + { + "auxiliary_loss_clip": 0.01111525, + "auxiliary_loss_mlp": 0.01032595, + "balance_loss_clip": 1.03842187, + "balance_loss_mlp": 1.01902854, + "epoch": 0.7826243799789568, + "flos": 20262955190400.0, + "grad_norm": 1.7770751531413016, + "language_loss": 0.75118351, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.77262467, + "num_input_tokens_seen": 280811460, + "step": 13017, + "time_per_iteration": 2.6645398139953613 + }, + { + "auxiliary_loss_clip": 0.01085213, + "auxiliary_loss_mlp": 0.01034128, + "balance_loss_clip": 1.03540921, + "balance_loss_mlp": 1.0211221, + "epoch": 0.7826845032316249, + "flos": 21132926593920.0, + "grad_norm": 1.9823505334349008, + "language_loss": 0.75479347, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.77598691, + "num_input_tokens_seen": 280825415, + "step": 13018, + "time_per_iteration": 2.6840744018554688 + }, + { + "auxiliary_loss_clip": 0.01108451, + "auxiliary_loss_mlp": 0.01029158, + "balance_loss_clip": 1.03651655, + "balance_loss_mlp": 1.0168916, + "epoch": 0.7827446264842928, + "flos": 22492253911680.0, + "grad_norm": 1.4306758819867016, + "language_loss": 0.77329135, + "learning_rate": 4.749154093390708e-07, + "loss": 0.79466748, + "num_input_tokens_seen": 280845335, + "step": 13019, + "time_per_iteration": 2.6806087493896484 + }, + { + "auxiliary_loss_clip": 0.01065952, + "auxiliary_loss_mlp": 0.01028104, + "balance_loss_clip": 1.03685999, + "balance_loss_mlp": 1.01612306, + "epoch": 0.7828047497369608, + "flos": 28840649702400.0, + "grad_norm": 1.4806512863046632, + "language_loss": 0.67511666, + "learning_rate": 4.746634805529852e-07, + "loss": 0.6960572, + "num_input_tokens_seen": 280867145, + "step": 13020, + "time_per_iteration": 2.9394872188568115 + }, + { + "auxiliary_loss_clip": 0.01099304, + "auxiliary_loss_mlp": 0.0103012, + "balance_loss_clip": 1.03999841, + "balance_loss_mlp": 1.01744223, + "epoch": 0.7828648729896287, + "flos": 23257689759360.0, + "grad_norm": 3.1494596140171787, + "language_loss": 0.62587798, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.64717221, + "num_input_tokens_seen": 280886185, + "step": 13021, + "time_per_iteration": 2.6747660636901855 + }, + { + "auxiliary_loss_clip": 0.01107745, + "auxiliary_loss_mlp": 0.01035666, + "balance_loss_clip": 1.03731537, + "balance_loss_mlp": 1.02393007, + "epoch": 0.7829249962422967, + "flos": 25265670831360.0, + "grad_norm": 1.6912709426048431, + "language_loss": 0.69153851, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.71297264, + "num_input_tokens_seen": 280907665, + "step": 13022, + "time_per_iteration": 2.698918342590332 + }, + { + "auxiliary_loss_clip": 0.00980906, + "auxiliary_loss_mlp": 0.0100189, + "balance_loss_clip": 1.01163054, + "balance_loss_mlp": 1.00038803, + "epoch": 0.7829851194949646, + "flos": 70722044645760.0, + "grad_norm": 0.6377469168354571, + "language_loss": 0.56205934, + "learning_rate": 4.739080412784131e-07, + "loss": 0.58188736, + "num_input_tokens_seen": 280971405, + "step": 13023, + "time_per_iteration": 3.4054768085479736 + }, + { + "auxiliary_loss_clip": 0.0107826, + "auxiliary_loss_mlp": 0.01032398, + "balance_loss_clip": 1.03205729, + "balance_loss_mlp": 1.01958895, + "epoch": 0.7830452427476327, + "flos": 25660795415040.0, + "grad_norm": 1.7775757007122028, + "language_loss": 0.67073244, + "learning_rate": 4.736563439132792e-07, + "loss": 0.69183898, + "num_input_tokens_seen": 280989615, + "step": 13024, + "time_per_iteration": 2.646439790725708 + }, + { + "auxiliary_loss_clip": 0.01112317, + "auxiliary_loss_mlp": 0.01028491, + "balance_loss_clip": 1.03878796, + "balance_loss_mlp": 1.01559806, + "epoch": 0.7831053660003006, + "flos": 22784315397120.0, + "grad_norm": 1.5682779156650977, + "language_loss": 0.77674961, + "learning_rate": 4.734047044272498e-07, + "loss": 0.79815769, + "num_input_tokens_seen": 281009450, + "step": 13025, + "time_per_iteration": 2.632951021194458 + }, + { + "auxiliary_loss_clip": 0.01084338, + "auxiliary_loss_mlp": 0.01036499, + "balance_loss_clip": 1.03556383, + "balance_loss_mlp": 1.02404797, + "epoch": 0.7831654892529686, + "flos": 25812267068160.0, + "grad_norm": 2.0934383648650194, + "language_loss": 0.78239512, + "learning_rate": 4.731531228298673e-07, + "loss": 0.80360353, + "num_input_tokens_seen": 281028120, + "step": 13026, + "time_per_iteration": 2.7387208938598633 + }, + { + "auxiliary_loss_clip": 0.01097191, + "auxiliary_loss_mlp": 0.01028382, + "balance_loss_clip": 1.03798652, + "balance_loss_mlp": 1.01656842, + "epoch": 0.7832256125056366, + "flos": 20771557816320.0, + "grad_norm": 2.1298369773301, + "language_loss": 0.75428832, + "learning_rate": 4.729015991306715e-07, + "loss": 0.77554405, + "num_input_tokens_seen": 281042130, + "step": 13027, + "time_per_iteration": 2.62705135345459 + }, + { + "auxiliary_loss_clip": 0.01102237, + "auxiliary_loss_mlp": 0.01031953, + "balance_loss_clip": 1.04018044, + "balance_loss_mlp": 1.01980579, + "epoch": 0.7832857357583045, + "flos": 21506541909120.0, + "grad_norm": 1.7296473073785772, + "language_loss": 0.70366251, + "learning_rate": 4.726501333391997e-07, + "loss": 0.72500432, + "num_input_tokens_seen": 281060945, + "step": 13028, + "time_per_iteration": 2.651749849319458 + }, + { + "auxiliary_loss_clip": 0.01063459, + "auxiliary_loss_mlp": 0.01038176, + "balance_loss_clip": 1.03720903, + "balance_loss_mlp": 1.02482486, + "epoch": 0.7833458590109725, + "flos": 18077791305600.0, + "grad_norm": 2.132518402701666, + "language_loss": 0.68704486, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.70806122, + "num_input_tokens_seen": 281079270, + "step": 13029, + "time_per_iteration": 2.733846664428711 + }, + { + "auxiliary_loss_clip": 0.01085127, + "auxiliary_loss_mlp": 0.01033179, + "balance_loss_clip": 1.03914356, + "balance_loss_mlp": 1.01970291, + "epoch": 0.7834059822636404, + "flos": 28288738252800.0, + "grad_norm": 1.9111735577193074, + "language_loss": 0.81041169, + "learning_rate": 4.721473755175698e-07, + "loss": 0.83159471, + "num_input_tokens_seen": 281099500, + "step": 13030, + "time_per_iteration": 2.7992770671844482 + }, + { + "auxiliary_loss_clip": 0.01104778, + "auxiliary_loss_mlp": 0.01033096, + "balance_loss_clip": 1.03917181, + "balance_loss_mlp": 1.0206331, + "epoch": 0.7834661055163085, + "flos": 31686211088640.0, + "grad_norm": 2.9451675534531847, + "language_loss": 0.70219892, + "learning_rate": 4.71896083506476e-07, + "loss": 0.72357768, + "num_input_tokens_seen": 281121250, + "step": 13031, + "time_per_iteration": 2.9042952060699463 + }, + { + "auxiliary_loss_clip": 0.01072572, + "auxiliary_loss_mlp": 0.01034559, + "balance_loss_clip": 1.03533792, + "balance_loss_mlp": 1.02237034, + "epoch": 0.7835262287689764, + "flos": 12933192942720.0, + "grad_norm": 2.323625290086717, + "language_loss": 0.790016, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.81108725, + "num_input_tokens_seen": 281138760, + "step": 13032, + "time_per_iteration": 2.750812292098999 + }, + { + "auxiliary_loss_clip": 0.0110433, + "auxiliary_loss_mlp": 0.01040225, + "balance_loss_clip": 1.03909242, + "balance_loss_mlp": 1.02739763, + "epoch": 0.7835863520216444, + "flos": 16143211676160.0, + "grad_norm": 2.0684316430418463, + "language_loss": 0.62812865, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.64957428, + "num_input_tokens_seen": 281157420, + "step": 13033, + "time_per_iteration": 2.6421468257904053 + }, + { + "auxiliary_loss_clip": 0.01098998, + "auxiliary_loss_mlp": 0.01034317, + "balance_loss_clip": 1.03790116, + "balance_loss_mlp": 1.02132297, + "epoch": 0.7836464752743123, + "flos": 11509909459200.0, + "grad_norm": 1.58357633529001, + "language_loss": 0.71756327, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.73889643, + "num_input_tokens_seen": 281174620, + "step": 13034, + "time_per_iteration": 2.7166271209716797 + }, + { + "auxiliary_loss_clip": 0.01113235, + "auxiliary_loss_mlp": 0.00771091, + "balance_loss_clip": 1.03961957, + "balance_loss_mlp": 1.00013566, + "epoch": 0.7837065985269803, + "flos": 18223696350720.0, + "grad_norm": 1.6566949403371967, + "language_loss": 0.72002685, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.73887014, + "num_input_tokens_seen": 281193865, + "step": 13035, + "time_per_iteration": 2.5778520107269287 + }, + { + "auxiliary_loss_clip": 0.01112728, + "auxiliary_loss_mlp": 0.01035529, + "balance_loss_clip": 1.03951585, + "balance_loss_mlp": 1.02208817, + "epoch": 0.7837667217796482, + "flos": 24754410599040.0, + "grad_norm": 2.9879625935132372, + "language_loss": 0.66463214, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.68611467, + "num_input_tokens_seen": 281212250, + "step": 13036, + "time_per_iteration": 2.6302857398986816 + }, + { + "auxiliary_loss_clip": 0.01104467, + "auxiliary_loss_mlp": 0.01039494, + "balance_loss_clip": 1.03855228, + "balance_loss_mlp": 1.02618408, + "epoch": 0.7838268450323163, + "flos": 22383121415040.0, + "grad_norm": 2.0949975987912848, + "language_loss": 0.73010111, + "learning_rate": 4.703895486362031e-07, + "loss": 0.75154078, + "num_input_tokens_seen": 281230850, + "step": 13037, + "time_per_iteration": 2.6575746536254883 + }, + { + "auxiliary_loss_clip": 0.01070616, + "auxiliary_loss_mlp": 0.01035879, + "balance_loss_clip": 1.03389454, + "balance_loss_mlp": 1.02229476, + "epoch": 0.7838869682849842, + "flos": 19500284689920.0, + "grad_norm": 5.499006833043144, + "language_loss": 0.59598082, + "learning_rate": 4.701386624460717e-07, + "loss": 0.61704576, + "num_input_tokens_seen": 281249810, + "step": 13038, + "time_per_iteration": 2.6556448936462402 + }, + { + "auxiliary_loss_clip": 0.01089544, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.0388062, + "balance_loss_mlp": 1.0172174, + "epoch": 0.7839470915376522, + "flos": 32892845690880.0, + "grad_norm": 1.750335160170137, + "language_loss": 0.68257546, + "learning_rate": 4.698878342684349e-07, + "loss": 0.70375991, + "num_input_tokens_seen": 281273730, + "step": 13039, + "time_per_iteration": 2.760946273803711 + }, + { + "auxiliary_loss_clip": 0.01072076, + "auxiliary_loss_mlp": 0.01024882, + "balance_loss_clip": 1.03432715, + "balance_loss_mlp": 1.01383781, + "epoch": 0.7840072147903202, + "flos": 29676003373440.0, + "grad_norm": 1.826043040750904, + "language_loss": 0.69417781, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.71514744, + "num_input_tokens_seen": 281293670, + "step": 13040, + "time_per_iteration": 2.7545461654663086 + }, + { + "auxiliary_loss_clip": 0.0106802, + "auxiliary_loss_mlp": 0.01035678, + "balance_loss_clip": 1.03712749, + "balance_loss_mlp": 1.02223086, + "epoch": 0.7840673380429881, + "flos": 18186744234240.0, + "grad_norm": 1.5142145779529246, + "language_loss": 0.67758179, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.69861877, + "num_input_tokens_seen": 281313070, + "step": 13041, + "time_per_iteration": 2.7630157470703125 + }, + { + "auxiliary_loss_clip": 0.01022608, + "auxiliary_loss_mlp": 0.00751599, + "balance_loss_clip": 1.0097084, + "balance_loss_mlp": 0.99966377, + "epoch": 0.7841274612956561, + "flos": 66346006613760.0, + "grad_norm": 0.6656190181226946, + "language_loss": 0.57380033, + "learning_rate": 4.691356979055998e-07, + "loss": 0.59154236, + "num_input_tokens_seen": 281374880, + "step": 13042, + "time_per_iteration": 3.1374757289886475 + }, + { + "auxiliary_loss_clip": 0.01087388, + "auxiliary_loss_mlp": 0.01033015, + "balance_loss_clip": 1.03713918, + "balance_loss_mlp": 1.02007461, + "epoch": 0.784187584548324, + "flos": 26648482665600.0, + "grad_norm": 2.1244828686221267, + "language_loss": 0.83795989, + "learning_rate": 4.688851018730369e-07, + "loss": 0.85916388, + "num_input_tokens_seen": 281392620, + "step": 13043, + "time_per_iteration": 2.793748378753662 + }, + { + "auxiliary_loss_clip": 0.01095712, + "auxiliary_loss_mlp": 0.01027767, + "balance_loss_clip": 1.03719783, + "balance_loss_mlp": 1.0161922, + "epoch": 0.7842477078009921, + "flos": 25740158515200.0, + "grad_norm": 1.3834924746992494, + "language_loss": 0.88441205, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.90564686, + "num_input_tokens_seen": 281413140, + "step": 13044, + "time_per_iteration": 4.261160135269165 + }, + { + "auxiliary_loss_clip": 0.01093506, + "auxiliary_loss_mlp": 0.01034687, + "balance_loss_clip": 1.03787422, + "balance_loss_mlp": 1.02180016, + "epoch": 0.78430783105366, + "flos": 21980957765760.0, + "grad_norm": 1.825374480580212, + "language_loss": 0.78958154, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.81086344, + "num_input_tokens_seen": 281430860, + "step": 13045, + "time_per_iteration": 2.7708632946014404 + }, + { + "auxiliary_loss_clip": 0.01084228, + "auxiliary_loss_mlp": 0.01031577, + "balance_loss_clip": 1.03655803, + "balance_loss_mlp": 1.0191431, + "epoch": 0.784367954306328, + "flos": 23842279607040.0, + "grad_norm": 1.484345043483713, + "language_loss": 0.72495216, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.7461102, + "num_input_tokens_seen": 281451385, + "step": 13046, + "time_per_iteration": 4.295615196228027 + }, + { + "auxiliary_loss_clip": 0.01070358, + "auxiliary_loss_mlp": 0.01035911, + "balance_loss_clip": 1.04044282, + "balance_loss_mlp": 1.02267289, + "epoch": 0.7844280775589959, + "flos": 24826662806400.0, + "grad_norm": 1.5168340119310013, + "language_loss": 0.62780952, + "learning_rate": 4.678832984380809e-07, + "loss": 0.6488722, + "num_input_tokens_seen": 281472255, + "step": 13047, + "time_per_iteration": 4.33956503868103 + }, + { + "auxiliary_loss_clip": 0.01100709, + "auxiliary_loss_mlp": 0.01027981, + "balance_loss_clip": 1.03916669, + "balance_loss_mlp": 1.01601255, + "epoch": 0.7844882008116639, + "flos": 22455660931200.0, + "grad_norm": 1.6255681359432697, + "language_loss": 0.73295152, + "learning_rate": 4.676329928006515e-07, + "loss": 0.75423837, + "num_input_tokens_seen": 281492860, + "step": 13048, + "time_per_iteration": 2.764153003692627 + }, + { + "auxiliary_loss_clip": 0.01087112, + "auxiliary_loss_mlp": 0.0103201, + "balance_loss_clip": 1.03815794, + "balance_loss_mlp": 1.01965356, + "epoch": 0.7845483240643318, + "flos": 26104041244800.0, + "grad_norm": 3.259574846755966, + "language_loss": 0.74822855, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.76941979, + "num_input_tokens_seen": 281511815, + "step": 13049, + "time_per_iteration": 2.702545642852783 + }, + { + "auxiliary_loss_clip": 0.01113727, + "auxiliary_loss_mlp": 0.01032718, + "balance_loss_clip": 1.03731608, + "balance_loss_mlp": 1.01894963, + "epoch": 0.7846084473169999, + "flos": 19354307817600.0, + "grad_norm": 1.71411914117224, + "language_loss": 0.72622865, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.74769306, + "num_input_tokens_seen": 281530090, + "step": 13050, + "time_per_iteration": 2.6567511558532715 + }, + { + "auxiliary_loss_clip": 0.01098536, + "auxiliary_loss_mlp": 0.01034296, + "balance_loss_clip": 1.03764224, + "balance_loss_mlp": 1.02170706, + "epoch": 0.7846685705696678, + "flos": 23325811902720.0, + "grad_norm": 1.9970425884506249, + "language_loss": 0.73258287, + "learning_rate": 4.668824245713825e-07, + "loss": 0.75391114, + "num_input_tokens_seen": 281547075, + "step": 13051, + "time_per_iteration": 4.220673322677612 + }, + { + "auxiliary_loss_clip": 0.01112899, + "auxiliary_loss_mlp": 0.01034321, + "balance_loss_clip": 1.03919625, + "balance_loss_mlp": 1.02135718, + "epoch": 0.7847286938223358, + "flos": 35809545962880.0, + "grad_norm": 2.6887410249812578, + "language_loss": 0.72721338, + "learning_rate": 4.666323514209227e-07, + "loss": 0.7486856, + "num_input_tokens_seen": 281568080, + "step": 13052, + "time_per_iteration": 2.7622361183166504 + }, + { + "auxiliary_loss_clip": 0.0108619, + "auxiliary_loss_mlp": 0.01035577, + "balance_loss_clip": 1.03937328, + "balance_loss_mlp": 1.02357841, + "epoch": 0.7847888170750038, + "flos": 18478159274880.0, + "grad_norm": 1.82904296097524, + "language_loss": 0.69018829, + "learning_rate": 4.663823364159183e-07, + "loss": 0.71140599, + "num_input_tokens_seen": 281586925, + "step": 13053, + "time_per_iteration": 2.7101058959960938 + }, + { + "auxiliary_loss_clip": 0.0109323, + "auxiliary_loss_mlp": 0.01031564, + "balance_loss_clip": 1.03785491, + "balance_loss_mlp": 1.01989341, + "epoch": 0.7848489403276717, + "flos": 25119155255040.0, + "grad_norm": 2.155883968401707, + "language_loss": 0.69833845, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.71958637, + "num_input_tokens_seen": 281603915, + "step": 13054, + "time_per_iteration": 2.6558749675750732 + }, + { + "auxiliary_loss_clip": 0.01102359, + "auxiliary_loss_mlp": 0.01035501, + "balance_loss_clip": 1.03816795, + "balance_loss_mlp": 1.02254295, + "epoch": 0.7849090635803397, + "flos": 26502433966080.0, + "grad_norm": 1.6743772106587247, + "language_loss": 0.76095474, + "learning_rate": 4.658824808801938e-07, + "loss": 0.78233331, + "num_input_tokens_seen": 281624220, + "step": 13055, + "time_per_iteration": 2.729825019836426 + }, + { + "auxiliary_loss_clip": 0.01115191, + "auxiliary_loss_mlp": 0.01034507, + "balance_loss_clip": 1.03995335, + "balance_loss_mlp": 1.02139974, + "epoch": 0.7849691868330076, + "flos": 20959658363520.0, + "grad_norm": 1.870278317520838, + "language_loss": 0.7499572, + "learning_rate": 4.656326403684283e-07, + "loss": 0.77145422, + "num_input_tokens_seen": 281642325, + "step": 13056, + "time_per_iteration": 2.6321020126342773 + }, + { + "auxiliary_loss_clip": 0.01048067, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.03739357, + "balance_loss_mlp": 1.01989865, + "epoch": 0.7850293100856757, + "flos": 26067484177920.0, + "grad_norm": 1.7420143195586486, + "language_loss": 0.70014071, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.72094762, + "num_input_tokens_seen": 281663065, + "step": 13057, + "time_per_iteration": 2.8007147312164307 + }, + { + "auxiliary_loss_clip": 0.01064676, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_clip": 1.03794479, + "balance_loss_mlp": 1.02130675, + "epoch": 0.7850894333383436, + "flos": 22491894775680.0, + "grad_norm": 1.791422043134008, + "language_loss": 0.76534569, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.78632534, + "num_input_tokens_seen": 281681005, + "step": 13058, + "time_per_iteration": 2.7110915184020996 + }, + { + "auxiliary_loss_clip": 0.01101284, + "auxiliary_loss_mlp": 0.01036049, + "balance_loss_clip": 1.03946376, + "balance_loss_mlp": 1.0238781, + "epoch": 0.7851495565910116, + "flos": 20558643949440.0, + "grad_norm": 1.5851127868658192, + "language_loss": 0.70834202, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.72971535, + "num_input_tokens_seen": 281697965, + "step": 13059, + "time_per_iteration": 2.7031941413879395 + }, + { + "auxiliary_loss_clip": 0.01081291, + "auxiliary_loss_mlp": 0.01038886, + "balance_loss_clip": 1.03579831, + "balance_loss_mlp": 1.02460492, + "epoch": 0.7852096798436795, + "flos": 15924838942080.0, + "grad_norm": 2.081733102958074, + "language_loss": 0.76698899, + "learning_rate": 4.646338602497144e-07, + "loss": 0.78819072, + "num_input_tokens_seen": 281716035, + "step": 13060, + "time_per_iteration": 2.7939200401306152 + }, + { + "auxiliary_loss_clip": 0.01083148, + "auxiliary_loss_mlp": 0.01031719, + "balance_loss_clip": 1.03790545, + "balance_loss_mlp": 1.01883268, + "epoch": 0.7852698030963475, + "flos": 19062282245760.0, + "grad_norm": 2.323604844819863, + "language_loss": 0.77162534, + "learning_rate": 4.643843107494654e-07, + "loss": 0.79277396, + "num_input_tokens_seen": 281732815, + "step": 13061, + "time_per_iteration": 2.697397232055664 + }, + { + "auxiliary_loss_clip": 0.01074028, + "auxiliary_loss_mlp": 0.01034387, + "balance_loss_clip": 1.03479552, + "balance_loss_mlp": 1.02089262, + "epoch": 0.7853299263490154, + "flos": 24644380262400.0, + "grad_norm": 1.8894100648574905, + "language_loss": 0.74005646, + "learning_rate": 4.641348194799164e-07, + "loss": 0.76114058, + "num_input_tokens_seen": 281751980, + "step": 13062, + "time_per_iteration": 2.9962854385375977 + }, + { + "auxiliary_loss_clip": 0.01097852, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.03713512, + "balance_loss_mlp": 1.02026176, + "epoch": 0.7853900496016835, + "flos": 22017981709440.0, + "grad_norm": 1.6526268980906231, + "language_loss": 0.68907607, + "learning_rate": 4.638853864505297e-07, + "loss": 0.71037793, + "num_input_tokens_seen": 281772670, + "step": 13063, + "time_per_iteration": 2.7347474098205566 + }, + { + "auxiliary_loss_clip": 0.01099713, + "auxiliary_loss_mlp": 0.01036078, + "balance_loss_clip": 1.04048038, + "balance_loss_mlp": 1.02360916, + "epoch": 0.7854501728543514, + "flos": 30227412032640.0, + "grad_norm": 4.546851509745459, + "language_loss": 0.72635663, + "learning_rate": 4.636360116707625e-07, + "loss": 0.74771458, + "num_input_tokens_seen": 281792930, + "step": 13064, + "time_per_iteration": 2.7636148929595947 + }, + { + "auxiliary_loss_clip": 0.01082833, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.03790045, + "balance_loss_mlp": 1.02079129, + "epoch": 0.7855102961070194, + "flos": 18843694030080.0, + "grad_norm": 14.965350757481792, + "language_loss": 0.67957228, + "learning_rate": 4.633866951500718e-07, + "loss": 0.70073175, + "num_input_tokens_seen": 281811805, + "step": 13065, + "time_per_iteration": 2.7619290351867676 + }, + { + "auxiliary_loss_clip": 0.01097669, + "auxiliary_loss_mlp": 0.01037896, + "balance_loss_clip": 1.04063308, + "balance_loss_mlp": 1.02562392, + "epoch": 0.7855704193596874, + "flos": 22309971367680.0, + "grad_norm": 1.6867324299047715, + "language_loss": 0.75999427, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.78134984, + "num_input_tokens_seen": 281831885, + "step": 13066, + "time_per_iteration": 2.647052764892578 + }, + { + "auxiliary_loss_clip": 0.0103061, + "auxiliary_loss_mlp": 0.01006066, + "balance_loss_clip": 1.00811362, + "balance_loss_mlp": 1.00509405, + "epoch": 0.7856305426123553, + "flos": 60004434407040.0, + "grad_norm": 0.7063334807152991, + "language_loss": 0.5335499, + "learning_rate": 4.628882369237346e-07, + "loss": 0.55391669, + "num_input_tokens_seen": 281900310, + "step": 13067, + "time_per_iteration": 3.2783384323120117 + }, + { + "auxiliary_loss_clip": 0.01065395, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.03609753, + "balance_loss_mlp": 1.01884413, + "epoch": 0.7856906658650233, + "flos": 21868593045120.0, + "grad_norm": 1.5153182614776801, + "language_loss": 0.67582923, + "learning_rate": 4.62639095236989e-07, + "loss": 0.69680464, + "num_input_tokens_seen": 281918870, + "step": 13068, + "time_per_iteration": 2.818237543106079 + }, + { + "auxiliary_loss_clip": 0.01076742, + "auxiliary_loss_mlp": 0.01030057, + "balance_loss_clip": 1.03852606, + "balance_loss_mlp": 1.01839852, + "epoch": 0.7857507891176913, + "flos": 23622937205760.0, + "grad_norm": 2.4110222950654325, + "language_loss": 0.68040943, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.70147741, + "num_input_tokens_seen": 281936905, + "step": 13069, + "time_per_iteration": 2.7654619216918945 + }, + { + "auxiliary_loss_clip": 0.01103004, + "auxiliary_loss_mlp": 0.01035827, + "balance_loss_clip": 1.04032803, + "balance_loss_mlp": 1.02331567, + "epoch": 0.7858109123703593, + "flos": 25520061928320.0, + "grad_norm": 1.6503036246986864, + "language_loss": 0.76820791, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.7895962, + "num_input_tokens_seen": 281955625, + "step": 13070, + "time_per_iteration": 2.7123591899871826 + }, + { + "auxiliary_loss_clip": 0.0105121, + "auxiliary_loss_mlp": 0.0105136, + "balance_loss_clip": 1.030967, + "balance_loss_mlp": 1.03746009, + "epoch": 0.7858710356230272, + "flos": 17457398576640.0, + "grad_norm": 1.7605883689591728, + "language_loss": 0.65229589, + "learning_rate": 4.618920199958083e-07, + "loss": 0.6733216, + "num_input_tokens_seen": 281973285, + "step": 13071, + "time_per_iteration": 2.727679491043091 + }, + { + "auxiliary_loss_clip": 0.01063123, + "auxiliary_loss_mlp": 0.0103513, + "balance_loss_clip": 1.03286123, + "balance_loss_mlp": 1.02270818, + "epoch": 0.7859311588756952, + "flos": 24679680353280.0, + "grad_norm": 1.7243596413538878, + "language_loss": 0.73917699, + "learning_rate": 4.616431115532442e-07, + "loss": 0.76015961, + "num_input_tokens_seen": 281991410, + "step": 13072, + "time_per_iteration": 2.8985819816589355 + }, + { + "auxiliary_loss_clip": 0.01097172, + "auxiliary_loss_mlp": 0.01032829, + "balance_loss_clip": 1.04014218, + "balance_loss_mlp": 1.0194838, + "epoch": 0.7859912821283631, + "flos": 21799142098560.0, + "grad_norm": 4.385793601952052, + "language_loss": 0.71439523, + "learning_rate": 4.613942614453268e-07, + "loss": 0.73569524, + "num_input_tokens_seen": 282010845, + "step": 13073, + "time_per_iteration": 2.670741558074951 + }, + { + "auxiliary_loss_clip": 0.01085075, + "auxiliary_loss_mlp": 0.01035125, + "balance_loss_clip": 1.0389545, + "balance_loss_mlp": 1.0218693, + "epoch": 0.7860514053810311, + "flos": 20847293642880.0, + "grad_norm": 1.6142243935129328, + "language_loss": 0.76601768, + "learning_rate": 4.611454696814938e-07, + "loss": 0.78721976, + "num_input_tokens_seen": 282029635, + "step": 13074, + "time_per_iteration": 2.715064287185669 + }, + { + "auxiliary_loss_clip": 0.01067309, + "auxiliary_loss_mlp": 0.01034423, + "balance_loss_clip": 1.03506911, + "balance_loss_mlp": 1.0224185, + "epoch": 0.786111528633699, + "flos": 24315689882880.0, + "grad_norm": 1.7966754252742998, + "language_loss": 0.75166345, + "learning_rate": 4.608967362711782e-07, + "loss": 0.77268076, + "num_input_tokens_seen": 282050285, + "step": 13075, + "time_per_iteration": 2.8381521701812744 + }, + { + "auxiliary_loss_clip": 0.01083185, + "auxiliary_loss_mlp": 0.01026692, + "balance_loss_clip": 1.04080176, + "balance_loss_mlp": 1.01497984, + "epoch": 0.7861716518863671, + "flos": 24353180703360.0, + "grad_norm": 1.743827758665396, + "language_loss": 0.69089484, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.71199363, + "num_input_tokens_seen": 282071040, + "step": 13076, + "time_per_iteration": 2.812002658843994 + }, + { + "auxiliary_loss_clip": 0.01095604, + "auxiliary_loss_mlp": 0.01028537, + "balance_loss_clip": 1.03609765, + "balance_loss_mlp": 1.01606214, + "epoch": 0.786231775139035, + "flos": 14022399006720.0, + "grad_norm": 2.296864016069315, + "language_loss": 0.80343485, + "learning_rate": 4.603994445488282e-07, + "loss": 0.82467622, + "num_input_tokens_seen": 282086610, + "step": 13077, + "time_per_iteration": 2.690382480621338 + }, + { + "auxiliary_loss_clip": 0.0110006, + "auxiliary_loss_mlp": 0.0103229, + "balance_loss_clip": 1.039482, + "balance_loss_mlp": 1.01980269, + "epoch": 0.786291898391703, + "flos": 33724248865920.0, + "grad_norm": 1.6714014639715435, + "language_loss": 0.70845038, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.72977388, + "num_input_tokens_seen": 282107440, + "step": 13078, + "time_per_iteration": 3.024754524230957 + }, + { + "auxiliary_loss_clip": 0.01096328, + "auxiliary_loss_mlp": 0.01035627, + "balance_loss_clip": 1.03739369, + "balance_loss_mlp": 1.02363431, + "epoch": 0.786352021644371, + "flos": 25811476968960.0, + "grad_norm": 1.523123466356383, + "language_loss": 0.81217003, + "learning_rate": 4.599023863537039e-07, + "loss": 0.83348954, + "num_input_tokens_seen": 282127290, + "step": 13079, + "time_per_iteration": 2.6527066230773926 + }, + { + "auxiliary_loss_clip": 0.01078236, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.03953731, + "balance_loss_mlp": 1.0202589, + "epoch": 0.7864121448970389, + "flos": 28910818920960.0, + "grad_norm": 1.8147971205749318, + "language_loss": 0.68534672, + "learning_rate": 4.596539448524146e-07, + "loss": 0.70645535, + "num_input_tokens_seen": 282147505, + "step": 13080, + "time_per_iteration": 2.7910823822021484 + }, + { + "auxiliary_loss_clip": 0.01099002, + "auxiliary_loss_mlp": 0.01034583, + "balance_loss_clip": 1.03815937, + "balance_loss_mlp": 1.02227473, + "epoch": 0.7864722681497069, + "flos": 19208833735680.0, + "grad_norm": 1.6728405689924877, + "language_loss": 0.69698668, + "learning_rate": 4.594055617612016e-07, + "loss": 0.71832252, + "num_input_tokens_seen": 282166450, + "step": 13081, + "time_per_iteration": 2.676067590713501 + }, + { + "auxiliary_loss_clip": 0.01086253, + "auxiliary_loss_mlp": 0.01035065, + "balance_loss_clip": 1.03589058, + "balance_loss_mlp": 1.0229888, + "epoch": 0.7865323914023749, + "flos": 21871573873920.0, + "grad_norm": 1.8288911392242622, + "language_loss": 0.68142998, + "learning_rate": 4.591572370894838e-07, + "loss": 0.70264316, + "num_input_tokens_seen": 282186465, + "step": 13082, + "time_per_iteration": 2.671044111251831 + }, + { + "auxiliary_loss_clip": 0.01081636, + "auxiliary_loss_mlp": 0.01036406, + "balance_loss_clip": 1.03695893, + "balance_loss_mlp": 1.02418661, + "epoch": 0.7865925146550429, + "flos": 25520313323520.0, + "grad_norm": 1.7965603666617915, + "language_loss": 0.66121304, + "learning_rate": 4.589089708466789e-07, + "loss": 0.68239349, + "num_input_tokens_seen": 282207180, + "step": 13083, + "time_per_iteration": 2.77585506439209 + }, + { + "auxiliary_loss_clip": 0.01089696, + "auxiliary_loss_mlp": 0.01030934, + "balance_loss_clip": 1.03773546, + "balance_loss_mlp": 1.01740384, + "epoch": 0.7866526379077108, + "flos": 19097366855040.0, + "grad_norm": 2.0746759351122614, + "language_loss": 0.74140465, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.76261097, + "num_input_tokens_seen": 282225865, + "step": 13084, + "time_per_iteration": 5.905508518218994 + }, + { + "auxiliary_loss_clip": 0.01083182, + "auxiliary_loss_mlp": 0.01037679, + "balance_loss_clip": 1.03682792, + "balance_loss_mlp": 1.02519202, + "epoch": 0.7867127611603788, + "flos": 16173771171840.0, + "grad_norm": 3.4926036147980635, + "language_loss": 0.70331782, + "learning_rate": 4.584126136854591e-07, + "loss": 0.72452641, + "num_input_tokens_seen": 282242895, + "step": 13085, + "time_per_iteration": 2.689375162124634 + }, + { + "auxiliary_loss_clip": 0.01086151, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.03600478, + "balance_loss_mlp": 1.01758742, + "epoch": 0.7867728844130467, + "flos": 20773640805120.0, + "grad_norm": 2.163841211360238, + "language_loss": 0.7244603, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.74562383, + "num_input_tokens_seen": 282260425, + "step": 13086, + "time_per_iteration": 4.172788381576538 + }, + { + "auxiliary_loss_clip": 0.01108157, + "auxiliary_loss_mlp": 0.01027186, + "balance_loss_clip": 1.03651989, + "balance_loss_mlp": 1.01503301, + "epoch": 0.7868330076657147, + "flos": 21760106993280.0, + "grad_norm": 1.9419848626775902, + "language_loss": 0.74776971, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.7691232, + "num_input_tokens_seen": 282279335, + "step": 13087, + "time_per_iteration": 2.695462465286255 + }, + { + "auxiliary_loss_clip": 0.01085975, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.03558397, + "balance_loss_mlp": 1.02015603, + "epoch": 0.7868931309183826, + "flos": 25700692446720.0, + "grad_norm": 1.58603589617711, + "language_loss": 0.71365935, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.73483884, + "num_input_tokens_seen": 282299905, + "step": 13088, + "time_per_iteration": 2.781475782394409 + }, + { + "auxiliary_loss_clip": 0.01029395, + "auxiliary_loss_mlp": 0.01003015, + "balance_loss_clip": 1.0068965, + "balance_loss_mlp": 1.0020256, + "epoch": 0.7869532541710507, + "flos": 64644883430400.0, + "grad_norm": 1.2260501594651212, + "language_loss": 0.55467439, + "learning_rate": 4.574206009240431e-07, + "loss": 0.5749985, + "num_input_tokens_seen": 282367620, + "step": 13089, + "time_per_iteration": 3.24120831489563 + }, + { + "auxiliary_loss_clip": 0.01017655, + "auxiliary_loss_mlp": 0.01001728, + "balance_loss_clip": 1.00651848, + "balance_loss_mlp": 1.0007323, + "epoch": 0.7870133774237186, + "flos": 67453600440960.0, + "grad_norm": 0.7641579994840295, + "language_loss": 0.49973857, + "learning_rate": 4.571727439470976e-07, + "loss": 0.51993239, + "num_input_tokens_seen": 282435695, + "step": 13090, + "time_per_iteration": 4.754423379898071 + }, + { + "auxiliary_loss_clip": 0.01099139, + "auxiliary_loss_mlp": 0.01031069, + "balance_loss_clip": 1.0383184, + "balance_loss_mlp": 1.01955974, + "epoch": 0.7870735006763866, + "flos": 26068310190720.0, + "grad_norm": 1.460212524446196, + "language_loss": 0.8408305, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.86213255, + "num_input_tokens_seen": 282456025, + "step": 13091, + "time_per_iteration": 2.6467459201812744 + }, + { + "auxiliary_loss_clip": 0.01019902, + "auxiliary_loss_mlp": 0.01003454, + "balance_loss_clip": 1.00713682, + "balance_loss_mlp": 1.00247598, + "epoch": 0.7871336239290546, + "flos": 70289572896000.0, + "grad_norm": 0.7147506558128559, + "language_loss": 0.64014363, + "learning_rate": 4.566772055150947e-07, + "loss": 0.6603772, + "num_input_tokens_seen": 282520995, + "step": 13092, + "time_per_iteration": 3.2051150798797607 + }, + { + "auxiliary_loss_clip": 0.01088327, + "auxiliary_loss_mlp": 0.01035505, + "balance_loss_clip": 1.03942823, + "balance_loss_mlp": 1.0227139, + "epoch": 0.7871937471817225, + "flos": 15778574760960.0, + "grad_norm": 2.3884379503568076, + "language_loss": 0.79189074, + "learning_rate": 4.564295240788285e-07, + "loss": 0.81312907, + "num_input_tokens_seen": 282539355, + "step": 13093, + "time_per_iteration": 2.7134079933166504 + }, + { + "auxiliary_loss_clip": 0.01080576, + "auxiliary_loss_mlp": 0.01028467, + "balance_loss_clip": 1.03772855, + "balance_loss_mlp": 1.01671863, + "epoch": 0.7872538704343905, + "flos": 20485242506880.0, + "grad_norm": 1.8523965571373735, + "language_loss": 0.75549555, + "learning_rate": 4.561819011749106e-07, + "loss": 0.77658594, + "num_input_tokens_seen": 282555735, + "step": 13094, + "time_per_iteration": 2.7055883407592773 + }, + { + "auxiliary_loss_clip": 0.01061535, + "auxiliary_loss_mlp": 0.01044464, + "balance_loss_clip": 1.03247035, + "balance_loss_mlp": 1.030725, + "epoch": 0.7873139936870585, + "flos": 25082670015360.0, + "grad_norm": 1.6047845480222185, + "language_loss": 0.79805398, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.81911397, + "num_input_tokens_seen": 282574550, + "step": 13095, + "time_per_iteration": 2.819106340408325 + }, + { + "auxiliary_loss_clip": 0.01098697, + "auxiliary_loss_mlp": 0.01032788, + "balance_loss_clip": 1.03697014, + "balance_loss_mlp": 1.02055073, + "epoch": 0.7873741169397265, + "flos": 30883176679680.0, + "grad_norm": 1.6143252232165546, + "language_loss": 0.67820108, + "learning_rate": 4.556868310016715e-07, + "loss": 0.69951594, + "num_input_tokens_seen": 282596520, + "step": 13096, + "time_per_iteration": 2.6944971084594727 + }, + { + "auxiliary_loss_clip": 0.01082196, + "auxiliary_loss_mlp": 0.01027058, + "balance_loss_clip": 1.0342679, + "balance_loss_mlp": 1.01628733, + "epoch": 0.7874342401923944, + "flos": 46791962242560.0, + "grad_norm": 1.8327451146164324, + "language_loss": 0.7056793, + "learning_rate": 4.55439383751125e-07, + "loss": 0.72677183, + "num_input_tokens_seen": 282620560, + "step": 13097, + "time_per_iteration": 2.969263792037964 + }, + { + "auxiliary_loss_clip": 0.01092033, + "auxiliary_loss_mlp": 0.01034929, + "balance_loss_clip": 1.04004323, + "balance_loss_mlp": 1.0221442, + "epoch": 0.7874943634450624, + "flos": 23584548545280.0, + "grad_norm": 1.6158173512871257, + "language_loss": 0.80720508, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.82847476, + "num_input_tokens_seen": 282639830, + "step": 13098, + "time_per_iteration": 2.7234272956848145 + }, + { + "auxiliary_loss_clip": 0.01069091, + "auxiliary_loss_mlp": 0.01031845, + "balance_loss_clip": 1.03451467, + "balance_loss_mlp": 1.02053809, + "epoch": 0.7875544866977303, + "flos": 20191169859840.0, + "grad_norm": 2.07716673704352, + "language_loss": 0.73976696, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.76077634, + "num_input_tokens_seen": 282660130, + "step": 13099, + "time_per_iteration": 2.7741127014160156 + }, + { + "auxiliary_loss_clip": 0.01087499, + "auxiliary_loss_mlp": 0.01024045, + "balance_loss_clip": 1.03627956, + "balance_loss_mlp": 1.01170659, + "epoch": 0.7876146099503983, + "flos": 22602571557120.0, + "grad_norm": 1.5896108161315186, + "language_loss": 0.78226274, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.80337822, + "num_input_tokens_seen": 282681125, + "step": 13100, + "time_per_iteration": 2.7259294986724854 + }, + { + "auxiliary_loss_clip": 0.01101593, + "auxiliary_loss_mlp": 0.00771735, + "balance_loss_clip": 1.03714919, + "balance_loss_mlp": 1.00031686, + "epoch": 0.7876747332030662, + "flos": 10705833555840.0, + "grad_norm": 3.3947108231001772, + "language_loss": 0.66015649, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.67888987, + "num_input_tokens_seen": 282696690, + "step": 13101, + "time_per_iteration": 2.6262006759643555 + }, + { + "auxiliary_loss_clip": 0.01086168, + "auxiliary_loss_mlp": 0.01030979, + "balance_loss_clip": 1.03619587, + "balance_loss_mlp": 1.01895058, + "epoch": 0.7877348564557343, + "flos": 38399315621760.0, + "grad_norm": 1.4292814509281728, + "language_loss": 0.77840889, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.79958034, + "num_input_tokens_seen": 282721210, + "step": 13102, + "time_per_iteration": 3.016707420349121 + }, + { + "auxiliary_loss_clip": 0.01096566, + "auxiliary_loss_mlp": 0.01040471, + "balance_loss_clip": 1.0358392, + "balance_loss_mlp": 1.02863932, + "epoch": 0.7877949797084022, + "flos": 18329524796160.0, + "grad_norm": 1.7485518464366943, + "language_loss": 0.82362533, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.84499568, + "num_input_tokens_seen": 282738505, + "step": 13103, + "time_per_iteration": 2.6577935218811035 + }, + { + "auxiliary_loss_clip": 0.01101133, + "auxiliary_loss_mlp": 0.01033792, + "balance_loss_clip": 1.03859389, + "balance_loss_mlp": 1.02039886, + "epoch": 0.7878551029610702, + "flos": 25806736373760.0, + "grad_norm": 3.304808366824196, + "language_loss": 0.8070327, + "learning_rate": 4.537088934794913e-07, + "loss": 0.8283819, + "num_input_tokens_seen": 282756895, + "step": 13104, + "time_per_iteration": 2.680666923522949 + }, + { + "auxiliary_loss_clip": 0.01111584, + "auxiliary_loss_mlp": 0.01034583, + "balance_loss_clip": 1.03829467, + "balance_loss_mlp": 1.02250695, + "epoch": 0.7879152262137382, + "flos": 22342685679360.0, + "grad_norm": 1.6276257181376157, + "language_loss": 0.74308252, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.76454425, + "num_input_tokens_seen": 282774955, + "step": 13105, + "time_per_iteration": 2.5943186283111572 + }, + { + "auxiliary_loss_clip": 0.0105328, + "auxiliary_loss_mlp": 0.0104138, + "balance_loss_clip": 1.03382134, + "balance_loss_mlp": 1.02832067, + "epoch": 0.7879753494664061, + "flos": 24785329230720.0, + "grad_norm": 1.561193248297936, + "language_loss": 0.75636542, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.77731198, + "num_input_tokens_seen": 282793165, + "step": 13106, + "time_per_iteration": 2.8052754402160645 + }, + { + "auxiliary_loss_clip": 0.01060642, + "auxiliary_loss_mlp": 0.01033724, + "balance_loss_clip": 1.0368247, + "balance_loss_mlp": 1.02129078, + "epoch": 0.7880354727190741, + "flos": 16909078487040.0, + "grad_norm": 2.2640209986182116, + "language_loss": 0.73844689, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.75939053, + "num_input_tokens_seen": 282809820, + "step": 13107, + "time_per_iteration": 2.7168357372283936 + }, + { + "auxiliary_loss_clip": 0.01109075, + "auxiliary_loss_mlp": 0.01034958, + "balance_loss_clip": 1.03867579, + "balance_loss_mlp": 1.02291143, + "epoch": 0.7880955959717421, + "flos": 22230500526720.0, + "grad_norm": 1.5353613262891537, + "language_loss": 0.73295653, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.7543968, + "num_input_tokens_seen": 282828600, + "step": 13108, + "time_per_iteration": 2.6911845207214355 + }, + { + "auxiliary_loss_clip": 0.01029486, + "auxiliary_loss_mlp": 0.00999387, + "balance_loss_clip": 1.00682902, + "balance_loss_mlp": 0.99848729, + "epoch": 0.7881557192244101, + "flos": 69183200131200.0, + "grad_norm": 0.890062717819184, + "language_loss": 0.60359526, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.62388396, + "num_input_tokens_seen": 282882775, + "step": 13109, + "time_per_iteration": 3.113757610321045 + }, + { + "auxiliary_loss_clip": 0.01067084, + "auxiliary_loss_mlp": 0.010294, + "balance_loss_clip": 1.03637147, + "balance_loss_mlp": 1.01732993, + "epoch": 0.788215842477078, + "flos": 24935436167040.0, + "grad_norm": 1.6561185443626747, + "language_loss": 0.72235435, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.74331915, + "num_input_tokens_seen": 282902680, + "step": 13110, + "time_per_iteration": 2.7759180068969727 + }, + { + "auxiliary_loss_clip": 0.01056492, + "auxiliary_loss_mlp": 0.01030136, + "balance_loss_clip": 1.03376198, + "balance_loss_mlp": 1.01843548, + "epoch": 0.788275965729746, + "flos": 26106483369600.0, + "grad_norm": 1.3819740231055346, + "language_loss": 0.75173604, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.77260238, + "num_input_tokens_seen": 282923625, + "step": 13111, + "time_per_iteration": 2.840644121170044 + }, + { + "auxiliary_loss_clip": 0.01094246, + "auxiliary_loss_mlp": 0.01035923, + "balance_loss_clip": 1.03667474, + "balance_loss_mlp": 1.02317989, + "epoch": 0.7883360889824139, + "flos": 21214803646080.0, + "grad_norm": 2.288432261799451, + "language_loss": 0.61037534, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.63167697, + "num_input_tokens_seen": 282941955, + "step": 13112, + "time_per_iteration": 2.673748016357422 + }, + { + "auxiliary_loss_clip": 0.01089796, + "auxiliary_loss_mlp": 0.0103157, + "balance_loss_clip": 1.03910899, + "balance_loss_mlp": 1.01814699, + "epoch": 0.7883962122350819, + "flos": 21142551438720.0, + "grad_norm": 1.825503520806994, + "language_loss": 0.67753619, + "learning_rate": 4.514881996216644e-07, + "loss": 0.69874984, + "num_input_tokens_seen": 282961280, + "step": 13113, + "time_per_iteration": 2.6813149452209473 + }, + { + "auxiliary_loss_clip": 0.01069296, + "auxiliary_loss_mlp": 0.01035093, + "balance_loss_clip": 1.0344131, + "balance_loss_mlp": 1.02270675, + "epoch": 0.7884563354877498, + "flos": 15302901928320.0, + "grad_norm": 3.4397675156813867, + "language_loss": 0.5793888, + "learning_rate": 4.5124174933361e-07, + "loss": 0.60043263, + "num_input_tokens_seen": 282978210, + "step": 13114, + "time_per_iteration": 2.7150933742523193 + }, + { + "auxiliary_loss_clip": 0.01062606, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.03754115, + "balance_loss_mlp": 1.01891208, + "epoch": 0.7885164587404179, + "flos": 24388301226240.0, + "grad_norm": 1.5845799743186602, + "language_loss": 0.67243695, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.69338286, + "num_input_tokens_seen": 282998845, + "step": 13115, + "time_per_iteration": 2.80094575881958 + }, + { + "auxiliary_loss_clip": 0.01083933, + "auxiliary_loss_mlp": 0.01040208, + "balance_loss_clip": 1.03556573, + "balance_loss_mlp": 1.02654052, + "epoch": 0.7885765819930858, + "flos": 14385886686720.0, + "grad_norm": 2.573676201829806, + "language_loss": 0.88785017, + "learning_rate": 4.50749024954048e-07, + "loss": 0.90909165, + "num_input_tokens_seen": 283015200, + "step": 13116, + "time_per_iteration": 2.8118736743927 + }, + { + "auxiliary_loss_clip": 0.01093449, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.03728342, + "balance_loss_mlp": 1.02034712, + "epoch": 0.7886367052457538, + "flos": 18259930195200.0, + "grad_norm": 2.1380250897449384, + "language_loss": 0.72576118, + "learning_rate": 4.505027508812245e-07, + "loss": 0.74703431, + "num_input_tokens_seen": 283033680, + "step": 13117, + "time_per_iteration": 2.782005786895752 + }, + { + "auxiliary_loss_clip": 0.01096232, + "auxiliary_loss_mlp": 0.01027109, + "balance_loss_clip": 1.03812051, + "balance_loss_mlp": 1.01586211, + "epoch": 0.7886968284984217, + "flos": 15305092657920.0, + "grad_norm": 1.6421996108060435, + "language_loss": 0.79999858, + "learning_rate": 4.502565355654926e-07, + "loss": 0.82123202, + "num_input_tokens_seen": 283050620, + "step": 13118, + "time_per_iteration": 2.678349256515503 + }, + { + "auxiliary_loss_clip": 0.01097412, + "auxiliary_loss_mlp": 0.01028112, + "balance_loss_clip": 1.03808641, + "balance_loss_mlp": 1.01605964, + "epoch": 0.7887569517510897, + "flos": 21215450090880.0, + "grad_norm": 1.6890691063161838, + "language_loss": 0.72958535, + "learning_rate": 4.500103790161878e-07, + "loss": 0.75084054, + "num_input_tokens_seen": 283070215, + "step": 13119, + "time_per_iteration": 2.7472004890441895 + }, + { + "auxiliary_loss_clip": 0.01095693, + "auxiliary_loss_mlp": 0.01028074, + "balance_loss_clip": 1.03482223, + "balance_loss_mlp": 1.01517558, + "epoch": 0.7888170750037578, + "flos": 22711237176960.0, + "grad_norm": 3.3903571989834584, + "language_loss": 0.71983945, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.74107713, + "num_input_tokens_seen": 283091485, + "step": 13120, + "time_per_iteration": 2.82316517829895 + }, + { + "auxiliary_loss_clip": 0.01081982, + "auxiliary_loss_mlp": 0.007726, + "balance_loss_clip": 1.03474998, + "balance_loss_mlp": 1.00026715, + "epoch": 0.7888771982564257, + "flos": 36429148592640.0, + "grad_norm": 1.5160777676600576, + "language_loss": 0.79098976, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.80953562, + "num_input_tokens_seen": 283115040, + "step": 13121, + "time_per_iteration": 2.8498284816741943 + }, + { + "auxiliary_loss_clip": 0.01095183, + "auxiliary_loss_mlp": 0.01030106, + "balance_loss_clip": 1.0355587, + "balance_loss_mlp": 1.01765466, + "epoch": 0.7889373215090937, + "flos": 27309993488640.0, + "grad_norm": 1.3811288834626105, + "language_loss": 0.80475199, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.82600486, + "num_input_tokens_seen": 283136925, + "step": 13122, + "time_per_iteration": 2.667525053024292 + }, + { + "auxiliary_loss_clip": 0.01081111, + "auxiliary_loss_mlp": 0.01026345, + "balance_loss_clip": 1.03613377, + "balance_loss_mlp": 1.01491308, + "epoch": 0.7889974447617616, + "flos": 19829010983040.0, + "grad_norm": 1.947347999480454, + "language_loss": 0.78504455, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.8061192, + "num_input_tokens_seen": 283155725, + "step": 13123, + "time_per_iteration": 5.875938653945923 + }, + { + "auxiliary_loss_clip": 0.0109205, + "auxiliary_loss_mlp": 0.01034554, + "balance_loss_clip": 1.04389477, + "balance_loss_mlp": 1.02196002, + "epoch": 0.7890575680144296, + "flos": 17271201450240.0, + "grad_norm": 1.9573332964647796, + "language_loss": 0.67213017, + "learning_rate": 4.487804780926985e-07, + "loss": 0.69339627, + "num_input_tokens_seen": 283173845, + "step": 13124, + "time_per_iteration": 4.206716775894165 + }, + { + "auxiliary_loss_clip": 0.01087652, + "auxiliary_loss_mlp": 0.01025366, + "balance_loss_clip": 1.03578448, + "balance_loss_mlp": 1.01191306, + "epoch": 0.7891176912670975, + "flos": 27600151553280.0, + "grad_norm": 2.308329967659437, + "language_loss": 0.72559512, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.74672532, + "num_input_tokens_seen": 283191985, + "step": 13125, + "time_per_iteration": 2.7699477672576904 + }, + { + "auxiliary_loss_clip": 0.01092333, + "auxiliary_loss_mlp": 0.01028843, + "balance_loss_clip": 1.03605413, + "balance_loss_mlp": 1.01586151, + "epoch": 0.7891778145197655, + "flos": 22711668140160.0, + "grad_norm": 1.8181427406883512, + "language_loss": 0.72330505, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.74451685, + "num_input_tokens_seen": 283210855, + "step": 13126, + "time_per_iteration": 2.799743413925171 + }, + { + "auxiliary_loss_clip": 0.01091919, + "auxiliary_loss_mlp": 0.01031202, + "balance_loss_clip": 1.03676748, + "balance_loss_mlp": 1.01820195, + "epoch": 0.7892379377724335, + "flos": 17310775259520.0, + "grad_norm": 1.9171689494151543, + "language_loss": 0.76746297, + "learning_rate": 4.480432433327845e-07, + "loss": 0.78869414, + "num_input_tokens_seen": 283229665, + "step": 13127, + "time_per_iteration": 2.6769402027130127 + }, + { + "auxiliary_loss_clip": 0.0109264, + "auxiliary_loss_mlp": 0.01040923, + "balance_loss_clip": 1.03622723, + "balance_loss_mlp": 1.02709436, + "epoch": 0.7892980610251015, + "flos": 25775674087680.0, + "grad_norm": 1.6866494650381205, + "language_loss": 0.85712594, + "learning_rate": 4.47797616101103e-07, + "loss": 0.87846154, + "num_input_tokens_seen": 283248615, + "step": 13128, + "time_per_iteration": 2.6580183506011963 + }, + { + "auxiliary_loss_clip": 0.0109824, + "auxiliary_loss_mlp": 0.01037637, + "balance_loss_clip": 1.03702545, + "balance_loss_mlp": 1.02604949, + "epoch": 0.7893581842777694, + "flos": 21579943351680.0, + "grad_norm": 2.375306290130731, + "language_loss": 0.69267899, + "learning_rate": 4.475520477290904e-07, + "loss": 0.71403778, + "num_input_tokens_seen": 283267135, + "step": 13129, + "time_per_iteration": 2.736177682876587 + }, + { + "auxiliary_loss_clip": 0.01020095, + "auxiliary_loss_mlp": 0.01001956, + "balance_loss_clip": 1.00642443, + "balance_loss_mlp": 1.00062704, + "epoch": 0.7894183075304374, + "flos": 69016468176000.0, + "grad_norm": 0.7134870194246187, + "language_loss": 0.61555952, + "learning_rate": 4.473065382260597e-07, + "loss": 0.63578004, + "num_input_tokens_seen": 283328940, + "step": 13130, + "time_per_iteration": 4.797807216644287 + }, + { + "auxiliary_loss_clip": 0.0110005, + "auxiliary_loss_mlp": 0.01028381, + "balance_loss_clip": 1.03902447, + "balance_loss_mlp": 1.01690102, + "epoch": 0.7894784307831053, + "flos": 24243258107520.0, + "grad_norm": 1.9168458838285078, + "language_loss": 0.73797166, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.75925595, + "num_input_tokens_seen": 283350000, + "step": 13131, + "time_per_iteration": 2.7573840618133545 + }, + { + "auxiliary_loss_clip": 0.01088103, + "auxiliary_loss_mlp": 0.01026242, + "balance_loss_clip": 1.0371995, + "balance_loss_mlp": 1.01223469, + "epoch": 0.7895385540357733, + "flos": 20266546550400.0, + "grad_norm": 2.4133377950586676, + "language_loss": 0.68751633, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.70865989, + "num_input_tokens_seen": 283368020, + "step": 13132, + "time_per_iteration": 2.719820499420166 + }, + { + "auxiliary_loss_clip": 0.01101541, + "auxiliary_loss_mlp": 0.01040122, + "balance_loss_clip": 1.03842628, + "balance_loss_mlp": 1.02676463, + "epoch": 0.7895986772884414, + "flos": 20996574566400.0, + "grad_norm": 2.9264754072085104, + "language_loss": 0.62335461, + "learning_rate": 4.465703630239468e-07, + "loss": 0.64477122, + "num_input_tokens_seen": 283387030, + "step": 13133, + "time_per_iteration": 2.6314589977264404 + }, + { + "auxiliary_loss_clip": 0.01079478, + "auxiliary_loss_mlp": 0.01037851, + "balance_loss_clip": 1.03612971, + "balance_loss_mlp": 1.02386165, + "epoch": 0.7896588005411093, + "flos": 18657999694080.0, + "grad_norm": 3.367198830526819, + "language_loss": 0.7950719, + "learning_rate": 4.463250890899195e-07, + "loss": 0.8162452, + "num_input_tokens_seen": 283402090, + "step": 13134, + "time_per_iteration": 2.7504961490631104 + }, + { + "auxiliary_loss_clip": 0.0109746, + "auxiliary_loss_mlp": 0.01032763, + "balance_loss_clip": 1.03501463, + "balance_loss_mlp": 1.02011466, + "epoch": 0.7897189237937773, + "flos": 18405907067520.0, + "grad_norm": 1.8328144041845063, + "language_loss": 0.80414212, + "learning_rate": 4.460798740713998e-07, + "loss": 0.82544434, + "num_input_tokens_seen": 283421035, + "step": 13135, + "time_per_iteration": 2.666182518005371 + }, + { + "auxiliary_loss_clip": 0.01097147, + "auxiliary_loss_mlp": 0.0103152, + "balance_loss_clip": 1.0373044, + "balance_loss_mlp": 1.01890731, + "epoch": 0.7897790470464452, + "flos": 23731602825600.0, + "grad_norm": 1.9348385982052458, + "language_loss": 0.72716129, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.7484479, + "num_input_tokens_seen": 283441830, + "step": 13136, + "time_per_iteration": 2.643087387084961 + }, + { + "auxiliary_loss_clip": 0.01115705, + "auxiliary_loss_mlp": 0.01034132, + "balance_loss_clip": 1.03773975, + "balance_loss_mlp": 1.02081013, + "epoch": 0.7898391702991132, + "flos": 15918949111680.0, + "grad_norm": 5.084496642242111, + "language_loss": 0.70505196, + "learning_rate": 4.455896208180778e-07, + "loss": 0.72655034, + "num_input_tokens_seen": 283459540, + "step": 13137, + "time_per_iteration": 2.584527015686035 + }, + { + "auxiliary_loss_clip": 0.01108112, + "auxiliary_loss_mlp": 0.01035486, + "balance_loss_clip": 1.03718948, + "balance_loss_mlp": 1.02206349, + "epoch": 0.7898992935517811, + "flos": 19829046896640.0, + "grad_norm": 1.7127744511556113, + "language_loss": 0.73933578, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.76077175, + "num_input_tokens_seen": 283478790, + "step": 13138, + "time_per_iteration": 2.7276523113250732 + }, + { + "auxiliary_loss_clip": 0.01070823, + "auxiliary_loss_mlp": 0.01031914, + "balance_loss_clip": 1.03749275, + "balance_loss_mlp": 1.01971924, + "epoch": 0.7899594168044491, + "flos": 16216253982720.0, + "grad_norm": 1.9590714056368506, + "language_loss": 0.68501168, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.70603907, + "num_input_tokens_seen": 283495720, + "step": 13139, + "time_per_iteration": 2.7639269828796387 + }, + { + "auxiliary_loss_clip": 0.01021477, + "auxiliary_loss_mlp": 0.01001215, + "balance_loss_clip": 1.00810361, + "balance_loss_mlp": 1.00014842, + "epoch": 0.790019540057117, + "flos": 68331005959680.0, + "grad_norm": 0.8505295432368817, + "language_loss": 0.60203749, + "learning_rate": 4.448546830368003e-07, + "loss": 0.62226439, + "num_input_tokens_seen": 283558795, + "step": 13140, + "time_per_iteration": 3.293804168701172 + }, + { + "auxiliary_loss_clip": 0.01111705, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.03908968, + "balance_loss_mlp": 1.02385402, + "epoch": 0.7900796633097851, + "flos": 30332773601280.0, + "grad_norm": 1.6223884699668718, + "language_loss": 0.76106548, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.78255159, + "num_input_tokens_seen": 283579305, + "step": 13141, + "time_per_iteration": 2.753269672393799 + }, + { + "auxiliary_loss_clip": 0.01101932, + "auxiliary_loss_mlp": 0.01036808, + "balance_loss_clip": 1.03863978, + "balance_loss_mlp": 1.02401733, + "epoch": 0.790139786562453, + "flos": 22126790983680.0, + "grad_norm": 2.0698981191981978, + "language_loss": 0.68995577, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.71134317, + "num_input_tokens_seen": 283597840, + "step": 13142, + "time_per_iteration": 2.682314872741699 + }, + { + "auxiliary_loss_clip": 0.00984677, + "auxiliary_loss_mlp": 0.01013212, + "balance_loss_clip": 1.01147008, + "balance_loss_mlp": 1.01161504, + "epoch": 0.790199909815121, + "flos": 58207284213120.0, + "grad_norm": 0.8339263340003221, + "language_loss": 0.59981745, + "learning_rate": 4.441202759969049e-07, + "loss": 0.61979634, + "num_input_tokens_seen": 283647950, + "step": 13143, + "time_per_iteration": 3.278980255126953 + }, + { + "auxiliary_loss_clip": 0.01082841, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.03883827, + "balance_loss_mlp": 1.02172852, + "epoch": 0.7902600330677889, + "flos": 34533316759680.0, + "grad_norm": 1.6349862086854898, + "language_loss": 0.74675769, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.76793671, + "num_input_tokens_seen": 283670645, + "step": 13144, + "time_per_iteration": 3.294663429260254 + }, + { + "auxiliary_loss_clip": 0.01103742, + "auxiliary_loss_mlp": 0.01036273, + "balance_loss_clip": 1.03867149, + "balance_loss_mlp": 1.02252793, + "epoch": 0.7903201563204569, + "flos": 22346384780160.0, + "grad_norm": 2.139554645223281, + "language_loss": 0.82848895, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.84988904, + "num_input_tokens_seen": 283688830, + "step": 13145, + "time_per_iteration": 2.7851741313934326 + }, + { + "auxiliary_loss_clip": 0.01095507, + "auxiliary_loss_mlp": 0.01030248, + "balance_loss_clip": 1.0367043, + "balance_loss_mlp": 1.01874435, + "epoch": 0.790380279573125, + "flos": 22053533195520.0, + "grad_norm": 1.5468904439953068, + "language_loss": 0.73388755, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.75514507, + "num_input_tokens_seen": 283708625, + "step": 13146, + "time_per_iteration": 2.65710186958313 + }, + { + "auxiliary_loss_clip": 0.01111662, + "auxiliary_loss_mlp": 0.01028915, + "balance_loss_clip": 1.03781211, + "balance_loss_mlp": 1.01685667, + "epoch": 0.7904404028257929, + "flos": 20302600826880.0, + "grad_norm": 1.8467569642796249, + "language_loss": 0.75617737, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.77758318, + "num_input_tokens_seen": 283725710, + "step": 13147, + "time_per_iteration": 2.7460520267486572 + }, + { + "auxiliary_loss_clip": 0.01091922, + "auxiliary_loss_mlp": 0.0103933, + "balance_loss_clip": 1.03564286, + "balance_loss_mlp": 1.02576411, + "epoch": 0.7905005260784609, + "flos": 20008923229440.0, + "grad_norm": 1.7581550780117867, + "language_loss": 0.72203916, + "learning_rate": 4.428974443697087e-07, + "loss": 0.7433517, + "num_input_tokens_seen": 283744150, + "step": 13148, + "time_per_iteration": 2.6912500858306885 + }, + { + "auxiliary_loss_clip": 0.01095913, + "auxiliary_loss_mlp": 0.01030776, + "balance_loss_clip": 1.03445816, + "balance_loss_mlp": 1.01777613, + "epoch": 0.7905606493311288, + "flos": 26905926418560.0, + "grad_norm": 1.814389925772028, + "language_loss": 0.71692038, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.73818725, + "num_input_tokens_seen": 283764170, + "step": 13149, + "time_per_iteration": 2.800591230392456 + }, + { + "auxiliary_loss_clip": 0.01074802, + "auxiliary_loss_mlp": 0.01034207, + "balance_loss_clip": 1.03384662, + "balance_loss_mlp": 1.02023542, + "epoch": 0.7906207725837968, + "flos": 23696230907520.0, + "grad_norm": 2.263262344883557, + "language_loss": 0.65186799, + "learning_rate": 4.424087249723225e-07, + "loss": 0.67295814, + "num_input_tokens_seen": 283784305, + "step": 13150, + "time_per_iteration": 2.774513006210327 + }, + { + "auxiliary_loss_clip": 0.01108732, + "auxiliary_loss_mlp": 0.01033548, + "balance_loss_clip": 1.03688979, + "balance_loss_mlp": 1.02138877, + "epoch": 0.7906808958364647, + "flos": 20848837927680.0, + "grad_norm": 2.4892944447292065, + "language_loss": 0.70353788, + "learning_rate": 4.421644538650231e-07, + "loss": 0.72496063, + "num_input_tokens_seen": 283804040, + "step": 13151, + "time_per_iteration": 2.624737024307251 + }, + { + "auxiliary_loss_clip": 0.01091472, + "auxiliary_loss_mlp": 0.0103796, + "balance_loss_clip": 1.03773379, + "balance_loss_mlp": 1.02501988, + "epoch": 0.7907410190891327, + "flos": 40735196974080.0, + "grad_norm": 1.643411919564688, + "language_loss": 0.70038378, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.72167814, + "num_input_tokens_seen": 283827120, + "step": 13152, + "time_per_iteration": 2.820726156234741 + }, + { + "auxiliary_loss_clip": 0.01076957, + "auxiliary_loss_mlp": 0.00770237, + "balance_loss_clip": 1.03583848, + "balance_loss_mlp": 1.00032854, + "epoch": 0.7908011423418007, + "flos": 13261165050240.0, + "grad_norm": 2.5235845787272972, + "language_loss": 0.72838122, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.74685311, + "num_input_tokens_seen": 283844820, + "step": 13153, + "time_per_iteration": 2.782799005508423 + }, + { + "auxiliary_loss_clip": 0.01109362, + "auxiliary_loss_mlp": 0.01027556, + "balance_loss_clip": 1.0372107, + "balance_loss_mlp": 1.01542032, + "epoch": 0.7908612655944687, + "flos": 19754747614080.0, + "grad_norm": 1.5411567451067254, + "language_loss": 0.78878421, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.81015342, + "num_input_tokens_seen": 283862870, + "step": 13154, + "time_per_iteration": 2.617465019226074 + }, + { + "auxiliary_loss_clip": 0.01106383, + "auxiliary_loss_mlp": 0.01030654, + "balance_loss_clip": 1.03864491, + "balance_loss_mlp": 1.0168618, + "epoch": 0.7909213888471366, + "flos": 21287738211840.0, + "grad_norm": 2.826426218857978, + "language_loss": 0.7024678, + "learning_rate": 4.411879602612185e-07, + "loss": 0.72383815, + "num_input_tokens_seen": 283882405, + "step": 13155, + "time_per_iteration": 2.60141658782959 + }, + { + "auxiliary_loss_clip": 0.01110954, + "auxiliary_loss_mlp": 0.01030043, + "balance_loss_clip": 1.03789937, + "balance_loss_mlp": 1.01748431, + "epoch": 0.7909815120998046, + "flos": 22528882805760.0, + "grad_norm": 1.6493957316701613, + "language_loss": 0.76920623, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.79061615, + "num_input_tokens_seen": 283902070, + "step": 13156, + "time_per_iteration": 2.616990327835083 + }, + { + "auxiliary_loss_clip": 0.01077807, + "auxiliary_loss_mlp": 0.01032296, + "balance_loss_clip": 1.03416896, + "balance_loss_mlp": 1.02008295, + "epoch": 0.7910416353524725, + "flos": 26727702111360.0, + "grad_norm": 1.6152194898453356, + "language_loss": 0.65486753, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.67596853, + "num_input_tokens_seen": 283924100, + "step": 13157, + "time_per_iteration": 2.7800040245056152 + }, + { + "auxiliary_loss_clip": 0.01098205, + "auxiliary_loss_mlp": 0.01038516, + "balance_loss_clip": 1.03559875, + "balance_loss_mlp": 1.02468824, + "epoch": 0.7911017586051405, + "flos": 24644847139200.0, + "grad_norm": 1.6816257658835039, + "language_loss": 0.74068034, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.76204759, + "num_input_tokens_seen": 283944955, + "step": 13158, + "time_per_iteration": 2.6075475215911865 + }, + { + "auxiliary_loss_clip": 0.01095673, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.0357399, + "balance_loss_mlp": 1.02176023, + "epoch": 0.7911618818578086, + "flos": 17565489578880.0, + "grad_norm": 2.030035460018427, + "language_loss": 0.67612302, + "learning_rate": 4.40212412422309e-07, + "loss": 0.69741368, + "num_input_tokens_seen": 283963125, + "step": 13159, + "time_per_iteration": 2.6242077350616455 + }, + { + "auxiliary_loss_clip": 0.01098583, + "auxiliary_loss_mlp": 0.01035004, + "balance_loss_clip": 1.03775477, + "balance_loss_mlp": 1.02250552, + "epoch": 0.7912220051104765, + "flos": 16721660298240.0, + "grad_norm": 3.313195141465383, + "language_loss": 0.67271805, + "learning_rate": 4.399686733077206e-07, + "loss": 0.69405401, + "num_input_tokens_seen": 283982850, + "step": 13160, + "time_per_iteration": 2.75685715675354 + }, + { + "auxiliary_loss_clip": 0.0108344, + "auxiliary_loss_mlp": 0.01027351, + "balance_loss_clip": 1.03476191, + "balance_loss_mlp": 1.01664615, + "epoch": 0.7912821283631445, + "flos": 13698736531200.0, + "grad_norm": 2.063884011157957, + "language_loss": 0.72593331, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.74704123, + "num_input_tokens_seen": 283998275, + "step": 13161, + "time_per_iteration": 2.6084799766540527 + }, + { + "auxiliary_loss_clip": 0.01080502, + "auxiliary_loss_mlp": 0.01033054, + "balance_loss_clip": 1.03568757, + "balance_loss_mlp": 1.02046573, + "epoch": 0.7913422516158124, + "flos": 23769021818880.0, + "grad_norm": 1.6120052582411066, + "language_loss": 0.73379862, + "learning_rate": 4.39481372557418e-07, + "loss": 0.75493419, + "num_input_tokens_seen": 284018750, + "step": 13162, + "time_per_iteration": 6.126726865768433 + }, + { + "auxiliary_loss_clip": 0.01089834, + "auxiliary_loss_mlp": 0.01032531, + "balance_loss_clip": 1.03729248, + "balance_loss_mlp": 1.01965666, + "epoch": 0.7914023748684804, + "flos": 19938251220480.0, + "grad_norm": 1.9889723389835698, + "language_loss": 0.71760178, + "learning_rate": 4.392378109401811e-07, + "loss": 0.73882544, + "num_input_tokens_seen": 284037850, + "step": 13163, + "time_per_iteration": 4.413632869720459 + }, + { + "auxiliary_loss_clip": 0.01075124, + "auxiliary_loss_mlp": 0.01031765, + "balance_loss_clip": 1.03465581, + "balance_loss_mlp": 1.01800179, + "epoch": 0.7914624981211483, + "flos": 20594805966720.0, + "grad_norm": 1.8803473960616024, + "language_loss": 0.70246696, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.72353578, + "num_input_tokens_seen": 284056380, + "step": 13164, + "time_per_iteration": 2.698758840560913 + }, + { + "auxiliary_loss_clip": 0.01070741, + "auxiliary_loss_mlp": 0.01037319, + "balance_loss_clip": 1.0364182, + "balance_loss_mlp": 1.02521276, + "epoch": 0.7915226213738163, + "flos": 21799465320960.0, + "grad_norm": 1.885675562841956, + "language_loss": 0.67027831, + "learning_rate": 4.387508652677177e-07, + "loss": 0.69135886, + "num_input_tokens_seen": 284074945, + "step": 13165, + "time_per_iteration": 2.74423885345459 + }, + { + "auxiliary_loss_clip": 0.01062193, + "auxiliary_loss_mlp": 0.01027491, + "balance_loss_clip": 1.0360235, + "balance_loss_mlp": 1.0160346, + "epoch": 0.7915827446264843, + "flos": 16288362535680.0, + "grad_norm": 2.5652044967821563, + "language_loss": 0.72134489, + "learning_rate": 4.385074812309557e-07, + "loss": 0.74224174, + "num_input_tokens_seen": 284092070, + "step": 13166, + "time_per_iteration": 2.74450421333313 + }, + { + "auxiliary_loss_clip": 0.01107168, + "auxiliary_loss_mlp": 0.01033777, + "balance_loss_clip": 1.03592849, + "balance_loss_mlp": 1.02065766, + "epoch": 0.7916428678791523, + "flos": 25702595867520.0, + "grad_norm": 1.6924622649146908, + "language_loss": 0.77245665, + "learning_rate": 4.382641564061462e-07, + "loss": 0.79386616, + "num_input_tokens_seen": 284112255, + "step": 13167, + "time_per_iteration": 2.6304922103881836 + }, + { + "auxiliary_loss_clip": 0.01074373, + "auxiliary_loss_mlp": 0.01032393, + "balance_loss_clip": 1.03654242, + "balance_loss_mlp": 1.02080607, + "epoch": 0.7917029911318202, + "flos": 23878513451520.0, + "grad_norm": 1.5572430197509217, + "language_loss": 0.8423599, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.86342752, + "num_input_tokens_seen": 284132330, + "step": 13168, + "time_per_iteration": 2.7429237365722656 + }, + { + "auxiliary_loss_clip": 0.011112, + "auxiliary_loss_mlp": 0.01031479, + "balance_loss_clip": 1.03944874, + "balance_loss_mlp": 1.01902127, + "epoch": 0.7917631143844882, + "flos": 21646593037440.0, + "grad_norm": 1.5464810747568485, + "language_loss": 0.72668618, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.74811298, + "num_input_tokens_seen": 284150640, + "step": 13169, + "time_per_iteration": 2.6592273712158203 + }, + { + "auxiliary_loss_clip": 0.01112278, + "auxiliary_loss_mlp": 0.01034078, + "balance_loss_clip": 1.03776097, + "balance_loss_mlp": 1.02096534, + "epoch": 0.7918232376371561, + "flos": 38874198355200.0, + "grad_norm": 3.0164907954915856, + "language_loss": 0.67173648, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.69320005, + "num_input_tokens_seen": 284171910, + "step": 13170, + "time_per_iteration": 4.270065546035767 + }, + { + "auxiliary_loss_clip": 0.01098461, + "auxiliary_loss_mlp": 0.01026319, + "balance_loss_clip": 1.03575373, + "balance_loss_mlp": 1.01436245, + "epoch": 0.7918833608898241, + "flos": 20775544225920.0, + "grad_norm": 1.6549225426524543, + "language_loss": 0.70591486, + "learning_rate": 4.372914494109412e-07, + "loss": 0.72716266, + "num_input_tokens_seen": 284191340, + "step": 13171, + "time_per_iteration": 2.6470091342926025 + }, + { + "auxiliary_loss_clip": 0.01097608, + "auxiliary_loss_mlp": 0.01030463, + "balance_loss_clip": 1.03621912, + "balance_loss_mlp": 1.01798749, + "epoch": 0.7919434841424922, + "flos": 33910122769920.0, + "grad_norm": 3.018313579930399, + "language_loss": 0.67142022, + "learning_rate": 4.370484207842553e-07, + "loss": 0.69270092, + "num_input_tokens_seen": 284212495, + "step": 13172, + "time_per_iteration": 2.7242603302001953 + }, + { + "auxiliary_loss_clip": 0.01083539, + "auxiliary_loss_mlp": 0.01033715, + "balance_loss_clip": 1.03492141, + "balance_loss_mlp": 1.02068591, + "epoch": 0.7920036073951601, + "flos": 21064660796160.0, + "grad_norm": 2.177677653156343, + "language_loss": 0.79725873, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.81843126, + "num_input_tokens_seen": 284230825, + "step": 13173, + "time_per_iteration": 2.6997551918029785 + }, + { + "auxiliary_loss_clip": 0.01071714, + "auxiliary_loss_mlp": 0.01038162, + "balance_loss_clip": 1.03270435, + "balance_loss_mlp": 1.02527571, + "epoch": 0.7920637306478281, + "flos": 23655974739840.0, + "grad_norm": 2.0997194022490038, + "language_loss": 0.76738131, + "learning_rate": 4.365625413419365e-07, + "loss": 0.78848016, + "num_input_tokens_seen": 284250365, + "step": 13174, + "time_per_iteration": 2.8328940868377686 + }, + { + "auxiliary_loss_clip": 0.01083806, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.03376579, + "balance_loss_mlp": 1.02280629, + "epoch": 0.792123853900496, + "flos": 27195438038400.0, + "grad_norm": 2.433097475426471, + "language_loss": 0.71779603, + "learning_rate": 4.363196905447297e-07, + "loss": 0.73897892, + "num_input_tokens_seen": 284269635, + "step": 13175, + "time_per_iteration": 2.7348971366882324 + }, + { + "auxiliary_loss_clip": 0.01098061, + "auxiliary_loss_mlp": 0.010319, + "balance_loss_clip": 1.03613544, + "balance_loss_mlp": 1.01925838, + "epoch": 0.792183977153164, + "flos": 19098659744640.0, + "grad_norm": 1.8855424428426124, + "language_loss": 0.60150284, + "learning_rate": 4.360768990424364e-07, + "loss": 0.62280244, + "num_input_tokens_seen": 284288380, + "step": 13176, + "time_per_iteration": 2.645940065383911 + }, + { + "auxiliary_loss_clip": 0.01112239, + "auxiliary_loss_mlp": 0.01033063, + "balance_loss_clip": 1.04115438, + "balance_loss_mlp": 1.02052176, + "epoch": 0.7922441004058319, + "flos": 17128851851520.0, + "grad_norm": 1.8607925161268413, + "language_loss": 0.73708278, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.75853586, + "num_input_tokens_seen": 284306920, + "step": 13177, + "time_per_iteration": 2.624305009841919 + }, + { + "auxiliary_loss_clip": 0.01092978, + "auxiliary_loss_mlp": 0.01035467, + "balance_loss_clip": 1.03717804, + "balance_loss_mlp": 1.02310514, + "epoch": 0.7923042236585, + "flos": 17821640442240.0, + "grad_norm": 1.880784618902091, + "language_loss": 0.64198965, + "learning_rate": 4.355914939594174e-07, + "loss": 0.66327411, + "num_input_tokens_seen": 284324700, + "step": 13178, + "time_per_iteration": 2.6623740196228027 + }, + { + "auxiliary_loss_clip": 0.01086006, + "auxiliary_loss_mlp": 0.01028552, + "balance_loss_clip": 1.03637266, + "balance_loss_mlp": 1.01807904, + "epoch": 0.7923643469111679, + "flos": 29935206892800.0, + "grad_norm": 1.4540811343422748, + "language_loss": 0.68699908, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.70814466, + "num_input_tokens_seen": 284345985, + "step": 13179, + "time_per_iteration": 2.832632541656494 + }, + { + "auxiliary_loss_clip": 0.01106835, + "auxiliary_loss_mlp": 0.01030886, + "balance_loss_clip": 1.036268, + "balance_loss_mlp": 1.01872063, + "epoch": 0.7924244701638359, + "flos": 22674716023680.0, + "grad_norm": 2.2092827624793117, + "language_loss": 0.74018443, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.76156163, + "num_input_tokens_seen": 284364475, + "step": 13180, + "time_per_iteration": 2.6299288272857666 + }, + { + "auxiliary_loss_clip": 0.01099012, + "auxiliary_loss_mlp": 0.01036443, + "balance_loss_clip": 1.03927088, + "balance_loss_mlp": 1.02306199, + "epoch": 0.7924845934165038, + "flos": 17968156018560.0, + "grad_norm": 2.065397931254967, + "language_loss": 0.8179431, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.83929765, + "num_input_tokens_seen": 284382125, + "step": 13181, + "time_per_iteration": 2.6588377952575684 + }, + { + "auxiliary_loss_clip": 0.01079854, + "auxiliary_loss_mlp": 0.01038549, + "balance_loss_clip": 1.03439593, + "balance_loss_mlp": 1.02538192, + "epoch": 0.7925447166691718, + "flos": 23476960333440.0, + "grad_norm": 1.7700147531802202, + "language_loss": 0.77401638, + "learning_rate": 4.346213957372895e-07, + "loss": 0.79520041, + "num_input_tokens_seen": 284401585, + "step": 13182, + "time_per_iteration": 2.702794313430786 + }, + { + "auxiliary_loss_clip": 0.01097492, + "auxiliary_loss_mlp": 0.01041087, + "balance_loss_clip": 1.0389626, + "balance_loss_mlp": 1.02766991, + "epoch": 0.7926048399218397, + "flos": 20447572118400.0, + "grad_norm": 1.8061510819801756, + "language_loss": 0.74171931, + "learning_rate": 4.34379019557056e-07, + "loss": 0.76310509, + "num_input_tokens_seen": 284419125, + "step": 13183, + "time_per_iteration": 2.615912675857544 + }, + { + "auxiliary_loss_clip": 0.01078552, + "auxiliary_loss_mlp": 0.01032415, + "balance_loss_clip": 1.03608036, + "balance_loss_mlp": 1.0189023, + "epoch": 0.7926649631745077, + "flos": 37160038535040.0, + "grad_norm": 1.5412113664578542, + "language_loss": 0.68428183, + "learning_rate": 4.341367027453264e-07, + "loss": 0.70539147, + "num_input_tokens_seen": 284440445, + "step": 13184, + "time_per_iteration": 2.7763001918792725 + }, + { + "auxiliary_loss_clip": 0.01073218, + "auxiliary_loss_mlp": 0.01034358, + "balance_loss_clip": 1.03828871, + "balance_loss_mlp": 1.02169812, + "epoch": 0.7927250864271758, + "flos": 17018606033280.0, + "grad_norm": 1.8246032292732381, + "language_loss": 0.70783365, + "learning_rate": 4.338944453112907e-07, + "loss": 0.72890937, + "num_input_tokens_seen": 284459370, + "step": 13185, + "time_per_iteration": 2.7633087635040283 + }, + { + "auxiliary_loss_clip": 0.01096127, + "auxiliary_loss_mlp": 0.01032722, + "balance_loss_clip": 1.03772926, + "balance_loss_mlp": 1.02017522, + "epoch": 0.7927852096798437, + "flos": 17749208666880.0, + "grad_norm": 2.140666716995379, + "language_loss": 0.65258479, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.67387331, + "num_input_tokens_seen": 284477525, + "step": 13186, + "time_per_iteration": 2.762816905975342 + }, + { + "auxiliary_loss_clip": 0.01094364, + "auxiliary_loss_mlp": 0.01037697, + "balance_loss_clip": 1.03739357, + "balance_loss_mlp": 1.02557981, + "epoch": 0.7928453329325117, + "flos": 23838436851840.0, + "grad_norm": 1.4957281455318547, + "language_loss": 0.76961684, + "learning_rate": 4.334101086130408e-07, + "loss": 0.79093742, + "num_input_tokens_seen": 284496590, + "step": 13187, + "time_per_iteration": 2.7023680210113525 + }, + { + "auxiliary_loss_clip": 0.01088541, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.03613758, + "balance_loss_mlp": 1.0191083, + "epoch": 0.7929054561851796, + "flos": 17454920538240.0, + "grad_norm": 2.090727269336269, + "language_loss": 0.7242974, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.7454946, + "num_input_tokens_seen": 284511470, + "step": 13188, + "time_per_iteration": 2.6116061210632324 + }, + { + "auxiliary_loss_clip": 0.01110097, + "auxiliary_loss_mlp": 0.00771207, + "balance_loss_clip": 1.03619778, + "balance_loss_mlp": 1.0002346, + "epoch": 0.7929655794378476, + "flos": 21981280988160.0, + "grad_norm": 3.5192145755873043, + "language_loss": 0.63126463, + "learning_rate": 4.329260095357725e-07, + "loss": 0.65007764, + "num_input_tokens_seen": 284531125, + "step": 13189, + "time_per_iteration": 2.5492398738861084 + }, + { + "auxiliary_loss_clip": 0.01063574, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.03684545, + "balance_loss_mlp": 1.02014804, + "epoch": 0.7930257026905155, + "flos": 17273930883840.0, + "grad_norm": 1.84728181644231, + "language_loss": 0.73074591, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.75169981, + "num_input_tokens_seen": 284549340, + "step": 13190, + "time_per_iteration": 2.7327284812927246 + }, + { + "auxiliary_loss_clip": 0.01094105, + "auxiliary_loss_mlp": 0.01030162, + "balance_loss_clip": 1.03697276, + "balance_loss_mlp": 1.01938009, + "epoch": 0.7930858259431836, + "flos": 27300584125440.0, + "grad_norm": 1.7378717321453667, + "language_loss": 0.73166823, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.75291085, + "num_input_tokens_seen": 284567060, + "step": 13191, + "time_per_iteration": 2.761871337890625 + }, + { + "auxiliary_loss_clip": 0.01097055, + "auxiliary_loss_mlp": 0.01039867, + "balance_loss_clip": 1.03603792, + "balance_loss_mlp": 1.02686751, + "epoch": 0.7931459491958515, + "flos": 19863736456320.0, + "grad_norm": 1.7612896167092924, + "language_loss": 0.69279987, + "learning_rate": 4.322003066198219e-07, + "loss": 0.71416903, + "num_input_tokens_seen": 284586600, + "step": 13192, + "time_per_iteration": 2.6835954189300537 + }, + { + "auxiliary_loss_clip": 0.01074955, + "auxiliary_loss_mlp": 0.01035455, + "balance_loss_clip": 1.03394866, + "balance_loss_mlp": 1.0229497, + "epoch": 0.7932060724485195, + "flos": 23147120718720.0, + "grad_norm": 1.8840827690458661, + "language_loss": 0.75363815, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.77474225, + "num_input_tokens_seen": 284605715, + "step": 13193, + "time_per_iteration": 2.723729372024536 + }, + { + "auxiliary_loss_clip": 0.01097101, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.03796721, + "balance_loss_mlp": 1.01971292, + "epoch": 0.7932661957011874, + "flos": 29934847756800.0, + "grad_norm": 2.301032967508139, + "language_loss": 0.71940517, + "learning_rate": 4.317168019161741e-07, + "loss": 0.74070656, + "num_input_tokens_seen": 284628540, + "step": 13194, + "time_per_iteration": 2.758888006210327 + }, + { + "auxiliary_loss_clip": 0.01113373, + "auxiliary_loss_mlp": 0.01036092, + "balance_loss_clip": 1.03853393, + "balance_loss_mlp": 1.02333045, + "epoch": 0.7933263189538554, + "flos": 22559119079040.0, + "grad_norm": 1.9174397116927768, + "language_loss": 0.70116889, + "learning_rate": 4.314751387639517e-07, + "loss": 0.72266352, + "num_input_tokens_seen": 284646040, + "step": 13195, + "time_per_iteration": 2.558119058609009 + }, + { + "auxiliary_loss_clip": 0.01060029, + "auxiliary_loss_mlp": 0.0102797, + "balance_loss_clip": 1.03700423, + "balance_loss_mlp": 1.0154351, + "epoch": 0.7933864422065233, + "flos": 25479051575040.0, + "grad_norm": 3.5361878755115286, + "language_loss": 0.77569836, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.79657841, + "num_input_tokens_seen": 284665110, + "step": 13196, + "time_per_iteration": 2.7758255004882812 + }, + { + "auxiliary_loss_clip": 0.01079414, + "auxiliary_loss_mlp": 0.01037257, + "balance_loss_clip": 1.03883171, + "balance_loss_mlp": 1.02485287, + "epoch": 0.7934465654591913, + "flos": 33583156243200.0, + "grad_norm": 1.7631963808402482, + "language_loss": 0.68811917, + "learning_rate": 4.309919909045268e-07, + "loss": 0.70928586, + "num_input_tokens_seen": 284686515, + "step": 13197, + "time_per_iteration": 2.788442850112915 + }, + { + "auxiliary_loss_clip": 0.01097503, + "auxiliary_loss_mlp": 0.01029061, + "balance_loss_clip": 1.03770566, + "balance_loss_mlp": 1.01680613, + "epoch": 0.7935066887118594, + "flos": 31432538263680.0, + "grad_norm": 2.573420648877448, + "language_loss": 0.65293157, + "learning_rate": 4.30750506215646e-07, + "loss": 0.6741972, + "num_input_tokens_seen": 284707300, + "step": 13198, + "time_per_iteration": 2.785005807876587 + }, + { + "auxiliary_loss_clip": 0.010622, + "auxiliary_loss_mlp": 0.01040394, + "balance_loss_clip": 1.03600621, + "balance_loss_mlp": 1.02515936, + "epoch": 0.7935668119645273, + "flos": 14682616940160.0, + "grad_norm": 2.6924527077689113, + "language_loss": 0.72298622, + "learning_rate": 4.30509081032864e-07, + "loss": 0.74401212, + "num_input_tokens_seen": 284723545, + "step": 13199, + "time_per_iteration": 2.828518867492676 + }, + { + "auxiliary_loss_clip": 0.01083399, + "auxiliary_loss_mlp": 0.01032791, + "balance_loss_clip": 1.03479409, + "balance_loss_mlp": 1.02038765, + "epoch": 0.7936269352171953, + "flos": 18004246208640.0, + "grad_norm": 1.7805702438055635, + "language_loss": 0.80542034, + "learning_rate": 4.302677153653349e-07, + "loss": 0.82658225, + "num_input_tokens_seen": 284742650, + "step": 13200, + "time_per_iteration": 2.719022035598755 + }, + { + "auxiliary_loss_clip": 0.01096575, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.0383296, + "balance_loss_mlp": 1.02258706, + "epoch": 0.7936870584698632, + "flos": 18880215183360.0, + "grad_norm": 1.7717483221141246, + "language_loss": 0.77400053, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.79531235, + "num_input_tokens_seen": 284760955, + "step": 13201, + "time_per_iteration": 4.26847243309021 + }, + { + "auxiliary_loss_clip": 0.01108331, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.03744578, + "balance_loss_mlp": 1.02092719, + "epoch": 0.7937471817225312, + "flos": 23367001824000.0, + "grad_norm": 1.5551997587526456, + "language_loss": 0.67323661, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.69464856, + "num_input_tokens_seen": 284780745, + "step": 13202, + "time_per_iteration": 4.283862352371216 + }, + { + "auxiliary_loss_clip": 0.01099327, + "auxiliary_loss_mlp": 0.01034975, + "balance_loss_clip": 1.03811014, + "balance_loss_mlp": 1.02211785, + "epoch": 0.7938073049751991, + "flos": 22674428714880.0, + "grad_norm": 2.1258656424203464, + "language_loss": 0.75316, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.77450299, + "num_input_tokens_seen": 284799000, + "step": 13203, + "time_per_iteration": 4.218053817749023 + }, + { + "auxiliary_loss_clip": 0.01057545, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.03676009, + "balance_loss_mlp": 1.02075946, + "epoch": 0.7938674282278672, + "flos": 22851431959680.0, + "grad_norm": 1.8069073081221512, + "language_loss": 0.66618353, + "learning_rate": 4.293028480307643e-07, + "loss": 0.68708801, + "num_input_tokens_seen": 284817450, + "step": 13204, + "time_per_iteration": 2.819964647293091 + }, + { + "auxiliary_loss_clip": 0.01049205, + "auxiliary_loss_mlp": 0.01028932, + "balance_loss_clip": 1.03277397, + "balance_loss_mlp": 1.01646256, + "epoch": 0.7939275514805351, + "flos": 27012509049600.0, + "grad_norm": 1.5710457021949253, + "language_loss": 0.7940079, + "learning_rate": 4.290617800767438e-07, + "loss": 0.8147893, + "num_input_tokens_seen": 284838865, + "step": 13205, + "time_per_iteration": 2.832738161087036 + }, + { + "auxiliary_loss_clip": 0.0107234, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.0324893, + "balance_loss_mlp": 1.01827097, + "epoch": 0.7939876747332031, + "flos": 21142838747520.0, + "grad_norm": 7.819538292121243, + "language_loss": 0.7771039, + "learning_rate": 4.28820771692858e-07, + "loss": 0.79813695, + "num_input_tokens_seen": 284857975, + "step": 13206, + "time_per_iteration": 2.7768259048461914 + }, + { + "auxiliary_loss_clip": 0.01086044, + "auxiliary_loss_mlp": 0.01035236, + "balance_loss_clip": 1.03653049, + "balance_loss_mlp": 1.02031064, + "epoch": 0.794047797985871, + "flos": 23289075267840.0, + "grad_norm": 2.0761554247876526, + "language_loss": 0.78858304, + "learning_rate": 4.285798228882456e-07, + "loss": 0.8097958, + "num_input_tokens_seen": 284877145, + "step": 13207, + "time_per_iteration": 2.78918719291687 + }, + { + "auxiliary_loss_clip": 0.01071641, + "auxiliary_loss_mlp": 0.01034494, + "balance_loss_clip": 1.03531897, + "balance_loss_mlp": 1.02225077, + "epoch": 0.794107921238539, + "flos": 24608074590720.0, + "grad_norm": 1.921000285111042, + "language_loss": 0.83848017, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.85954154, + "num_input_tokens_seen": 284895560, + "step": 13208, + "time_per_iteration": 2.799513578414917 + }, + { + "auxiliary_loss_clip": 0.00994574, + "auxiliary_loss_mlp": 0.0101022, + "balance_loss_clip": 1.00948644, + "balance_loss_mlp": 1.0090878, + "epoch": 0.7941680444912069, + "flos": 64093690252800.0, + "grad_norm": 0.7333327804859686, + "language_loss": 0.58320063, + "learning_rate": 4.280981040533875e-07, + "loss": 0.60324866, + "num_input_tokens_seen": 284963135, + "step": 13209, + "time_per_iteration": 4.956205368041992 + }, + { + "auxiliary_loss_clip": 0.01076765, + "auxiliary_loss_mlp": 0.01034685, + "balance_loss_clip": 1.03624475, + "balance_loss_mlp": 1.02142262, + "epoch": 0.794228167743875, + "flos": 24388839930240.0, + "grad_norm": 2.256316924700655, + "language_loss": 0.62863505, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.64974952, + "num_input_tokens_seen": 284981755, + "step": 13210, + "time_per_iteration": 2.7703917026519775 + }, + { + "auxiliary_loss_clip": 0.010938, + "auxiliary_loss_mlp": 0.01036077, + "balance_loss_clip": 1.03719687, + "balance_loss_mlp": 1.024073, + "epoch": 0.794288290996543, + "flos": 28512498026880.0, + "grad_norm": 1.9531340028994628, + "language_loss": 0.6936754, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.71497422, + "num_input_tokens_seen": 285003060, + "step": 13211, + "time_per_iteration": 2.74078106880188 + }, + { + "auxiliary_loss_clip": 0.01102649, + "auxiliary_loss_mlp": 0.01039332, + "balance_loss_clip": 1.03825963, + "balance_loss_mlp": 1.02562237, + "epoch": 0.7943484142492109, + "flos": 25922117836800.0, + "grad_norm": 1.640321960898119, + "language_loss": 0.72502631, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.74644607, + "num_input_tokens_seen": 285021640, + "step": 13212, + "time_per_iteration": 2.745793104171753 + }, + { + "auxiliary_loss_clip": 0.01095421, + "auxiliary_loss_mlp": 0.01029916, + "balance_loss_clip": 1.03563583, + "balance_loss_mlp": 1.01776266, + "epoch": 0.7944085375018789, + "flos": 23915286000000.0, + "grad_norm": 1.7707252579484445, + "language_loss": 0.80655056, + "learning_rate": 4.271353817368246e-07, + "loss": 0.82780391, + "num_input_tokens_seen": 285040490, + "step": 13213, + "time_per_iteration": 2.7571616172790527 + }, + { + "auxiliary_loss_clip": 0.01102684, + "auxiliary_loss_mlp": 0.01030906, + "balance_loss_clip": 1.03846729, + "balance_loss_mlp": 1.01816225, + "epoch": 0.7944686607545468, + "flos": 20229953569920.0, + "grad_norm": 2.0723417946435196, + "language_loss": 0.67524314, + "learning_rate": 4.268948502428327e-07, + "loss": 0.69657904, + "num_input_tokens_seen": 285059270, + "step": 13214, + "time_per_iteration": 2.7216098308563232 + }, + { + "auxiliary_loss_clip": 0.01107626, + "auxiliary_loss_mlp": 0.01031004, + "balance_loss_clip": 1.03777719, + "balance_loss_mlp": 1.01888001, + "epoch": 0.7945287840072148, + "flos": 21980993679360.0, + "grad_norm": 2.140296316096213, + "language_loss": 0.72678429, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.74817061, + "num_input_tokens_seen": 285075390, + "step": 13215, + "time_per_iteration": 2.687727212905884 + }, + { + "auxiliary_loss_clip": 0.01058497, + "auxiliary_loss_mlp": 0.01037215, + "balance_loss_clip": 1.03636539, + "balance_loss_mlp": 1.02328491, + "epoch": 0.7945889072598827, + "flos": 26397718842240.0, + "grad_norm": 1.5145901921228262, + "language_loss": 0.79136622, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.81232333, + "num_input_tokens_seen": 285096290, + "step": 13216, + "time_per_iteration": 2.7064990997314453 + }, + { + "auxiliary_loss_clip": 0.01096019, + "auxiliary_loss_mlp": 0.01034212, + "balance_loss_clip": 1.03587198, + "balance_loss_mlp": 1.02159381, + "epoch": 0.7946490305125508, + "flos": 25810255906560.0, + "grad_norm": 1.5522674129771217, + "language_loss": 0.73874998, + "learning_rate": 4.261736137111598e-07, + "loss": 0.7600522, + "num_input_tokens_seen": 285116020, + "step": 13217, + "time_per_iteration": 2.6791646480560303 + }, + { + "auxiliary_loss_clip": 0.01082895, + "auxiliary_loss_mlp": 0.01034325, + "balance_loss_clip": 1.03578281, + "balance_loss_mlp": 1.02138495, + "epoch": 0.7947091537652187, + "flos": 15960965045760.0, + "grad_norm": 1.8630939701915927, + "language_loss": 0.73956853, + "learning_rate": 4.259333208810907e-07, + "loss": 0.76074076, + "num_input_tokens_seen": 285133510, + "step": 13218, + "time_per_iteration": 2.681337594985962 + }, + { + "auxiliary_loss_clip": 0.01099657, + "auxiliary_loss_mlp": 0.01037837, + "balance_loss_clip": 1.0363996, + "balance_loss_mlp": 1.02428901, + "epoch": 0.7947692770178867, + "flos": 18587866389120.0, + "grad_norm": 1.8649212651108453, + "language_loss": 0.83193207, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.85330701, + "num_input_tokens_seen": 285151690, + "step": 13219, + "time_per_iteration": 2.6580770015716553 + }, + { + "auxiliary_loss_clip": 0.01100239, + "auxiliary_loss_mlp": 0.01043205, + "balance_loss_clip": 1.03854525, + "balance_loss_mlp": 1.02832818, + "epoch": 0.7948294002705546, + "flos": 20442220992000.0, + "grad_norm": 2.3946957736467915, + "language_loss": 0.75677502, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.77820945, + "num_input_tokens_seen": 285170485, + "step": 13220, + "time_per_iteration": 2.644994020462036 + }, + { + "auxiliary_loss_clip": 0.01084385, + "auxiliary_loss_mlp": 0.01035732, + "balance_loss_clip": 1.0356847, + "balance_loss_mlp": 1.0225656, + "epoch": 0.7948895235232226, + "flos": 38181194282880.0, + "grad_norm": 1.8822123036698593, + "language_loss": 0.72409242, + "learning_rate": 4.252128005599176e-07, + "loss": 0.74529362, + "num_input_tokens_seen": 285191050, + "step": 13221, + "time_per_iteration": 2.765852689743042 + }, + { + "auxiliary_loss_clip": 0.01099762, + "auxiliary_loss_mlp": 0.01030106, + "balance_loss_clip": 1.03885102, + "balance_loss_mlp": 1.01822662, + "epoch": 0.7949496467758905, + "flos": 15559806977280.0, + "grad_norm": 2.0084967919839527, + "language_loss": 0.74979097, + "learning_rate": 4.249727465395634e-07, + "loss": 0.77108967, + "num_input_tokens_seen": 285208750, + "step": 13222, + "time_per_iteration": 2.6160507202148438 + }, + { + "auxiliary_loss_clip": 0.01012175, + "auxiliary_loss_mlp": 0.01002836, + "balance_loss_clip": 1.00953972, + "balance_loss_mlp": 1.00179863, + "epoch": 0.7950097700285585, + "flos": 70897036728960.0, + "grad_norm": 0.7723294250131235, + "language_loss": 0.6706062, + "learning_rate": 4.247327522443993e-07, + "loss": 0.69075632, + "num_input_tokens_seen": 285264605, + "step": 13223, + "time_per_iteration": 3.087876319885254 + }, + { + "auxiliary_loss_clip": 0.010973, + "auxiliary_loss_mlp": 0.01033159, + "balance_loss_clip": 1.0365479, + "balance_loss_mlp": 1.01981974, + "epoch": 0.7950698932812266, + "flos": 23951627585280.0, + "grad_norm": 2.366420887555622, + "language_loss": 0.71044689, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.73175144, + "num_input_tokens_seen": 285283940, + "step": 13224, + "time_per_iteration": 2.640591621398926 + }, + { + "auxiliary_loss_clip": 0.0103006, + "auxiliary_loss_mlp": 0.01002258, + "balance_loss_clip": 1.00757217, + "balance_loss_mlp": 1.00124514, + "epoch": 0.7951300165338945, + "flos": 60282561415680.0, + "grad_norm": 0.6682926442494496, + "language_loss": 0.54986, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.57018316, + "num_input_tokens_seen": 285349525, + "step": 13225, + "time_per_iteration": 3.1831283569335938 + }, + { + "auxiliary_loss_clip": 0.01083968, + "auxiliary_loss_mlp": 0.01025518, + "balance_loss_clip": 1.03336477, + "balance_loss_mlp": 1.01375794, + "epoch": 0.7951901397865625, + "flos": 22819004956800.0, + "grad_norm": 2.1821061924274106, + "language_loss": 0.64788163, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.66897643, + "num_input_tokens_seen": 285367355, + "step": 13226, + "time_per_iteration": 2.7201919555664062 + }, + { + "auxiliary_loss_clip": 0.01065742, + "auxiliary_loss_mlp": 0.01037995, + "balance_loss_clip": 1.03712797, + "balance_loss_mlp": 1.02581131, + "epoch": 0.7952502630392304, + "flos": 35695672871040.0, + "grad_norm": 2.811230996366362, + "language_loss": 0.69988328, + "learning_rate": 4.237733724976349e-07, + "loss": 0.72092068, + "num_input_tokens_seen": 285386190, + "step": 13227, + "time_per_iteration": 2.88179874420166 + }, + { + "auxiliary_loss_clip": 0.01065232, + "auxiliary_loss_mlp": 0.01029863, + "balance_loss_clip": 1.03389943, + "balance_loss_mlp": 1.01914048, + "epoch": 0.7953103862918984, + "flos": 25629840869760.0, + "grad_norm": 1.5862839127208228, + "language_loss": 0.69230592, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.71325696, + "num_input_tokens_seen": 285406150, + "step": 13228, + "time_per_iteration": 2.9039552211761475 + }, + { + "auxiliary_loss_clip": 0.01062042, + "auxiliary_loss_mlp": 0.01046101, + "balance_loss_clip": 1.03289461, + "balance_loss_mlp": 1.03178382, + "epoch": 0.7953705095445663, + "flos": 40551980676480.0, + "grad_norm": 1.5155440892228063, + "language_loss": 0.70645332, + "learning_rate": 4.232940412119095e-07, + "loss": 0.72753471, + "num_input_tokens_seen": 285429900, + "step": 13229, + "time_per_iteration": 2.9804372787475586 + }, + { + "auxiliary_loss_clip": 0.01103757, + "auxiliary_loss_mlp": 0.01033426, + "balance_loss_clip": 1.03948689, + "balance_loss_mlp": 1.02086771, + "epoch": 0.7954306327972344, + "flos": 27636672706560.0, + "grad_norm": 2.202823116168555, + "language_loss": 0.71696305, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.73833489, + "num_input_tokens_seen": 285452555, + "step": 13230, + "time_per_iteration": 2.8171424865722656 + }, + { + "auxiliary_loss_clip": 0.0101259, + "auxiliary_loss_mlp": 0.01002419, + "balance_loss_clip": 1.00992072, + "balance_loss_mlp": 1.00133443, + "epoch": 0.7954907560499023, + "flos": 59504055995520.0, + "grad_norm": 0.8970251417868289, + "language_loss": 0.63560265, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.65575272, + "num_input_tokens_seen": 285515700, + "step": 13231, + "time_per_iteration": 3.281342029571533 + }, + { + "auxiliary_loss_clip": 0.01086059, + "auxiliary_loss_mlp": 0.01028708, + "balance_loss_clip": 1.03541231, + "balance_loss_mlp": 1.01657856, + "epoch": 0.7955508793025703, + "flos": 20120533764480.0, + "grad_norm": 4.745461781703955, + "language_loss": 0.69967991, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.72082758, + "num_input_tokens_seen": 285533910, + "step": 13232, + "time_per_iteration": 2.6862258911132812 + }, + { + "auxiliary_loss_clip": 0.01098188, + "auxiliary_loss_mlp": 0.0103012, + "balance_loss_clip": 1.03567708, + "balance_loss_mlp": 1.01753175, + "epoch": 0.7956110025552382, + "flos": 26505378881280.0, + "grad_norm": 3.7128388079610075, + "language_loss": 0.77988273, + "learning_rate": 4.223360961792952e-07, + "loss": 0.80116582, + "num_input_tokens_seen": 285554080, + "step": 13233, + "time_per_iteration": 2.755737066268921 + }, + { + "auxiliary_loss_clip": 0.01099521, + "auxiliary_loss_mlp": 0.01032696, + "balance_loss_clip": 1.03679132, + "balance_loss_mlp": 1.02042377, + "epoch": 0.7956711258079062, + "flos": 22565475786240.0, + "grad_norm": 1.9443930320320317, + "language_loss": 0.79183459, + "learning_rate": 4.220967594613769e-07, + "loss": 0.81315672, + "num_input_tokens_seen": 285572325, + "step": 13234, + "time_per_iteration": 2.7519893646240234 + }, + { + "auxiliary_loss_clip": 0.01089518, + "auxiliary_loss_mlp": 0.00769637, + "balance_loss_clip": 1.03883278, + "balance_loss_mlp": 1.00016356, + "epoch": 0.7957312490605741, + "flos": 17379005143680.0, + "grad_norm": 1.963343394843674, + "language_loss": 0.69879019, + "learning_rate": 4.218574825777077e-07, + "loss": 0.71738172, + "num_input_tokens_seen": 285589770, + "step": 13235, + "time_per_iteration": 2.6992905139923096 + }, + { + "auxiliary_loss_clip": 0.01072089, + "auxiliary_loss_mlp": 0.01031418, + "balance_loss_clip": 1.03438449, + "balance_loss_mlp": 1.0185138, + "epoch": 0.7957913723132422, + "flos": 22491427898880.0, + "grad_norm": 1.4985866886242822, + "language_loss": 0.6796065, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.70064157, + "num_input_tokens_seen": 285610065, + "step": 13236, + "time_per_iteration": 2.7930455207824707 + }, + { + "auxiliary_loss_clip": 0.01062113, + "auxiliary_loss_mlp": 0.01030284, + "balance_loss_clip": 1.03622985, + "balance_loss_mlp": 1.01748657, + "epoch": 0.7958514955659101, + "flos": 22638087129600.0, + "grad_norm": 1.6336623511601824, + "language_loss": 0.75105399, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.7719779, + "num_input_tokens_seen": 285628480, + "step": 13237, + "time_per_iteration": 2.8149497509002686 + }, + { + "auxiliary_loss_clip": 0.01100352, + "auxiliary_loss_mlp": 0.0103538, + "balance_loss_clip": 1.03876448, + "balance_loss_mlp": 1.02211833, + "epoch": 0.7959116188185781, + "flos": 20704225772160.0, + "grad_norm": 2.4969872572253067, + "language_loss": 0.71244603, + "learning_rate": 4.211400110229175e-07, + "loss": 0.73380333, + "num_input_tokens_seen": 285647805, + "step": 13238, + "time_per_iteration": 2.650225877761841 + }, + { + "auxiliary_loss_clip": 0.01093003, + "auxiliary_loss_mlp": 0.01028042, + "balance_loss_clip": 1.03492129, + "balance_loss_mlp": 1.01565003, + "epoch": 0.7959717420712461, + "flos": 19024683684480.0, + "grad_norm": 2.0234207796888666, + "language_loss": 0.74033141, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.7615419, + "num_input_tokens_seen": 285665505, + "step": 13239, + "time_per_iteration": 2.68799090385437 + }, + { + "auxiliary_loss_clip": 0.01113057, + "auxiliary_loss_mlp": 0.01034221, + "balance_loss_clip": 1.03780365, + "balance_loss_mlp": 1.02156138, + "epoch": 0.796031865323914, + "flos": 26356636661760.0, + "grad_norm": 2.1969833935070953, + "language_loss": 0.69364315, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.7151159, + "num_input_tokens_seen": 285685855, + "step": 13240, + "time_per_iteration": 4.224658250808716 + }, + { + "auxiliary_loss_clip": 0.01024595, + "auxiliary_loss_mlp": 0.00998488, + "balance_loss_clip": 1.01116359, + "balance_loss_mlp": 0.99737942, + "epoch": 0.796091988576582, + "flos": 62069440320000.0, + "grad_norm": 0.887431308267293, + "language_loss": 0.58674192, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.60697281, + "num_input_tokens_seen": 285735710, + "step": 13241, + "time_per_iteration": 4.535626649856567 + }, + { + "auxiliary_loss_clip": 0.01078843, + "auxiliary_loss_mlp": 0.01030817, + "balance_loss_clip": 1.03829169, + "balance_loss_mlp": 1.01925421, + "epoch": 0.7961521118292499, + "flos": 39020103400320.0, + "grad_norm": 1.593674462626725, + "language_loss": 0.64147931, + "learning_rate": 4.201842205128772e-07, + "loss": 0.66257584, + "num_input_tokens_seen": 285757045, + "step": 13242, + "time_per_iteration": 4.472386598587036 + }, + { + "auxiliary_loss_clip": 0.01110267, + "auxiliary_loss_mlp": 0.01034884, + "balance_loss_clip": 1.03763533, + "balance_loss_mlp": 1.02198589, + "epoch": 0.796212235081918, + "flos": 21762836426880.0, + "grad_norm": 1.8778627225113254, + "language_loss": 0.75913978, + "learning_rate": 4.199454226296526e-07, + "loss": 0.78059125, + "num_input_tokens_seen": 285776050, + "step": 13243, + "time_per_iteration": 2.590519666671753 + }, + { + "auxiliary_loss_clip": 0.01085583, + "auxiliary_loss_mlp": 0.01032008, + "balance_loss_clip": 1.04038298, + "balance_loss_mlp": 1.01900232, + "epoch": 0.7962723583345859, + "flos": 21178857110400.0, + "grad_norm": 1.6501275630789378, + "language_loss": 0.79442871, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.81560457, + "num_input_tokens_seen": 285796830, + "step": 13244, + "time_per_iteration": 2.752902030944824 + }, + { + "auxiliary_loss_clip": 0.01102665, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.03597069, + "balance_loss_mlp": 1.01640284, + "epoch": 0.7963324815872539, + "flos": 17128636369920.0, + "grad_norm": 2.172628764552698, + "language_loss": 0.68508917, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.70641208, + "num_input_tokens_seen": 285814755, + "step": 13245, + "time_per_iteration": 2.5828065872192383 + }, + { + "auxiliary_loss_clip": 0.01090189, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.03773546, + "balance_loss_mlp": 1.02139258, + "epoch": 0.7963926048399218, + "flos": 21397481239680.0, + "grad_norm": 1.806241454752508, + "language_loss": 0.79336578, + "learning_rate": 4.192293885111549e-07, + "loss": 0.81461, + "num_input_tokens_seen": 285834255, + "step": 13246, + "time_per_iteration": 2.6900124549865723 + }, + { + "auxiliary_loss_clip": 0.01090986, + "auxiliary_loss_mlp": 0.01031115, + "balance_loss_clip": 1.03666353, + "balance_loss_mlp": 1.01834822, + "epoch": 0.7964527280925898, + "flos": 25184188828800.0, + "grad_norm": 2.003867832970485, + "language_loss": 0.66143036, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.6826514, + "num_input_tokens_seen": 285853540, + "step": 13247, + "time_per_iteration": 2.6503524780273438 + }, + { + "auxiliary_loss_clip": 0.01085363, + "auxiliary_loss_mlp": 0.01029081, + "balance_loss_clip": 1.0366044, + "balance_loss_mlp": 1.01764846, + "epoch": 0.7965128513452577, + "flos": 27015884928000.0, + "grad_norm": 2.040458459238989, + "language_loss": 0.71853489, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.73967934, + "num_input_tokens_seen": 285872705, + "step": 13248, + "time_per_iteration": 4.260182857513428 + }, + { + "auxiliary_loss_clip": 0.0109327, + "auxiliary_loss_mlp": 0.01028611, + "balance_loss_clip": 1.03817999, + "balance_loss_mlp": 1.01565921, + "epoch": 0.7965729745979258, + "flos": 24419578993920.0, + "grad_norm": 2.2291806355034507, + "language_loss": 0.76553303, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.78675187, + "num_input_tokens_seen": 285890290, + "step": 13249, + "time_per_iteration": 2.6802589893341064 + }, + { + "auxiliary_loss_clip": 0.01082795, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.03743911, + "balance_loss_mlp": 1.01721263, + "epoch": 0.7966330978505937, + "flos": 18840389978880.0, + "grad_norm": 2.0056248298720623, + "language_loss": 0.61770105, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.63882804, + "num_input_tokens_seen": 285909190, + "step": 13250, + "time_per_iteration": 2.7855334281921387 + }, + { + "auxiliary_loss_clip": 0.01088491, + "auxiliary_loss_mlp": 0.01027346, + "balance_loss_clip": 1.03615296, + "balance_loss_mlp": 1.01445389, + "epoch": 0.7966932211032617, + "flos": 13152319862400.0, + "grad_norm": 2.1520508995588523, + "language_loss": 0.72124857, + "learning_rate": 4.180371972938206e-07, + "loss": 0.74240696, + "num_input_tokens_seen": 285927570, + "step": 13251, + "time_per_iteration": 2.7121150493621826 + }, + { + "auxiliary_loss_clip": 0.01116, + "auxiliary_loss_mlp": 0.01031729, + "balance_loss_clip": 1.0401994, + "balance_loss_mlp": 1.01820469, + "epoch": 0.7967533443559297, + "flos": 23949760078080.0, + "grad_norm": 2.6256177602060395, + "language_loss": 0.72742116, + "learning_rate": 4.177989389787624e-07, + "loss": 0.74889851, + "num_input_tokens_seen": 285945810, + "step": 13252, + "time_per_iteration": 2.582284927368164 + }, + { + "auxiliary_loss_clip": 0.01109038, + "auxiliary_loss_mlp": 0.01028059, + "balance_loss_clip": 1.03879833, + "balance_loss_mlp": 1.01554191, + "epoch": 0.7968134676085976, + "flos": 30368791964160.0, + "grad_norm": 1.5712453668855284, + "language_loss": 0.66325545, + "learning_rate": 4.175607406609278e-07, + "loss": 0.68462646, + "num_input_tokens_seen": 285964235, + "step": 13253, + "time_per_iteration": 2.6929616928100586 + }, + { + "auxiliary_loss_clip": 0.0108594, + "auxiliary_loss_mlp": 0.01036955, + "balance_loss_clip": 1.0418272, + "balance_loss_mlp": 1.0236156, + "epoch": 0.7968735908612656, + "flos": 23075048079360.0, + "grad_norm": 1.5829772200812473, + "language_loss": 0.67843878, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.69966775, + "num_input_tokens_seen": 285983710, + "step": 13254, + "time_per_iteration": 2.7649550437927246 + }, + { + "auxiliary_loss_clip": 0.01098933, + "auxiliary_loss_mlp": 0.01034065, + "balance_loss_clip": 1.03641415, + "balance_loss_mlp": 1.02192962, + "epoch": 0.7969337141139335, + "flos": 23582250074880.0, + "grad_norm": 1.8731034083925706, + "language_loss": 0.70037842, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.72170842, + "num_input_tokens_seen": 286003425, + "step": 13255, + "time_per_iteration": 2.6560351848602295 + }, + { + "auxiliary_loss_clip": 0.01108119, + "auxiliary_loss_mlp": 0.01031031, + "balance_loss_clip": 1.03694665, + "balance_loss_mlp": 1.01906836, + "epoch": 0.7969938373666016, + "flos": 19755860935680.0, + "grad_norm": 2.1084612697613268, + "language_loss": 0.79501426, + "learning_rate": 4.168465057810733e-07, + "loss": 0.81640577, + "num_input_tokens_seen": 286020130, + "step": 13256, + "time_per_iteration": 2.6129326820373535 + }, + { + "auxiliary_loss_clip": 0.01098682, + "auxiliary_loss_mlp": 0.01025793, + "balance_loss_clip": 1.03868675, + "balance_loss_mlp": 1.01325274, + "epoch": 0.7970539606192695, + "flos": 24134089697280.0, + "grad_norm": 1.6974660367757792, + "language_loss": 0.66300124, + "learning_rate": 4.166085475424315e-07, + "loss": 0.68424594, + "num_input_tokens_seen": 286040230, + "step": 13257, + "time_per_iteration": 2.6830172538757324 + }, + { + "auxiliary_loss_clip": 0.0109134, + "auxiliary_loss_mlp": 0.01034065, + "balance_loss_clip": 1.0377934, + "balance_loss_mlp": 1.02150643, + "epoch": 0.7971140838719375, + "flos": 17968622895360.0, + "grad_norm": 1.9146410977072226, + "language_loss": 0.72192776, + "learning_rate": 4.163706493461523e-07, + "loss": 0.74318182, + "num_input_tokens_seen": 286059475, + "step": 13258, + "time_per_iteration": 2.661726236343384 + }, + { + "auxiliary_loss_clip": 0.01100938, + "auxiliary_loss_mlp": 0.01034189, + "balance_loss_clip": 1.03692877, + "balance_loss_mlp": 1.0205518, + "epoch": 0.7971742071246054, + "flos": 19169547235200.0, + "grad_norm": 1.8087181306355609, + "language_loss": 0.68977499, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.71112633, + "num_input_tokens_seen": 286077820, + "step": 13259, + "time_per_iteration": 2.611186981201172 + }, + { + "auxiliary_loss_clip": 0.01096475, + "auxiliary_loss_mlp": 0.01030487, + "balance_loss_clip": 1.03723931, + "balance_loss_mlp": 1.01854253, + "epoch": 0.7972343303772734, + "flos": 27125951178240.0, + "grad_norm": 2.1313633169820547, + "language_loss": 0.73609447, + "learning_rate": 4.158950331167641e-07, + "loss": 0.75736415, + "num_input_tokens_seen": 286097285, + "step": 13260, + "time_per_iteration": 2.699951648712158 + }, + { + "auxiliary_loss_clip": 0.010819, + "auxiliary_loss_mlp": 0.01032439, + "balance_loss_clip": 1.03271997, + "balance_loss_mlp": 1.02031517, + "epoch": 0.7972944536299413, + "flos": 20996646393600.0, + "grad_norm": 1.836032369081443, + "language_loss": 0.78399926, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.80514264, + "num_input_tokens_seen": 286116000, + "step": 13261, + "time_per_iteration": 2.6140952110290527 + }, + { + "auxiliary_loss_clip": 0.01095642, + "auxiliary_loss_mlp": 0.01030129, + "balance_loss_clip": 1.03774393, + "balance_loss_mlp": 1.0191493, + "epoch": 0.7973545768826094, + "flos": 21580015178880.0, + "grad_norm": 1.439588217770827, + "language_loss": 0.76199102, + "learning_rate": 4.154196571650501e-07, + "loss": 0.78324872, + "num_input_tokens_seen": 286135110, + "step": 13262, + "time_per_iteration": 2.7024636268615723 + }, + { + "auxiliary_loss_clip": 0.01082139, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.03903556, + "balance_loss_mlp": 1.0191412, + "epoch": 0.7974147001352773, + "flos": 20558536208640.0, + "grad_norm": 2.631651732945755, + "language_loss": 0.70419514, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.72535068, + "num_input_tokens_seen": 286152835, + "step": 13263, + "time_per_iteration": 2.72177791595459 + }, + { + "auxiliary_loss_clip": 0.01103923, + "auxiliary_loss_mlp": 0.01038655, + "balance_loss_clip": 1.03778529, + "balance_loss_mlp": 1.02499938, + "epoch": 0.7974748233879453, + "flos": 20996790048000.0, + "grad_norm": 2.0043756449172547, + "language_loss": 0.70802379, + "learning_rate": 4.149445215631153e-07, + "loss": 0.72944963, + "num_input_tokens_seen": 286171785, + "step": 13264, + "time_per_iteration": 2.706388473510742 + }, + { + "auxiliary_loss_clip": 0.01107469, + "auxiliary_loss_mlp": 0.01033786, + "balance_loss_clip": 1.03775704, + "balance_loss_mlp": 1.02225232, + "epoch": 0.7975349466406133, + "flos": 22565188477440.0, + "grad_norm": 1.891498852375028, + "language_loss": 0.76922268, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.79063523, + "num_input_tokens_seen": 286190420, + "step": 13265, + "time_per_iteration": 2.6580817699432373 + }, + { + "auxiliary_loss_clip": 0.01080723, + "auxiliary_loss_mlp": 0.01027162, + "balance_loss_clip": 1.0393877, + "balance_loss_mlp": 1.0149374, + "epoch": 0.7975950698932812, + "flos": 21689542725120.0, + "grad_norm": 2.280360071674855, + "language_loss": 0.75571597, + "learning_rate": 4.144696263830285e-07, + "loss": 0.77679479, + "num_input_tokens_seen": 286210105, + "step": 13266, + "time_per_iteration": 2.707306146621704 + }, + { + "auxiliary_loss_clip": 0.01083885, + "auxiliary_loss_mlp": 0.01026943, + "balance_loss_clip": 1.03626752, + "balance_loss_mlp": 1.01505208, + "epoch": 0.7976551931459492, + "flos": 19604568850560.0, + "grad_norm": 7.354197908727964, + "language_loss": 0.84225118, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.86335951, + "num_input_tokens_seen": 286228180, + "step": 13267, + "time_per_iteration": 2.6513888835906982 + }, + { + "auxiliary_loss_clip": 0.01095515, + "auxiliary_loss_mlp": 0.01031649, + "balance_loss_clip": 1.03541887, + "balance_loss_mlp": 1.01869643, + "epoch": 0.7977153163986171, + "flos": 21687603390720.0, + "grad_norm": 1.5920140883630767, + "language_loss": 0.76201731, + "learning_rate": 4.139949716968223e-07, + "loss": 0.7832889, + "num_input_tokens_seen": 286247305, + "step": 13268, + "time_per_iteration": 2.7020766735076904 + }, + { + "auxiliary_loss_clip": 0.01109132, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.03789496, + "balance_loss_mlp": 1.0182898, + "epoch": 0.7977754396512852, + "flos": 23476780765440.0, + "grad_norm": 1.5724932567080838, + "language_loss": 0.77637428, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.79777002, + "num_input_tokens_seen": 286268145, + "step": 13269, + "time_per_iteration": 2.6634888648986816 + }, + { + "auxiliary_loss_clip": 0.01090369, + "auxiliary_loss_mlp": 0.01042032, + "balance_loss_clip": 1.03390729, + "balance_loss_mlp": 1.02950919, + "epoch": 0.7978355629039531, + "flos": 22382223575040.0, + "grad_norm": 1.6845375324844267, + "language_loss": 0.82535768, + "learning_rate": 4.135205575764922e-07, + "loss": 0.84668171, + "num_input_tokens_seen": 286286775, + "step": 13270, + "time_per_iteration": 2.684476613998413 + }, + { + "auxiliary_loss_clip": 0.01068469, + "auxiliary_loss_mlp": 0.01040513, + "balance_loss_clip": 1.03474867, + "balance_loss_mlp": 1.02632725, + "epoch": 0.7978956861566211, + "flos": 20266331068800.0, + "grad_norm": 1.5659305382034026, + "language_loss": 0.59210402, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.61319387, + "num_input_tokens_seen": 286305590, + "step": 13271, + "time_per_iteration": 2.860095262527466 + }, + { + "auxiliary_loss_clip": 0.01090884, + "auxiliary_loss_mlp": 0.01031592, + "balance_loss_clip": 1.03714991, + "balance_loss_mlp": 1.01914704, + "epoch": 0.797955809409289, + "flos": 28112417366400.0, + "grad_norm": 1.4518492514226418, + "language_loss": 0.73159599, + "learning_rate": 4.130463840939975e-07, + "loss": 0.75282073, + "num_input_tokens_seen": 286328050, + "step": 13272, + "time_per_iteration": 2.770979881286621 + }, + { + "auxiliary_loss_clip": 0.01046384, + "auxiliary_loss_mlp": 0.01036557, + "balance_loss_clip": 1.03212595, + "balance_loss_mlp": 1.023736, + "epoch": 0.798015932661957, + "flos": 15559591495680.0, + "grad_norm": 2.073152053590518, + "language_loss": 0.71566808, + "learning_rate": 4.128093876144161e-07, + "loss": 0.73649746, + "num_input_tokens_seen": 286345265, + "step": 13273, + "time_per_iteration": 2.7531182765960693 + }, + { + "auxiliary_loss_clip": 0.0108926, + "auxiliary_loss_mlp": 0.01034875, + "balance_loss_clip": 1.03732777, + "balance_loss_mlp": 1.02203608, + "epoch": 0.7980760559146249, + "flos": 23951196622080.0, + "grad_norm": 1.7484854884585128, + "language_loss": 0.75765157, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.77889293, + "num_input_tokens_seen": 286364465, + "step": 13274, + "time_per_iteration": 2.788862705230713 + }, + { + "auxiliary_loss_clip": 0.0105609, + "auxiliary_loss_mlp": 0.01027938, + "balance_loss_clip": 1.03353024, + "balance_loss_mlp": 1.01679242, + "epoch": 0.798136179167293, + "flos": 28038082170240.0, + "grad_norm": 1.3747811855715935, + "language_loss": 0.77784944, + "learning_rate": 4.12335575223518e-07, + "loss": 0.79868966, + "num_input_tokens_seen": 286385565, + "step": 13275, + "time_per_iteration": 2.823310375213623 + }, + { + "auxiliary_loss_clip": 0.01100598, + "auxiliary_loss_mlp": 0.01039391, + "balance_loss_clip": 1.03790784, + "balance_loss_mlp": 1.02598548, + "epoch": 0.7981963024199609, + "flos": 35984538046080.0, + "grad_norm": 1.8295595288590525, + "language_loss": 0.63964415, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.66104394, + "num_input_tokens_seen": 286403950, + "step": 13276, + "time_per_iteration": 2.6914138793945312 + }, + { + "auxiliary_loss_clip": 0.01067297, + "auxiliary_loss_mlp": 0.01030718, + "balance_loss_clip": 1.03446054, + "balance_loss_mlp": 1.01858199, + "epoch": 0.7982564256726289, + "flos": 25884914325120.0, + "grad_norm": 1.804313446176304, + "language_loss": 0.61235017, + "learning_rate": 4.118620036501945e-07, + "loss": 0.63333035, + "num_input_tokens_seen": 286426160, + "step": 13277, + "time_per_iteration": 2.7913875579833984 + }, + { + "auxiliary_loss_clip": 0.0108732, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.03842235, + "balance_loss_mlp": 1.0209415, + "epoch": 0.7983165489252969, + "flos": 25739152934400.0, + "grad_norm": 1.9796578843322905, + "language_loss": 0.79335415, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.81456405, + "num_input_tokens_seen": 286446610, + "step": 13278, + "time_per_iteration": 2.69783353805542 + }, + { + "auxiliary_loss_clip": 0.01089196, + "auxiliary_loss_mlp": 0.01040766, + "balance_loss_clip": 1.03595579, + "balance_loss_mlp": 1.0271821, + "epoch": 0.7983766721779648, + "flos": 21908202768000.0, + "grad_norm": 1.9939125008903142, + "language_loss": 0.62796175, + "learning_rate": 4.113886729662768e-07, + "loss": 0.64926136, + "num_input_tokens_seen": 286465460, + "step": 13279, + "time_per_iteration": 2.6455893516540527 + }, + { + "auxiliary_loss_clip": 0.01093985, + "auxiliary_loss_mlp": 0.01029527, + "balance_loss_clip": 1.03608727, + "balance_loss_mlp": 1.01845241, + "epoch": 0.7984367954306328, + "flos": 29347420734720.0, + "grad_norm": 2.0437348521937633, + "language_loss": 0.71019673, + "learning_rate": 4.111520979802825e-07, + "loss": 0.73143184, + "num_input_tokens_seen": 286485720, + "step": 13280, + "time_per_iteration": 5.853861093521118 + }, + { + "auxiliary_loss_clip": 0.01071418, + "auxiliary_loss_mlp": 0.01042718, + "balance_loss_clip": 1.03524828, + "balance_loss_mlp": 1.02807951, + "epoch": 0.7984969186833007, + "flos": 31357772104320.0, + "grad_norm": 1.7977133455003094, + "language_loss": 0.62786448, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.64900589, + "num_input_tokens_seen": 286507465, + "step": 13281, + "time_per_iteration": 2.8363935947418213 + }, + { + "auxiliary_loss_clip": 0.01098858, + "auxiliary_loss_mlp": 0.01032968, + "balance_loss_clip": 1.03470564, + "balance_loss_mlp": 1.0203495, + "epoch": 0.7985570419359688, + "flos": 24312924535680.0, + "grad_norm": 2.135522103798107, + "language_loss": 0.80706322, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.82838148, + "num_input_tokens_seen": 286526345, + "step": 13282, + "time_per_iteration": 4.3146162033081055 + }, + { + "auxiliary_loss_clip": 0.01075396, + "auxiliary_loss_mlp": 0.00770211, + "balance_loss_clip": 1.03265977, + "balance_loss_mlp": 1.00023508, + "epoch": 0.7986171651886367, + "flos": 15742233175680.0, + "grad_norm": 1.7496465983827643, + "language_loss": 0.71291137, + "learning_rate": 4.10442734553802e-07, + "loss": 0.73136741, + "num_input_tokens_seen": 286544095, + "step": 13283, + "time_per_iteration": 2.7113521099090576 + }, + { + "auxiliary_loss_clip": 0.01094572, + "auxiliary_loss_mlp": 0.01026527, + "balance_loss_clip": 1.03506041, + "balance_loss_mlp": 1.01502371, + "epoch": 0.7986772884413047, + "flos": 11619401091840.0, + "grad_norm": 1.8142883767804951, + "language_loss": 0.73932701, + "learning_rate": 4.102064006186967e-07, + "loss": 0.76053798, + "num_input_tokens_seen": 286560960, + "step": 13284, + "time_per_iteration": 2.690788984298706 + }, + { + "auxiliary_loss_clip": 0.01081168, + "auxiliary_loss_mlp": 0.01036951, + "balance_loss_clip": 1.03430653, + "balance_loss_mlp": 1.02556038, + "epoch": 0.7987374116939726, + "flos": 22091059929600.0, + "grad_norm": 2.8983502428316252, + "language_loss": 0.70378709, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.72496831, + "num_input_tokens_seen": 286579865, + "step": 13285, + "time_per_iteration": 2.6703269481658936 + }, + { + "auxiliary_loss_clip": 0.01080639, + "auxiliary_loss_mlp": 0.01033039, + "balance_loss_clip": 1.03381705, + "balance_loss_mlp": 1.02097476, + "epoch": 0.7987975349466406, + "flos": 17890696339200.0, + "grad_norm": 1.6695326991809423, + "language_loss": 0.7404871, + "learning_rate": 4.097339136128437e-07, + "loss": 0.76162386, + "num_input_tokens_seen": 286597295, + "step": 13286, + "time_per_iteration": 2.663839817047119 + }, + { + "auxiliary_loss_clip": 0.01087446, + "auxiliary_loss_mlp": 0.01030474, + "balance_loss_clip": 1.03605843, + "balance_loss_mlp": 1.01811767, + "epoch": 0.7988576581993085, + "flos": 19719232041600.0, + "grad_norm": 1.9331179632037672, + "language_loss": 0.75270319, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.77388239, + "num_input_tokens_seen": 286616270, + "step": 13287, + "time_per_iteration": 2.6603620052337646 + }, + { + "auxiliary_loss_clip": 0.01086627, + "auxiliary_loss_mlp": 0.01029452, + "balance_loss_clip": 1.03799939, + "balance_loss_mlp": 1.01714361, + "epoch": 0.7989177814519766, + "flos": 28036358317440.0, + "grad_norm": 1.5251443213363312, + "language_loss": 0.61793303, + "learning_rate": 4.092616678191863e-07, + "loss": 0.63909382, + "num_input_tokens_seen": 286638315, + "step": 13288, + "time_per_iteration": 4.3285603523254395 + }, + { + "auxiliary_loss_clip": 0.01098321, + "auxiliary_loss_mlp": 0.01032216, + "balance_loss_clip": 1.03874183, + "balance_loss_mlp": 1.02039015, + "epoch": 0.7989779047046445, + "flos": 28871029630080.0, + "grad_norm": 2.003655756829568, + "language_loss": 0.70842254, + "learning_rate": 4.090256353993169e-07, + "loss": 0.72972792, + "num_input_tokens_seen": 286658630, + "step": 13289, + "time_per_iteration": 2.754244089126587 + }, + { + "auxiliary_loss_clip": 0.01077989, + "auxiliary_loss_mlp": 0.01036331, + "balance_loss_clip": 1.0401969, + "balance_loss_mlp": 1.02322364, + "epoch": 0.7990380279573125, + "flos": 18186887888640.0, + "grad_norm": 2.067060536008121, + "language_loss": 0.62479776, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.64594096, + "num_input_tokens_seen": 286676870, + "step": 13290, + "time_per_iteration": 2.7182984352111816 + }, + { + "auxiliary_loss_clip": 0.01102224, + "auxiliary_loss_mlp": 0.0103293, + "balance_loss_clip": 1.03841472, + "balance_loss_mlp": 1.01973963, + "epoch": 0.7990981512099805, + "flos": 20879936127360.0, + "grad_norm": 2.07432932467733, + "language_loss": 0.71562916, + "learning_rate": 4.08553751558248e-07, + "loss": 0.73698068, + "num_input_tokens_seen": 286694300, + "step": 13291, + "time_per_iteration": 2.679877281188965 + }, + { + "auxiliary_loss_clip": 0.01071725, + "auxiliary_loss_mlp": 0.01028448, + "balance_loss_clip": 1.03726125, + "balance_loss_mlp": 1.01692605, + "epoch": 0.7991582744626484, + "flos": 26099911180800.0, + "grad_norm": 1.4271226582537684, + "language_loss": 0.63687944, + "learning_rate": 4.083179001549422e-07, + "loss": 0.65788114, + "num_input_tokens_seen": 286714545, + "step": 13292, + "time_per_iteration": 2.7268645763397217 + }, + { + "auxiliary_loss_clip": 0.01097914, + "auxiliary_loss_mlp": 0.0103158, + "balance_loss_clip": 1.03674936, + "balance_loss_mlp": 1.0198257, + "epoch": 0.7992183977153164, + "flos": 35295843605760.0, + "grad_norm": 1.6084532273776246, + "language_loss": 0.56303227, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.58432722, + "num_input_tokens_seen": 286734525, + "step": 13293, + "time_per_iteration": 2.7652106285095215 + }, + { + "auxiliary_loss_clip": 0.0108332, + "auxiliary_loss_mlp": 0.0103505, + "balance_loss_clip": 1.03898919, + "balance_loss_mlp": 1.02236032, + "epoch": 0.7992785209679844, + "flos": 51853426577280.0, + "grad_norm": 6.931153518532829, + "language_loss": 0.71501821, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.73620194, + "num_input_tokens_seen": 286753430, + "step": 13294, + "time_per_iteration": 2.9734227657318115 + }, + { + "auxiliary_loss_clip": 0.01071635, + "auxiliary_loss_mlp": 0.01033492, + "balance_loss_clip": 1.03379464, + "balance_loss_mlp": 1.02083826, + "epoch": 0.7993386442206524, + "flos": 22565116650240.0, + "grad_norm": 1.9346589994202708, + "language_loss": 0.72097647, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.74202782, + "num_input_tokens_seen": 286771915, + "step": 13295, + "time_per_iteration": 2.8697874546051025 + }, + { + "auxiliary_loss_clip": 0.0107528, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.03569388, + "balance_loss_mlp": 1.02194977, + "epoch": 0.7993987674733203, + "flos": 18800277465600.0, + "grad_norm": 1.7062921905810151, + "language_loss": 0.75847328, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.77955961, + "num_input_tokens_seen": 286789835, + "step": 13296, + "time_per_iteration": 2.851438522338867 + }, + { + "auxiliary_loss_clip": 0.00998558, + "auxiliary_loss_mlp": 0.01004815, + "balance_loss_clip": 1.00716496, + "balance_loss_mlp": 1.00364101, + "epoch": 0.7994588907259883, + "flos": 69421720394880.0, + "grad_norm": 0.714455868109846, + "language_loss": 0.60823548, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.6282692, + "num_input_tokens_seen": 286855580, + "step": 13297, + "time_per_iteration": 3.307276725769043 + }, + { + "auxiliary_loss_clip": 0.01086945, + "auxiliary_loss_mlp": 0.01034042, + "balance_loss_clip": 1.03667986, + "balance_loss_mlp": 1.02218103, + "epoch": 0.7995190139786562, + "flos": 13480327883520.0, + "grad_norm": 2.2332538895333482, + "language_loss": 0.70562863, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.72683859, + "num_input_tokens_seen": 286874360, + "step": 13298, + "time_per_iteration": 2.764620542526245 + }, + { + "auxiliary_loss_clip": 0.01073541, + "auxiliary_loss_mlp": 0.01036123, + "balance_loss_clip": 1.03546071, + "balance_loss_mlp": 1.0225215, + "epoch": 0.7995791372313242, + "flos": 21652842003840.0, + "grad_norm": 2.144443690498565, + "language_loss": 0.75778526, + "learning_rate": 4.066686308212037e-07, + "loss": 0.77888191, + "num_input_tokens_seen": 286891950, + "step": 13299, + "time_per_iteration": 2.7200376987457275 + }, + { + "auxiliary_loss_clip": 0.0108171, + "auxiliary_loss_mlp": 0.01035616, + "balance_loss_clip": 1.03365636, + "balance_loss_mlp": 1.02388, + "epoch": 0.7996392604839921, + "flos": 26068130622720.0, + "grad_norm": 1.914646005951808, + "language_loss": 0.77740645, + "learning_rate": 4.064332625220828e-07, + "loss": 0.79857981, + "num_input_tokens_seen": 286911725, + "step": 13300, + "time_per_iteration": 3.0327885150909424 + }, + { + "auxiliary_loss_clip": 0.01066534, + "auxiliary_loss_mlp": 0.01041633, + "balance_loss_clip": 1.03083372, + "balance_loss_mlp": 1.02648187, + "epoch": 0.7996993837366602, + "flos": 24606889441920.0, + "grad_norm": 1.7486826819933081, + "language_loss": 0.6372295, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.65831113, + "num_input_tokens_seen": 286931400, + "step": 13301, + "time_per_iteration": 2.797971725463867 + }, + { + "auxiliary_loss_clip": 0.01096682, + "auxiliary_loss_mlp": 0.01034454, + "balance_loss_clip": 1.03674114, + "balance_loss_mlp": 1.02209187, + "epoch": 0.7997595069893281, + "flos": 20992049452800.0, + "grad_norm": 2.21348387588423, + "language_loss": 0.71967971, + "learning_rate": 4.059627072173928e-07, + "loss": 0.74099112, + "num_input_tokens_seen": 286949795, + "step": 13302, + "time_per_iteration": 2.874833822250366 + }, + { + "auxiliary_loss_clip": 0.01111886, + "auxiliary_loss_mlp": 0.00770697, + "balance_loss_clip": 1.03792214, + "balance_loss_mlp": 1.0001955, + "epoch": 0.7998196302419961, + "flos": 24426510318720.0, + "grad_norm": 2.0232516764799953, + "language_loss": 0.83735251, + "learning_rate": 4.057275202296684e-07, + "loss": 0.8561784, + "num_input_tokens_seen": 286968805, + "step": 13303, + "time_per_iteration": 2.73748779296875 + }, + { + "auxiliary_loss_clip": 0.01106654, + "auxiliary_loss_mlp": 0.01032892, + "balance_loss_clip": 1.03686202, + "balance_loss_mlp": 1.02197862, + "epoch": 0.7998797534946641, + "flos": 30264651457920.0, + "grad_norm": 1.7050821885070455, + "language_loss": 0.58436215, + "learning_rate": 4.054923936969166e-07, + "loss": 0.60575771, + "num_input_tokens_seen": 286990235, + "step": 13304, + "time_per_iteration": 2.6886215209960938 + }, + { + "auxiliary_loss_clip": 0.01111166, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.03615296, + "balance_loss_mlp": 1.01842976, + "epoch": 0.799939876747332, + "flos": 23513984277120.0, + "grad_norm": 1.709353821854052, + "language_loss": 0.6893419, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.71076536, + "num_input_tokens_seen": 287011060, + "step": 13305, + "time_per_iteration": 2.6649460792541504 + }, + { + "auxiliary_loss_clip": 0.01072914, + "auxiliary_loss_mlp": 0.01027062, + "balance_loss_clip": 1.0366323, + "balance_loss_mlp": 1.01584458, + "epoch": 0.8, + "flos": 19318109886720.0, + "grad_norm": 1.5790890199142242, + "language_loss": 0.69499552, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.71599531, + "num_input_tokens_seen": 287029215, + "step": 13306, + "time_per_iteration": 2.7563791275024414 + }, + { + "auxiliary_loss_clip": 0.01101067, + "auxiliary_loss_mlp": 0.0103442, + "balance_loss_clip": 1.03880584, + "balance_loss_mlp": 1.02221918, + "epoch": 0.800060123252668, + "flos": 32412432263040.0, + "grad_norm": 1.5764355485932124, + "language_loss": 0.69476044, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.71611536, + "num_input_tokens_seen": 287050855, + "step": 13307, + "time_per_iteration": 2.732285737991333 + }, + { + "auxiliary_loss_clip": 0.01085939, + "auxiliary_loss_mlp": 0.01036111, + "balance_loss_clip": 1.0350318, + "balance_loss_mlp": 1.02393389, + "epoch": 0.800120246505336, + "flos": 20010611168640.0, + "grad_norm": 1.8640379762112131, + "language_loss": 0.76623571, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.78745627, + "num_input_tokens_seen": 287069915, + "step": 13308, + "time_per_iteration": 2.642228603363037 + }, + { + "auxiliary_loss_clip": 0.01063897, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.03632379, + "balance_loss_mlp": 1.01817083, + "epoch": 0.8001803697580039, + "flos": 31868278151040.0, + "grad_norm": 1.4469096851593135, + "language_loss": 0.78943181, + "learning_rate": 4.0431766816972e-07, + "loss": 0.8103863, + "num_input_tokens_seen": 287091450, + "step": 13309, + "time_per_iteration": 2.864769697189331 + }, + { + "auxiliary_loss_clip": 0.01030417, + "auxiliary_loss_mlp": 0.01001696, + "balance_loss_clip": 1.00792837, + "balance_loss_mlp": 1.00063515, + "epoch": 0.8002404930106719, + "flos": 63392066916480.0, + "grad_norm": 0.9323209922385806, + "language_loss": 0.64716959, + "learning_rate": 4.040829045539571e-07, + "loss": 0.66749072, + "num_input_tokens_seen": 287148365, + "step": 13310, + "time_per_iteration": 3.1092755794525146 + }, + { + "auxiliary_loss_clip": 0.01098583, + "auxiliary_loss_mlp": 0.01033194, + "balance_loss_clip": 1.03659534, + "balance_loss_mlp": 1.02035546, + "epoch": 0.8003006162633398, + "flos": 27855476403840.0, + "grad_norm": 2.1579786023849445, + "language_loss": 0.82891053, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.85022825, + "num_input_tokens_seen": 287168280, + "step": 13311, + "time_per_iteration": 2.7086493968963623 + }, + { + "auxiliary_loss_clip": 0.0109936, + "auxiliary_loss_mlp": 0.01033907, + "balance_loss_clip": 1.03775251, + "balance_loss_mlp": 1.0216639, + "epoch": 0.8003607395160078, + "flos": 18223337214720.0, + "grad_norm": 1.9933201272328842, + "language_loss": 0.66162074, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.68295336, + "num_input_tokens_seen": 287185980, + "step": 13312, + "time_per_iteration": 2.680204153060913 + }, + { + "auxiliary_loss_clip": 0.01114636, + "auxiliary_loss_mlp": 0.01031944, + "balance_loss_clip": 1.04067063, + "balance_loss_mlp": 1.01846743, + "epoch": 0.8004208627686757, + "flos": 20886975192960.0, + "grad_norm": 2.3011865249501264, + "language_loss": 0.75151718, + "learning_rate": 4.033789768462843e-07, + "loss": 0.77298295, + "num_input_tokens_seen": 287203875, + "step": 13313, + "time_per_iteration": 2.606222629547119 + }, + { + "auxiliary_loss_clip": 0.0109515, + "auxiliary_loss_mlp": 0.01030962, + "balance_loss_clip": 1.03412461, + "balance_loss_mlp": 1.01851058, + "epoch": 0.8004809860213438, + "flos": 26436143416320.0, + "grad_norm": 1.3567607017939294, + "language_loss": 0.75564599, + "learning_rate": 4.031444553532575e-07, + "loss": 0.77690709, + "num_input_tokens_seen": 287226445, + "step": 13314, + "time_per_iteration": 2.6715898513793945 + }, + { + "auxiliary_loss_clip": 0.00988299, + "auxiliary_loss_mlp": 0.01000387, + "balance_loss_clip": 1.00804853, + "balance_loss_mlp": 0.99932635, + "epoch": 0.8005411092740117, + "flos": 63648612829440.0, + "grad_norm": 0.8122679233845669, + "language_loss": 0.53769958, + "learning_rate": 4.029099944131522e-07, + "loss": 0.55758643, + "num_input_tokens_seen": 287286240, + "step": 13315, + "time_per_iteration": 3.1782495975494385 + }, + { + "auxiliary_loss_clip": 0.01086886, + "auxiliary_loss_mlp": 0.01029216, + "balance_loss_clip": 1.03729582, + "balance_loss_mlp": 1.0172112, + "epoch": 0.8006012325266797, + "flos": 36138056774400.0, + "grad_norm": 1.6928178696023135, + "language_loss": 0.71341288, + "learning_rate": 4.026755940348603e-07, + "loss": 0.7345739, + "num_input_tokens_seen": 287310265, + "step": 13316, + "time_per_iteration": 2.7924816608428955 + }, + { + "auxiliary_loss_clip": 0.01091573, + "auxiliary_loss_mlp": 0.01030799, + "balance_loss_clip": 1.03969979, + "balance_loss_mlp": 1.0183655, + "epoch": 0.8006613557793477, + "flos": 33838947970560.0, + "grad_norm": 1.868325107289893, + "language_loss": 0.64874738, + "learning_rate": 4.024412542272706e-07, + "loss": 0.66997111, + "num_input_tokens_seen": 287331610, + "step": 13317, + "time_per_iteration": 2.7774088382720947 + }, + { + "auxiliary_loss_clip": 0.01029734, + "auxiliary_loss_mlp": 0.01001074, + "balance_loss_clip": 1.00732291, + "balance_loss_mlp": 1.00008476, + "epoch": 0.8007214790320156, + "flos": 67348310699520.0, + "grad_norm": 0.7791846864300481, + "language_loss": 0.59069222, + "learning_rate": 4.0220697499926783e-07, + "loss": 0.61100036, + "num_input_tokens_seen": 287394795, + "step": 13318, + "time_per_iteration": 4.755454778671265 + }, + { + "auxiliary_loss_clip": 0.01074086, + "auxiliary_loss_mlp": 0.01027358, + "balance_loss_clip": 1.03581715, + "balance_loss_mlp": 1.01549029, + "epoch": 0.8007816022846836, + "flos": 23185653033600.0, + "grad_norm": 1.8075855078848244, + "language_loss": 0.66764301, + "learning_rate": 4.019727563597366e-07, + "loss": 0.68865746, + "num_input_tokens_seen": 287414595, + "step": 13319, + "time_per_iteration": 4.444296360015869 + }, + { + "auxiliary_loss_clip": 0.0111121, + "auxiliary_loss_mlp": 0.00771312, + "balance_loss_clip": 1.03728712, + "balance_loss_mlp": 1.00022757, + "epoch": 0.8008417255373516, + "flos": 21981388728960.0, + "grad_norm": 1.8607859210030597, + "language_loss": 0.74157208, + "learning_rate": 4.0173859831755873e-07, + "loss": 0.76039732, + "num_input_tokens_seen": 287434395, + "step": 13320, + "time_per_iteration": 2.628570079803467 + }, + { + "auxiliary_loss_clip": 0.01097073, + "auxiliary_loss_mlp": 0.01026936, + "balance_loss_clip": 1.0365932, + "balance_loss_mlp": 1.01422882, + "epoch": 0.8009018487900196, + "flos": 16727334647040.0, + "grad_norm": 1.9175300817586667, + "language_loss": 0.80223489, + "learning_rate": 4.015045008816138e-07, + "loss": 0.823475, + "num_input_tokens_seen": 287450590, + "step": 13321, + "time_per_iteration": 4.052290201187134 + }, + { + "auxiliary_loss_clip": 0.01033155, + "auxiliary_loss_mlp": 0.01036586, + "balance_loss_clip": 1.02668345, + "balance_loss_mlp": 1.02364588, + "epoch": 0.8009619720426875, + "flos": 20813609664000.0, + "grad_norm": 1.8862095260452836, + "language_loss": 0.66014248, + "learning_rate": 4.0127046406077825e-07, + "loss": 0.6808399, + "num_input_tokens_seen": 287468455, + "step": 13322, + "time_per_iteration": 2.7416417598724365 + }, + { + "auxiliary_loss_clip": 0.01099704, + "auxiliary_loss_mlp": 0.01028354, + "balance_loss_clip": 1.03734875, + "balance_loss_mlp": 1.01642156, + "epoch": 0.8010220952953555, + "flos": 17931096161280.0, + "grad_norm": 1.9049008418549798, + "language_loss": 0.77709258, + "learning_rate": 4.010364878639265e-07, + "loss": 0.79837316, + "num_input_tokens_seen": 287486485, + "step": 13323, + "time_per_iteration": 2.6071035861968994 + }, + { + "auxiliary_loss_clip": 0.01110946, + "auxiliary_loss_mlp": 0.01029696, + "balance_loss_clip": 1.03769231, + "balance_loss_mlp": 1.01716661, + "epoch": 0.8010822185480234, + "flos": 24572235795840.0, + "grad_norm": 2.445337752212116, + "language_loss": 0.71122754, + "learning_rate": 4.00802572299932e-07, + "loss": 0.73263395, + "num_input_tokens_seen": 287503940, + "step": 13324, + "time_per_iteration": 2.6217870712280273 + }, + { + "auxiliary_loss_clip": 0.01068071, + "auxiliary_loss_mlp": 0.0103353, + "balance_loss_clip": 1.03280735, + "balance_loss_mlp": 1.02047682, + "epoch": 0.8011423418006914, + "flos": 21829988903040.0, + "grad_norm": 1.814435796416432, + "language_loss": 0.76471907, + "learning_rate": 4.005687173776635e-07, + "loss": 0.78573507, + "num_input_tokens_seen": 287521660, + "step": 13325, + "time_per_iteration": 2.6970367431640625 + }, + { + "auxiliary_loss_clip": 0.01084618, + "auxiliary_loss_mlp": 0.01027508, + "balance_loss_clip": 1.03447258, + "balance_loss_mlp": 1.01634359, + "epoch": 0.8012024650533593, + "flos": 23915178259200.0, + "grad_norm": 1.5321170582331973, + "language_loss": 0.7980848, + "learning_rate": 4.003349231059898e-07, + "loss": 0.81920606, + "num_input_tokens_seen": 287541505, + "step": 13326, + "time_per_iteration": 2.6341090202331543 + }, + { + "auxiliary_loss_clip": 0.0109705, + "auxiliary_loss_mlp": 0.01032819, + "balance_loss_clip": 1.03666115, + "balance_loss_mlp": 1.02096391, + "epoch": 0.8012625883060274, + "flos": 23587062497280.0, + "grad_norm": 1.9170928763125719, + "language_loss": 0.65865368, + "learning_rate": 4.001011894937765e-07, + "loss": 0.67995238, + "num_input_tokens_seen": 287560015, + "step": 13327, + "time_per_iteration": 4.200170278549194 + }, + { + "auxiliary_loss_clip": 0.01094832, + "auxiliary_loss_mlp": 0.01031408, + "balance_loss_clip": 1.03746152, + "balance_loss_mlp": 1.02033961, + "epoch": 0.8013227115586953, + "flos": 20813932886400.0, + "grad_norm": 1.5945061628863433, + "language_loss": 0.73482913, + "learning_rate": 3.9986751654988636e-07, + "loss": 0.75609159, + "num_input_tokens_seen": 287579150, + "step": 13328, + "time_per_iteration": 2.598289966583252 + }, + { + "auxiliary_loss_clip": 0.01050876, + "auxiliary_loss_mlp": 0.01034952, + "balance_loss_clip": 1.03355122, + "balance_loss_mlp": 1.02166045, + "epoch": 0.8013828348113633, + "flos": 15888317788800.0, + "grad_norm": 1.9762883167731011, + "language_loss": 0.73578757, + "learning_rate": 3.996339042831798e-07, + "loss": 0.7566458, + "num_input_tokens_seen": 287597420, + "step": 13329, + "time_per_iteration": 2.738548994064331 + }, + { + "auxiliary_loss_clip": 0.0102058, + "auxiliary_loss_mlp": 0.00999735, + "balance_loss_clip": 1.0074687, + "balance_loss_mlp": 0.99866766, + "epoch": 0.8014429580640313, + "flos": 71062981562880.0, + "grad_norm": 0.6934041027763224, + "language_loss": 0.52926564, + "learning_rate": 3.9940035270251605e-07, + "loss": 0.54946882, + "num_input_tokens_seen": 287667280, + "step": 13330, + "time_per_iteration": 3.3172037601470947 + }, + { + "auxiliary_loss_clip": 0.01083958, + "auxiliary_loss_mlp": 0.01037459, + "balance_loss_clip": 1.03489339, + "balance_loss_mlp": 1.02364862, + "epoch": 0.8015030813166992, + "flos": 23076340968960.0, + "grad_norm": 1.7329849942476805, + "language_loss": 0.7308808, + "learning_rate": 3.991668618167519e-07, + "loss": 0.75209498, + "num_input_tokens_seen": 287687375, + "step": 13331, + "time_per_iteration": 2.7093939781188965 + }, + { + "auxiliary_loss_clip": 0.01091699, + "auxiliary_loss_mlp": 0.01029361, + "balance_loss_clip": 1.03614366, + "balance_loss_mlp": 1.01829839, + "epoch": 0.8015632045693672, + "flos": 21872328059520.0, + "grad_norm": 1.8780665842935151, + "language_loss": 0.77335048, + "learning_rate": 3.989334316347401e-07, + "loss": 0.79456115, + "num_input_tokens_seen": 287707895, + "step": 13332, + "time_per_iteration": 2.708766460418701 + }, + { + "auxiliary_loss_clip": 0.01110082, + "auxiliary_loss_mlp": 0.01032572, + "balance_loss_clip": 1.03853345, + "balance_loss_mlp": 1.02041256, + "epoch": 0.8016233278220352, + "flos": 23656728925440.0, + "grad_norm": 1.9285581629240347, + "language_loss": 0.83625793, + "learning_rate": 3.987000621653338e-07, + "loss": 0.85768449, + "num_input_tokens_seen": 287723990, + "step": 13333, + "time_per_iteration": 2.6203196048736572 + }, + { + "auxiliary_loss_clip": 0.01088802, + "auxiliary_loss_mlp": 0.01032312, + "balance_loss_clip": 1.03681588, + "balance_loss_mlp": 1.02005112, + "epoch": 0.8016834510747032, + "flos": 16253170185600.0, + "grad_norm": 2.0639926273292115, + "language_loss": 0.73560673, + "learning_rate": 3.9846675341738133e-07, + "loss": 0.75681788, + "num_input_tokens_seen": 287742380, + "step": 13334, + "time_per_iteration": 2.674370765686035 + }, + { + "auxiliary_loss_clip": 0.01068855, + "auxiliary_loss_mlp": 0.01038369, + "balance_loss_clip": 1.03341401, + "balance_loss_mlp": 1.02465343, + "epoch": 0.8017435743273711, + "flos": 12276027665280.0, + "grad_norm": 3.712822925491278, + "language_loss": 0.7483573, + "learning_rate": 3.9823350539972967e-07, + "loss": 0.7694295, + "num_input_tokens_seen": 287760130, + "step": 13335, + "time_per_iteration": 2.661638021469116 + }, + { + "auxiliary_loss_clip": 0.01067475, + "auxiliary_loss_mlp": 0.01033285, + "balance_loss_clip": 1.03284895, + "balance_loss_mlp": 1.02039289, + "epoch": 0.8018036975800391, + "flos": 17196112068480.0, + "grad_norm": 1.8858612114976723, + "language_loss": 0.75267804, + "learning_rate": 3.9800031812122416e-07, + "loss": 0.77368569, + "num_input_tokens_seen": 287777565, + "step": 13336, + "time_per_iteration": 2.716108560562134 + }, + { + "auxiliary_loss_clip": 0.01077828, + "auxiliary_loss_mlp": 0.01037534, + "balance_loss_clip": 1.03872991, + "balance_loss_mlp": 1.02433777, + "epoch": 0.801863820832707, + "flos": 20631865824000.0, + "grad_norm": 2.2915329222004153, + "language_loss": 0.75145626, + "learning_rate": 3.977671915907068e-07, + "loss": 0.77260983, + "num_input_tokens_seen": 287796310, + "step": 13337, + "time_per_iteration": 2.714571237564087 + }, + { + "auxiliary_loss_clip": 0.0105226, + "auxiliary_loss_mlp": 0.00771062, + "balance_loss_clip": 1.03701448, + "balance_loss_mlp": 1.00021958, + "epoch": 0.801923944085375, + "flos": 30445569285120.0, + "grad_norm": 1.6114282506426694, + "language_loss": 0.80135483, + "learning_rate": 3.9753412581701883e-07, + "loss": 0.81958807, + "num_input_tokens_seen": 287817330, + "step": 13338, + "time_per_iteration": 2.8196728229522705 + }, + { + "auxiliary_loss_clip": 0.01073348, + "auxiliary_loss_mlp": 0.01033358, + "balance_loss_clip": 1.03255105, + "balance_loss_mlp": 1.01937521, + "epoch": 0.801984067338043, + "flos": 20010575255040.0, + "grad_norm": 1.8585955829202727, + "language_loss": 0.74602437, + "learning_rate": 3.9730112080899733e-07, + "loss": 0.76709145, + "num_input_tokens_seen": 287835095, + "step": 13339, + "time_per_iteration": 2.6212968826293945 + }, + { + "auxiliary_loss_clip": 0.01096453, + "auxiliary_loss_mlp": 0.01029361, + "balance_loss_clip": 1.03771079, + "balance_loss_mlp": 1.01769042, + "epoch": 0.802044190590711, + "flos": 22784028088320.0, + "grad_norm": 1.7386931657461442, + "language_loss": 0.79321545, + "learning_rate": 3.970681765754775e-07, + "loss": 0.81447363, + "num_input_tokens_seen": 287854595, + "step": 13340, + "time_per_iteration": 2.6530919075012207 + }, + { + "auxiliary_loss_clip": 0.01083163, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.04112291, + "balance_loss_mlp": 1.02116799, + "epoch": 0.8021043138433789, + "flos": 27600115639680.0, + "grad_norm": 1.956756887496364, + "language_loss": 0.68165088, + "learning_rate": 3.968352931252936e-07, + "loss": 0.70280981, + "num_input_tokens_seen": 287876960, + "step": 13341, + "time_per_iteration": 2.75055193901062 + }, + { + "auxiliary_loss_clip": 0.01012323, + "auxiliary_loss_mlp": 0.01007998, + "balance_loss_clip": 1.00806713, + "balance_loss_mlp": 1.00693703, + "epoch": 0.8021644370960469, + "flos": 62063730057600.0, + "grad_norm": 0.8136387701822201, + "language_loss": 0.61581981, + "learning_rate": 3.9660247046727547e-07, + "loss": 0.63602304, + "num_input_tokens_seen": 287936530, + "step": 13342, + "time_per_iteration": 3.1247668266296387 + }, + { + "auxiliary_loss_clip": 0.01092566, + "auxiliary_loss_mlp": 0.01037048, + "balance_loss_clip": 1.03939772, + "balance_loss_mlp": 1.02370882, + "epoch": 0.8022245603487148, + "flos": 23361794352000.0, + "grad_norm": 1.856395424623049, + "language_loss": 0.63709104, + "learning_rate": 3.963697086102522e-07, + "loss": 0.65838718, + "num_input_tokens_seen": 287954285, + "step": 13343, + "time_per_iteration": 2.7734808921813965 + }, + { + "auxiliary_loss_clip": 0.01081526, + "auxiliary_loss_mlp": 0.01029916, + "balance_loss_clip": 1.03520012, + "balance_loss_mlp": 1.01859128, + "epoch": 0.8022846836013828, + "flos": 10853354712960.0, + "grad_norm": 2.8925242692111124, + "language_loss": 0.68967628, + "learning_rate": 3.96137007563051e-07, + "loss": 0.71079069, + "num_input_tokens_seen": 287971595, + "step": 13344, + "time_per_iteration": 2.7123825550079346 + }, + { + "auxiliary_loss_clip": 0.01099765, + "auxiliary_loss_mlp": 0.0102957, + "balance_loss_clip": 1.03843033, + "balance_loss_mlp": 1.01712489, + "epoch": 0.8023448068540509, + "flos": 29240443054080.0, + "grad_norm": 1.7802127623764623, + "language_loss": 0.7023524, + "learning_rate": 3.9590436733449506e-07, + "loss": 0.72364575, + "num_input_tokens_seen": 287992540, + "step": 13345, + "time_per_iteration": 2.7695276737213135 + }, + { + "auxiliary_loss_clip": 0.01013378, + "auxiliary_loss_mlp": 0.01005433, + "balance_loss_clip": 1.00990939, + "balance_loss_mlp": 1.00426471, + "epoch": 0.8024049301067188, + "flos": 64153588181760.0, + "grad_norm": 0.8891261669037472, + "language_loss": 0.62973511, + "learning_rate": 3.956717879334059e-07, + "loss": 0.64992326, + "num_input_tokens_seen": 288052810, + "step": 13346, + "time_per_iteration": 3.28011417388916 + }, + { + "auxiliary_loss_clip": 0.01084414, + "auxiliary_loss_mlp": 0.01031015, + "balance_loss_clip": 1.03860998, + "balance_loss_mlp": 1.01868272, + "epoch": 0.8024650533593868, + "flos": 28585360765440.0, + "grad_norm": 2.315219650527018, + "language_loss": 0.72604311, + "learning_rate": 3.9543926936860327e-07, + "loss": 0.74719733, + "num_input_tokens_seen": 288073045, + "step": 13347, + "time_per_iteration": 2.7291135787963867 + }, + { + "auxiliary_loss_clip": 0.01098598, + "auxiliary_loss_mlp": 0.01032216, + "balance_loss_clip": 1.0363127, + "balance_loss_mlp": 1.01959181, + "epoch": 0.8025251766120547, + "flos": 16982264448000.0, + "grad_norm": 1.7959998434769961, + "language_loss": 0.72794473, + "learning_rate": 3.9520681164890493e-07, + "loss": 0.74925292, + "num_input_tokens_seen": 288091165, + "step": 13348, + "time_per_iteration": 2.623680353164673 + }, + { + "auxiliary_loss_clip": 0.01083208, + "auxiliary_loss_mlp": 0.01030443, + "balance_loss_clip": 1.03903532, + "balance_loss_mlp": 1.0179677, + "epoch": 0.8025852998647227, + "flos": 22163671272960.0, + "grad_norm": 3.547597089549619, + "language_loss": 0.75893748, + "learning_rate": 3.9497441478312444e-07, + "loss": 0.780074, + "num_input_tokens_seen": 288110595, + "step": 13349, + "time_per_iteration": 2.658114433288574 + }, + { + "auxiliary_loss_clip": 0.01110466, + "auxiliary_loss_mlp": 0.01034774, + "balance_loss_clip": 1.03946209, + "balance_loss_mlp": 1.02321064, + "epoch": 0.8026454231173906, + "flos": 22017012042240.0, + "grad_norm": 2.407592259971092, + "language_loss": 0.83429128, + "learning_rate": 3.947420787800755e-07, + "loss": 0.85574365, + "num_input_tokens_seen": 288128995, + "step": 13350, + "time_per_iteration": 2.6693131923675537 + }, + { + "auxiliary_loss_clip": 0.01100877, + "auxiliary_loss_mlp": 0.01036159, + "balance_loss_clip": 1.03946972, + "balance_loss_mlp": 1.02371919, + "epoch": 0.8027055463700586, + "flos": 22491320158080.0, + "grad_norm": 2.406570051160922, + "language_loss": 0.71667969, + "learning_rate": 3.945098036485679e-07, + "loss": 0.7380501, + "num_input_tokens_seen": 288149265, + "step": 13351, + "time_per_iteration": 2.6675031185150146 + }, + { + "auxiliary_loss_clip": 0.01069791, + "auxiliary_loss_mlp": 0.01034939, + "balance_loss_clip": 1.03470254, + "balance_loss_mlp": 1.02237439, + "epoch": 0.8027656696227266, + "flos": 28912901909760.0, + "grad_norm": 1.586066811433664, + "language_loss": 0.61656845, + "learning_rate": 3.9427758939740885e-07, + "loss": 0.63761568, + "num_input_tokens_seen": 288170745, + "step": 13352, + "time_per_iteration": 2.8598105907440186 + }, + { + "auxiliary_loss_clip": 0.01096816, + "auxiliary_loss_mlp": 0.0103672, + "balance_loss_clip": 1.03765738, + "balance_loss_mlp": 1.02495408, + "epoch": 0.8028257928753946, + "flos": 18589374760320.0, + "grad_norm": 2.4706643643447346, + "language_loss": 0.76973253, + "learning_rate": 3.940454360354046e-07, + "loss": 0.79106784, + "num_input_tokens_seen": 288189415, + "step": 13353, + "time_per_iteration": 2.6434624195098877 + }, + { + "auxiliary_loss_clip": 0.01052438, + "auxiliary_loss_mlp": 0.01029892, + "balance_loss_clip": 1.03585696, + "balance_loss_mlp": 1.01597464, + "epoch": 0.8028859161280625, + "flos": 19130009339520.0, + "grad_norm": 2.077119473640625, + "language_loss": 0.73317617, + "learning_rate": 3.938133435713582e-07, + "loss": 0.75399947, + "num_input_tokens_seen": 288206900, + "step": 13354, + "time_per_iteration": 2.980314254760742 + }, + { + "auxiliary_loss_clip": 0.01069099, + "auxiliary_loss_mlp": 0.01040669, + "balance_loss_clip": 1.03414679, + "balance_loss_mlp": 1.02725756, + "epoch": 0.8029460393807305, + "flos": 20229881742720.0, + "grad_norm": 2.1378474316121463, + "language_loss": 0.65846258, + "learning_rate": 3.935813120140714e-07, + "loss": 0.6795603, + "num_input_tokens_seen": 288224800, + "step": 13355, + "time_per_iteration": 2.7468628883361816 + }, + { + "auxiliary_loss_clip": 0.01074013, + "auxiliary_loss_mlp": 0.01033759, + "balance_loss_clip": 1.03249073, + "balance_loss_mlp": 1.0199666, + "epoch": 0.8030061626333984, + "flos": 49783320933120.0, + "grad_norm": 2.375725357962007, + "language_loss": 0.68678093, + "learning_rate": 3.9334934137234235e-07, + "loss": 0.70785862, + "num_input_tokens_seen": 288249400, + "step": 13356, + "time_per_iteration": 2.9967265129089355 + }, + { + "auxiliary_loss_clip": 0.01069606, + "auxiliary_loss_mlp": 0.01029553, + "balance_loss_clip": 1.04181337, + "balance_loss_mlp": 1.01715517, + "epoch": 0.8030662858860664, + "flos": 21615243442560.0, + "grad_norm": 1.6420809304717021, + "language_loss": 0.77664089, + "learning_rate": 3.931174316549666e-07, + "loss": 0.79763246, + "num_input_tokens_seen": 288268780, + "step": 13357, + "time_per_iteration": 4.406202077865601 + }, + { + "auxiliary_loss_clip": 0.01074511, + "auxiliary_loss_mlp": 0.01032493, + "balance_loss_clip": 1.03333926, + "balance_loss_mlp": 1.01853395, + "epoch": 0.8031264091387345, + "flos": 25630056351360.0, + "grad_norm": 1.4188016653576663, + "language_loss": 0.77055764, + "learning_rate": 3.9288558287073937e-07, + "loss": 0.79162776, + "num_input_tokens_seen": 288290830, + "step": 13358, + "time_per_iteration": 4.418318033218384 + }, + { + "auxiliary_loss_clip": 0.01097306, + "auxiliary_loss_mlp": 0.01028987, + "balance_loss_clip": 1.03661919, + "balance_loss_mlp": 1.01740575, + "epoch": 0.8031865323914024, + "flos": 19646225648640.0, + "grad_norm": 1.5140849812100452, + "language_loss": 0.84604448, + "learning_rate": 3.9265379502845143e-07, + "loss": 0.86730748, + "num_input_tokens_seen": 288308865, + "step": 13359, + "time_per_iteration": 2.6452579498291016 + }, + { + "auxiliary_loss_clip": 0.01081667, + "auxiliary_loss_mlp": 0.01025679, + "balance_loss_clip": 1.03710377, + "balance_loss_mlp": 1.01406813, + "epoch": 0.8032466556440704, + "flos": 26169110732160.0, + "grad_norm": 1.8313021321254959, + "language_loss": 0.73854876, + "learning_rate": 3.924220681368928e-07, + "loss": 0.75962222, + "num_input_tokens_seen": 288327325, + "step": 13360, + "time_per_iteration": 2.7636659145355225 + }, + { + "auxiliary_loss_clip": 0.01110485, + "auxiliary_loss_mlp": 0.01027583, + "balance_loss_clip": 1.03801131, + "balance_loss_mlp": 1.01598358, + "epoch": 0.8033067788967383, + "flos": 25520026014720.0, + "grad_norm": 2.137959732287125, + "language_loss": 0.69831038, + "learning_rate": 3.921904022048512e-07, + "loss": 0.71969098, + "num_input_tokens_seen": 288347285, + "step": 13361, + "time_per_iteration": 4.267240524291992 + }, + { + "auxiliary_loss_clip": 0.01112515, + "auxiliary_loss_mlp": 0.01035596, + "balance_loss_clip": 1.03754067, + "balance_loss_mlp": 1.02316272, + "epoch": 0.8033669021494063, + "flos": 24024274842240.0, + "grad_norm": 1.8009112987643567, + "language_loss": 0.70254129, + "learning_rate": 3.919587972411098e-07, + "loss": 0.72402239, + "num_input_tokens_seen": 288367785, + "step": 13362, + "time_per_iteration": 2.6688599586486816 + }, + { + "auxiliary_loss_clip": 0.01116592, + "auxiliary_loss_mlp": 0.0103705, + "balance_loss_clip": 1.03921294, + "balance_loss_mlp": 1.02289987, + "epoch": 0.8034270254020742, + "flos": 13588059749760.0, + "grad_norm": 2.3399431246123132, + "language_loss": 0.78629005, + "learning_rate": 3.91727253254452e-07, + "loss": 0.80782652, + "num_input_tokens_seen": 288384135, + "step": 13363, + "time_per_iteration": 2.597430944442749 + }, + { + "auxiliary_loss_clip": 0.01097254, + "auxiliary_loss_mlp": 0.01028939, + "balance_loss_clip": 1.03588295, + "balance_loss_mlp": 1.01675546, + "epoch": 0.8034871486547422, + "flos": 27412661537280.0, + "grad_norm": 6.139321315724576, + "language_loss": 0.74428964, + "learning_rate": 3.9149577025365787e-07, + "loss": 0.76555157, + "num_input_tokens_seen": 288403805, + "step": 13364, + "time_per_iteration": 2.688309669494629 + }, + { + "auxiliary_loss_clip": 0.01096585, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.03990126, + "balance_loss_mlp": 1.0187124, + "epoch": 0.8035472719074102, + "flos": 32598593475840.0, + "grad_norm": 2.077806609776792, + "language_loss": 0.61057466, + "learning_rate": 3.9126434824750596e-07, + "loss": 0.63184774, + "num_input_tokens_seen": 288424895, + "step": 13365, + "time_per_iteration": 2.9324018955230713 + }, + { + "auxiliary_loss_clip": 0.01089765, + "auxiliary_loss_mlp": 0.01034058, + "balance_loss_clip": 1.03685248, + "balance_loss_mlp": 1.02096307, + "epoch": 0.8036073951600782, + "flos": 21287989607040.0, + "grad_norm": 1.971281433653639, + "language_loss": 0.66274738, + "learning_rate": 3.910329872447706e-07, + "loss": 0.68398559, + "num_input_tokens_seen": 288443865, + "step": 13366, + "time_per_iteration": 4.221456050872803 + }, + { + "auxiliary_loss_clip": 0.01106605, + "auxiliary_loss_mlp": 0.01031218, + "balance_loss_clip": 1.03672922, + "balance_loss_mlp": 1.01938665, + "epoch": 0.8036675184127461, + "flos": 18113845582080.0, + "grad_norm": 1.997291212558988, + "language_loss": 0.74654198, + "learning_rate": 3.908016872542259e-07, + "loss": 0.7679202, + "num_input_tokens_seen": 288461065, + "step": 13367, + "time_per_iteration": 2.6199634075164795 + }, + { + "auxiliary_loss_clip": 0.01108205, + "auxiliary_loss_mlp": 0.01027463, + "balance_loss_clip": 1.03705049, + "balance_loss_mlp": 1.01538706, + "epoch": 0.8037276416654141, + "flos": 26030280666240.0, + "grad_norm": 1.603723731254455, + "language_loss": 0.73827767, + "learning_rate": 3.905704482846428e-07, + "loss": 0.75963438, + "num_input_tokens_seen": 288481865, + "step": 13368, + "time_per_iteration": 2.6368210315704346 + }, + { + "auxiliary_loss_clip": 0.01110551, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.03718567, + "balance_loss_mlp": 1.01879573, + "epoch": 0.803787764918082, + "flos": 18802180886400.0, + "grad_norm": 1.9805811504758806, + "language_loss": 0.70231676, + "learning_rate": 3.90339270344789e-07, + "loss": 0.72373503, + "num_input_tokens_seen": 288499345, + "step": 13369, + "time_per_iteration": 2.5763745307922363 + }, + { + "auxiliary_loss_clip": 0.01088545, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.03672147, + "balance_loss_mlp": 1.02170682, + "epoch": 0.80384788817075, + "flos": 20225787592320.0, + "grad_norm": 2.6910439854166466, + "language_loss": 0.73273438, + "learning_rate": 3.901081534434312e-07, + "loss": 0.75395083, + "num_input_tokens_seen": 288517660, + "step": 13370, + "time_per_iteration": 2.748764753341675 + }, + { + "auxiliary_loss_clip": 0.01087131, + "auxiliary_loss_mlp": 0.01032078, + "balance_loss_clip": 1.03560901, + "balance_loss_mlp": 1.01849425, + "epoch": 0.8039080114234181, + "flos": 18515290959360.0, + "grad_norm": 2.9168856587883987, + "language_loss": 0.86785686, + "learning_rate": 3.898770975893342e-07, + "loss": 0.88904893, + "num_input_tokens_seen": 288534180, + "step": 13371, + "time_per_iteration": 2.7640862464904785 + }, + { + "auxiliary_loss_clip": 0.01100956, + "auxiliary_loss_mlp": 0.01032312, + "balance_loss_clip": 1.03582239, + "balance_loss_mlp": 1.0192045, + "epoch": 0.803968134676086, + "flos": 22382510883840.0, + "grad_norm": 1.8421468200068354, + "language_loss": 0.75026673, + "learning_rate": 3.89646102791259e-07, + "loss": 0.77159941, + "num_input_tokens_seen": 288553350, + "step": 13372, + "time_per_iteration": 2.724491596221924 + }, + { + "auxiliary_loss_clip": 0.01068816, + "auxiliary_loss_mlp": 0.01031309, + "balance_loss_clip": 1.03654325, + "balance_loss_mlp": 1.01796961, + "epoch": 0.804028257928754, + "flos": 23842566915840.0, + "grad_norm": 2.31065628339188, + "language_loss": 0.79036891, + "learning_rate": 3.894151690579646e-07, + "loss": 0.81137019, + "num_input_tokens_seen": 288571325, + "step": 13373, + "time_per_iteration": 2.910059928894043 + }, + { + "auxiliary_loss_clip": 0.01081798, + "auxiliary_loss_mlp": 0.01035447, + "balance_loss_clip": 1.03376925, + "balance_loss_mlp": 1.02387166, + "epoch": 0.8040883811814219, + "flos": 23550720912000.0, + "grad_norm": 1.7053581559181326, + "language_loss": 0.74311471, + "learning_rate": 3.8918429639820815e-07, + "loss": 0.76428711, + "num_input_tokens_seen": 288592100, + "step": 13374, + "time_per_iteration": 2.698894500732422 + }, + { + "auxiliary_loss_clip": 0.01059369, + "auxiliary_loss_mlp": 0.01037575, + "balance_loss_clip": 1.03106141, + "balance_loss_mlp": 1.02297187, + "epoch": 0.8041485044340899, + "flos": 19026263882880.0, + "grad_norm": 1.889259029929228, + "language_loss": 0.6848501, + "learning_rate": 3.889534848207452e-07, + "loss": 0.70581961, + "num_input_tokens_seen": 288612305, + "step": 13375, + "time_per_iteration": 2.781163215637207 + }, + { + "auxiliary_loss_clip": 0.01008954, + "auxiliary_loss_mlp": 0.01001942, + "balance_loss_clip": 1.01513779, + "balance_loss_mlp": 1.00076795, + "epoch": 0.8042086276867578, + "flos": 70005663797760.0, + "grad_norm": 0.7220073692688251, + "language_loss": 0.55658954, + "learning_rate": 3.887227343343271e-07, + "loss": 0.57669854, + "num_input_tokens_seen": 288676015, + "step": 13376, + "time_per_iteration": 3.373588800430298 + }, + { + "auxiliary_loss_clip": 0.01056178, + "auxiliary_loss_mlp": 0.01035137, + "balance_loss_clip": 1.03220654, + "balance_loss_mlp": 1.02096248, + "epoch": 0.8042687509394258, + "flos": 21872435800320.0, + "grad_norm": 1.964983939907185, + "language_loss": 0.72909731, + "learning_rate": 3.8849204494770425e-07, + "loss": 0.75001043, + "num_input_tokens_seen": 288696455, + "step": 13377, + "time_per_iteration": 2.8420963287353516 + }, + { + "auxiliary_loss_clip": 0.01095422, + "auxiliary_loss_mlp": 0.01029914, + "balance_loss_clip": 1.03510308, + "balance_loss_mlp": 1.01737309, + "epoch": 0.8043288741920938, + "flos": 26614870513920.0, + "grad_norm": 1.843970634280457, + "language_loss": 0.70282233, + "learning_rate": 3.8826141666962567e-07, + "loss": 0.72407568, + "num_input_tokens_seen": 288715560, + "step": 13378, + "time_per_iteration": 2.656498670578003 + }, + { + "auxiliary_loss_clip": 0.0110247, + "auxiliary_loss_mlp": 0.01027065, + "balance_loss_clip": 1.03830576, + "balance_loss_mlp": 1.01435089, + "epoch": 0.8043889974447618, + "flos": 33403387651200.0, + "grad_norm": 1.3626557970625712, + "language_loss": 0.69352663, + "learning_rate": 3.880308495088347e-07, + "loss": 0.71482199, + "num_input_tokens_seen": 288739485, + "step": 13379, + "time_per_iteration": 2.725536584854126 + }, + { + "auxiliary_loss_clip": 0.01115659, + "auxiliary_loss_mlp": 0.01034653, + "balance_loss_clip": 1.04020107, + "balance_loss_mlp": 1.02027059, + "epoch": 0.8044491206974297, + "flos": 20375966355840.0, + "grad_norm": 1.7895941643177822, + "language_loss": 0.76386261, + "learning_rate": 3.8780034347407533e-07, + "loss": 0.7853657, + "num_input_tokens_seen": 288757420, + "step": 13380, + "time_per_iteration": 2.560413360595703 + }, + { + "auxiliary_loss_clip": 0.01062218, + "auxiliary_loss_mlp": 0.01029167, + "balance_loss_clip": 1.03264856, + "balance_loss_mlp": 1.01679909, + "epoch": 0.8045092439500977, + "flos": 23403810286080.0, + "grad_norm": 2.4010662161686813, + "language_loss": 0.69055688, + "learning_rate": 3.875698985740887e-07, + "loss": 0.71147072, + "num_input_tokens_seen": 288775535, + "step": 13381, + "time_per_iteration": 2.7233426570892334 + }, + { + "auxiliary_loss_clip": 0.01102054, + "auxiliary_loss_mlp": 0.01033657, + "balance_loss_clip": 1.03834701, + "balance_loss_mlp": 1.02112257, + "epoch": 0.8045693672027656, + "flos": 24097245321600.0, + "grad_norm": 1.7871560626135898, + "language_loss": 0.63795519, + "learning_rate": 3.873395148176135e-07, + "loss": 0.65931231, + "num_input_tokens_seen": 288795035, + "step": 13382, + "time_per_iteration": 2.62091326713562 + }, + { + "auxiliary_loss_clip": 0.01086707, + "auxiliary_loss_mlp": 0.01036462, + "balance_loss_clip": 1.03742146, + "balance_loss_mlp": 1.02481508, + "epoch": 0.8046294904554336, + "flos": 27707165147520.0, + "grad_norm": 4.567282213112271, + "language_loss": 0.7625041, + "learning_rate": 3.8710919221338487e-07, + "loss": 0.78373575, + "num_input_tokens_seen": 288816270, + "step": 13383, + "time_per_iteration": 2.7304000854492188 + }, + { + "auxiliary_loss_clip": 0.01093751, + "auxiliary_loss_mlp": 0.01041645, + "balance_loss_clip": 1.03544414, + "balance_loss_mlp": 1.02812028, + "epoch": 0.8046896137081017, + "flos": 24972998814720.0, + "grad_norm": 1.8100283052553972, + "language_loss": 0.69704837, + "learning_rate": 3.868789307701381e-07, + "loss": 0.71840227, + "num_input_tokens_seen": 288836050, + "step": 13384, + "time_per_iteration": 2.623194932937622 + }, + { + "auxiliary_loss_clip": 0.0109844, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.03508019, + "balance_loss_mlp": 1.02301192, + "epoch": 0.8047497369607696, + "flos": 17675484001920.0, + "grad_norm": 8.534865412307397, + "language_loss": 0.79628527, + "learning_rate": 3.8664873049660375e-07, + "loss": 0.81763506, + "num_input_tokens_seen": 288852900, + "step": 13385, + "time_per_iteration": 2.640493869781494 + }, + { + "auxiliary_loss_clip": 0.01109031, + "auxiliary_loss_mlp": 0.01031665, + "balance_loss_clip": 1.03663421, + "balance_loss_mlp": 1.01859391, + "epoch": 0.8048098602134376, + "flos": 22382079920640.0, + "grad_norm": 1.7222276785014166, + "language_loss": 0.72210598, + "learning_rate": 3.864185914015108e-07, + "loss": 0.74351293, + "num_input_tokens_seen": 288872625, + "step": 13386, + "time_per_iteration": 2.620424747467041 + }, + { + "auxiliary_loss_clip": 0.01000165, + "auxiliary_loss_mlp": 0.01002697, + "balance_loss_clip": 1.0073638, + "balance_loss_mlp": 1.00164747, + "epoch": 0.8048699834661055, + "flos": 71200949702400.0, + "grad_norm": 0.6627374322558968, + "language_loss": 0.51254958, + "learning_rate": 3.861885134935865e-07, + "loss": 0.53257823, + "num_input_tokens_seen": 288939180, + "step": 13387, + "time_per_iteration": 3.249873399734497 + }, + { + "auxiliary_loss_clip": 0.01108941, + "auxiliary_loss_mlp": 0.01033944, + "balance_loss_clip": 1.03665853, + "balance_loss_mlp": 1.02005613, + "epoch": 0.8049301067187735, + "flos": 23660320285440.0, + "grad_norm": 1.7754749617398262, + "language_loss": 0.73770982, + "learning_rate": 3.859584967815559e-07, + "loss": 0.75913864, + "num_input_tokens_seen": 288958925, + "step": 13388, + "time_per_iteration": 2.63905930519104 + }, + { + "auxiliary_loss_clip": 0.0108125, + "auxiliary_loss_mlp": 0.01028741, + "balance_loss_clip": 1.04047871, + "balance_loss_mlp": 1.01668882, + "epoch": 0.8049902299714414, + "flos": 24426330750720.0, + "grad_norm": 1.3693007974519653, + "language_loss": 0.71537852, + "learning_rate": 3.857285412741411e-07, + "loss": 0.73647845, + "num_input_tokens_seen": 288980935, + "step": 13389, + "time_per_iteration": 2.8490209579467773 + }, + { + "auxiliary_loss_clip": 0.01085356, + "auxiliary_loss_mlp": 0.01032905, + "balance_loss_clip": 1.03994167, + "balance_loss_mlp": 1.02047765, + "epoch": 0.8050503532241094, + "flos": 17492626840320.0, + "grad_norm": 2.1746579852789565, + "language_loss": 0.82934594, + "learning_rate": 3.8549864698006097e-07, + "loss": 0.8505286, + "num_input_tokens_seen": 288996780, + "step": 13390, + "time_per_iteration": 2.695349931716919 + }, + { + "auxiliary_loss_clip": 0.01021163, + "auxiliary_loss_mlp": 0.01001808, + "balance_loss_clip": 1.00786567, + "balance_loss_mlp": 1.00077081, + "epoch": 0.8051104764767774, + "flos": 57658030369920.0, + "grad_norm": 0.7760583028840095, + "language_loss": 0.55514753, + "learning_rate": 3.8526881390803424e-07, + "loss": 0.57537723, + "num_input_tokens_seen": 289057590, + "step": 13391, + "time_per_iteration": 3.1499392986297607 + }, + { + "auxiliary_loss_clip": 0.01096246, + "auxiliary_loss_mlp": 0.01032181, + "balance_loss_clip": 1.0376209, + "balance_loss_mlp": 1.02046287, + "epoch": 0.8051705997294454, + "flos": 18003456109440.0, + "grad_norm": 1.5156025498114776, + "language_loss": 0.84548998, + "learning_rate": 3.850390420667762e-07, + "loss": 0.86677432, + "num_input_tokens_seen": 289076285, + "step": 13392, + "time_per_iteration": 2.7121686935424805 + }, + { + "auxiliary_loss_clip": 0.01075704, + "auxiliary_loss_mlp": 0.01031392, + "balance_loss_clip": 1.03425109, + "balance_loss_mlp": 1.01953077, + "epoch": 0.8052307229821133, + "flos": 26397754755840.0, + "grad_norm": 1.5752957975366317, + "language_loss": 0.70452738, + "learning_rate": 3.8480933146499914e-07, + "loss": 0.72559834, + "num_input_tokens_seen": 289097585, + "step": 13393, + "time_per_iteration": 2.857966899871826 + }, + { + "auxiliary_loss_clip": 0.01100081, + "auxiliary_loss_mlp": 0.01033968, + "balance_loss_clip": 1.03709984, + "balance_loss_mlp": 1.02045584, + "epoch": 0.8052908462347813, + "flos": 21757018423680.0, + "grad_norm": 2.1123482954588733, + "language_loss": 0.76134676, + "learning_rate": 3.84579682111414e-07, + "loss": 0.78268725, + "num_input_tokens_seen": 289116890, + "step": 13394, + "time_per_iteration": 2.6536917686462402 + }, + { + "auxiliary_loss_clip": 0.0111249, + "auxiliary_loss_mlp": 0.01030653, + "balance_loss_clip": 1.03986442, + "balance_loss_mlp": 1.0186125, + "epoch": 0.8053509694874492, + "flos": 25442279026560.0, + "grad_norm": 1.6448341122906027, + "language_loss": 0.64934421, + "learning_rate": 3.843500940147304e-07, + "loss": 0.67077565, + "num_input_tokens_seen": 289136670, + "step": 13395, + "time_per_iteration": 2.6280672550201416 + }, + { + "auxiliary_loss_clip": 0.01019533, + "auxiliary_loss_mlp": 0.00999955, + "balance_loss_clip": 1.00609279, + "balance_loss_mlp": 0.99902552, + "epoch": 0.8054110927401172, + "flos": 57668122091520.0, + "grad_norm": 0.7500398234084821, + "language_loss": 0.57342923, + "learning_rate": 3.8412056718365206e-07, + "loss": 0.59362411, + "num_input_tokens_seen": 289200150, + "step": 13396, + "time_per_iteration": 3.278367519378662 + }, + { + "auxiliary_loss_clip": 0.01099939, + "auxiliary_loss_mlp": 0.01035451, + "balance_loss_clip": 1.0372299, + "balance_loss_mlp": 1.02181315, + "epoch": 0.8054712159927853, + "flos": 19276201693440.0, + "grad_norm": 1.6362380088306854, + "language_loss": 0.77317524, + "learning_rate": 3.8389110162688353e-07, + "loss": 0.79452914, + "num_input_tokens_seen": 289218125, + "step": 13397, + "time_per_iteration": 5.758723258972168 + }, + { + "auxiliary_loss_clip": 0.01095341, + "auxiliary_loss_mlp": 0.01029553, + "balance_loss_clip": 1.04027462, + "balance_loss_mlp": 1.01784718, + "epoch": 0.8055313392454532, + "flos": 17967617314560.0, + "grad_norm": 1.6834134519930992, + "language_loss": 0.70419687, + "learning_rate": 3.836616973531266e-07, + "loss": 0.72544581, + "num_input_tokens_seen": 289237115, + "step": 13398, + "time_per_iteration": 2.6618268489837646 + }, + { + "auxiliary_loss_clip": 0.01086822, + "auxiliary_loss_mlp": 0.01031551, + "balance_loss_clip": 1.03505898, + "balance_loss_mlp": 1.01981521, + "epoch": 0.8055914624981212, + "flos": 13478352635520.0, + "grad_norm": 4.3745696767144056, + "language_loss": 0.69005787, + "learning_rate": 3.834323543710805e-07, + "loss": 0.71124166, + "num_input_tokens_seen": 289253635, + "step": 13399, + "time_per_iteration": 2.682286024093628 + }, + { + "auxiliary_loss_clip": 0.01109953, + "auxiliary_loss_mlp": 0.0103448, + "balance_loss_clip": 1.03786373, + "balance_loss_mlp": 1.02234411, + "epoch": 0.8056515857507891, + "flos": 13224787551360.0, + "grad_norm": 2.3581489443950043, + "language_loss": 0.71867836, + "learning_rate": 3.8320307268944153e-07, + "loss": 0.74012268, + "num_input_tokens_seen": 289270085, + "step": 13400, + "time_per_iteration": 4.049706935882568 + }, + { + "auxiliary_loss_clip": 0.0109504, + "auxiliary_loss_mlp": 0.01032952, + "balance_loss_clip": 1.03316319, + "balance_loss_mlp": 1.0205605, + "epoch": 0.8057117090034571, + "flos": 23878190229120.0, + "grad_norm": 1.8105260022834564, + "language_loss": 0.64344472, + "learning_rate": 3.829738523169037e-07, + "loss": 0.66472465, + "num_input_tokens_seen": 289289645, + "step": 13401, + "time_per_iteration": 2.6664888858795166 + }, + { + "auxiliary_loss_clip": 0.01097912, + "auxiliary_loss_mlp": 0.01033141, + "balance_loss_clip": 1.03556728, + "balance_loss_mlp": 1.0208919, + "epoch": 0.805771832256125, + "flos": 21214300855680.0, + "grad_norm": 2.280323005246413, + "language_loss": 0.83644533, + "learning_rate": 3.8274469326215985e-07, + "loss": 0.8577559, + "num_input_tokens_seen": 289306630, + "step": 13402, + "time_per_iteration": 2.6944761276245117 + }, + { + "auxiliary_loss_clip": 0.01058036, + "auxiliary_loss_mlp": 0.01032366, + "balance_loss_clip": 1.03603578, + "balance_loss_mlp": 1.01981318, + "epoch": 0.805831955508793, + "flos": 17566818382080.0, + "grad_norm": 6.24262023613013, + "language_loss": 0.68056262, + "learning_rate": 3.8251559553389876e-07, + "loss": 0.70146668, + "num_input_tokens_seen": 289324960, + "step": 13403, + "time_per_iteration": 2.763301372528076 + }, + { + "auxiliary_loss_clip": 0.01069641, + "auxiliary_loss_mlp": 0.00769597, + "balance_loss_clip": 1.0345124, + "balance_loss_mlp": 1.00027502, + "epoch": 0.805892078761461, + "flos": 26907542530560.0, + "grad_norm": 3.472008939762663, + "language_loss": 0.84777313, + "learning_rate": 3.822865591408084e-07, + "loss": 0.86616552, + "num_input_tokens_seen": 289344980, + "step": 13404, + "time_per_iteration": 2.7284321784973145 + }, + { + "auxiliary_loss_clip": 0.01066717, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.03557312, + "balance_loss_mlp": 1.02060783, + "epoch": 0.805952202014129, + "flos": 31506442496640.0, + "grad_norm": 1.5427853582818836, + "language_loss": 0.70597529, + "learning_rate": 3.820575840915743e-07, + "loss": 0.72696286, + "num_input_tokens_seen": 289367500, + "step": 13405, + "time_per_iteration": 4.542062520980835 + }, + { + "auxiliary_loss_clip": 0.01098712, + "auxiliary_loss_mlp": 0.01025986, + "balance_loss_clip": 1.03720641, + "balance_loss_mlp": 1.01441038, + "epoch": 0.8060123252667969, + "flos": 24389953251840.0, + "grad_norm": 2.8341192767465957, + "language_loss": 0.7541554, + "learning_rate": 3.818286703948788e-07, + "loss": 0.77540243, + "num_input_tokens_seen": 289385930, + "step": 13406, + "time_per_iteration": 2.68805193901062 + }, + { + "auxiliary_loss_clip": 0.01100072, + "auxiliary_loss_mlp": 0.01035029, + "balance_loss_clip": 1.03811562, + "balance_loss_mlp": 1.02201676, + "epoch": 0.8060724485194649, + "flos": 23479941162240.0, + "grad_norm": 1.5246012967976152, + "language_loss": 0.76201332, + "learning_rate": 3.815998180594018e-07, + "loss": 0.7833643, + "num_input_tokens_seen": 289408025, + "step": 13407, + "time_per_iteration": 2.666938066482544 + }, + { + "auxiliary_loss_clip": 0.01080345, + "auxiliary_loss_mlp": 0.00770991, + "balance_loss_clip": 1.03358412, + "balance_loss_mlp": 1.00019884, + "epoch": 0.8061325717721328, + "flos": 18624495283200.0, + "grad_norm": 1.5796081620527238, + "language_loss": 0.73616993, + "learning_rate": 3.81371027093822e-07, + "loss": 0.75468338, + "num_input_tokens_seen": 289426575, + "step": 13408, + "time_per_iteration": 2.662716865539551 + }, + { + "auxiliary_loss_clip": 0.01079538, + "auxiliary_loss_mlp": 0.01039164, + "balance_loss_clip": 1.03391623, + "balance_loss_mlp": 1.02488232, + "epoch": 0.8061926950248008, + "flos": 23582752865280.0, + "grad_norm": 1.8848269452946171, + "language_loss": 0.7084735, + "learning_rate": 3.8114229750681523e-07, + "loss": 0.72966051, + "num_input_tokens_seen": 289447760, + "step": 13409, + "time_per_iteration": 2.6887590885162354 + }, + { + "auxiliary_loss_clip": 0.01108, + "auxiliary_loss_mlp": 0.01029607, + "balance_loss_clip": 1.03585696, + "balance_loss_mlp": 1.0173347, + "epoch": 0.8062528182774689, + "flos": 11143333209600.0, + "grad_norm": 2.076248478414053, + "language_loss": 0.76634085, + "learning_rate": 3.809136293070545e-07, + "loss": 0.78771693, + "num_input_tokens_seen": 289463920, + "step": 13410, + "time_per_iteration": 2.5652787685394287 + }, + { + "auxiliary_loss_clip": 0.01099064, + "auxiliary_loss_mlp": 0.01037232, + "balance_loss_clip": 1.03812885, + "balance_loss_mlp": 1.02414274, + "epoch": 0.8063129415301368, + "flos": 22346815743360.0, + "grad_norm": 2.5070949588041653, + "language_loss": 0.68454826, + "learning_rate": 3.806850225032117e-07, + "loss": 0.70591122, + "num_input_tokens_seen": 289482635, + "step": 13411, + "time_per_iteration": 2.627668857574463 + }, + { + "auxiliary_loss_clip": 0.01076065, + "auxiliary_loss_mlp": 0.01032676, + "balance_loss_clip": 1.03557301, + "balance_loss_mlp": 1.02042735, + "epoch": 0.8063730647828048, + "flos": 23988400133760.0, + "grad_norm": 1.6635819299590309, + "language_loss": 0.68043619, + "learning_rate": 3.804564771039551e-07, + "loss": 0.70152354, + "num_input_tokens_seen": 289502040, + "step": 13412, + "time_per_iteration": 2.8055179119110107 + }, + { + "auxiliary_loss_clip": 0.01099792, + "auxiliary_loss_mlp": 0.01036335, + "balance_loss_clip": 1.03846812, + "balance_loss_mlp": 1.02239335, + "epoch": 0.8064331880354727, + "flos": 21321494017920.0, + "grad_norm": 1.701960301408402, + "language_loss": 0.81657159, + "learning_rate": 3.8022799311795064e-07, + "loss": 0.83793283, + "num_input_tokens_seen": 289520740, + "step": 13413, + "time_per_iteration": 2.700803279876709 + }, + { + "auxiliary_loss_clip": 0.01092458, + "auxiliary_loss_mlp": 0.01042312, + "balance_loss_clip": 1.03472614, + "balance_loss_mlp": 1.02902031, + "epoch": 0.8064933112881407, + "flos": 19682890456320.0, + "grad_norm": 1.8513481069997626, + "language_loss": 0.85172534, + "learning_rate": 3.7999957055386303e-07, + "loss": 0.8730731, + "num_input_tokens_seen": 289535840, + "step": 13414, + "time_per_iteration": 2.563521385192871 + }, + { + "auxiliary_loss_clip": 0.01083885, + "auxiliary_loss_mlp": 0.01033091, + "balance_loss_clip": 1.03454745, + "balance_loss_mlp": 1.02088439, + "epoch": 0.8065534345408086, + "flos": 19279721226240.0, + "grad_norm": 1.9940068342487725, + "language_loss": 0.67127073, + "learning_rate": 3.7977120942035467e-07, + "loss": 0.69244045, + "num_input_tokens_seen": 289555205, + "step": 13415, + "time_per_iteration": 2.816197633743286 + }, + { + "auxiliary_loss_clip": 0.01072851, + "auxiliary_loss_mlp": 0.01025923, + "balance_loss_clip": 1.0345068, + "balance_loss_mlp": 1.01406205, + "epoch": 0.8066135577934767, + "flos": 19677718897920.0, + "grad_norm": 1.6185283067641691, + "language_loss": 0.76407629, + "learning_rate": 3.7954290972608383e-07, + "loss": 0.78506404, + "num_input_tokens_seen": 289573000, + "step": 13416, + "time_per_iteration": 2.7045845985412598 + }, + { + "auxiliary_loss_clip": 0.01095005, + "auxiliary_loss_mlp": 0.01034611, + "balance_loss_clip": 1.03473926, + "balance_loss_mlp": 1.02240372, + "epoch": 0.8066736810461446, + "flos": 21143592933120.0, + "grad_norm": 1.4143763344150053, + "language_loss": 0.65079415, + "learning_rate": 3.793146714797086e-07, + "loss": 0.67209029, + "num_input_tokens_seen": 289592625, + "step": 13417, + "time_per_iteration": 2.6034398078918457 + }, + { + "auxiliary_loss_clip": 0.01075095, + "auxiliary_loss_mlp": 0.01055795, + "balance_loss_clip": 1.0338254, + "balance_loss_mlp": 1.0419488, + "epoch": 0.8067338042988126, + "flos": 22598261925120.0, + "grad_norm": 1.7315768879693472, + "language_loss": 0.8098107, + "learning_rate": 3.7908649468988306e-07, + "loss": 0.8311196, + "num_input_tokens_seen": 289610780, + "step": 13418, + "time_per_iteration": 2.721973180770874 + }, + { + "auxiliary_loss_clip": 0.01090483, + "auxiliary_loss_mlp": 0.01032366, + "balance_loss_clip": 1.03820384, + "balance_loss_mlp": 1.01939058, + "epoch": 0.8067939275514805, + "flos": 16508423208960.0, + "grad_norm": 1.9096821915848545, + "language_loss": 0.84676445, + "learning_rate": 3.7885837936526066e-07, + "loss": 0.86799294, + "num_input_tokens_seen": 289628890, + "step": 13419, + "time_per_iteration": 2.6393797397613525 + }, + { + "auxiliary_loss_clip": 0.01071529, + "auxiliary_loss_mlp": 0.00770579, + "balance_loss_clip": 1.03478575, + "balance_loss_mlp": 1.00021386, + "epoch": 0.8068540508041485, + "flos": 28541836460160.0, + "grad_norm": 1.770068441657345, + "language_loss": 0.76010084, + "learning_rate": 3.7863032551449047e-07, + "loss": 0.7785219, + "num_input_tokens_seen": 289647220, + "step": 13420, + "time_per_iteration": 2.8084943294525146 + }, + { + "auxiliary_loss_clip": 0.01090718, + "auxiliary_loss_mlp": 0.00769899, + "balance_loss_clip": 1.03447235, + "balance_loss_mlp": 1.00020134, + "epoch": 0.8069141740568164, + "flos": 21652482867840.0, + "grad_norm": 1.8346765895775454, + "language_loss": 0.78423268, + "learning_rate": 3.784023331462207e-07, + "loss": 0.8028388, + "num_input_tokens_seen": 289665800, + "step": 13421, + "time_per_iteration": 2.6397383213043213 + }, + { + "auxiliary_loss_clip": 0.01078405, + "auxiliary_loss_mlp": 0.01025501, + "balance_loss_clip": 1.0375917, + "balance_loss_mlp": 1.01340711, + "epoch": 0.8069742973094844, + "flos": 17529327561600.0, + "grad_norm": 1.6792370158266972, + "language_loss": 0.80156964, + "learning_rate": 3.78174402269098e-07, + "loss": 0.82260871, + "num_input_tokens_seen": 289682705, + "step": 13422, + "time_per_iteration": 2.7309072017669678 + }, + { + "auxiliary_loss_clip": 0.01108091, + "auxiliary_loss_mlp": 0.01032058, + "balance_loss_clip": 1.03667307, + "balance_loss_mlp": 1.02025604, + "epoch": 0.8070344205621525, + "flos": 23367037737600.0, + "grad_norm": 1.6418430759860865, + "language_loss": 0.67767537, + "learning_rate": 3.7794653289176347e-07, + "loss": 0.69907683, + "num_input_tokens_seen": 289702920, + "step": 13423, + "time_per_iteration": 2.6276538372039795 + }, + { + "auxiliary_loss_clip": 0.01087102, + "auxiliary_loss_mlp": 0.01036916, + "balance_loss_clip": 1.03931355, + "balance_loss_mlp": 1.02424431, + "epoch": 0.8070945438148204, + "flos": 22930184528640.0, + "grad_norm": 1.8059898203268332, + "language_loss": 0.80249333, + "learning_rate": 3.7771872502285904e-07, + "loss": 0.82373351, + "num_input_tokens_seen": 289723280, + "step": 13424, + "time_per_iteration": 2.7442784309387207 + }, + { + "auxiliary_loss_clip": 0.01098964, + "auxiliary_loss_mlp": 0.01028478, + "balance_loss_clip": 1.03548968, + "balance_loss_mlp": 1.01652098, + "epoch": 0.8071546670674884, + "flos": 25300683613440.0, + "grad_norm": 1.410290973233428, + "language_loss": 0.78814334, + "learning_rate": 3.774909786710232e-07, + "loss": 0.80941772, + "num_input_tokens_seen": 289743475, + "step": 13425, + "time_per_iteration": 2.666613817214966 + }, + { + "auxiliary_loss_clip": 0.0107896, + "auxiliary_loss_mlp": 0.01032247, + "balance_loss_clip": 1.03484488, + "balance_loss_mlp": 1.0198555, + "epoch": 0.8072147903201563, + "flos": 18113701927680.0, + "grad_norm": 2.5661654398107814, + "language_loss": 0.75609297, + "learning_rate": 3.772632938448923e-07, + "loss": 0.77720505, + "num_input_tokens_seen": 289761400, + "step": 13426, + "time_per_iteration": 2.6524770259857178 + }, + { + "auxiliary_loss_clip": 0.01098302, + "auxiliary_loss_mlp": 0.0102617, + "balance_loss_clip": 1.03656507, + "balance_loss_mlp": 1.01461828, + "epoch": 0.8072749135728243, + "flos": 26688164215680.0, + "grad_norm": 1.8886628255914524, + "language_loss": 0.72703242, + "learning_rate": 3.770356705530997e-07, + "loss": 0.74827707, + "num_input_tokens_seen": 289781025, + "step": 13427, + "time_per_iteration": 2.6662089824676514 + }, + { + "auxiliary_loss_clip": 0.01060667, + "auxiliary_loss_mlp": 0.01038927, + "balance_loss_clip": 1.03814864, + "balance_loss_mlp": 1.02555811, + "epoch": 0.8073350368254922, + "flos": 19240291071360.0, + "grad_norm": 1.7110395570969614, + "language_loss": 0.70348513, + "learning_rate": 3.768081088042774e-07, + "loss": 0.72448105, + "num_input_tokens_seen": 289798380, + "step": 13428, + "time_per_iteration": 2.7715890407562256 + }, + { + "auxiliary_loss_clip": 0.01089538, + "auxiliary_loss_mlp": 0.0102989, + "balance_loss_clip": 1.03667974, + "balance_loss_mlp": 1.0185827, + "epoch": 0.8073951600781603, + "flos": 13334530579200.0, + "grad_norm": 2.360374881733329, + "language_loss": 0.74510443, + "learning_rate": 3.765806086070544e-07, + "loss": 0.76629871, + "num_input_tokens_seen": 289814515, + "step": 13429, + "time_per_iteration": 2.6052982807159424 + }, + { + "auxiliary_loss_clip": 0.01096224, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.03724742, + "balance_loss_mlp": 1.02020407, + "epoch": 0.8074552833308282, + "flos": 22853191726080.0, + "grad_norm": 2.1805515525099466, + "language_loss": 0.66939056, + "learning_rate": 3.763531699700568e-07, + "loss": 0.6906752, + "num_input_tokens_seen": 289834315, + "step": 13430, + "time_per_iteration": 2.6713409423828125 + }, + { + "auxiliary_loss_clip": 0.01068167, + "auxiliary_loss_mlp": 0.01029352, + "balance_loss_clip": 1.03282046, + "balance_loss_mlp": 1.0171392, + "epoch": 0.8075154065834962, + "flos": 20339409288960.0, + "grad_norm": 1.7027899268363083, + "language_loss": 0.80057859, + "learning_rate": 3.7612579290190994e-07, + "loss": 0.82155377, + "num_input_tokens_seen": 289853770, + "step": 13431, + "time_per_iteration": 2.648855447769165 + }, + { + "auxiliary_loss_clip": 0.01084241, + "auxiliary_loss_mlp": 0.0102858, + "balance_loss_clip": 1.03625894, + "balance_loss_mlp": 1.01611698, + "epoch": 0.8075755298361641, + "flos": 21908059113600.0, + "grad_norm": 1.7686498749229664, + "language_loss": 0.80383635, + "learning_rate": 3.7589847741123593e-07, + "loss": 0.82496452, + "num_input_tokens_seen": 289870480, + "step": 13432, + "time_per_iteration": 2.644226551055908 + }, + { + "auxiliary_loss_clip": 0.01083614, + "auxiliary_loss_mlp": 0.01032011, + "balance_loss_clip": 1.03852034, + "balance_loss_mlp": 1.01924944, + "epoch": 0.8076356530888321, + "flos": 15669298609920.0, + "grad_norm": 7.437398375727633, + "language_loss": 0.70418423, + "learning_rate": 3.7567122350665415e-07, + "loss": 0.72534049, + "num_input_tokens_seen": 289888275, + "step": 13433, + "time_per_iteration": 2.657998561859131 + }, + { + "auxiliary_loss_clip": 0.01083097, + "auxiliary_loss_mlp": 0.01028189, + "balance_loss_clip": 1.03641129, + "balance_loss_mlp": 1.01629746, + "epoch": 0.8076957763415, + "flos": 37777414521600.0, + "grad_norm": 1.6613480416430744, + "language_loss": 0.7224468, + "learning_rate": 3.754440311967828e-07, + "loss": 0.7435596, + "num_input_tokens_seen": 289911495, + "step": 13434, + "time_per_iteration": 2.787569046020508 + }, + { + "auxiliary_loss_clip": 0.01071783, + "auxiliary_loss_mlp": 0.01027721, + "balance_loss_clip": 1.03727186, + "balance_loss_mlp": 1.01534724, + "epoch": 0.807755899594168, + "flos": 19610781903360.0, + "grad_norm": 1.8325775965674183, + "language_loss": 0.67859507, + "learning_rate": 3.752169004902361e-07, + "loss": 0.69959009, + "num_input_tokens_seen": 289930045, + "step": 13435, + "time_per_iteration": 2.719987154006958 + }, + { + "auxiliary_loss_clip": 0.01065411, + "auxiliary_loss_mlp": 0.01033721, + "balance_loss_clip": 1.03534269, + "balance_loss_mlp": 1.01921952, + "epoch": 0.8078160228468361, + "flos": 23294893271040.0, + "grad_norm": 1.9641244359702266, + "language_loss": 0.75152278, + "learning_rate": 3.749898313956279e-07, + "loss": 0.7725141, + "num_input_tokens_seen": 289950815, + "step": 13436, + "time_per_iteration": 4.378523826599121 + }, + { + "auxiliary_loss_clip": 0.01104889, + "auxiliary_loss_mlp": 0.01033319, + "balance_loss_clip": 1.03509259, + "balance_loss_mlp": 1.02078414, + "epoch": 0.807876146099504, + "flos": 27162651899520.0, + "grad_norm": 2.08988998980751, + "language_loss": 0.70339876, + "learning_rate": 3.747628239215674e-07, + "loss": 0.7247808, + "num_input_tokens_seen": 289971730, + "step": 13437, + "time_per_iteration": 2.7287251949310303 + }, + { + "auxiliary_loss_clip": 0.01081874, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.03837299, + "balance_loss_mlp": 1.02234817, + "epoch": 0.807936269352172, + "flos": 27160030206720.0, + "grad_norm": 1.7193467545995484, + "language_loss": 0.73327583, + "learning_rate": 3.745358780766636e-07, + "loss": 0.75443482, + "num_input_tokens_seen": 289992995, + "step": 13438, + "time_per_iteration": 2.73563289642334 + }, + { + "auxiliary_loss_clip": 0.01084164, + "auxiliary_loss_mlp": 0.01031364, + "balance_loss_clip": 1.0364821, + "balance_loss_mlp": 1.01958609, + "epoch": 0.8079963926048399, + "flos": 20740423703040.0, + "grad_norm": 1.9218850358156638, + "language_loss": 0.77182925, + "learning_rate": 3.7430899386952344e-07, + "loss": 0.79298449, + "num_input_tokens_seen": 290009405, + "step": 13439, + "time_per_iteration": 4.257704257965088 + }, + { + "auxiliary_loss_clip": 0.01108447, + "auxiliary_loss_mlp": 0.01030482, + "balance_loss_clip": 1.0371114, + "balance_loss_mlp": 1.01817346, + "epoch": 0.8080565158575079, + "flos": 25009663622400.0, + "grad_norm": 1.5573047662808601, + "language_loss": 0.78926432, + "learning_rate": 3.7408217130874786e-07, + "loss": 0.81065357, + "num_input_tokens_seen": 290031085, + "step": 13440, + "time_per_iteration": 2.61833119392395 + }, + { + "auxiliary_loss_clip": 0.01088716, + "auxiliary_loss_mlp": 0.00770828, + "balance_loss_clip": 1.03697038, + "balance_loss_mlp": 1.00019264, + "epoch": 0.8081166391101758, + "flos": 18698076293760.0, + "grad_norm": 1.6195395418805565, + "language_loss": 0.59136355, + "learning_rate": 3.7385541040293946e-07, + "loss": 0.60995901, + "num_input_tokens_seen": 290048670, + "step": 13441, + "time_per_iteration": 2.6545674800872803 + }, + { + "auxiliary_loss_clip": 0.01097558, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.03679454, + "balance_loss_mlp": 1.02092791, + "epoch": 0.8081767623628439, + "flos": 19828651847040.0, + "grad_norm": 2.045109695288156, + "language_loss": 0.76209891, + "learning_rate": 3.7362871116069684e-07, + "loss": 0.78341144, + "num_input_tokens_seen": 290064085, + "step": 13442, + "time_per_iteration": 2.579463005065918 + }, + { + "auxiliary_loss_clip": 0.0108635, + "auxiliary_loss_mlp": 0.01031047, + "balance_loss_clip": 1.03652704, + "balance_loss_mlp": 1.01932859, + "epoch": 0.8082368856155118, + "flos": 35772952982400.0, + "grad_norm": 1.9468682551853589, + "language_loss": 0.70523083, + "learning_rate": 3.734020735906169e-07, + "loss": 0.72640479, + "num_input_tokens_seen": 290086255, + "step": 13443, + "time_per_iteration": 2.768657922744751 + }, + { + "auxiliary_loss_clip": 0.010672, + "auxiliary_loss_mlp": 0.01040475, + "balance_loss_clip": 1.03662682, + "balance_loss_mlp": 1.02816081, + "epoch": 0.8082970088681798, + "flos": 17198015489280.0, + "grad_norm": 2.2011960209089128, + "language_loss": 0.82247496, + "learning_rate": 3.7317549770129286e-07, + "loss": 0.8435517, + "num_input_tokens_seen": 290103995, + "step": 13444, + "time_per_iteration": 4.2101311683654785 + }, + { + "auxiliary_loss_clip": 0.00996531, + "auxiliary_loss_mlp": 0.00751439, + "balance_loss_clip": 1.01225722, + "balance_loss_mlp": 0.99960417, + "epoch": 0.8083571321208477, + "flos": 63555207511680.0, + "grad_norm": 0.8321689368239322, + "language_loss": 0.53573275, + "learning_rate": 3.7294898350131754e-07, + "loss": 0.5532124, + "num_input_tokens_seen": 290157245, + "step": 13445, + "time_per_iteration": 3.0625863075256348 + }, + { + "auxiliary_loss_clip": 0.01071369, + "auxiliary_loss_mlp": 0.01031423, + "balance_loss_clip": 1.03452253, + "balance_loss_mlp": 1.01799452, + "epoch": 0.8084172553735157, + "flos": 17930701111680.0, + "grad_norm": 2.525792385078195, + "language_loss": 0.72017092, + "learning_rate": 3.7272253099927964e-07, + "loss": 0.7411989, + "num_input_tokens_seen": 290174970, + "step": 13446, + "time_per_iteration": 2.7008473873138428 + }, + { + "auxiliary_loss_clip": 0.01084211, + "auxiliary_loss_mlp": 0.01031802, + "balance_loss_clip": 1.03550613, + "balance_loss_mlp": 1.01871324, + "epoch": 0.8084773786261836, + "flos": 24097999507200.0, + "grad_norm": 1.7496181509927413, + "language_loss": 0.71567613, + "learning_rate": 3.7249614020376606e-07, + "loss": 0.73683619, + "num_input_tokens_seen": 290194395, + "step": 13447, + "time_per_iteration": 2.6628973484039307 + }, + { + "auxiliary_loss_clip": 0.01047169, + "auxiliary_loss_mlp": 0.01036303, + "balance_loss_clip": 1.03517139, + "balance_loss_mlp": 1.02175951, + "epoch": 0.8085375018788516, + "flos": 15588211656960.0, + "grad_norm": 3.8730614264516787, + "language_loss": 0.74754572, + "learning_rate": 3.7226981112336197e-07, + "loss": 0.7683804, + "num_input_tokens_seen": 290209200, + "step": 13448, + "time_per_iteration": 2.8440589904785156 + }, + { + "auxiliary_loss_clip": 0.01028882, + "auxiliary_loss_mlp": 0.01000792, + "balance_loss_clip": 1.00652528, + "balance_loss_mlp": 0.99984467, + "epoch": 0.8085976251315197, + "flos": 67561296393600.0, + "grad_norm": 0.7365379441466359, + "language_loss": 0.63871992, + "learning_rate": 3.7204354376665024e-07, + "loss": 0.65901667, + "num_input_tokens_seen": 290274565, + "step": 13449, + "time_per_iteration": 3.194010019302368 + }, + { + "auxiliary_loss_clip": 0.0110053, + "auxiliary_loss_mlp": 0.01027665, + "balance_loss_clip": 1.03765333, + "balance_loss_mlp": 1.01495767, + "epoch": 0.8086577483841876, + "flos": 22561453463040.0, + "grad_norm": 1.8200797231923929, + "language_loss": 0.73743933, + "learning_rate": 3.718173381422105e-07, + "loss": 0.75872129, + "num_input_tokens_seen": 290293630, + "step": 13450, + "time_per_iteration": 2.6638128757476807 + }, + { + "auxiliary_loss_clip": 0.0108587, + "auxiliary_loss_mlp": 0.00770156, + "balance_loss_clip": 1.03534913, + "balance_loss_mlp": 1.00021505, + "epoch": 0.8087178716368556, + "flos": 17968084191360.0, + "grad_norm": 1.7745684945736697, + "language_loss": 0.74215508, + "learning_rate": 3.7159119425861986e-07, + "loss": 0.76071537, + "num_input_tokens_seen": 290311450, + "step": 13451, + "time_per_iteration": 2.6632473468780518 + }, + { + "auxiliary_loss_clip": 0.0108524, + "auxiliary_loss_mlp": 0.0103415, + "balance_loss_clip": 1.03462768, + "balance_loss_mlp": 1.02030349, + "epoch": 0.8087779948895235, + "flos": 21719527603200.0, + "grad_norm": 1.7259402772912478, + "language_loss": 0.80131054, + "learning_rate": 3.713651121244543e-07, + "loss": 0.82250446, + "num_input_tokens_seen": 290330165, + "step": 13452, + "time_per_iteration": 2.7077267169952393 + }, + { + "auxiliary_loss_clip": 0.01100267, + "auxiliary_loss_mlp": 0.01037977, + "balance_loss_clip": 1.03795743, + "balance_loss_mlp": 1.02595496, + "epoch": 0.8088381181421915, + "flos": 29092885983360.0, + "grad_norm": 1.7921268226395743, + "language_loss": 0.78446937, + "learning_rate": 3.711390917482875e-07, + "loss": 0.80585182, + "num_input_tokens_seen": 290350815, + "step": 13453, + "time_per_iteration": 2.655306339263916 + }, + { + "auxiliary_loss_clip": 0.01056304, + "auxiliary_loss_mlp": 0.01031896, + "balance_loss_clip": 1.0308342, + "balance_loss_mlp": 1.01884866, + "epoch": 0.8088982413948594, + "flos": 22198432659840.0, + "grad_norm": 2.2494726223817882, + "language_loss": 0.77063608, + "learning_rate": 3.709131331386892e-07, + "loss": 0.79151809, + "num_input_tokens_seen": 290367380, + "step": 13454, + "time_per_iteration": 2.711794376373291 + }, + { + "auxiliary_loss_clip": 0.01074594, + "auxiliary_loss_mlp": 0.01029586, + "balance_loss_clip": 1.0350008, + "balance_loss_mlp": 1.0173552, + "epoch": 0.8089583646475275, + "flos": 28036717453440.0, + "grad_norm": 1.8232411108878894, + "language_loss": 0.76701343, + "learning_rate": 3.7068723630422795e-07, + "loss": 0.78805518, + "num_input_tokens_seen": 290387965, + "step": 13455, + "time_per_iteration": 2.7459137439727783 + }, + { + "auxiliary_loss_clip": 0.01082819, + "auxiliary_loss_mlp": 0.01035059, + "balance_loss_clip": 1.03439069, + "balance_loss_mlp": 1.02137375, + "epoch": 0.8090184879001954, + "flos": 16617735273600.0, + "grad_norm": 1.8300657936441902, + "language_loss": 0.79052675, + "learning_rate": 3.70461401253471e-07, + "loss": 0.81170559, + "num_input_tokens_seen": 290404150, + "step": 13456, + "time_per_iteration": 2.629514455795288 + }, + { + "auxiliary_loss_clip": 0.01108824, + "auxiliary_loss_mlp": 0.0103641, + "balance_loss_clip": 1.03869963, + "balance_loss_mlp": 1.02431631, + "epoch": 0.8090786111528634, + "flos": 27340804379520.0, + "grad_norm": 1.9435022674849403, + "language_loss": 0.71875274, + "learning_rate": 3.702356279949801e-07, + "loss": 0.74020511, + "num_input_tokens_seen": 290422370, + "step": 13457, + "time_per_iteration": 2.6066160202026367 + }, + { + "auxiliary_loss_clip": 0.01088316, + "auxiliary_loss_mlp": 0.01027847, + "balance_loss_clip": 1.03695107, + "balance_loss_mlp": 1.01670051, + "epoch": 0.8091387344055313, + "flos": 21105742976640.0, + "grad_norm": 2.4771443577175196, + "language_loss": 0.72800726, + "learning_rate": 3.700099165373176e-07, + "loss": 0.74916887, + "num_input_tokens_seen": 290442645, + "step": 13458, + "time_per_iteration": 2.670316696166992 + }, + { + "auxiliary_loss_clip": 0.01097692, + "auxiliary_loss_mlp": 0.01035982, + "balance_loss_clip": 1.03728068, + "balance_loss_mlp": 1.02393568, + "epoch": 0.8091988576581993, + "flos": 11655060318720.0, + "grad_norm": 3.9528236134114736, + "language_loss": 0.78632605, + "learning_rate": 3.6978426688904275e-07, + "loss": 0.80766273, + "num_input_tokens_seen": 290458520, + "step": 13459, + "time_per_iteration": 2.6871142387390137 + }, + { + "auxiliary_loss_clip": 0.0108428, + "auxiliary_loss_mlp": 0.01028178, + "balance_loss_clip": 1.03804088, + "balance_loss_mlp": 1.01529706, + "epoch": 0.8092589809108672, + "flos": 22963329803520.0, + "grad_norm": 2.0963523926810073, + "language_loss": 0.79731387, + "learning_rate": 3.695586790587113e-07, + "loss": 0.81843841, + "num_input_tokens_seen": 290474465, + "step": 13460, + "time_per_iteration": 2.7210707664489746 + }, + { + "auxiliary_loss_clip": 0.01085117, + "auxiliary_loss_mlp": 0.01033465, + "balance_loss_clip": 1.033885, + "balance_loss_mlp": 1.02023244, + "epoch": 0.8093191041635353, + "flos": 13260985482240.0, + "grad_norm": 1.8549028601882585, + "language_loss": 0.84519565, + "learning_rate": 3.693331530548789e-07, + "loss": 0.86638141, + "num_input_tokens_seen": 290492060, + "step": 13461, + "time_per_iteration": 2.7760846614837646 + }, + { + "auxiliary_loss_clip": 0.01100089, + "auxiliary_loss_mlp": 0.01039455, + "balance_loss_clip": 1.03782284, + "balance_loss_mlp": 1.02653313, + "epoch": 0.8093792274162032, + "flos": 25516003691520.0, + "grad_norm": 1.8598987285745991, + "language_loss": 0.76461577, + "learning_rate": 3.69107688886096e-07, + "loss": 0.78601122, + "num_input_tokens_seen": 290511510, + "step": 13462, + "time_per_iteration": 2.845400094985962 + }, + { + "auxiliary_loss_clip": 0.01088384, + "auxiliary_loss_mlp": 0.01034949, + "balance_loss_clip": 1.03809071, + "balance_loss_mlp": 1.02182388, + "epoch": 0.8094393506688712, + "flos": 23546483107200.0, + "grad_norm": 4.497352318555959, + "language_loss": 0.83011431, + "learning_rate": 3.6888228656091357e-07, + "loss": 0.85134763, + "num_input_tokens_seen": 290530035, + "step": 13463, + "time_per_iteration": 2.801821708679199 + }, + { + "auxiliary_loss_clip": 0.01107291, + "auxiliary_loss_mlp": 0.01031928, + "balance_loss_clip": 1.03720284, + "balance_loss_mlp": 1.02069807, + "epoch": 0.8094994739215392, + "flos": 17055917285760.0, + "grad_norm": 2.01920176880003, + "language_loss": 0.62397665, + "learning_rate": 3.686569460878779e-07, + "loss": 0.64536881, + "num_input_tokens_seen": 290548245, + "step": 13464, + "time_per_iteration": 2.7305564880371094 + }, + { + "auxiliary_loss_clip": 0.01106405, + "auxiliary_loss_mlp": 0.01028723, + "balance_loss_clip": 1.03645182, + "balance_loss_mlp": 1.01739168, + "epoch": 0.8095595971742071, + "flos": 23551223702400.0, + "grad_norm": 1.547589805180524, + "language_loss": 0.61729312, + "learning_rate": 3.684316674755341e-07, + "loss": 0.6386444, + "num_input_tokens_seen": 290568625, + "step": 13465, + "time_per_iteration": 2.61460018157959 + }, + { + "auxiliary_loss_clip": 0.01098999, + "auxiliary_loss_mlp": 0.01035736, + "balance_loss_clip": 1.03847957, + "balance_loss_mlp": 1.02339816, + "epoch": 0.8096197204268751, + "flos": 20373201008640.0, + "grad_norm": 2.0112821173289035, + "language_loss": 0.82087392, + "learning_rate": 3.682064507324256e-07, + "loss": 0.84222126, + "num_input_tokens_seen": 290586575, + "step": 13466, + "time_per_iteration": 2.65686297416687 + }, + { + "auxiliary_loss_clip": 0.01094893, + "auxiliary_loss_mlp": 0.0077026, + "balance_loss_clip": 1.03960299, + "balance_loss_mlp": 1.00025487, + "epoch": 0.809679843679543, + "flos": 27818775682560.0, + "grad_norm": 1.8295052098367424, + "language_loss": 0.75791496, + "learning_rate": 3.6798129586709204e-07, + "loss": 0.77656651, + "num_input_tokens_seen": 290606790, + "step": 13467, + "time_per_iteration": 2.7522189617156982 + }, + { + "auxiliary_loss_clip": 0.01073167, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.03119135, + "balance_loss_mlp": 1.0187211, + "epoch": 0.8097399669322111, + "flos": 22014103040640.0, + "grad_norm": 2.4131060616703484, + "language_loss": 0.78938639, + "learning_rate": 3.6775620288807073e-07, + "loss": 0.81043178, + "num_input_tokens_seen": 290625525, + "step": 13468, + "time_per_iteration": 2.7481191158294678 + }, + { + "auxiliary_loss_clip": 0.01095827, + "auxiliary_loss_mlp": 0.01025893, + "balance_loss_clip": 1.03550076, + "balance_loss_mlp": 1.0147531, + "epoch": 0.809800090184879, + "flos": 18988988544000.0, + "grad_norm": 1.863036262504337, + "language_loss": 0.67737466, + "learning_rate": 3.675311718038978e-07, + "loss": 0.69859189, + "num_input_tokens_seen": 290644935, + "step": 13469, + "time_per_iteration": 2.6411309242248535 + }, + { + "auxiliary_loss_clip": 0.01000462, + "auxiliary_loss_mlp": 0.01006561, + "balance_loss_clip": 1.00773001, + "balance_loss_mlp": 1.00516653, + "epoch": 0.809860213437547, + "flos": 66099516508800.0, + "grad_norm": 0.6947384431465805, + "language_loss": 0.54638267, + "learning_rate": 3.6730620262310683e-07, + "loss": 0.56645286, + "num_input_tokens_seen": 290710735, + "step": 13470, + "time_per_iteration": 3.368800401687622 + }, + { + "auxiliary_loss_clip": 0.01106442, + "auxiliary_loss_mlp": 0.01028612, + "balance_loss_clip": 1.03582871, + "balance_loss_mlp": 1.01704264, + "epoch": 0.8099203366902149, + "flos": 20882485992960.0, + "grad_norm": 2.380568963099226, + "language_loss": 0.69673979, + "learning_rate": 3.670812953542279e-07, + "loss": 0.7180903, + "num_input_tokens_seen": 290729565, + "step": 13471, + "time_per_iteration": 2.6116278171539307 + }, + { + "auxiliary_loss_clip": 0.01099408, + "auxiliary_loss_mlp": 0.01028634, + "balance_loss_clip": 1.03811002, + "balance_loss_mlp": 1.01664793, + "epoch": 0.8099804599428829, + "flos": 26030927111040.0, + "grad_norm": 1.7876254964812721, + "language_loss": 0.79963589, + "learning_rate": 3.6685645000579003e-07, + "loss": 0.82091635, + "num_input_tokens_seen": 290749360, + "step": 13472, + "time_per_iteration": 2.656299114227295 + }, + { + "auxiliary_loss_clip": 0.01020676, + "auxiliary_loss_mlp": 0.01001704, + "balance_loss_clip": 1.00737977, + "balance_loss_mlp": 1.00073814, + "epoch": 0.8100405831955508, + "flos": 69303573584640.0, + "grad_norm": 0.7639440927647733, + "language_loss": 0.57809198, + "learning_rate": 3.666316665863201e-07, + "loss": 0.59831583, + "num_input_tokens_seen": 290812145, + "step": 13473, + "time_per_iteration": 3.1810851097106934 + }, + { + "auxiliary_loss_clip": 0.01058837, + "auxiliary_loss_mlp": 0.01030171, + "balance_loss_clip": 1.03626239, + "balance_loss_mlp": 1.01749909, + "epoch": 0.8101007064482189, + "flos": 15012492468480.0, + "grad_norm": 1.6585074003521199, + "language_loss": 0.73932016, + "learning_rate": 3.664069451043399e-07, + "loss": 0.76021028, + "num_input_tokens_seen": 290829845, + "step": 13474, + "time_per_iteration": 2.7276382446289062 + }, + { + "auxiliary_loss_clip": 0.01096806, + "auxiliary_loss_mlp": 0.01037708, + "balance_loss_clip": 1.0382781, + "balance_loss_mlp": 1.02569187, + "epoch": 0.8101608297008868, + "flos": 21067210661760.0, + "grad_norm": 1.8626875728515482, + "language_loss": 0.78847283, + "learning_rate": 3.661822855683723e-07, + "loss": 0.80981803, + "num_input_tokens_seen": 290848815, + "step": 13475, + "time_per_iteration": 4.543492078781128 + }, + { + "auxiliary_loss_clip": 0.01096436, + "auxiliary_loss_mlp": 0.01035936, + "balance_loss_clip": 1.03686523, + "balance_loss_mlp": 1.02425337, + "epoch": 0.8102209529535548, + "flos": 23731279603200.0, + "grad_norm": 1.6073691325832198, + "language_loss": 0.75316137, + "learning_rate": 3.659576879869364e-07, + "loss": 0.77448511, + "num_input_tokens_seen": 290868580, + "step": 13476, + "time_per_iteration": 4.139512062072754 + }, + { + "auxiliary_loss_clip": 0.01089782, + "auxiliary_loss_mlp": 0.01036786, + "balance_loss_clip": 1.03533959, + "balance_loss_mlp": 1.02327955, + "epoch": 0.8102810762062228, + "flos": 10955879107200.0, + "grad_norm": 2.306148776958402, + "language_loss": 0.73640966, + "learning_rate": 3.657331523685485e-07, + "loss": 0.75767529, + "num_input_tokens_seen": 290883540, + "step": 13477, + "time_per_iteration": 2.632864236831665 + }, + { + "auxiliary_loss_clip": 0.01083746, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.03832769, + "balance_loss_mlp": 1.02404094, + "epoch": 0.8103411994588907, + "flos": 14648825220480.0, + "grad_norm": 2.218286037812516, + "language_loss": 0.69816357, + "learning_rate": 3.6550867872172365e-07, + "loss": 0.71935594, + "num_input_tokens_seen": 290901560, + "step": 13478, + "time_per_iteration": 4.151829242706299 + }, + { + "auxiliary_loss_clip": 0.01028235, + "auxiliary_loss_mlp": 0.01001319, + "balance_loss_clip": 1.00567842, + "balance_loss_mlp": 1.00037754, + "epoch": 0.8104013227115587, + "flos": 59153314665600.0, + "grad_norm": 0.6814078364409648, + "language_loss": 0.52150369, + "learning_rate": 3.6528426705497293e-07, + "loss": 0.54179931, + "num_input_tokens_seen": 290959185, + "step": 13479, + "time_per_iteration": 3.0655500888824463 + }, + { + "auxiliary_loss_clip": 0.01055198, + "auxiliary_loss_mlp": 0.01032847, + "balance_loss_clip": 1.03287351, + "balance_loss_mlp": 1.02027011, + "epoch": 0.8104614459642266, + "flos": 19828687760640.0, + "grad_norm": 1.696157853255504, + "language_loss": 0.7152276, + "learning_rate": 3.650599173768072e-07, + "loss": 0.73610806, + "num_input_tokens_seen": 290979585, + "step": 13480, + "time_per_iteration": 2.7024738788604736 + }, + { + "auxiliary_loss_clip": 0.01108515, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.03706646, + "balance_loss_mlp": 1.01809144, + "epoch": 0.8105215692168947, + "flos": 25374264624000.0, + "grad_norm": 1.7323825430226805, + "language_loss": 0.7977457, + "learning_rate": 3.648356296957327e-07, + "loss": 0.81913185, + "num_input_tokens_seen": 291000865, + "step": 13481, + "time_per_iteration": 2.5942976474761963 + }, + { + "auxiliary_loss_clip": 0.01085323, + "auxiliary_loss_mlp": 0.01030766, + "balance_loss_clip": 1.03626788, + "balance_loss_mlp": 1.01913691, + "epoch": 0.8105816924695626, + "flos": 20481722974080.0, + "grad_norm": 2.2690231246252686, + "language_loss": 0.72909606, + "learning_rate": 3.646114040202548e-07, + "loss": 0.75025702, + "num_input_tokens_seen": 291018285, + "step": 13482, + "time_per_iteration": 2.6350491046905518 + }, + { + "auxiliary_loss_clip": 0.01044563, + "auxiliary_loss_mlp": 0.01026969, + "balance_loss_clip": 1.03307259, + "balance_loss_mlp": 1.01405859, + "epoch": 0.8106418157222306, + "flos": 14538687143040.0, + "grad_norm": 2.648947443807841, + "language_loss": 0.65993869, + "learning_rate": 3.6438724035887705e-07, + "loss": 0.68065393, + "num_input_tokens_seen": 291035745, + "step": 13483, + "time_per_iteration": 4.212749719619751 + }, + { + "auxiliary_loss_clip": 0.01080725, + "auxiliary_loss_mlp": 0.01028092, + "balance_loss_clip": 1.03347242, + "balance_loss_mlp": 1.01528955, + "epoch": 0.8107019389748985, + "flos": 22564470205440.0, + "grad_norm": 1.846296028434146, + "language_loss": 0.76504505, + "learning_rate": 3.641631387200992e-07, + "loss": 0.78613329, + "num_input_tokens_seen": 291053280, + "step": 13484, + "time_per_iteration": 2.6466033458709717 + }, + { + "auxiliary_loss_clip": 0.01091182, + "auxiliary_loss_mlp": 0.01033252, + "balance_loss_clip": 1.03665829, + "balance_loss_mlp": 1.01950169, + "epoch": 0.8107620622275665, + "flos": 19609560840960.0, + "grad_norm": 1.4911204923404016, + "language_loss": 0.72589421, + "learning_rate": 3.639390991124183e-07, + "loss": 0.74713862, + "num_input_tokens_seen": 291072855, + "step": 13485, + "time_per_iteration": 2.7968504428863525 + }, + { + "auxiliary_loss_clip": 0.01060159, + "auxiliary_loss_mlp": 0.01037186, + "balance_loss_clip": 1.02962208, + "balance_loss_mlp": 1.02368569, + "epoch": 0.8108221854802344, + "flos": 16143498984960.0, + "grad_norm": 1.7176036308983489, + "language_loss": 0.75729263, + "learning_rate": 3.637151215443308e-07, + "loss": 0.77826607, + "num_input_tokens_seen": 291090285, + "step": 13486, + "time_per_iteration": 2.652395725250244 + }, + { + "auxiliary_loss_clip": 0.01089867, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.03695536, + "balance_loss_mlp": 1.01949561, + "epoch": 0.8108823087329025, + "flos": 21106209853440.0, + "grad_norm": 1.9697831053353734, + "language_loss": 0.72577608, + "learning_rate": 3.6349120602433045e-07, + "loss": 0.74698949, + "num_input_tokens_seen": 291107675, + "step": 13487, + "time_per_iteration": 2.5947606563568115 + }, + { + "auxiliary_loss_clip": 0.01046594, + "auxiliary_loss_mlp": 0.0103374, + "balance_loss_clip": 1.03592014, + "balance_loss_mlp": 1.02142572, + "epoch": 0.8109424319855704, + "flos": 29199648182400.0, + "grad_norm": 2.098798308260877, + "language_loss": 0.84576857, + "learning_rate": 3.6326735256090715e-07, + "loss": 0.8665719, + "num_input_tokens_seen": 291126900, + "step": 13488, + "time_per_iteration": 2.793503761291504 + }, + { + "auxiliary_loss_clip": 0.01111048, + "auxiliary_loss_mlp": 0.01032699, + "balance_loss_clip": 1.03847623, + "balance_loss_mlp": 1.02028883, + "epoch": 0.8110025552382384, + "flos": 23111856541440.0, + "grad_norm": 1.9295150325791675, + "language_loss": 0.73623288, + "learning_rate": 3.630435611625502e-07, + "loss": 0.75767034, + "num_input_tokens_seen": 291145285, + "step": 13489, + "time_per_iteration": 2.599238395690918 + }, + { + "auxiliary_loss_clip": 0.01065923, + "auxiliary_loss_mlp": 0.00769841, + "balance_loss_clip": 1.03703368, + "balance_loss_mlp": 1.00027084, + "epoch": 0.8110626784909064, + "flos": 22379961018240.0, + "grad_norm": 1.5399542352120712, + "language_loss": 0.71757072, + "learning_rate": 3.628198318377453e-07, + "loss": 0.73592842, + "num_input_tokens_seen": 291163485, + "step": 13490, + "time_per_iteration": 2.830582857131958 + }, + { + "auxiliary_loss_clip": 0.01077295, + "auxiliary_loss_mlp": 0.01050998, + "balance_loss_clip": 1.03662229, + "balance_loss_mlp": 1.03582263, + "epoch": 0.8111228017435743, + "flos": 23368043318400.0, + "grad_norm": 2.1733695936937103, + "language_loss": 0.71880186, + "learning_rate": 3.625961645949762e-07, + "loss": 0.74008483, + "num_input_tokens_seen": 291182215, + "step": 13491, + "time_per_iteration": 2.788850784301758 + }, + { + "auxiliary_loss_clip": 0.01107942, + "auxiliary_loss_mlp": 0.01029919, + "balance_loss_clip": 1.03627849, + "balance_loss_mlp": 1.01822448, + "epoch": 0.8111829249962423, + "flos": 21286553063040.0, + "grad_norm": 1.3514463639541505, + "language_loss": 0.67817056, + "learning_rate": 3.623725594427245e-07, + "loss": 0.6995492, + "num_input_tokens_seen": 291203145, + "step": 13492, + "time_per_iteration": 2.6831281185150146 + }, + { + "auxiliary_loss_clip": 0.01064465, + "auxiliary_loss_mlp": 0.01029335, + "balance_loss_clip": 1.03531671, + "balance_loss_mlp": 1.01716399, + "epoch": 0.8112430482489102, + "flos": 22345558767360.0, + "grad_norm": 1.6581742417712237, + "language_loss": 0.71983981, + "learning_rate": 3.6214901638947006e-07, + "loss": 0.74077779, + "num_input_tokens_seen": 291220600, + "step": 13493, + "time_per_iteration": 2.7153713703155518 + }, + { + "auxiliary_loss_clip": 0.01091343, + "auxiliary_loss_mlp": 0.01038969, + "balance_loss_clip": 1.03388119, + "balance_loss_mlp": 1.02628565, + "epoch": 0.8113031715015783, + "flos": 31138321962240.0, + "grad_norm": 1.76439624492188, + "language_loss": 0.70763975, + "learning_rate": 3.619255354436885e-07, + "loss": 0.72894287, + "num_input_tokens_seen": 291241195, + "step": 13494, + "time_per_iteration": 2.6391232013702393 + }, + { + "auxiliary_loss_clip": 0.01100106, + "auxiliary_loss_mlp": 0.01033523, + "balance_loss_clip": 1.03767419, + "balance_loss_mlp": 1.02014816, + "epoch": 0.8113632947542462, + "flos": 25335445000320.0, + "grad_norm": 1.998515349302589, + "language_loss": 0.76569247, + "learning_rate": 3.6170211661385543e-07, + "loss": 0.78702873, + "num_input_tokens_seen": 291258715, + "step": 13495, + "time_per_iteration": 2.588968515396118 + }, + { + "auxiliary_loss_clip": 0.01089895, + "auxiliary_loss_mlp": 0.01036707, + "balance_loss_clip": 1.03693032, + "balance_loss_mlp": 1.02406478, + "epoch": 0.8114234180069142, + "flos": 28439168411520.0, + "grad_norm": 1.859703318738602, + "language_loss": 0.80103755, + "learning_rate": 3.614787599084417e-07, + "loss": 0.82230359, + "num_input_tokens_seen": 291278030, + "step": 13496, + "time_per_iteration": 2.612717390060425 + }, + { + "auxiliary_loss_clip": 0.01098421, + "auxiliary_loss_mlp": 0.01031509, + "balance_loss_clip": 1.0357132, + "balance_loss_mlp": 1.01813412, + "epoch": 0.8114835412595821, + "flos": 20338870584960.0, + "grad_norm": 1.7561157530405371, + "language_loss": 0.71104527, + "learning_rate": 3.6125546533591787e-07, + "loss": 0.73234457, + "num_input_tokens_seen": 291296740, + "step": 13497, + "time_per_iteration": 2.505080461502075 + }, + { + "auxiliary_loss_clip": 0.01073865, + "auxiliary_loss_mlp": 0.01031093, + "balance_loss_clip": 1.03375506, + "balance_loss_mlp": 1.01949358, + "epoch": 0.8115436645122501, + "flos": 22490889194880.0, + "grad_norm": 1.5327513158552182, + "language_loss": 0.76614642, + "learning_rate": 3.610322329047508e-07, + "loss": 0.78719592, + "num_input_tokens_seen": 291318730, + "step": 13498, + "time_per_iteration": 2.6442582607269287 + }, + { + "auxiliary_loss_clip": 0.01109819, + "auxiliary_loss_mlp": 0.01034546, + "balance_loss_clip": 1.03731918, + "balance_loss_mlp": 1.02193928, + "epoch": 0.811603787764918, + "flos": 13845288021120.0, + "grad_norm": 3.6462574824728313, + "language_loss": 0.84119499, + "learning_rate": 3.608090626234055e-07, + "loss": 0.86263865, + "num_input_tokens_seen": 291336755, + "step": 13499, + "time_per_iteration": 2.483522653579712 + }, + { + "auxiliary_loss_clip": 0.01075443, + "auxiliary_loss_mlp": 0.01031753, + "balance_loss_clip": 1.03560185, + "balance_loss_mlp": 1.01798427, + "epoch": 0.8116639110175861, + "flos": 21614632911360.0, + "grad_norm": 1.449342518398089, + "language_loss": 0.76081306, + "learning_rate": 3.6058595450034603e-07, + "loss": 0.78188503, + "num_input_tokens_seen": 291356795, + "step": 13500, + "time_per_iteration": 2.605076313018799 + }, + { + "auxiliary_loss_clip": 0.01008001, + "auxiliary_loss_mlp": 0.00999684, + "balance_loss_clip": 1.00579894, + "balance_loss_mlp": 0.9987666, + "epoch": 0.811724034270254, + "flos": 64459799625600.0, + "grad_norm": 0.8052955879746776, + "language_loss": 0.59879011, + "learning_rate": 3.603629085440303e-07, + "loss": 0.61886698, + "num_input_tokens_seen": 291416005, + "step": 13501, + "time_per_iteration": 3.1991348266601562 + }, + { + "auxiliary_loss_clip": 0.01094365, + "auxiliary_loss_mlp": 0.01025632, + "balance_loss_clip": 1.03644705, + "balance_loss_mlp": 1.01366997, + "epoch": 0.811784157522922, + "flos": 24754123290240.0, + "grad_norm": 1.57620399349307, + "language_loss": 0.79127729, + "learning_rate": 3.6013992476291753e-07, + "loss": 0.81247723, + "num_input_tokens_seen": 291434870, + "step": 13502, + "time_per_iteration": 2.612614154815674 + }, + { + "auxiliary_loss_clip": 0.01081743, + "auxiliary_loss_mlp": 0.01037896, + "balance_loss_clip": 1.03356886, + "balance_loss_mlp": 1.02457452, + "epoch": 0.81184428077559, + "flos": 12167146563840.0, + "grad_norm": 1.8233893425242464, + "language_loss": 0.71166331, + "learning_rate": 3.599170031654635e-07, + "loss": 0.73285973, + "num_input_tokens_seen": 291452230, + "step": 13503, + "time_per_iteration": 2.61946964263916 + }, + { + "auxiliary_loss_clip": 0.01079859, + "auxiliary_loss_mlp": 0.01030554, + "balance_loss_clip": 1.03437757, + "balance_loss_mlp": 1.01704192, + "epoch": 0.8119044040282579, + "flos": 44422037775360.0, + "grad_norm": 1.451264981868303, + "language_loss": 0.67748487, + "learning_rate": 3.5969414376012065e-07, + "loss": 0.69858897, + "num_input_tokens_seen": 291477425, + "step": 13504, + "time_per_iteration": 2.850944995880127 + }, + { + "auxiliary_loss_clip": 0.01081144, + "auxiliary_loss_mlp": 0.01032065, + "balance_loss_clip": 1.03502822, + "balance_loss_mlp": 1.01892805, + "epoch": 0.8119645272809259, + "flos": 52155507957120.0, + "grad_norm": 6.247382834720127, + "language_loss": 0.74465597, + "learning_rate": 3.594713465553403e-07, + "loss": 0.76578808, + "num_input_tokens_seen": 291501070, + "step": 13505, + "time_per_iteration": 2.9765865802764893 + }, + { + "auxiliary_loss_clip": 0.01085863, + "auxiliary_loss_mlp": 0.01031618, + "balance_loss_clip": 1.03640378, + "balance_loss_mlp": 1.01850462, + "epoch": 0.8120246505335939, + "flos": 30232978640640.0, + "grad_norm": 1.9161202158648818, + "language_loss": 0.73033476, + "learning_rate": 3.5924861155957123e-07, + "loss": 0.75150955, + "num_input_tokens_seen": 291524945, + "step": 13506, + "time_per_iteration": 2.7573962211608887 + }, + { + "auxiliary_loss_clip": 0.01114046, + "auxiliary_loss_mlp": 0.01031523, + "balance_loss_clip": 1.03798163, + "balance_loss_mlp": 1.01900649, + "epoch": 0.8120847737862619, + "flos": 22127652910080.0, + "grad_norm": 2.1835357751588664, + "language_loss": 0.75858426, + "learning_rate": 3.590259387812593e-07, + "loss": 0.78003991, + "num_input_tokens_seen": 291544605, + "step": 13507, + "time_per_iteration": 2.5932223796844482 + }, + { + "auxiliary_loss_clip": 0.01110179, + "auxiliary_loss_mlp": 0.01028613, + "balance_loss_clip": 1.03553689, + "balance_loss_mlp": 1.01663268, + "epoch": 0.8121448970389298, + "flos": 23295180579840.0, + "grad_norm": 1.789952904121179, + "language_loss": 0.70542687, + "learning_rate": 3.5880332822884783e-07, + "loss": 0.72681475, + "num_input_tokens_seen": 291563850, + "step": 13508, + "time_per_iteration": 2.6186447143554688 + }, + { + "auxiliary_loss_clip": 0.01096661, + "auxiliary_loss_mlp": 0.01033361, + "balance_loss_clip": 1.03715634, + "balance_loss_mlp": 1.02163088, + "epoch": 0.8122050202915978, + "flos": 22164138149760.0, + "grad_norm": 2.344890772054223, + "language_loss": 0.75989026, + "learning_rate": 3.585807799107785e-07, + "loss": 0.78119051, + "num_input_tokens_seen": 291581730, + "step": 13509, + "time_per_iteration": 2.589594841003418 + }, + { + "auxiliary_loss_clip": 0.01110373, + "auxiliary_loss_mlp": 0.01030617, + "balance_loss_clip": 1.03736436, + "balance_loss_mlp": 1.01834416, + "epoch": 0.8122651435442657, + "flos": 23258946735360.0, + "grad_norm": 1.705010281678355, + "language_loss": 0.76900029, + "learning_rate": 3.58358293835491e-07, + "loss": 0.79041028, + "num_input_tokens_seen": 291601225, + "step": 13510, + "time_per_iteration": 2.6146199703216553 + }, + { + "auxiliary_loss_clip": 0.01099011, + "auxiliary_loss_mlp": 0.01035544, + "balance_loss_clip": 1.0352056, + "balance_loss_mlp": 1.02240086, + "epoch": 0.8123252667969337, + "flos": 16140015365760.0, + "grad_norm": 2.0243900954468446, + "language_loss": 0.69868124, + "learning_rate": 3.581358700114212e-07, + "loss": 0.72002673, + "num_input_tokens_seen": 291616995, + "step": 13511, + "time_per_iteration": 2.5841333866119385 + }, + { + "auxiliary_loss_clip": 0.0108991, + "auxiliary_loss_mlp": 0.01035297, + "balance_loss_clip": 1.03681922, + "balance_loss_mlp": 1.0227139, + "epoch": 0.8123853900496016, + "flos": 21245399055360.0, + "grad_norm": 1.8851833910284228, + "language_loss": 0.79274458, + "learning_rate": 3.57913508447004e-07, + "loss": 0.81399667, + "num_input_tokens_seen": 291636145, + "step": 13512, + "time_per_iteration": 2.6589250564575195 + }, + { + "auxiliary_loss_clip": 0.01096941, + "auxiliary_loss_mlp": 0.01028829, + "balance_loss_clip": 1.0360527, + "balance_loss_mlp": 1.01692605, + "epoch": 0.8124455133022697, + "flos": 64377596373120.0, + "grad_norm": 1.5979406334713135, + "language_loss": 0.63230824, + "learning_rate": 3.5769120915067076e-07, + "loss": 0.65356594, + "num_input_tokens_seen": 291662440, + "step": 13513, + "time_per_iteration": 2.990612030029297 + }, + { + "auxiliary_loss_clip": 0.01058491, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.03237724, + "balance_loss_mlp": 1.02057886, + "epoch": 0.8125056365549376, + "flos": 23842207779840.0, + "grad_norm": 1.8613542328195332, + "language_loss": 0.71270061, + "learning_rate": 3.5746897213085194e-07, + "loss": 0.73361969, + "num_input_tokens_seen": 291680950, + "step": 13514, + "time_per_iteration": 4.445888519287109 + }, + { + "auxiliary_loss_clip": 0.01073863, + "auxiliary_loss_mlp": 0.01030266, + "balance_loss_clip": 1.03600311, + "balance_loss_mlp": 1.01780891, + "epoch": 0.8125657598076056, + "flos": 23550325862400.0, + "grad_norm": 1.5528453792894483, + "language_loss": 0.62748504, + "learning_rate": 3.5724679739597364e-07, + "loss": 0.64852631, + "num_input_tokens_seen": 291702395, + "step": 13515, + "time_per_iteration": 4.218576192855835 + }, + { + "auxiliary_loss_clip": 0.01102975, + "auxiliary_loss_mlp": 0.00769685, + "balance_loss_clip": 1.03534853, + "balance_loss_mlp": 1.00016952, + "epoch": 0.8126258830602736, + "flos": 20704225772160.0, + "grad_norm": 5.114192306914355, + "language_loss": 0.75868255, + "learning_rate": 3.570246849544616e-07, + "loss": 0.7774092, + "num_input_tokens_seen": 291721135, + "step": 13516, + "time_per_iteration": 2.6113100051879883 + }, + { + "auxiliary_loss_clip": 0.0105995, + "auxiliary_loss_mlp": 0.01028758, + "balance_loss_clip": 1.03563333, + "balance_loss_mlp": 1.01692092, + "epoch": 0.8126860063129415, + "flos": 23618160696960.0, + "grad_norm": 1.4792626024719475, + "language_loss": 0.91386318, + "learning_rate": 3.5680263481473907e-07, + "loss": 0.9347502, + "num_input_tokens_seen": 291741235, + "step": 13517, + "time_per_iteration": 2.730743408203125 + }, + { + "auxiliary_loss_clip": 0.01101276, + "auxiliary_loss_mlp": 0.00770067, + "balance_loss_clip": 1.03974628, + "balance_loss_mlp": 1.00018048, + "epoch": 0.8127461295656095, + "flos": 25007149670400.0, + "grad_norm": 1.60312479075504, + "language_loss": 0.78797936, + "learning_rate": 3.565806469852244e-07, + "loss": 0.80669272, + "num_input_tokens_seen": 291761430, + "step": 13518, + "time_per_iteration": 4.1632232666015625 + }, + { + "auxiliary_loss_clip": 0.01096668, + "auxiliary_loss_mlp": 0.01029335, + "balance_loss_clip": 1.03643668, + "balance_loss_mlp": 1.01815319, + "epoch": 0.8128062528182775, + "flos": 27342169096320.0, + "grad_norm": 1.5642213115065133, + "language_loss": 0.78870726, + "learning_rate": 3.56358721474336e-07, + "loss": 0.80996728, + "num_input_tokens_seen": 291781755, + "step": 13519, + "time_per_iteration": 2.681206226348877 + }, + { + "auxiliary_loss_clip": 0.01109503, + "auxiliary_loss_mlp": 0.01033337, + "balance_loss_clip": 1.03672457, + "balance_loss_mlp": 1.02139187, + "epoch": 0.8128663760709455, + "flos": 26506312634880.0, + "grad_norm": 1.5860289304426558, + "language_loss": 0.70635796, + "learning_rate": 3.561368582904905e-07, + "loss": 0.72778636, + "num_input_tokens_seen": 291804410, + "step": 13520, + "time_per_iteration": 2.6522674560546875 + }, + { + "auxiliary_loss_clip": 0.01093185, + "auxiliary_loss_mlp": 0.01033091, + "balance_loss_clip": 1.03861439, + "balance_loss_mlp": 1.02049041, + "epoch": 0.8129264993236134, + "flos": 17931239815680.0, + "grad_norm": 1.3829440346213167, + "language_loss": 0.7262553, + "learning_rate": 3.5591505744209925e-07, + "loss": 0.74751806, + "num_input_tokens_seen": 291823285, + "step": 13521, + "time_per_iteration": 2.7141287326812744 + }, + { + "auxiliary_loss_clip": 0.01101075, + "auxiliary_loss_mlp": 0.01030446, + "balance_loss_clip": 1.03675306, + "balance_loss_mlp": 1.01783347, + "epoch": 0.8129866225762814, + "flos": 26177694082560.0, + "grad_norm": 1.8750216603820857, + "language_loss": 0.70207542, + "learning_rate": 3.5569331893757394e-07, + "loss": 0.7233907, + "num_input_tokens_seen": 291845305, + "step": 13522, + "time_per_iteration": 2.7125802040100098 + }, + { + "auxiliary_loss_clip": 0.01093707, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.03634274, + "balance_loss_mlp": 1.02078998, + "epoch": 0.8130467458289493, + "flos": 21032197879680.0, + "grad_norm": 1.6227036980267577, + "language_loss": 0.7064119, + "learning_rate": 3.554716427853233e-07, + "loss": 0.72767031, + "num_input_tokens_seen": 291863715, + "step": 13523, + "time_per_iteration": 4.0815582275390625 + }, + { + "auxiliary_loss_clip": 0.01096974, + "auxiliary_loss_mlp": 0.01031155, + "balance_loss_clip": 1.03545976, + "balance_loss_mlp": 1.01855457, + "epoch": 0.8131068690816173, + "flos": 15487051979520.0, + "grad_norm": 2.0371214657099435, + "language_loss": 0.70833939, + "learning_rate": 3.5525002899375256e-07, + "loss": 0.7296207, + "num_input_tokens_seen": 291880735, + "step": 13524, + "time_per_iteration": 2.6003952026367188 + }, + { + "auxiliary_loss_clip": 0.01095723, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.03494859, + "balance_loss_mlp": 1.01905477, + "epoch": 0.8131669923342852, + "flos": 29351227576320.0, + "grad_norm": 1.8442373927503088, + "language_loss": 0.62811154, + "learning_rate": 3.550284775712653e-07, + "loss": 0.64937574, + "num_input_tokens_seen": 291900535, + "step": 13525, + "time_per_iteration": 2.657466411590576 + }, + { + "auxiliary_loss_clip": 0.01079403, + "auxiliary_loss_mlp": 0.01036654, + "balance_loss_clip": 1.03601646, + "balance_loss_mlp": 1.02482271, + "epoch": 0.8132271155869533, + "flos": 35256162055680.0, + "grad_norm": 1.777429638442415, + "language_loss": 0.65313601, + "learning_rate": 3.548069885262628e-07, + "loss": 0.67429662, + "num_input_tokens_seen": 291919760, + "step": 13526, + "time_per_iteration": 2.7304532527923584 + }, + { + "auxiliary_loss_clip": 0.01083448, + "auxiliary_loss_mlp": 0.0102643, + "balance_loss_clip": 1.03576994, + "balance_loss_mlp": 1.01569486, + "epoch": 0.8132872388396212, + "flos": 27781895393280.0, + "grad_norm": 1.6432032258331546, + "language_loss": 0.75642002, + "learning_rate": 3.5458556186714473e-07, + "loss": 0.77751887, + "num_input_tokens_seen": 291938915, + "step": 13527, + "time_per_iteration": 2.667377471923828 + }, + { + "auxiliary_loss_clip": 0.01107517, + "auxiliary_loss_mlp": 0.01026255, + "balance_loss_clip": 1.03658116, + "balance_loss_mlp": 1.01454246, + "epoch": 0.8133473620922892, + "flos": 27819601695360.0, + "grad_norm": 2.105634119207497, + "language_loss": 0.70704925, + "learning_rate": 3.5436419760230706e-07, + "loss": 0.728387, + "num_input_tokens_seen": 291958145, + "step": 13528, + "time_per_iteration": 2.6163675785064697 + }, + { + "auxiliary_loss_clip": 0.01108822, + "auxiliary_loss_mlp": 0.01030423, + "balance_loss_clip": 1.03638566, + "balance_loss_mlp": 1.01875806, + "epoch": 0.8134074853449572, + "flos": 18989527248000.0, + "grad_norm": 1.9964746744268802, + "language_loss": 0.69046456, + "learning_rate": 3.5414289574014357e-07, + "loss": 0.71185702, + "num_input_tokens_seen": 291976860, + "step": 13529, + "time_per_iteration": 2.5404155254364014 + }, + { + "auxiliary_loss_clip": 0.01089372, + "auxiliary_loss_mlp": 0.01031807, + "balance_loss_clip": 1.03602207, + "balance_loss_mlp": 1.02028584, + "epoch": 0.8134676085976251, + "flos": 24242863057920.0, + "grad_norm": 2.1717910963600615, + "language_loss": 0.77427143, + "learning_rate": 3.5392165628904635e-07, + "loss": 0.79548317, + "num_input_tokens_seen": 291998085, + "step": 13530, + "time_per_iteration": 2.617090940475464 + }, + { + "auxiliary_loss_clip": 0.01097307, + "auxiliary_loss_mlp": 0.01035822, + "balance_loss_clip": 1.03674924, + "balance_loss_mlp": 1.02292991, + "epoch": 0.8135277318502931, + "flos": 19062389986560.0, + "grad_norm": 4.052594441167417, + "language_loss": 0.81679058, + "learning_rate": 3.537004792574052e-07, + "loss": 0.83812189, + "num_input_tokens_seen": 292016585, + "step": 13531, + "time_per_iteration": 2.6205062866210938 + }, + { + "auxiliary_loss_clip": 0.01084413, + "auxiliary_loss_mlp": 0.01034781, + "balance_loss_clip": 1.03383708, + "balance_loss_mlp": 1.02089322, + "epoch": 0.813587855102961, + "flos": 17269728992640.0, + "grad_norm": 1.9466339664768382, + "language_loss": 0.72048044, + "learning_rate": 3.534793646536065e-07, + "loss": 0.7416724, + "num_input_tokens_seen": 292033255, + "step": 13532, + "time_per_iteration": 2.6235249042510986 + }, + { + "auxiliary_loss_clip": 0.01076826, + "auxiliary_loss_mlp": 0.01028928, + "balance_loss_clip": 1.03568232, + "balance_loss_mlp": 1.01717389, + "epoch": 0.8136479783556291, + "flos": 20157593621760.0, + "grad_norm": 1.944089941040769, + "language_loss": 0.7643801, + "learning_rate": 3.5325831248603533e-07, + "loss": 0.78543758, + "num_input_tokens_seen": 292051800, + "step": 13533, + "time_per_iteration": 2.686540126800537 + }, + { + "auxiliary_loss_clip": 0.01112795, + "auxiliary_loss_mlp": 0.0077037, + "balance_loss_clip": 1.03745687, + "balance_loss_mlp": 1.00021124, + "epoch": 0.813708101608297, + "flos": 22052348046720.0, + "grad_norm": 1.645533384240896, + "language_loss": 0.76579952, + "learning_rate": 3.5303732276307495e-07, + "loss": 0.78463125, + "num_input_tokens_seen": 292072215, + "step": 13534, + "time_per_iteration": 2.6405410766601562 + }, + { + "auxiliary_loss_clip": 0.01090662, + "auxiliary_loss_mlp": 0.01028024, + "balance_loss_clip": 1.03678405, + "balance_loss_mlp": 1.01722336, + "epoch": 0.813768224860965, + "flos": 16173412035840.0, + "grad_norm": 2.1008954563080153, + "language_loss": 0.93045878, + "learning_rate": 3.5281639549310336e-07, + "loss": 0.95164573, + "num_input_tokens_seen": 292088830, + "step": 13535, + "time_per_iteration": 2.64209246635437 + }, + { + "auxiliary_loss_clip": 0.01071147, + "auxiliary_loss_mlp": 0.01027139, + "balance_loss_clip": 1.0390265, + "balance_loss_mlp": 1.01590967, + "epoch": 0.8138283481136329, + "flos": 24352318776960.0, + "grad_norm": 1.5385602481593355, + "language_loss": 0.70752996, + "learning_rate": 3.52595530684499e-07, + "loss": 0.72851282, + "num_input_tokens_seen": 292109225, + "step": 13536, + "time_per_iteration": 2.80938720703125 + }, + { + "auxiliary_loss_clip": 0.01072251, + "auxiliary_loss_mlp": 0.01029565, + "balance_loss_clip": 1.034621, + "balance_loss_mlp": 1.01691151, + "epoch": 0.8138884713663009, + "flos": 25516362827520.0, + "grad_norm": 1.7915852283109845, + "language_loss": 0.75374007, + "learning_rate": 3.5237472834563775e-07, + "loss": 0.77475834, + "num_input_tokens_seen": 292129660, + "step": 13537, + "time_per_iteration": 2.709963798522949 + }, + { + "auxiliary_loss_clip": 0.01083975, + "auxiliary_loss_mlp": 0.01039156, + "balance_loss_clip": 1.03624582, + "balance_loss_mlp": 1.02596569, + "epoch": 0.8139485946189688, + "flos": 22454368041600.0, + "grad_norm": 1.531988851802544, + "language_loss": 0.76327688, + "learning_rate": 3.5215398848489163e-07, + "loss": 0.78450817, + "num_input_tokens_seen": 292149090, + "step": 13538, + "time_per_iteration": 2.659142255783081 + }, + { + "auxiliary_loss_clip": 0.01090459, + "auxiliary_loss_mlp": 0.01029562, + "balance_loss_clip": 1.03432798, + "balance_loss_mlp": 1.01791525, + "epoch": 0.8140087178716369, + "flos": 21250391045760.0, + "grad_norm": 1.5412733274323733, + "language_loss": 0.78075993, + "learning_rate": 3.5193331111063176e-07, + "loss": 0.80196011, + "num_input_tokens_seen": 292169260, + "step": 13539, + "time_per_iteration": 2.637423515319824 + }, + { + "auxiliary_loss_clip": 0.01073968, + "auxiliary_loss_mlp": 0.01029706, + "balance_loss_clip": 1.04544592, + "balance_loss_mlp": 1.01841712, + "epoch": 0.8140688411243048, + "flos": 39415730774400.0, + "grad_norm": 2.1381487111045145, + "language_loss": 0.66290975, + "learning_rate": 3.5171269623122533e-07, + "loss": 0.68394649, + "num_input_tokens_seen": 292188145, + "step": 13540, + "time_per_iteration": 2.8771181106567383 + }, + { + "auxiliary_loss_clip": 0.01101069, + "auxiliary_loss_mlp": 0.01033274, + "balance_loss_clip": 1.03914928, + "balance_loss_mlp": 1.02268267, + "epoch": 0.8141289643769728, + "flos": 25415885508480.0, + "grad_norm": 1.6197380880165504, + "language_loss": 0.67438757, + "learning_rate": 3.5149214385503913e-07, + "loss": 0.69573104, + "num_input_tokens_seen": 292212135, + "step": 13541, + "time_per_iteration": 2.769536018371582 + }, + { + "auxiliary_loss_clip": 0.01106222, + "auxiliary_loss_mlp": 0.01034432, + "balance_loss_clip": 1.03566313, + "balance_loss_mlp": 1.02187276, + "epoch": 0.8141890876296408, + "flos": 12568053237120.0, + "grad_norm": 1.8846151947016416, + "language_loss": 0.69230938, + "learning_rate": 3.512716539904355e-07, + "loss": 0.71371591, + "num_input_tokens_seen": 292230645, + "step": 13542, + "time_per_iteration": 2.6285057067871094 + }, + { + "auxiliary_loss_clip": 0.0111203, + "auxiliary_loss_mlp": 0.01033744, + "balance_loss_clip": 1.03642273, + "balance_loss_mlp": 1.02083373, + "epoch": 0.8142492108823087, + "flos": 14967172483200.0, + "grad_norm": 2.925687794386818, + "language_loss": 0.79454219, + "learning_rate": 3.5105122664577613e-07, + "loss": 0.81599998, + "num_input_tokens_seen": 292243540, + "step": 13543, + "time_per_iteration": 2.5403339862823486 + }, + { + "auxiliary_loss_clip": 0.01081798, + "auxiliary_loss_mlp": 0.01035405, + "balance_loss_clip": 1.03946197, + "balance_loss_mlp": 1.02264941, + "epoch": 0.8143093341349767, + "flos": 12422004537600.0, + "grad_norm": 2.0517480511317774, + "language_loss": 0.78091002, + "learning_rate": 3.5083086182942003e-07, + "loss": 0.80208206, + "num_input_tokens_seen": 292261715, + "step": 13544, + "time_per_iteration": 2.782600164413452 + }, + { + "auxiliary_loss_clip": 0.01116058, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.03913999, + "balance_loss_mlp": 1.01734674, + "epoch": 0.8143694573876447, + "flos": 11910564737280.0, + "grad_norm": 3.0853831344468707, + "language_loss": 0.7382375, + "learning_rate": 3.5061055954972264e-07, + "loss": 0.75971621, + "num_input_tokens_seen": 292275080, + "step": 13545, + "time_per_iteration": 2.631141185760498 + }, + { + "auxiliary_loss_clip": 0.01096875, + "auxiliary_loss_mlp": 0.01029281, + "balance_loss_clip": 1.03640938, + "balance_loss_mlp": 1.0174439, + "epoch": 0.8144295806403127, + "flos": 21212900225280.0, + "grad_norm": 1.6342395606373197, + "language_loss": 0.76933265, + "learning_rate": 3.5039031981503776e-07, + "loss": 0.79059422, + "num_input_tokens_seen": 292294635, + "step": 13546, + "time_per_iteration": 2.6105756759643555 + }, + { + "auxiliary_loss_clip": 0.0110063, + "auxiliary_loss_mlp": 0.01030809, + "balance_loss_clip": 1.0386976, + "balance_loss_mlp": 1.01948416, + "epoch": 0.8144897038929806, + "flos": 19865280741120.0, + "grad_norm": 2.057693835457072, + "language_loss": 0.70437783, + "learning_rate": 3.501701426337178e-07, + "loss": 0.72569221, + "num_input_tokens_seen": 292312695, + "step": 13547, + "time_per_iteration": 2.6459848880767822 + }, + { + "auxiliary_loss_clip": 0.01112435, + "auxiliary_loss_mlp": 0.01036703, + "balance_loss_clip": 1.03837729, + "balance_loss_mlp": 1.02320886, + "epoch": 0.8145498271456486, + "flos": 24571733005440.0, + "grad_norm": 1.7911803251126166, + "language_loss": 0.70297545, + "learning_rate": 3.49950028014111e-07, + "loss": 0.7244668, + "num_input_tokens_seen": 292332005, + "step": 13548, + "time_per_iteration": 2.7214651107788086 + }, + { + "auxiliary_loss_clip": 0.01099863, + "auxiliary_loss_mlp": 0.01033065, + "balance_loss_clip": 1.03860509, + "balance_loss_mlp": 1.01963055, + "epoch": 0.8146099503983165, + "flos": 20193037367040.0, + "grad_norm": 4.113506280616557, + "language_loss": 0.77017093, + "learning_rate": 3.4972997596456444e-07, + "loss": 0.79150021, + "num_input_tokens_seen": 292348365, + "step": 13549, + "time_per_iteration": 2.6624276638031006 + }, + { + "auxiliary_loss_clip": 0.011122, + "auxiliary_loss_mlp": 0.01030353, + "balance_loss_clip": 1.03999424, + "balance_loss_mlp": 1.01782978, + "epoch": 0.8146700736509845, + "flos": 19536949497600.0, + "grad_norm": 1.9226967396977621, + "language_loss": 0.71076775, + "learning_rate": 3.4950998649342233e-07, + "loss": 0.73219323, + "num_input_tokens_seen": 292368050, + "step": 13550, + "time_per_iteration": 2.7254621982574463 + }, + { + "auxiliary_loss_clip": 0.01094556, + "auxiliary_loss_mlp": 0.01025932, + "balance_loss_clip": 1.0368104, + "balance_loss_mlp": 1.01444018, + "epoch": 0.8147301969036524, + "flos": 18041341979520.0, + "grad_norm": 1.9888509797715757, + "language_loss": 0.71529424, + "learning_rate": 3.4929005960902826e-07, + "loss": 0.73649913, + "num_input_tokens_seen": 292385315, + "step": 13551, + "time_per_iteration": 2.704594850540161 + }, + { + "auxiliary_loss_clip": 0.01072466, + "auxiliary_loss_mlp": 0.01036963, + "balance_loss_clip": 1.03925037, + "balance_loss_mlp": 1.02343869, + "epoch": 0.8147903201563205, + "flos": 18004713085440.0, + "grad_norm": 1.9897080161612837, + "language_loss": 0.68656695, + "learning_rate": 3.4907019531971926e-07, + "loss": 0.70766115, + "num_input_tokens_seen": 292403375, + "step": 13552, + "time_per_iteration": 2.7425405979156494 + }, + { + "auxiliary_loss_clip": 0.01107317, + "auxiliary_loss_mlp": 0.01043397, + "balance_loss_clip": 1.03570342, + "balance_loss_mlp": 1.03133857, + "epoch": 0.8148504434089884, + "flos": 20259327916800.0, + "grad_norm": 1.7008594179120202, + "language_loss": 0.82082725, + "learning_rate": 3.4885039363383407e-07, + "loss": 0.84233445, + "num_input_tokens_seen": 292419260, + "step": 13553, + "time_per_iteration": 2.5453405380249023 + }, + { + "auxiliary_loss_clip": 0.01097272, + "auxiliary_loss_mlp": 0.01030289, + "balance_loss_clip": 1.035079, + "balance_loss_mlp": 1.01831412, + "epoch": 0.8149105666616564, + "flos": 12494723621760.0, + "grad_norm": 1.6418052636171558, + "language_loss": 0.67904902, + "learning_rate": 3.4863065455970795e-07, + "loss": 0.70032459, + "num_input_tokens_seen": 292436095, + "step": 13554, + "time_per_iteration": 4.209248781204224 + }, + { + "auxiliary_loss_clip": 0.01082623, + "auxiliary_loss_mlp": 0.01041493, + "balance_loss_clip": 1.035748, + "balance_loss_mlp": 1.02727127, + "epoch": 0.8149706899143244, + "flos": 32523683662080.0, + "grad_norm": 1.9729231386540171, + "language_loss": 0.66057062, + "learning_rate": 3.484109781056723e-07, + "loss": 0.68181175, + "num_input_tokens_seen": 292457190, + "step": 13555, + "time_per_iteration": 4.274117708206177 + }, + { + "auxiliary_loss_clip": 0.01102138, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.03688693, + "balance_loss_mlp": 1.02167439, + "epoch": 0.8150308131669923, + "flos": 19386088375680.0, + "grad_norm": 2.108444825647498, + "language_loss": 0.7319755, + "learning_rate": 3.4819136428005844e-07, + "loss": 0.75334281, + "num_input_tokens_seen": 292474300, + "step": 13556, + "time_per_iteration": 2.5886549949645996 + }, + { + "auxiliary_loss_clip": 0.01099496, + "auxiliary_loss_mlp": 0.01027956, + "balance_loss_clip": 1.03907287, + "balance_loss_mlp": 1.01664877, + "epoch": 0.8150909364196604, + "flos": 17421380213760.0, + "grad_norm": 1.654846698931865, + "language_loss": 0.80619091, + "learning_rate": 3.4797181309119307e-07, + "loss": 0.82746542, + "num_input_tokens_seen": 292492420, + "step": 13557, + "time_per_iteration": 4.058533430099487 + }, + { + "auxiliary_loss_clip": 0.01089108, + "auxiliary_loss_mlp": 0.01034069, + "balance_loss_clip": 1.03591609, + "balance_loss_mlp": 1.02201128, + "epoch": 0.8151510596723283, + "flos": 27162795553920.0, + "grad_norm": 1.7508168660237897, + "language_loss": 0.6597842, + "learning_rate": 3.4775232454740255e-07, + "loss": 0.68101597, + "num_input_tokens_seen": 292512895, + "step": 13558, + "time_per_iteration": 2.7690083980560303 + }, + { + "auxiliary_loss_clip": 0.01029498, + "auxiliary_loss_mlp": 0.01004693, + "balance_loss_clip": 1.00695944, + "balance_loss_mlp": 1.00384712, + "epoch": 0.8152111829249963, + "flos": 64219052718720.0, + "grad_norm": 0.8394726411943846, + "language_loss": 0.56896985, + "learning_rate": 3.4753289865700896e-07, + "loss": 0.58931184, + "num_input_tokens_seen": 292566580, + "step": 13559, + "time_per_iteration": 3.114321231842041 + }, + { + "auxiliary_loss_clip": 0.01012079, + "auxiliary_loss_mlp": 0.01011711, + "balance_loss_clip": 1.00770724, + "balance_loss_mlp": 1.0104531, + "epoch": 0.8152713061776642, + "flos": 67072012306560.0, + "grad_norm": 0.6789957550904517, + "language_loss": 0.55196381, + "learning_rate": 3.473135354283334e-07, + "loss": 0.57220173, + "num_input_tokens_seen": 292621490, + "step": 13560, + "time_per_iteration": 3.059293746948242 + }, + { + "auxiliary_loss_clip": 0.0108755, + "auxiliary_loss_mlp": 0.01029779, + "balance_loss_clip": 1.03620529, + "balance_loss_mlp": 1.01832318, + "epoch": 0.8153314294303322, + "flos": 14391130072320.0, + "grad_norm": 1.7604364526960343, + "language_loss": 0.67580026, + "learning_rate": 3.470942348696948e-07, + "loss": 0.6969735, + "num_input_tokens_seen": 292638660, + "step": 13561, + "time_per_iteration": 2.659605026245117 + }, + { + "auxiliary_loss_clip": 0.01103139, + "auxiliary_loss_mlp": 0.01035516, + "balance_loss_clip": 1.03822076, + "balance_loss_mlp": 1.02304101, + "epoch": 0.8153915526830001, + "flos": 25623520076160.0, + "grad_norm": 1.5670796727664797, + "language_loss": 0.81579733, + "learning_rate": 3.468749969894085e-07, + "loss": 0.83718389, + "num_input_tokens_seen": 292658545, + "step": 13562, + "time_per_iteration": 4.182463884353638 + }, + { + "auxiliary_loss_clip": 0.01085183, + "auxiliary_loss_mlp": 0.01033103, + "balance_loss_clip": 1.03773975, + "balance_loss_mlp": 1.02135468, + "epoch": 0.8154516759356681, + "flos": 23369156640000.0, + "grad_norm": 1.459474907859823, + "language_loss": 0.71938479, + "learning_rate": 3.4665582179578734e-07, + "loss": 0.74056768, + "num_input_tokens_seen": 292678460, + "step": 13563, + "time_per_iteration": 2.695099353790283 + }, + { + "auxiliary_loss_clip": 0.01025068, + "auxiliary_loss_mlp": 0.01029122, + "balance_loss_clip": 1.03488255, + "balance_loss_mlp": 1.01562715, + "epoch": 0.815511799188336, + "flos": 28149189914880.0, + "grad_norm": 1.6109076046410835, + "language_loss": 0.702739, + "learning_rate": 3.4643670929714387e-07, + "loss": 0.72328091, + "num_input_tokens_seen": 292699815, + "step": 13564, + "time_per_iteration": 3.0163979530334473 + }, + { + "auxiliary_loss_clip": 0.0108271, + "auxiliary_loss_mlp": 0.0102893, + "balance_loss_clip": 1.03672302, + "balance_loss_mlp": 1.01679492, + "epoch": 0.8155719224410041, + "flos": 16983413683200.0, + "grad_norm": 2.0873578348376745, + "language_loss": 0.70476174, + "learning_rate": 3.462176595017854e-07, + "loss": 0.72587812, + "num_input_tokens_seen": 292717370, + "step": 13565, + "time_per_iteration": 2.8652422428131104 + }, + { + "auxiliary_loss_clip": 0.01097982, + "auxiliary_loss_mlp": 0.01031925, + "balance_loss_clip": 1.03627336, + "balance_loss_mlp": 1.01994491, + "epoch": 0.815632045693672, + "flos": 24681727428480.0, + "grad_norm": 1.7436798411842787, + "language_loss": 0.78950644, + "learning_rate": 3.459986724180188e-07, + "loss": 0.8108055, + "num_input_tokens_seen": 292737110, + "step": 13566, + "time_per_iteration": 2.6846365928649902 + }, + { + "auxiliary_loss_clip": 0.01087086, + "auxiliary_loss_mlp": 0.01029651, + "balance_loss_clip": 1.03798318, + "balance_loss_mlp": 1.01873779, + "epoch": 0.81569216894634, + "flos": 19938323047680.0, + "grad_norm": 1.5991898000196176, + "language_loss": 0.82388943, + "learning_rate": 3.457797480541491e-07, + "loss": 0.84505683, + "num_input_tokens_seen": 292756510, + "step": 13567, + "time_per_iteration": 2.6953818798065186 + }, + { + "auxiliary_loss_clip": 0.01105808, + "auxiliary_loss_mlp": 0.01028496, + "balance_loss_clip": 1.03625703, + "balance_loss_mlp": 1.01798785, + "epoch": 0.8157522921990079, + "flos": 21799393493760.0, + "grad_norm": 2.2084802673905592, + "language_loss": 0.79599839, + "learning_rate": 3.455608864184771e-07, + "loss": 0.81734145, + "num_input_tokens_seen": 292776710, + "step": 13568, + "time_per_iteration": 2.6095540523529053 + }, + { + "auxiliary_loss_clip": 0.01088313, + "auxiliary_loss_mlp": 0.01030015, + "balance_loss_clip": 1.03864861, + "balance_loss_mlp": 1.01857734, + "epoch": 0.8158124154516759, + "flos": 18508323720960.0, + "grad_norm": 1.8748620768134565, + "language_loss": 0.77194703, + "learning_rate": 3.453420875193016e-07, + "loss": 0.79313028, + "num_input_tokens_seen": 292794350, + "step": 13569, + "time_per_iteration": 2.7158358097076416 + }, + { + "auxiliary_loss_clip": 0.01107012, + "auxiliary_loss_mlp": 0.01039138, + "balance_loss_clip": 1.03705049, + "balance_loss_mlp": 1.02786636, + "epoch": 0.815872538704344, + "flos": 26830801123200.0, + "grad_norm": 2.207936196273456, + "language_loss": 0.59039974, + "learning_rate": 3.451233513649199e-07, + "loss": 0.61186123, + "num_input_tokens_seen": 292814005, + "step": 13570, + "time_per_iteration": 2.6274027824401855 + }, + { + "auxiliary_loss_clip": 0.01099743, + "auxiliary_loss_mlp": 0.01037351, + "balance_loss_clip": 1.03609109, + "balance_loss_mlp": 1.02433372, + "epoch": 0.8159326619570119, + "flos": 21725704742400.0, + "grad_norm": 1.7456808566314872, + "language_loss": 0.8209976, + "learning_rate": 3.4490467796362687e-07, + "loss": 0.84236854, + "num_input_tokens_seen": 292833485, + "step": 13571, + "time_per_iteration": 2.607311725616455 + }, + { + "auxiliary_loss_clip": 0.01082011, + "auxiliary_loss_mlp": 0.01040519, + "balance_loss_clip": 1.03530788, + "balance_loss_mlp": 1.02747178, + "epoch": 0.8159927852096799, + "flos": 13840726993920.0, + "grad_norm": 2.3386046142984966, + "language_loss": 0.7775113, + "learning_rate": 3.446860673237142e-07, + "loss": 0.79873657, + "num_input_tokens_seen": 292848045, + "step": 13572, + "time_per_iteration": 2.615434169769287 + }, + { + "auxiliary_loss_clip": 0.01110553, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.0374527, + "balance_loss_mlp": 1.0209341, + "epoch": 0.8160529084623478, + "flos": 24499516711680.0, + "grad_norm": 1.477093078405139, + "language_loss": 0.65240854, + "learning_rate": 3.4446751945347186e-07, + "loss": 0.67384559, + "num_input_tokens_seen": 292869965, + "step": 13573, + "time_per_iteration": 2.6414575576782227 + }, + { + "auxiliary_loss_clip": 0.01075717, + "auxiliary_loss_mlp": 0.01028632, + "balance_loss_clip": 1.03710008, + "balance_loss_mlp": 1.0173732, + "epoch": 0.8161130317150158, + "flos": 24826339584000.0, + "grad_norm": 1.642166809234801, + "language_loss": 0.75473046, + "learning_rate": 3.442490343611868e-07, + "loss": 0.77577394, + "num_input_tokens_seen": 292889680, + "step": 13574, + "time_per_iteration": 2.85577392578125 + }, + { + "auxiliary_loss_clip": 0.01101144, + "auxiliary_loss_mlp": 0.0103536, + "balance_loss_clip": 1.03803658, + "balance_loss_mlp": 1.02264595, + "epoch": 0.8161731549676837, + "flos": 30956542208640.0, + "grad_norm": 2.5612570404533157, + "language_loss": 0.60302323, + "learning_rate": 3.4403061205514485e-07, + "loss": 0.62438828, + "num_input_tokens_seen": 292912360, + "step": 13575, + "time_per_iteration": 2.725813627243042 + }, + { + "auxiliary_loss_clip": 0.01030079, + "auxiliary_loss_mlp": 0.01039791, + "balance_loss_clip": 1.03109765, + "balance_loss_mlp": 1.02550936, + "epoch": 0.8162332782203517, + "flos": 18551991680640.0, + "grad_norm": 7.314687575537146, + "language_loss": 0.74446952, + "learning_rate": 3.4381225254362736e-07, + "loss": 0.76516831, + "num_input_tokens_seen": 292928325, + "step": 13576, + "time_per_iteration": 2.8337759971618652 + }, + { + "auxiliary_loss_clip": 0.01010195, + "auxiliary_loss_mlp": 0.01001162, + "balance_loss_clip": 1.00829458, + "balance_loss_mlp": 1.00028598, + "epoch": 0.8162934014730197, + "flos": 70386853904640.0, + "grad_norm": 0.8299990748373413, + "language_loss": 0.58698022, + "learning_rate": 3.435939558349155e-07, + "loss": 0.60709381, + "num_input_tokens_seen": 292992795, + "step": 13577, + "time_per_iteration": 3.217165470123291 + }, + { + "auxiliary_loss_clip": 0.01050236, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.03245091, + "balance_loss_mlp": 1.01977444, + "epoch": 0.8163535247256877, + "flos": 21214839559680.0, + "grad_norm": 1.6040267571253908, + "language_loss": 0.70921433, + "learning_rate": 3.4337572193728747e-07, + "loss": 0.73003376, + "num_input_tokens_seen": 293011950, + "step": 13578, + "time_per_iteration": 2.840709686279297 + }, + { + "auxiliary_loss_clip": 0.01068471, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.03506184, + "balance_loss_mlp": 1.0190444, + "epoch": 0.8164136479783556, + "flos": 21098847565440.0, + "grad_norm": 2.7752862595751977, + "language_loss": 0.73731124, + "learning_rate": 3.431575508590172e-07, + "loss": 0.7583034, + "num_input_tokens_seen": 293030175, + "step": 13579, + "time_per_iteration": 2.812387704849243 + }, + { + "auxiliary_loss_clip": 0.01110978, + "auxiliary_loss_mlp": 0.01028039, + "balance_loss_clip": 1.03761864, + "balance_loss_mlp": 1.01615429, + "epoch": 0.8164737712310236, + "flos": 21720640924800.0, + "grad_norm": 5.498991527014378, + "language_loss": 0.79516351, + "learning_rate": 3.4293944260837873e-07, + "loss": 0.81655371, + "num_input_tokens_seen": 293047980, + "step": 13580, + "time_per_iteration": 2.8092665672302246 + }, + { + "auxiliary_loss_clip": 0.01071948, + "auxiliary_loss_mlp": 0.01034847, + "balance_loss_clip": 1.03299272, + "balance_loss_mlp": 1.02182913, + "epoch": 0.8165338944836915, + "flos": 19536805843200.0, + "grad_norm": 1.723429137426299, + "language_loss": 0.69085348, + "learning_rate": 3.4272139719364314e-07, + "loss": 0.71192145, + "num_input_tokens_seen": 293067030, + "step": 13581, + "time_per_iteration": 2.7907984256744385 + }, + { + "auxiliary_loss_clip": 0.01107871, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.03613353, + "balance_loss_mlp": 1.02049136, + "epoch": 0.8165940177363595, + "flos": 22928568416640.0, + "grad_norm": 1.8992496957974652, + "language_loss": 0.59582806, + "learning_rate": 3.4250341462307786e-07, + "loss": 0.61723232, + "num_input_tokens_seen": 293085575, + "step": 13582, + "time_per_iteration": 2.72542405128479 + }, + { + "auxiliary_loss_clip": 0.0107424, + "auxiliary_loss_mlp": 0.00769809, + "balance_loss_clip": 1.03585207, + "balance_loss_mlp": 1.00015545, + "epoch": 0.8166541409890276, + "flos": 23370377702400.0, + "grad_norm": 1.954054796899383, + "language_loss": 0.82329261, + "learning_rate": 3.4228549490494897e-07, + "loss": 0.84173316, + "num_input_tokens_seen": 293108200, + "step": 13583, + "time_per_iteration": 2.749908685684204 + }, + { + "auxiliary_loss_clip": 0.01088673, + "auxiliary_loss_mlp": 0.01025238, + "balance_loss_clip": 1.03623259, + "balance_loss_mlp": 1.01392472, + "epoch": 0.8167142642416955, + "flos": 18441997257600.0, + "grad_norm": 1.802555874623744, + "language_loss": 0.74573183, + "learning_rate": 3.4206763804752093e-07, + "loss": 0.76687098, + "num_input_tokens_seen": 293126020, + "step": 13584, + "time_per_iteration": 2.8091073036193848 + }, + { + "auxiliary_loss_clip": 0.01098996, + "auxiliary_loss_mlp": 0.0102858, + "balance_loss_clip": 1.03830242, + "balance_loss_mlp": 1.01618278, + "epoch": 0.8167743874943635, + "flos": 21214983214080.0, + "grad_norm": 1.689121999421373, + "language_loss": 0.74577987, + "learning_rate": 3.4184984405905405e-07, + "loss": 0.76705563, + "num_input_tokens_seen": 293144620, + "step": 13585, + "time_per_iteration": 2.6251516342163086 + }, + { + "auxiliary_loss_clip": 0.01083034, + "auxiliary_loss_mlp": 0.01035775, + "balance_loss_clip": 1.03814149, + "balance_loss_mlp": 1.02334797, + "epoch": 0.8168345107470314, + "flos": 18697681244160.0, + "grad_norm": 2.110704900607274, + "language_loss": 0.6954788, + "learning_rate": 3.416321129478068e-07, + "loss": 0.71666694, + "num_input_tokens_seen": 293162850, + "step": 13586, + "time_per_iteration": 2.6488070487976074 + }, + { + "auxiliary_loss_clip": 0.01049954, + "auxiliary_loss_mlp": 0.01038255, + "balance_loss_clip": 1.03342056, + "balance_loss_mlp": 1.02592838, + "epoch": 0.8168946339996994, + "flos": 16253098358400.0, + "grad_norm": 1.5273465759672988, + "language_loss": 0.60744089, + "learning_rate": 3.4141444472203594e-07, + "loss": 0.62832302, + "num_input_tokens_seen": 293181620, + "step": 13587, + "time_per_iteration": 2.7878332138061523 + }, + { + "auxiliary_loss_clip": 0.01100484, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.03639674, + "balance_loss_mlp": 1.02172291, + "epoch": 0.8169547572523673, + "flos": 26941585645440.0, + "grad_norm": 2.223800946247814, + "language_loss": 0.6970458, + "learning_rate": 3.4119683938999624e-07, + "loss": 0.71839273, + "num_input_tokens_seen": 293200270, + "step": 13588, + "time_per_iteration": 2.692920207977295 + }, + { + "auxiliary_loss_clip": 0.01085855, + "auxiliary_loss_mlp": 0.01043402, + "balance_loss_clip": 1.03655553, + "balance_loss_mlp": 1.02848303, + "epoch": 0.8170148805050353, + "flos": 18952323736320.0, + "grad_norm": 1.5303676433154123, + "language_loss": 0.73124111, + "learning_rate": 3.4097929695993854e-07, + "loss": 0.75253367, + "num_input_tokens_seen": 293218960, + "step": 13589, + "time_per_iteration": 2.679173469543457 + }, + { + "auxiliary_loss_clip": 0.01094872, + "auxiliary_loss_mlp": 0.01032909, + "balance_loss_clip": 1.03692865, + "balance_loss_mlp": 1.02016521, + "epoch": 0.8170750037577033, + "flos": 21834909066240.0, + "grad_norm": 2.2699236258793456, + "language_loss": 0.73170865, + "learning_rate": 3.4076181744011166e-07, + "loss": 0.75298643, + "num_input_tokens_seen": 293236450, + "step": 13590, + "time_per_iteration": 2.661827802658081 + }, + { + "auxiliary_loss_clip": 0.01112691, + "auxiliary_loss_mlp": 0.01033597, + "balance_loss_clip": 1.03789759, + "balance_loss_mlp": 1.01964402, + "epoch": 0.8171351270103713, + "flos": 33507169021440.0, + "grad_norm": 2.228487135956597, + "language_loss": 0.65462661, + "learning_rate": 3.4054440083876345e-07, + "loss": 0.67608947, + "num_input_tokens_seen": 293256480, + "step": 13591, + "time_per_iteration": 2.713564872741699 + }, + { + "auxiliary_loss_clip": 0.01110837, + "auxiliary_loss_mlp": 0.01036337, + "balance_loss_clip": 1.03630888, + "balance_loss_mlp": 1.02364123, + "epoch": 0.8171952502630392, + "flos": 22708184520960.0, + "grad_norm": 2.2790144502571366, + "language_loss": 0.68108523, + "learning_rate": 3.403270471641373e-07, + "loss": 0.70255697, + "num_input_tokens_seen": 293274960, + "step": 13592, + "time_per_iteration": 2.673107862472534 + }, + { + "auxiliary_loss_clip": 0.01086566, + "auxiliary_loss_mlp": 0.01029843, + "balance_loss_clip": 1.03531361, + "balance_loss_mlp": 1.01699781, + "epoch": 0.8172553735157072, + "flos": 26723715701760.0, + "grad_norm": 1.5485466533329424, + "language_loss": 0.6656639, + "learning_rate": 3.401097564244759e-07, + "loss": 0.68682802, + "num_input_tokens_seen": 293295945, + "step": 13593, + "time_per_iteration": 2.738813877105713 + }, + { + "auxiliary_loss_clip": 0.01098161, + "auxiliary_loss_mlp": 0.01032061, + "balance_loss_clip": 1.03540421, + "balance_loss_mlp": 1.02022982, + "epoch": 0.8173154967683751, + "flos": 15961072786560.0, + "grad_norm": 1.90048610301986, + "language_loss": 0.69598675, + "learning_rate": 3.398925286280188e-07, + "loss": 0.71728897, + "num_input_tokens_seen": 293313300, + "step": 13594, + "time_per_iteration": 5.800758361816406 + }, + { + "auxiliary_loss_clip": 0.01110285, + "auxiliary_loss_mlp": 0.01033044, + "balance_loss_clip": 1.0364691, + "balance_loss_mlp": 1.02115333, + "epoch": 0.8173756200210431, + "flos": 25986720447360.0, + "grad_norm": 1.8053968351175154, + "language_loss": 0.65974349, + "learning_rate": 3.3967536378300456e-07, + "loss": 0.68117678, + "num_input_tokens_seen": 293333085, + "step": 13595, + "time_per_iteration": 2.6032371520996094 + }, + { + "auxiliary_loss_clip": 0.01068247, + "auxiliary_loss_mlp": 0.01028339, + "balance_loss_clip": 1.03591299, + "balance_loss_mlp": 1.01576889, + "epoch": 0.8174357432737112, + "flos": 25664422688640.0, + "grad_norm": 1.659795934344192, + "language_loss": 0.78425729, + "learning_rate": 3.394582618976658e-07, + "loss": 0.80522317, + "num_input_tokens_seen": 293351895, + "step": 13596, + "time_per_iteration": 4.231920003890991 + }, + { + "auxiliary_loss_clip": 0.01081938, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.03306651, + "balance_loss_mlp": 1.01600397, + "epoch": 0.8174958665263791, + "flos": 21835088634240.0, + "grad_norm": 2.5636613927912775, + "language_loss": 0.58887529, + "learning_rate": 3.392412229802362e-07, + "loss": 0.60998344, + "num_input_tokens_seen": 293371165, + "step": 13597, + "time_per_iteration": 2.699782133102417 + }, + { + "auxiliary_loss_clip": 0.0107094, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.03980625, + "balance_loss_mlp": 1.02193189, + "epoch": 0.8175559897790471, + "flos": 22455517276800.0, + "grad_norm": 1.534270538423627, + "language_loss": 0.82330656, + "learning_rate": 3.390242470389462e-07, + "loss": 0.84435457, + "num_input_tokens_seen": 293391150, + "step": 13598, + "time_per_iteration": 2.7620291709899902 + }, + { + "auxiliary_loss_clip": 0.01052171, + "auxiliary_loss_mlp": 0.01031716, + "balance_loss_clip": 1.03996241, + "balance_loss_mlp": 1.01993775, + "epoch": 0.817616113031715, + "flos": 23615790399360.0, + "grad_norm": 1.8636627263308922, + "language_loss": 0.82549691, + "learning_rate": 3.3880733408202277e-07, + "loss": 0.84633583, + "num_input_tokens_seen": 293409440, + "step": 13599, + "time_per_iteration": 2.8193368911743164 + }, + { + "auxiliary_loss_clip": 0.01057864, + "auxiliary_loss_mlp": 0.0104518, + "balance_loss_clip": 1.03178751, + "balance_loss_mlp": 1.03132749, + "epoch": 0.817676236284383, + "flos": 27672260106240.0, + "grad_norm": 2.111179301114437, + "language_loss": 0.83922112, + "learning_rate": 3.3859048411769186e-07, + "loss": 0.86025155, + "num_input_tokens_seen": 293428995, + "step": 13600, + "time_per_iteration": 2.7920475006103516 + }, + { + "auxiliary_loss_clip": 0.01074994, + "auxiliary_loss_mlp": 0.01031351, + "balance_loss_clip": 1.03580821, + "balance_loss_mlp": 1.01914954, + "epoch": 0.8177363595370509, + "flos": 24681009156480.0, + "grad_norm": 1.862299702432468, + "language_loss": 0.74226046, + "learning_rate": 3.383736971541766e-07, + "loss": 0.76332384, + "num_input_tokens_seen": 293449155, + "step": 13601, + "time_per_iteration": 4.308535575866699 + }, + { + "auxiliary_loss_clip": 0.01078366, + "auxiliary_loss_mlp": 0.01036015, + "balance_loss_clip": 1.03641343, + "balance_loss_mlp": 1.02314591, + "epoch": 0.817796482789719, + "flos": 17346326745600.0, + "grad_norm": 2.078289918028392, + "language_loss": 0.68360138, + "learning_rate": 3.3815697319969737e-07, + "loss": 0.70474523, + "num_input_tokens_seen": 293466125, + "step": 13602, + "time_per_iteration": 2.8116466999053955 + }, + { + "auxiliary_loss_clip": 0.01068639, + "auxiliary_loss_mlp": 0.01038644, + "balance_loss_clip": 1.03409863, + "balance_loss_mlp": 1.02547121, + "epoch": 0.8178566060423869, + "flos": 17778475272960.0, + "grad_norm": 2.118367882744336, + "language_loss": 0.83765864, + "learning_rate": 3.379403122624718e-07, + "loss": 0.85873151, + "num_input_tokens_seen": 293481345, + "step": 13603, + "time_per_iteration": 2.7411158084869385 + }, + { + "auxiliary_loss_clip": 0.0106116, + "auxiliary_loss_mlp": 0.01028949, + "balance_loss_clip": 1.03759289, + "balance_loss_mlp": 1.0176841, + "epoch": 0.8179167292950549, + "flos": 24973250209920.0, + "grad_norm": 1.7705965391975051, + "language_loss": 0.69410896, + "learning_rate": 3.377237143507159e-07, + "loss": 0.71501005, + "num_input_tokens_seen": 293502330, + "step": 13604, + "time_per_iteration": 2.7547354698181152 + }, + { + "auxiliary_loss_clip": 0.01081221, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.03777099, + "balance_loss_mlp": 1.02226162, + "epoch": 0.8179768525477228, + "flos": 22856783086080.0, + "grad_norm": 1.8951606880095677, + "language_loss": 0.74119198, + "learning_rate": 3.3750717947264406e-07, + "loss": 0.76235086, + "num_input_tokens_seen": 293521415, + "step": 13605, + "time_per_iteration": 2.7130730152130127 + }, + { + "auxiliary_loss_clip": 0.01071497, + "auxiliary_loss_mlp": 0.01038946, + "balance_loss_clip": 1.03906167, + "balance_loss_mlp": 1.02588034, + "epoch": 0.8180369758003908, + "flos": 18515147304960.0, + "grad_norm": 1.7730120877057978, + "language_loss": 0.73990393, + "learning_rate": 3.372907076364666e-07, + "loss": 0.76100838, + "num_input_tokens_seen": 293539245, + "step": 13606, + "time_per_iteration": 2.705872058868408 + }, + { + "auxiliary_loss_clip": 0.01108658, + "auxiliary_loss_mlp": 0.01032239, + "balance_loss_clip": 1.03782868, + "balance_loss_mlp": 1.02010965, + "epoch": 0.8180970990530587, + "flos": 33182105915520.0, + "grad_norm": 1.7215775601325016, + "language_loss": 0.65496033, + "learning_rate": 3.370742988503916e-07, + "loss": 0.67636931, + "num_input_tokens_seen": 293560640, + "step": 13607, + "time_per_iteration": 2.695094347000122 + }, + { + "auxiliary_loss_clip": 0.01087636, + "auxiliary_loss_mlp": 0.01030931, + "balance_loss_clip": 1.0383904, + "balance_loss_mlp": 1.0186528, + "epoch": 0.8181572223057267, + "flos": 25010022758400.0, + "grad_norm": 1.9180357233704657, + "language_loss": 0.70527983, + "learning_rate": 3.3685795312262634e-07, + "loss": 0.72646552, + "num_input_tokens_seen": 293579465, + "step": 13608, + "time_per_iteration": 2.7109787464141846 + }, + { + "auxiliary_loss_clip": 0.01094237, + "auxiliary_loss_mlp": 0.01033072, + "balance_loss_clip": 1.03419423, + "balance_loss_mlp": 1.02114487, + "epoch": 0.8182173455583948, + "flos": 28548731871360.0, + "grad_norm": 1.94764090555504, + "language_loss": 0.79518479, + "learning_rate": 3.366416704613735e-07, + "loss": 0.81645787, + "num_input_tokens_seen": 293600540, + "step": 13609, + "time_per_iteration": 2.678457736968994 + }, + { + "auxiliary_loss_clip": 0.01006167, + "auxiliary_loss_mlp": 0.01001094, + "balance_loss_clip": 1.01206219, + "balance_loss_mlp": 0.99999696, + "epoch": 0.8182774688110627, + "flos": 72028043245440.0, + "grad_norm": 0.745693768883286, + "language_loss": 0.55858743, + "learning_rate": 3.3642545087483544e-07, + "loss": 0.57866001, + "num_input_tokens_seen": 293665160, + "step": 13610, + "time_per_iteration": 3.287687063217163 + }, + { + "auxiliary_loss_clip": 0.01043521, + "auxiliary_loss_mlp": 0.00770311, + "balance_loss_clip": 1.02925563, + "balance_loss_mlp": 1.00016284, + "epoch": 0.8183375920637307, + "flos": 19755358145280.0, + "grad_norm": 1.923535272295543, + "language_loss": 0.77933627, + "learning_rate": 3.362092943712107e-07, + "loss": 0.79747456, + "num_input_tokens_seen": 293683995, + "step": 13611, + "time_per_iteration": 2.757842540740967 + }, + { + "auxiliary_loss_clip": 0.01074897, + "auxiliary_loss_mlp": 0.01033531, + "balance_loss_clip": 1.03499138, + "balance_loss_mlp": 1.01989329, + "epoch": 0.8183977153163986, + "flos": 22341895580160.0, + "grad_norm": 1.792092336455415, + "language_loss": 0.77061421, + "learning_rate": 3.3599320095869745e-07, + "loss": 0.79169852, + "num_input_tokens_seen": 293704115, + "step": 13612, + "time_per_iteration": 2.7527639865875244 + }, + { + "auxiliary_loss_clip": 0.01070156, + "auxiliary_loss_mlp": 0.01026228, + "balance_loss_clip": 1.03287673, + "balance_loss_mlp": 1.01489091, + "epoch": 0.8184578385690666, + "flos": 17712472032000.0, + "grad_norm": 2.2843501898761205, + "language_loss": 0.86122215, + "learning_rate": 3.3577717064548793e-07, + "loss": 0.88218594, + "num_input_tokens_seen": 293722225, + "step": 13613, + "time_per_iteration": 2.7401769161224365 + }, + { + "auxiliary_loss_clip": 0.01098117, + "auxiliary_loss_mlp": 0.01045961, + "balance_loss_clip": 1.03796077, + "balance_loss_mlp": 1.03408742, + "epoch": 0.8185179618217345, + "flos": 25701159323520.0, + "grad_norm": 2.6943480518584906, + "language_loss": 0.72842276, + "learning_rate": 3.355612034397746e-07, + "loss": 0.74986356, + "num_input_tokens_seen": 293743995, + "step": 13614, + "time_per_iteration": 2.680565118789673 + }, + { + "auxiliary_loss_clip": 0.01085324, + "auxiliary_loss_mlp": 0.01040844, + "balance_loss_clip": 1.03373837, + "balance_loss_mlp": 1.02824354, + "epoch": 0.8185780850744026, + "flos": 25960326929280.0, + "grad_norm": 1.7330678379647075, + "language_loss": 0.81346858, + "learning_rate": 3.353452993497479e-07, + "loss": 0.83473027, + "num_input_tokens_seen": 293764935, + "step": 13615, + "time_per_iteration": 2.715773105621338 + }, + { + "auxiliary_loss_clip": 0.01093975, + "auxiliary_loss_mlp": 0.01032579, + "balance_loss_clip": 1.03279996, + "balance_loss_mlp": 1.01989484, + "epoch": 0.8186382083270705, + "flos": 25228431406080.0, + "grad_norm": 3.391470400733545, + "language_loss": 0.75472414, + "learning_rate": 3.3512945838359375e-07, + "loss": 0.77598965, + "num_input_tokens_seen": 293784035, + "step": 13616, + "time_per_iteration": 2.6478960514068604 + }, + { + "auxiliary_loss_clip": 0.0106733, + "auxiliary_loss_mlp": 0.01043672, + "balance_loss_clip": 1.03091192, + "balance_loss_mlp": 1.02980757, + "epoch": 0.8186983315797385, + "flos": 22415009713920.0, + "grad_norm": 1.7062309242094946, + "language_loss": 0.75500989, + "learning_rate": 3.349136805494979e-07, + "loss": 0.77611995, + "num_input_tokens_seen": 293803360, + "step": 13617, + "time_per_iteration": 2.7314293384552 + }, + { + "auxiliary_loss_clip": 0.01080104, + "auxiliary_loss_mlp": 0.01032113, + "balance_loss_clip": 1.03370297, + "balance_loss_mlp": 1.02053142, + "epoch": 0.8187584548324064, + "flos": 22018017623040.0, + "grad_norm": 1.9943005109582315, + "language_loss": 0.68466866, + "learning_rate": 3.346979658556415e-07, + "loss": 0.70579082, + "num_input_tokens_seen": 293821325, + "step": 13618, + "time_per_iteration": 2.7118663787841797 + }, + { + "auxiliary_loss_clip": 0.01086635, + "auxiliary_loss_mlp": 0.01032733, + "balance_loss_clip": 1.03645062, + "balance_loss_mlp": 1.01954257, + "epoch": 0.8188185780850744, + "flos": 29241664116480.0, + "grad_norm": 1.958275526623864, + "language_loss": 0.69876873, + "learning_rate": 3.344823143102058e-07, + "loss": 0.71996242, + "num_input_tokens_seen": 293840315, + "step": 13619, + "time_per_iteration": 2.7601280212402344 + }, + { + "auxiliary_loss_clip": 0.01051452, + "auxiliary_loss_mlp": 0.01029042, + "balance_loss_clip": 1.03892016, + "balance_loss_mlp": 1.01647735, + "epoch": 0.8188787013377423, + "flos": 20696504348160.0, + "grad_norm": 1.8298313202907792, + "language_loss": 0.73982012, + "learning_rate": 3.3426672592136694e-07, + "loss": 0.760625, + "num_input_tokens_seen": 293855685, + "step": 13620, + "time_per_iteration": 2.782697916030884 + }, + { + "auxiliary_loss_clip": 0.01079658, + "auxiliary_loss_mlp": 0.00772167, + "balance_loss_clip": 1.03250647, + "balance_loss_mlp": 1.00025058, + "epoch": 0.8189388245904103, + "flos": 23732967542400.0, + "grad_norm": 1.515288767485074, + "language_loss": 0.76337874, + "learning_rate": 3.340512006973011e-07, + "loss": 0.78189701, + "num_input_tokens_seen": 293875540, + "step": 13621, + "time_per_iteration": 2.681579828262329 + }, + { + "auxiliary_loss_clip": 0.01082197, + "auxiliary_loss_mlp": 0.01030264, + "balance_loss_clip": 1.03172946, + "balance_loss_mlp": 1.01746082, + "epoch": 0.8189989478430784, + "flos": 28255090187520.0, + "grad_norm": 2.436105215072431, + "language_loss": 0.66058964, + "learning_rate": 3.3383573864618076e-07, + "loss": 0.68171418, + "num_input_tokens_seen": 293896570, + "step": 13622, + "time_per_iteration": 2.753495216369629 + }, + { + "auxiliary_loss_clip": 0.01111281, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.03886437, + "balance_loss_mlp": 1.01628244, + "epoch": 0.8190590710957463, + "flos": 21397696721280.0, + "grad_norm": 1.992471820034199, + "language_loss": 0.74813384, + "learning_rate": 3.3362033977617653e-07, + "loss": 0.76954669, + "num_input_tokens_seen": 293914680, + "step": 13623, + "time_per_iteration": 2.6488537788391113 + }, + { + "auxiliary_loss_clip": 0.01085531, + "auxiliary_loss_mlp": 0.01039034, + "balance_loss_clip": 1.03339553, + "balance_loss_mlp": 1.02606368, + "epoch": 0.8191191943484143, + "flos": 38796451367040.0, + "grad_norm": 1.888675270274182, + "language_loss": 0.63241279, + "learning_rate": 3.3340500409545527e-07, + "loss": 0.65365839, + "num_input_tokens_seen": 293936480, + "step": 13624, + "time_per_iteration": 2.9440207481384277 + }, + { + "auxiliary_loss_clip": 0.01106162, + "auxiliary_loss_mlp": 0.01034735, + "balance_loss_clip": 1.03641939, + "balance_loss_mlp": 1.02273679, + "epoch": 0.8191793176010822, + "flos": 25446516831360.0, + "grad_norm": 1.6219303590574095, + "language_loss": 0.78032911, + "learning_rate": 3.3318973161218386e-07, + "loss": 0.80173808, + "num_input_tokens_seen": 293957815, + "step": 13625, + "time_per_iteration": 2.685042381286621 + }, + { + "auxiliary_loss_clip": 0.01101604, + "auxiliary_loss_mlp": 0.00771173, + "balance_loss_clip": 1.0347513, + "balance_loss_mlp": 1.0001961, + "epoch": 0.8192394408537502, + "flos": 25083029151360.0, + "grad_norm": 2.016240511414733, + "language_loss": 0.75687516, + "learning_rate": 3.329745223345244e-07, + "loss": 0.77560294, + "num_input_tokens_seen": 293975440, + "step": 13626, + "time_per_iteration": 2.637768507003784 + }, + { + "auxiliary_loss_clip": 0.01098049, + "auxiliary_loss_mlp": 0.0103675, + "balance_loss_clip": 1.0376209, + "balance_loss_mlp": 1.02519846, + "epoch": 0.8192995641064181, + "flos": 27673732563840.0, + "grad_norm": 1.5972949724439707, + "language_loss": 0.73228663, + "learning_rate": 3.3275937627063823e-07, + "loss": 0.75363463, + "num_input_tokens_seen": 293997540, + "step": 13627, + "time_per_iteration": 2.7295448780059814 + }, + { + "auxiliary_loss_clip": 0.01109571, + "auxiliary_loss_mlp": 0.01033076, + "balance_loss_clip": 1.03797257, + "balance_loss_mlp": 1.02066636, + "epoch": 0.8193596873590862, + "flos": 21288492397440.0, + "grad_norm": 1.651474068364024, + "language_loss": 0.69222027, + "learning_rate": 3.3254429342868353e-07, + "loss": 0.71364677, + "num_input_tokens_seen": 294017030, + "step": 13628, + "time_per_iteration": 2.6305129528045654 + }, + { + "auxiliary_loss_clip": 0.01087095, + "auxiliary_loss_mlp": 0.01045505, + "balance_loss_clip": 1.0360409, + "balance_loss_mlp": 1.03141403, + "epoch": 0.8194198106117541, + "flos": 17492626840320.0, + "grad_norm": 2.3448084033624115, + "language_loss": 0.85264301, + "learning_rate": 3.323292738168171e-07, + "loss": 0.87396896, + "num_input_tokens_seen": 294035700, + "step": 13629, + "time_per_iteration": 2.6781747341156006 + }, + { + "auxiliary_loss_clip": 0.01106506, + "auxiliary_loss_mlp": 0.01026288, + "balance_loss_clip": 1.03619409, + "balance_loss_mlp": 1.01409864, + "epoch": 0.8194799338644221, + "flos": 15267925059840.0, + "grad_norm": 2.0184519411378345, + "language_loss": 0.73626029, + "learning_rate": 3.3211431744319084e-07, + "loss": 0.75758827, + "num_input_tokens_seen": 294049730, + "step": 13630, + "time_per_iteration": 2.6452038288116455 + }, + { + "auxiliary_loss_clip": 0.01096556, + "auxiliary_loss_mlp": 0.01039124, + "balance_loss_clip": 1.03655708, + "balance_loss_mlp": 1.02556396, + "epoch": 0.81954005711709, + "flos": 14718814871040.0, + "grad_norm": 1.8847266375290428, + "language_loss": 0.72261512, + "learning_rate": 3.31899424315957e-07, + "loss": 0.74397194, + "num_input_tokens_seen": 294066545, + "step": 13631, + "time_per_iteration": 2.595489025115967 + }, + { + "auxiliary_loss_clip": 0.01108623, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.03625333, + "balance_loss_mlp": 1.018224, + "epoch": 0.819600180369758, + "flos": 23074042498560.0, + "grad_norm": 1.78491625477661, + "language_loss": 0.76710784, + "learning_rate": 3.3168459444326447e-07, + "loss": 0.78849781, + "num_input_tokens_seen": 294087455, + "step": 13632, + "time_per_iteration": 2.639312267303467 + }, + { + "auxiliary_loss_clip": 0.01081621, + "auxiliary_loss_mlp": 0.01031706, + "balance_loss_clip": 1.03269899, + "balance_loss_mlp": 1.01979756, + "epoch": 0.8196603036224259, + "flos": 27599792417280.0, + "grad_norm": 3.6495669730083455, + "language_loss": 0.65916097, + "learning_rate": 3.314698278332588e-07, + "loss": 0.68029428, + "num_input_tokens_seen": 294107480, + "step": 13633, + "time_per_iteration": 4.429157733917236 + }, + { + "auxiliary_loss_clip": 0.01090266, + "auxiliary_loss_mlp": 0.01037639, + "balance_loss_clip": 1.03390145, + "balance_loss_mlp": 1.02634966, + "epoch": 0.8197204268750939, + "flos": 28582020800640.0, + "grad_norm": 1.4436935437112157, + "language_loss": 0.75417399, + "learning_rate": 3.3125512449408513e-07, + "loss": 0.77545297, + "num_input_tokens_seen": 294130115, + "step": 13634, + "time_per_iteration": 4.236420392990112 + }, + { + "auxiliary_loss_clip": 0.01049415, + "auxiliary_loss_mlp": 0.00769002, + "balance_loss_clip": 1.03555465, + "balance_loss_mlp": 1.00017309, + "epoch": 0.819780550127762, + "flos": 23258300290560.0, + "grad_norm": 1.863716594786732, + "language_loss": 0.82285905, + "learning_rate": 3.310404844338841e-07, + "loss": 0.84104323, + "num_input_tokens_seen": 294148495, + "step": 13635, + "time_per_iteration": 4.350587606430054 + }, + { + "auxiliary_loss_clip": 0.01094136, + "auxiliary_loss_mlp": 0.01031626, + "balance_loss_clip": 1.03306413, + "balance_loss_mlp": 1.01876307, + "epoch": 0.8198406733804299, + "flos": 26685255214080.0, + "grad_norm": 1.580556826967959, + "language_loss": 0.7557019, + "learning_rate": 3.308259076607949e-07, + "loss": 0.77695948, + "num_input_tokens_seen": 294169595, + "step": 13636, + "time_per_iteration": 2.694965362548828 + }, + { + "auxiliary_loss_clip": 0.01085829, + "auxiliary_loss_mlp": 0.01034289, + "balance_loss_clip": 1.04320598, + "balance_loss_mlp": 1.02125335, + "epoch": 0.8199007966330979, + "flos": 20084084438400.0, + "grad_norm": 2.291328351334751, + "language_loss": 0.81272769, + "learning_rate": 3.3061139418295445e-07, + "loss": 0.83392888, + "num_input_tokens_seen": 294183885, + "step": 13637, + "time_per_iteration": 2.730604410171509 + }, + { + "auxiliary_loss_clip": 0.01097936, + "auxiliary_loss_mlp": 0.01031089, + "balance_loss_clip": 1.03770888, + "balance_loss_mlp": 1.01860201, + "epoch": 0.8199609198857658, + "flos": 31902788142720.0, + "grad_norm": 2.2206002932791566, + "language_loss": 0.710298, + "learning_rate": 3.3039694400849725e-07, + "loss": 0.73158824, + "num_input_tokens_seen": 294200150, + "step": 13638, + "time_per_iteration": 2.683467149734497 + }, + { + "auxiliary_loss_clip": 0.01061969, + "auxiliary_loss_mlp": 0.0103106, + "balance_loss_clip": 1.0327965, + "balance_loss_mlp": 1.01680839, + "epoch": 0.8200210431384338, + "flos": 26470150617600.0, + "grad_norm": 1.942665681540599, + "language_loss": 0.79615062, + "learning_rate": 3.3018255714555564e-07, + "loss": 0.81708086, + "num_input_tokens_seen": 294220385, + "step": 13639, + "time_per_iteration": 2.7710959911346436 + }, + { + "auxiliary_loss_clip": 0.01062834, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.03322732, + "balance_loss_mlp": 1.02089465, + "epoch": 0.8200811663911017, + "flos": 22091454979200.0, + "grad_norm": 1.6392982589425356, + "language_loss": 0.79226673, + "learning_rate": 3.299682336022589e-07, + "loss": 0.81322664, + "num_input_tokens_seen": 294239355, + "step": 13640, + "time_per_iteration": 4.275204658508301 + }, + { + "auxiliary_loss_clip": 0.01076176, + "auxiliary_loss_mlp": 0.01035588, + "balance_loss_clip": 1.03405476, + "balance_loss_mlp": 1.0229578, + "epoch": 0.8201412896437698, + "flos": 37593659520000.0, + "grad_norm": 1.7308217218168405, + "language_loss": 0.63248229, + "learning_rate": 3.297539733867336e-07, + "loss": 0.65359992, + "num_input_tokens_seen": 294259395, + "step": 13641, + "time_per_iteration": 2.795254945755005 + }, + { + "auxiliary_loss_clip": 0.01056206, + "auxiliary_loss_mlp": 0.01028153, + "balance_loss_clip": 1.03538704, + "balance_loss_mlp": 1.01539707, + "epoch": 0.8202014128964377, + "flos": 19646333389440.0, + "grad_norm": 1.8557472198282705, + "language_loss": 0.73365706, + "learning_rate": 3.295397765071055e-07, + "loss": 0.75450063, + "num_input_tokens_seen": 294277365, + "step": 13642, + "time_per_iteration": 2.7157320976257324 + }, + { + "auxiliary_loss_clip": 0.01086181, + "auxiliary_loss_mlp": 0.01031537, + "balance_loss_clip": 1.03858817, + "balance_loss_mlp": 1.01963401, + "epoch": 0.8202615361491057, + "flos": 31467335564160.0, + "grad_norm": 2.095752785900936, + "language_loss": 0.70286655, + "learning_rate": 3.2932564297149615e-07, + "loss": 0.72404379, + "num_input_tokens_seen": 294297555, + "step": 13643, + "time_per_iteration": 2.7395925521850586 + }, + { + "auxiliary_loss_clip": 0.01097598, + "auxiliary_loss_mlp": 0.01031628, + "balance_loss_clip": 1.03775418, + "balance_loss_mlp": 1.01995778, + "epoch": 0.8203216594017736, + "flos": 24715555061760.0, + "grad_norm": 1.69681118758784, + "language_loss": 0.65516806, + "learning_rate": 3.291115727880256e-07, + "loss": 0.67646027, + "num_input_tokens_seen": 294317600, + "step": 13644, + "time_per_iteration": 2.6443233489990234 + }, + { + "auxiliary_loss_clip": 0.01069905, + "auxiliary_loss_mlp": 0.01034884, + "balance_loss_clip": 1.0356884, + "balance_loss_mlp": 1.02291584, + "epoch": 0.8203817826544416, + "flos": 26031824951040.0, + "grad_norm": 1.4101189561247485, + "language_loss": 0.70740688, + "learning_rate": 3.2889756596481234e-07, + "loss": 0.72845483, + "num_input_tokens_seen": 294340215, + "step": 13645, + "time_per_iteration": 2.7722573280334473 + }, + { + "auxiliary_loss_clip": 0.01083381, + "auxiliary_loss_mlp": 0.01027664, + "balance_loss_clip": 1.03680301, + "balance_loss_mlp": 1.01596987, + "epoch": 0.8204419059071095, + "flos": 25954544839680.0, + "grad_norm": 2.371583298507033, + "language_loss": 0.7132858, + "learning_rate": 3.286836225099707e-07, + "loss": 0.73439622, + "num_input_tokens_seen": 294358590, + "step": 13646, + "time_per_iteration": 2.713864803314209 + }, + { + "auxiliary_loss_clip": 0.01089571, + "auxiliary_loss_mlp": 0.01030919, + "balance_loss_clip": 1.036955, + "balance_loss_mlp": 1.01831234, + "epoch": 0.8205020291597775, + "flos": 23580059345280.0, + "grad_norm": 2.245036233958922, + "language_loss": 0.78633201, + "learning_rate": 3.284697424316132e-07, + "loss": 0.80753696, + "num_input_tokens_seen": 294375825, + "step": 13647, + "time_per_iteration": 2.659745693206787 + }, + { + "auxiliary_loss_clip": 0.01105517, + "auxiliary_loss_mlp": 0.01033284, + "balance_loss_clip": 1.03771901, + "balance_loss_mlp": 1.02169704, + "epoch": 0.8205621524124456, + "flos": 26799164219520.0, + "grad_norm": 1.7369474065732662, + "language_loss": 0.67728269, + "learning_rate": 3.2825592573785034e-07, + "loss": 0.69867074, + "num_input_tokens_seen": 294398500, + "step": 13648, + "time_per_iteration": 2.642002582550049 + }, + { + "auxiliary_loss_clip": 0.01080292, + "auxiliary_loss_mlp": 0.01028555, + "balance_loss_clip": 1.03181791, + "balance_loss_mlp": 1.0157932, + "epoch": 0.8206222756651135, + "flos": 27527863432320.0, + "grad_norm": 1.7471547354733792, + "language_loss": 0.80010235, + "learning_rate": 3.28042172436791e-07, + "loss": 0.82119077, + "num_input_tokens_seen": 294418840, + "step": 13649, + "time_per_iteration": 2.704329252243042 + }, + { + "auxiliary_loss_clip": 0.01092884, + "auxiliary_loss_mlp": 0.01034827, + "balance_loss_clip": 1.03850818, + "balance_loss_mlp": 1.0212965, + "epoch": 0.8206823989177815, + "flos": 21178605715200.0, + "grad_norm": 1.9987063648882384, + "language_loss": 0.69307315, + "learning_rate": 3.278284825365396e-07, + "loss": 0.71435022, + "num_input_tokens_seen": 294438215, + "step": 13650, + "time_per_iteration": 2.59381365776062 + }, + { + "auxiliary_loss_clip": 0.01090201, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_clip": 1.03758073, + "balance_loss_mlp": 1.01843143, + "epoch": 0.8207425221704494, + "flos": 11509622150400.0, + "grad_norm": 1.942606809988791, + "language_loss": 0.60333896, + "learning_rate": 3.276148560452001e-07, + "loss": 0.62455606, + "num_input_tokens_seen": 294455260, + "step": 13651, + "time_per_iteration": 2.620542287826538 + }, + { + "auxiliary_loss_clip": 0.01069774, + "auxiliary_loss_mlp": 0.00773358, + "balance_loss_clip": 1.03502905, + "balance_loss_mlp": 1.00031233, + "epoch": 0.8208026454231174, + "flos": 19791987039360.0, + "grad_norm": 3.123048822731667, + "language_loss": 0.72240758, + "learning_rate": 3.2740129297087293e-07, + "loss": 0.74083889, + "num_input_tokens_seen": 294473205, + "step": 13652, + "time_per_iteration": 2.7204532623291016 + }, + { + "auxiliary_loss_clip": 0.01081839, + "auxiliary_loss_mlp": 0.01030063, + "balance_loss_clip": 1.03512836, + "balance_loss_mlp": 1.01936436, + "epoch": 0.8208627686757853, + "flos": 15667538843520.0, + "grad_norm": 1.909630535987182, + "language_loss": 0.73210537, + "learning_rate": 3.271877933216558e-07, + "loss": 0.75322437, + "num_input_tokens_seen": 294490645, + "step": 13653, + "time_per_iteration": 2.6469080448150635 + }, + { + "auxiliary_loss_clip": 0.0107235, + "auxiliary_loss_mlp": 0.01036685, + "balance_loss_clip": 1.03659797, + "balance_loss_mlp": 1.02340472, + "epoch": 0.8209228919284534, + "flos": 37482659516160.0, + "grad_norm": 1.930498918584404, + "language_loss": 0.63319474, + "learning_rate": 3.269743571056451e-07, + "loss": 0.65428507, + "num_input_tokens_seen": 294513500, + "step": 13654, + "time_per_iteration": 2.9437685012817383 + }, + { + "auxiliary_loss_clip": 0.0108459, + "auxiliary_loss_mlp": 0.01029817, + "balance_loss_clip": 1.03793693, + "balance_loss_mlp": 1.01780069, + "epoch": 0.8209830151811213, + "flos": 23112969863040.0, + "grad_norm": 1.5668397368199467, + "language_loss": 0.70084441, + "learning_rate": 3.2676098433093447e-07, + "loss": 0.72198856, + "num_input_tokens_seen": 294535710, + "step": 13655, + "time_per_iteration": 2.7804574966430664 + }, + { + "auxiliary_loss_clip": 0.01084392, + "auxiliary_loss_mlp": 0.01036883, + "balance_loss_clip": 1.03608942, + "balance_loss_mlp": 1.0246346, + "epoch": 0.8210431384337893, + "flos": 21288169175040.0, + "grad_norm": 2.0172748125132283, + "language_loss": 0.82037187, + "learning_rate": 3.265476750056162e-07, + "loss": 0.84158462, + "num_input_tokens_seen": 294554055, + "step": 13656, + "time_per_iteration": 2.721017599105835 + }, + { + "auxiliary_loss_clip": 0.01080199, + "auxiliary_loss_mlp": 0.01030184, + "balance_loss_clip": 1.03631461, + "balance_loss_mlp": 1.01812029, + "epoch": 0.8211032616864572, + "flos": 11502403516800.0, + "grad_norm": 2.1429350332327335, + "language_loss": 0.74038959, + "learning_rate": 3.2633442913777654e-07, + "loss": 0.76149338, + "num_input_tokens_seen": 294570390, + "step": 13657, + "time_per_iteration": 2.6449975967407227 + }, + { + "auxiliary_loss_clip": 0.01076624, + "auxiliary_loss_mlp": 0.01033144, + "balance_loss_clip": 1.03495431, + "balance_loss_mlp": 1.02119923, + "epoch": 0.8211633849391252, + "flos": 29821477455360.0, + "grad_norm": 1.677076204685542, + "language_loss": 0.55757195, + "learning_rate": 3.2612124673550325e-07, + "loss": 0.57866967, + "num_input_tokens_seen": 294593050, + "step": 13658, + "time_per_iteration": 2.7866504192352295 + }, + { + "auxiliary_loss_clip": 0.01046948, + "auxiliary_loss_mlp": 0.01032121, + "balance_loss_clip": 1.03354919, + "balance_loss_mlp": 1.01984835, + "epoch": 0.8212235081917931, + "flos": 13115439573120.0, + "grad_norm": 2.054958093178623, + "language_loss": 0.78911436, + "learning_rate": 3.259081278068805e-07, + "loss": 0.80990505, + "num_input_tokens_seen": 294608550, + "step": 13659, + "time_per_iteration": 2.7733964920043945 + }, + { + "auxiliary_loss_clip": 0.01090521, + "auxiliary_loss_mlp": 0.01028594, + "balance_loss_clip": 1.03315973, + "balance_loss_mlp": 1.01845503, + "epoch": 0.8212836314444611, + "flos": 40515351782400.0, + "grad_norm": 1.7003866148099478, + "language_loss": 0.59908175, + "learning_rate": 3.256950723599887e-07, + "loss": 0.62027293, + "num_input_tokens_seen": 294630380, + "step": 13660, + "time_per_iteration": 2.7818117141723633 + }, + { + "auxiliary_loss_clip": 0.01096127, + "auxiliary_loss_mlp": 0.01034813, + "balance_loss_clip": 1.03519523, + "balance_loss_mlp": 1.0208652, + "epoch": 0.8213437546971292, + "flos": 18770543982720.0, + "grad_norm": 2.120880379867683, + "language_loss": 0.73009235, + "learning_rate": 3.254820804029075e-07, + "loss": 0.75140172, + "num_input_tokens_seen": 294648655, + "step": 13661, + "time_per_iteration": 2.5873122215270996 + }, + { + "auxiliary_loss_clip": 0.01093175, + "auxiliary_loss_mlp": 0.01030555, + "balance_loss_clip": 1.03569698, + "balance_loss_mlp": 1.01827097, + "epoch": 0.8214038779497971, + "flos": 19682279925120.0, + "grad_norm": 2.1908603009914707, + "language_loss": 0.74912691, + "learning_rate": 3.252691519437143e-07, + "loss": 0.77036428, + "num_input_tokens_seen": 294666915, + "step": 13662, + "time_per_iteration": 2.70076322555542 + }, + { + "auxiliary_loss_clip": 0.01029455, + "auxiliary_loss_mlp": 0.01001299, + "balance_loss_clip": 1.00707769, + "balance_loss_mlp": 1.00035727, + "epoch": 0.8214640012024651, + "flos": 71602969697280.0, + "grad_norm": 0.7436789430956001, + "language_loss": 0.54036576, + "learning_rate": 3.250562869904825e-07, + "loss": 0.5606733, + "num_input_tokens_seen": 294731545, + "step": 13663, + "time_per_iteration": 3.2524144649505615 + }, + { + "auxiliary_loss_clip": 0.0106094, + "auxiliary_loss_mlp": 0.01032266, + "balance_loss_clip": 1.03105712, + "balance_loss_mlp": 1.02002287, + "epoch": 0.821524124455133, + "flos": 14757203531520.0, + "grad_norm": 2.109135364690857, + "language_loss": 0.65783775, + "learning_rate": 3.248434855512838e-07, + "loss": 0.67876983, + "num_input_tokens_seen": 294748745, + "step": 13664, + "time_per_iteration": 2.7579057216644287 + }, + { + "auxiliary_loss_clip": 0.01081895, + "auxiliary_loss_mlp": 0.0103047, + "balance_loss_clip": 1.03475428, + "balance_loss_mlp": 1.01932395, + "epoch": 0.821584247707801, + "flos": 25082274965760.0, + "grad_norm": 1.5036493569794076, + "language_loss": 0.75327474, + "learning_rate": 3.246307476341881e-07, + "loss": 0.77439839, + "num_input_tokens_seen": 294768955, + "step": 13665, + "time_per_iteration": 2.7124111652374268 + }, + { + "auxiliary_loss_clip": 0.01093989, + "auxiliary_loss_mlp": 0.00769563, + "balance_loss_clip": 1.03792393, + "balance_loss_mlp": 1.00023198, + "epoch": 0.8216443709604689, + "flos": 36830701710720.0, + "grad_norm": 2.32376999717277, + "language_loss": 0.65432054, + "learning_rate": 3.2441807324726256e-07, + "loss": 0.67295599, + "num_input_tokens_seen": 294789250, + "step": 13666, + "time_per_iteration": 2.7520713806152344 + }, + { + "auxiliary_loss_clip": 0.01059201, + "auxiliary_loss_mlp": 0.01030678, + "balance_loss_clip": 1.03574967, + "balance_loss_mlp": 1.01929307, + "epoch": 0.821704494213137, + "flos": 25081808088960.0, + "grad_norm": 1.6586859973993004, + "language_loss": 0.76773095, + "learning_rate": 3.2420546239857174e-07, + "loss": 0.78862977, + "num_input_tokens_seen": 294809760, + "step": 13667, + "time_per_iteration": 2.8164875507354736 + }, + { + "auxiliary_loss_clip": 0.01077218, + "auxiliary_loss_mlp": 0.01032735, + "balance_loss_clip": 1.03665185, + "balance_loss_mlp": 1.02043223, + "epoch": 0.8217646174658049, + "flos": 14356117290240.0, + "grad_norm": 1.9214564024977732, + "language_loss": 0.77153236, + "learning_rate": 3.239929150961773e-07, + "loss": 0.79263186, + "num_input_tokens_seen": 294826495, + "step": 13668, + "time_per_iteration": 2.795309066772461 + }, + { + "auxiliary_loss_clip": 0.0106108, + "auxiliary_loss_mlp": 0.01032359, + "balance_loss_clip": 1.03410029, + "balance_loss_mlp": 1.02047384, + "epoch": 0.8218247407184729, + "flos": 22090557139200.0, + "grad_norm": 2.232101782459693, + "language_loss": 0.7333163, + "learning_rate": 3.2378043134813984e-07, + "loss": 0.75425071, + "num_input_tokens_seen": 294845370, + "step": 13669, + "time_per_iteration": 2.733705520629883 + }, + { + "auxiliary_loss_clip": 0.01096991, + "auxiliary_loss_mlp": 0.01026791, + "balance_loss_clip": 1.03674257, + "balance_loss_mlp": 1.01509678, + "epoch": 0.8218848639711408, + "flos": 16764035368320.0, + "grad_norm": 1.5914876728736391, + "language_loss": 0.78921843, + "learning_rate": 3.235680111625161e-07, + "loss": 0.81045628, + "num_input_tokens_seen": 294863740, + "step": 13670, + "time_per_iteration": 2.632380723953247 + }, + { + "auxiliary_loss_clip": 0.01101033, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.03839719, + "balance_loss_mlp": 1.02437234, + "epoch": 0.8219449872238088, + "flos": 25994801007360.0, + "grad_norm": 1.7358038060221426, + "language_loss": 0.74638772, + "learning_rate": 3.2335565454736123e-07, + "loss": 0.76776969, + "num_input_tokens_seen": 294882815, + "step": 13671, + "time_per_iteration": 2.6536366939544678 + }, + { + "auxiliary_loss_clip": 0.01102103, + "auxiliary_loss_mlp": 0.0103, + "balance_loss_clip": 1.03765309, + "balance_loss_mlp": 1.0173583, + "epoch": 0.8220051104764767, + "flos": 20778094091520.0, + "grad_norm": 1.8480200060327416, + "language_loss": 0.76200128, + "learning_rate": 3.23143361510728e-07, + "loss": 0.78332233, + "num_input_tokens_seen": 294901985, + "step": 13672, + "time_per_iteration": 2.6287293434143066 + }, + { + "auxiliary_loss_clip": 0.0105776, + "auxiliary_loss_mlp": 0.01037446, + "balance_loss_clip": 1.03279448, + "balance_loss_mlp": 1.02387452, + "epoch": 0.8220652337291448, + "flos": 14574849160320.0, + "grad_norm": 2.155588749623656, + "language_loss": 0.74635303, + "learning_rate": 3.2293113206066733e-07, + "loss": 0.76730502, + "num_input_tokens_seen": 294919705, + "step": 13673, + "time_per_iteration": 5.964927911758423 + }, + { + "auxiliary_loss_clip": 0.01091542, + "auxiliary_loss_mlp": 0.01034095, + "balance_loss_clip": 1.03949618, + "balance_loss_mlp": 1.02133989, + "epoch": 0.8221253569818128, + "flos": 23805866194560.0, + "grad_norm": 1.8576069667953699, + "language_loss": 0.79360175, + "learning_rate": 3.227189662052254e-07, + "loss": 0.8148582, + "num_input_tokens_seen": 294939900, + "step": 13674, + "time_per_iteration": 2.711923599243164 + }, + { + "auxiliary_loss_clip": 0.01082091, + "auxiliary_loss_mlp": 0.01037891, + "balance_loss_clip": 1.03274429, + "balance_loss_mlp": 1.0257858, + "epoch": 0.8221854802344807, + "flos": 21288241002240.0, + "grad_norm": 2.0257881823597508, + "language_loss": 0.69993466, + "learning_rate": 3.225068639524484e-07, + "loss": 0.72113442, + "num_input_tokens_seen": 294959110, + "step": 13675, + "time_per_iteration": 4.205335378646851 + }, + { + "auxiliary_loss_clip": 0.01089922, + "auxiliary_loss_mlp": 0.01037385, + "balance_loss_clip": 1.03466141, + "balance_loss_mlp": 1.02468348, + "epoch": 0.8222456034871487, + "flos": 20956785275520.0, + "grad_norm": 1.6271888022504428, + "language_loss": 0.74166471, + "learning_rate": 3.2229482531037965e-07, + "loss": 0.76293778, + "num_input_tokens_seen": 294978660, + "step": 13676, + "time_per_iteration": 2.633631944656372 + }, + { + "auxiliary_loss_clip": 0.01081581, + "auxiliary_loss_mlp": 0.01032411, + "balance_loss_clip": 1.03602481, + "balance_loss_mlp": 1.02066302, + "epoch": 0.8223057267398166, + "flos": 21397517153280.0, + "grad_norm": 1.8848040435519355, + "language_loss": 0.80344379, + "learning_rate": 3.2208285028705893e-07, + "loss": 0.82458377, + "num_input_tokens_seen": 294998075, + "step": 13677, + "time_per_iteration": 2.715427875518799 + }, + { + "auxiliary_loss_clip": 0.01093784, + "auxiliary_loss_mlp": 0.01037139, + "balance_loss_clip": 1.03556919, + "balance_loss_mlp": 1.02450824, + "epoch": 0.8223658499924846, + "flos": 15268212368640.0, + "grad_norm": 2.296503126138382, + "language_loss": 0.70510441, + "learning_rate": 3.218709388905245e-07, + "loss": 0.72641361, + "num_input_tokens_seen": 295015950, + "step": 13678, + "time_per_iteration": 2.662177085876465 + }, + { + "auxiliary_loss_clip": 0.01107791, + "auxiliary_loss_mlp": 0.01034832, + "balance_loss_clip": 1.03623056, + "balance_loss_mlp": 1.02258909, + "epoch": 0.8224259732451525, + "flos": 31249537447680.0, + "grad_norm": 1.4830532789333675, + "language_loss": 0.71389025, + "learning_rate": 3.216590911288133e-07, + "loss": 0.73531646, + "num_input_tokens_seen": 295036800, + "step": 13679, + "time_per_iteration": 4.202351808547974 + }, + { + "auxiliary_loss_clip": 0.01079212, + "auxiliary_loss_mlp": 0.01033441, + "balance_loss_clip": 1.03329039, + "balance_loss_mlp": 1.02008915, + "epoch": 0.8224860964978206, + "flos": 21574628138880.0, + "grad_norm": 1.9740769073564464, + "language_loss": 0.70159578, + "learning_rate": 3.214473070099564e-07, + "loss": 0.72272229, + "num_input_tokens_seen": 295055300, + "step": 13680, + "time_per_iteration": 2.644590139389038 + }, + { + "auxiliary_loss_clip": 0.01075147, + "auxiliary_loss_mlp": 0.01029985, + "balance_loss_clip": 1.03547573, + "balance_loss_mlp": 1.01875556, + "epoch": 0.8225462197504885, + "flos": 25483217552640.0, + "grad_norm": 3.3850190908064164, + "language_loss": 0.59734452, + "learning_rate": 3.21235586541986e-07, + "loss": 0.61839581, + "num_input_tokens_seen": 295076420, + "step": 13681, + "time_per_iteration": 2.693240165710449 + }, + { + "auxiliary_loss_clip": 0.01084056, + "auxiliary_loss_mlp": 0.01038117, + "balance_loss_clip": 1.0347333, + "balance_loss_mlp": 1.02480125, + "epoch": 0.8226063430031565, + "flos": 39385458587520.0, + "grad_norm": 2.647199220979941, + "language_loss": 0.68972695, + "learning_rate": 3.2102392973293047e-07, + "loss": 0.71094871, + "num_input_tokens_seen": 295100540, + "step": 13682, + "time_per_iteration": 2.793362855911255 + }, + { + "auxiliary_loss_clip": 0.01109468, + "auxiliary_loss_mlp": 0.01030898, + "balance_loss_clip": 1.0367074, + "balance_loss_mlp": 1.01775503, + "epoch": 0.8226664662558244, + "flos": 22815269942400.0, + "grad_norm": 1.8560800713238335, + "language_loss": 0.79419553, + "learning_rate": 3.20812336590816e-07, + "loss": 0.81559926, + "num_input_tokens_seen": 295120180, + "step": 13683, + "time_per_iteration": 2.663804292678833 + }, + { + "auxiliary_loss_clip": 0.01104253, + "auxiliary_loss_mlp": 0.01029993, + "balance_loss_clip": 1.03593254, + "balance_loss_mlp": 1.01891863, + "epoch": 0.8227265895084924, + "flos": 25665607837440.0, + "grad_norm": 1.9656579535514493, + "language_loss": 0.86604738, + "learning_rate": 3.206008071236661e-07, + "loss": 0.88738984, + "num_input_tokens_seen": 295138530, + "step": 13684, + "time_per_iteration": 2.6015169620513916 + }, + { + "auxiliary_loss_clip": 0.01104335, + "auxiliary_loss_mlp": 0.01029249, + "balance_loss_clip": 1.03555918, + "balance_loss_mlp": 1.01763213, + "epoch": 0.8227867127611603, + "flos": 26179274280960.0, + "grad_norm": 1.536812487486819, + "language_loss": 0.79920459, + "learning_rate": 3.2038934133950157e-07, + "loss": 0.82054043, + "num_input_tokens_seen": 295160260, + "step": 13685, + "time_per_iteration": 2.7008142471313477 + }, + { + "auxiliary_loss_clip": 0.0107249, + "auxiliary_loss_mlp": 0.01030486, + "balance_loss_clip": 1.03493214, + "balance_loss_mlp": 1.01813579, + "epoch": 0.8228468360138284, + "flos": 22018053536640.0, + "grad_norm": 1.6748443436475502, + "language_loss": 0.68744385, + "learning_rate": 3.2017793924634194e-07, + "loss": 0.70847368, + "num_input_tokens_seen": 295177055, + "step": 13686, + "time_per_iteration": 2.7271742820739746 + }, + { + "auxiliary_loss_clip": 0.01076871, + "auxiliary_loss_mlp": 0.01032996, + "balance_loss_clip": 1.03525162, + "balance_loss_mlp": 1.02082491, + "epoch": 0.8229069592664963, + "flos": 14903359971840.0, + "grad_norm": 1.8146731165016403, + "language_loss": 0.77963513, + "learning_rate": 3.1996660085220263e-07, + "loss": 0.80073375, + "num_input_tokens_seen": 295193870, + "step": 13687, + "time_per_iteration": 2.6741888523101807 + }, + { + "auxiliary_loss_clip": 0.01097929, + "auxiliary_loss_mlp": 0.01030475, + "balance_loss_clip": 1.03655159, + "balance_loss_mlp": 1.01794028, + "epoch": 0.8229670825191643, + "flos": 15669478177920.0, + "grad_norm": 1.7147177179883277, + "language_loss": 0.72279108, + "learning_rate": 3.1975532616509825e-07, + "loss": 0.74407512, + "num_input_tokens_seen": 295211040, + "step": 13688, + "time_per_iteration": 2.583867311477661 + }, + { + "auxiliary_loss_clip": 0.01108409, + "auxiliary_loss_mlp": 0.00769781, + "balance_loss_clip": 1.03682184, + "balance_loss_mlp": 1.0001483, + "epoch": 0.8230272057718323, + "flos": 23183498217600.0, + "grad_norm": 2.234271170897282, + "language_loss": 0.73181629, + "learning_rate": 3.1954411519304025e-07, + "loss": 0.75059819, + "num_input_tokens_seen": 295231300, + "step": 13689, + "time_per_iteration": 2.718895673751831 + }, + { + "auxiliary_loss_clip": 0.01098539, + "auxiliary_loss_mlp": 0.01031163, + "balance_loss_clip": 1.0351994, + "balance_loss_mlp": 1.0188967, + "epoch": 0.8230873290245002, + "flos": 21032413361280.0, + "grad_norm": 3.545814626026256, + "language_loss": 0.69253677, + "learning_rate": 3.1933296794403887e-07, + "loss": 0.71383381, + "num_input_tokens_seen": 295251045, + "step": 13690, + "time_per_iteration": 2.6642231941223145 + }, + { + "auxiliary_loss_clip": 0.01062899, + "auxiliary_loss_mlp": 0.01041098, + "balance_loss_clip": 1.03263807, + "balance_loss_mlp": 1.02722192, + "epoch": 0.8231474522771682, + "flos": 21250139650560.0, + "grad_norm": 1.8299845733517255, + "language_loss": 0.85268778, + "learning_rate": 3.191218844260988e-07, + "loss": 0.87372774, + "num_input_tokens_seen": 295270225, + "step": 13691, + "time_per_iteration": 2.7507143020629883 + }, + { + "auxiliary_loss_clip": 0.01101229, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.03781307, + "balance_loss_mlp": 1.02287257, + "epoch": 0.8232075755298361, + "flos": 23842028211840.0, + "grad_norm": 1.8079890688492317, + "language_loss": 0.77103651, + "learning_rate": 3.189108646472252e-07, + "loss": 0.79239464, + "num_input_tokens_seen": 295288950, + "step": 13692, + "time_per_iteration": 2.67478084564209 + }, + { + "auxiliary_loss_clip": 0.01096284, + "auxiliary_loss_mlp": 0.01027159, + "balance_loss_clip": 1.03692162, + "balance_loss_mlp": 1.0151006, + "epoch": 0.8232676987825042, + "flos": 21653955325440.0, + "grad_norm": 1.722595052749625, + "language_loss": 0.71423566, + "learning_rate": 3.186999086154205e-07, + "loss": 0.73547006, + "num_input_tokens_seen": 295309405, + "step": 13693, + "time_per_iteration": 2.718867301940918 + }, + { + "auxiliary_loss_clip": 0.01070842, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.03349066, + "balance_loss_mlp": 1.01865232, + "epoch": 0.8233278220351721, + "flos": 26322701287680.0, + "grad_norm": 1.3395802259030574, + "language_loss": 0.83745861, + "learning_rate": 3.1848901633868355e-07, + "loss": 0.85846514, + "num_input_tokens_seen": 295331115, + "step": 13694, + "time_per_iteration": 2.7664167881011963 + }, + { + "auxiliary_loss_clip": 0.0104721, + "auxiliary_loss_mlp": 0.01032456, + "balance_loss_clip": 1.03542459, + "balance_loss_mlp": 1.0194205, + "epoch": 0.8233879452878401, + "flos": 21725812483200.0, + "grad_norm": 1.774536934152641, + "language_loss": 0.76836276, + "learning_rate": 3.182781878250118e-07, + "loss": 0.78915936, + "num_input_tokens_seen": 295350495, + "step": 13695, + "time_per_iteration": 2.750267744064331 + }, + { + "auxiliary_loss_clip": 0.01087721, + "auxiliary_loss_mlp": 0.01034204, + "balance_loss_clip": 1.03655171, + "balance_loss_mlp": 1.02215171, + "epoch": 0.823448068540508, + "flos": 20557746109440.0, + "grad_norm": 1.7975071163239338, + "language_loss": 0.80965418, + "learning_rate": 3.1806742308239985e-07, + "loss": 0.83087343, + "num_input_tokens_seen": 295368225, + "step": 13696, + "time_per_iteration": 2.6955337524414062 + }, + { + "auxiliary_loss_clip": 0.01020282, + "auxiliary_loss_mlp": 0.0100384, + "balance_loss_clip": 1.00769222, + "balance_loss_mlp": 1.00285649, + "epoch": 0.823508191793176, + "flos": 67273688194560.0, + "grad_norm": 0.7350797292935349, + "language_loss": 0.63867533, + "learning_rate": 3.178567221188393e-07, + "loss": 0.65891653, + "num_input_tokens_seen": 295430035, + "step": 13697, + "time_per_iteration": 3.2243242263793945 + }, + { + "auxiliary_loss_clip": 0.01070899, + "auxiliary_loss_mlp": 0.01025732, + "balance_loss_clip": 1.03476644, + "balance_loss_mlp": 1.01477075, + "epoch": 0.8235683150458439, + "flos": 17928402641280.0, + "grad_norm": 1.6913547769566408, + "language_loss": 0.72991723, + "learning_rate": 3.1764608494232037e-07, + "loss": 0.75088358, + "num_input_tokens_seen": 295447765, + "step": 13698, + "time_per_iteration": 2.670644998550415 + }, + { + "auxiliary_loss_clip": 0.01063119, + "auxiliary_loss_mlp": 0.01047662, + "balance_loss_clip": 1.03002477, + "balance_loss_mlp": 1.03214049, + "epoch": 0.823628438298512, + "flos": 18916089891840.0, + "grad_norm": 1.861601543372515, + "language_loss": 0.71800578, + "learning_rate": 3.174355115608305e-07, + "loss": 0.73911357, + "num_input_tokens_seen": 295464810, + "step": 13699, + "time_per_iteration": 2.7969279289245605 + }, + { + "auxiliary_loss_clip": 0.01086761, + "auxiliary_loss_mlp": 0.01028632, + "balance_loss_clip": 1.03632307, + "balance_loss_mlp": 1.01650214, + "epoch": 0.8236885615511799, + "flos": 18696460181760.0, + "grad_norm": 1.9855299133733353, + "language_loss": 0.8196975, + "learning_rate": 3.1722500198235526e-07, + "loss": 0.84085149, + "num_input_tokens_seen": 295482605, + "step": 13700, + "time_per_iteration": 2.6503469944000244 + }, + { + "auxiliary_loss_clip": 0.01086133, + "auxiliary_loss_mlp": 0.01035546, + "balance_loss_clip": 1.03498542, + "balance_loss_mlp": 1.02366662, + "epoch": 0.8237486848038479, + "flos": 23695009845120.0, + "grad_norm": 1.6635741154144412, + "language_loss": 0.73422629, + "learning_rate": 3.170145562148763e-07, + "loss": 0.7554431, + "num_input_tokens_seen": 295503780, + "step": 13701, + "time_per_iteration": 2.6358823776245117 + }, + { + "auxiliary_loss_clip": 0.01097849, + "auxiliary_loss_mlp": 0.01036965, + "balance_loss_clip": 1.03509569, + "balance_loss_mlp": 1.02432895, + "epoch": 0.8238088080565159, + "flos": 23441301106560.0, + "grad_norm": 1.9462768217086985, + "language_loss": 0.69265807, + "learning_rate": 3.1680417426637384e-07, + "loss": 0.71400625, + "num_input_tokens_seen": 295522035, + "step": 13702, + "time_per_iteration": 2.60188627243042 + }, + { + "auxiliary_loss_clip": 0.01063324, + "auxiliary_loss_mlp": 0.01034265, + "balance_loss_clip": 1.03598332, + "balance_loss_mlp": 1.02128875, + "epoch": 0.8238689313091838, + "flos": 22746537267840.0, + "grad_norm": 1.9923897807991633, + "language_loss": 0.75280106, + "learning_rate": 3.1659385614482603e-07, + "loss": 0.77377695, + "num_input_tokens_seen": 295541190, + "step": 13703, + "time_per_iteration": 2.7468554973602295 + }, + { + "auxiliary_loss_clip": 0.01113854, + "auxiliary_loss_mlp": 0.01037893, + "balance_loss_clip": 1.03847456, + "balance_loss_mlp": 1.02467299, + "epoch": 0.8239290545618518, + "flos": 25630092264960.0, + "grad_norm": 1.7182748421567742, + "language_loss": 0.69657588, + "learning_rate": 3.1638360185820755e-07, + "loss": 0.71809334, + "num_input_tokens_seen": 295558860, + "step": 13704, + "time_per_iteration": 2.5931785106658936 + }, + { + "auxiliary_loss_clip": 0.01105612, + "auxiliary_loss_mlp": 0.01030565, + "balance_loss_clip": 1.03566051, + "balance_loss_mlp": 1.01859665, + "epoch": 0.8239891778145197, + "flos": 26026473824640.0, + "grad_norm": 1.8447020844215793, + "language_loss": 0.64444757, + "learning_rate": 3.161734114144916e-07, + "loss": 0.66580933, + "num_input_tokens_seen": 295578155, + "step": 13705, + "time_per_iteration": 2.5968048572540283 + }, + { + "auxiliary_loss_clip": 0.01110492, + "auxiliary_loss_mlp": 0.01031144, + "balance_loss_clip": 1.03668666, + "balance_loss_mlp": 1.01796532, + "epoch": 0.8240493010671878, + "flos": 21833257040640.0, + "grad_norm": 1.541851656815521, + "language_loss": 0.69572484, + "learning_rate": 3.1596328482164915e-07, + "loss": 0.71714121, + "num_input_tokens_seen": 295599170, + "step": 13706, + "time_per_iteration": 2.5887668132781982 + }, + { + "auxiliary_loss_clip": 0.0108328, + "auxiliary_loss_mlp": 0.01039333, + "balance_loss_clip": 1.03719616, + "balance_loss_mlp": 1.02601147, + "epoch": 0.8241094243198557, + "flos": 18551919853440.0, + "grad_norm": 1.661218457463816, + "language_loss": 0.69479191, + "learning_rate": 3.157532220876475e-07, + "loss": 0.71601802, + "num_input_tokens_seen": 295617465, + "step": 13707, + "time_per_iteration": 2.6411385536193848 + }, + { + "auxiliary_loss_clip": 0.01072958, + "auxiliary_loss_mlp": 0.01038335, + "balance_loss_clip": 1.03431034, + "balance_loss_mlp": 1.0244596, + "epoch": 0.8241695475725237, + "flos": 25447163276160.0, + "grad_norm": 1.8467879085994943, + "language_loss": 0.79235733, + "learning_rate": 3.1554322322045226e-07, + "loss": 0.81347024, + "num_input_tokens_seen": 295634960, + "step": 13708, + "time_per_iteration": 2.700183153152466 + }, + { + "auxiliary_loss_clip": 0.01092221, + "auxiliary_loss_mlp": 0.0103148, + "balance_loss_clip": 1.0341289, + "balance_loss_mlp": 1.01864731, + "epoch": 0.8242296708251916, + "flos": 18989670902400.0, + "grad_norm": 3.0430641807268954, + "language_loss": 0.68361056, + "learning_rate": 3.1533328822802664e-07, + "loss": 0.70484757, + "num_input_tokens_seen": 295652725, + "step": 13709, + "time_per_iteration": 2.5937395095825195 + }, + { + "auxiliary_loss_clip": 0.01065101, + "auxiliary_loss_mlp": 0.01032868, + "balance_loss_clip": 1.03181398, + "balance_loss_mlp": 1.02109027, + "epoch": 0.8242897940778596, + "flos": 22600883617920.0, + "grad_norm": 1.766284405655816, + "language_loss": 0.82331645, + "learning_rate": 3.151234171183319e-07, + "loss": 0.84429616, + "num_input_tokens_seen": 295671195, + "step": 13710, + "time_per_iteration": 2.749650239944458 + }, + { + "auxiliary_loss_clip": 0.01096973, + "auxiliary_loss_mlp": 0.01034028, + "balance_loss_clip": 1.03629923, + "balance_loss_mlp": 1.02127314, + "epoch": 0.8243499173305275, + "flos": 21468153248640.0, + "grad_norm": 13.701839105359984, + "language_loss": 0.78112018, + "learning_rate": 3.149136098993257e-07, + "loss": 0.80243027, + "num_input_tokens_seen": 295689130, + "step": 13711, + "time_per_iteration": 2.7447783946990967 + }, + { + "auxiliary_loss_clip": 0.0107344, + "auxiliary_loss_mlp": 0.01029912, + "balance_loss_clip": 1.03311896, + "balance_loss_mlp": 1.01736498, + "epoch": 0.8244100405831956, + "flos": 20010359773440.0, + "grad_norm": 3.3468765947444568, + "language_loss": 0.65435582, + "learning_rate": 3.1470386657896473e-07, + "loss": 0.67538929, + "num_input_tokens_seen": 295706385, + "step": 13712, + "time_per_iteration": 4.317276477813721 + }, + { + "auxiliary_loss_clip": 0.01091569, + "auxiliary_loss_mlp": 0.01029045, + "balance_loss_clip": 1.03673708, + "balance_loss_mlp": 1.0174818, + "epoch": 0.8244701638358635, + "flos": 26430684549120.0, + "grad_norm": 1.8364742562696106, + "language_loss": 0.74371034, + "learning_rate": 3.14494187165202e-07, + "loss": 0.76491648, + "num_input_tokens_seen": 295727925, + "step": 13713, + "time_per_iteration": 4.166277647018433 + }, + { + "auxiliary_loss_clip": 0.01096875, + "auxiliary_loss_mlp": 0.01027842, + "balance_loss_clip": 1.03534007, + "balance_loss_mlp": 1.01558685, + "epoch": 0.8245302870885315, + "flos": 17640004343040.0, + "grad_norm": 6.838551643078677, + "language_loss": 0.80911207, + "learning_rate": 3.1428457166598833e-07, + "loss": 0.83035922, + "num_input_tokens_seen": 295744420, + "step": 13714, + "time_per_iteration": 2.624154806137085 + }, + { + "auxiliary_loss_clip": 0.01099074, + "auxiliary_loss_mlp": 0.01034976, + "balance_loss_clip": 1.03917253, + "balance_loss_mlp": 1.02173758, + "epoch": 0.8245904103411995, + "flos": 26209510554240.0, + "grad_norm": 1.9766045334359852, + "language_loss": 0.66371924, + "learning_rate": 3.1407502008927235e-07, + "loss": 0.68505979, + "num_input_tokens_seen": 295765105, + "step": 13715, + "time_per_iteration": 4.212578296661377 + }, + { + "auxiliary_loss_clip": 0.01081096, + "auxiliary_loss_mlp": 0.01029311, + "balance_loss_clip": 1.03784251, + "balance_loss_mlp": 1.01657939, + "epoch": 0.8246505335938674, + "flos": 24205084928640.0, + "grad_norm": 2.0424767412149567, + "language_loss": 0.74730164, + "learning_rate": 3.1386553244300086e-07, + "loss": 0.76840568, + "num_input_tokens_seen": 295784200, + "step": 13716, + "time_per_iteration": 2.7325594425201416 + }, + { + "auxiliary_loss_clip": 0.00991112, + "auxiliary_loss_mlp": 0.0100064, + "balance_loss_clip": 1.00916934, + "balance_loss_mlp": 0.99952489, + "epoch": 0.8247106568465354, + "flos": 67092195749760.0, + "grad_norm": 0.7138774267720784, + "language_loss": 0.58973479, + "learning_rate": 3.136561087351175e-07, + "loss": 0.60965228, + "num_input_tokens_seen": 295846555, + "step": 13717, + "time_per_iteration": 4.931637763977051 + }, + { + "auxiliary_loss_clip": 0.01094759, + "auxiliary_loss_mlp": 0.00770088, + "balance_loss_clip": 1.03633809, + "balance_loss_mlp": 1.00021517, + "epoch": 0.8247707800992033, + "flos": 12568232805120.0, + "grad_norm": 1.8911400591103953, + "language_loss": 0.79565227, + "learning_rate": 3.1344674897356373e-07, + "loss": 0.81430078, + "num_input_tokens_seen": 295863425, + "step": 13718, + "time_per_iteration": 2.6436800956726074 + }, + { + "auxiliary_loss_clip": 0.01088621, + "auxiliary_loss_mlp": 0.01033203, + "balance_loss_clip": 1.03615391, + "balance_loss_mlp": 1.02111554, + "epoch": 0.8248309033518714, + "flos": 15923617879680.0, + "grad_norm": 1.5520316842938593, + "language_loss": 0.68703258, + "learning_rate": 3.132374531662778e-07, + "loss": 0.70825082, + "num_input_tokens_seen": 295880925, + "step": 13719, + "time_per_iteration": 2.716325044631958 + }, + { + "auxiliary_loss_clip": 0.01079067, + "auxiliary_loss_mlp": 0.01034824, + "balance_loss_clip": 1.03340077, + "balance_loss_mlp": 1.0202682, + "epoch": 0.8248910266045393, + "flos": 17564735393280.0, + "grad_norm": 2.6589956640079038, + "language_loss": 0.70158517, + "learning_rate": 3.13028221321197e-07, + "loss": 0.72272408, + "num_input_tokens_seen": 295898205, + "step": 13720, + "time_per_iteration": 2.5896477699279785 + }, + { + "auxiliary_loss_clip": 0.01033476, + "auxiliary_loss_mlp": 0.01024097, + "balance_loss_clip": 1.03508949, + "balance_loss_mlp": 1.01189578, + "epoch": 0.8249511498572073, + "flos": 28619655275520.0, + "grad_norm": 1.5927327922033778, + "language_loss": 0.75676763, + "learning_rate": 3.1281905344625467e-07, + "loss": 0.77734333, + "num_input_tokens_seen": 295918130, + "step": 13721, + "time_per_iteration": 3.003366470336914 + }, + { + "auxiliary_loss_clip": 0.01064431, + "auxiliary_loss_mlp": 0.0102768, + "balance_loss_clip": 1.04172993, + "balance_loss_mlp": 1.01569343, + "epoch": 0.8250112731098752, + "flos": 25556583081600.0, + "grad_norm": 1.9277434065767896, + "language_loss": 0.7792846, + "learning_rate": 3.1260994954938305e-07, + "loss": 0.80020571, + "num_input_tokens_seen": 295937760, + "step": 13722, + "time_per_iteration": 2.993467092514038 + }, + { + "auxiliary_loss_clip": 0.01107933, + "auxiliary_loss_mlp": 0.01030614, + "balance_loss_clip": 1.03832984, + "balance_loss_mlp": 1.01868153, + "epoch": 0.8250713963625432, + "flos": 27746164339200.0, + "grad_norm": 1.9336689467836483, + "language_loss": 0.63077027, + "learning_rate": 3.1240090963851205e-07, + "loss": 0.6521557, + "num_input_tokens_seen": 295957585, + "step": 13723, + "time_per_iteration": 2.65627384185791 + }, + { + "auxiliary_loss_clip": 0.01109221, + "auxiliary_loss_mlp": 0.0103494, + "balance_loss_clip": 1.03650689, + "balance_loss_mlp": 1.0223273, + "epoch": 0.8251315196152111, + "flos": 21610610588160.0, + "grad_norm": 1.425967776015011, + "language_loss": 0.74256718, + "learning_rate": 3.121919337215666e-07, + "loss": 0.76400876, + "num_input_tokens_seen": 295977135, + "step": 13724, + "time_per_iteration": 2.6450181007385254 + }, + { + "auxiliary_loss_clip": 0.01076005, + "auxiliary_loss_mlp": 0.01035593, + "balance_loss_clip": 1.03590727, + "balance_loss_mlp": 1.02253342, + "epoch": 0.8251916428678792, + "flos": 28579363194240.0, + "grad_norm": 1.8109135586659708, + "language_loss": 0.6419245, + "learning_rate": 3.1198302180647253e-07, + "loss": 0.66304046, + "num_input_tokens_seen": 295996265, + "step": 13725, + "time_per_iteration": 2.747354507446289 + }, + { + "auxiliary_loss_clip": 0.01081699, + "auxiliary_loss_mlp": 0.01029734, + "balance_loss_clip": 1.03467178, + "balance_loss_mlp": 1.01717496, + "epoch": 0.8252517661205471, + "flos": 23075191733760.0, + "grad_norm": 1.5423551170824084, + "language_loss": 0.81953287, + "learning_rate": 3.1177417390115125e-07, + "loss": 0.84064722, + "num_input_tokens_seen": 296014745, + "step": 13726, + "time_per_iteration": 2.677957057952881 + }, + { + "auxiliary_loss_clip": 0.01090181, + "auxiliary_loss_mlp": 0.01033897, + "balance_loss_clip": 1.03259659, + "balance_loss_mlp": 1.02245855, + "epoch": 0.8253118893732151, + "flos": 31759576617600.0, + "grad_norm": 1.6832694134847563, + "language_loss": 0.70317417, + "learning_rate": 3.1156539001352286e-07, + "loss": 0.72441494, + "num_input_tokens_seen": 296036960, + "step": 13727, + "time_per_iteration": 2.6928937435150146 + }, + { + "auxiliary_loss_clip": 0.01102136, + "auxiliary_loss_mlp": 0.01028405, + "balance_loss_clip": 1.03817558, + "balance_loss_mlp": 1.01547694, + "epoch": 0.8253720126258831, + "flos": 18296415434880.0, + "grad_norm": 1.667834208410725, + "language_loss": 0.62520349, + "learning_rate": 3.113566701515036e-07, + "loss": 0.64650893, + "num_input_tokens_seen": 296056540, + "step": 13728, + "time_per_iteration": 2.6370222568511963 + }, + { + "auxiliary_loss_clip": 0.01092032, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.03751087, + "balance_loss_mlp": 1.0174228, + "epoch": 0.825432135878551, + "flos": 26797332625920.0, + "grad_norm": 1.9482709923382855, + "language_loss": 0.71667683, + "learning_rate": 3.111480143230092e-07, + "loss": 0.73789644, + "num_input_tokens_seen": 296077950, + "step": 13729, + "time_per_iteration": 2.6492090225219727 + }, + { + "auxiliary_loss_clip": 0.01014426, + "auxiliary_loss_mlp": 0.0100436, + "balance_loss_clip": 1.01090586, + "balance_loss_mlp": 1.00330532, + "epoch": 0.825492259131219, + "flos": 54219116217600.0, + "grad_norm": 0.8488116722729574, + "language_loss": 0.6264025, + "learning_rate": 3.109394225359514e-07, + "loss": 0.64659035, + "num_input_tokens_seen": 296127060, + "step": 13730, + "time_per_iteration": 3.0054545402526855 + }, + { + "auxiliary_loss_clip": 0.01058894, + "auxiliary_loss_mlp": 0.01034161, + "balance_loss_clip": 1.03521633, + "balance_loss_mlp": 1.02156639, + "epoch": 0.825552382383887, + "flos": 43756145493120.0, + "grad_norm": 7.4365130225127505, + "language_loss": 0.6353327, + "learning_rate": 3.1073089479823945e-07, + "loss": 0.65626323, + "num_input_tokens_seen": 296147775, + "step": 13731, + "time_per_iteration": 2.9331674575805664 + }, + { + "auxiliary_loss_clip": 0.0107139, + "auxiliary_loss_mlp": 0.00773278, + "balance_loss_clip": 1.03046966, + "balance_loss_mlp": 1.0002327, + "epoch": 0.825612505636555, + "flos": 12602814624000.0, + "grad_norm": 2.180240651143821, + "language_loss": 0.70295024, + "learning_rate": 3.105224311177812e-07, + "loss": 0.72139692, + "num_input_tokens_seen": 296163560, + "step": 13732, + "time_per_iteration": 2.765413761138916 + }, + { + "auxiliary_loss_clip": 0.01100354, + "auxiliary_loss_mlp": 0.01038249, + "balance_loss_clip": 1.03632462, + "balance_loss_mlp": 1.02532113, + "epoch": 0.8256726288892229, + "flos": 17595618111360.0, + "grad_norm": 2.287080193464761, + "language_loss": 0.71307957, + "learning_rate": 3.103140315024817e-07, + "loss": 0.7344656, + "num_input_tokens_seen": 296178730, + "step": 13733, + "time_per_iteration": 2.663184642791748 + }, + { + "auxiliary_loss_clip": 0.01106421, + "auxiliary_loss_mlp": 0.01033457, + "balance_loss_clip": 1.03536689, + "balance_loss_mlp": 1.02092218, + "epoch": 0.8257327521418909, + "flos": 23805794367360.0, + "grad_norm": 1.5370953364737692, + "language_loss": 0.82361829, + "learning_rate": 3.1010569596024437e-07, + "loss": 0.84501708, + "num_input_tokens_seen": 296200175, + "step": 13734, + "time_per_iteration": 2.5860283374786377 + }, + { + "auxiliary_loss_clip": 0.01078022, + "auxiliary_loss_mlp": 0.01033879, + "balance_loss_clip": 1.03394449, + "balance_loss_mlp": 1.02108788, + "epoch": 0.8257928753945588, + "flos": 19281121856640.0, + "grad_norm": 1.767732379268741, + "language_loss": 0.8304292, + "learning_rate": 3.098974244989676e-07, + "loss": 0.85154831, + "num_input_tokens_seen": 296219305, + "step": 13735, + "time_per_iteration": 2.6341776847839355 + }, + { + "auxiliary_loss_clip": 0.01103224, + "auxiliary_loss_mlp": 0.01029169, + "balance_loss_clip": 1.03989172, + "balance_loss_mlp": 1.01795721, + "epoch": 0.8258529986472268, + "flos": 18478841633280.0, + "grad_norm": 1.736444707629355, + "language_loss": 0.70653635, + "learning_rate": 3.096892171265497e-07, + "loss": 0.72786027, + "num_input_tokens_seen": 296236945, + "step": 13736, + "time_per_iteration": 2.5950427055358887 + }, + { + "auxiliary_loss_clip": 0.01021603, + "auxiliary_loss_mlp": 0.01002911, + "balance_loss_clip": 1.00879157, + "balance_loss_mlp": 1.00194514, + "epoch": 0.8259131218998947, + "flos": 62137957512960.0, + "grad_norm": 0.8987273381116809, + "language_loss": 0.6798467, + "learning_rate": 3.0948107385088665e-07, + "loss": 0.70009184, + "num_input_tokens_seen": 296294685, + "step": 13737, + "time_per_iteration": 3.1607825756073 + }, + { + "auxiliary_loss_clip": 0.01084099, + "auxiliary_loss_mlp": 0.010326, + "balance_loss_clip": 1.0343399, + "balance_loss_mlp": 1.02032113, + "epoch": 0.8259732451525628, + "flos": 22159038418560.0, + "grad_norm": 1.7543830364671171, + "language_loss": 0.69818115, + "learning_rate": 3.0927299467987e-07, + "loss": 0.71934807, + "num_input_tokens_seen": 296314790, + "step": 13738, + "time_per_iteration": 2.715946912765503 + }, + { + "auxiliary_loss_clip": 0.01092604, + "auxiliary_loss_mlp": 0.01029077, + "balance_loss_clip": 1.03914821, + "balance_loss_mlp": 1.01492107, + "epoch": 0.8260333684052307, + "flos": 38361645233280.0, + "grad_norm": 1.9809949104241253, + "language_loss": 0.63092321, + "learning_rate": 3.090649796213911e-07, + "loss": 0.65214008, + "num_input_tokens_seen": 296335355, + "step": 13739, + "time_per_iteration": 2.8820793628692627 + }, + { + "auxiliary_loss_clip": 0.01011074, + "auxiliary_loss_mlp": 0.01000594, + "balance_loss_clip": 1.00743914, + "balance_loss_mlp": 0.99961609, + "epoch": 0.8260934916578987, + "flos": 62185611882240.0, + "grad_norm": 0.8059815006501098, + "language_loss": 0.59246588, + "learning_rate": 3.0885702868333853e-07, + "loss": 0.61258256, + "num_input_tokens_seen": 296399885, + "step": 13740, + "time_per_iteration": 3.2520594596862793 + }, + { + "auxiliary_loss_clip": 0.01114893, + "auxiliary_loss_mlp": 0.01034041, + "balance_loss_clip": 1.03906655, + "balance_loss_mlp": 1.02052891, + "epoch": 0.8261536149105667, + "flos": 22565475786240.0, + "grad_norm": 2.0240971997235317, + "language_loss": 0.75221682, + "learning_rate": 3.086491418735959e-07, + "loss": 0.7737062, + "num_input_tokens_seen": 296417660, + "step": 13741, + "time_per_iteration": 2.543391704559326 + }, + { + "auxiliary_loss_clip": 0.01096486, + "auxiliary_loss_mlp": 0.01034655, + "balance_loss_clip": 1.03584099, + "balance_loss_mlp": 1.02222109, + "epoch": 0.8262137381632346, + "flos": 32525479342080.0, + "grad_norm": 1.8715316592875402, + "language_loss": 0.62344342, + "learning_rate": 3.0844131920004726e-07, + "loss": 0.64475489, + "num_input_tokens_seen": 296438255, + "step": 13742, + "time_per_iteration": 2.7066636085510254 + }, + { + "auxiliary_loss_clip": 0.01066607, + "auxiliary_loss_mlp": 0.01036357, + "balance_loss_clip": 1.03625488, + "balance_loss_mlp": 1.02224827, + "epoch": 0.8262738614159026, + "flos": 14136451666560.0, + "grad_norm": 2.739309614101712, + "language_loss": 0.65881348, + "learning_rate": 3.0823356067057327e-07, + "loss": 0.67984313, + "num_input_tokens_seen": 296454485, + "step": 13743, + "time_per_iteration": 2.722188949584961 + }, + { + "auxiliary_loss_clip": 0.01089117, + "auxiliary_loss_mlp": 0.01035575, + "balance_loss_clip": 1.0356648, + "balance_loss_mlp": 1.02275968, + "epoch": 0.8263339846685706, + "flos": 19825347795840.0, + "grad_norm": 1.7892755960798923, + "language_loss": 0.66778719, + "learning_rate": 3.0802586629305283e-07, + "loss": 0.6890341, + "num_input_tokens_seen": 296473740, + "step": 13744, + "time_per_iteration": 2.632858991622925 + }, + { + "auxiliary_loss_clip": 0.01077178, + "auxiliary_loss_mlp": 0.01032278, + "balance_loss_clip": 1.03721189, + "balance_loss_mlp": 1.02044034, + "epoch": 0.8263941079212386, + "flos": 22745962650240.0, + "grad_norm": 1.8023826175749642, + "language_loss": 0.75316632, + "learning_rate": 3.078182360753612e-07, + "loss": 0.77426088, + "num_input_tokens_seen": 296493355, + "step": 13745, + "time_per_iteration": 2.7315781116485596 + }, + { + "auxiliary_loss_clip": 0.01077899, + "auxiliary_loss_mlp": 0.0077187, + "balance_loss_clip": 1.03393078, + "balance_loss_mlp": 1.00011253, + "epoch": 0.8264542311739065, + "flos": 20120641505280.0, + "grad_norm": 1.8014211048676299, + "language_loss": 0.79279208, + "learning_rate": 3.076106700253709e-07, + "loss": 0.81128979, + "num_input_tokens_seen": 296510520, + "step": 13746, + "time_per_iteration": 2.6316795349121094 + }, + { + "auxiliary_loss_clip": 0.01103647, + "auxiliary_loss_mlp": 0.01036543, + "balance_loss_clip": 1.03922772, + "balance_loss_mlp": 1.02318525, + "epoch": 0.8265143544265745, + "flos": 16837149502080.0, + "grad_norm": 1.8721646210593863, + "language_loss": 0.68316424, + "learning_rate": 3.0740316815095415e-07, + "loss": 0.70456612, + "num_input_tokens_seen": 296528265, + "step": 13747, + "time_per_iteration": 2.586827039718628 + }, + { + "auxiliary_loss_clip": 0.0109475, + "auxiliary_loss_mlp": 0.01037445, + "balance_loss_clip": 1.03468108, + "balance_loss_mlp": 1.02315235, + "epoch": 0.8265744776792424, + "flos": 22018592240640.0, + "grad_norm": 1.994737585930927, + "language_loss": 0.75182354, + "learning_rate": 3.0719573045997835e-07, + "loss": 0.77314556, + "num_input_tokens_seen": 296547810, + "step": 13748, + "time_per_iteration": 2.650148868560791 + }, + { + "auxiliary_loss_clip": 0.01071464, + "auxiliary_loss_mlp": 0.01033138, + "balance_loss_clip": 1.03686166, + "balance_loss_mlp": 1.02170539, + "epoch": 0.8266346009319104, + "flos": 19244852098560.0, + "grad_norm": 1.689203762569125, + "language_loss": 0.64030862, + "learning_rate": 3.069883569603102e-07, + "loss": 0.6613546, + "num_input_tokens_seen": 296565940, + "step": 13749, + "time_per_iteration": 2.757077217102051 + }, + { + "auxiliary_loss_clip": 0.01082519, + "auxiliary_loss_mlp": 0.01028885, + "balance_loss_clip": 1.03196669, + "balance_loss_mlp": 1.01680279, + "epoch": 0.8266947241845783, + "flos": 24166768095360.0, + "grad_norm": 1.5570975728465015, + "language_loss": 0.73744154, + "learning_rate": 3.067810476598132e-07, + "loss": 0.75855553, + "num_input_tokens_seen": 296585090, + "step": 13750, + "time_per_iteration": 2.714416742324829 + }, + { + "auxiliary_loss_clip": 0.01099886, + "auxiliary_loss_mlp": 0.01035041, + "balance_loss_clip": 1.03848624, + "balance_loss_mlp": 1.02245283, + "epoch": 0.8267548474372464, + "flos": 21105814803840.0, + "grad_norm": 1.7884167706589897, + "language_loss": 0.65513742, + "learning_rate": 3.065738025663496e-07, + "loss": 0.67648673, + "num_input_tokens_seen": 296604950, + "step": 13751, + "time_per_iteration": 5.785562753677368 + }, + { + "auxiliary_loss_clip": 0.01081731, + "auxiliary_loss_mlp": 0.01027835, + "balance_loss_clip": 1.03284156, + "balance_loss_mlp": 1.01637304, + "epoch": 0.8268149706899143, + "flos": 39968288668800.0, + "grad_norm": 1.5963517669581677, + "language_loss": 0.60753131, + "learning_rate": 3.0636662168777607e-07, + "loss": 0.628627, + "num_input_tokens_seen": 296627780, + "step": 13752, + "time_per_iteration": 2.755326747894287 + }, + { + "auxiliary_loss_clip": 0.01018872, + "auxiliary_loss_mlp": 0.0100062, + "balance_loss_clip": 1.00675297, + "balance_loss_mlp": 0.99959439, + "epoch": 0.8268750939425823, + "flos": 65782423244160.0, + "grad_norm": 0.7684012049495671, + "language_loss": 0.57412326, + "learning_rate": 3.0615950503194986e-07, + "loss": 0.59431815, + "num_input_tokens_seen": 296683850, + "step": 13753, + "time_per_iteration": 3.1750407218933105 + }, + { + "auxiliary_loss_clip": 0.0099067, + "auxiliary_loss_mlp": 0.00751461, + "balance_loss_clip": 1.01540029, + "balance_loss_mlp": 0.99955767, + "epoch": 0.8269352171952503, + "flos": 52981455242880.0, + "grad_norm": 0.6979413175863002, + "language_loss": 0.54908955, + "learning_rate": 3.0595245260672563e-07, + "loss": 0.56651086, + "num_input_tokens_seen": 296741420, + "step": 13754, + "time_per_iteration": 4.901344299316406 + }, + { + "auxiliary_loss_clip": 0.0106662, + "auxiliary_loss_mlp": 0.01033194, + "balance_loss_clip": 1.03270113, + "balance_loss_mlp": 1.02221489, + "epoch": 0.8269953404479182, + "flos": 23076125487360.0, + "grad_norm": 1.746367517796231, + "language_loss": 0.69104445, + "learning_rate": 3.0574546441995354e-07, + "loss": 0.71204263, + "num_input_tokens_seen": 296759620, + "step": 13755, + "time_per_iteration": 3.0003440380096436 + }, + { + "auxiliary_loss_clip": 0.01062261, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.03354418, + "balance_loss_mlp": 1.01864886, + "epoch": 0.8270554637005862, + "flos": 14209996763520.0, + "grad_norm": 1.955736447357461, + "language_loss": 0.70088506, + "learning_rate": 3.0553854047948324e-07, + "loss": 0.72180462, + "num_input_tokens_seen": 296777275, + "step": 13756, + "time_per_iteration": 4.257762432098389 + }, + { + "auxiliary_loss_clip": 0.01102671, + "auxiliary_loss_mlp": 0.01033469, + "balance_loss_clip": 1.04094052, + "balance_loss_mlp": 1.02107131, + "epoch": 0.8271155869532542, + "flos": 21762046327680.0, + "grad_norm": 1.737700331339717, + "language_loss": 0.72146094, + "learning_rate": 3.053316807931623e-07, + "loss": 0.74282235, + "num_input_tokens_seen": 296796655, + "step": 13757, + "time_per_iteration": 2.6405348777770996 + }, + { + "auxiliary_loss_clip": 0.01101277, + "auxiliary_loss_mlp": 0.01035132, + "balance_loss_clip": 1.03689456, + "balance_loss_mlp": 1.02129722, + "epoch": 0.8271757102059222, + "flos": 15120475729920.0, + "grad_norm": 2.690067923112346, + "language_loss": 0.6930806, + "learning_rate": 3.0512488536883283e-07, + "loss": 0.7144447, + "num_input_tokens_seen": 296813705, + "step": 13758, + "time_per_iteration": 2.6009304523468018 + }, + { + "auxiliary_loss_clip": 0.01083685, + "auxiliary_loss_mlp": 0.01028318, + "balance_loss_clip": 1.03558612, + "balance_loss_mlp": 1.01677251, + "epoch": 0.8272358334585901, + "flos": 24133730561280.0, + "grad_norm": 1.5602292181836888, + "language_loss": 0.69900572, + "learning_rate": 3.0491815421433775e-07, + "loss": 0.72012579, + "num_input_tokens_seen": 296833985, + "step": 13759, + "time_per_iteration": 2.6815199851989746 + }, + { + "auxiliary_loss_clip": 0.01087619, + "auxiliary_loss_mlp": 0.0103009, + "balance_loss_clip": 1.03719068, + "balance_loss_mlp": 1.01779902, + "epoch": 0.8272959567112581, + "flos": 18990712396800.0, + "grad_norm": 1.7630032179390376, + "language_loss": 0.70951492, + "learning_rate": 3.047114873375161e-07, + "loss": 0.73069203, + "num_input_tokens_seen": 296850150, + "step": 13760, + "time_per_iteration": 2.6415457725524902 + }, + { + "auxiliary_loss_clip": 0.0106558, + "auxiliary_loss_mlp": 0.01031388, + "balance_loss_clip": 1.03384495, + "balance_loss_mlp": 1.01930583, + "epoch": 0.827356079963926, + "flos": 20631614428800.0, + "grad_norm": 44.683792034890395, + "language_loss": 0.77058452, + "learning_rate": 3.0450488474620505e-07, + "loss": 0.79155421, + "num_input_tokens_seen": 296869585, + "step": 13761, + "time_per_iteration": 2.658909320831299 + }, + { + "auxiliary_loss_clip": 0.01075197, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.03739285, + "balance_loss_mlp": 1.0216558, + "epoch": 0.827416203216594, + "flos": 22416625825920.0, + "grad_norm": 1.6365377494190674, + "language_loss": 0.70046437, + "learning_rate": 3.042983464482387e-07, + "loss": 0.72154659, + "num_input_tokens_seen": 296887710, + "step": 13762, + "time_per_iteration": 2.6890883445739746 + }, + { + "auxiliary_loss_clip": 0.01056694, + "auxiliary_loss_mlp": 0.01032, + "balance_loss_clip": 1.03542447, + "balance_loss_mlp": 1.0196439, + "epoch": 0.827476326469262, + "flos": 19026192055680.0, + "grad_norm": 2.3529843833311297, + "language_loss": 0.70278549, + "learning_rate": 3.0409187245144853e-07, + "loss": 0.72367239, + "num_input_tokens_seen": 296906265, + "step": 13763, + "time_per_iteration": 2.7008395195007324 + }, + { + "auxiliary_loss_clip": 0.01013794, + "auxiliary_loss_mlp": 0.00999838, + "balance_loss_clip": 1.01678598, + "balance_loss_mlp": 0.99868739, + "epoch": 0.82753644972193, + "flos": 68500575089280.0, + "grad_norm": 0.8946836161965805, + "language_loss": 0.65109873, + "learning_rate": 3.038854627636651e-07, + "loss": 0.67123502, + "num_input_tokens_seen": 296971290, + "step": 13764, + "time_per_iteration": 3.350186586380005 + }, + { + "auxiliary_loss_clip": 0.01100213, + "auxiliary_loss_mlp": 0.01033972, + "balance_loss_clip": 1.03844428, + "balance_loss_mlp": 1.02069247, + "epoch": 0.8275965729745979, + "flos": 18405404277120.0, + "grad_norm": 1.9854785426124901, + "language_loss": 0.77840686, + "learning_rate": 3.0367911739271423e-07, + "loss": 0.79974878, + "num_input_tokens_seen": 296989060, + "step": 13765, + "time_per_iteration": 2.6723389625549316 + }, + { + "auxiliary_loss_clip": 0.01056381, + "auxiliary_loss_mlp": 0.01029974, + "balance_loss_clip": 1.03462076, + "balance_loss_mlp": 1.01668835, + "epoch": 0.8276566962272659, + "flos": 28512067063680.0, + "grad_norm": 1.6645003745934188, + "language_loss": 0.62420988, + "learning_rate": 3.034728363464214e-07, + "loss": 0.64507341, + "num_input_tokens_seen": 297011300, + "step": 13766, + "time_per_iteration": 2.811694383621216 + }, + { + "auxiliary_loss_clip": 0.01073861, + "auxiliary_loss_mlp": 0.01030979, + "balance_loss_clip": 1.03385091, + "balance_loss_mlp": 1.01828325, + "epoch": 0.8277168194799339, + "flos": 20230240878720.0, + "grad_norm": 1.6477178817747764, + "language_loss": 0.82427168, + "learning_rate": 3.03266619632609e-07, + "loss": 0.84532011, + "num_input_tokens_seen": 297030350, + "step": 13767, + "time_per_iteration": 2.716275453567505 + }, + { + "auxiliary_loss_clip": 0.01082823, + "auxiliary_loss_mlp": 0.0102858, + "balance_loss_clip": 1.03814888, + "balance_loss_mlp": 1.01580667, + "epoch": 0.8277769427326018, + "flos": 28476623318400.0, + "grad_norm": 1.6672913040668584, + "language_loss": 0.6903677, + "learning_rate": 3.030604672590964e-07, + "loss": 0.71148169, + "num_input_tokens_seen": 297049710, + "step": 13768, + "time_per_iteration": 2.688441753387451 + }, + { + "auxiliary_loss_clip": 0.0103987, + "auxiliary_loss_mlp": 0.0103503, + "balance_loss_clip": 1.02953947, + "balance_loss_mlp": 1.02242327, + "epoch": 0.8278370659852698, + "flos": 27197628768000.0, + "grad_norm": 2.022593721700604, + "language_loss": 0.74887329, + "learning_rate": 3.028543792337006e-07, + "loss": 0.76962233, + "num_input_tokens_seen": 297070510, + "step": 13769, + "time_per_iteration": 2.765038251876831 + }, + { + "auxiliary_loss_clip": 0.01084819, + "auxiliary_loss_mlp": 0.01030015, + "balance_loss_clip": 1.03507888, + "balance_loss_mlp": 1.01786184, + "epoch": 0.8278971892379378, + "flos": 37816126404480.0, + "grad_norm": 1.6742460818696816, + "language_loss": 0.74587786, + "learning_rate": 3.0264835556423675e-07, + "loss": 0.76702625, + "num_input_tokens_seen": 297092585, + "step": 13770, + "time_per_iteration": 2.78021502494812 + }, + { + "auxiliary_loss_clip": 0.010808, + "auxiliary_loss_mlp": 0.01033104, + "balance_loss_clip": 1.03744841, + "balance_loss_mlp": 1.0202477, + "epoch": 0.8279573124906058, + "flos": 22560160573440.0, + "grad_norm": 2.613906237758894, + "language_loss": 0.75822175, + "learning_rate": 3.0244239625851785e-07, + "loss": 0.77936077, + "num_input_tokens_seen": 297110055, + "step": 13771, + "time_per_iteration": 2.6900837421417236 + }, + { + "auxiliary_loss_clip": 0.01109049, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.03709054, + "balance_loss_mlp": 1.02323627, + "epoch": 0.8280174357432737, + "flos": 36064619418240.0, + "grad_norm": 1.6606339233038099, + "language_loss": 0.72508442, + "learning_rate": 3.0223650132435284e-07, + "loss": 0.74652761, + "num_input_tokens_seen": 297132170, + "step": 13772, + "time_per_iteration": 2.7568705081939697 + }, + { + "auxiliary_loss_clip": 0.01087016, + "auxiliary_loss_mlp": 0.01030121, + "balance_loss_clip": 1.03733611, + "balance_loss_mlp": 1.01710296, + "epoch": 0.8280775589959417, + "flos": 22961067246720.0, + "grad_norm": 2.2592902165659154, + "language_loss": 0.75143635, + "learning_rate": 3.0203067076955035e-07, + "loss": 0.77260774, + "num_input_tokens_seen": 297149515, + "step": 13773, + "time_per_iteration": 2.683868646621704 + }, + { + "auxiliary_loss_clip": 0.01062538, + "auxiliary_loss_mlp": 0.01034452, + "balance_loss_clip": 1.03560376, + "balance_loss_mlp": 1.02264452, + "epoch": 0.8281376822486096, + "flos": 26063282286720.0, + "grad_norm": 1.872449151264808, + "language_loss": 0.75778252, + "learning_rate": 3.01824904601915e-07, + "loss": 0.77875245, + "num_input_tokens_seen": 297170320, + "step": 13774, + "time_per_iteration": 2.7567591667175293 + }, + { + "auxiliary_loss_clip": 0.01081331, + "auxiliary_loss_mlp": 0.00770591, + "balance_loss_clip": 1.03898907, + "balance_loss_mlp": 1.00031459, + "epoch": 0.8281978055012776, + "flos": 20667776446080.0, + "grad_norm": 1.8056323038896689, + "language_loss": 0.74878412, + "learning_rate": 3.01619202829249e-07, + "loss": 0.76730335, + "num_input_tokens_seen": 297189935, + "step": 13775, + "time_per_iteration": 2.74230694770813 + }, + { + "auxiliary_loss_clip": 0.01112679, + "auxiliary_loss_mlp": 0.01030499, + "balance_loss_clip": 1.0371238, + "balance_loss_mlp": 1.01723146, + "epoch": 0.8282579287539455, + "flos": 29315281040640.0, + "grad_norm": 2.0814301454392994, + "language_loss": 0.73856264, + "learning_rate": 3.01413565459353e-07, + "loss": 0.75999445, + "num_input_tokens_seen": 297210885, + "step": 13776, + "time_per_iteration": 2.684095621109009 + }, + { + "auxiliary_loss_clip": 0.01053766, + "auxiliary_loss_mlp": 0.01037151, + "balance_loss_clip": 1.0285629, + "balance_loss_mlp": 1.02321506, + "epoch": 0.8283180520066136, + "flos": 15706178899200.0, + "grad_norm": 1.9371446657055744, + "language_loss": 0.77532077, + "learning_rate": 3.0120799250002483e-07, + "loss": 0.79622996, + "num_input_tokens_seen": 297228500, + "step": 13777, + "time_per_iteration": 2.7644686698913574 + }, + { + "auxiliary_loss_clip": 0.01096655, + "auxiliary_loss_mlp": 0.01029876, + "balance_loss_clip": 1.03806889, + "balance_loss_mlp": 1.01883733, + "epoch": 0.8283781752592815, + "flos": 24791470456320.0, + "grad_norm": 1.6926504608706043, + "language_loss": 0.82732141, + "learning_rate": 3.010024839590604e-07, + "loss": 0.8485868, + "num_input_tokens_seen": 297249470, + "step": 13778, + "time_per_iteration": 2.7171225547790527 + }, + { + "auxiliary_loss_clip": 0.01092306, + "auxiliary_loss_mlp": 0.01025464, + "balance_loss_clip": 1.03395522, + "balance_loss_mlp": 1.01303005, + "epoch": 0.8284382985119495, + "flos": 18982811404800.0, + "grad_norm": 1.8413358549591246, + "language_loss": 0.74458718, + "learning_rate": 3.0079703984425187e-07, + "loss": 0.76576483, + "num_input_tokens_seen": 297265970, + "step": 13779, + "time_per_iteration": 2.626110553741455 + }, + { + "auxiliary_loss_clip": 0.0100526, + "auxiliary_loss_mlp": 0.01000579, + "balance_loss_clip": 1.01090991, + "balance_loss_mlp": 0.99951804, + "epoch": 0.8284984217646175, + "flos": 61034460814080.0, + "grad_norm": 0.7702655263685751, + "language_loss": 0.56672931, + "learning_rate": 3.0059166016338954e-07, + "loss": 0.5867877, + "num_input_tokens_seen": 297325525, + "step": 13780, + "time_per_iteration": 3.212908983230591 + }, + { + "auxiliary_loss_clip": 0.01067858, + "auxiliary_loss_mlp": 0.01029582, + "balance_loss_clip": 1.03421974, + "balance_loss_mlp": 1.01657593, + "epoch": 0.8285585450172854, + "flos": 19714635100800.0, + "grad_norm": 1.699800901130364, + "language_loss": 0.79739404, + "learning_rate": 3.0038634492426205e-07, + "loss": 0.81836849, + "num_input_tokens_seen": 297345025, + "step": 13781, + "time_per_iteration": 2.655301809310913 + }, + { + "auxiliary_loss_clip": 0.01065725, + "auxiliary_loss_mlp": 0.01033354, + "balance_loss_clip": 1.03596509, + "balance_loss_mlp": 1.01966882, + "epoch": 0.8286186682699535, + "flos": 21688896280320.0, + "grad_norm": 1.8730371598803492, + "language_loss": 0.75640142, + "learning_rate": 3.001810941346543e-07, + "loss": 0.77739221, + "num_input_tokens_seen": 297363570, + "step": 13782, + "time_per_iteration": 2.6944918632507324 + }, + { + "auxiliary_loss_clip": 0.01095829, + "auxiliary_loss_mlp": 0.01033027, + "balance_loss_clip": 1.03388703, + "balance_loss_mlp": 1.02099264, + "epoch": 0.8286787915226214, + "flos": 25775566346880.0, + "grad_norm": 1.6561193474083664, + "language_loss": 0.76484203, + "learning_rate": 2.9997590780234983e-07, + "loss": 0.78613055, + "num_input_tokens_seen": 297385385, + "step": 13783, + "time_per_iteration": 2.6690874099731445 + }, + { + "auxiliary_loss_clip": 0.01107918, + "auxiliary_loss_mlp": 0.01028274, + "balance_loss_clip": 1.03614211, + "balance_loss_mlp": 1.01590598, + "epoch": 0.8287389147752894, + "flos": 21288348743040.0, + "grad_norm": 1.6914982205488613, + "language_loss": 0.73518729, + "learning_rate": 2.997707859351304e-07, + "loss": 0.75654924, + "num_input_tokens_seen": 297403950, + "step": 13784, + "time_per_iteration": 2.6368956565856934 + }, + { + "auxiliary_loss_clip": 0.01100253, + "auxiliary_loss_mlp": 0.01035986, + "balance_loss_clip": 1.03504157, + "balance_loss_mlp": 1.02221763, + "epoch": 0.8287990380279573, + "flos": 33544875323520.0, + "grad_norm": 6.002474127634083, + "language_loss": 0.69880319, + "learning_rate": 2.99565728540772e-07, + "loss": 0.72016555, + "num_input_tokens_seen": 297424565, + "step": 13785, + "time_per_iteration": 2.7842202186584473 + }, + { + "auxiliary_loss_clip": 0.01085403, + "auxiliary_loss_mlp": 0.01035636, + "balance_loss_clip": 1.03928435, + "balance_loss_mlp": 1.02327418, + "epoch": 0.8288591612806253, + "flos": 22966346545920.0, + "grad_norm": 1.401854742726992, + "language_loss": 0.68165773, + "learning_rate": 2.993607356270516e-07, + "loss": 0.7028681, + "num_input_tokens_seen": 297445180, + "step": 13786, + "time_per_iteration": 2.6792120933532715 + }, + { + "auxiliary_loss_clip": 0.01069299, + "auxiliary_loss_mlp": 0.01035995, + "balance_loss_clip": 1.03638959, + "balance_loss_mlp": 1.02368629, + "epoch": 0.8289192845332932, + "flos": 18588979710720.0, + "grad_norm": 1.8600312404195347, + "language_loss": 0.77116591, + "learning_rate": 2.991558072017426e-07, + "loss": 0.7922188, + "num_input_tokens_seen": 297463790, + "step": 13787, + "time_per_iteration": 2.7485241889953613 + }, + { + "auxiliary_loss_clip": 0.01090466, + "auxiliary_loss_mlp": 0.01033116, + "balance_loss_clip": 1.03657961, + "balance_loss_mlp": 1.02168417, + "epoch": 0.8289794077859612, + "flos": 15450423085440.0, + "grad_norm": 1.668975764455463, + "language_loss": 0.80241442, + "learning_rate": 2.989509432726163e-07, + "loss": 0.82365024, + "num_input_tokens_seen": 297480100, + "step": 13788, + "time_per_iteration": 2.646430730819702 + }, + { + "auxiliary_loss_clip": 0.01083639, + "auxiliary_loss_mlp": 0.01033973, + "balance_loss_clip": 1.03628707, + "balance_loss_mlp": 1.02209973, + "epoch": 0.8290395310386292, + "flos": 28877853214080.0, + "grad_norm": 1.718363547417138, + "language_loss": 0.71454132, + "learning_rate": 2.9874614384744014e-07, + "loss": 0.73571742, + "num_input_tokens_seen": 297499890, + "step": 13789, + "time_per_iteration": 2.6843364238739014 + }, + { + "auxiliary_loss_clip": 0.01076455, + "auxiliary_loss_mlp": 0.01028674, + "balance_loss_clip": 1.032372, + "balance_loss_mlp": 1.01604366, + "epoch": 0.8290996542912972, + "flos": 36576274700160.0, + "grad_norm": 2.586850358316563, + "language_loss": 0.68054211, + "learning_rate": 2.985414089339813e-07, + "loss": 0.7015934, + "num_input_tokens_seen": 297521440, + "step": 13790, + "time_per_iteration": 4.365084171295166 + }, + { + "auxiliary_loss_clip": 0.01099215, + "auxiliary_loss_mlp": 0.01030184, + "balance_loss_clip": 1.03627872, + "balance_loss_mlp": 1.01633167, + "epoch": 0.8291597775439651, + "flos": 23623009032960.0, + "grad_norm": 1.629598209312908, + "language_loss": 0.77366352, + "learning_rate": 2.9833673854000265e-07, + "loss": 0.7949574, + "num_input_tokens_seen": 297539920, + "step": 13791, + "time_per_iteration": 4.515652894973755 + }, + { + "auxiliary_loss_clip": 0.01083692, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.03688049, + "balance_loss_mlp": 1.01825666, + "epoch": 0.8292199007966331, + "flos": 21397481239680.0, + "grad_norm": 1.4251720115143436, + "language_loss": 0.70067787, + "learning_rate": 2.981321326732651e-07, + "loss": 0.72182631, + "num_input_tokens_seen": 297560000, + "step": 13792, + "time_per_iteration": 2.7335619926452637 + }, + { + "auxiliary_loss_clip": 0.0108758, + "auxiliary_loss_mlp": 0.01032436, + "balance_loss_clip": 1.03578472, + "balance_loss_mlp": 1.01971602, + "epoch": 0.829280024049301, + "flos": 28767607395840.0, + "grad_norm": 1.529482170283821, + "language_loss": 0.64886749, + "learning_rate": 2.9792759134152736e-07, + "loss": 0.67006767, + "num_input_tokens_seen": 297579300, + "step": 13793, + "time_per_iteration": 4.254675388336182 + }, + { + "auxiliary_loss_clip": 0.01052518, + "auxiliary_loss_mlp": 0.01038478, + "balance_loss_clip": 1.03231871, + "balance_loss_mlp": 1.02323079, + "epoch": 0.829340147301969, + "flos": 19938071652480.0, + "grad_norm": 1.91807865555319, + "language_loss": 0.66570354, + "learning_rate": 2.977231145525461e-07, + "loss": 0.6866135, + "num_input_tokens_seen": 297598095, + "step": 13794, + "time_per_iteration": 2.6897053718566895 + }, + { + "auxiliary_loss_clip": 0.01108178, + "auxiliary_loss_mlp": 0.01036453, + "balance_loss_clip": 1.03576493, + "balance_loss_mlp": 1.0234766, + "epoch": 0.829400270554637, + "flos": 25228575060480.0, + "grad_norm": 2.1693553604990132, + "language_loss": 0.66396624, + "learning_rate": 2.975187023140757e-07, + "loss": 0.68541253, + "num_input_tokens_seen": 297615955, + "step": 13795, + "time_per_iteration": 2.609815835952759 + }, + { + "auxiliary_loss_clip": 0.0101707, + "auxiliary_loss_mlp": 0.01041895, + "balance_loss_clip": 1.031497, + "balance_loss_mlp": 1.02748859, + "epoch": 0.829460393807305, + "flos": 24463570176000.0, + "grad_norm": 1.7274097985625807, + "language_loss": 0.66617584, + "learning_rate": 2.973143546338661e-07, + "loss": 0.68676549, + "num_input_tokens_seen": 297636285, + "step": 13796, + "time_per_iteration": 4.47485876083374 + }, + { + "auxiliary_loss_clip": 0.01060431, + "auxiliary_loss_mlp": 0.0104346, + "balance_loss_clip": 1.03264594, + "balance_loss_mlp": 1.02998924, + "epoch": 0.829520517059973, + "flos": 15122486891520.0, + "grad_norm": 1.7688571213307858, + "language_loss": 0.7208361, + "learning_rate": 2.971100715196666e-07, + "loss": 0.74187499, + "num_input_tokens_seen": 297653315, + "step": 13797, + "time_per_iteration": 2.996868133544922 + }, + { + "auxiliary_loss_clip": 0.01042783, + "auxiliary_loss_mlp": 0.01032554, + "balance_loss_clip": 1.03644705, + "balance_loss_mlp": 1.02056766, + "epoch": 0.8295806403126409, + "flos": 21579979265280.0, + "grad_norm": 2.64934630097921, + "language_loss": 0.72061169, + "learning_rate": 2.969058529792243e-07, + "loss": 0.74136508, + "num_input_tokens_seen": 297673480, + "step": 13798, + "time_per_iteration": 2.8359265327453613 + }, + { + "auxiliary_loss_clip": 0.01069075, + "auxiliary_loss_mlp": 0.01032261, + "balance_loss_clip": 1.03152323, + "balance_loss_mlp": 1.01987529, + "epoch": 0.8296407635653089, + "flos": 21726566668800.0, + "grad_norm": 1.5432798793202427, + "language_loss": 0.76292628, + "learning_rate": 2.967016990202822e-07, + "loss": 0.78393966, + "num_input_tokens_seen": 297693250, + "step": 13799, + "time_per_iteration": 2.693103790283203 + }, + { + "auxiliary_loss_clip": 0.01108566, + "auxiliary_loss_mlp": 0.01033314, + "balance_loss_clip": 1.03785658, + "balance_loss_mlp": 1.02094579, + "epoch": 0.8297008868179768, + "flos": 11181147252480.0, + "grad_norm": 1.9112775618394213, + "language_loss": 0.67614651, + "learning_rate": 2.9649760965058245e-07, + "loss": 0.69756532, + "num_input_tokens_seen": 297710975, + "step": 13800, + "time_per_iteration": 2.6247994899749756 + }, + { + "auxiliary_loss_clip": 0.01074439, + "auxiliary_loss_mlp": 0.01033186, + "balance_loss_clip": 1.03878558, + "balance_loss_mlp": 1.01930976, + "epoch": 0.8297610100706448, + "flos": 20664041431680.0, + "grad_norm": 2.709008705792723, + "language_loss": 0.74460614, + "learning_rate": 2.9629358487786515e-07, + "loss": 0.76568246, + "num_input_tokens_seen": 297730860, + "step": 13801, + "time_per_iteration": 2.7638845443725586 + }, + { + "auxiliary_loss_clip": 0.01063708, + "auxiliary_loss_mlp": 0.0102829, + "balance_loss_clip": 1.03407621, + "balance_loss_mlp": 1.01658368, + "epoch": 0.8298211333233128, + "flos": 20376325491840.0, + "grad_norm": 1.5797415663470742, + "language_loss": 0.73625791, + "learning_rate": 2.9608962470986476e-07, + "loss": 0.75717783, + "num_input_tokens_seen": 297749765, + "step": 13802, + "time_per_iteration": 2.7499916553497314 + }, + { + "auxiliary_loss_clip": 0.01088515, + "auxiliary_loss_mlp": 0.01029669, + "balance_loss_clip": 1.03459883, + "balance_loss_mlp": 1.01764071, + "epoch": 0.8298812565759808, + "flos": 21508696725120.0, + "grad_norm": 1.4712858328123304, + "language_loss": 0.74977744, + "learning_rate": 2.9588572915431644e-07, + "loss": 0.77095926, + "num_input_tokens_seen": 297770380, + "step": 13803, + "time_per_iteration": 2.7700328826904297 + }, + { + "auxiliary_loss_clip": 0.01099479, + "auxiliary_loss_mlp": 0.01034049, + "balance_loss_clip": 1.03803515, + "balance_loss_mlp": 1.02196717, + "epoch": 0.8299413798286487, + "flos": 22818681734400.0, + "grad_norm": 1.629212800491102, + "language_loss": 0.79214036, + "learning_rate": 2.9568189821895215e-07, + "loss": 0.81347561, + "num_input_tokens_seen": 297789440, + "step": 13804, + "time_per_iteration": 2.668266773223877 + }, + { + "auxiliary_loss_clip": 0.01109225, + "auxiliary_loss_mlp": 0.01032372, + "balance_loss_clip": 1.03797591, + "balance_loss_mlp": 1.0205344, + "epoch": 0.8300015030813167, + "flos": 29679199683840.0, + "grad_norm": 2.3697694156081157, + "language_loss": 0.72845703, + "learning_rate": 2.954781319115016e-07, + "loss": 0.74987304, + "num_input_tokens_seen": 297810425, + "step": 13805, + "time_per_iteration": 2.68404221534729 + }, + { + "auxiliary_loss_clip": 0.01102118, + "auxiliary_loss_mlp": 0.00771001, + "balance_loss_clip": 1.03904784, + "balance_loss_mlp": 1.00029325, + "epoch": 0.8300616263339846, + "flos": 19719483436800.0, + "grad_norm": 2.0648930657274462, + "language_loss": 0.77626657, + "learning_rate": 2.952744302396906e-07, + "loss": 0.79499781, + "num_input_tokens_seen": 297827680, + "step": 13806, + "time_per_iteration": 2.6478402614593506 + }, + { + "auxiliary_loss_clip": 0.0110212, + "auxiliary_loss_mlp": 0.01033184, + "balance_loss_clip": 1.03748512, + "balance_loss_mlp": 1.01954055, + "epoch": 0.8301217495866526, + "flos": 19901945548800.0, + "grad_norm": 1.8834407676447842, + "language_loss": 0.63916278, + "learning_rate": 2.950707932112444e-07, + "loss": 0.66051579, + "num_input_tokens_seen": 297848005, + "step": 13807, + "time_per_iteration": 2.6306519508361816 + }, + { + "auxiliary_loss_clip": 0.01097082, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.0383265, + "balance_loss_mlp": 1.01728976, + "epoch": 0.8301818728393207, + "flos": 19715784336000.0, + "grad_norm": 1.9692323669369614, + "language_loss": 0.72846484, + "learning_rate": 2.948672208338847e-07, + "loss": 0.74973214, + "num_input_tokens_seen": 297866730, + "step": 13808, + "time_per_iteration": 2.640733480453491 + }, + { + "auxiliary_loss_clip": 0.0109338, + "auxiliary_loss_mlp": 0.01046632, + "balance_loss_clip": 1.03866029, + "balance_loss_mlp": 1.03264272, + "epoch": 0.8302419960919886, + "flos": 28293658416000.0, + "grad_norm": 1.7739722668753906, + "language_loss": 0.66351604, + "learning_rate": 2.9466371311533046e-07, + "loss": 0.6849162, + "num_input_tokens_seen": 297886390, + "step": 13809, + "time_per_iteration": 2.751115322113037 + }, + { + "auxiliary_loss_clip": 0.011108, + "auxiliary_loss_mlp": 0.01024776, + "balance_loss_clip": 1.03813148, + "balance_loss_mlp": 1.01287341, + "epoch": 0.8303021193446566, + "flos": 18223444955520.0, + "grad_norm": 1.8449229056789198, + "language_loss": 0.74058008, + "learning_rate": 2.9446027006329896e-07, + "loss": 0.76193583, + "num_input_tokens_seen": 297905110, + "step": 13810, + "time_per_iteration": 2.506547451019287 + }, + { + "auxiliary_loss_clip": 0.01076467, + "auxiliary_loss_mlp": 0.01036006, + "balance_loss_clip": 1.03609502, + "balance_loss_mlp": 1.02471638, + "epoch": 0.8303622425973245, + "flos": 23111425578240.0, + "grad_norm": 1.5651865455038416, + "language_loss": 0.81083822, + "learning_rate": 2.94256891685505e-07, + "loss": 0.83196294, + "num_input_tokens_seen": 297925460, + "step": 13811, + "time_per_iteration": 2.7325217723846436 + }, + { + "auxiliary_loss_clip": 0.01076005, + "auxiliary_loss_mlp": 0.01045296, + "balance_loss_clip": 1.0357846, + "balance_loss_mlp": 1.03202796, + "epoch": 0.8304223658499925, + "flos": 19572860119680.0, + "grad_norm": 2.992999936529954, + "language_loss": 0.73513645, + "learning_rate": 2.9405357798966156e-07, + "loss": 0.75634944, + "num_input_tokens_seen": 297941760, + "step": 13812, + "time_per_iteration": 2.7724623680114746 + }, + { + "auxiliary_loss_clip": 0.01081692, + "auxiliary_loss_mlp": 0.01028975, + "balance_loss_clip": 1.03739822, + "balance_loss_mlp": 1.01693439, + "epoch": 0.8304824891026604, + "flos": 24426115269120.0, + "grad_norm": 1.8080417533170523, + "language_loss": 0.78173685, + "learning_rate": 2.9385032898347664e-07, + "loss": 0.80284357, + "num_input_tokens_seen": 297959745, + "step": 13813, + "time_per_iteration": 2.7371325492858887 + }, + { + "auxiliary_loss_clip": 0.01054685, + "auxiliary_loss_mlp": 0.00771129, + "balance_loss_clip": 1.03353238, + "balance_loss_mlp": 1.00019467, + "epoch": 0.8305426123553284, + "flos": 22381792611840.0, + "grad_norm": 1.8015570621783799, + "language_loss": 0.71141535, + "learning_rate": 2.93647144674658e-07, + "loss": 0.7296735, + "num_input_tokens_seen": 297977665, + "step": 13814, + "time_per_iteration": 2.8410873413085938 + }, + { + "auxiliary_loss_clip": 0.01117986, + "auxiliary_loss_mlp": 0.01044096, + "balance_loss_clip": 1.03891778, + "balance_loss_mlp": 1.02902818, + "epoch": 0.8306027356079964, + "flos": 14903575453440.0, + "grad_norm": 2.331626844380792, + "language_loss": 0.67776018, + "learning_rate": 2.9344402507091116e-07, + "loss": 0.69938099, + "num_input_tokens_seen": 297993525, + "step": 13815, + "time_per_iteration": 2.607855796813965 + }, + { + "auxiliary_loss_clip": 0.01097003, + "auxiliary_loss_mlp": 0.01033309, + "balance_loss_clip": 1.03770578, + "balance_loss_mlp": 1.02078068, + "epoch": 0.8306628588606644, + "flos": 19644573623040.0, + "grad_norm": 1.9297971278174, + "language_loss": 0.75907093, + "learning_rate": 2.9324097017993745e-07, + "loss": 0.78037405, + "num_input_tokens_seen": 298012920, + "step": 13816, + "time_per_iteration": 2.632202625274658 + }, + { + "auxiliary_loss_clip": 0.01074394, + "auxiliary_loss_mlp": 0.01036444, + "balance_loss_clip": 1.03376317, + "balance_loss_mlp": 1.02446318, + "epoch": 0.8307229821133323, + "flos": 24389737770240.0, + "grad_norm": 1.9005747270922144, + "language_loss": 0.81343293, + "learning_rate": 2.930379800094371e-07, + "loss": 0.83454132, + "num_input_tokens_seen": 298033310, + "step": 13817, + "time_per_iteration": 2.8131661415100098 + }, + { + "auxiliary_loss_clip": 0.01101146, + "auxiliary_loss_mlp": 0.01040878, + "balance_loss_clip": 1.03882217, + "balance_loss_mlp": 1.02748489, + "epoch": 0.8307831053660003, + "flos": 20996933702400.0, + "grad_norm": 1.505220062902958, + "language_loss": 0.78014338, + "learning_rate": 2.9283505456710875e-07, + "loss": 0.80156362, + "num_input_tokens_seen": 298053530, + "step": 13818, + "time_per_iteration": 2.6576030254364014 + }, + { + "auxiliary_loss_clip": 0.01093761, + "auxiliary_loss_mlp": 0.01035748, + "balance_loss_clip": 1.03938222, + "balance_loss_mlp": 1.02312958, + "epoch": 0.8308432286186682, + "flos": 21397301671680.0, + "grad_norm": 1.8020045024766413, + "language_loss": 0.819812, + "learning_rate": 2.926321938606453e-07, + "loss": 0.84110707, + "num_input_tokens_seen": 298069305, + "step": 13819, + "time_per_iteration": 2.6772990226745605 + }, + { + "auxiliary_loss_clip": 0.01020743, + "auxiliary_loss_mlp": 0.0100494, + "balance_loss_clip": 1.00830984, + "balance_loss_mlp": 1.00400436, + "epoch": 0.8309033518713362, + "flos": 62533656714240.0, + "grad_norm": 0.7602438257539984, + "language_loss": 0.56127542, + "learning_rate": 2.924293978977399e-07, + "loss": 0.58153224, + "num_input_tokens_seen": 298125830, + "step": 13820, + "time_per_iteration": 3.193361520767212 + }, + { + "auxiliary_loss_clip": 0.01095529, + "auxiliary_loss_mlp": 0.01025816, + "balance_loss_clip": 1.0352664, + "balance_loss_mlp": 1.01369286, + "epoch": 0.8309634751240043, + "flos": 16979104051200.0, + "grad_norm": 1.789990737596213, + "language_loss": 0.67907584, + "learning_rate": 2.922266666860831e-07, + "loss": 0.70028925, + "num_input_tokens_seen": 298142320, + "step": 13821, + "time_per_iteration": 2.661176919937134 + }, + { + "auxiliary_loss_clip": 0.01043861, + "auxiliary_loss_mlp": 0.01036485, + "balance_loss_clip": 1.029109, + "balance_loss_mlp": 1.02242458, + "epoch": 0.8310235983766722, + "flos": 22674464628480.0, + "grad_norm": 1.7649942540467223, + "language_loss": 0.69191265, + "learning_rate": 2.920240002333625e-07, + "loss": 0.7127161, + "num_input_tokens_seen": 298161845, + "step": 13822, + "time_per_iteration": 2.9704768657684326 + }, + { + "auxiliary_loss_clip": 0.01059895, + "auxiliary_loss_mlp": 0.01035644, + "balance_loss_clip": 1.03586471, + "balance_loss_mlp": 1.02335334, + "epoch": 0.8310837216293402, + "flos": 30811463176320.0, + "grad_norm": 1.6650310845720533, + "language_loss": 0.62025028, + "learning_rate": 2.918213985472631e-07, + "loss": 0.64120567, + "num_input_tokens_seen": 298184165, + "step": 13823, + "time_per_iteration": 2.8505992889404297 + }, + { + "auxiliary_loss_clip": 0.01009787, + "auxiliary_loss_mlp": 0.00999982, + "balance_loss_clip": 1.00688207, + "balance_loss_mlp": 0.9989447, + "epoch": 0.8311438448820081, + "flos": 71276074997760.0, + "grad_norm": 0.9240644196901294, + "language_loss": 0.61982203, + "learning_rate": 2.916188616354669e-07, + "loss": 0.63991976, + "num_input_tokens_seen": 298251720, + "step": 13824, + "time_per_iteration": 3.28657603263855 + }, + { + "auxiliary_loss_clip": 0.01110797, + "auxiliary_loss_mlp": 0.01030392, + "balance_loss_clip": 1.03885794, + "balance_loss_mlp": 1.01815486, + "epoch": 0.8312039681346761, + "flos": 20887082933760.0, + "grad_norm": 1.7761437257032648, + "language_loss": 0.73975819, + "learning_rate": 2.914163895056552e-07, + "loss": 0.76117009, + "num_input_tokens_seen": 298271910, + "step": 13825, + "time_per_iteration": 2.6168012619018555 + }, + { + "auxiliary_loss_clip": 0.01060453, + "auxiliary_loss_mlp": 0.0077103, + "balance_loss_clip": 1.03461838, + "balance_loss_mlp": 1.00020123, + "epoch": 0.831264091387344, + "flos": 17017528625280.0, + "grad_norm": 1.9546255724089596, + "language_loss": 0.80497503, + "learning_rate": 2.9121398216550486e-07, + "loss": 0.82328987, + "num_input_tokens_seen": 298288105, + "step": 13826, + "time_per_iteration": 2.6546146869659424 + }, + { + "auxiliary_loss_clip": 0.01110653, + "auxiliary_loss_mlp": 0.01033321, + "balance_loss_clip": 1.03793025, + "balance_loss_mlp": 1.02049446, + "epoch": 0.831324214640012, + "flos": 24419578993920.0, + "grad_norm": 1.5280583431221222, + "language_loss": 0.67963809, + "learning_rate": 2.910116396226914e-07, + "loss": 0.70107782, + "num_input_tokens_seen": 298307600, + "step": 13827, + "time_per_iteration": 2.5905277729034424 + }, + { + "auxiliary_loss_clip": 0.01098107, + "auxiliary_loss_mlp": 0.01030102, + "balance_loss_clip": 1.03539395, + "balance_loss_mlp": 1.01871204, + "epoch": 0.83138433789268, + "flos": 13545576938880.0, + "grad_norm": 1.973600288976441, + "language_loss": 0.74098945, + "learning_rate": 2.9080936188488834e-07, + "loss": 0.76227152, + "num_input_tokens_seen": 298323055, + "step": 13828, + "time_per_iteration": 2.6251087188720703 + }, + { + "auxiliary_loss_clip": 0.01073913, + "auxiliary_loss_mlp": 0.01033531, + "balance_loss_clip": 1.03275013, + "balance_loss_mlp": 1.0203644, + "epoch": 0.831444461145348, + "flos": 44492386561920.0, + "grad_norm": 2.63988910993405, + "language_loss": 0.67159581, + "learning_rate": 2.906071489597657e-07, + "loss": 0.69267023, + "num_input_tokens_seen": 298346950, + "step": 13829, + "time_per_iteration": 3.220686435699463 + }, + { + "auxiliary_loss_clip": 0.01085933, + "auxiliary_loss_mlp": 0.01031507, + "balance_loss_clip": 1.03704345, + "balance_loss_mlp": 1.01854897, + "epoch": 0.8315045843980159, + "flos": 22705024124160.0, + "grad_norm": 1.6177335267963915, + "language_loss": 0.82913047, + "learning_rate": 2.9040500085499054e-07, + "loss": 0.8503049, + "num_input_tokens_seen": 298366315, + "step": 13830, + "time_per_iteration": 6.03197717666626 + }, + { + "auxiliary_loss_clip": 0.01097952, + "auxiliary_loss_mlp": 0.01034258, + "balance_loss_clip": 1.03697491, + "balance_loss_mlp": 1.02208698, + "epoch": 0.8315647076506839, + "flos": 16873491087360.0, + "grad_norm": 2.1932847563543247, + "language_loss": 0.73822612, + "learning_rate": 2.9020291757822925e-07, + "loss": 0.75954819, + "num_input_tokens_seen": 298385185, + "step": 13831, + "time_per_iteration": 2.665022611618042 + }, + { + "auxiliary_loss_clip": 0.01111975, + "auxiliary_loss_mlp": 0.01033792, + "balance_loss_clip": 1.03963041, + "balance_loss_mlp": 1.02083445, + "epoch": 0.8316248309033518, + "flos": 13808730954240.0, + "grad_norm": 1.6071595034037367, + "language_loss": 0.7129162, + "learning_rate": 2.9000089913714523e-07, + "loss": 0.73437387, + "num_input_tokens_seen": 298402335, + "step": 13832, + "time_per_iteration": 2.647451400756836 + }, + { + "auxiliary_loss_clip": 0.0108072, + "auxiliary_loss_mlp": 0.0103351, + "balance_loss_clip": 1.03389788, + "balance_loss_mlp": 1.02102256, + "epoch": 0.8316849541560198, + "flos": 23512511819520.0, + "grad_norm": 1.6195807094532317, + "language_loss": 0.84269989, + "learning_rate": 2.897989455393979e-07, + "loss": 0.86384219, + "num_input_tokens_seen": 298423370, + "step": 13833, + "time_per_iteration": 4.226484298706055 + }, + { + "auxiliary_loss_clip": 0.010921, + "auxiliary_loss_mlp": 0.01036257, + "balance_loss_clip": 1.03806973, + "balance_loss_mlp": 1.02329278, + "epoch": 0.8317450774086879, + "flos": 23771356202880.0, + "grad_norm": 2.0307476796649917, + "language_loss": 0.76316315, + "learning_rate": 2.8959705679264625e-07, + "loss": 0.78444666, + "num_input_tokens_seen": 298444835, + "step": 13834, + "time_per_iteration": 2.814335584640503 + }, + { + "auxiliary_loss_clip": 0.01105662, + "auxiliary_loss_mlp": 0.00769799, + "balance_loss_clip": 1.03617358, + "balance_loss_mlp": 1.00016499, + "epoch": 0.8318052006613558, + "flos": 16215535710720.0, + "grad_norm": 1.877967943064554, + "language_loss": 0.79689634, + "learning_rate": 2.893952329045459e-07, + "loss": 0.81565094, + "num_input_tokens_seen": 298461845, + "step": 13835, + "time_per_iteration": 4.108726978302002 + }, + { + "auxiliary_loss_clip": 0.01103663, + "auxiliary_loss_mlp": 0.01037899, + "balance_loss_clip": 1.03955829, + "balance_loss_mlp": 1.02354026, + "epoch": 0.8318653239140238, + "flos": 19974556892160.0, + "grad_norm": 1.8066423967351954, + "language_loss": 0.80604517, + "learning_rate": 2.8919347388274905e-07, + "loss": 0.82746077, + "num_input_tokens_seen": 298479095, + "step": 13836, + "time_per_iteration": 2.624318838119507 + }, + { + "auxiliary_loss_clip": 0.01088523, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.03795898, + "balance_loss_mlp": 1.01932681, + "epoch": 0.8319254471666917, + "flos": 17704714694400.0, + "grad_norm": 1.9385404559381145, + "language_loss": 0.77292264, + "learning_rate": 2.8899177973490727e-07, + "loss": 0.79412162, + "num_input_tokens_seen": 298494475, + "step": 13837, + "time_per_iteration": 2.662458896636963 + }, + { + "auxiliary_loss_clip": 0.01114063, + "auxiliary_loss_mlp": 0.01030366, + "balance_loss_clip": 1.03759873, + "balance_loss_mlp": 1.01654339, + "epoch": 0.8319855704193597, + "flos": 19536554448000.0, + "grad_norm": 1.6836751176353142, + "language_loss": 0.83425492, + "learning_rate": 2.887901504686685e-07, + "loss": 0.85569924, + "num_input_tokens_seen": 298513185, + "step": 13838, + "time_per_iteration": 2.533836603164673 + }, + { + "auxiliary_loss_clip": 0.01081066, + "auxiliary_loss_mlp": 0.01035142, + "balance_loss_clip": 1.03288436, + "balance_loss_mlp": 1.02131331, + "epoch": 0.8320456936720276, + "flos": 21178067011200.0, + "grad_norm": 2.4451044719374613, + "language_loss": 0.74250424, + "learning_rate": 2.885885860916795e-07, + "loss": 0.76366633, + "num_input_tokens_seen": 298531885, + "step": 13839, + "time_per_iteration": 2.6616058349609375 + }, + { + "auxiliary_loss_clip": 0.01096452, + "auxiliary_loss_mlp": 0.01033098, + "balance_loss_clip": 1.03666425, + "balance_loss_mlp": 1.02004433, + "epoch": 0.8321058169246957, + "flos": 33250874503680.0, + "grad_norm": 1.4805288033952766, + "language_loss": 0.67812371, + "learning_rate": 2.8838708661158253e-07, + "loss": 0.69941914, + "num_input_tokens_seen": 298554905, + "step": 13840, + "time_per_iteration": 2.735732078552246 + }, + { + "auxiliary_loss_clip": 0.01054107, + "auxiliary_loss_mlp": 0.01039263, + "balance_loss_clip": 1.03295565, + "balance_loss_mlp": 1.02507687, + "epoch": 0.8321659401773636, + "flos": 14208129256320.0, + "grad_norm": 1.9499790502126348, + "language_loss": 0.79567152, + "learning_rate": 2.8818565203601843e-07, + "loss": 0.81660521, + "num_input_tokens_seen": 298571185, + "step": 13841, + "time_per_iteration": 2.6811771392822266 + }, + { + "auxiliary_loss_clip": 0.0106104, + "auxiliary_loss_mlp": 0.01030475, + "balance_loss_clip": 1.03763831, + "balance_loss_mlp": 1.0179522, + "epoch": 0.8322260634300316, + "flos": 15158253859200.0, + "grad_norm": 1.7496340078851804, + "language_loss": 0.68060827, + "learning_rate": 2.879842823726262e-07, + "loss": 0.70152342, + "num_input_tokens_seen": 298588505, + "step": 13842, + "time_per_iteration": 2.8322203159332275 + }, + { + "auxiliary_loss_clip": 0.0108993, + "auxiliary_loss_mlp": 0.0103225, + "balance_loss_clip": 1.03790903, + "balance_loss_mlp": 1.01888657, + "epoch": 0.8322861866826995, + "flos": 25300827267840.0, + "grad_norm": 1.5488311429576032, + "language_loss": 0.73103952, + "learning_rate": 2.8778297762904124e-07, + "loss": 0.75226128, + "num_input_tokens_seen": 298609295, + "step": 13843, + "time_per_iteration": 2.886599540710449 + }, + { + "auxiliary_loss_clip": 0.01077287, + "auxiliary_loss_mlp": 0.01027519, + "balance_loss_clip": 1.03611994, + "balance_loss_mlp": 1.01505589, + "epoch": 0.8323463099353675, + "flos": 17019360218880.0, + "grad_norm": 1.8098235185512692, + "language_loss": 0.77365232, + "learning_rate": 2.875817378128975e-07, + "loss": 0.79470038, + "num_input_tokens_seen": 298625765, + "step": 13844, + "time_per_iteration": 2.7069430351257324 + }, + { + "auxiliary_loss_clip": 0.01007928, + "auxiliary_loss_mlp": 0.01001333, + "balance_loss_clip": 1.00663698, + "balance_loss_mlp": 1.00036097, + "epoch": 0.8324064331880354, + "flos": 55607889709440.0, + "grad_norm": 0.7847120872391591, + "language_loss": 0.55208087, + "learning_rate": 2.8738056293182624e-07, + "loss": 0.57217348, + "num_input_tokens_seen": 298683005, + "step": 13845, + "time_per_iteration": 3.0783231258392334 + }, + { + "auxiliary_loss_clip": 0.01102275, + "auxiliary_loss_mlp": 0.01044708, + "balance_loss_clip": 1.03761721, + "balance_loss_mlp": 1.0314219, + "epoch": 0.8324665564407034, + "flos": 26138623063680.0, + "grad_norm": 1.6009211364700722, + "language_loss": 0.75140607, + "learning_rate": 2.871794529934555e-07, + "loss": 0.77287591, + "num_input_tokens_seen": 298703060, + "step": 13846, + "time_per_iteration": 2.676182508468628 + }, + { + "auxiliary_loss_clip": 0.01056649, + "auxiliary_loss_mlp": 0.01033367, + "balance_loss_clip": 1.03160548, + "balance_loss_mlp": 1.01809657, + "epoch": 0.8325266796933715, + "flos": 22049187649920.0, + "grad_norm": 1.6388328541738983, + "language_loss": 0.78896999, + "learning_rate": 2.8697840800541115e-07, + "loss": 0.80987018, + "num_input_tokens_seen": 298721765, + "step": 13847, + "time_per_iteration": 2.7297866344451904 + }, + { + "auxiliary_loss_clip": 0.01052928, + "auxiliary_loss_mlp": 0.01030708, + "balance_loss_clip": 1.0369612, + "balance_loss_mlp": 1.01901376, + "epoch": 0.8325868029460394, + "flos": 22816634659200.0, + "grad_norm": 2.65968337371303, + "language_loss": 0.74193573, + "learning_rate": 2.867774279753175e-07, + "loss": 0.76277208, + "num_input_tokens_seen": 298740825, + "step": 13848, + "time_per_iteration": 2.740797758102417 + }, + { + "auxiliary_loss_clip": 0.0110005, + "auxiliary_loss_mlp": 0.01027688, + "balance_loss_clip": 1.03858709, + "balance_loss_mlp": 1.0153321, + "epoch": 0.8326469261987074, + "flos": 14757454926720.0, + "grad_norm": 1.7578930460196398, + "language_loss": 0.63396668, + "learning_rate": 2.8657651291079554e-07, + "loss": 0.65524411, + "num_input_tokens_seen": 298758515, + "step": 13849, + "time_per_iteration": 2.5713930130004883 + }, + { + "auxiliary_loss_clip": 0.01084755, + "auxiliary_loss_mlp": 0.01033475, + "balance_loss_clip": 1.0322125, + "balance_loss_mlp": 1.0203917, + "epoch": 0.8327070494513753, + "flos": 22926126291840.0, + "grad_norm": 2.0835174192024533, + "language_loss": 0.79707754, + "learning_rate": 2.863756628194638e-07, + "loss": 0.81825984, + "num_input_tokens_seen": 298776375, + "step": 13850, + "time_per_iteration": 2.6037027835845947 + }, + { + "auxiliary_loss_clip": 0.0106844, + "auxiliary_loss_mlp": 0.01032788, + "balance_loss_clip": 1.03266466, + "balance_loss_mlp": 1.02161264, + "epoch": 0.8327671727040433, + "flos": 20665334321280.0, + "grad_norm": 1.589654785001457, + "language_loss": 0.7825923, + "learning_rate": 2.8617487770893877e-07, + "loss": 0.80360448, + "num_input_tokens_seen": 298795135, + "step": 13851, + "time_per_iteration": 2.669689416885376 + }, + { + "auxiliary_loss_clip": 0.01021321, + "auxiliary_loss_mlp": 0.01003693, + "balance_loss_clip": 1.00839996, + "balance_loss_mlp": 1.00260222, + "epoch": 0.8328272959567112, + "flos": 56060760384000.0, + "grad_norm": 0.7603079247900993, + "language_loss": 0.55759335, + "learning_rate": 2.859741575868344e-07, + "loss": 0.57784349, + "num_input_tokens_seen": 298855475, + "step": 13852, + "time_per_iteration": 3.171971321105957 + }, + { + "auxiliary_loss_clip": 0.01096762, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.03631687, + "balance_loss_mlp": 1.01785994, + "epoch": 0.8328874192093793, + "flos": 32303084284800.0, + "grad_norm": 1.490710672408854, + "language_loss": 0.67185426, + "learning_rate": 2.8577350246076125e-07, + "loss": 0.69312215, + "num_input_tokens_seen": 298875875, + "step": 13853, + "time_per_iteration": 2.705221176147461 + }, + { + "auxiliary_loss_clip": 0.01082363, + "auxiliary_loss_mlp": 0.01035053, + "balance_loss_clip": 1.03677762, + "balance_loss_mlp": 1.02310205, + "epoch": 0.8329475424620472, + "flos": 23512691387520.0, + "grad_norm": 1.8131340833394713, + "language_loss": 0.7809993, + "learning_rate": 2.855729123383286e-07, + "loss": 0.80217344, + "num_input_tokens_seen": 298895950, + "step": 13854, + "time_per_iteration": 2.6784071922302246 + }, + { + "auxiliary_loss_clip": 0.01029094, + "auxiliary_loss_mlp": 0.00999528, + "balance_loss_clip": 1.00678289, + "balance_loss_mlp": 0.99855083, + "epoch": 0.8330076657147152, + "flos": 67840680378240.0, + "grad_norm": 0.7605812264158395, + "language_loss": 0.58664268, + "learning_rate": 2.8537238722714295e-07, + "loss": 0.60692888, + "num_input_tokens_seen": 298955770, + "step": 13855, + "time_per_iteration": 3.0027222633361816 + }, + { + "auxiliary_loss_clip": 0.01098543, + "auxiliary_loss_mlp": 0.01027235, + "balance_loss_clip": 1.03760314, + "balance_loss_mlp": 1.01511717, + "epoch": 0.8330677889673831, + "flos": 22892801448960.0, + "grad_norm": 1.6486782153606043, + "language_loss": 0.71799862, + "learning_rate": 2.8517192713480853e-07, + "loss": 0.73925638, + "num_input_tokens_seen": 298976545, + "step": 13856, + "time_per_iteration": 2.6572425365448 + }, + { + "auxiliary_loss_clip": 0.01098496, + "auxiliary_loss_mlp": 0.01031016, + "balance_loss_clip": 1.03601694, + "balance_loss_mlp": 1.01861823, + "epoch": 0.8331279122200511, + "flos": 27345042184320.0, + "grad_norm": 1.530155897456529, + "language_loss": 0.75503182, + "learning_rate": 2.8497153206892677e-07, + "loss": 0.77632695, + "num_input_tokens_seen": 298996750, + "step": 13857, + "time_per_iteration": 2.709289073944092 + }, + { + "auxiliary_loss_clip": 0.0106038, + "auxiliary_loss_mlp": 0.01024239, + "balance_loss_clip": 1.0359571, + "balance_loss_mlp": 1.01319456, + "epoch": 0.833188035472719, + "flos": 19938179393280.0, + "grad_norm": 1.5089034219469146, + "language_loss": 0.7372514, + "learning_rate": 2.847712020370958e-07, + "loss": 0.75809759, + "num_input_tokens_seen": 299014895, + "step": 13858, + "time_per_iteration": 2.771655321121216 + }, + { + "auxiliary_loss_clip": 0.01112772, + "auxiliary_loss_mlp": 0.01033358, + "balance_loss_clip": 1.03744984, + "balance_loss_mlp": 1.01981604, + "epoch": 0.833248158725387, + "flos": 15232624968960.0, + "grad_norm": 1.9106405712672399, + "language_loss": 0.73376054, + "learning_rate": 2.8457093704691316e-07, + "loss": 0.75522184, + "num_input_tokens_seen": 299032855, + "step": 13859, + "time_per_iteration": 2.5690972805023193 + }, + { + "auxiliary_loss_clip": 0.01093273, + "auxiliary_loss_mlp": 0.01025326, + "balance_loss_clip": 1.03597152, + "balance_loss_mlp": 1.01405454, + "epoch": 0.8333082819780551, + "flos": 24535535074560.0, + "grad_norm": 1.588476401883647, + "language_loss": 0.79069161, + "learning_rate": 2.8437073710597205e-07, + "loss": 0.81187761, + "num_input_tokens_seen": 299052055, + "step": 13860, + "time_per_iteration": 2.687077283859253 + }, + { + "auxiliary_loss_clip": 0.0103731, + "auxiliary_loss_mlp": 0.01031939, + "balance_loss_clip": 1.03546524, + "balance_loss_mlp": 1.01915944, + "epoch": 0.833368405230723, + "flos": 31467407391360.0, + "grad_norm": 1.5993206787679535, + "language_loss": 0.8204006, + "learning_rate": 2.841706022218644e-07, + "loss": 0.84109306, + "num_input_tokens_seen": 299075285, + "step": 13861, + "time_per_iteration": 3.007451295852661 + }, + { + "auxiliary_loss_clip": 0.01112118, + "auxiliary_loss_mlp": 0.0103305, + "balance_loss_clip": 1.03988099, + "balance_loss_mlp": 1.02040219, + "epoch": 0.833428528483391, + "flos": 14902713527040.0, + "grad_norm": 1.7332412626678735, + "language_loss": 0.78811872, + "learning_rate": 2.839705324021806e-07, + "loss": 0.80957043, + "num_input_tokens_seen": 299092520, + "step": 13862, + "time_per_iteration": 2.7910513877868652 + }, + { + "auxiliary_loss_clip": 0.01099183, + "auxiliary_loss_mlp": 0.01035326, + "balance_loss_clip": 1.03552341, + "balance_loss_mlp": 1.02280307, + "epoch": 0.8334886517360589, + "flos": 22199833290240.0, + "grad_norm": 1.8555893155682146, + "language_loss": 0.75250399, + "learning_rate": 2.83770527654505e-07, + "loss": 0.77384913, + "num_input_tokens_seen": 299109450, + "step": 13863, + "time_per_iteration": 2.623645782470703 + }, + { + "auxiliary_loss_clip": 0.01049642, + "auxiliary_loss_mlp": 0.00771776, + "balance_loss_clip": 1.03239465, + "balance_loss_mlp": 1.00020719, + "epoch": 0.8335487749887269, + "flos": 30372562892160.0, + "grad_norm": 1.9984651067343642, + "language_loss": 0.75399351, + "learning_rate": 2.835705879864232e-07, + "loss": 0.77220774, + "num_input_tokens_seen": 299129540, + "step": 13864, + "time_per_iteration": 2.8347368240356445 + }, + { + "auxiliary_loss_clip": 0.01086549, + "auxiliary_loss_mlp": 0.01034445, + "balance_loss_clip": 1.03666651, + "balance_loss_mlp": 1.02171326, + "epoch": 0.8336088982413948, + "flos": 24681152810880.0, + "grad_norm": 1.9805001513042173, + "language_loss": 0.69349921, + "learning_rate": 2.833707134055168e-07, + "loss": 0.71470916, + "num_input_tokens_seen": 299148670, + "step": 13865, + "time_per_iteration": 2.7639873027801514 + }, + { + "auxiliary_loss_clip": 0.01099811, + "auxiliary_loss_mlp": 0.01032523, + "balance_loss_clip": 1.03819227, + "balance_loss_mlp": 1.01979089, + "epoch": 0.8336690214940629, + "flos": 38177207873280.0, + "grad_norm": 5.1442614378118625, + "language_loss": 0.75333238, + "learning_rate": 2.831709039193653e-07, + "loss": 0.7746557, + "num_input_tokens_seen": 299169330, + "step": 13866, + "time_per_iteration": 2.777001142501831 + }, + { + "auxiliary_loss_clip": 0.01008617, + "auxiliary_loss_mlp": 0.01009028, + "balance_loss_clip": 1.00722134, + "balance_loss_mlp": 1.00765133, + "epoch": 0.8337291447467308, + "flos": 55565119589760.0, + "grad_norm": 0.870724565539336, + "language_loss": 0.63078576, + "learning_rate": 2.8297115953554465e-07, + "loss": 0.65096223, + "num_input_tokens_seen": 299220980, + "step": 13867, + "time_per_iteration": 3.081568956375122 + }, + { + "auxiliary_loss_clip": 0.01083895, + "auxiliary_loss_mlp": 0.01028872, + "balance_loss_clip": 1.03740549, + "balance_loss_mlp": 1.01767826, + "epoch": 0.8337892679993988, + "flos": 24133550993280.0, + "grad_norm": 1.7649595884410185, + "language_loss": 0.71936655, + "learning_rate": 2.827714802616301e-07, + "loss": 0.74049425, + "num_input_tokens_seen": 299240130, + "step": 13868, + "time_per_iteration": 2.652420997619629 + }, + { + "auxiliary_loss_clip": 0.0108564, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.03956676, + "balance_loss_mlp": 1.02057862, + "epoch": 0.8338493912520667, + "flos": 28183915388160.0, + "grad_norm": 1.3896296545352977, + "language_loss": 0.80381906, + "learning_rate": 2.8257186610519325e-07, + "loss": 0.82500416, + "num_input_tokens_seen": 299260705, + "step": 13869, + "time_per_iteration": 4.254533529281616 + }, + { + "auxiliary_loss_clip": 0.01100488, + "auxiliary_loss_mlp": 0.01032723, + "balance_loss_clip": 1.03849924, + "balance_loss_mlp": 1.02023017, + "epoch": 0.8339095145047347, + "flos": 22158356060160.0, + "grad_norm": 1.5584172404732568, + "language_loss": 0.82560688, + "learning_rate": 2.823723170738028e-07, + "loss": 0.84693897, + "num_input_tokens_seen": 299278925, + "step": 13870, + "time_per_iteration": 4.364636421203613 + }, + { + "auxiliary_loss_clip": 0.01078884, + "auxiliary_loss_mlp": 0.0102765, + "balance_loss_clip": 1.03682613, + "balance_loss_mlp": 1.01443601, + "epoch": 0.8339696377574026, + "flos": 17307112072320.0, + "grad_norm": 2.7050320038401146, + "language_loss": 0.7043367, + "learning_rate": 2.821728331750264e-07, + "loss": 0.72540206, + "num_input_tokens_seen": 299291580, + "step": 13871, + "time_per_iteration": 2.650563955307007 + }, + { + "auxiliary_loss_clip": 0.01097514, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.03766418, + "balance_loss_mlp": 1.02192545, + "epoch": 0.8340297610100706, + "flos": 20668351063680.0, + "grad_norm": 1.6604394599481103, + "language_loss": 0.6898998, + "learning_rate": 2.8197341441642853e-07, + "loss": 0.7112115, + "num_input_tokens_seen": 299310385, + "step": 13872, + "time_per_iteration": 4.172610759735107 + }, + { + "auxiliary_loss_clip": 0.01086882, + "auxiliary_loss_mlp": 0.01026823, + "balance_loss_clip": 1.03666329, + "balance_loss_mlp": 1.01506281, + "epoch": 0.8340898842627387, + "flos": 20515442866560.0, + "grad_norm": 1.969935634979257, + "language_loss": 0.73773992, + "learning_rate": 2.817740608055712e-07, + "loss": 0.75887698, + "num_input_tokens_seen": 299327660, + "step": 13873, + "time_per_iteration": 2.7069506645202637 + }, + { + "auxiliary_loss_clip": 0.01087674, + "auxiliary_loss_mlp": 0.01035501, + "balance_loss_clip": 1.03668487, + "balance_loss_mlp": 1.02100515, + "epoch": 0.8341500075154066, + "flos": 21425850005760.0, + "grad_norm": 2.086638333931779, + "language_loss": 0.75528133, + "learning_rate": 2.81574772350013e-07, + "loss": 0.7765131, + "num_input_tokens_seen": 299343685, + "step": 13874, + "time_per_iteration": 4.401844263076782 + }, + { + "auxiliary_loss_clip": 0.0108051, + "auxiliary_loss_mlp": 0.01028815, + "balance_loss_clip": 1.0355829, + "balance_loss_mlp": 1.01691747, + "epoch": 0.8342101307680746, + "flos": 22090988102400.0, + "grad_norm": 2.2988326749129766, + "language_loss": 0.66232169, + "learning_rate": 2.813755490573118e-07, + "loss": 0.68341494, + "num_input_tokens_seen": 299363305, + "step": 13875, + "time_per_iteration": 2.7391769886016846 + }, + { + "auxiliary_loss_clip": 0.010648, + "auxiliary_loss_mlp": 0.01036714, + "balance_loss_clip": 1.03338897, + "balance_loss_mlp": 1.02434039, + "epoch": 0.8342702540207425, + "flos": 21871466133120.0, + "grad_norm": 1.700714112258655, + "language_loss": 0.79729408, + "learning_rate": 2.8117639093502243e-07, + "loss": 0.81830925, + "num_input_tokens_seen": 299382630, + "step": 13876, + "time_per_iteration": 2.8024299144744873 + }, + { + "auxiliary_loss_clip": 0.01093328, + "auxiliary_loss_mlp": 0.01038156, + "balance_loss_clip": 1.03557479, + "balance_loss_mlp": 1.02462614, + "epoch": 0.8343303772734105, + "flos": 22528487756160.0, + "grad_norm": 1.934148297226032, + "language_loss": 0.87182283, + "learning_rate": 2.8097729799069615e-07, + "loss": 0.89313757, + "num_input_tokens_seen": 299402385, + "step": 13877, + "time_per_iteration": 2.652780055999756 + }, + { + "auxiliary_loss_clip": 0.01064054, + "auxiliary_loss_mlp": 0.01029724, + "balance_loss_clip": 1.03309846, + "balance_loss_mlp": 1.01811349, + "epoch": 0.8343905005260784, + "flos": 14939773384320.0, + "grad_norm": 2.0462356502158445, + "language_loss": 0.69456965, + "learning_rate": 2.807782702318828e-07, + "loss": 0.71550739, + "num_input_tokens_seen": 299419820, + "step": 13878, + "time_per_iteration": 2.642768144607544 + }, + { + "auxiliary_loss_clip": 0.01084966, + "auxiliary_loss_mlp": 0.01028475, + "balance_loss_clip": 1.03594303, + "balance_loss_mlp": 1.01660752, + "epoch": 0.8344506237787465, + "flos": 15012456554880.0, + "grad_norm": 2.2290576221184537, + "language_loss": 0.790878, + "learning_rate": 2.805793076661309e-07, + "loss": 0.81201237, + "num_input_tokens_seen": 299436265, + "step": 13879, + "time_per_iteration": 2.6227519512176514 + }, + { + "auxiliary_loss_clip": 0.01061568, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.03857195, + "balance_loss_mlp": 1.02046072, + "epoch": 0.8345107470314144, + "flos": 17560389847680.0, + "grad_norm": 2.053274894813819, + "language_loss": 0.8324911, + "learning_rate": 2.803804103009828e-07, + "loss": 0.85342157, + "num_input_tokens_seen": 299451660, + "step": 13880, + "time_per_iteration": 2.7081100940704346 + }, + { + "auxiliary_loss_clip": 0.01089609, + "auxiliary_loss_mlp": 0.0102994, + "balance_loss_clip": 1.035254, + "balance_loss_mlp": 1.01767302, + "epoch": 0.8345708702840824, + "flos": 25187277398400.0, + "grad_norm": 1.577577271354365, + "language_loss": 0.78032011, + "learning_rate": 2.80181578143982e-07, + "loss": 0.80151558, + "num_input_tokens_seen": 299472070, + "step": 13881, + "time_per_iteration": 2.672635793685913 + }, + { + "auxiliary_loss_clip": 0.010645, + "auxiliary_loss_mlp": 0.01024591, + "balance_loss_clip": 1.03461313, + "balance_loss_mlp": 1.01385057, + "epoch": 0.8346309935367503, + "flos": 15083559527040.0, + "grad_norm": 2.2926708400629137, + "language_loss": 0.78564227, + "learning_rate": 2.7998281120266807e-07, + "loss": 0.80653316, + "num_input_tokens_seen": 299486725, + "step": 13882, + "time_per_iteration": 2.6480295658111572 + }, + { + "auxiliary_loss_clip": 0.01070114, + "auxiliary_loss_mlp": 0.01053336, + "balance_loss_clip": 1.03278971, + "balance_loss_mlp": 1.03948951, + "epoch": 0.8346911167894183, + "flos": 22930615491840.0, + "grad_norm": 1.6147247688158133, + "language_loss": 0.80761689, + "learning_rate": 2.79784109484579e-07, + "loss": 0.82885134, + "num_input_tokens_seen": 299505435, + "step": 13883, + "time_per_iteration": 2.6793839931488037 + }, + { + "auxiliary_loss_clip": 0.01096684, + "auxiliary_loss_mlp": 0.01036312, + "balance_loss_clip": 1.03590465, + "balance_loss_mlp": 1.02373528, + "epoch": 0.8347512400420862, + "flos": 20193037367040.0, + "grad_norm": 4.685073844907577, + "language_loss": 0.74089235, + "learning_rate": 2.795854729972482e-07, + "loss": 0.76222229, + "num_input_tokens_seen": 299523555, + "step": 13884, + "time_per_iteration": 2.604556083679199 + }, + { + "auxiliary_loss_clip": 0.01095519, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.03934622, + "balance_loss_mlp": 1.02086711, + "epoch": 0.8348113632947542, + "flos": 25954832148480.0, + "grad_norm": 1.6937955935304687, + "language_loss": 0.7016691, + "learning_rate": 2.7938690174820913e-07, + "loss": 0.72297329, + "num_input_tokens_seen": 299541660, + "step": 13885, + "time_per_iteration": 2.6773464679718018 + }, + { + "auxiliary_loss_clip": 0.01077954, + "auxiliary_loss_mlp": 0.01033472, + "balance_loss_clip": 1.03690195, + "balance_loss_mlp": 1.02092576, + "epoch": 0.8348714865474223, + "flos": 34204554552960.0, + "grad_norm": 1.8731982804534555, + "language_loss": 0.6992318, + "learning_rate": 2.791883957449912e-07, + "loss": 0.72034615, + "num_input_tokens_seen": 299562465, + "step": 13886, + "time_per_iteration": 2.8285069465637207 + }, + { + "auxiliary_loss_clip": 0.01073957, + "auxiliary_loss_mlp": 0.01033794, + "balance_loss_clip": 1.03586638, + "balance_loss_mlp": 1.01972771, + "epoch": 0.8349316098000902, + "flos": 24390132819840.0, + "grad_norm": 2.5697448718102414, + "language_loss": 0.79508579, + "learning_rate": 2.7898995499512134e-07, + "loss": 0.81616336, + "num_input_tokens_seen": 299582700, + "step": 13887, + "time_per_iteration": 2.7178754806518555 + }, + { + "auxiliary_loss_clip": 0.01092328, + "auxiliary_loss_mlp": 0.00771043, + "balance_loss_clip": 1.03849149, + "balance_loss_mlp": 1.00030017, + "epoch": 0.8349917330527582, + "flos": 23032744836480.0, + "grad_norm": 2.693894693314375, + "language_loss": 0.64530712, + "learning_rate": 2.7879157950612467e-07, + "loss": 0.66394079, + "num_input_tokens_seen": 299600310, + "step": 13888, + "time_per_iteration": 2.6735687255859375 + }, + { + "auxiliary_loss_clip": 0.01088596, + "auxiliary_loss_mlp": 0.01028671, + "balance_loss_clip": 1.03663945, + "balance_loss_mlp": 1.01663125, + "epoch": 0.8350518563054261, + "flos": 13625873792640.0, + "grad_norm": 2.0620816016550436, + "language_loss": 0.66669202, + "learning_rate": 2.785932692855244e-07, + "loss": 0.68786466, + "num_input_tokens_seen": 299617025, + "step": 13889, + "time_per_iteration": 2.680638551712036 + }, + { + "auxiliary_loss_clip": 0.01090008, + "auxiliary_loss_mlp": 0.01028757, + "balance_loss_clip": 1.03369141, + "balance_loss_mlp": 1.01666367, + "epoch": 0.8351119795580941, + "flos": 21579799697280.0, + "grad_norm": 2.20971990347348, + "language_loss": 0.6832096, + "learning_rate": 2.783950243408399e-07, + "loss": 0.70439726, + "num_input_tokens_seen": 299633050, + "step": 13890, + "time_per_iteration": 2.627889394760132 + }, + { + "auxiliary_loss_clip": 0.01088958, + "auxiliary_loss_mlp": 0.01036104, + "balance_loss_clip": 1.03766465, + "balance_loss_mlp": 1.02320004, + "epoch": 0.835172102810762, + "flos": 20038297576320.0, + "grad_norm": 2.518146173573676, + "language_loss": 0.59095812, + "learning_rate": 2.7819684467958817e-07, + "loss": 0.61220872, + "num_input_tokens_seen": 299646445, + "step": 13891, + "time_per_iteration": 2.7173044681549072 + }, + { + "auxiliary_loss_clip": 0.01099806, + "auxiliary_loss_mlp": 0.01029991, + "balance_loss_clip": 1.03822279, + "balance_loss_mlp": 1.01823068, + "epoch": 0.8352322260634301, + "flos": 25111577485440.0, + "grad_norm": 1.6267311876614727, + "language_loss": 0.71812761, + "learning_rate": 2.779987303092846e-07, + "loss": 0.7394256, + "num_input_tokens_seen": 299662665, + "step": 13892, + "time_per_iteration": 2.662322998046875 + }, + { + "auxiliary_loss_clip": 0.01106347, + "auxiliary_loss_mlp": 0.01034997, + "balance_loss_clip": 1.03654015, + "balance_loss_mlp": 1.02241993, + "epoch": 0.835292349316098, + "flos": 24863758577280.0, + "grad_norm": 1.986327047613025, + "language_loss": 0.65929645, + "learning_rate": 2.7780068123744207e-07, + "loss": 0.68070984, + "num_input_tokens_seen": 299683585, + "step": 13893, + "time_per_iteration": 2.666810989379883 + }, + { + "auxiliary_loss_clip": 0.01079282, + "auxiliary_loss_mlp": 0.01024568, + "balance_loss_clip": 1.03549695, + "balance_loss_mlp": 1.01279628, + "epoch": 0.835352472568766, + "flos": 19865568049920.0, + "grad_norm": 2.066965169500514, + "language_loss": 0.78525186, + "learning_rate": 2.7760269747156996e-07, + "loss": 0.80629033, + "num_input_tokens_seen": 299702680, + "step": 13894, + "time_per_iteration": 2.656261920928955 + }, + { + "auxiliary_loss_clip": 0.01089446, + "auxiliary_loss_mlp": 0.01029648, + "balance_loss_clip": 1.03555644, + "balance_loss_mlp": 1.01722014, + "epoch": 0.8354125958214339, + "flos": 22054754257920.0, + "grad_norm": 1.734895870039155, + "language_loss": 0.72428441, + "learning_rate": 2.7740477901917625e-07, + "loss": 0.74547529, + "num_input_tokens_seen": 299721050, + "step": 13895, + "time_per_iteration": 2.5912013053894043 + }, + { + "auxiliary_loss_clip": 0.01098522, + "auxiliary_loss_mlp": 0.01043176, + "balance_loss_clip": 1.0375011, + "balance_loss_mlp": 1.02959836, + "epoch": 0.8354727190741019, + "flos": 21397804462080.0, + "grad_norm": 2.180746282209792, + "language_loss": 0.72239274, + "learning_rate": 2.772069258877667e-07, + "loss": 0.7438097, + "num_input_tokens_seen": 299738255, + "step": 13896, + "time_per_iteration": 2.816459894180298 + }, + { + "auxiliary_loss_clip": 0.01096666, + "auxiliary_loss_mlp": 0.01033423, + "balance_loss_clip": 1.03551006, + "balance_loss_mlp": 1.02084064, + "epoch": 0.8355328423267698, + "flos": 50840997834240.0, + "grad_norm": 2.4314822364196456, + "language_loss": 0.5891223, + "learning_rate": 2.770091380848423e-07, + "loss": 0.61042321, + "num_input_tokens_seen": 299761315, + "step": 13897, + "time_per_iteration": 2.854132652282715 + }, + { + "auxiliary_loss_clip": 0.01029051, + "auxiliary_loss_mlp": 0.00750932, + "balance_loss_clip": 1.00681758, + "balance_loss_mlp": 0.99963111, + "epoch": 0.8355929655794379, + "flos": 65551052764800.0, + "grad_norm": 0.6926411996792173, + "language_loss": 0.57645589, + "learning_rate": 2.7681141561790423e-07, + "loss": 0.59425569, + "num_input_tokens_seen": 299828735, + "step": 13898, + "time_per_iteration": 3.189154624938965 + }, + { + "auxiliary_loss_clip": 0.01095352, + "auxiliary_loss_mlp": 0.01038588, + "balance_loss_clip": 1.03804767, + "balance_loss_mlp": 1.02465272, + "epoch": 0.8356530888321058, + "flos": 19170516902400.0, + "grad_norm": 1.982321802271085, + "language_loss": 0.79983473, + "learning_rate": 2.7661375849444967e-07, + "loss": 0.8211742, + "num_input_tokens_seen": 299848395, + "step": 13899, + "time_per_iteration": 2.6372761726379395 + }, + { + "auxiliary_loss_clip": 0.01110341, + "auxiliary_loss_mlp": 0.01035111, + "balance_loss_clip": 1.03744435, + "balance_loss_mlp": 1.0235889, + "epoch": 0.8357132120847738, + "flos": 44126672238720.0, + "grad_norm": 3.0475154129794473, + "language_loss": 0.69246173, + "learning_rate": 2.764161667219749e-07, + "loss": 0.7139163, + "num_input_tokens_seen": 299871665, + "step": 13900, + "time_per_iteration": 2.7809805870056152 + }, + { + "auxiliary_loss_clip": 0.01086706, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.03770447, + "balance_loss_mlp": 1.01880407, + "epoch": 0.8357733353374418, + "flos": 24389701856640.0, + "grad_norm": 1.5605306335935556, + "language_loss": 0.71076608, + "learning_rate": 2.762186403079716e-07, + "loss": 0.73194158, + "num_input_tokens_seen": 299891960, + "step": 13901, + "time_per_iteration": 2.6282958984375 + }, + { + "auxiliary_loss_clip": 0.01065898, + "auxiliary_loss_mlp": 0.01039549, + "balance_loss_clip": 1.03284073, + "balance_loss_mlp": 1.02650762, + "epoch": 0.8358334585901097, + "flos": 20916313626240.0, + "grad_norm": 2.0190709128744686, + "language_loss": 0.79701173, + "learning_rate": 2.7602117925992963e-07, + "loss": 0.81806624, + "num_input_tokens_seen": 299905070, + "step": 13902, + "time_per_iteration": 2.690213441848755 + }, + { + "auxiliary_loss_clip": 0.01096183, + "auxiliary_loss_mlp": 0.01031756, + "balance_loss_clip": 1.03576422, + "balance_loss_mlp": 1.01979947, + "epoch": 0.8358935818427777, + "flos": 19244169740160.0, + "grad_norm": 1.8244739444872173, + "language_loss": 0.62556911, + "learning_rate": 2.758237835853379e-07, + "loss": 0.6468485, + "num_input_tokens_seen": 299925130, + "step": 13903, + "time_per_iteration": 2.6231348514556885 + }, + { + "auxiliary_loss_clip": 0.01084825, + "auxiliary_loss_mlp": 0.01036663, + "balance_loss_clip": 1.03508985, + "balance_loss_mlp": 1.02401519, + "epoch": 0.8359537050954456, + "flos": 24134053783680.0, + "grad_norm": 4.416778142718545, + "language_loss": 0.7411294, + "learning_rate": 2.7562645329168054e-07, + "loss": 0.7623443, + "num_input_tokens_seen": 299943845, + "step": 13904, + "time_per_iteration": 2.7428109645843506 + }, + { + "auxiliary_loss_clip": 0.01082834, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.03411317, + "balance_loss_mlp": 1.017802, + "epoch": 0.8360138283481137, + "flos": 16180415187840.0, + "grad_norm": 1.704840597436559, + "language_loss": 0.72898692, + "learning_rate": 2.7542918838644104e-07, + "loss": 0.75012279, + "num_input_tokens_seen": 299961620, + "step": 13905, + "time_per_iteration": 2.7568368911743164 + }, + { + "auxiliary_loss_clip": 0.01096191, + "auxiliary_loss_mlp": 0.01036518, + "balance_loss_clip": 1.03658271, + "balance_loss_mlp": 1.02507973, + "epoch": 0.8360739516007816, + "flos": 22198899536640.0, + "grad_norm": 1.8034628053468047, + "language_loss": 0.66455811, + "learning_rate": 2.752319888771e-07, + "loss": 0.68588519, + "num_input_tokens_seen": 299982170, + "step": 13906, + "time_per_iteration": 2.6989848613739014 + }, + { + "auxiliary_loss_clip": 0.01096481, + "auxiliary_loss_mlp": 0.01028681, + "balance_loss_clip": 1.03535354, + "balance_loss_mlp": 1.01639068, + "epoch": 0.8361340748534496, + "flos": 20923137210240.0, + "grad_norm": 2.553726632874823, + "language_loss": 0.74047542, + "learning_rate": 2.7503485477113475e-07, + "loss": 0.76172698, + "num_input_tokens_seen": 300001330, + "step": 13907, + "time_per_iteration": 2.5955686569213867 + }, + { + "auxiliary_loss_clip": 0.0107652, + "auxiliary_loss_mlp": 0.0103429, + "balance_loss_clip": 1.03481364, + "balance_loss_mlp": 1.02162361, + "epoch": 0.8361941981061175, + "flos": 26173599932160.0, + "grad_norm": 11.039148102509154, + "language_loss": 0.75409931, + "learning_rate": 2.7483778607602005e-07, + "loss": 0.7752074, + "num_input_tokens_seen": 300020645, + "step": 13908, + "time_per_iteration": 2.696906566619873 + }, + { + "auxiliary_loss_clip": 0.01097882, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.03590965, + "balance_loss_mlp": 1.01825249, + "epoch": 0.8362543213587855, + "flos": 24419363512320.0, + "grad_norm": 1.9388338218951495, + "language_loss": 0.71320546, + "learning_rate": 2.7464078279922964e-07, + "loss": 0.73449743, + "num_input_tokens_seen": 300039945, + "step": 13909, + "time_per_iteration": 5.753490686416626 + }, + { + "auxiliary_loss_clip": 0.01112711, + "auxiliary_loss_mlp": 0.00771249, + "balance_loss_clip": 1.03798199, + "balance_loss_mlp": 1.00026917, + "epoch": 0.8363144446114534, + "flos": 17202396948480.0, + "grad_norm": 1.7414813953695425, + "language_loss": 0.73090255, + "learning_rate": 2.744438449482338e-07, + "loss": 0.74974209, + "num_input_tokens_seen": 300058260, + "step": 13910, + "time_per_iteration": 2.6283226013183594 + }, + { + "auxiliary_loss_clip": 0.01095006, + "auxiliary_loss_mlp": 0.00772614, + "balance_loss_clip": 1.0360173, + "balance_loss_mlp": 1.00014329, + "epoch": 0.8363745678641215, + "flos": 19279398003840.0, + "grad_norm": 2.750713587824779, + "language_loss": 0.73741031, + "learning_rate": 2.742469725305001e-07, + "loss": 0.75608653, + "num_input_tokens_seen": 300076720, + "step": 13911, + "time_per_iteration": 4.149497985839844 + }, + { + "auxiliary_loss_clip": 0.01090915, + "auxiliary_loss_mlp": 0.01037296, + "balance_loss_clip": 1.03743172, + "balance_loss_mlp": 1.02490461, + "epoch": 0.8364346911167894, + "flos": 11874869596800.0, + "grad_norm": 2.172823280625671, + "language_loss": 0.78602064, + "learning_rate": 2.740501655534946e-07, + "loss": 0.80730277, + "num_input_tokens_seen": 300092950, + "step": 13912, + "time_per_iteration": 2.7247042655944824 + }, + { + "auxiliary_loss_clip": 0.01099282, + "auxiliary_loss_mlp": 0.01030051, + "balance_loss_clip": 1.03660643, + "balance_loss_mlp": 1.01849961, + "epoch": 0.8364948143694574, + "flos": 20225212974720.0, + "grad_norm": 1.9881992667595267, + "language_loss": 0.7914685, + "learning_rate": 2.738534240246797e-07, + "loss": 0.81276178, + "num_input_tokens_seen": 300110950, + "step": 13913, + "time_per_iteration": 4.134316682815552 + }, + { + "auxiliary_loss_clip": 0.01097532, + "auxiliary_loss_mlp": 0.01030734, + "balance_loss_clip": 1.03647411, + "balance_loss_mlp": 1.0179075, + "epoch": 0.8365549376221254, + "flos": 21612909058560.0, + "grad_norm": 2.642445797747624, + "language_loss": 0.73418862, + "learning_rate": 2.736567479515153e-07, + "loss": 0.75547129, + "num_input_tokens_seen": 300128705, + "step": 13914, + "time_per_iteration": 2.571171760559082 + }, + { + "auxiliary_loss_clip": 0.01062932, + "auxiliary_loss_mlp": 0.0103246, + "balance_loss_clip": 1.03763103, + "balance_loss_mlp": 1.01987803, + "epoch": 0.8366150608747933, + "flos": 23294210912640.0, + "grad_norm": 1.590677583762713, + "language_loss": 0.71320194, + "learning_rate": 2.7346013734146025e-07, + "loss": 0.73415583, + "num_input_tokens_seen": 300148635, + "step": 13915, + "time_per_iteration": 2.751453161239624 + }, + { + "auxiliary_loss_clip": 0.01080426, + "auxiliary_loss_mlp": 0.01030234, + "balance_loss_clip": 1.03791106, + "balance_loss_mlp": 1.01852822, + "epoch": 0.8366751841274613, + "flos": 15267673664640.0, + "grad_norm": 1.8965589808135064, + "language_loss": 0.71970236, + "learning_rate": 2.7326359220197035e-07, + "loss": 0.74080902, + "num_input_tokens_seen": 300165490, + "step": 13916, + "time_per_iteration": 2.6807093620300293 + }, + { + "auxiliary_loss_clip": 0.01077533, + "auxiliary_loss_mlp": 0.00770081, + "balance_loss_clip": 1.03652239, + "balance_loss_mlp": 1.00017905, + "epoch": 0.8367353073801292, + "flos": 13224931205760.0, + "grad_norm": 2.314558822643351, + "language_loss": 0.74767375, + "learning_rate": 2.7306711254049755e-07, + "loss": 0.76614988, + "num_input_tokens_seen": 300182130, + "step": 13917, + "time_per_iteration": 2.6898746490478516 + }, + { + "auxiliary_loss_clip": 0.01107617, + "auxiliary_loss_mlp": 0.01034237, + "balance_loss_clip": 1.03919959, + "balance_loss_mlp": 1.02238786, + "epoch": 0.8367954306327973, + "flos": 24205084928640.0, + "grad_norm": 1.7520480918143468, + "language_loss": 0.79073501, + "learning_rate": 2.728706983644933e-07, + "loss": 0.81215358, + "num_input_tokens_seen": 300203050, + "step": 13918, + "time_per_iteration": 2.585444450378418 + }, + { + "auxiliary_loss_clip": 0.01069111, + "auxiliary_loss_mlp": 0.01035129, + "balance_loss_clip": 1.03858578, + "balance_loss_mlp": 1.02256465, + "epoch": 0.8368555538854652, + "flos": 24534744975360.0, + "grad_norm": 1.689646886321145, + "language_loss": 0.67851698, + "learning_rate": 2.7267434968140457e-07, + "loss": 0.69955939, + "num_input_tokens_seen": 300224380, + "step": 13919, + "time_per_iteration": 2.7965781688690186 + }, + { + "auxiliary_loss_clip": 0.0109041, + "auxiliary_loss_mlp": 0.01036947, + "balance_loss_clip": 1.03292394, + "balance_loss_mlp": 1.02389407, + "epoch": 0.8369156771381332, + "flos": 20259363830400.0, + "grad_norm": 1.776956438502091, + "language_loss": 0.73908985, + "learning_rate": 2.7247806649867835e-07, + "loss": 0.76036346, + "num_input_tokens_seen": 300242915, + "step": 13920, + "time_per_iteration": 2.636904716491699 + }, + { + "auxiliary_loss_clip": 0.01088456, + "auxiliary_loss_mlp": 0.01029889, + "balance_loss_clip": 1.03454947, + "balance_loss_mlp": 1.0174973, + "epoch": 0.8369758003908011, + "flos": 21835555511040.0, + "grad_norm": 1.8125965247419975, + "language_loss": 0.68985099, + "learning_rate": 2.722818488237566e-07, + "loss": 0.71103442, + "num_input_tokens_seen": 300261905, + "step": 13921, + "time_per_iteration": 2.649538278579712 + }, + { + "auxiliary_loss_clip": 0.01101931, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.03782213, + "balance_loss_mlp": 1.01993847, + "epoch": 0.8370359236434691, + "flos": 21719312121600.0, + "grad_norm": 1.96204567174708, + "language_loss": 0.85527766, + "learning_rate": 2.720856966640801e-07, + "loss": 0.876616, + "num_input_tokens_seen": 300281145, + "step": 13922, + "time_per_iteration": 2.6043357849121094 + }, + { + "auxiliary_loss_clip": 0.01068346, + "auxiliary_loss_mlp": 0.00769545, + "balance_loss_clip": 1.0355072, + "balance_loss_mlp": 1.00012457, + "epoch": 0.837096046896137, + "flos": 23148880485120.0, + "grad_norm": 1.4962135590682923, + "language_loss": 0.71717429, + "learning_rate": 2.71889610027088e-07, + "loss": 0.73555321, + "num_input_tokens_seen": 300301610, + "step": 13923, + "time_per_iteration": 2.6874313354492188 + }, + { + "auxiliary_loss_clip": 0.01082449, + "auxiliary_loss_mlp": 0.01029737, + "balance_loss_clip": 1.03654337, + "balance_loss_mlp": 1.01662445, + "epoch": 0.8371561701488051, + "flos": 24492872695680.0, + "grad_norm": 1.885906995135632, + "language_loss": 0.759628, + "learning_rate": 2.7169358892021433e-07, + "loss": 0.78074992, + "num_input_tokens_seen": 300319420, + "step": 13924, + "time_per_iteration": 2.671105146408081 + }, + { + "auxiliary_loss_clip": 0.01084333, + "auxiliary_loss_mlp": 0.01027444, + "balance_loss_clip": 1.03405309, + "balance_loss_mlp": 1.01530862, + "epoch": 0.837216293401473, + "flos": 29206723161600.0, + "grad_norm": 1.5298544720059444, + "language_loss": 0.64247084, + "learning_rate": 2.7149763335089293e-07, + "loss": 0.66358864, + "num_input_tokens_seen": 300341325, + "step": 13925, + "time_per_iteration": 2.6903226375579834 + }, + { + "auxiliary_loss_clip": 0.01086129, + "auxiliary_loss_mlp": 0.01032025, + "balance_loss_clip": 1.03692162, + "balance_loss_mlp": 1.01949048, + "epoch": 0.837276416654141, + "flos": 25265275781760.0, + "grad_norm": 2.05791983020966, + "language_loss": 0.74643993, + "learning_rate": 2.713017433265543e-07, + "loss": 0.7676214, + "num_input_tokens_seen": 300361620, + "step": 13926, + "time_per_iteration": 2.7113802433013916 + }, + { + "auxiliary_loss_clip": 0.01099802, + "auxiliary_loss_mlp": 0.01036252, + "balance_loss_clip": 1.03915524, + "balance_loss_mlp": 1.02321601, + "epoch": 0.837336539906809, + "flos": 13882024656000.0, + "grad_norm": 5.046746125844788, + "language_loss": 0.71061361, + "learning_rate": 2.711059188546274e-07, + "loss": 0.73197412, + "num_input_tokens_seen": 300378675, + "step": 13927, + "time_per_iteration": 2.5931549072265625 + }, + { + "auxiliary_loss_clip": 0.01001985, + "auxiliary_loss_mlp": 0.01002351, + "balance_loss_clip": 1.00909257, + "balance_loss_mlp": 1.00123012, + "epoch": 0.8373966631594769, + "flos": 68870599044480.0, + "grad_norm": 0.7031143541961051, + "language_loss": 0.58787715, + "learning_rate": 2.7091015994253695e-07, + "loss": 0.60792047, + "num_input_tokens_seen": 300449740, + "step": 13928, + "time_per_iteration": 3.2960739135742188 + }, + { + "auxiliary_loss_clip": 0.01071961, + "auxiliary_loss_mlp": 0.01042566, + "balance_loss_clip": 1.03641248, + "balance_loss_mlp": 1.02829027, + "epoch": 0.8374567864121449, + "flos": 20448972748800.0, + "grad_norm": 1.77650313447611, + "language_loss": 0.69560969, + "learning_rate": 2.707144665977068e-07, + "loss": 0.71675503, + "num_input_tokens_seen": 300470000, + "step": 13929, + "time_per_iteration": 2.6807215213775635 + }, + { + "auxiliary_loss_clip": 0.01098571, + "auxiliary_loss_mlp": 0.01027466, + "balance_loss_clip": 1.03639913, + "balance_loss_mlp": 1.01444805, + "epoch": 0.8375169096648128, + "flos": 41904197101440.0, + "grad_norm": 1.5497858215784133, + "language_loss": 0.66676092, + "learning_rate": 2.705188388275574e-07, + "loss": 0.68802124, + "num_input_tokens_seen": 300494975, + "step": 13930, + "time_per_iteration": 2.861119031906128 + }, + { + "auxiliary_loss_clip": 0.01066352, + "auxiliary_loss_mlp": 0.01027411, + "balance_loss_clip": 1.03803921, + "balance_loss_mlp": 1.01527548, + "epoch": 0.8375770329174809, + "flos": 20009354192640.0, + "grad_norm": 1.6161268751514244, + "language_loss": 0.71101642, + "learning_rate": 2.703232766395067e-07, + "loss": 0.7319541, + "num_input_tokens_seen": 300513175, + "step": 13931, + "time_per_iteration": 2.8232531547546387 + }, + { + "auxiliary_loss_clip": 0.01072605, + "auxiliary_loss_mlp": 0.01032905, + "balance_loss_clip": 1.03191161, + "balance_loss_mlp": 1.02047718, + "epoch": 0.8376371561701488, + "flos": 22783597125120.0, + "grad_norm": 2.2209749295259913, + "language_loss": 0.71790922, + "learning_rate": 2.701277800409705e-07, + "loss": 0.73896432, + "num_input_tokens_seen": 300533770, + "step": 13932, + "time_per_iteration": 2.7237002849578857 + }, + { + "auxiliary_loss_clip": 0.0104491, + "auxiliary_loss_mlp": 0.01034703, + "balance_loss_clip": 1.03452373, + "balance_loss_mlp": 1.02334225, + "epoch": 0.8376972794228168, + "flos": 23914459987200.0, + "grad_norm": 2.3684193578736785, + "language_loss": 0.66962039, + "learning_rate": 2.699323490393628e-07, + "loss": 0.69041657, + "num_input_tokens_seen": 300552995, + "step": 13933, + "time_per_iteration": 2.926781415939331 + }, + { + "auxiliary_loss_clip": 0.01079254, + "auxiliary_loss_mlp": 0.01043444, + "balance_loss_clip": 1.03600967, + "balance_loss_mlp": 1.0309329, + "epoch": 0.8377574026754847, + "flos": 13734718980480.0, + "grad_norm": 1.9196886282794297, + "language_loss": 0.76411772, + "learning_rate": 2.697369836420933e-07, + "loss": 0.78534472, + "num_input_tokens_seen": 300570275, + "step": 13934, + "time_per_iteration": 2.8570826053619385 + }, + { + "auxiliary_loss_clip": 0.01100527, + "auxiliary_loss_mlp": 0.01029597, + "balance_loss_clip": 1.04098976, + "balance_loss_mlp": 1.01738369, + "epoch": 0.8378175259281527, + "flos": 21651333632640.0, + "grad_norm": 1.505075221616912, + "language_loss": 0.77353156, + "learning_rate": 2.6954168385657115e-07, + "loss": 0.79483283, + "num_input_tokens_seen": 300590875, + "step": 13935, + "time_per_iteration": 2.6582868099212646 + }, + { + "auxiliary_loss_clip": 0.01070099, + "auxiliary_loss_mlp": 0.01027318, + "balance_loss_clip": 1.03629911, + "balance_loss_mlp": 1.01469421, + "epoch": 0.8378776491808206, + "flos": 15448806973440.0, + "grad_norm": 5.23588368234973, + "language_loss": 0.56080019, + "learning_rate": 2.6934644969020135e-07, + "loss": 0.58177441, + "num_input_tokens_seen": 300607490, + "step": 13936, + "time_per_iteration": 2.684828042984009 + }, + { + "auxiliary_loss_clip": 0.01090807, + "auxiliary_loss_mlp": 0.0103317, + "balance_loss_clip": 1.03235912, + "balance_loss_mlp": 1.02025938, + "epoch": 0.8379377724334887, + "flos": 14720395069440.0, + "grad_norm": 2.797405460790855, + "language_loss": 0.89294749, + "learning_rate": 2.691512811503882e-07, + "loss": 0.91418725, + "num_input_tokens_seen": 300623635, + "step": 13937, + "time_per_iteration": 2.5899250507354736 + }, + { + "auxiliary_loss_clip": 0.01099019, + "auxiliary_loss_mlp": 0.01026723, + "balance_loss_clip": 1.03668594, + "balance_loss_mlp": 1.01458192, + "epoch": 0.8379978956861566, + "flos": 24535247765760.0, + "grad_norm": 2.44657354170433, + "language_loss": 0.81838822, + "learning_rate": 2.689561782445313e-07, + "loss": 0.83964562, + "num_input_tokens_seen": 300643835, + "step": 13938, + "time_per_iteration": 2.634232521057129 + }, + { + "auxiliary_loss_clip": 0.01101448, + "auxiliary_loss_mlp": 0.01033084, + "balance_loss_clip": 1.03746319, + "balance_loss_mlp": 1.01988757, + "epoch": 0.8380580189388246, + "flos": 18952611045120.0, + "grad_norm": 1.6841429045668255, + "language_loss": 0.7044903, + "learning_rate": 2.6876114098002965e-07, + "loss": 0.72583556, + "num_input_tokens_seen": 300662500, + "step": 13939, + "time_per_iteration": 2.61344575881958 + }, + { + "auxiliary_loss_clip": 0.01078321, + "auxiliary_loss_mlp": 0.01039086, + "balance_loss_clip": 1.03616691, + "balance_loss_mlp": 1.02573454, + "epoch": 0.8381181421914926, + "flos": 26540283922560.0, + "grad_norm": 6.593424997719047, + "language_loss": 0.76224637, + "learning_rate": 2.6856616936428e-07, + "loss": 0.78342044, + "num_input_tokens_seen": 300681480, + "step": 13940, + "time_per_iteration": 2.6947879791259766 + }, + { + "auxiliary_loss_clip": 0.01093556, + "auxiliary_loss_mlp": 0.01034451, + "balance_loss_clip": 1.0350914, + "balance_loss_mlp": 1.02206564, + "epoch": 0.8381782654441605, + "flos": 23291481479040.0, + "grad_norm": 1.6698207376849759, + "language_loss": 0.76370448, + "learning_rate": 2.6837126340467374e-07, + "loss": 0.78498459, + "num_input_tokens_seen": 300699165, + "step": 13941, + "time_per_iteration": 2.629971742630005 + }, + { + "auxiliary_loss_clip": 0.01068862, + "auxiliary_loss_mlp": 0.01030093, + "balance_loss_clip": 1.03516936, + "balance_loss_mlp": 1.01728964, + "epoch": 0.8382383886968285, + "flos": 26758800311040.0, + "grad_norm": 2.4302037617265793, + "language_loss": 0.73204666, + "learning_rate": 2.6817642310860276e-07, + "loss": 0.75303626, + "num_input_tokens_seen": 300714615, + "step": 13942, + "time_per_iteration": 2.741283893585205 + }, + { + "auxiliary_loss_clip": 0.01067172, + "auxiliary_loss_mlp": 0.01039608, + "balance_loss_clip": 1.03562307, + "balance_loss_mlp": 1.02545786, + "epoch": 0.8382985119494964, + "flos": 26104544035200.0, + "grad_norm": 1.6102800053781703, + "language_loss": 0.79528558, + "learning_rate": 2.679816484834554e-07, + "loss": 0.81635338, + "num_input_tokens_seen": 300734860, + "step": 13943, + "time_per_iteration": 2.7648844718933105 + }, + { + "auxiliary_loss_clip": 0.01057583, + "auxiliary_loss_mlp": 0.01030261, + "balance_loss_clip": 1.03292835, + "balance_loss_mlp": 1.01832187, + "epoch": 0.8383586352021645, + "flos": 16435129507200.0, + "grad_norm": 1.9936529414882505, + "language_loss": 0.85062182, + "learning_rate": 2.6778693953661766e-07, + "loss": 0.87150025, + "num_input_tokens_seen": 300752735, + "step": 13944, + "time_per_iteration": 2.702016592025757 + }, + { + "auxiliary_loss_clip": 0.01009407, + "auxiliary_loss_mlp": 0.00750919, + "balance_loss_clip": 1.00603545, + "balance_loss_mlp": 0.99966449, + "epoch": 0.8384187584548324, + "flos": 64195532288640.0, + "grad_norm": 0.6194539078330007, + "language_loss": 0.50268608, + "learning_rate": 2.6759229627547263e-07, + "loss": 0.5202893, + "num_input_tokens_seen": 300820760, + "step": 13945, + "time_per_iteration": 3.2719228267669678 + }, + { + "auxiliary_loss_clip": 0.01067358, + "auxiliary_loss_mlp": 0.01031818, + "balance_loss_clip": 1.03898573, + "balance_loss_mlp": 1.01964068, + "epoch": 0.8384788817075004, + "flos": 22382905933440.0, + "grad_norm": 2.0041104630114726, + "language_loss": 0.64992464, + "learning_rate": 2.673977187074017e-07, + "loss": 0.67091638, + "num_input_tokens_seen": 300840025, + "step": 13946, + "time_per_iteration": 2.7332231998443604 + }, + { + "auxiliary_loss_clip": 0.01060162, + "auxiliary_loss_mlp": 0.01032335, + "balance_loss_clip": 1.03414798, + "balance_loss_mlp": 1.01927578, + "epoch": 0.8385390049601683, + "flos": 29496845312640.0, + "grad_norm": 1.5282176707936672, + "language_loss": 0.67431152, + "learning_rate": 2.672032068397829e-07, + "loss": 0.69523644, + "num_input_tokens_seen": 300860380, + "step": 13947, + "time_per_iteration": 2.8148739337921143 + }, + { + "auxiliary_loss_clip": 0.01084671, + "auxiliary_loss_mlp": 0.01034421, + "balance_loss_clip": 1.03683496, + "balance_loss_mlp": 1.02082467, + "epoch": 0.8385991282128363, + "flos": 32707797799680.0, + "grad_norm": 2.1167215710156566, + "language_loss": 0.70042205, + "learning_rate": 2.6700876067999176e-07, + "loss": 0.72161293, + "num_input_tokens_seen": 300881895, + "step": 13948, + "time_per_iteration": 4.3659327030181885 + }, + { + "auxiliary_loss_clip": 0.01084202, + "auxiliary_loss_mlp": 0.01032949, + "balance_loss_clip": 1.03514576, + "balance_loss_mlp": 1.02195239, + "epoch": 0.8386592514655042, + "flos": 25441022050560.0, + "grad_norm": 2.5602152463146033, + "language_loss": 0.85150999, + "learning_rate": 2.6681438023540194e-07, + "loss": 0.8726815, + "num_input_tokens_seen": 300901575, + "step": 13949, + "time_per_iteration": 4.24399995803833 + }, + { + "auxiliary_loss_clip": 0.01081801, + "auxiliary_loss_mlp": 0.01029076, + "balance_loss_clip": 1.03833914, + "balance_loss_mlp": 1.01670778, + "epoch": 0.8387193747181723, + "flos": 22015898720640.0, + "grad_norm": 4.303340454207266, + "language_loss": 0.69926894, + "learning_rate": 2.66620065513385e-07, + "loss": 0.72037774, + "num_input_tokens_seen": 300919735, + "step": 13950, + "time_per_iteration": 4.277710914611816 + }, + { + "auxiliary_loss_clip": 0.01091242, + "auxiliary_loss_mlp": 0.01029375, + "balance_loss_clip": 1.03648567, + "balance_loss_mlp": 1.01687598, + "epoch": 0.8387794979708402, + "flos": 18150223080960.0, + "grad_norm": 1.697645904301953, + "language_loss": 0.6442672, + "learning_rate": 2.6642581652130913e-07, + "loss": 0.6654734, + "num_input_tokens_seen": 300939150, + "step": 13951, + "time_per_iteration": 2.564544439315796 + }, + { + "auxiliary_loss_clip": 0.01100469, + "auxiliary_loss_mlp": 0.01030012, + "balance_loss_clip": 1.03913283, + "balance_loss_mlp": 1.01795959, + "epoch": 0.8388396212235082, + "flos": 25411216740480.0, + "grad_norm": 1.428691785600865, + "language_loss": 0.69986898, + "learning_rate": 2.662316332665393e-07, + "loss": 0.72117376, + "num_input_tokens_seen": 300959730, + "step": 13952, + "time_per_iteration": 4.1969122886657715 + }, + { + "auxiliary_loss_clip": 0.01096336, + "auxiliary_loss_mlp": 0.01032886, + "balance_loss_clip": 1.03714609, + "balance_loss_mlp": 1.02128077, + "epoch": 0.8388997444761762, + "flos": 22273055164800.0, + "grad_norm": 1.8744192088426839, + "language_loss": 0.72788477, + "learning_rate": 2.6603751575643987e-07, + "loss": 0.74917698, + "num_input_tokens_seen": 300976120, + "step": 13953, + "time_per_iteration": 2.6013376712799072 + }, + { + "auxiliary_loss_clip": 0.01036141, + "auxiliary_loss_mlp": 0.01033482, + "balance_loss_clip": 1.03169441, + "balance_loss_mlp": 1.01992166, + "epoch": 0.8389598677288441, + "flos": 19573219255680.0, + "grad_norm": 1.9148205474215427, + "language_loss": 0.68345833, + "learning_rate": 2.6584346399837176e-07, + "loss": 0.70415455, + "num_input_tokens_seen": 300995080, + "step": 13954, + "time_per_iteration": 2.7236297130584717 + }, + { + "auxiliary_loss_clip": 0.0108771, + "auxiliary_loss_mlp": 0.01035111, + "balance_loss_clip": 1.03767776, + "balance_loss_mlp": 1.0240128, + "epoch": 0.8390199909815121, + "flos": 17384715406080.0, + "grad_norm": 1.7636414088599872, + "language_loss": 0.7324779, + "learning_rate": 2.656494779996932e-07, + "loss": 0.7537061, + "num_input_tokens_seen": 301012920, + "step": 13955, + "time_per_iteration": 2.661045551300049 + }, + { + "auxiliary_loss_clip": 0.01043432, + "auxiliary_loss_mlp": 0.01032322, + "balance_loss_clip": 1.03135204, + "balance_loss_mlp": 1.019346, + "epoch": 0.83908011423418, + "flos": 24639639667200.0, + "grad_norm": 8.047952352869046, + "language_loss": 0.66471386, + "learning_rate": 2.6545555776775995e-07, + "loss": 0.68547142, + "num_input_tokens_seen": 301028875, + "step": 13956, + "time_per_iteration": 2.7914817333221436 + }, + { + "auxiliary_loss_clip": 0.01099865, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.03744364, + "balance_loss_mlp": 1.02332473, + "epoch": 0.8391402374868481, + "flos": 24718356322560.0, + "grad_norm": 2.4631411130881995, + "language_loss": 0.79544741, + "learning_rate": 2.6526170330992667e-07, + "loss": 0.81680918, + "num_input_tokens_seen": 301050115, + "step": 13957, + "time_per_iteration": 2.7476260662078857 + }, + { + "auxiliary_loss_clip": 0.00983967, + "auxiliary_loss_mlp": 0.01019247, + "balance_loss_clip": 1.01336145, + "balance_loss_mlp": 1.01760185, + "epoch": 0.839200360739516, + "flos": 56871695784960.0, + "grad_norm": 0.7593984964089096, + "language_loss": 0.53379953, + "learning_rate": 2.6506791463354283e-07, + "loss": 0.5538317, + "num_input_tokens_seen": 301114155, + "step": 13958, + "time_per_iteration": 3.488459825515747 + }, + { + "auxiliary_loss_clip": 0.01098132, + "auxiliary_loss_mlp": 0.01032752, + "balance_loss_clip": 1.03722978, + "balance_loss_mlp": 1.01981759, + "epoch": 0.839260483992184, + "flos": 18332792933760.0, + "grad_norm": 1.8527919164572209, + "language_loss": 0.72979414, + "learning_rate": 2.648741917459574e-07, + "loss": 0.75110304, + "num_input_tokens_seen": 301133150, + "step": 13959, + "time_per_iteration": 3.048078775405884 + }, + { + "auxiliary_loss_clip": 0.01075035, + "auxiliary_loss_mlp": 0.01024754, + "balance_loss_clip": 1.037763, + "balance_loss_mlp": 1.01298177, + "epoch": 0.8393206072448519, + "flos": 27087921653760.0, + "grad_norm": 2.0364412666865843, + "language_loss": 0.55541557, + "learning_rate": 2.646805346545169e-07, + "loss": 0.57641345, + "num_input_tokens_seen": 301153600, + "step": 13960, + "time_per_iteration": 2.835035800933838 + }, + { + "auxiliary_loss_clip": 0.01002229, + "auxiliary_loss_mlp": 0.01000998, + "balance_loss_clip": 1.0077697, + "balance_loss_mlp": 1.00003195, + "epoch": 0.8393807304975199, + "flos": 61521192057600.0, + "grad_norm": 0.7763315867784596, + "language_loss": 0.60705209, + "learning_rate": 2.6448694336656397e-07, + "loss": 0.62708437, + "num_input_tokens_seen": 301214335, + "step": 13961, + "time_per_iteration": 3.3663535118103027 + }, + { + "auxiliary_loss_clip": 0.01052805, + "auxiliary_loss_mlp": 0.010396, + "balance_loss_clip": 1.02986741, + "balance_loss_mlp": 1.02657616, + "epoch": 0.8394408537501878, + "flos": 14894848448640.0, + "grad_norm": 2.557584362268972, + "language_loss": 0.68461823, + "learning_rate": 2.642934178894405e-07, + "loss": 0.70554227, + "num_input_tokens_seen": 301228960, + "step": 13962, + "time_per_iteration": 2.6838927268981934 + }, + { + "auxiliary_loss_clip": 0.01077301, + "auxiliary_loss_mlp": 0.01027695, + "balance_loss_clip": 1.03520894, + "balance_loss_mlp": 1.01575601, + "epoch": 0.8395009770028559, + "flos": 17412186332160.0, + "grad_norm": 1.9087314512083013, + "language_loss": 0.72709483, + "learning_rate": 2.640999582304841e-07, + "loss": 0.74814475, + "num_input_tokens_seen": 301245875, + "step": 13963, + "time_per_iteration": 2.7063026428222656 + }, + { + "auxiliary_loss_clip": 0.01085945, + "auxiliary_loss_mlp": 0.01034113, + "balance_loss_clip": 1.035007, + "balance_loss_mlp": 1.02194226, + "epoch": 0.8395611002555238, + "flos": 27924747782400.0, + "grad_norm": 1.5783482209537385, + "language_loss": 0.76520944, + "learning_rate": 2.6390656439703173e-07, + "loss": 0.78640997, + "num_input_tokens_seen": 301265550, + "step": 13964, + "time_per_iteration": 2.7841615676879883 + }, + { + "auxiliary_loss_clip": 0.01089552, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.03722572, + "balance_loss_mlp": 1.02287793, + "epoch": 0.8396212235081918, + "flos": 11100922225920.0, + "grad_norm": 2.0757447568639633, + "language_loss": 0.78032225, + "learning_rate": 2.637132363964161e-07, + "loss": 0.80158186, + "num_input_tokens_seen": 301282035, + "step": 13965, + "time_per_iteration": 2.67738938331604 + }, + { + "auxiliary_loss_clip": 0.01092348, + "auxiliary_loss_mlp": 0.01032536, + "balance_loss_clip": 1.03630924, + "balance_loss_mlp": 1.02068114, + "epoch": 0.8396813467608598, + "flos": 35735641729920.0, + "grad_norm": 1.499677295305954, + "language_loss": 0.65898132, + "learning_rate": 2.635199742359684e-07, + "loss": 0.68023014, + "num_input_tokens_seen": 301305210, + "step": 13966, + "time_per_iteration": 2.7493228912353516 + }, + { + "auxiliary_loss_clip": 0.01086107, + "auxiliary_loss_mlp": 0.01032854, + "balance_loss_clip": 1.03722155, + "balance_loss_mlp": 1.02049196, + "epoch": 0.8397414700135277, + "flos": 26176724415360.0, + "grad_norm": 1.9100754434512948, + "language_loss": 0.74755192, + "learning_rate": 2.633267779230177e-07, + "loss": 0.76874149, + "num_input_tokens_seen": 301324885, + "step": 13967, + "time_per_iteration": 2.6640665531158447 + }, + { + "auxiliary_loss_clip": 0.01081249, + "auxiliary_loss_mlp": 0.01029478, + "balance_loss_clip": 1.0370276, + "balance_loss_mlp": 1.01756883, + "epoch": 0.8398015932661957, + "flos": 18333116156160.0, + "grad_norm": 1.8492234177580402, + "language_loss": 0.82993788, + "learning_rate": 2.6313364746488974e-07, + "loss": 0.85104513, + "num_input_tokens_seen": 301343070, + "step": 13968, + "time_per_iteration": 2.5900182723999023 + }, + { + "auxiliary_loss_clip": 0.01083656, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.03804672, + "balance_loss_mlp": 1.01986384, + "epoch": 0.8398617165188637, + "flos": 17379507934080.0, + "grad_norm": 2.094652231343387, + "language_loss": 0.7729916, + "learning_rate": 2.629405828689075e-07, + "loss": 0.7941494, + "num_input_tokens_seen": 301359280, + "step": 13969, + "time_per_iteration": 2.611394166946411 + }, + { + "auxiliary_loss_clip": 0.01090762, + "auxiliary_loss_mlp": 0.01030453, + "balance_loss_clip": 1.03660858, + "balance_loss_mlp": 1.01741195, + "epoch": 0.8399218397715317, + "flos": 22929681738240.0, + "grad_norm": 2.0003618837908244, + "language_loss": 0.77181804, + "learning_rate": 2.627475841423923e-07, + "loss": 0.79303014, + "num_input_tokens_seen": 301376465, + "step": 13970, + "time_per_iteration": 2.6121816635131836 + }, + { + "auxiliary_loss_clip": 0.01087144, + "auxiliary_loss_mlp": 0.01038704, + "balance_loss_clip": 1.03651595, + "balance_loss_mlp": 1.02689075, + "epoch": 0.8399819630241996, + "flos": 23149562843520.0, + "grad_norm": 2.097520356637354, + "language_loss": 0.71949625, + "learning_rate": 2.625546512926633e-07, + "loss": 0.74075466, + "num_input_tokens_seen": 301396000, + "step": 13971, + "time_per_iteration": 2.6382222175598145 + }, + { + "auxiliary_loss_clip": 0.01085619, + "auxiliary_loss_mlp": 0.01031304, + "balance_loss_clip": 1.03411746, + "balance_loss_mlp": 1.01840544, + "epoch": 0.8400420862768676, + "flos": 16397423205120.0, + "grad_norm": 1.7445644136224228, + "language_loss": 0.77706194, + "learning_rate": 2.623617843270358e-07, + "loss": 0.79823112, + "num_input_tokens_seen": 301413160, + "step": 13972, + "time_per_iteration": 2.637141227722168 + }, + { + "auxiliary_loss_clip": 0.01041674, + "auxiliary_loss_mlp": 0.01037854, + "balance_loss_clip": 1.03100634, + "balance_loss_mlp": 1.02458596, + "epoch": 0.8401022095295355, + "flos": 21287486816640.0, + "grad_norm": 1.304807190993185, + "language_loss": 0.68481863, + "learning_rate": 2.6216898325282333e-07, + "loss": 0.70561385, + "num_input_tokens_seen": 301433325, + "step": 13973, + "time_per_iteration": 2.7618348598480225 + }, + { + "auxiliary_loss_clip": 0.01088741, + "auxiliary_loss_mlp": 0.01030742, + "balance_loss_clip": 1.03717828, + "balance_loss_mlp": 1.01786125, + "epoch": 0.8401623327822035, + "flos": 17311313963520.0, + "grad_norm": 2.2621035315363858, + "language_loss": 0.78135633, + "learning_rate": 2.619762480773382e-07, + "loss": 0.80255115, + "num_input_tokens_seen": 301450265, + "step": 13974, + "time_per_iteration": 2.6674814224243164 + }, + { + "auxiliary_loss_clip": 0.01095006, + "auxiliary_loss_mlp": 0.01030636, + "balance_loss_clip": 1.03827214, + "balance_loss_mlp": 1.01826859, + "epoch": 0.8402224560348714, + "flos": 22236677665920.0, + "grad_norm": 1.513610095867281, + "language_loss": 0.7256, + "learning_rate": 2.617835788078868e-07, + "loss": 0.74685645, + "num_input_tokens_seen": 301470760, + "step": 13975, + "time_per_iteration": 2.838907241821289 + }, + { + "auxiliary_loss_clip": 0.01089044, + "auxiliary_loss_mlp": 0.01025952, + "balance_loss_clip": 1.03686631, + "balance_loss_mlp": 1.01353598, + "epoch": 0.8402825792875395, + "flos": 20229953569920.0, + "grad_norm": 7.753004351668279, + "language_loss": 0.72390342, + "learning_rate": 2.6159097545177645e-07, + "loss": 0.74505341, + "num_input_tokens_seen": 301489425, + "step": 13976, + "time_per_iteration": 2.726900100708008 + }, + { + "auxiliary_loss_clip": 0.01107341, + "auxiliary_loss_mlp": 0.00769496, + "balance_loss_clip": 1.03678119, + "balance_loss_mlp": 1.00013971, + "epoch": 0.8403427025402074, + "flos": 23289973107840.0, + "grad_norm": 1.8413083341315597, + "language_loss": 0.71979779, + "learning_rate": 2.61398438016311e-07, + "loss": 0.7385661, + "num_input_tokens_seen": 301508885, + "step": 13977, + "time_per_iteration": 2.630323886871338 + }, + { + "auxiliary_loss_clip": 0.01096098, + "auxiliary_loss_mlp": 0.01032339, + "balance_loss_clip": 1.03397727, + "balance_loss_mlp": 1.02011466, + "epoch": 0.8404028257928754, + "flos": 32675586278400.0, + "grad_norm": 1.7930366312861392, + "language_loss": 0.68852651, + "learning_rate": 2.6120596650879043e-07, + "loss": 0.70981085, + "num_input_tokens_seen": 301533780, + "step": 13978, + "time_per_iteration": 2.7467479705810547 + }, + { + "auxiliary_loss_clip": 0.01071792, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.03347301, + "balance_loss_mlp": 1.01965952, + "epoch": 0.8404629490455434, + "flos": 16180522928640.0, + "grad_norm": 1.8105667854844871, + "language_loss": 0.77938527, + "learning_rate": 2.610135609365145e-07, + "loss": 0.80042142, + "num_input_tokens_seen": 301551775, + "step": 13979, + "time_per_iteration": 2.6985831260681152 + }, + { + "auxiliary_loss_clip": 0.0109651, + "auxiliary_loss_mlp": 0.01030498, + "balance_loss_clip": 1.04012775, + "balance_loss_mlp": 1.01822543, + "epoch": 0.8405230722982113, + "flos": 15194451790080.0, + "grad_norm": 2.0614080045574714, + "language_loss": 0.77732342, + "learning_rate": 2.60821221306778e-07, + "loss": 0.79859352, + "num_input_tokens_seen": 301570495, + "step": 13980, + "time_per_iteration": 2.5943267345428467 + }, + { + "auxiliary_loss_clip": 0.01073604, + "auxiliary_loss_mlp": 0.01029935, + "balance_loss_clip": 1.03548491, + "balance_loss_mlp": 1.01782358, + "epoch": 0.8405831955508793, + "flos": 27812418975360.0, + "grad_norm": 5.854551943090863, + "language_loss": 0.86627793, + "learning_rate": 2.606289476268757e-07, + "loss": 0.88731331, + "num_input_tokens_seen": 301591705, + "step": 13981, + "time_per_iteration": 2.742199182510376 + }, + { + "auxiliary_loss_clip": 0.01097126, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.03606057, + "balance_loss_mlp": 1.02131581, + "epoch": 0.8406433188035473, + "flos": 23769452782080.0, + "grad_norm": 2.935607999333321, + "language_loss": 0.67501163, + "learning_rate": 2.6043673990409745e-07, + "loss": 0.69631881, + "num_input_tokens_seen": 301611670, + "step": 13982, + "time_per_iteration": 2.6252353191375732 + }, + { + "auxiliary_loss_clip": 0.01061743, + "auxiliary_loss_mlp": 0.01041459, + "balance_loss_clip": 1.03561366, + "balance_loss_mlp": 1.02742803, + "epoch": 0.8407034420562153, + "flos": 29205681667200.0, + "grad_norm": 3.073966172290324, + "language_loss": 0.67936915, + "learning_rate": 2.602445981457324e-07, + "loss": 0.70040119, + "num_input_tokens_seen": 301632540, + "step": 13983, + "time_per_iteration": 2.7724905014038086 + }, + { + "auxiliary_loss_clip": 0.01069644, + "auxiliary_loss_mlp": 0.01032871, + "balance_loss_clip": 1.03083551, + "balance_loss_mlp": 1.01959062, + "epoch": 0.8407635653088832, + "flos": 26360084367360.0, + "grad_norm": 1.7674861859482776, + "language_loss": 0.79221404, + "learning_rate": 2.6005252235906684e-07, + "loss": 0.8132391, + "num_input_tokens_seen": 301651480, + "step": 13984, + "time_per_iteration": 2.7457640171051025 + }, + { + "auxiliary_loss_clip": 0.01094285, + "auxiliary_loss_mlp": 0.01033616, + "balance_loss_clip": 1.03387666, + "balance_loss_mlp": 1.02156985, + "epoch": 0.8408236885615512, + "flos": 21468799693440.0, + "grad_norm": 2.394750373647798, + "language_loss": 0.59764493, + "learning_rate": 2.598605125513842e-07, + "loss": 0.6189239, + "num_input_tokens_seen": 301670010, + "step": 13985, + "time_per_iteration": 2.6200110912323 + }, + { + "auxiliary_loss_clip": 0.01067816, + "auxiliary_loss_mlp": 0.01029355, + "balance_loss_clip": 1.03496993, + "balance_loss_mlp": 1.01653397, + "epoch": 0.8408838118142191, + "flos": 22963724853120.0, + "grad_norm": 1.5708649671091988, + "language_loss": 0.82083929, + "learning_rate": 2.5966856872996467e-07, + "loss": 0.84181106, + "num_input_tokens_seen": 301689785, + "step": 13986, + "time_per_iteration": 2.728940725326538 + }, + { + "auxiliary_loss_clip": 0.01088746, + "auxiliary_loss_mlp": 0.0077024, + "balance_loss_clip": 1.03921962, + "balance_loss_mlp": 1.00023127, + "epoch": 0.8409439350668871, + "flos": 26800026145920.0, + "grad_norm": 1.4303842163720517, + "language_loss": 0.6583513, + "learning_rate": 2.5947669090208755e-07, + "loss": 0.67694116, + "num_input_tokens_seen": 301712225, + "step": 13987, + "time_per_iteration": 4.393038988113403 + }, + { + "auxiliary_loss_clip": 0.01109413, + "auxiliary_loss_mlp": 0.00770439, + "balance_loss_clip": 1.03814602, + "balance_loss_mlp": 1.00023389, + "epoch": 0.841004058319555, + "flos": 26578672583040.0, + "grad_norm": 2.100312722202425, + "language_loss": 0.67510009, + "learning_rate": 2.5928487907502906e-07, + "loss": 0.69389856, + "num_input_tokens_seen": 301730955, + "step": 13988, + "time_per_iteration": 4.25507664680481 + }, + { + "auxiliary_loss_clip": 0.01099532, + "auxiliary_loss_mlp": 0.01036728, + "balance_loss_clip": 1.04084682, + "balance_loss_mlp": 1.02341866, + "epoch": 0.8410641815722231, + "flos": 14501878680960.0, + "grad_norm": 2.2740432778318143, + "language_loss": 0.81379843, + "learning_rate": 2.590931332560622e-07, + "loss": 0.83516109, + "num_input_tokens_seen": 301746930, + "step": 13989, + "time_per_iteration": 2.584982395172119 + }, + { + "auxiliary_loss_clip": 0.01096831, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.03519654, + "balance_loss_mlp": 1.01829755, + "epoch": 0.841124304824891, + "flos": 29166682475520.0, + "grad_norm": 1.6804070387823404, + "language_loss": 0.75063282, + "learning_rate": 2.5890145345245826e-07, + "loss": 0.77190965, + "num_input_tokens_seen": 301766945, + "step": 13990, + "time_per_iteration": 4.359278440475464 + }, + { + "auxiliary_loss_clip": 0.01093958, + "auxiliary_loss_mlp": 0.01031281, + "balance_loss_clip": 1.03545666, + "balance_loss_mlp": 1.01897252, + "epoch": 0.841184428077559, + "flos": 22412028885120.0, + "grad_norm": 1.7221123856133072, + "language_loss": 0.80666637, + "learning_rate": 2.5870983967148597e-07, + "loss": 0.82791877, + "num_input_tokens_seen": 301785460, + "step": 13991, + "time_per_iteration": 4.206341743469238 + }, + { + "auxiliary_loss_clip": 0.01070481, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.0353756, + "balance_loss_mlp": 1.0215528, + "epoch": 0.841244551330227, + "flos": 22962791099520.0, + "grad_norm": 2.0828857174593263, + "language_loss": 0.70396578, + "learning_rate": 2.585182919204105e-07, + "loss": 0.72500479, + "num_input_tokens_seen": 301804180, + "step": 13992, + "time_per_iteration": 2.692427158355713 + }, + { + "auxiliary_loss_clip": 0.01075291, + "auxiliary_loss_mlp": 0.01027217, + "balance_loss_clip": 1.03414965, + "balance_loss_mlp": 1.01490271, + "epoch": 0.8413046745828949, + "flos": 21032736583680.0, + "grad_norm": 3.8503455300269427, + "language_loss": 0.76960343, + "learning_rate": 2.583268102064959e-07, + "loss": 0.79062855, + "num_input_tokens_seen": 301823670, + "step": 13993, + "time_per_iteration": 2.704113006591797 + }, + { + "auxiliary_loss_clip": 0.01102579, + "auxiliary_loss_mlp": 0.01036453, + "balance_loss_clip": 1.0354774, + "balance_loss_mlp": 1.02206421, + "epoch": 0.841364797835563, + "flos": 27052082858880.0, + "grad_norm": 6.277502737271607, + "language_loss": 0.74242276, + "learning_rate": 2.5813539453700393e-07, + "loss": 0.76381308, + "num_input_tokens_seen": 301845890, + "step": 13994, + "time_per_iteration": 2.6997077465057373 + }, + { + "auxiliary_loss_clip": 0.01094097, + "auxiliary_loss_mlp": 0.01029334, + "balance_loss_clip": 1.0345273, + "balance_loss_mlp": 1.01830745, + "epoch": 0.8414249210882309, + "flos": 17895688329600.0, + "grad_norm": 1.5616526510476096, + "language_loss": 0.5941689, + "learning_rate": 2.5794404491919163e-07, + "loss": 0.61540318, + "num_input_tokens_seen": 301863985, + "step": 13995, + "time_per_iteration": 2.6176936626434326 + }, + { + "auxiliary_loss_clip": 0.01095561, + "auxiliary_loss_mlp": 0.01031608, + "balance_loss_clip": 1.035285, + "balance_loss_mlp": 1.01885819, + "epoch": 0.8414850443408989, + "flos": 25441201618560.0, + "grad_norm": 2.764700779392295, + "language_loss": 0.71651798, + "learning_rate": 2.577527613603163e-07, + "loss": 0.73778963, + "num_input_tokens_seen": 301882765, + "step": 13996, + "time_per_iteration": 2.596438407897949 + }, + { + "auxiliary_loss_clip": 0.0108265, + "auxiliary_loss_mlp": 0.01030717, + "balance_loss_clip": 1.03388953, + "balance_loss_mlp": 1.01880229, + "epoch": 0.8415451675935668, + "flos": 23220055284480.0, + "grad_norm": 1.7462285917475873, + "language_loss": 0.64240086, + "learning_rate": 2.5756154386763017e-07, + "loss": 0.66353452, + "num_input_tokens_seen": 301902720, + "step": 13997, + "time_per_iteration": 2.7167398929595947 + }, + { + "auxiliary_loss_clip": 0.01087567, + "auxiliary_loss_mlp": 0.01035998, + "balance_loss_clip": 1.03863931, + "balance_loss_mlp": 1.02296853, + "epoch": 0.8416052908462348, + "flos": 18546496899840.0, + "grad_norm": 1.8459858361991137, + "language_loss": 0.82516265, + "learning_rate": 2.5737039244838565e-07, + "loss": 0.84639835, + "num_input_tokens_seen": 301921245, + "step": 13998, + "time_per_iteration": 2.6906321048736572 + }, + { + "auxiliary_loss_clip": 0.0110001, + "auxiliary_loss_mlp": 0.00769946, + "balance_loss_clip": 1.03833914, + "balance_loss_mlp": 1.00016832, + "epoch": 0.8416654140989027, + "flos": 26105190480000.0, + "grad_norm": 2.037627492824348, + "language_loss": 0.80260479, + "learning_rate": 2.5717930710982984e-07, + "loss": 0.82130432, + "num_input_tokens_seen": 301942320, + "step": 13999, + "time_per_iteration": 2.679971218109131 + }, + { + "auxiliary_loss_clip": 0.01098013, + "auxiliary_loss_mlp": 0.01033879, + "balance_loss_clip": 1.03585649, + "balance_loss_mlp": 1.02033651, + "epoch": 0.8417255373515707, + "flos": 26433270328320.0, + "grad_norm": 2.9164172994343946, + "language_loss": 0.66541272, + "learning_rate": 2.569882878592096e-07, + "loss": 0.68673158, + "num_input_tokens_seen": 301963110, + "step": 14000, + "time_per_iteration": 2.6393961906433105 + }, + { + "auxiliary_loss_clip": 0.011048, + "auxiliary_loss_mlp": 0.01028492, + "balance_loss_clip": 1.03878963, + "balance_loss_mlp": 1.01545656, + "epoch": 0.8417856606042387, + "flos": 24717745791360.0, + "grad_norm": 1.439326835594235, + "language_loss": 0.79285717, + "learning_rate": 2.5679733470376885e-07, + "loss": 0.81419003, + "num_input_tokens_seen": 301984915, + "step": 14001, + "time_per_iteration": 2.6358094215393066 + }, + { + "auxiliary_loss_clip": 0.01045692, + "auxiliary_loss_mlp": 0.01031649, + "balance_loss_clip": 1.0337944, + "balance_loss_mlp": 1.01975203, + "epoch": 0.8418457838569067, + "flos": 20850849089280.0, + "grad_norm": 1.7593384488852517, + "language_loss": 0.78821921, + "learning_rate": 2.5660644765074703e-07, + "loss": 0.80899262, + "num_input_tokens_seen": 302004095, + "step": 14002, + "time_per_iteration": 2.7560184001922607 + }, + { + "auxiliary_loss_clip": 0.01062189, + "auxiliary_loss_mlp": 0.00769355, + "balance_loss_clip": 1.03507459, + "balance_loss_mlp": 1.00019288, + "epoch": 0.8419059071095746, + "flos": 28660629715200.0, + "grad_norm": 1.490278429458478, + "language_loss": 0.78022176, + "learning_rate": 2.5641562670738334e-07, + "loss": 0.79853719, + "num_input_tokens_seen": 302027250, + "step": 14003, + "time_per_iteration": 2.792100429534912 + }, + { + "auxiliary_loss_clip": 0.01083114, + "auxiliary_loss_mlp": 0.01028819, + "balance_loss_clip": 1.03756177, + "balance_loss_mlp": 1.01619506, + "epoch": 0.8419660303622426, + "flos": 21653596189440.0, + "grad_norm": 4.275398079582637, + "language_loss": 0.65523028, + "learning_rate": 2.5622487188091436e-07, + "loss": 0.67634964, + "num_input_tokens_seen": 302046950, + "step": 14004, + "time_per_iteration": 2.676882028579712 + }, + { + "auxiliary_loss_clip": 0.01098301, + "auxiliary_loss_mlp": 0.01032889, + "balance_loss_clip": 1.03571546, + "balance_loss_mlp": 1.01909709, + "epoch": 0.8420261536149106, + "flos": 25301114576640.0, + "grad_norm": 2.012358102157947, + "language_loss": 0.76216292, + "learning_rate": 2.560341831785724e-07, + "loss": 0.7834748, + "num_input_tokens_seen": 302065470, + "step": 14005, + "time_per_iteration": 2.6246840953826904 + }, + { + "auxiliary_loss_clip": 0.01072567, + "auxiliary_loss_mlp": 0.00770849, + "balance_loss_clip": 1.03307796, + "balance_loss_mlp": 1.00026453, + "epoch": 0.8420862768675785, + "flos": 18763397176320.0, + "grad_norm": 1.64958735251114, + "language_loss": 0.77457279, + "learning_rate": 2.5584356060758906e-07, + "loss": 0.7930069, + "num_input_tokens_seen": 302083190, + "step": 14006, + "time_per_iteration": 2.686828136444092 + }, + { + "auxiliary_loss_clip": 0.01098645, + "auxiliary_loss_mlp": 0.01036893, + "balance_loss_clip": 1.03723645, + "balance_loss_mlp": 1.02451313, + "epoch": 0.8421464001202466, + "flos": 18328052338560.0, + "grad_norm": 2.595732898924613, + "language_loss": 0.76791775, + "learning_rate": 2.556530041751932e-07, + "loss": 0.78927308, + "num_input_tokens_seen": 302098820, + "step": 14007, + "time_per_iteration": 2.5972254276275635 + }, + { + "auxiliary_loss_clip": 0.01081698, + "auxiliary_loss_mlp": 0.01034223, + "balance_loss_clip": 1.03605211, + "balance_loss_mlp": 1.02137184, + "epoch": 0.8422065233729145, + "flos": 31537181560320.0, + "grad_norm": 2.375931901998386, + "language_loss": 0.65710688, + "learning_rate": 2.554625138886102e-07, + "loss": 0.67826605, + "num_input_tokens_seen": 302117075, + "step": 14008, + "time_per_iteration": 2.700505256652832 + }, + { + "auxiliary_loss_clip": 0.01019521, + "auxiliary_loss_mlp": 0.01001888, + "balance_loss_clip": 1.00692487, + "balance_loss_mlp": 1.00089824, + "epoch": 0.8422666466255825, + "flos": 64298128510080.0, + "grad_norm": 0.7086546699989251, + "language_loss": 0.5692749, + "learning_rate": 2.552720897550631e-07, + "loss": 0.58948898, + "num_input_tokens_seen": 302179735, + "step": 14009, + "time_per_iteration": 3.2387187480926514 + }, + { + "auxiliary_loss_clip": 0.01039857, + "auxiliary_loss_mlp": 0.01034967, + "balance_loss_clip": 1.03280532, + "balance_loss_mlp": 1.02329016, + "epoch": 0.8423267698782504, + "flos": 24316731377280.0, + "grad_norm": 1.394156026072437, + "language_loss": 0.77893424, + "learning_rate": 2.5508173178177304e-07, + "loss": 0.79968244, + "num_input_tokens_seen": 302202055, + "step": 14010, + "time_per_iteration": 2.8507986068725586 + }, + { + "auxiliary_loss_clip": 0.01113646, + "auxiliary_loss_mlp": 0.01037844, + "balance_loss_clip": 1.03962326, + "balance_loss_mlp": 1.0242126, + "epoch": 0.8423868931309184, + "flos": 18296092212480.0, + "grad_norm": 1.6155120229975741, + "language_loss": 0.72607601, + "learning_rate": 2.548914399759592e-07, + "loss": 0.7475909, + "num_input_tokens_seen": 302221360, + "step": 14011, + "time_per_iteration": 2.614745855331421 + }, + { + "auxiliary_loss_clip": 0.01093355, + "auxiliary_loss_mlp": 0.01039684, + "balance_loss_clip": 1.0365963, + "balance_loss_mlp": 1.02718472, + "epoch": 0.8424470163835863, + "flos": 23550218121600.0, + "grad_norm": 1.762716946245802, + "language_loss": 0.84175313, + "learning_rate": 2.5470121434483636e-07, + "loss": 0.86308348, + "num_input_tokens_seen": 302240715, + "step": 14012, + "time_per_iteration": 2.872255325317383 + }, + { + "auxiliary_loss_clip": 0.01100527, + "auxiliary_loss_mlp": 0.01030747, + "balance_loss_clip": 1.03485525, + "balance_loss_mlp": 1.02031064, + "epoch": 0.8425071396362543, + "flos": 23769488695680.0, + "grad_norm": 1.7021120391885685, + "language_loss": 0.67887056, + "learning_rate": 2.5451105489561884e-07, + "loss": 0.70018327, + "num_input_tokens_seen": 302260950, + "step": 14013, + "time_per_iteration": 2.603848457336426 + }, + { + "auxiliary_loss_clip": 0.01115809, + "auxiliary_loss_mlp": 0.01036207, + "balance_loss_clip": 1.03945398, + "balance_loss_mlp": 1.02304602, + "epoch": 0.8425672628889223, + "flos": 16178906816640.0, + "grad_norm": 3.415074767080469, + "language_loss": 0.78946209, + "learning_rate": 2.5432096163551644e-07, + "loss": 0.81098223, + "num_input_tokens_seen": 302277500, + "step": 14014, + "time_per_iteration": 2.555556297302246 + }, + { + "auxiliary_loss_clip": 0.01077492, + "auxiliary_loss_mlp": 0.00770145, + "balance_loss_clip": 1.03449714, + "balance_loss_mlp": 1.00027716, + "epoch": 0.8426273861415903, + "flos": 23149131880320.0, + "grad_norm": 1.667905320409494, + "language_loss": 0.67027128, + "learning_rate": 2.5413093457173884e-07, + "loss": 0.68874758, + "num_input_tokens_seen": 302297930, + "step": 14015, + "time_per_iteration": 2.7183566093444824 + }, + { + "auxiliary_loss_clip": 0.011092, + "auxiliary_loss_mlp": 0.01031339, + "balance_loss_clip": 1.03803563, + "balance_loss_mlp": 1.0183686, + "epoch": 0.8426875093942582, + "flos": 17457757712640.0, + "grad_norm": 5.511316765631844, + "language_loss": 0.76168728, + "learning_rate": 2.5394097371149036e-07, + "loss": 0.78309268, + "num_input_tokens_seen": 302315735, + "step": 14016, + "time_per_iteration": 2.5260772705078125 + }, + { + "auxiliary_loss_clip": 0.01086806, + "auxiliary_loss_mlp": 0.01032087, + "balance_loss_clip": 1.03610539, + "balance_loss_mlp": 1.01919413, + "epoch": 0.8427476326469262, + "flos": 19640551299840.0, + "grad_norm": 1.8433329789592472, + "language_loss": 0.79657745, + "learning_rate": 2.5375107906197544e-07, + "loss": 0.81776643, + "num_input_tokens_seen": 302332790, + "step": 14017, + "time_per_iteration": 2.630877733230591 + }, + { + "auxiliary_loss_clip": 0.01087127, + "auxiliary_loss_mlp": 0.0103184, + "balance_loss_clip": 1.03714514, + "balance_loss_mlp": 1.02009821, + "epoch": 0.8428077558995941, + "flos": 11941160146560.0, + "grad_norm": 2.433761002198627, + "language_loss": 0.63508832, + "learning_rate": 2.5356125063039525e-07, + "loss": 0.65627795, + "num_input_tokens_seen": 302346490, + "step": 14018, + "time_per_iteration": 2.600435256958008 + }, + { + "auxiliary_loss_clip": 0.0109746, + "auxiliary_loss_mlp": 0.01036009, + "balance_loss_clip": 1.03713536, + "balance_loss_mlp": 1.02413559, + "epoch": 0.8428678791522621, + "flos": 10451729767680.0, + "grad_norm": 2.058952264097869, + "language_loss": 0.79526985, + "learning_rate": 2.5337148842394687e-07, + "loss": 0.81660461, + "num_input_tokens_seen": 302363235, + "step": 14019, + "time_per_iteration": 2.606147289276123 + }, + { + "auxiliary_loss_clip": 0.01066617, + "auxiliary_loss_mlp": 0.01042966, + "balance_loss_clip": 1.0320183, + "balance_loss_mlp": 1.02731419, + "epoch": 0.8429280024049302, + "flos": 28767248259840.0, + "grad_norm": 1.8880951217635216, + "language_loss": 0.78381932, + "learning_rate": 2.531817924498265e-07, + "loss": 0.80491519, + "num_input_tokens_seen": 302383270, + "step": 14020, + "time_per_iteration": 2.761439561843872 + }, + { + "auxiliary_loss_clip": 0.01094532, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.03691518, + "balance_loss_mlp": 1.01528084, + "epoch": 0.8429881256575981, + "flos": 19537093152000.0, + "grad_norm": 1.619318951916878, + "language_loss": 0.71194899, + "learning_rate": 2.5299216271522805e-07, + "loss": 0.73316747, + "num_input_tokens_seen": 302401355, + "step": 14021, + "time_per_iteration": 2.5756282806396484 + }, + { + "auxiliary_loss_clip": 0.01082102, + "auxiliary_loss_mlp": 0.01039813, + "balance_loss_clip": 1.03787649, + "balance_loss_mlp": 1.02695012, + "epoch": 0.8430482489102661, + "flos": 24790931752320.0, + "grad_norm": 1.6917414821142582, + "language_loss": 0.69565594, + "learning_rate": 2.5280259922734125e-07, + "loss": 0.71687508, + "num_input_tokens_seen": 302419515, + "step": 14022, + "time_per_iteration": 2.654576301574707 + }, + { + "auxiliary_loss_clip": 0.01053571, + "auxiliary_loss_mlp": 0.01034251, + "balance_loss_clip": 1.03549337, + "balance_loss_mlp": 1.02110815, + "epoch": 0.843108372162934, + "flos": 21544248211200.0, + "grad_norm": 2.0704880658750264, + "language_loss": 0.72135806, + "learning_rate": 2.526131019933553e-07, + "loss": 0.74223632, + "num_input_tokens_seen": 302438280, + "step": 14023, + "time_per_iteration": 2.763561248779297 + }, + { + "auxiliary_loss_clip": 0.01097817, + "auxiliary_loss_mlp": 0.01036537, + "balance_loss_clip": 1.03748226, + "balance_loss_mlp": 1.02365077, + "epoch": 0.843168495415602, + "flos": 24608792862720.0, + "grad_norm": 3.1279379432314496, + "language_loss": 0.66840017, + "learning_rate": 2.524236710204559e-07, + "loss": 0.68974364, + "num_input_tokens_seen": 302460860, + "step": 14024, + "time_per_iteration": 2.6798737049102783 + }, + { + "auxiliary_loss_clip": 0.01094098, + "auxiliary_loss_mlp": 0.01033274, + "balance_loss_clip": 1.0358882, + "balance_loss_mlp": 1.02064943, + "epoch": 0.8432286186682699, + "flos": 15122738286720.0, + "grad_norm": 1.7785534436425128, + "language_loss": 0.80463433, + "learning_rate": 2.522343063158261e-07, + "loss": 0.82590806, + "num_input_tokens_seen": 302476980, + "step": 14025, + "time_per_iteration": 2.5957210063934326 + }, + { + "auxiliary_loss_clip": 0.01094269, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.03669548, + "balance_loss_mlp": 1.02171469, + "epoch": 0.843288741920938, + "flos": 20301882554880.0, + "grad_norm": 1.7599057641282252, + "language_loss": 0.77854842, + "learning_rate": 2.5204500788664606e-07, + "loss": 0.79981452, + "num_input_tokens_seen": 302496380, + "step": 14026, + "time_per_iteration": 4.200474500656128 + }, + { + "auxiliary_loss_clip": 0.01082991, + "auxiliary_loss_mlp": 0.01035909, + "balance_loss_clip": 1.03349411, + "balance_loss_mlp": 1.02325487, + "epoch": 0.8433488651736059, + "flos": 23332096782720.0, + "grad_norm": 1.3875448337644876, + "language_loss": 0.8256402, + "learning_rate": 2.518557757400945e-07, + "loss": 0.84682918, + "num_input_tokens_seen": 302516845, + "step": 14027, + "time_per_iteration": 2.649754524230957 + }, + { + "auxiliary_loss_clip": 0.01083401, + "auxiliary_loss_mlp": 0.01029357, + "balance_loss_clip": 1.03570163, + "balance_loss_mlp": 1.01768661, + "epoch": 0.8434089884262739, + "flos": 39458105844480.0, + "grad_norm": 1.6608095267116312, + "language_loss": 0.56683648, + "learning_rate": 2.5166660988334754e-07, + "loss": 0.58796406, + "num_input_tokens_seen": 302538865, + "step": 14028, + "time_per_iteration": 4.3750526905059814 + }, + { + "auxiliary_loss_clip": 0.01082684, + "auxiliary_loss_mlp": 0.0102599, + "balance_loss_clip": 1.03466272, + "balance_loss_mlp": 1.01414621, + "epoch": 0.8434691116789418, + "flos": 23768842250880.0, + "grad_norm": 2.5757916535354304, + "language_loss": 0.64079869, + "learning_rate": 2.51477510323578e-07, + "loss": 0.66188538, + "num_input_tokens_seen": 302557970, + "step": 14029, + "time_per_iteration": 4.223181962966919 + }, + { + "auxiliary_loss_clip": 0.01105336, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.037485, + "balance_loss_mlp": 1.02116728, + "epoch": 0.8435292349316098, + "flos": 22671411972480.0, + "grad_norm": 2.654906642720587, + "language_loss": 0.7511518, + "learning_rate": 2.51288477067956e-07, + "loss": 0.77253079, + "num_input_tokens_seen": 302578915, + "step": 14030, + "time_per_iteration": 4.182165145874023 + }, + { + "auxiliary_loss_clip": 0.01087432, + "auxiliary_loss_mlp": 0.01035541, + "balance_loss_clip": 1.03771615, + "balance_loss_mlp": 1.02353668, + "epoch": 0.8435893581842777, + "flos": 18843622202880.0, + "grad_norm": 1.6649016404625991, + "language_loss": 0.83075505, + "learning_rate": 2.510995101236502e-07, + "loss": 0.85198474, + "num_input_tokens_seen": 302596300, + "step": 14031, + "time_per_iteration": 2.6392641067504883 + }, + { + "auxiliary_loss_clip": 0.01084779, + "auxiliary_loss_mlp": 0.01029478, + "balance_loss_clip": 1.03526592, + "balance_loss_mlp": 1.01829624, + "epoch": 0.8436494814369457, + "flos": 20704225772160.0, + "grad_norm": 1.8193190780443504, + "language_loss": 0.80412525, + "learning_rate": 2.509106094978266e-07, + "loss": 0.82526779, + "num_input_tokens_seen": 302614975, + "step": 14032, + "time_per_iteration": 2.640856981277466 + }, + { + "auxiliary_loss_clip": 0.01072594, + "auxiliary_loss_mlp": 0.01035806, + "balance_loss_clip": 1.03194261, + "balance_loss_mlp": 1.02130389, + "epoch": 0.8437096046896138, + "flos": 22674177319680.0, + "grad_norm": 1.5175948868756235, + "language_loss": 0.75642312, + "learning_rate": 2.507217751976478e-07, + "loss": 0.77750713, + "num_input_tokens_seen": 302636415, + "step": 14033, + "time_per_iteration": 2.6690027713775635 + }, + { + "auxiliary_loss_clip": 0.01070256, + "auxiliary_loss_mlp": 0.01037556, + "balance_loss_clip": 1.03320062, + "balance_loss_mlp": 1.02597451, + "epoch": 0.8437697279422817, + "flos": 16180127879040.0, + "grad_norm": 1.777468155857912, + "language_loss": 0.83613944, + "learning_rate": 2.505330072302743e-07, + "loss": 0.85721743, + "num_input_tokens_seen": 302653605, + "step": 14034, + "time_per_iteration": 2.765951156616211 + }, + { + "auxiliary_loss_clip": 0.01074581, + "auxiliary_loss_mlp": 0.01032138, + "balance_loss_clip": 1.03461361, + "balance_loss_mlp": 1.01791012, + "epoch": 0.8438298511949497, + "flos": 28765847629440.0, + "grad_norm": 1.4932135863758922, + "language_loss": 0.78466785, + "learning_rate": 2.503443056028656e-07, + "loss": 0.80573499, + "num_input_tokens_seen": 302673965, + "step": 14035, + "time_per_iteration": 2.76178240776062 + }, + { + "auxiliary_loss_clip": 0.01093632, + "auxiliary_loss_mlp": 0.01036882, + "balance_loss_clip": 1.03451896, + "balance_loss_mlp": 1.02403092, + "epoch": 0.8438899744476176, + "flos": 33724284779520.0, + "grad_norm": 1.361908156116711, + "language_loss": 0.72181344, + "learning_rate": 2.501556703225751e-07, + "loss": 0.74311858, + "num_input_tokens_seen": 302695560, + "step": 14036, + "time_per_iteration": 2.719937562942505 + }, + { + "auxiliary_loss_clip": 0.01103676, + "auxiliary_loss_mlp": 0.01025959, + "balance_loss_clip": 1.03616214, + "balance_loss_mlp": 1.01573718, + "epoch": 0.8439500977002856, + "flos": 25110787386240.0, + "grad_norm": 1.7256131227226181, + "language_loss": 0.69647789, + "learning_rate": 2.49967101396557e-07, + "loss": 0.71777427, + "num_input_tokens_seen": 302713480, + "step": 14037, + "time_per_iteration": 2.581303596496582 + }, + { + "auxiliary_loss_clip": 0.01107935, + "auxiliary_loss_mlp": 0.01026894, + "balance_loss_clip": 1.03714299, + "balance_loss_mlp": 1.01509237, + "epoch": 0.8440102209529535, + "flos": 32850362880000.0, + "grad_norm": 1.8136551952010338, + "language_loss": 0.69107378, + "learning_rate": 2.4977859883196227e-07, + "loss": 0.71242201, + "num_input_tokens_seen": 302736860, + "step": 14038, + "time_per_iteration": 2.6723809242248535 + }, + { + "auxiliary_loss_clip": 0.01051869, + "auxiliary_loss_mlp": 0.01039696, + "balance_loss_clip": 1.03102171, + "balance_loss_mlp": 1.02648771, + "epoch": 0.8440703442056215, + "flos": 23730202195200.0, + "grad_norm": 1.5801528390801436, + "language_loss": 0.76572794, + "learning_rate": 2.49590162635938e-07, + "loss": 0.78664356, + "num_input_tokens_seen": 302757745, + "step": 14039, + "time_per_iteration": 2.721997022628784 + }, + { + "auxiliary_loss_clip": 0.0111525, + "auxiliary_loss_mlp": 0.01028155, + "balance_loss_clip": 1.03972554, + "balance_loss_mlp": 1.01560223, + "epoch": 0.8441304674582895, + "flos": 20193719725440.0, + "grad_norm": 1.8472982875687889, + "language_loss": 0.79579616, + "learning_rate": 2.4940179281563046e-07, + "loss": 0.81723017, + "num_input_tokens_seen": 302774885, + "step": 14040, + "time_per_iteration": 2.531126022338867 + }, + { + "auxiliary_loss_clip": 0.01077191, + "auxiliary_loss_mlp": 0.01039289, + "balance_loss_clip": 1.03676081, + "balance_loss_mlp": 1.02646196, + "epoch": 0.8441905907109575, + "flos": 20219897761920.0, + "grad_norm": 1.9576992195932046, + "language_loss": 0.69267452, + "learning_rate": 2.492134893781821e-07, + "loss": 0.71383929, + "num_input_tokens_seen": 302791035, + "step": 14041, + "time_per_iteration": 2.749387741088867 + }, + { + "auxiliary_loss_clip": 0.01087824, + "auxiliary_loss_mlp": 0.01037084, + "balance_loss_clip": 1.03482831, + "balance_loss_mlp": 1.02474546, + "epoch": 0.8442507139636254, + "flos": 13516453987200.0, + "grad_norm": 1.879973628715058, + "language_loss": 0.68978488, + "learning_rate": 2.490252523307341e-07, + "loss": 0.71103394, + "num_input_tokens_seen": 302808650, + "step": 14042, + "time_per_iteration": 2.656613826751709 + }, + { + "auxiliary_loss_clip": 0.01085316, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.03706896, + "balance_loss_mlp": 1.02187014, + "epoch": 0.8443108372162934, + "flos": 18220212731520.0, + "grad_norm": 3.089358843777989, + "language_loss": 0.74717695, + "learning_rate": 2.4883708168042373e-07, + "loss": 0.76836395, + "num_input_tokens_seen": 302824605, + "step": 14043, + "time_per_iteration": 2.633385181427002 + }, + { + "auxiliary_loss_clip": 0.01107453, + "auxiliary_loss_mlp": 0.00769638, + "balance_loss_clip": 1.03682041, + "balance_loss_mlp": 1.00010276, + "epoch": 0.8443709604689613, + "flos": 16105110324480.0, + "grad_norm": 2.7829513982165306, + "language_loss": 0.7167477, + "learning_rate": 2.486489774343865e-07, + "loss": 0.73551863, + "num_input_tokens_seen": 302840170, + "step": 14044, + "time_per_iteration": 2.5848615169525146 + }, + { + "auxiliary_loss_clip": 0.01085792, + "auxiliary_loss_mlp": 0.01029904, + "balance_loss_clip": 1.03579986, + "balance_loss_mlp": 1.01772702, + "epoch": 0.8444310837216293, + "flos": 18512130562560.0, + "grad_norm": 1.5511958815264777, + "language_loss": 0.74899876, + "learning_rate": 2.484609395997559e-07, + "loss": 0.77015567, + "num_input_tokens_seen": 302858320, + "step": 14045, + "time_per_iteration": 2.6302268505096436 + }, + { + "auxiliary_loss_clip": 0.01086761, + "auxiliary_loss_mlp": 0.00769733, + "balance_loss_clip": 1.03393674, + "balance_loss_mlp": 1.00021636, + "epoch": 0.8444912069742974, + "flos": 14939845211520.0, + "grad_norm": 1.9839329932661167, + "language_loss": 0.78436804, + "learning_rate": 2.4827296818366216e-07, + "loss": 0.80293298, + "num_input_tokens_seen": 302875255, + "step": 14046, + "time_per_iteration": 2.6413092613220215 + }, + { + "auxiliary_loss_clip": 0.01081685, + "auxiliary_loss_mlp": 0.01035179, + "balance_loss_clip": 1.03393447, + "balance_loss_mlp": 1.02033782, + "epoch": 0.8445513302269653, + "flos": 20120318282880.0, + "grad_norm": 2.2863023721610842, + "language_loss": 0.7816503, + "learning_rate": 2.4808506319323255e-07, + "loss": 0.80281889, + "num_input_tokens_seen": 302894690, + "step": 14047, + "time_per_iteration": 2.6660380363464355 + }, + { + "auxiliary_loss_clip": 0.01086084, + "auxiliary_loss_mlp": 0.0103182, + "balance_loss_clip": 1.03934455, + "balance_loss_mlp": 1.01988101, + "epoch": 0.8446114534796333, + "flos": 31170928533120.0, + "grad_norm": 1.8722756124069524, + "language_loss": 0.72262931, + "learning_rate": 2.478972246355935e-07, + "loss": 0.74380839, + "num_input_tokens_seen": 302912405, + "step": 14048, + "time_per_iteration": 2.750633716583252 + }, + { + "auxiliary_loss_clip": 0.01029086, + "auxiliary_loss_mlp": 0.01033735, + "balance_loss_clip": 1.03568673, + "balance_loss_mlp": 1.02149785, + "epoch": 0.8446715767323012, + "flos": 23948323534080.0, + "grad_norm": 1.6779262102032728, + "language_loss": 0.73663235, + "learning_rate": 2.477094525178667e-07, + "loss": 0.75726056, + "num_input_tokens_seen": 302932525, + "step": 14049, + "time_per_iteration": 3.1203606128692627 + }, + { + "auxiliary_loss_clip": 0.01019667, + "auxiliary_loss_mlp": 0.00751068, + "balance_loss_clip": 1.00710368, + "balance_loss_mlp": 0.99964279, + "epoch": 0.8447316999849692, + "flos": 67984897484160.0, + "grad_norm": 0.890275680771782, + "language_loss": 0.60581625, + "learning_rate": 2.475217468471729e-07, + "loss": 0.62352359, + "num_input_tokens_seen": 302991285, + "step": 14050, + "time_per_iteration": 3.2392361164093018 + }, + { + "auxiliary_loss_clip": 0.01082426, + "auxiliary_loss_mlp": 0.00772259, + "balance_loss_clip": 1.03367877, + "balance_loss_mlp": 1.00022018, + "epoch": 0.8447918232376371, + "flos": 22418924296320.0, + "grad_norm": 2.8956487608781036, + "language_loss": 0.72659081, + "learning_rate": 2.473341076306303e-07, + "loss": 0.74513769, + "num_input_tokens_seen": 303009515, + "step": 14051, + "time_per_iteration": 2.6661341190338135 + }, + { + "auxiliary_loss_clip": 0.01095777, + "auxiliary_loss_mlp": 0.01027878, + "balance_loss_clip": 1.03622103, + "balance_loss_mlp": 1.01606417, + "epoch": 0.8448519464903052, + "flos": 23694147918720.0, + "grad_norm": 1.811318817116214, + "language_loss": 0.74613708, + "learning_rate": 2.471465348753547e-07, + "loss": 0.76737368, + "num_input_tokens_seen": 303026905, + "step": 14052, + "time_per_iteration": 2.7032968997955322 + }, + { + "auxiliary_loss_clip": 0.0107808, + "auxiliary_loss_mlp": 0.01028694, + "balance_loss_clip": 1.03693604, + "balance_loss_mlp": 1.01800132, + "epoch": 0.8449120697429731, + "flos": 13735904129280.0, + "grad_norm": 2.027055068247027, + "language_loss": 0.73807508, + "learning_rate": 2.469590285884575e-07, + "loss": 0.75914282, + "num_input_tokens_seen": 303045245, + "step": 14053, + "time_per_iteration": 2.6814658641815186 + }, + { + "auxiliary_loss_clip": 0.01092814, + "auxiliary_loss_mlp": 0.01029071, + "balance_loss_clip": 1.03634143, + "balance_loss_mlp": 1.01660776, + "epoch": 0.8449721929956411, + "flos": 20886795624960.0, + "grad_norm": 1.652927903849228, + "language_loss": 0.73763537, + "learning_rate": 2.467715887770494e-07, + "loss": 0.75885427, + "num_input_tokens_seen": 303065205, + "step": 14054, + "time_per_iteration": 2.7116918563842773 + }, + { + "auxiliary_loss_clip": 0.01101862, + "auxiliary_loss_mlp": 0.0103134, + "balance_loss_clip": 1.03724992, + "balance_loss_mlp": 1.01904297, + "epoch": 0.845032316248309, + "flos": 33216939129600.0, + "grad_norm": 1.366097555200395, + "language_loss": 0.77969533, + "learning_rate": 2.4658421544823895e-07, + "loss": 0.8010273, + "num_input_tokens_seen": 303088250, + "step": 14055, + "time_per_iteration": 2.73816180229187 + }, + { + "auxiliary_loss_clip": 0.01096569, + "auxiliary_loss_mlp": 0.01036149, + "balance_loss_clip": 1.03655601, + "balance_loss_mlp": 1.02406144, + "epoch": 0.845092439500977, + "flos": 23585230903680.0, + "grad_norm": 1.670368654954579, + "language_loss": 0.72893804, + "learning_rate": 2.463969086091302e-07, + "loss": 0.75026524, + "num_input_tokens_seen": 303109280, + "step": 14056, + "time_per_iteration": 2.70538592338562 + }, + { + "auxiliary_loss_clip": 0.01102046, + "auxiliary_loss_mlp": 0.01036064, + "balance_loss_clip": 1.03741002, + "balance_loss_mlp": 1.02301073, + "epoch": 0.8451525627536449, + "flos": 13333920048000.0, + "grad_norm": 2.25806090872676, + "language_loss": 0.67116416, + "learning_rate": 2.4620966826682686e-07, + "loss": 0.69254524, + "num_input_tokens_seen": 303126075, + "step": 14057, + "time_per_iteration": 2.6500983238220215 + }, + { + "auxiliary_loss_clip": 0.01061896, + "auxiliary_loss_mlp": 0.01031163, + "balance_loss_clip": 1.03401434, + "balance_loss_mlp": 1.01830649, + "epoch": 0.8452126860063129, + "flos": 27817985583360.0, + "grad_norm": 14.445297752057405, + "language_loss": 0.77819413, + "learning_rate": 2.460224944284284e-07, + "loss": 0.79912472, + "num_input_tokens_seen": 303146920, + "step": 14058, + "time_per_iteration": 2.7543530464172363 + }, + { + "auxiliary_loss_clip": 0.01110927, + "auxiliary_loss_mlp": 0.0103341, + "balance_loss_clip": 1.03813863, + "balance_loss_mlp": 1.02150726, + "epoch": 0.845272809258981, + "flos": 27124694202240.0, + "grad_norm": 1.550470920076943, + "language_loss": 0.69772273, + "learning_rate": 2.45835387101033e-07, + "loss": 0.7191661, + "num_input_tokens_seen": 303167885, + "step": 14059, + "time_per_iteration": 2.6287412643432617 + }, + { + "auxiliary_loss_clip": 0.01112261, + "auxiliary_loss_mlp": 0.01036279, + "balance_loss_clip": 1.03764248, + "balance_loss_mlp": 1.02282071, + "epoch": 0.8453329325116489, + "flos": 18332577452160.0, + "grad_norm": 2.0803365731251278, + "language_loss": 0.57748783, + "learning_rate": 2.4564834629173516e-07, + "loss": 0.59897316, + "num_input_tokens_seen": 303185000, + "step": 14060, + "time_per_iteration": 2.5504209995269775 + }, + { + "auxiliary_loss_clip": 0.0108835, + "auxiliary_loss_mlp": 0.01037716, + "balance_loss_clip": 1.0332464, + "balance_loss_mlp": 1.02378607, + "epoch": 0.8453930557643169, + "flos": 22675254727680.0, + "grad_norm": 1.5991086812981428, + "language_loss": 0.756387, + "learning_rate": 2.454613720076277e-07, + "loss": 0.77764767, + "num_input_tokens_seen": 303205210, + "step": 14061, + "time_per_iteration": 2.6448512077331543 + }, + { + "auxiliary_loss_clip": 0.010831, + "auxiliary_loss_mlp": 0.0102804, + "balance_loss_clip": 1.0347625, + "balance_loss_mlp": 1.01493907, + "epoch": 0.8454531790169848, + "flos": 22487261921280.0, + "grad_norm": 2.9339159034227134, + "language_loss": 0.71316195, + "learning_rate": 2.452744642558013e-07, + "loss": 0.73427337, + "num_input_tokens_seen": 303224655, + "step": 14062, + "time_per_iteration": 2.6758151054382324 + }, + { + "auxiliary_loss_clip": 0.00988143, + "auxiliary_loss_mlp": 0.00999448, + "balance_loss_clip": 1.01111102, + "balance_loss_mlp": 0.99836904, + "epoch": 0.8455133022696528, + "flos": 58277848481280.0, + "grad_norm": 0.6380346194484136, + "language_loss": 0.52619612, + "learning_rate": 2.450876230433432e-07, + "loss": 0.54607201, + "num_input_tokens_seen": 303289645, + "step": 14063, + "time_per_iteration": 3.2946317195892334 + }, + { + "auxiliary_loss_clip": 0.01065561, + "auxiliary_loss_mlp": 0.01025729, + "balance_loss_clip": 1.03565407, + "balance_loss_mlp": 1.01469028, + "epoch": 0.8455734255223207, + "flos": 21361283308800.0, + "grad_norm": 1.9822663620489593, + "language_loss": 0.82145214, + "learning_rate": 2.449008483773378e-07, + "loss": 0.84236503, + "num_input_tokens_seen": 303308350, + "step": 14064, + "time_per_iteration": 2.6607656478881836 + }, + { + "auxiliary_loss_clip": 0.01101966, + "auxiliary_loss_mlp": 0.01033412, + "balance_loss_clip": 1.03909707, + "balance_loss_mlp": 1.02036476, + "epoch": 0.8456335487749888, + "flos": 20449260057600.0, + "grad_norm": 1.7113113897930685, + "language_loss": 0.72365153, + "learning_rate": 2.447141402648685e-07, + "loss": 0.74500531, + "num_input_tokens_seen": 303325230, + "step": 14065, + "time_per_iteration": 4.209578037261963 + }, + { + "auxiliary_loss_clip": 0.0107366, + "auxiliary_loss_mlp": 0.01027599, + "balance_loss_clip": 1.03522468, + "balance_loss_mlp": 1.01634598, + "epoch": 0.8456936720276567, + "flos": 28840901097600.0, + "grad_norm": 1.7659052654385863, + "language_loss": 0.77673888, + "learning_rate": 2.445274987130146e-07, + "loss": 0.79775143, + "num_input_tokens_seen": 303345810, + "step": 14066, + "time_per_iteration": 2.7587270736694336 + }, + { + "auxiliary_loss_clip": 0.01072656, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_clip": 1.03802919, + "balance_loss_mlp": 1.01832891, + "epoch": 0.8457537952803247, + "flos": 22672884430080.0, + "grad_norm": 1.438108024739344, + "language_loss": 0.69719791, + "learning_rate": 2.4434092372885363e-07, + "loss": 0.71823066, + "num_input_tokens_seen": 303365140, + "step": 14067, + "time_per_iteration": 4.787655353546143 + }, + { + "auxiliary_loss_clip": 0.01071366, + "auxiliary_loss_mlp": 0.01028654, + "balance_loss_clip": 1.03298759, + "balance_loss_mlp": 1.01651883, + "epoch": 0.8458139185329926, + "flos": 33802929607680.0, + "grad_norm": 2.205817987023731, + "language_loss": 0.71166551, + "learning_rate": 2.4415441531946144e-07, + "loss": 0.73266566, + "num_input_tokens_seen": 303386150, + "step": 14068, + "time_per_iteration": 4.351239204406738 + }, + { + "auxiliary_loss_clip": 0.00992733, + "auxiliary_loss_mlp": 0.01001464, + "balance_loss_clip": 1.00806594, + "balance_loss_mlp": 1.00047481, + "epoch": 0.8458740417856606, + "flos": 70295929603200.0, + "grad_norm": 0.6926239489661511, + "language_loss": 0.604882, + "learning_rate": 2.4396797349190976e-07, + "loss": 0.62482405, + "num_input_tokens_seen": 303453770, + "step": 14069, + "time_per_iteration": 5.011239290237427 + }, + { + "auxiliary_loss_clip": 0.01085111, + "auxiliary_loss_mlp": 0.01030137, + "balance_loss_clip": 1.0371033, + "balance_loss_mlp": 1.01844859, + "epoch": 0.8459341650383285, + "flos": 24170862245760.0, + "grad_norm": 1.60147052022308, + "language_loss": 0.74564326, + "learning_rate": 2.4378159825326804e-07, + "loss": 0.76679569, + "num_input_tokens_seen": 303474520, + "step": 14070, + "time_per_iteration": 2.651233196258545 + }, + { + "auxiliary_loss_clip": 0.01061032, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.03419256, + "balance_loss_mlp": 1.01784265, + "epoch": 0.8459942882909965, + "flos": 38181158369280.0, + "grad_norm": 3.793310499972189, + "language_loss": 0.66902626, + "learning_rate": 2.435952896106039e-07, + "loss": 0.68994129, + "num_input_tokens_seen": 303497345, + "step": 14071, + "time_per_iteration": 2.863940954208374 + }, + { + "auxiliary_loss_clip": 0.01019962, + "auxiliary_loss_mlp": 0.00751058, + "balance_loss_clip": 1.00760365, + "balance_loss_mlp": 0.99957699, + "epoch": 0.8460544115436646, + "flos": 64118252177280.0, + "grad_norm": 0.7316338741504227, + "language_loss": 0.61046565, + "learning_rate": 2.4340904757098313e-07, + "loss": 0.62817585, + "num_input_tokens_seen": 303554890, + "step": 14072, + "time_per_iteration": 3.041468858718872 + }, + { + "auxiliary_loss_clip": 0.01069973, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.0375272, + "balance_loss_mlp": 1.01801336, + "epoch": 0.8461145347963325, + "flos": 24170826332160.0, + "grad_norm": 2.698687896203199, + "language_loss": 0.72609383, + "learning_rate": 2.4322287214146664e-07, + "loss": 0.74710786, + "num_input_tokens_seen": 303574380, + "step": 14073, + "time_per_iteration": 2.7544729709625244 + }, + { + "auxiliary_loss_clip": 0.01091999, + "auxiliary_loss_mlp": 0.01034187, + "balance_loss_clip": 1.0379231, + "balance_loss_mlp": 1.02047241, + "epoch": 0.8461746580490005, + "flos": 34893787697280.0, + "grad_norm": 2.1945547327589976, + "language_loss": 0.78341836, + "learning_rate": 2.430367633291155e-07, + "loss": 0.80468023, + "num_input_tokens_seen": 303594910, + "step": 14074, + "time_per_iteration": 2.8085241317749023 + }, + { + "auxiliary_loss_clip": 0.01099175, + "auxiliary_loss_mlp": 0.01030891, + "balance_loss_clip": 1.03856003, + "balance_loss_mlp": 1.01867247, + "epoch": 0.8462347813016684, + "flos": 25557014044800.0, + "grad_norm": 2.0589509569637143, + "language_loss": 0.75481176, + "learning_rate": 2.4285072114098583e-07, + "loss": 0.77611244, + "num_input_tokens_seen": 303613520, + "step": 14075, + "time_per_iteration": 2.6737287044525146 + }, + { + "auxiliary_loss_clip": 0.01084327, + "auxiliary_loss_mlp": 0.01026954, + "balance_loss_clip": 1.03593063, + "balance_loss_mlp": 1.0144732, + "epoch": 0.8462949045543364, + "flos": 21325336773120.0, + "grad_norm": 2.312181037526072, + "language_loss": 0.73324478, + "learning_rate": 2.4266474558413355e-07, + "loss": 0.75435758, + "num_input_tokens_seen": 303631225, + "step": 14076, + "time_per_iteration": 2.6550984382629395 + }, + { + "auxiliary_loss_clip": 0.01091988, + "auxiliary_loss_mlp": 0.01032969, + "balance_loss_clip": 1.03691387, + "balance_loss_mlp": 1.02119696, + "epoch": 0.8463550278070043, + "flos": 22637440684800.0, + "grad_norm": 2.1680045577224543, + "language_loss": 0.78016102, + "learning_rate": 2.4247883666560945e-07, + "loss": 0.80141062, + "num_input_tokens_seen": 303649175, + "step": 14077, + "time_per_iteration": 2.7090940475463867 + }, + { + "auxiliary_loss_clip": 0.01075749, + "auxiliary_loss_mlp": 0.0103478, + "balance_loss_clip": 1.03529286, + "balance_loss_mlp": 1.022102, + "epoch": 0.8464151510596724, + "flos": 13005588804480.0, + "grad_norm": 3.514395307267888, + "language_loss": 0.75203717, + "learning_rate": 2.422929943924643e-07, + "loss": 0.77314246, + "num_input_tokens_seen": 303665915, + "step": 14078, + "time_per_iteration": 2.668720245361328 + }, + { + "auxiliary_loss_clip": 0.01069196, + "auxiliary_loss_mlp": 0.01025183, + "balance_loss_clip": 1.0366447, + "balance_loss_mlp": 1.01232052, + "epoch": 0.8464752743123403, + "flos": 15704921923200.0, + "grad_norm": 3.2183911644001237, + "language_loss": 0.85171533, + "learning_rate": 2.4210721877174565e-07, + "loss": 0.87265909, + "num_input_tokens_seen": 303679985, + "step": 14079, + "time_per_iteration": 2.7119951248168945 + }, + { + "auxiliary_loss_clip": 0.01084378, + "auxiliary_loss_mlp": 0.01037369, + "balance_loss_clip": 1.03693473, + "balance_loss_mlp": 1.02382135, + "epoch": 0.8465353975650083, + "flos": 21653955325440.0, + "grad_norm": 1.8047778580763019, + "language_loss": 0.5904004, + "learning_rate": 2.419215098104965e-07, + "loss": 0.61161786, + "num_input_tokens_seen": 303698470, + "step": 14080, + "time_per_iteration": 2.6963582038879395 + }, + { + "auxiliary_loss_clip": 0.01084298, + "auxiliary_loss_mlp": 0.01029709, + "balance_loss_clip": 1.03614235, + "balance_loss_mlp": 1.01678658, + "epoch": 0.8465955208176762, + "flos": 18515650095360.0, + "grad_norm": 2.4057964944506174, + "language_loss": 0.65874493, + "learning_rate": 2.4173586751576014e-07, + "loss": 0.67988491, + "num_input_tokens_seen": 303716415, + "step": 14081, + "time_per_iteration": 2.68113112449646 + }, + { + "auxiliary_loss_clip": 0.01096638, + "auxiliary_loss_mlp": 0.01035444, + "balance_loss_clip": 1.03579867, + "balance_loss_mlp": 1.0226047, + "epoch": 0.8466556440703442, + "flos": 24200559815040.0, + "grad_norm": 1.8302638990848867, + "language_loss": 0.72922373, + "learning_rate": 2.41550291894576e-07, + "loss": 0.75054455, + "num_input_tokens_seen": 303734490, + "step": 14082, + "time_per_iteration": 2.6815195083618164 + }, + { + "auxiliary_loss_clip": 0.01055673, + "auxiliary_loss_mlp": 0.01036867, + "balance_loss_clip": 1.03194714, + "balance_loss_mlp": 1.02327132, + "epoch": 0.8467157673230121, + "flos": 20375894528640.0, + "grad_norm": 2.0055254774422666, + "language_loss": 0.76295221, + "learning_rate": 2.413647829539809e-07, + "loss": 0.78387761, + "num_input_tokens_seen": 303752310, + "step": 14083, + "time_per_iteration": 2.7438621520996094 + }, + { + "auxiliary_loss_clip": 0.01061542, + "auxiliary_loss_mlp": 0.01034056, + "balance_loss_clip": 1.03273213, + "balance_loss_mlp": 1.01982224, + "epoch": 0.8467758905756801, + "flos": 28473642489600.0, + "grad_norm": 1.7272953079134175, + "language_loss": 0.65860844, + "learning_rate": 2.411793407010092e-07, + "loss": 0.67956436, + "num_input_tokens_seen": 303776065, + "step": 14084, + "time_per_iteration": 2.7735166549682617 + }, + { + "auxiliary_loss_clip": 0.01067401, + "auxiliary_loss_mlp": 0.01030615, + "balance_loss_clip": 1.03815413, + "balance_loss_mlp": 1.01846731, + "epoch": 0.8468360138283482, + "flos": 11692551139200.0, + "grad_norm": 2.7040264249801766, + "language_loss": 0.69605839, + "learning_rate": 2.409939651426938e-07, + "loss": 0.71703851, + "num_input_tokens_seen": 303793500, + "step": 14085, + "time_per_iteration": 2.773153781890869 + }, + { + "auxiliary_loss_clip": 0.01066275, + "auxiliary_loss_mlp": 0.0102846, + "balance_loss_clip": 1.03402877, + "balance_loss_mlp": 1.01666391, + "epoch": 0.8468961370810161, + "flos": 24607859109120.0, + "grad_norm": 1.5869517214902362, + "language_loss": 0.71034825, + "learning_rate": 2.408086562860634e-07, + "loss": 0.73129559, + "num_input_tokens_seen": 303814835, + "step": 14086, + "time_per_iteration": 2.778090476989746 + }, + { + "auxiliary_loss_clip": 0.01091608, + "auxiliary_loss_mlp": 0.01032202, + "balance_loss_clip": 1.03516686, + "balance_loss_mlp": 1.01981008, + "epoch": 0.8469562603336841, + "flos": 19609812236160.0, + "grad_norm": 1.9986258796414704, + "language_loss": 0.74891198, + "learning_rate": 2.4062341413814445e-07, + "loss": 0.77015007, + "num_input_tokens_seen": 303834505, + "step": 14087, + "time_per_iteration": 2.659958600997925 + }, + { + "auxiliary_loss_clip": 0.01080191, + "auxiliary_loss_mlp": 0.01026574, + "balance_loss_clip": 1.03761494, + "balance_loss_mlp": 1.01394975, + "epoch": 0.847016383586352, + "flos": 22638949056000.0, + "grad_norm": 1.3437593766156344, + "language_loss": 0.74087977, + "learning_rate": 2.4043823870596227e-07, + "loss": 0.76194739, + "num_input_tokens_seen": 303855050, + "step": 14088, + "time_per_iteration": 2.820697784423828 + }, + { + "auxiliary_loss_clip": 0.01099232, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.03691757, + "balance_loss_mlp": 1.01979518, + "epoch": 0.84707650683902, + "flos": 20960161153920.0, + "grad_norm": 2.494250359435125, + "language_loss": 0.7231648, + "learning_rate": 2.402531299965387e-07, + "loss": 0.74447769, + "num_input_tokens_seen": 303875635, + "step": 14089, + "time_per_iteration": 2.6343815326690674 + }, + { + "auxiliary_loss_clip": 0.01108775, + "auxiliary_loss_mlp": 0.01028953, + "balance_loss_clip": 1.03952324, + "balance_loss_mlp": 1.01720452, + "epoch": 0.8471366300916879, + "flos": 24093007516800.0, + "grad_norm": 1.3722946087239658, + "language_loss": 0.79204518, + "learning_rate": 2.400680880168928e-07, + "loss": 0.81342244, + "num_input_tokens_seen": 303896750, + "step": 14090, + "time_per_iteration": 2.6099236011505127 + }, + { + "auxiliary_loss_clip": 0.01053519, + "auxiliary_loss_mlp": 0.01040145, + "balance_loss_clip": 1.03225899, + "balance_loss_mlp": 1.02603018, + "epoch": 0.847196753344356, + "flos": 18332900674560.0, + "grad_norm": 2.9954684546587553, + "language_loss": 0.76710737, + "learning_rate": 2.3988311277404085e-07, + "loss": 0.78804398, + "num_input_tokens_seen": 303915435, + "step": 14091, + "time_per_iteration": 2.780735492706299 + }, + { + "auxiliary_loss_clip": 0.01028625, + "auxiliary_loss_mlp": 0.01002892, + "balance_loss_clip": 1.00622869, + "balance_loss_mlp": 1.00184846, + "epoch": 0.8472568765970239, + "flos": 49567536956160.0, + "grad_norm": 0.8179269563899582, + "language_loss": 0.59413207, + "learning_rate": 2.396982042749982e-07, + "loss": 0.61444724, + "num_input_tokens_seen": 303977245, + "step": 14092, + "time_per_iteration": 3.1960132122039795 + }, + { + "auxiliary_loss_clip": 0.01081941, + "auxiliary_loss_mlp": 0.01036569, + "balance_loss_clip": 1.03254557, + "balance_loss_mlp": 1.02275276, + "epoch": 0.8473169998496919, + "flos": 19279074781440.0, + "grad_norm": 1.7883321120809321, + "language_loss": 0.70245391, + "learning_rate": 2.395133625267756e-07, + "loss": 0.72363901, + "num_input_tokens_seen": 303996055, + "step": 14093, + "time_per_iteration": 2.6437125205993652 + }, + { + "auxiliary_loss_clip": 0.01105171, + "auxiliary_loss_mlp": 0.01025923, + "balance_loss_clip": 1.03583193, + "balance_loss_mlp": 1.01443684, + "epoch": 0.8473771231023598, + "flos": 17675555829120.0, + "grad_norm": 2.1012559182302866, + "language_loss": 0.83147365, + "learning_rate": 2.3932858753638263e-07, + "loss": 0.85278457, + "num_input_tokens_seen": 304012205, + "step": 14094, + "time_per_iteration": 2.5802862644195557 + }, + { + "auxiliary_loss_clip": 0.01089017, + "auxiliary_loss_mlp": 0.01031094, + "balance_loss_clip": 1.03741288, + "balance_loss_mlp": 1.01977515, + "epoch": 0.8474372463550278, + "flos": 26359761144960.0, + "grad_norm": 2.04303085122746, + "language_loss": 0.71497333, + "learning_rate": 2.3914387931082626e-07, + "loss": 0.7361744, + "num_input_tokens_seen": 304033475, + "step": 14095, + "time_per_iteration": 2.6501832008361816 + }, + { + "auxiliary_loss_clip": 0.01094545, + "auxiliary_loss_mlp": 0.00769791, + "balance_loss_clip": 1.03552461, + "balance_loss_mlp": 1.00019312, + "epoch": 0.8474973696076957, + "flos": 23402050519680.0, + "grad_norm": 1.904609327228077, + "language_loss": 0.80488968, + "learning_rate": 2.3895923785711105e-07, + "loss": 0.82353306, + "num_input_tokens_seen": 304051845, + "step": 14096, + "time_per_iteration": 2.644343376159668 + }, + { + "auxiliary_loss_clip": 0.0110016, + "auxiliary_loss_mlp": 0.01030989, + "balance_loss_clip": 1.03743386, + "balance_loss_mlp": 1.01790023, + "epoch": 0.8475574928603637, + "flos": 25075666863360.0, + "grad_norm": 8.034016369371804, + "language_loss": 0.77681863, + "learning_rate": 2.387746631822374e-07, + "loss": 0.79813015, + "num_input_tokens_seen": 304069965, + "step": 14097, + "time_per_iteration": 2.793025255203247 + }, + { + "auxiliary_loss_clip": 0.0107883, + "auxiliary_loss_mlp": 0.0102726, + "balance_loss_clip": 1.03687394, + "balance_loss_mlp": 1.01560712, + "epoch": 0.8476176161130318, + "flos": 19966691813760.0, + "grad_norm": 1.7033024624087645, + "language_loss": 0.802845, + "learning_rate": 2.385901552932048e-07, + "loss": 0.82390594, + "num_input_tokens_seen": 304086805, + "step": 14098, + "time_per_iteration": 2.675039052963257 + }, + { + "auxiliary_loss_clip": 0.01092536, + "auxiliary_loss_mlp": 0.00770177, + "balance_loss_clip": 1.03604007, + "balance_loss_mlp": 1.00013864, + "epoch": 0.8476777393656997, + "flos": 21285834791040.0, + "grad_norm": 1.8975178373976451, + "language_loss": 0.71665621, + "learning_rate": 2.3840571419701062e-07, + "loss": 0.73528326, + "num_input_tokens_seen": 304105865, + "step": 14099, + "time_per_iteration": 2.5827932357788086 + }, + { + "auxiliary_loss_clip": 0.01094872, + "auxiliary_loss_mlp": 0.01032372, + "balance_loss_clip": 1.03322709, + "balance_loss_mlp": 1.01883566, + "epoch": 0.8477378626183677, + "flos": 29971476650880.0, + "grad_norm": 2.118472556405624, + "language_loss": 0.63617903, + "learning_rate": 2.3822133990064787e-07, + "loss": 0.65745145, + "num_input_tokens_seen": 304128300, + "step": 14100, + "time_per_iteration": 2.723047971725464 + }, + { + "auxiliary_loss_clip": 0.01099377, + "auxiliary_loss_mlp": 0.01033502, + "balance_loss_clip": 1.03651261, + "balance_loss_mlp": 1.02066314, + "epoch": 0.8477979858710356, + "flos": 24237727413120.0, + "grad_norm": 2.0266984363046876, + "language_loss": 0.73806208, + "learning_rate": 2.380370324111085e-07, + "loss": 0.75939089, + "num_input_tokens_seen": 304143695, + "step": 14101, + "time_per_iteration": 2.6568257808685303 + }, + { + "auxiliary_loss_clip": 0.01098555, + "auxiliary_loss_mlp": 0.01028521, + "balance_loss_clip": 1.03505516, + "balance_loss_mlp": 1.01662445, + "epoch": 0.8478581091237036, + "flos": 25593678852480.0, + "grad_norm": 1.6724420871950227, + "language_loss": 0.71237093, + "learning_rate": 2.3785279173538163e-07, + "loss": 0.73364168, + "num_input_tokens_seen": 304165800, + "step": 14102, + "time_per_iteration": 2.72493052482605 + }, + { + "auxiliary_loss_clip": 0.01084921, + "auxiliary_loss_mlp": 0.01033048, + "balance_loss_clip": 1.03555894, + "balance_loss_mlp": 1.01940477, + "epoch": 0.8479182323763715, + "flos": 12057116227200.0, + "grad_norm": 2.4752629302772426, + "language_loss": 0.81727469, + "learning_rate": 2.3766861788045366e-07, + "loss": 0.83845437, + "num_input_tokens_seen": 304182910, + "step": 14103, + "time_per_iteration": 2.723888874053955 + }, + { + "auxiliary_loss_clip": 0.01109645, + "auxiliary_loss_mlp": 0.01030985, + "balance_loss_clip": 1.03859901, + "balance_loss_mlp": 1.01881981, + "epoch": 0.8479783556290396, + "flos": 21433391861760.0, + "grad_norm": 1.9133517562586435, + "language_loss": 0.78571969, + "learning_rate": 2.374845108533079e-07, + "loss": 0.80712605, + "num_input_tokens_seen": 304200175, + "step": 14104, + "time_per_iteration": 4.045240879058838 + }, + { + "auxiliary_loss_clip": 0.01101779, + "auxiliary_loss_mlp": 0.01037373, + "balance_loss_clip": 1.03928828, + "balance_loss_mlp": 1.02440929, + "epoch": 0.8480384788817075, + "flos": 19642634288640.0, + "grad_norm": 1.8032304310085965, + "language_loss": 0.78830254, + "learning_rate": 2.3730047066092607e-07, + "loss": 0.80969405, + "num_input_tokens_seen": 304217775, + "step": 14105, + "time_per_iteration": 2.5720746517181396 + }, + { + "auxiliary_loss_clip": 0.01083671, + "auxiliary_loss_mlp": 0.01037196, + "balance_loss_clip": 1.03580463, + "balance_loss_mlp": 1.02209163, + "epoch": 0.8480986021343755, + "flos": 22489201255680.0, + "grad_norm": 1.7624192448776133, + "language_loss": 0.50159001, + "learning_rate": 2.3711649731028749e-07, + "loss": 0.52279866, + "num_input_tokens_seen": 304235760, + "step": 14106, + "time_per_iteration": 4.288937330245972 + }, + { + "auxiliary_loss_clip": 0.01077376, + "auxiliary_loss_mlp": 0.01035024, + "balance_loss_clip": 1.03691649, + "balance_loss_mlp": 1.0228827, + "epoch": 0.8481587253870434, + "flos": 22090557139200.0, + "grad_norm": 2.145828005559372, + "language_loss": 0.75445443, + "learning_rate": 2.3693259080836792e-07, + "loss": 0.7755785, + "num_input_tokens_seen": 304253985, + "step": 14107, + "time_per_iteration": 2.6221656799316406 + }, + { + "auxiliary_loss_clip": 0.01076318, + "auxiliary_loss_mlp": 0.0102835, + "balance_loss_clip": 1.03518283, + "balance_loss_mlp": 1.01601171, + "epoch": 0.8482188486397114, + "flos": 33582689366400.0, + "grad_norm": 1.5246182504502446, + "language_loss": 0.73586017, + "learning_rate": 2.3674875116214087e-07, + "loss": 0.75690687, + "num_input_tokens_seen": 304276785, + "step": 14108, + "time_per_iteration": 5.8729071617126465 + }, + { + "auxiliary_loss_clip": 0.01106391, + "auxiliary_loss_mlp": 0.01029487, + "balance_loss_clip": 1.03722811, + "balance_loss_mlp": 1.01592076, + "epoch": 0.8482789718923793, + "flos": 20919402195840.0, + "grad_norm": 1.650722642214462, + "language_loss": 0.72323227, + "learning_rate": 2.3656497837857836e-07, + "loss": 0.74459112, + "num_input_tokens_seen": 304296310, + "step": 14109, + "time_per_iteration": 2.633683443069458 + }, + { + "auxiliary_loss_clip": 0.01039152, + "auxiliary_loss_mlp": 0.01036217, + "balance_loss_clip": 1.03288758, + "balance_loss_mlp": 1.02361703, + "epoch": 0.8483390951450474, + "flos": 12896204912640.0, + "grad_norm": 2.505097141687178, + "language_loss": 0.74121177, + "learning_rate": 2.3638127246464811e-07, + "loss": 0.76196551, + "num_input_tokens_seen": 304311715, + "step": 14110, + "time_per_iteration": 2.7661683559417725 + }, + { + "auxiliary_loss_clip": 0.0105041, + "auxiliary_loss_mlp": 0.01031441, + "balance_loss_clip": 1.03519773, + "balance_loss_mlp": 1.01922786, + "epoch": 0.8483992183977154, + "flos": 25081628520960.0, + "grad_norm": 1.695497905568318, + "language_loss": 0.75963587, + "learning_rate": 2.3619763342731658e-07, + "loss": 0.7804544, + "num_input_tokens_seen": 304331910, + "step": 14111, + "time_per_iteration": 2.809145450592041 + }, + { + "auxiliary_loss_clip": 0.01107437, + "auxiliary_loss_mlp": 0.01029784, + "balance_loss_clip": 1.03754044, + "balance_loss_mlp": 1.018435, + "epoch": 0.8484593416503833, + "flos": 25557445008000.0, + "grad_norm": 3.4199751822671955, + "language_loss": 0.67615312, + "learning_rate": 2.3601406127354772e-07, + "loss": 0.69752538, + "num_input_tokens_seen": 304351405, + "step": 14112, + "time_per_iteration": 2.576991081237793 + }, + { + "auxiliary_loss_clip": 0.01093257, + "auxiliary_loss_mlp": 0.01032326, + "balance_loss_clip": 1.03299069, + "balance_loss_mlp": 1.0202558, + "epoch": 0.8485194649030513, + "flos": 27198454780800.0, + "grad_norm": 1.4428256767877636, + "language_loss": 0.73642004, + "learning_rate": 2.3583055601030312e-07, + "loss": 0.75767583, + "num_input_tokens_seen": 304372935, + "step": 14113, + "time_per_iteration": 2.6638875007629395 + }, + { + "auxiliary_loss_clip": 0.01071779, + "auxiliary_loss_mlp": 0.01031894, + "balance_loss_clip": 1.03808439, + "balance_loss_mlp": 1.01990139, + "epoch": 0.8485795881557192, + "flos": 24205910941440.0, + "grad_norm": 16.765212760184376, + "language_loss": 0.66891378, + "learning_rate": 2.3564711764454003e-07, + "loss": 0.68995047, + "num_input_tokens_seen": 304393070, + "step": 14114, + "time_per_iteration": 2.71993088722229 + }, + { + "auxiliary_loss_clip": 0.01111702, + "auxiliary_loss_mlp": 0.01031552, + "balance_loss_clip": 1.03860688, + "balance_loss_mlp": 1.01900554, + "epoch": 0.8486397114083872, + "flos": 21141653598720.0, + "grad_norm": 1.6329871649970922, + "language_loss": 0.78943914, + "learning_rate": 2.3546374618321495e-07, + "loss": 0.81087166, + "num_input_tokens_seen": 304411195, + "step": 14115, + "time_per_iteration": 2.5624794960021973 + }, + { + "auxiliary_loss_clip": 0.01110202, + "auxiliary_loss_mlp": 0.01033883, + "balance_loss_clip": 1.03798008, + "balance_loss_mlp": 1.02150321, + "epoch": 0.8486998346610551, + "flos": 19974772373760.0, + "grad_norm": 1.8401751033694997, + "language_loss": 0.78926462, + "learning_rate": 2.3528044163328187e-07, + "loss": 0.81070548, + "num_input_tokens_seen": 304429425, + "step": 14116, + "time_per_iteration": 2.5436830520629883 + }, + { + "auxiliary_loss_clip": 0.01101053, + "auxiliary_loss_mlp": 0.0102868, + "balance_loss_clip": 1.037081, + "balance_loss_mlp": 1.01596665, + "epoch": 0.8487599579137232, + "flos": 19792310261760.0, + "grad_norm": 1.863909931261485, + "language_loss": 0.68563157, + "learning_rate": 2.3509720400169076e-07, + "loss": 0.70692891, + "num_input_tokens_seen": 304447460, + "step": 14117, + "time_per_iteration": 2.580505609512329 + }, + { + "auxiliary_loss_clip": 0.01089877, + "auxiliary_loss_mlp": 0.01028249, + "balance_loss_clip": 1.0346086, + "balance_loss_mlp": 1.01571453, + "epoch": 0.8488200811663911, + "flos": 26396030903040.0, + "grad_norm": 1.9028577306172345, + "language_loss": 0.65127873, + "learning_rate": 2.3491403329539096e-07, + "loss": 0.67246002, + "num_input_tokens_seen": 304468230, + "step": 14118, + "time_per_iteration": 2.670452356338501 + }, + { + "auxiliary_loss_clip": 0.01066258, + "auxiliary_loss_mlp": 0.01029959, + "balance_loss_clip": 1.0340513, + "balance_loss_mlp": 1.01815736, + "epoch": 0.8488802044190591, + "flos": 16359285939840.0, + "grad_norm": 1.7651382162143987, + "language_loss": 0.7343511, + "learning_rate": 2.3473092952132757e-07, + "loss": 0.75531328, + "num_input_tokens_seen": 304484860, + "step": 14119, + "time_per_iteration": 2.681450128555298 + }, + { + "auxiliary_loss_clip": 0.01076463, + "auxiliary_loss_mlp": 0.01030361, + "balance_loss_clip": 1.03407943, + "balance_loss_mlp": 1.01667559, + "epoch": 0.848940327671727, + "flos": 19208869649280.0, + "grad_norm": 2.0775692345074486, + "language_loss": 0.77856582, + "learning_rate": 2.345478926864446e-07, + "loss": 0.7996341, + "num_input_tokens_seen": 304503575, + "step": 14120, + "time_per_iteration": 2.6914706230163574 + }, + { + "auxiliary_loss_clip": 0.01094944, + "auxiliary_loss_mlp": 0.01029569, + "balance_loss_clip": 1.03799891, + "balance_loss_mlp": 1.01668835, + "epoch": 0.849000450924395, + "flos": 21871178824320.0, + "grad_norm": 1.7424170846480949, + "language_loss": 0.75571072, + "learning_rate": 2.3436492279768227e-07, + "loss": 0.7769559, + "num_input_tokens_seen": 304525005, + "step": 14121, + "time_per_iteration": 2.6244821548461914 + }, + { + "auxiliary_loss_clip": 0.00992252, + "auxiliary_loss_mlp": 0.00999952, + "balance_loss_clip": 1.00927687, + "balance_loss_mlp": 0.9989683, + "epoch": 0.8490605741770629, + "flos": 71166475624320.0, + "grad_norm": 1.3338181745049902, + "language_loss": 0.60148805, + "learning_rate": 2.3418201986197883e-07, + "loss": 0.62141007, + "num_input_tokens_seen": 304585220, + "step": 14122, + "time_per_iteration": 3.219271659851074 + }, + { + "auxiliary_loss_clip": 0.01098712, + "auxiliary_loss_mlp": 0.01031259, + "balance_loss_clip": 1.03732467, + "balance_loss_mlp": 1.01909947, + "epoch": 0.849120697429731, + "flos": 24973357950720.0, + "grad_norm": 1.7666847271822834, + "language_loss": 0.79593515, + "learning_rate": 2.3399918388627048e-07, + "loss": 0.81723487, + "num_input_tokens_seen": 304604665, + "step": 14123, + "time_per_iteration": 2.696174144744873 + }, + { + "auxiliary_loss_clip": 0.01095036, + "auxiliary_loss_mlp": 0.01030234, + "balance_loss_clip": 1.03751028, + "balance_loss_mlp": 1.01858711, + "epoch": 0.8491808206823989, + "flos": 23032277959680.0, + "grad_norm": 1.9536438787496402, + "language_loss": 0.82910216, + "learning_rate": 2.3381641487749016e-07, + "loss": 0.85035485, + "num_input_tokens_seen": 304620600, + "step": 14124, + "time_per_iteration": 2.7340493202209473 + }, + { + "auxiliary_loss_clip": 0.01064676, + "auxiliary_loss_mlp": 0.01034224, + "balance_loss_clip": 1.03637028, + "balance_loss_mlp": 1.0209558, + "epoch": 0.8492409439350669, + "flos": 23878549365120.0, + "grad_norm": 1.89858398727176, + "language_loss": 0.72199571, + "learning_rate": 2.3363371284256805e-07, + "loss": 0.74298477, + "num_input_tokens_seen": 304639540, + "step": 14125, + "time_per_iteration": 2.736920118331909 + }, + { + "auxiliary_loss_clip": 0.01114158, + "auxiliary_loss_mlp": 0.0103526, + "balance_loss_clip": 1.03876936, + "balance_loss_mlp": 1.0216043, + "epoch": 0.8493010671877349, + "flos": 22419893963520.0, + "grad_norm": 1.649969149423374, + "language_loss": 0.73402816, + "learning_rate": 2.3345107778843288e-07, + "loss": 0.75552237, + "num_input_tokens_seen": 304660595, + "step": 14126, + "time_per_iteration": 2.5707271099090576 + }, + { + "auxiliary_loss_clip": 0.01061518, + "auxiliary_loss_mlp": 0.01039889, + "balance_loss_clip": 1.03374052, + "balance_loss_mlp": 1.0265317, + "epoch": 0.8493611904404028, + "flos": 17529435302400.0, + "grad_norm": 1.4324709138124028, + "language_loss": 0.67603076, + "learning_rate": 2.3326850972200928e-07, + "loss": 0.69704485, + "num_input_tokens_seen": 304679580, + "step": 14127, + "time_per_iteration": 2.7047815322875977 + }, + { + "auxiliary_loss_clip": 0.01075849, + "auxiliary_loss_mlp": 0.00772172, + "balance_loss_clip": 1.03386354, + "balance_loss_mlp": 1.00027514, + "epoch": 0.8494213136930708, + "flos": 19462937523840.0, + "grad_norm": 2.2394682768750727, + "language_loss": 0.68882221, + "learning_rate": 2.330860086502211e-07, + "loss": 0.70730239, + "num_input_tokens_seen": 304698385, + "step": 14128, + "time_per_iteration": 2.714137077331543 + }, + { + "auxiliary_loss_clip": 0.01082408, + "auxiliary_loss_mlp": 0.01032133, + "balance_loss_clip": 1.03493214, + "balance_loss_mlp": 1.01906157, + "epoch": 0.8494814369457387, + "flos": 18770292587520.0, + "grad_norm": 1.7314045833982252, + "language_loss": 0.77983749, + "learning_rate": 2.3290357457998855e-07, + "loss": 0.80098283, + "num_input_tokens_seen": 304715430, + "step": 14129, + "time_per_iteration": 2.6516494750976562 + }, + { + "auxiliary_loss_clip": 0.01044399, + "auxiliary_loss_mlp": 0.01036739, + "balance_loss_clip": 1.03597188, + "balance_loss_mlp": 1.02454424, + "epoch": 0.8495415601984068, + "flos": 23331486251520.0, + "grad_norm": 1.784130753830601, + "language_loss": 0.67886949, + "learning_rate": 2.3272120751823031e-07, + "loss": 0.69968086, + "num_input_tokens_seen": 304734345, + "step": 14130, + "time_per_iteration": 2.8086585998535156 + }, + { + "auxiliary_loss_clip": 0.01099002, + "auxiliary_loss_mlp": 0.01034825, + "balance_loss_clip": 1.03699052, + "balance_loss_mlp": 1.02243376, + "epoch": 0.8496016834510747, + "flos": 26612859352320.0, + "grad_norm": 1.7932919190030374, + "language_loss": 0.71109772, + "learning_rate": 2.3253890747186e-07, + "loss": 0.732436, + "num_input_tokens_seen": 304755030, + "step": 14131, + "time_per_iteration": 2.704787254333496 + }, + { + "auxiliary_loss_clip": 0.01079775, + "auxiliary_loss_mlp": 0.01028208, + "balance_loss_clip": 1.03795338, + "balance_loss_mlp": 1.0159409, + "epoch": 0.8496618067037427, + "flos": 25480380378240.0, + "grad_norm": 1.8086796883087504, + "language_loss": 0.68577588, + "learning_rate": 2.3235667444779162e-07, + "loss": 0.70685571, + "num_input_tokens_seen": 304774320, + "step": 14132, + "time_per_iteration": 2.7110090255737305 + }, + { + "auxiliary_loss_clip": 0.01105556, + "auxiliary_loss_mlp": 0.01035286, + "balance_loss_clip": 1.03522205, + "balance_loss_mlp": 1.0235796, + "epoch": 0.8497219299564106, + "flos": 25374587846400.0, + "grad_norm": 1.7573733285315933, + "language_loss": 0.70354646, + "learning_rate": 2.3217450845293564e-07, + "loss": 0.7249549, + "num_input_tokens_seen": 304795355, + "step": 14133, + "time_per_iteration": 2.567920684814453 + }, + { + "auxiliary_loss_clip": 0.00997066, + "auxiliary_loss_mlp": 0.00751378, + "balance_loss_clip": 1.01175642, + "balance_loss_mlp": 0.99961358, + "epoch": 0.8497820532090786, + "flos": 67780279658880.0, + "grad_norm": 0.7450619720676375, + "language_loss": 0.57556748, + "learning_rate": 2.3199240949419918e-07, + "loss": 0.59305191, + "num_input_tokens_seen": 304863915, + "step": 14134, + "time_per_iteration": 3.3689846992492676 + }, + { + "auxiliary_loss_clip": 0.0107422, + "auxiliary_loss_mlp": 0.01027994, + "balance_loss_clip": 1.03716052, + "balance_loss_mlp": 1.01549459, + "epoch": 0.8498421764617465, + "flos": 23440546920960.0, + "grad_norm": 2.466409087633597, + "language_loss": 0.78983986, + "learning_rate": 2.3181037757848787e-07, + "loss": 0.81086206, + "num_input_tokens_seen": 304881555, + "step": 14135, + "time_per_iteration": 2.7446372509002686 + }, + { + "auxiliary_loss_clip": 0.01097445, + "auxiliary_loss_mlp": 0.01030777, + "balance_loss_clip": 1.03782988, + "balance_loss_mlp": 1.01817632, + "epoch": 0.8499022997144146, + "flos": 17712615686400.0, + "grad_norm": 2.7995527616505966, + "language_loss": 0.63055122, + "learning_rate": 2.316284127127044e-07, + "loss": 0.65183342, + "num_input_tokens_seen": 304898760, + "step": 14136, + "time_per_iteration": 2.5907950401306152 + }, + { + "auxiliary_loss_clip": 0.01101166, + "auxiliary_loss_mlp": 0.01031023, + "balance_loss_clip": 1.03812134, + "balance_loss_mlp": 1.01783872, + "epoch": 0.8499624229670825, + "flos": 18588512833920.0, + "grad_norm": 1.700183635273407, + "language_loss": 0.84176117, + "learning_rate": 2.3144651490374835e-07, + "loss": 0.86308306, + "num_input_tokens_seen": 304915465, + "step": 14137, + "time_per_iteration": 2.605083703994751 + }, + { + "auxiliary_loss_clip": 0.010792, + "auxiliary_loss_mlp": 0.01027843, + "balance_loss_clip": 1.03870046, + "balance_loss_mlp": 1.01687622, + "epoch": 0.8500225462197505, + "flos": 24345854328960.0, + "grad_norm": 2.180201293156008, + "language_loss": 0.78512466, + "learning_rate": 2.3126468415851773e-07, + "loss": 0.80619514, + "num_input_tokens_seen": 304933190, + "step": 14138, + "time_per_iteration": 2.70701003074646 + }, + { + "auxiliary_loss_clip": 0.01098762, + "auxiliary_loss_mlp": 0.01028021, + "balance_loss_clip": 1.03806686, + "balance_loss_mlp": 1.0162487, + "epoch": 0.8500826694724185, + "flos": 16545518979840.0, + "grad_norm": 1.5977485951908699, + "language_loss": 0.64826471, + "learning_rate": 2.310829204839073e-07, + "loss": 0.66953254, + "num_input_tokens_seen": 304951110, + "step": 14139, + "time_per_iteration": 2.5747222900390625 + }, + { + "auxiliary_loss_clip": 0.01067444, + "auxiliary_loss_mlp": 0.01031881, + "balance_loss_clip": 1.03539836, + "balance_loss_mlp": 1.02024066, + "epoch": 0.8501427927250864, + "flos": 16289404030080.0, + "grad_norm": 1.8080135201880956, + "language_loss": 0.7064625, + "learning_rate": 2.3090122388681043e-07, + "loss": 0.72745574, + "num_input_tokens_seen": 304969095, + "step": 14140, + "time_per_iteration": 2.7031800746917725 + }, + { + "auxiliary_loss_clip": 0.01073027, + "auxiliary_loss_mlp": 0.01034541, + "balance_loss_clip": 1.03628802, + "balance_loss_mlp": 1.02165496, + "epoch": 0.8502029159777544, + "flos": 26687912820480.0, + "grad_norm": 2.024190780090597, + "language_loss": 0.64177513, + "learning_rate": 2.3071959437411648e-07, + "loss": 0.6628508, + "num_input_tokens_seen": 304989315, + "step": 14141, + "time_per_iteration": 2.780942916870117 + }, + { + "auxiliary_loss_clip": 0.01079122, + "auxiliary_loss_mlp": 0.01034336, + "balance_loss_clip": 1.03825319, + "balance_loss_mlp": 1.02206933, + "epoch": 0.8502630392304223, + "flos": 35590778179200.0, + "grad_norm": 1.598166482791058, + "language_loss": 0.70859313, + "learning_rate": 2.3053803195271214e-07, + "loss": 0.72972775, + "num_input_tokens_seen": 305011020, + "step": 14142, + "time_per_iteration": 2.8212552070617676 + }, + { + "auxiliary_loss_clip": 0.01061273, + "auxiliary_loss_mlp": 0.0103314, + "balance_loss_clip": 1.03280842, + "balance_loss_mlp": 1.0207963, + "epoch": 0.8503231624830904, + "flos": 21649466125440.0, + "grad_norm": 1.747417790949646, + "language_loss": 0.6528132, + "learning_rate": 2.3035653662948375e-07, + "loss": 0.67375731, + "num_input_tokens_seen": 305033550, + "step": 14143, + "time_per_iteration": 2.785883665084839 + }, + { + "auxiliary_loss_clip": 0.01081279, + "auxiliary_loss_mlp": 0.00770514, + "balance_loss_clip": 1.03600597, + "balance_loss_mlp": 1.00017881, + "epoch": 0.8503832857357583, + "flos": 22417451838720.0, + "grad_norm": 2.048866472556172, + "language_loss": 0.68279046, + "learning_rate": 2.3017510841131216e-07, + "loss": 0.70130837, + "num_input_tokens_seen": 305052885, + "step": 14144, + "time_per_iteration": 4.240123748779297 + }, + { + "auxiliary_loss_clip": 0.01042348, + "auxiliary_loss_mlp": 0.01033949, + "balance_loss_clip": 1.03315759, + "balance_loss_mlp": 1.02033496, + "epoch": 0.8504434089884263, + "flos": 18697968552960.0, + "grad_norm": 2.1174262858689628, + "language_loss": 0.6438145, + "learning_rate": 2.299937473050777e-07, + "loss": 0.66457748, + "num_input_tokens_seen": 305071995, + "step": 14145, + "time_per_iteration": 4.4199535846710205 + }, + { + "auxiliary_loss_clip": 0.01087485, + "auxiliary_loss_mlp": 0.01031784, + "balance_loss_clip": 1.03486562, + "balance_loss_mlp": 1.01891518, + "epoch": 0.8505035322410942, + "flos": 20007989475840.0, + "grad_norm": 1.8443246841114695, + "language_loss": 0.8561762, + "learning_rate": 2.2981245331765842e-07, + "loss": 0.87736893, + "num_input_tokens_seen": 305090190, + "step": 14146, + "time_per_iteration": 2.6533970832824707 + }, + { + "auxiliary_loss_clip": 0.0110625, + "auxiliary_loss_mlp": 0.01027868, + "balance_loss_clip": 1.03480434, + "balance_loss_mlp": 1.01580358, + "epoch": 0.8505636554937622, + "flos": 20812173120000.0, + "grad_norm": 1.6073228623135094, + "language_loss": 0.84023243, + "learning_rate": 2.2963122645592814e-07, + "loss": 0.86157364, + "num_input_tokens_seen": 305109355, + "step": 14147, + "time_per_iteration": 4.045815706253052 + }, + { + "auxiliary_loss_clip": 0.01099865, + "auxiliary_loss_mlp": 0.01031152, + "balance_loss_clip": 1.0365082, + "balance_loss_mlp": 1.01814604, + "epoch": 0.8506237787464301, + "flos": 14174445277440.0, + "grad_norm": 3.171596156329527, + "language_loss": 0.85552716, + "learning_rate": 2.2945006672675894e-07, + "loss": 0.87683737, + "num_input_tokens_seen": 305124165, + "step": 14148, + "time_per_iteration": 4.178382635116577 + }, + { + "auxiliary_loss_clip": 0.01086687, + "auxiliary_loss_mlp": 0.01033466, + "balance_loss_clip": 1.03529978, + "balance_loss_mlp": 1.02051425, + "epoch": 0.8506839019990982, + "flos": 23258372117760.0, + "grad_norm": 1.5915072699945274, + "language_loss": 0.71948111, + "learning_rate": 2.292689741370204e-07, + "loss": 0.7406826, + "num_input_tokens_seen": 305143940, + "step": 14149, + "time_per_iteration": 2.7413246631622314 + }, + { + "auxiliary_loss_clip": 0.01087525, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.03729534, + "balance_loss_mlp": 1.0173254, + "epoch": 0.8507440252517661, + "flos": 23659206963840.0, + "grad_norm": 1.895927290429812, + "language_loss": 0.76037747, + "learning_rate": 2.290879486935804e-07, + "loss": 0.78154701, + "num_input_tokens_seen": 305163505, + "step": 14150, + "time_per_iteration": 2.8601326942443848 + }, + { + "auxiliary_loss_clip": 0.01068558, + "auxiliary_loss_mlp": 0.01033535, + "balance_loss_clip": 1.03508079, + "balance_loss_mlp": 1.02081537, + "epoch": 0.8508041485044341, + "flos": 18661339658880.0, + "grad_norm": 1.9437397223028954, + "language_loss": 0.72261739, + "learning_rate": 2.2890699040330231e-07, + "loss": 0.74363828, + "num_input_tokens_seen": 305182325, + "step": 14151, + "time_per_iteration": 2.7191174030303955 + }, + { + "auxiliary_loss_clip": 0.00989017, + "auxiliary_loss_mlp": 0.01001335, + "balance_loss_clip": 1.01485205, + "balance_loss_mlp": 1.00013149, + "epoch": 0.8508642717571021, + "flos": 52510918055040.0, + "grad_norm": 0.8877797296007555, + "language_loss": 0.5956288, + "learning_rate": 2.2872609927304909e-07, + "loss": 0.61553234, + "num_input_tokens_seen": 305230775, + "step": 14152, + "time_per_iteration": 3.0959417819976807 + }, + { + "auxiliary_loss_clip": 0.01012053, + "auxiliary_loss_mlp": 0.01000683, + "balance_loss_clip": 1.00869, + "balance_loss_mlp": 0.99963391, + "epoch": 0.85092439500977, + "flos": 69297145050240.0, + "grad_norm": 0.6913266470704932, + "language_loss": 0.61156118, + "learning_rate": 2.285452753096797e-07, + "loss": 0.63168854, + "num_input_tokens_seen": 305296000, + "step": 14153, + "time_per_iteration": 3.1953656673431396 + }, + { + "auxiliary_loss_clip": 0.01099193, + "auxiliary_loss_mlp": 0.01032395, + "balance_loss_clip": 1.03656745, + "balance_loss_mlp": 1.0191927, + "epoch": 0.850984518262438, + "flos": 24389737770240.0, + "grad_norm": 1.8650933862340224, + "language_loss": 0.80833215, + "learning_rate": 2.2836451852005067e-07, + "loss": 0.82964802, + "num_input_tokens_seen": 305314705, + "step": 14154, + "time_per_iteration": 2.6398138999938965 + }, + { + "auxiliary_loss_clip": 0.01070524, + "auxiliary_loss_mlp": 0.01031227, + "balance_loss_clip": 1.0340178, + "balance_loss_mlp": 1.02010489, + "epoch": 0.851044641515106, + "flos": 23294821443840.0, + "grad_norm": 3.9078640909935753, + "language_loss": 0.79612941, + "learning_rate": 2.281838289110165e-07, + "loss": 0.8171469, + "num_input_tokens_seen": 305333870, + "step": 14155, + "time_per_iteration": 2.7668473720550537 + }, + { + "auxiliary_loss_clip": 0.01075246, + "auxiliary_loss_mlp": 0.0103132, + "balance_loss_clip": 1.03424454, + "balance_loss_mlp": 1.01889825, + "epoch": 0.851104764767774, + "flos": 22050085489920.0, + "grad_norm": 2.1664550070392172, + "language_loss": 0.70601416, + "learning_rate": 2.2800320648942904e-07, + "loss": 0.72707975, + "num_input_tokens_seen": 305352780, + "step": 14156, + "time_per_iteration": 2.712688684463501 + }, + { + "auxiliary_loss_clip": 0.01067563, + "auxiliary_loss_mlp": 0.01030898, + "balance_loss_clip": 1.03651905, + "balance_loss_mlp": 1.01922178, + "epoch": 0.8511648880204419, + "flos": 20704728562560.0, + "grad_norm": 2.295507051871608, + "language_loss": 0.73186374, + "learning_rate": 2.278226512621386e-07, + "loss": 0.75284839, + "num_input_tokens_seen": 305371370, + "step": 14157, + "time_per_iteration": 2.702608108520508 + }, + { + "auxiliary_loss_clip": 0.01040081, + "auxiliary_loss_mlp": 0.010238, + "balance_loss_clip": 1.03516209, + "balance_loss_mlp": 1.01280284, + "epoch": 0.8512250112731099, + "flos": 24024669891840.0, + "grad_norm": 2.053663216507987, + "language_loss": 0.800686, + "learning_rate": 2.2764216323598995e-07, + "loss": 0.82132483, + "num_input_tokens_seen": 305387955, + "step": 14158, + "time_per_iteration": 2.8139398097991943 + }, + { + "auxiliary_loss_clip": 0.01094324, + "auxiliary_loss_mlp": 0.01036878, + "balance_loss_clip": 1.03557563, + "balance_loss_mlp": 1.02236986, + "epoch": 0.8512851345257778, + "flos": 22015467757440.0, + "grad_norm": 2.1721071422061446, + "language_loss": 0.79100662, + "learning_rate": 2.27461742417828e-07, + "loss": 0.81231868, + "num_input_tokens_seen": 305406285, + "step": 14159, + "time_per_iteration": 2.5417728424072266 + }, + { + "auxiliary_loss_clip": 0.01089601, + "auxiliary_loss_mlp": 0.0103497, + "balance_loss_clip": 1.0372653, + "balance_loss_mlp": 1.02239358, + "epoch": 0.8513452577784458, + "flos": 14830209924480.0, + "grad_norm": 2.049292713518449, + "language_loss": 0.71023905, + "learning_rate": 2.2728138881449488e-07, + "loss": 0.73148477, + "num_input_tokens_seen": 305424500, + "step": 14160, + "time_per_iteration": 2.5549099445343018 + }, + { + "auxiliary_loss_clip": 0.01104724, + "auxiliary_loss_mlp": 0.01029216, + "balance_loss_clip": 1.03866696, + "balance_loss_mlp": 1.01627553, + "epoch": 0.8514053810311137, + "flos": 33035662166400.0, + "grad_norm": 2.7738458222833637, + "language_loss": 0.7019136, + "learning_rate": 2.2710110243282866e-07, + "loss": 0.72325301, + "num_input_tokens_seen": 305442990, + "step": 14161, + "time_per_iteration": 2.5866782665252686 + }, + { + "auxiliary_loss_clip": 0.01097425, + "auxiliary_loss_mlp": 0.01030584, + "balance_loss_clip": 1.03306913, + "balance_loss_mlp": 1.01881218, + "epoch": 0.8514655042837818, + "flos": 27564456412800.0, + "grad_norm": 2.41119817546413, + "language_loss": 0.77940011, + "learning_rate": 2.2692088327966653e-07, + "loss": 0.80068016, + "num_input_tokens_seen": 305463065, + "step": 14162, + "time_per_iteration": 2.7035062313079834 + }, + { + "auxiliary_loss_clip": 0.01099345, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.03699732, + "balance_loss_mlp": 1.02044845, + "epoch": 0.8515256275364497, + "flos": 35556052705920.0, + "grad_norm": 1.8591590026423754, + "language_loss": 0.77019423, + "learning_rate": 2.2674073136184235e-07, + "loss": 0.79152089, + "num_input_tokens_seen": 305489070, + "step": 14163, + "time_per_iteration": 2.750953435897827 + }, + { + "auxiliary_loss_clip": 0.01013801, + "auxiliary_loss_mlp": 0.01003898, + "balance_loss_clip": 1.01090002, + "balance_loss_mlp": 1.00288486, + "epoch": 0.8515857507891177, + "flos": 70207372621440.0, + "grad_norm": 0.6897705551367352, + "language_loss": 0.54935861, + "learning_rate": 2.2656064668618735e-07, + "loss": 0.56953561, + "num_input_tokens_seen": 305551490, + "step": 14164, + "time_per_iteration": 3.223865509033203 + }, + { + "auxiliary_loss_clip": 0.01099487, + "auxiliary_loss_mlp": 0.01033823, + "balance_loss_clip": 1.03638053, + "balance_loss_mlp": 1.02158666, + "epoch": 0.8516458740417857, + "flos": 22675290641280.0, + "grad_norm": 1.9828346759864348, + "language_loss": 0.7308625, + "learning_rate": 2.2638062925953005e-07, + "loss": 0.7521956, + "num_input_tokens_seen": 305570535, + "step": 14165, + "time_per_iteration": 2.683063268661499 + }, + { + "auxiliary_loss_clip": 0.01070656, + "auxiliary_loss_mlp": 0.01030913, + "balance_loss_clip": 1.03600621, + "balance_loss_mlp": 1.01849699, + "epoch": 0.8517059972944536, + "flos": 22747435107840.0, + "grad_norm": 1.5369280358332664, + "language_loss": 0.6716156, + "learning_rate": 2.26200679088697e-07, + "loss": 0.6926313, + "num_input_tokens_seen": 305590800, + "step": 14166, + "time_per_iteration": 2.7411108016967773 + }, + { + "auxiliary_loss_clip": 0.01084994, + "auxiliary_loss_mlp": 0.01034282, + "balance_loss_clip": 1.03303361, + "balance_loss_mlp": 1.02188396, + "epoch": 0.8517661205471216, + "flos": 21689147675520.0, + "grad_norm": 1.785592393708889, + "language_loss": 0.73291379, + "learning_rate": 2.260207961805125e-07, + "loss": 0.75410652, + "num_input_tokens_seen": 305609495, + "step": 14167, + "time_per_iteration": 2.6664106845855713 + }, + { + "auxiliary_loss_clip": 0.01109416, + "auxiliary_loss_mlp": 0.0103133, + "balance_loss_clip": 1.03773403, + "balance_loss_mlp": 1.01968884, + "epoch": 0.8518262437997896, + "flos": 25374839241600.0, + "grad_norm": 1.6176439709713288, + "language_loss": 0.80560851, + "learning_rate": 2.258409805417969e-07, + "loss": 0.827016, + "num_input_tokens_seen": 305629420, + "step": 14168, + "time_per_iteration": 2.59899640083313 + }, + { + "auxiliary_loss_clip": 0.01106516, + "auxiliary_loss_mlp": 0.01027237, + "balance_loss_clip": 1.03524876, + "balance_loss_mlp": 1.01554823, + "epoch": 0.8518863670524576, + "flos": 27235406897280.0, + "grad_norm": 1.781183741177256, + "language_loss": 0.76068074, + "learning_rate": 2.2566123217936893e-07, + "loss": 0.7820183, + "num_input_tokens_seen": 305649835, + "step": 14169, + "time_per_iteration": 2.634589672088623 + }, + { + "auxiliary_loss_clip": 0.01112356, + "auxiliary_loss_mlp": 0.01030417, + "balance_loss_clip": 1.03858984, + "balance_loss_mlp": 1.01746488, + "epoch": 0.8519464903051255, + "flos": 20959514709120.0, + "grad_norm": 1.6296067224566693, + "language_loss": 0.63455546, + "learning_rate": 2.254815511000452e-07, + "loss": 0.65598321, + "num_input_tokens_seen": 305668840, + "step": 14170, + "time_per_iteration": 2.556849718093872 + }, + { + "auxiliary_loss_clip": 0.0109011, + "auxiliary_loss_mlp": 0.01029143, + "balance_loss_clip": 1.03445804, + "balance_loss_mlp": 1.0168829, + "epoch": 0.8520066135577935, + "flos": 18441745862400.0, + "grad_norm": 2.158149023964769, + "language_loss": 0.8638401, + "learning_rate": 2.253019373106384e-07, + "loss": 0.88503265, + "num_input_tokens_seen": 305686955, + "step": 14171, + "time_per_iteration": 2.6308727264404297 + }, + { + "auxiliary_loss_clip": 0.01094344, + "auxiliary_loss_mlp": 0.01038693, + "balance_loss_clip": 1.0366205, + "balance_loss_mlp": 1.02613449, + "epoch": 0.8520667368104614, + "flos": 29130233149440.0, + "grad_norm": 1.7172812217189943, + "language_loss": 0.54943144, + "learning_rate": 2.2512239081796003e-07, + "loss": 0.5707618, + "num_input_tokens_seen": 305706290, + "step": 14172, + "time_per_iteration": 2.6792733669281006 + }, + { + "auxiliary_loss_clip": 0.01082291, + "auxiliary_loss_mlp": 0.01028716, + "balance_loss_clip": 1.0339576, + "balance_loss_mlp": 1.01860142, + "epoch": 0.8521268600631294, + "flos": 16034366488320.0, + "grad_norm": 2.2874699102047824, + "language_loss": 0.6964975, + "learning_rate": 2.2494291162881862e-07, + "loss": 0.71760762, + "num_input_tokens_seen": 305723835, + "step": 14173, + "time_per_iteration": 2.656660795211792 + }, + { + "auxiliary_loss_clip": 0.0108799, + "auxiliary_loss_mlp": 0.0077035, + "balance_loss_clip": 1.03576326, + "balance_loss_mlp": 1.0002656, + "epoch": 0.8521869833157973, + "flos": 22454870832000.0, + "grad_norm": 2.469794290307129, + "language_loss": 0.77085257, + "learning_rate": 2.247634997500205e-07, + "loss": 0.78943598, + "num_input_tokens_seen": 305741655, + "step": 14174, + "time_per_iteration": 2.6629743576049805 + }, + { + "auxiliary_loss_clip": 0.01074547, + "auxiliary_loss_mlp": 0.00771408, + "balance_loss_clip": 1.03330672, + "balance_loss_mlp": 1.00036669, + "epoch": 0.8522471065684654, + "flos": 24972029147520.0, + "grad_norm": 3.681847019850499, + "language_loss": 0.8197754, + "learning_rate": 2.245841551883676e-07, + "loss": 0.83823496, + "num_input_tokens_seen": 305761890, + "step": 14175, + "time_per_iteration": 2.6883835792541504 + }, + { + "auxiliary_loss_clip": 0.01112836, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.03869867, + "balance_loss_mlp": 1.02256155, + "epoch": 0.8523072298211333, + "flos": 17710604524800.0, + "grad_norm": 7.221526535208017, + "language_loss": 0.65591013, + "learning_rate": 2.2440487795066153e-07, + "loss": 0.67739511, + "num_input_tokens_seen": 305779190, + "step": 14176, + "time_per_iteration": 2.513249397277832 + }, + { + "auxiliary_loss_clip": 0.01083655, + "auxiliary_loss_mlp": 0.00769903, + "balance_loss_clip": 1.03461874, + "balance_loss_mlp": 1.00019979, + "epoch": 0.8523673530738013, + "flos": 25446193608960.0, + "grad_norm": 1.6790468369786946, + "language_loss": 0.7851091, + "learning_rate": 2.2422566804370068e-07, + "loss": 0.80364466, + "num_input_tokens_seen": 305799870, + "step": 14177, + "time_per_iteration": 2.6671228408813477 + }, + { + "auxiliary_loss_clip": 0.0108583, + "auxiliary_loss_mlp": 0.01029951, + "balance_loss_clip": 1.03573, + "balance_loss_mlp": 1.01741612, + "epoch": 0.8524274763264693, + "flos": 31429593348480.0, + "grad_norm": 1.9646253723972047, + "language_loss": 0.73313767, + "learning_rate": 2.2404652547428026e-07, + "loss": 0.75429547, + "num_input_tokens_seen": 305819695, + "step": 14178, + "time_per_iteration": 2.713926315307617 + }, + { + "auxiliary_loss_clip": 0.01074008, + "auxiliary_loss_mlp": 0.01037664, + "balance_loss_clip": 1.03707623, + "balance_loss_mlp": 1.02537966, + "epoch": 0.8524875995791372, + "flos": 17712651600000.0, + "grad_norm": 1.8623872713684044, + "language_loss": 0.74955928, + "learning_rate": 2.238674502491935e-07, + "loss": 0.77067608, + "num_input_tokens_seen": 305837270, + "step": 14179, + "time_per_iteration": 2.6611170768737793 + }, + { + "auxiliary_loss_clip": 0.01109256, + "auxiliary_loss_mlp": 0.01029994, + "balance_loss_clip": 1.03910112, + "balance_loss_mlp": 1.01806116, + "epoch": 0.8525477228318052, + "flos": 21687316081920.0, + "grad_norm": 2.060347527701932, + "language_loss": 0.816504, + "learning_rate": 2.2368844237523165e-07, + "loss": 0.83789647, + "num_input_tokens_seen": 305855250, + "step": 14180, + "time_per_iteration": 2.6562328338623047 + }, + { + "auxiliary_loss_clip": 0.01051532, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.03316164, + "balance_loss_mlp": 1.02265859, + "epoch": 0.8526078460844732, + "flos": 24827057856000.0, + "grad_norm": 6.706307974363978, + "language_loss": 0.60821462, + "learning_rate": 2.235095018591815e-07, + "loss": 0.62907696, + "num_input_tokens_seen": 305875660, + "step": 14181, + "time_per_iteration": 2.7725861072540283 + }, + { + "auxiliary_loss_clip": 0.011084, + "auxiliary_loss_mlp": 0.01033406, + "balance_loss_clip": 1.03824615, + "balance_loss_mlp": 1.02208114, + "epoch": 0.8526679693371412, + "flos": 13516418073600.0, + "grad_norm": 2.1617391285888314, + "language_loss": 0.72616804, + "learning_rate": 2.2333062870782894e-07, + "loss": 0.74758613, + "num_input_tokens_seen": 305892415, + "step": 14182, + "time_per_iteration": 2.5392303466796875 + }, + { + "auxiliary_loss_clip": 0.01056951, + "auxiliary_loss_mlp": 0.01033126, + "balance_loss_clip": 1.03387702, + "balance_loss_mlp": 1.0208416, + "epoch": 0.8527280925898091, + "flos": 23514092017920.0, + "grad_norm": 1.4656692633945465, + "language_loss": 0.70735776, + "learning_rate": 2.2315182292795697e-07, + "loss": 0.72825855, + "num_input_tokens_seen": 305912665, + "step": 14183, + "time_per_iteration": 4.254406213760376 + }, + { + "auxiliary_loss_clip": 0.01081461, + "auxiliary_loss_mlp": 0.01031438, + "balance_loss_clip": 1.03771853, + "balance_loss_mlp": 1.01956463, + "epoch": 0.8527882158424771, + "flos": 20303031790080.0, + "grad_norm": 1.7895488338576169, + "language_loss": 0.72972029, + "learning_rate": 2.2297308452634644e-07, + "loss": 0.75084925, + "num_input_tokens_seen": 305931515, + "step": 14184, + "time_per_iteration": 4.304826974868774 + }, + { + "auxiliary_loss_clip": 0.01109825, + "auxiliary_loss_mlp": 0.01033313, + "balance_loss_clip": 1.0379746, + "balance_loss_mlp": 1.02064705, + "epoch": 0.852848339095145, + "flos": 17202504689280.0, + "grad_norm": 1.7597843900192167, + "language_loss": 0.7711637, + "learning_rate": 2.2279441350977457e-07, + "loss": 0.79259503, + "num_input_tokens_seen": 305949965, + "step": 14185, + "time_per_iteration": 2.5977091789245605 + }, + { + "auxiliary_loss_clip": 0.0106691, + "auxiliary_loss_mlp": 0.0102992, + "balance_loss_clip": 1.03286219, + "balance_loss_mlp": 1.01596713, + "epoch": 0.852908462347813, + "flos": 18368990864640.0, + "grad_norm": 2.425914836353015, + "language_loss": 0.79841149, + "learning_rate": 2.2261580988501637e-07, + "loss": 0.81937975, + "num_input_tokens_seen": 305967820, + "step": 14186, + "time_per_iteration": 4.160691738128662 + }, + { + "auxiliary_loss_clip": 0.01085946, + "auxiliary_loss_mlp": 0.01029366, + "balance_loss_clip": 1.03557575, + "balance_loss_mlp": 1.01655054, + "epoch": 0.8529685856004809, + "flos": 18624890332800.0, + "grad_norm": 1.6292428132802597, + "language_loss": 0.62476075, + "learning_rate": 2.224372736588449e-07, + "loss": 0.64591384, + "num_input_tokens_seen": 305985505, + "step": 14187, + "time_per_iteration": 4.218466758728027 + }, + { + "auxiliary_loss_clip": 0.01056813, + "auxiliary_loss_mlp": 0.01030511, + "balance_loss_clip": 1.03186965, + "balance_loss_mlp": 1.01697493, + "epoch": 0.853028708853149, + "flos": 29607665748480.0, + "grad_norm": 1.8450425087178943, + "language_loss": 0.76632512, + "learning_rate": 2.2225880483803005e-07, + "loss": 0.78719831, + "num_input_tokens_seen": 306005220, + "step": 14188, + "time_per_iteration": 2.756181240081787 + }, + { + "auxiliary_loss_clip": 0.01098789, + "auxiliary_loss_mlp": 0.01029553, + "balance_loss_clip": 1.03644693, + "balance_loss_mlp": 1.01655281, + "epoch": 0.8530888321058169, + "flos": 26353153042560.0, + "grad_norm": 1.4873919798576203, + "language_loss": 0.7809422, + "learning_rate": 2.2208040342933932e-07, + "loss": 0.80222559, + "num_input_tokens_seen": 306023785, + "step": 14189, + "time_per_iteration": 2.6410326957702637 + }, + { + "auxiliary_loss_clip": 0.01086145, + "auxiliary_loss_mlp": 0.01032823, + "balance_loss_clip": 1.03470349, + "balance_loss_mlp": 1.01997268, + "epoch": 0.8531489553584849, + "flos": 20521979141760.0, + "grad_norm": 2.5651447957757005, + "language_loss": 0.7962172, + "learning_rate": 2.2190206943953793e-07, + "loss": 0.81740683, + "num_input_tokens_seen": 306041600, + "step": 14190, + "time_per_iteration": 2.6400444507598877 + }, + { + "auxiliary_loss_clip": 0.01059317, + "auxiliary_loss_mlp": 0.01029769, + "balance_loss_clip": 1.03576827, + "balance_loss_mlp": 1.01700187, + "epoch": 0.8532090786111529, + "flos": 20704297599360.0, + "grad_norm": 2.162987125122954, + "language_loss": 0.75559723, + "learning_rate": 2.2172380287538894e-07, + "loss": 0.77648813, + "num_input_tokens_seen": 306060345, + "step": 14191, + "time_per_iteration": 2.691556692123413 + }, + { + "auxiliary_loss_clip": 0.01098409, + "auxiliary_loss_mlp": 0.01030091, + "balance_loss_clip": 1.0377655, + "balance_loss_mlp": 1.01723993, + "epoch": 0.8532692018638208, + "flos": 19828903242240.0, + "grad_norm": 1.957341316580574, + "language_loss": 0.69267607, + "learning_rate": 2.2154560374365073e-07, + "loss": 0.71396106, + "num_input_tokens_seen": 306078285, + "step": 14192, + "time_per_iteration": 2.631347894668579 + }, + { + "auxiliary_loss_clip": 0.01101694, + "auxiliary_loss_mlp": 0.01035412, + "balance_loss_clip": 1.03725314, + "balance_loss_mlp": 1.02091622, + "epoch": 0.8533293251164888, + "flos": 20996790048000.0, + "grad_norm": 2.1501600362317643, + "language_loss": 0.63451266, + "learning_rate": 2.2136747205108164e-07, + "loss": 0.65588367, + "num_input_tokens_seen": 306093760, + "step": 14193, + "time_per_iteration": 2.626577377319336 + }, + { + "auxiliary_loss_clip": 0.01081646, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.03549838, + "balance_loss_mlp": 1.01570606, + "epoch": 0.8533894483691568, + "flos": 22419606654720.0, + "grad_norm": 2.0851012965746905, + "language_loss": 0.76840144, + "learning_rate": 2.211894078044365e-07, + "loss": 0.78949881, + "num_input_tokens_seen": 306112595, + "step": 14194, + "time_per_iteration": 2.6441872119903564 + }, + { + "auxiliary_loss_clip": 0.01110242, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.03740048, + "balance_loss_mlp": 1.01674914, + "epoch": 0.8534495716218248, + "flos": 21616536332160.0, + "grad_norm": 2.217628380709863, + "language_loss": 0.69469094, + "learning_rate": 2.2101141101046705e-07, + "loss": 0.71607888, + "num_input_tokens_seen": 306131800, + "step": 14195, + "time_per_iteration": 2.679945707321167 + }, + { + "auxiliary_loss_clip": 0.01082724, + "auxiliary_loss_mlp": 0.01032645, + "balance_loss_clip": 1.03602624, + "balance_loss_mlp": 1.01968741, + "epoch": 0.8535096948744927, + "flos": 22346277039360.0, + "grad_norm": 1.838474831161169, + "language_loss": 0.85432625, + "learning_rate": 2.2083348167592343e-07, + "loss": 0.87547994, + "num_input_tokens_seen": 306150590, + "step": 14196, + "time_per_iteration": 2.6546883583068848 + }, + { + "auxiliary_loss_clip": 0.01011396, + "auxiliary_loss_mlp": 0.01001419, + "balance_loss_clip": 1.00853372, + "balance_loss_mlp": 1.00029302, + "epoch": 0.8535698181271607, + "flos": 52762507891200.0, + "grad_norm": 0.7576235506473017, + "language_loss": 0.55055857, + "learning_rate": 2.2065561980755243e-07, + "loss": 0.5706867, + "num_input_tokens_seen": 306205850, + "step": 14197, + "time_per_iteration": 3.1265292167663574 + }, + { + "auxiliary_loss_clip": 0.01072451, + "auxiliary_loss_mlp": 0.00769866, + "balance_loss_clip": 1.03453422, + "balance_loss_mlp": 1.0002501, + "epoch": 0.8536299413798286, + "flos": 19062892776960.0, + "grad_norm": 1.7349390720233626, + "language_loss": 0.81448388, + "learning_rate": 2.2047782541209826e-07, + "loss": 0.83290708, + "num_input_tokens_seen": 306225220, + "step": 14198, + "time_per_iteration": 2.709376573562622 + }, + { + "auxiliary_loss_clip": 0.01107145, + "auxiliary_loss_mlp": 0.01028426, + "balance_loss_clip": 1.03658509, + "balance_loss_mlp": 1.01760149, + "epoch": 0.8536900646324966, + "flos": 49344743871360.0, + "grad_norm": 3.58688569610331, + "language_loss": 0.6833868, + "learning_rate": 2.203000984963035e-07, + "loss": 0.70474249, + "num_input_tokens_seen": 306249865, + "step": 14199, + "time_per_iteration": 2.8150370121002197 + }, + { + "auxiliary_loss_clip": 0.01070955, + "auxiliary_loss_mlp": 0.01028243, + "balance_loss_clip": 1.03376341, + "balance_loss_mlp": 1.01713872, + "epoch": 0.8537501878851645, + "flos": 21762333636480.0, + "grad_norm": 1.5671357707792795, + "language_loss": 0.86500955, + "learning_rate": 2.201224390669072e-07, + "loss": 0.88600153, + "num_input_tokens_seen": 306270215, + "step": 14200, + "time_per_iteration": 2.6922430992126465 + }, + { + "auxiliary_loss_clip": 0.01079411, + "auxiliary_loss_mlp": 0.01028505, + "balance_loss_clip": 1.03768003, + "balance_loss_mlp": 1.01712668, + "epoch": 0.8538103111378326, + "flos": 22269176496000.0, + "grad_norm": 1.8836819449837667, + "language_loss": 0.77679044, + "learning_rate": 2.1994484713064666e-07, + "loss": 0.79786962, + "num_input_tokens_seen": 306288960, + "step": 14201, + "time_per_iteration": 2.686408758163452 + }, + { + "auxiliary_loss_clip": 0.01080739, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.03589165, + "balance_loss_mlp": 1.01757574, + "epoch": 0.8538704343905005, + "flos": 20303929630080.0, + "grad_norm": 2.780473134009725, + "language_loss": 0.6885708, + "learning_rate": 2.19767322694256e-07, + "loss": 0.70967031, + "num_input_tokens_seen": 306308735, + "step": 14202, + "time_per_iteration": 2.6336662769317627 + }, + { + "auxiliary_loss_clip": 0.01099521, + "auxiliary_loss_mlp": 0.01036207, + "balance_loss_clip": 1.037709, + "balance_loss_mlp": 1.02389884, + "epoch": 0.8539305576431685, + "flos": 24755164784640.0, + "grad_norm": 1.9187950545950658, + "language_loss": 0.80178666, + "learning_rate": 2.195898657644666e-07, + "loss": 0.82314396, + "num_input_tokens_seen": 306329015, + "step": 14203, + "time_per_iteration": 2.6216869354248047 + }, + { + "auxiliary_loss_clip": 0.01090886, + "auxiliary_loss_mlp": 0.01032011, + "balance_loss_clip": 1.03592849, + "balance_loss_mlp": 1.01897538, + "epoch": 0.8539906808958365, + "flos": 26687625511680.0, + "grad_norm": 2.0827727006300543, + "language_loss": 0.66570961, + "learning_rate": 2.1941247634800808e-07, + "loss": 0.68693864, + "num_input_tokens_seen": 306349085, + "step": 14204, + "time_per_iteration": 2.7057762145996094 + }, + { + "auxiliary_loss_clip": 0.01111148, + "auxiliary_loss_mlp": 0.01032529, + "balance_loss_clip": 1.03801191, + "balance_loss_mlp": 1.01958251, + "epoch": 0.8540508041485044, + "flos": 13365521038080.0, + "grad_norm": 2.906237255185196, + "language_loss": 0.59810114, + "learning_rate": 2.1923515445160667e-07, + "loss": 0.61953795, + "num_input_tokens_seen": 306365385, + "step": 14205, + "time_per_iteration": 2.573305368423462 + }, + { + "auxiliary_loss_clip": 0.0108658, + "auxiliary_loss_mlp": 0.0102928, + "balance_loss_clip": 1.03708744, + "balance_loss_mlp": 1.01709652, + "epoch": 0.8541109274011724, + "flos": 32780876019840.0, + "grad_norm": 3.4708591258451116, + "language_loss": 0.72213638, + "learning_rate": 2.1905790008198655e-07, + "loss": 0.74329495, + "num_input_tokens_seen": 306384585, + "step": 14206, + "time_per_iteration": 2.7664809226989746 + }, + { + "auxiliary_loss_clip": 0.01100148, + "auxiliary_loss_mlp": 0.01027694, + "balance_loss_clip": 1.03798437, + "balance_loss_mlp": 1.01563621, + "epoch": 0.8541710506538404, + "flos": 17639286071040.0, + "grad_norm": 2.7591002381259617, + "language_loss": 0.76277685, + "learning_rate": 2.1888071324586987e-07, + "loss": 0.78405529, + "num_input_tokens_seen": 306401565, + "step": 14207, + "time_per_iteration": 2.5857670307159424 + }, + { + "auxiliary_loss_clip": 0.01110866, + "auxiliary_loss_mlp": 0.01031023, + "balance_loss_clip": 1.03753805, + "balance_loss_mlp": 1.01777935, + "epoch": 0.8542311739065084, + "flos": 20263062931200.0, + "grad_norm": 1.7437874977291616, + "language_loss": 0.85243803, + "learning_rate": 2.1870359394997485e-07, + "loss": 0.8738569, + "num_input_tokens_seen": 306419995, + "step": 14208, + "time_per_iteration": 2.5491318702697754 + }, + { + "auxiliary_loss_clip": 0.01090714, + "auxiliary_loss_mlp": 0.0102915, + "balance_loss_clip": 1.03670692, + "balance_loss_mlp": 1.01759243, + "epoch": 0.8542912971591763, + "flos": 17785657992960.0, + "grad_norm": 1.579396751637571, + "language_loss": 0.66011345, + "learning_rate": 2.1852654220101785e-07, + "loss": 0.68131208, + "num_input_tokens_seen": 306439240, + "step": 14209, + "time_per_iteration": 2.619147539138794 + }, + { + "auxiliary_loss_clip": 0.01062026, + "auxiliary_loss_mlp": 0.01025767, + "balance_loss_clip": 1.03395295, + "balance_loss_mlp": 1.01391149, + "epoch": 0.8543514204118443, + "flos": 26979507429120.0, + "grad_norm": 2.0420847855392297, + "language_loss": 0.70425576, + "learning_rate": 2.1834955800571287e-07, + "loss": 0.72513366, + "num_input_tokens_seen": 306458425, + "step": 14210, + "time_per_iteration": 2.7978549003601074 + }, + { + "auxiliary_loss_clip": 0.01085485, + "auxiliary_loss_mlp": 0.01031679, + "balance_loss_clip": 1.03576684, + "balance_loss_mlp": 1.0193646, + "epoch": 0.8544115436645122, + "flos": 24024598064640.0, + "grad_norm": 1.6548708543912152, + "language_loss": 0.70239341, + "learning_rate": 2.1817264137077141e-07, + "loss": 0.7235651, + "num_input_tokens_seen": 306477210, + "step": 14211, + "time_per_iteration": 2.766183614730835 + }, + { + "auxiliary_loss_clip": 0.01090016, + "auxiliary_loss_mlp": 0.0103377, + "balance_loss_clip": 1.03690755, + "balance_loss_mlp": 1.02137232, + "epoch": 0.8544716669171802, + "flos": 16617986668800.0, + "grad_norm": 2.2883331624161687, + "language_loss": 0.81601977, + "learning_rate": 2.1799579230290166e-07, + "loss": 0.83725762, + "num_input_tokens_seen": 306495820, + "step": 14212, + "time_per_iteration": 2.6845991611480713 + }, + { + "auxiliary_loss_clip": 0.01073343, + "auxiliary_loss_mlp": 0.01033296, + "balance_loss_clip": 1.03170538, + "balance_loss_mlp": 1.01963472, + "epoch": 0.8545317901698481, + "flos": 40005779489280.0, + "grad_norm": 1.7913118059444788, + "language_loss": 0.66273463, + "learning_rate": 2.178190108088105e-07, + "loss": 0.68380105, + "num_input_tokens_seen": 306516420, + "step": 14213, + "time_per_iteration": 2.8582568168640137 + }, + { + "auxiliary_loss_clip": 0.01107415, + "auxiliary_loss_mlp": 0.01029384, + "balance_loss_clip": 1.03667092, + "balance_loss_mlp": 1.01733816, + "epoch": 0.8545919134225162, + "flos": 19902520166400.0, + "grad_norm": 1.7812973298458348, + "language_loss": 0.78218639, + "learning_rate": 2.1764229689520098e-07, + "loss": 0.80355442, + "num_input_tokens_seen": 306534785, + "step": 14214, + "time_per_iteration": 2.5741806030273438 + }, + { + "auxiliary_loss_clip": 0.01090515, + "auxiliary_loss_mlp": 0.01030143, + "balance_loss_clip": 1.03572309, + "balance_loss_mlp": 1.01646936, + "epoch": 0.8546520366751841, + "flos": 18952970181120.0, + "grad_norm": 2.3620228976169013, + "language_loss": 0.66771472, + "learning_rate": 2.1746565056877397e-07, + "loss": 0.68892121, + "num_input_tokens_seen": 306552440, + "step": 14215, + "time_per_iteration": 2.682720422744751 + }, + { + "auxiliary_loss_clip": 0.01108233, + "auxiliary_loss_mlp": 0.01027691, + "balance_loss_clip": 1.03707683, + "balance_loss_mlp": 1.01554906, + "epoch": 0.8547121599278521, + "flos": 35621445415680.0, + "grad_norm": 1.6345629270986273, + "language_loss": 0.62375963, + "learning_rate": 2.172890718362279e-07, + "loss": 0.64511889, + "num_input_tokens_seen": 306573600, + "step": 14216, + "time_per_iteration": 2.675818681716919 + }, + { + "auxiliary_loss_clip": 0.01073815, + "auxiliary_loss_mlp": 0.01034717, + "balance_loss_clip": 1.03433871, + "balance_loss_mlp": 1.0223552, + "epoch": 0.8547722831805201, + "flos": 16910048154240.0, + "grad_norm": 2.187084459340775, + "language_loss": 0.6559574, + "learning_rate": 2.17112560704259e-07, + "loss": 0.67704272, + "num_input_tokens_seen": 306592840, + "step": 14217, + "time_per_iteration": 2.6645264625549316 + }, + { + "auxiliary_loss_clip": 0.01095964, + "auxiliary_loss_mlp": 0.0103166, + "balance_loss_clip": 1.03822827, + "balance_loss_mlp": 1.01984668, + "epoch": 0.854832406433188, + "flos": 23002616304000.0, + "grad_norm": 1.691658565151652, + "language_loss": 0.64885128, + "learning_rate": 2.1693611717956072e-07, + "loss": 0.67012751, + "num_input_tokens_seen": 306613210, + "step": 14218, + "time_per_iteration": 2.659118890762329 + }, + { + "auxiliary_loss_clip": 0.01094891, + "auxiliary_loss_mlp": 0.01035498, + "balance_loss_clip": 1.03430879, + "balance_loss_mlp": 1.02195024, + "epoch": 0.854892529685856, + "flos": 20412595249920.0, + "grad_norm": 1.722487926122784, + "language_loss": 0.70405877, + "learning_rate": 2.167597412688238e-07, + "loss": 0.72536266, + "num_input_tokens_seen": 306631620, + "step": 14219, + "time_per_iteration": 2.6162991523742676 + }, + { + "auxiliary_loss_clip": 0.01085887, + "auxiliary_loss_mlp": 0.01039141, + "balance_loss_clip": 1.03332317, + "balance_loss_mlp": 1.02628446, + "epoch": 0.854952652938524, + "flos": 16398716094720.0, + "grad_norm": 2.7265350217211397, + "language_loss": 0.67212754, + "learning_rate": 2.1658343297873549e-07, + "loss": 0.69337785, + "num_input_tokens_seen": 306646695, + "step": 14220, + "time_per_iteration": 2.618908166885376 + }, + { + "auxiliary_loss_clip": 0.01105252, + "auxiliary_loss_mlp": 0.01030801, + "balance_loss_clip": 1.03653455, + "balance_loss_mlp": 1.01895165, + "epoch": 0.855012776191192, + "flos": 21178677542400.0, + "grad_norm": 1.9488426413623547, + "language_loss": 0.71819413, + "learning_rate": 2.164071923159827e-07, + "loss": 0.73955464, + "num_input_tokens_seen": 306665465, + "step": 14221, + "time_per_iteration": 2.547293186187744 + }, + { + "auxiliary_loss_clip": 0.01077738, + "auxiliary_loss_mlp": 0.01041646, + "balance_loss_clip": 1.03548348, + "balance_loss_mlp": 1.02897441, + "epoch": 0.8550728994438599, + "flos": 26140993361280.0, + "grad_norm": 1.7974681861069632, + "language_loss": 0.59693348, + "learning_rate": 2.1623101928724763e-07, + "loss": 0.61812735, + "num_input_tokens_seen": 306685950, + "step": 14222, + "time_per_iteration": 4.256742477416992 + }, + { + "auxiliary_loss_clip": 0.01079753, + "auxiliary_loss_mlp": 0.01032489, + "balance_loss_clip": 1.03260887, + "balance_loss_mlp": 1.01989484, + "epoch": 0.8551330226965279, + "flos": 22786793435520.0, + "grad_norm": 1.5392521458494535, + "language_loss": 0.84364492, + "learning_rate": 2.1605491389921093e-07, + "loss": 0.86476731, + "num_input_tokens_seen": 306705740, + "step": 14223, + "time_per_iteration": 2.6583445072174072 + }, + { + "auxiliary_loss_clip": 0.01097669, + "auxiliary_loss_mlp": 0.01031894, + "balance_loss_clip": 1.03776193, + "balance_loss_mlp": 1.01984763, + "epoch": 0.8551931459491958, + "flos": 22419032037120.0, + "grad_norm": 1.7057034680905072, + "language_loss": 0.74193132, + "learning_rate": 2.158788761585515e-07, + "loss": 0.76322699, + "num_input_tokens_seen": 306725065, + "step": 14224, + "time_per_iteration": 4.2042076587677 + }, + { + "auxiliary_loss_clip": 0.01081831, + "auxiliary_loss_mlp": 0.00772053, + "balance_loss_clip": 1.03394115, + "balance_loss_mlp": 1.00025678, + "epoch": 0.8552532692018638, + "flos": 19573183342080.0, + "grad_norm": 1.8055208702511056, + "language_loss": 0.75255108, + "learning_rate": 2.1570290607194307e-07, + "loss": 0.77108991, + "num_input_tokens_seen": 306743630, + "step": 14225, + "time_per_iteration": 4.162761449813843 + }, + { + "auxiliary_loss_clip": 0.01047716, + "auxiliary_loss_mlp": 0.01039572, + "balance_loss_clip": 1.0343529, + "balance_loss_mlp": 1.02750206, + "epoch": 0.8553133924545318, + "flos": 26432767537920.0, + "grad_norm": 1.8461972921962662, + "language_loss": 0.77405238, + "learning_rate": 2.1552700364605925e-07, + "loss": 0.79492527, + "num_input_tokens_seen": 306763105, + "step": 14226, + "time_per_iteration": 2.7609846591949463 + }, + { + "auxiliary_loss_clip": 0.01112703, + "auxiliary_loss_mlp": 0.01038329, + "balance_loss_clip": 1.03843546, + "balance_loss_mlp": 1.02525818, + "epoch": 0.8553735157071998, + "flos": 16362446336640.0, + "grad_norm": 18.71502714000466, + "language_loss": 0.54893303, + "learning_rate": 2.153511688875702e-07, + "loss": 0.57044339, + "num_input_tokens_seen": 306779875, + "step": 14227, + "time_per_iteration": 4.112335443496704 + }, + { + "auxiliary_loss_clip": 0.01077046, + "auxiliary_loss_mlp": 0.0077063, + "balance_loss_clip": 1.03572583, + "balance_loss_mlp": 1.00020015, + "epoch": 0.8554336389598677, + "flos": 20887334328960.0, + "grad_norm": 1.839893156700162, + "language_loss": 0.6559819, + "learning_rate": 2.151754018031442e-07, + "loss": 0.67445874, + "num_input_tokens_seen": 306800015, + "step": 14228, + "time_per_iteration": 2.6349892616271973 + }, + { + "auxiliary_loss_clip": 0.01076617, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.03681397, + "balance_loss_mlp": 1.02012038, + "epoch": 0.8554937622125357, + "flos": 21284721469440.0, + "grad_norm": 2.007233284435357, + "language_loss": 0.73960888, + "learning_rate": 2.1499970239944542e-07, + "loss": 0.76070321, + "num_input_tokens_seen": 306814160, + "step": 14229, + "time_per_iteration": 2.653921365737915 + }, + { + "auxiliary_loss_clip": 0.01096335, + "auxiliary_loss_mlp": 0.01031192, + "balance_loss_clip": 1.03618884, + "balance_loss_mlp": 1.01952744, + "epoch": 0.8555538854652037, + "flos": 22413178120320.0, + "grad_norm": 2.129951857800807, + "language_loss": 0.72556508, + "learning_rate": 2.1482407068313724e-07, + "loss": 0.74684036, + "num_input_tokens_seen": 306833310, + "step": 14230, + "time_per_iteration": 2.611541509628296 + }, + { + "auxiliary_loss_clip": 0.01094442, + "auxiliary_loss_mlp": 0.01030708, + "balance_loss_clip": 1.03460538, + "balance_loss_mlp": 1.01829863, + "epoch": 0.8556140087178716, + "flos": 20193719725440.0, + "grad_norm": 2.23514067772812, + "language_loss": 0.82251632, + "learning_rate": 2.1464850666087897e-07, + "loss": 0.84376776, + "num_input_tokens_seen": 306851345, + "step": 14231, + "time_per_iteration": 2.6085596084594727 + }, + { + "auxiliary_loss_clip": 0.01100487, + "auxiliary_loss_mlp": 0.01033167, + "balance_loss_clip": 1.03759503, + "balance_loss_mlp": 1.01945221, + "epoch": 0.8556741319705397, + "flos": 22638123043200.0, + "grad_norm": 2.1730018430212175, + "language_loss": 0.67839086, + "learning_rate": 2.1447301033932796e-07, + "loss": 0.69972742, + "num_input_tokens_seen": 306871040, + "step": 14232, + "time_per_iteration": 2.619722843170166 + }, + { + "auxiliary_loss_clip": 0.01088548, + "auxiliary_loss_mlp": 0.01032023, + "balance_loss_clip": 1.03769374, + "balance_loss_mlp": 1.01942301, + "epoch": 0.8557342552232076, + "flos": 23549320281600.0, + "grad_norm": 1.4620803714373924, + "language_loss": 0.66840327, + "learning_rate": 2.1429758172513955e-07, + "loss": 0.68960893, + "num_input_tokens_seen": 306891625, + "step": 14233, + "time_per_iteration": 2.645831346511841 + }, + { + "auxiliary_loss_clip": 0.01096889, + "auxiliary_loss_mlp": 0.01034394, + "balance_loss_clip": 1.03637278, + "balance_loss_mlp": 1.02236605, + "epoch": 0.8557943784758756, + "flos": 19609884063360.0, + "grad_norm": 2.026925189610044, + "language_loss": 0.76869869, + "learning_rate": 2.1412222082496556e-07, + "loss": 0.79001153, + "num_input_tokens_seen": 306910020, + "step": 14234, + "time_per_iteration": 2.58845853805542 + }, + { + "auxiliary_loss_clip": 0.01001494, + "auxiliary_loss_mlp": 0.01021829, + "balance_loss_clip": 1.00670254, + "balance_loss_mlp": 1.02035093, + "epoch": 0.8558545017285435, + "flos": 70641891446400.0, + "grad_norm": 0.7646124593211208, + "language_loss": 0.57967913, + "learning_rate": 2.1394692764545684e-07, + "loss": 0.59991229, + "num_input_tokens_seen": 306969505, + "step": 14235, + "time_per_iteration": 3.2275688648223877 + }, + { + "auxiliary_loss_clip": 0.0101382, + "auxiliary_loss_mlp": 0.01002617, + "balance_loss_clip": 1.01051199, + "balance_loss_mlp": 1.00143075, + "epoch": 0.8559146249812115, + "flos": 56649983086080.0, + "grad_norm": 0.8551315667817418, + "language_loss": 0.56688058, + "learning_rate": 2.1377170219325858e-07, + "loss": 0.58704495, + "num_input_tokens_seen": 307027710, + "step": 14236, + "time_per_iteration": 3.086979866027832 + }, + { + "auxiliary_loss_clip": 0.01086537, + "auxiliary_loss_mlp": 0.01035642, + "balance_loss_clip": 1.035743, + "balance_loss_mlp": 1.02300572, + "epoch": 0.8559747482338794, + "flos": 22888240421760.0, + "grad_norm": 1.785861454279873, + "language_loss": 0.70469606, + "learning_rate": 2.1359654447501673e-07, + "loss": 0.72591788, + "num_input_tokens_seen": 307045515, + "step": 14237, + "time_per_iteration": 2.615514039993286 + }, + { + "auxiliary_loss_clip": 0.01085764, + "auxiliary_loss_mlp": 0.01029104, + "balance_loss_clip": 1.03411865, + "balance_loss_mlp": 1.01737368, + "epoch": 0.8560348714865474, + "flos": 22601925112320.0, + "grad_norm": 2.6092090917428465, + "language_loss": 0.63390237, + "learning_rate": 2.1342145449737314e-07, + "loss": 0.65505099, + "num_input_tokens_seen": 307064470, + "step": 14238, + "time_per_iteration": 2.8091626167297363 + }, + { + "auxiliary_loss_clip": 0.01104641, + "auxiliary_loss_mlp": 0.01032794, + "balance_loss_clip": 1.03615522, + "balance_loss_mlp": 1.02233911, + "epoch": 0.8560949947392154, + "flos": 17931455297280.0, + "grad_norm": 1.7164911082437782, + "language_loss": 0.69517374, + "learning_rate": 2.1324643226696648e-07, + "loss": 0.71654809, + "num_input_tokens_seen": 307083900, + "step": 14239, + "time_per_iteration": 2.57605242729187 + }, + { + "auxiliary_loss_clip": 0.01111794, + "auxiliary_loss_mlp": 0.0103739, + "balance_loss_clip": 1.0377574, + "balance_loss_mlp": 1.02455664, + "epoch": 0.8561551179918834, + "flos": 31026208636800.0, + "grad_norm": 2.169346981343539, + "language_loss": 0.66365606, + "learning_rate": 2.1307147779043455e-07, + "loss": 0.68514788, + "num_input_tokens_seen": 307104590, + "step": 14240, + "time_per_iteration": 2.6193511486053467 + }, + { + "auxiliary_loss_clip": 0.01068263, + "auxiliary_loss_mlp": 0.01040061, + "balance_loss_clip": 1.0336616, + "balance_loss_mlp": 1.02518964, + "epoch": 0.8562152412445513, + "flos": 30665198995200.0, + "grad_norm": 1.6476205784607059, + "language_loss": 0.62131298, + "learning_rate": 2.1289659107441182e-07, + "loss": 0.64239621, + "num_input_tokens_seen": 307125580, + "step": 14241, + "time_per_iteration": 2.7614312171936035 + }, + { + "auxiliary_loss_clip": 0.01112623, + "auxiliary_loss_mlp": 0.01036862, + "balance_loss_clip": 1.03619862, + "balance_loss_mlp": 1.02343321, + "epoch": 0.8562753644972193, + "flos": 31576144838400.0, + "grad_norm": 2.266500331980379, + "language_loss": 0.74537355, + "learning_rate": 2.1272177212552855e-07, + "loss": 0.76686835, + "num_input_tokens_seen": 307147625, + "step": 14242, + "time_per_iteration": 2.6258413791656494 + }, + { + "auxiliary_loss_clip": 0.01043356, + "auxiliary_loss_mlp": 0.01049301, + "balance_loss_clip": 1.0376476, + "balance_loss_mlp": 1.03507984, + "epoch": 0.8563354877498872, + "flos": 26213640618240.0, + "grad_norm": 2.248077645392886, + "language_loss": 0.7636081, + "learning_rate": 2.1254702095041498e-07, + "loss": 0.78453457, + "num_input_tokens_seen": 307164665, + "step": 14243, + "time_per_iteration": 2.819819927215576 + }, + { + "auxiliary_loss_clip": 0.01088321, + "auxiliary_loss_mlp": 0.00769311, + "balance_loss_clip": 1.03758311, + "balance_loss_mlp": 1.00028658, + "epoch": 0.8563956110025552, + "flos": 24134341092480.0, + "grad_norm": 2.314406650865767, + "language_loss": 0.68075836, + "learning_rate": 2.123723375556974e-07, + "loss": 0.69933462, + "num_input_tokens_seen": 307182530, + "step": 14244, + "time_per_iteration": 2.668156147003174 + }, + { + "auxiliary_loss_clip": 0.01020209, + "auxiliary_loss_mlp": 0.01006142, + "balance_loss_clip": 1.0066725, + "balance_loss_mlp": 1.00496769, + "epoch": 0.8564557342552233, + "flos": 56271986311680.0, + "grad_norm": 0.7568226385522613, + "language_loss": 0.58461487, + "learning_rate": 2.1219772194800046e-07, + "loss": 0.60487843, + "num_input_tokens_seen": 307241240, + "step": 14245, + "time_per_iteration": 3.0361111164093018 + }, + { + "auxiliary_loss_clip": 0.01102848, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.03873086, + "balance_loss_mlp": 1.01862907, + "epoch": 0.8565158575078912, + "flos": 23440618748160.0, + "grad_norm": 1.7549005151263664, + "language_loss": 0.77337581, + "learning_rate": 2.1202317413394488e-07, + "loss": 0.79471886, + "num_input_tokens_seen": 307261485, + "step": 14246, + "time_per_iteration": 2.630526542663574 + }, + { + "auxiliary_loss_clip": 0.01082478, + "auxiliary_loss_mlp": 0.01027673, + "balance_loss_clip": 1.03102589, + "balance_loss_mlp": 1.01518607, + "epoch": 0.8565759807605592, + "flos": 20375930442240.0, + "grad_norm": 1.8941484357847163, + "language_loss": 0.81755006, + "learning_rate": 2.1184869412014938e-07, + "loss": 0.83865154, + "num_input_tokens_seen": 307279160, + "step": 14247, + "time_per_iteration": 2.637540578842163 + }, + { + "auxiliary_loss_clip": 0.01088373, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.03624964, + "balance_loss_mlp": 1.01832116, + "epoch": 0.8566361040132271, + "flos": 18807101049600.0, + "grad_norm": 1.8985078396153772, + "language_loss": 0.77648062, + "learning_rate": 2.1167428191323112e-07, + "loss": 0.79768062, + "num_input_tokens_seen": 307297920, + "step": 14248, + "time_per_iteration": 2.637140989303589 + }, + { + "auxiliary_loss_clip": 0.01059574, + "auxiliary_loss_mlp": 0.01038673, + "balance_loss_clip": 1.03150558, + "balance_loss_mlp": 1.02398682, + "epoch": 0.8566962272658951, + "flos": 24535355506560.0, + "grad_norm": 1.8205967022668303, + "language_loss": 0.78117526, + "learning_rate": 2.1149993751980278e-07, + "loss": 0.8021577, + "num_input_tokens_seen": 307318320, + "step": 14249, + "time_per_iteration": 2.747084856033325 + }, + { + "auxiliary_loss_clip": 0.01082913, + "auxiliary_loss_mlp": 0.01033057, + "balance_loss_clip": 1.03500676, + "balance_loss_mlp": 1.02062345, + "epoch": 0.856756350518563, + "flos": 23178506227200.0, + "grad_norm": 1.834584951570381, + "language_loss": 0.78369069, + "learning_rate": 2.1132566094647597e-07, + "loss": 0.80485034, + "num_input_tokens_seen": 307336720, + "step": 14250, + "time_per_iteration": 2.6694507598876953 + }, + { + "auxiliary_loss_clip": 0.01085775, + "auxiliary_loss_mlp": 0.01030606, + "balance_loss_clip": 1.03689909, + "balance_loss_mlp": 1.01948988, + "epoch": 0.856816473771231, + "flos": 20808581760000.0, + "grad_norm": 1.7702839302991833, + "language_loss": 0.79165637, + "learning_rate": 2.1115145219985942e-07, + "loss": 0.81282026, + "num_input_tokens_seen": 307354120, + "step": 14251, + "time_per_iteration": 2.61769962310791 + }, + { + "auxiliary_loss_clip": 0.01071172, + "auxiliary_loss_mlp": 0.01031805, + "balance_loss_clip": 1.03706813, + "balance_loss_mlp": 1.01999116, + "epoch": 0.856876597023899, + "flos": 20228157889920.0, + "grad_norm": 2.063789660652868, + "language_loss": 0.61335462, + "learning_rate": 2.1097731128656005e-07, + "loss": 0.63438439, + "num_input_tokens_seen": 307373165, + "step": 14252, + "time_per_iteration": 2.730942964553833 + }, + { + "auxiliary_loss_clip": 0.01088715, + "auxiliary_loss_mlp": 0.01037397, + "balance_loss_clip": 1.04091692, + "balance_loss_mlp": 1.02395606, + "epoch": 0.856936720276567, + "flos": 18296128126080.0, + "grad_norm": 1.8690578228710872, + "language_loss": 0.69612849, + "learning_rate": 2.1080323821317924e-07, + "loss": 0.71738964, + "num_input_tokens_seen": 307391000, + "step": 14253, + "time_per_iteration": 2.6573426723480225 + }, + { + "auxiliary_loss_clip": 0.01013485, + "auxiliary_loss_mlp": 0.01001116, + "balance_loss_clip": 1.00999308, + "balance_loss_mlp": 1.0000428, + "epoch": 0.8569968435292349, + "flos": 69878394933120.0, + "grad_norm": 0.7842094362693159, + "language_loss": 0.59178007, + "learning_rate": 2.1062923298631907e-07, + "loss": 0.61192608, + "num_input_tokens_seen": 307452865, + "step": 14254, + "time_per_iteration": 3.2271313667297363 + }, + { + "auxiliary_loss_clip": 0.0108384, + "auxiliary_loss_mlp": 0.01034196, + "balance_loss_clip": 1.0343616, + "balance_loss_mlp": 1.02042699, + "epoch": 0.8570569667819029, + "flos": 25848572739840.0, + "grad_norm": 1.7290830197798204, + "language_loss": 0.80958641, + "learning_rate": 2.1045529561257825e-07, + "loss": 0.8307668, + "num_input_tokens_seen": 307471940, + "step": 14255, + "time_per_iteration": 2.6941943168640137 + }, + { + "auxiliary_loss_clip": 0.011065, + "auxiliary_loss_mlp": 0.01024921, + "balance_loss_clip": 1.03668928, + "balance_loss_mlp": 1.01289284, + "epoch": 0.8571170900345708, + "flos": 23257115141760.0, + "grad_norm": 1.9710027507831065, + "language_loss": 0.67309523, + "learning_rate": 2.1028142609855126e-07, + "loss": 0.69440937, + "num_input_tokens_seen": 307488745, + "step": 14256, + "time_per_iteration": 2.719081163406372 + }, + { + "auxiliary_loss_clip": 0.01099477, + "auxiliary_loss_mlp": 0.01031183, + "balance_loss_clip": 1.037992, + "balance_loss_mlp": 1.01950645, + "epoch": 0.8571772132872388, + "flos": 18917670090240.0, + "grad_norm": 2.031884958657008, + "language_loss": 0.70139217, + "learning_rate": 2.1010762445083218e-07, + "loss": 0.72269881, + "num_input_tokens_seen": 307506855, + "step": 14257, + "time_per_iteration": 2.600598096847534 + }, + { + "auxiliary_loss_clip": 0.01073361, + "auxiliary_loss_mlp": 0.01032458, + "balance_loss_clip": 1.03339398, + "balance_loss_mlp": 1.01963735, + "epoch": 0.8572373365399069, + "flos": 33250120318080.0, + "grad_norm": 2.468248667135471, + "language_loss": 0.77000117, + "learning_rate": 2.0993389067601197e-07, + "loss": 0.79105937, + "num_input_tokens_seen": 307526115, + "step": 14258, + "time_per_iteration": 2.757704973220825 + }, + { + "auxiliary_loss_clip": 0.01096583, + "auxiliary_loss_mlp": 0.00769575, + "balance_loss_clip": 1.0357585, + "balance_loss_mlp": 1.00029516, + "epoch": 0.8572974597925748, + "flos": 23327535755520.0, + "grad_norm": 1.474412147771869, + "language_loss": 0.6799866, + "learning_rate": 2.0976022478067735e-07, + "loss": 0.69864815, + "num_input_tokens_seen": 307545230, + "step": 14259, + "time_per_iteration": 2.6122398376464844 + }, + { + "auxiliary_loss_clip": 0.010953, + "auxiliary_loss_mlp": 0.0103545, + "balance_loss_clip": 1.03352249, + "balance_loss_mlp": 1.02250957, + "epoch": 0.8573575830452428, + "flos": 24535858296960.0, + "grad_norm": 1.6836228896931322, + "language_loss": 0.77251399, + "learning_rate": 2.0958662677141437e-07, + "loss": 0.79382151, + "num_input_tokens_seen": 307564900, + "step": 14260, + "time_per_iteration": 2.6170718669891357 + }, + { + "auxiliary_loss_clip": 0.01083087, + "auxiliary_loss_mlp": 0.01031864, + "balance_loss_clip": 1.03345168, + "balance_loss_mlp": 1.0186913, + "epoch": 0.8574177062979107, + "flos": 24165403378560.0, + "grad_norm": 1.694275563361149, + "language_loss": 0.74151957, + "learning_rate": 2.09413096654806e-07, + "loss": 0.76266909, + "num_input_tokens_seen": 307583500, + "step": 14261, + "time_per_iteration": 4.178469181060791 + }, + { + "auxiliary_loss_clip": 0.0109609, + "auxiliary_loss_mlp": 0.01032733, + "balance_loss_clip": 1.03748691, + "balance_loss_mlp": 1.01923871, + "epoch": 0.8574778295505787, + "flos": 17930737025280.0, + "grad_norm": 1.9745240066766159, + "language_loss": 0.78983176, + "learning_rate": 2.0923963443743276e-07, + "loss": 0.81111997, + "num_input_tokens_seen": 307601430, + "step": 14262, + "time_per_iteration": 2.646378993988037 + }, + { + "auxiliary_loss_clip": 0.0107326, + "auxiliary_loss_mlp": 0.01032783, + "balance_loss_clip": 1.03582883, + "balance_loss_mlp": 1.02097511, + "epoch": 0.8575379528032466, + "flos": 21580697537280.0, + "grad_norm": 1.674172506907798, + "language_loss": 0.67816055, + "learning_rate": 2.0906624012587203e-07, + "loss": 0.69922101, + "num_input_tokens_seen": 307621495, + "step": 14263, + "time_per_iteration": 4.332361698150635 + }, + { + "auxiliary_loss_clip": 0.01072214, + "auxiliary_loss_mlp": 0.00770907, + "balance_loss_clip": 1.03429055, + "balance_loss_mlp": 1.00025988, + "epoch": 0.8575980760559146, + "flos": 21761579450880.0, + "grad_norm": 1.4408705629721363, + "language_loss": 0.79718733, + "learning_rate": 2.088929137266986e-07, + "loss": 0.81561852, + "num_input_tokens_seen": 307640840, + "step": 14264, + "time_per_iteration": 2.753828287124634 + }, + { + "auxiliary_loss_clip": 0.01071482, + "auxiliary_loss_mlp": 0.01039645, + "balance_loss_clip": 1.03247488, + "balance_loss_mlp": 1.02618599, + "epoch": 0.8576581993085826, + "flos": 34386442047360.0, + "grad_norm": 1.2896850911673399, + "language_loss": 0.69861013, + "learning_rate": 2.0871965524648582e-07, + "loss": 0.71972132, + "num_input_tokens_seen": 307663820, + "step": 14265, + "time_per_iteration": 4.417909145355225 + }, + { + "auxiliary_loss_clip": 0.01105479, + "auxiliary_loss_mlp": 0.01028348, + "balance_loss_clip": 1.03650212, + "balance_loss_mlp": 1.01695776, + "epoch": 0.8577183225612506, + "flos": 23222497409280.0, + "grad_norm": 1.6592250093825642, + "language_loss": 0.66188025, + "learning_rate": 2.085464646918027e-07, + "loss": 0.68321854, + "num_input_tokens_seen": 307682385, + "step": 14266, + "time_per_iteration": 2.6130142211914062 + }, + { + "auxiliary_loss_clip": 0.01087662, + "auxiliary_loss_mlp": 0.01032473, + "balance_loss_clip": 1.03722739, + "balance_loss_mlp": 1.02009344, + "epoch": 0.8577784458139185, + "flos": 28804164462720.0, + "grad_norm": 1.6281862094757322, + "language_loss": 0.75571585, + "learning_rate": 2.0837334206921731e-07, + "loss": 0.77691722, + "num_input_tokens_seen": 307704680, + "step": 14267, + "time_per_iteration": 4.48302960395813 + }, + { + "auxiliary_loss_clip": 0.01095891, + "auxiliary_loss_mlp": 0.01032819, + "balance_loss_clip": 1.03645444, + "balance_loss_mlp": 1.02119589, + "epoch": 0.8578385690665865, + "flos": 19755573626880.0, + "grad_norm": 1.7702696425064848, + "language_loss": 0.87967706, + "learning_rate": 2.082002873852946e-07, + "loss": 0.9009642, + "num_input_tokens_seen": 307723245, + "step": 14268, + "time_per_iteration": 2.7304728031158447 + }, + { + "auxiliary_loss_clip": 0.01098203, + "auxiliary_loss_mlp": 0.01036419, + "balance_loss_clip": 1.03701484, + "balance_loss_mlp": 1.02400303, + "epoch": 0.8578986923192544, + "flos": 20704082117760.0, + "grad_norm": 2.207459116191671, + "language_loss": 0.72899628, + "learning_rate": 2.0802730064659667e-07, + "loss": 0.75034249, + "num_input_tokens_seen": 307742510, + "step": 14269, + "time_per_iteration": 2.686720848083496 + }, + { + "auxiliary_loss_clip": 0.01099494, + "auxiliary_loss_mlp": 0.01032209, + "balance_loss_clip": 1.03617907, + "balance_loss_mlp": 1.01991236, + "epoch": 0.8579588155719224, + "flos": 36101715189120.0, + "grad_norm": 1.7486556391574948, + "language_loss": 0.66497004, + "learning_rate": 2.0785438185968252e-07, + "loss": 0.68628705, + "num_input_tokens_seen": 307766030, + "step": 14270, + "time_per_iteration": 2.759577751159668 + }, + { + "auxiliary_loss_clip": 0.01082271, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.0320828, + "balance_loss_mlp": 1.01854658, + "epoch": 0.8580189388245905, + "flos": 22853479034880.0, + "grad_norm": 1.9964784224395893, + "language_loss": 0.73861098, + "learning_rate": 2.0768153103110997e-07, + "loss": 0.75974405, + "num_input_tokens_seen": 307785800, + "step": 14271, + "time_per_iteration": 2.6652464866638184 + }, + { + "auxiliary_loss_clip": 0.00990812, + "auxiliary_loss_mlp": 0.00751033, + "balance_loss_clip": 1.00730669, + "balance_loss_mlp": 0.99962157, + "epoch": 0.8580790620772584, + "flos": 69642104290560.0, + "grad_norm": 0.808728293182982, + "language_loss": 0.595052, + "learning_rate": 2.0750874816743358e-07, + "loss": 0.61247051, + "num_input_tokens_seen": 307850995, + "step": 14272, + "time_per_iteration": 3.3493616580963135 + }, + { + "auxiliary_loss_clip": 0.0108737, + "auxiliary_loss_mlp": 0.01037635, + "balance_loss_clip": 1.03556502, + "balance_loss_mlp": 1.02342474, + "epoch": 0.8581391853299264, + "flos": 13334243270400.0, + "grad_norm": 1.7519497448745491, + "language_loss": 0.75282109, + "learning_rate": 2.0733603327520499e-07, + "loss": 0.7740711, + "num_input_tokens_seen": 307868585, + "step": 14273, + "time_per_iteration": 2.6791751384735107 + }, + { + "auxiliary_loss_clip": 0.01097542, + "auxiliary_loss_mlp": 0.01029287, + "balance_loss_clip": 1.03653765, + "balance_loss_mlp": 1.01684737, + "epoch": 0.8581993085825943, + "flos": 19645651031040.0, + "grad_norm": 1.8670463657155183, + "language_loss": 0.82038534, + "learning_rate": 2.0716338636097385e-07, + "loss": 0.84165359, + "num_input_tokens_seen": 307886820, + "step": 14274, + "time_per_iteration": 2.617358446121216 + }, + { + "auxiliary_loss_clip": 0.0101945, + "auxiliary_loss_mlp": 0.01002494, + "balance_loss_clip": 1.00673366, + "balance_loss_mlp": 1.00137389, + "epoch": 0.8582594318352623, + "flos": 55825077294720.0, + "grad_norm": 0.7943422785901219, + "language_loss": 0.60750306, + "learning_rate": 2.0699080743128672e-07, + "loss": 0.6277225, + "num_input_tokens_seen": 307944020, + "step": 14275, + "time_per_iteration": 3.2472341060638428 + }, + { + "auxiliary_loss_clip": 0.01096248, + "auxiliary_loss_mlp": 0.01028909, + "balance_loss_clip": 1.03805137, + "balance_loss_mlp": 1.01562345, + "epoch": 0.8583195550879302, + "flos": 24279563779200.0, + "grad_norm": 2.0431646133306764, + "language_loss": 0.59516066, + "learning_rate": 2.0681829649268768e-07, + "loss": 0.61641222, + "num_input_tokens_seen": 307961055, + "step": 14276, + "time_per_iteration": 2.7009382247924805 + }, + { + "auxiliary_loss_clip": 0.0108586, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.03556418, + "balance_loss_mlp": 1.02205861, + "epoch": 0.8583796783405983, + "flos": 13444129952640.0, + "grad_norm": 2.25300331444078, + "language_loss": 0.76484519, + "learning_rate": 2.0664585355171838e-07, + "loss": 0.786044, + "num_input_tokens_seen": 307978690, + "step": 14277, + "time_per_iteration": 2.6383044719696045 + }, + { + "auxiliary_loss_clip": 0.01085815, + "auxiliary_loss_mlp": 0.0102939, + "balance_loss_clip": 1.0350821, + "balance_loss_mlp": 1.01708126, + "epoch": 0.8584398015932662, + "flos": 16180271533440.0, + "grad_norm": 1.614064459915635, + "language_loss": 0.83699441, + "learning_rate": 2.0647347861491803e-07, + "loss": 0.85814643, + "num_input_tokens_seen": 307995870, + "step": 14278, + "time_per_iteration": 2.690840721130371 + }, + { + "auxiliary_loss_clip": 0.01087706, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.03669083, + "balance_loss_mlp": 1.01709092, + "epoch": 0.8584999248459342, + "flos": 17450431338240.0, + "grad_norm": 2.022696664220824, + "language_loss": 0.74557948, + "learning_rate": 2.0630117168882366e-07, + "loss": 0.76675826, + "num_input_tokens_seen": 308013645, + "step": 14279, + "time_per_iteration": 2.6342451572418213 + }, + { + "auxiliary_loss_clip": 0.01107856, + "auxiliary_loss_mlp": 0.0103262, + "balance_loss_clip": 1.03726792, + "balance_loss_mlp": 1.0206275, + "epoch": 0.8585600480986021, + "flos": 23441013797760.0, + "grad_norm": 2.2412241965372095, + "language_loss": 0.66438127, + "learning_rate": 2.0612893277996845e-07, + "loss": 0.68578601, + "num_input_tokens_seen": 308032490, + "step": 14280, + "time_per_iteration": 2.586599349975586 + }, + { + "auxiliary_loss_clip": 0.01095719, + "auxiliary_loss_mlp": 0.01028274, + "balance_loss_clip": 1.03592777, + "balance_loss_mlp": 1.01651978, + "epoch": 0.8586201713512701, + "flos": 19937927998080.0, + "grad_norm": 1.86716453090562, + "language_loss": 0.62667966, + "learning_rate": 2.0595676189488343e-07, + "loss": 0.64791965, + "num_input_tokens_seen": 308052110, + "step": 14281, + "time_per_iteration": 2.6187994480133057 + }, + { + "auxiliary_loss_clip": 0.01084456, + "auxiliary_loss_mlp": 0.00770032, + "balance_loss_clip": 1.03628945, + "balance_loss_mlp": 1.00014341, + "epoch": 0.858680294603938, + "flos": 15304769435520.0, + "grad_norm": 3.7299826958950493, + "language_loss": 0.73169029, + "learning_rate": 2.0578465904009845e-07, + "loss": 0.7502352, + "num_input_tokens_seen": 308070660, + "step": 14282, + "time_per_iteration": 2.7070963382720947 + }, + { + "auxiliary_loss_clip": 0.01080016, + "auxiliary_loss_mlp": 0.01030436, + "balance_loss_clip": 1.0322001, + "balance_loss_mlp": 1.01892662, + "epoch": 0.858740417856606, + "flos": 22711237176960.0, + "grad_norm": 1.8508205946022054, + "language_loss": 0.75599825, + "learning_rate": 2.0561262422213832e-07, + "loss": 0.77710283, + "num_input_tokens_seen": 308089520, + "step": 14283, + "time_per_iteration": 2.70784854888916 + }, + { + "auxiliary_loss_clip": 0.01093289, + "auxiliary_loss_mlp": 0.01032374, + "balance_loss_clip": 1.03351057, + "balance_loss_mlp": 1.01973772, + "epoch": 0.8588005411092741, + "flos": 34054303962240.0, + "grad_norm": 1.810517869683259, + "language_loss": 0.60200775, + "learning_rate": 2.0544065744752736e-07, + "loss": 0.62326431, + "num_input_tokens_seen": 308111545, + "step": 14284, + "time_per_iteration": 2.804454803466797 + }, + { + "auxiliary_loss_clip": 0.01080997, + "auxiliary_loss_mlp": 0.01030838, + "balance_loss_clip": 1.03671587, + "balance_loss_mlp": 1.01877391, + "epoch": 0.858860664361942, + "flos": 28913584268160.0, + "grad_norm": 1.9759393563383274, + "language_loss": 0.75834155, + "learning_rate": 2.0526875872278749e-07, + "loss": 0.77945989, + "num_input_tokens_seen": 308129690, + "step": 14285, + "time_per_iteration": 2.717355489730835 + }, + { + "auxiliary_loss_clip": 0.01096428, + "auxiliary_loss_mlp": 0.01034968, + "balance_loss_clip": 1.03976953, + "balance_loss_mlp": 1.0222249, + "epoch": 0.85892078761461, + "flos": 19792525743360.0, + "grad_norm": 2.2818993237689242, + "language_loss": 0.7433964, + "learning_rate": 2.0509692805443524e-07, + "loss": 0.76471031, + "num_input_tokens_seen": 308147410, + "step": 14286, + "time_per_iteration": 2.60193133354187 + }, + { + "auxiliary_loss_clip": 0.01009396, + "auxiliary_loss_mlp": 0.00750956, + "balance_loss_clip": 1.0070982, + "balance_loss_mlp": 0.99964851, + "epoch": 0.8589809108672779, + "flos": 67106630039040.0, + "grad_norm": 0.7818074542698659, + "language_loss": 0.4943513, + "learning_rate": 2.0492516544898718e-07, + "loss": 0.51195478, + "num_input_tokens_seen": 308204875, + "step": 14287, + "time_per_iteration": 3.223233461380005 + }, + { + "auxiliary_loss_clip": 0.01099243, + "auxiliary_loss_mlp": 0.01030496, + "balance_loss_clip": 1.03820276, + "balance_loss_mlp": 1.01868248, + "epoch": 0.8590410341199459, + "flos": 29716259541120.0, + "grad_norm": 2.0753846040431574, + "language_loss": 0.79119551, + "learning_rate": 2.0475347091295704e-07, + "loss": 0.81249291, + "num_input_tokens_seen": 308225690, + "step": 14288, + "time_per_iteration": 2.8012468814849854 + }, + { + "auxiliary_loss_clip": 0.01070856, + "auxiliary_loss_mlp": 0.01034723, + "balance_loss_clip": 1.03844345, + "balance_loss_mlp": 1.02160382, + "epoch": 0.8591011573726138, + "flos": 23987430466560.0, + "grad_norm": 2.333742079437343, + "language_loss": 0.80807364, + "learning_rate": 2.045818444528553e-07, + "loss": 0.82912946, + "num_input_tokens_seen": 308245255, + "step": 14289, + "time_per_iteration": 2.677363634109497 + }, + { + "auxiliary_loss_clip": 0.01101023, + "auxiliary_loss_mlp": 0.01032169, + "balance_loss_clip": 1.03798854, + "balance_loss_mlp": 1.01974702, + "epoch": 0.8591612806252819, + "flos": 14428656806400.0, + "grad_norm": 1.8411584307927096, + "language_loss": 0.65171552, + "learning_rate": 2.0441028607518973e-07, + "loss": 0.67304742, + "num_input_tokens_seen": 308261755, + "step": 14290, + "time_per_iteration": 2.6130077838897705 + }, + { + "auxiliary_loss_clip": 0.01088699, + "auxiliary_loss_mlp": 0.01029909, + "balance_loss_clip": 1.0362072, + "balance_loss_mlp": 1.01692736, + "epoch": 0.8592214038779498, + "flos": 31577150419200.0, + "grad_norm": 1.9868152248145707, + "language_loss": 0.55034781, + "learning_rate": 2.0423879578646642e-07, + "loss": 0.57153386, + "num_input_tokens_seen": 308285145, + "step": 14291, + "time_per_iteration": 2.7079780101776123 + }, + { + "auxiliary_loss_clip": 0.0110119, + "auxiliary_loss_mlp": 0.01031083, + "balance_loss_clip": 1.03754354, + "balance_loss_mlp": 1.01885247, + "epoch": 0.8592815271306178, + "flos": 17457290835840.0, + "grad_norm": 2.056468770778706, + "language_loss": 0.71314991, + "learning_rate": 2.0406737359318792e-07, + "loss": 0.73447263, + "num_input_tokens_seen": 308304130, + "step": 14292, + "time_per_iteration": 2.595897674560547 + }, + { + "auxiliary_loss_clip": 0.01098184, + "auxiliary_loss_mlp": 0.01034158, + "balance_loss_clip": 1.03526211, + "balance_loss_mlp": 1.02187383, + "epoch": 0.8593416503832857, + "flos": 25411360394880.0, + "grad_norm": 1.470631901330716, + "language_loss": 0.71314609, + "learning_rate": 2.038960195018542e-07, + "loss": 0.73446953, + "num_input_tokens_seen": 308324670, + "step": 14293, + "time_per_iteration": 2.652717351913452 + }, + { + "auxiliary_loss_clip": 0.01080648, + "auxiliary_loss_mlp": 0.01034358, + "balance_loss_clip": 1.03720033, + "balance_loss_mlp": 1.02217507, + "epoch": 0.8594017736359537, + "flos": 20996646393600.0, + "grad_norm": 1.543293476083091, + "language_loss": 0.6855827, + "learning_rate": 2.0372473351896358e-07, + "loss": 0.70673275, + "num_input_tokens_seen": 308344215, + "step": 14294, + "time_per_iteration": 2.6766042709350586 + }, + { + "auxiliary_loss_clip": 0.01104946, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.03467357, + "balance_loss_mlp": 1.01901507, + "epoch": 0.8594618968886216, + "flos": 22091059929600.0, + "grad_norm": 1.9038081617192384, + "language_loss": 0.77887809, + "learning_rate": 2.0355351565101087e-07, + "loss": 0.80023754, + "num_input_tokens_seen": 308360520, + "step": 14295, + "time_per_iteration": 2.6753733158111572 + }, + { + "auxiliary_loss_clip": 0.01085392, + "auxiliary_loss_mlp": 0.01037375, + "balance_loss_clip": 1.03575659, + "balance_loss_mlp": 1.02281332, + "epoch": 0.8595220201412896, + "flos": 11656245467520.0, + "grad_norm": 2.8815633850100095, + "language_loss": 0.69029182, + "learning_rate": 2.0338236590448975e-07, + "loss": 0.71151948, + "num_input_tokens_seen": 308376865, + "step": 14296, + "time_per_iteration": 2.6722471714019775 + }, + { + "auxiliary_loss_clip": 0.01081568, + "auxiliary_loss_mlp": 0.01033296, + "balance_loss_clip": 1.03467476, + "balance_loss_mlp": 1.02070773, + "epoch": 0.8595821433939577, + "flos": 25040366772480.0, + "grad_norm": 2.176741931564133, + "language_loss": 0.78606057, + "learning_rate": 2.0321128428588842e-07, + "loss": 0.80720925, + "num_input_tokens_seen": 308395870, + "step": 14297, + "time_per_iteration": 2.6577630043029785 + }, + { + "auxiliary_loss_clip": 0.01091905, + "auxiliary_loss_mlp": 0.01032076, + "balance_loss_clip": 1.03271341, + "balance_loss_mlp": 1.02086425, + "epoch": 0.8596422666466256, + "flos": 28511528359680.0, + "grad_norm": 2.673036998280705, + "language_loss": 0.67951548, + "learning_rate": 2.030402708016954e-07, + "loss": 0.7007553, + "num_input_tokens_seen": 308417250, + "step": 14298, + "time_per_iteration": 2.7069945335388184 + }, + { + "auxiliary_loss_clip": 0.01083251, + "auxiliary_loss_mlp": 0.01035882, + "balance_loss_clip": 1.0348295, + "balance_loss_mlp": 1.02360308, + "epoch": 0.8597023898992936, + "flos": 13589137157760.0, + "grad_norm": 2.2714540225430775, + "language_loss": 0.68807364, + "learning_rate": 2.0286932545839576e-07, + "loss": 0.70926499, + "num_input_tokens_seen": 308434565, + "step": 14299, + "time_per_iteration": 2.637234687805176 + }, + { + "auxiliary_loss_clip": 0.01080144, + "auxiliary_loss_mlp": 0.01036834, + "balance_loss_clip": 1.03766489, + "balance_loss_mlp": 1.02434683, + "epoch": 0.8597625131519615, + "flos": 32300821728000.0, + "grad_norm": 2.455453131727374, + "language_loss": 0.71315849, + "learning_rate": 2.0269844826247096e-07, + "loss": 0.73432827, + "num_input_tokens_seen": 308450040, + "step": 14300, + "time_per_iteration": 4.307279109954834 + }, + { + "auxiliary_loss_clip": 0.01080749, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.03184569, + "balance_loss_mlp": 1.02227378, + "epoch": 0.8598226364046295, + "flos": 28730367970560.0, + "grad_norm": 2.4178089215843377, + "language_loss": 0.69498658, + "learning_rate": 2.0252763922040116e-07, + "loss": 0.71614629, + "num_input_tokens_seen": 308470545, + "step": 14301, + "time_per_iteration": 2.7081966400146484 + }, + { + "auxiliary_loss_clip": 0.01056383, + "auxiliary_loss_mlp": 0.01033857, + "balance_loss_clip": 1.03381944, + "balance_loss_mlp": 1.02151251, + "epoch": 0.8598827596572974, + "flos": 21871825269120.0, + "grad_norm": 1.627550751133936, + "language_loss": 0.74207568, + "learning_rate": 2.023568983386641e-07, + "loss": 0.76297808, + "num_input_tokens_seen": 308490020, + "step": 14302, + "time_per_iteration": 2.711632251739502 + }, + { + "auxiliary_loss_clip": 0.01092554, + "auxiliary_loss_mlp": 0.01030522, + "balance_loss_clip": 1.03438914, + "balance_loss_mlp": 1.01904821, + "epoch": 0.8599428829099655, + "flos": 23767297966080.0, + "grad_norm": 1.6910498368057518, + "language_loss": 0.83883357, + "learning_rate": 2.02186225623733e-07, + "loss": 0.86006427, + "num_input_tokens_seen": 308509065, + "step": 14303, + "time_per_iteration": 4.2169249057769775 + }, + { + "auxiliary_loss_clip": 0.01096255, + "auxiliary_loss_mlp": 0.01036748, + "balance_loss_clip": 1.03428876, + "balance_loss_mlp": 1.02355707, + "epoch": 0.8600030061626334, + "flos": 16212770363520.0, + "grad_norm": 2.148560231945797, + "language_loss": 0.7746321, + "learning_rate": 2.0201562108208025e-07, + "loss": 0.7959621, + "num_input_tokens_seen": 308524725, + "step": 14304, + "time_per_iteration": 4.171972990036011 + }, + { + "auxiliary_loss_clip": 0.01110849, + "auxiliary_loss_mlp": 0.01035361, + "balance_loss_clip": 1.03822732, + "balance_loss_mlp": 1.02181315, + "epoch": 0.8600631294153014, + "flos": 15669370437120.0, + "grad_norm": 1.95456339418458, + "language_loss": 0.54470098, + "learning_rate": 2.0184508472017537e-07, + "loss": 0.56616312, + "num_input_tokens_seen": 308543525, + "step": 14305, + "time_per_iteration": 2.594041585922241 + }, + { + "auxiliary_loss_clip": 0.01108772, + "auxiliary_loss_mlp": 0.01029221, + "balance_loss_clip": 1.03798604, + "balance_loss_mlp": 1.01622105, + "epoch": 0.8601232526679693, + "flos": 17493093717120.0, + "grad_norm": 1.9212568904782885, + "language_loss": 0.84086001, + "learning_rate": 2.0167461654448558e-07, + "loss": 0.86223984, + "num_input_tokens_seen": 308557995, + "step": 14306, + "time_per_iteration": 4.086545467376709 + }, + { + "auxiliary_loss_clip": 0.01097083, + "auxiliary_loss_mlp": 0.00769534, + "balance_loss_clip": 1.03672814, + "balance_loss_mlp": 1.00017905, + "epoch": 0.8601833759206373, + "flos": 26985935963520.0, + "grad_norm": 1.3981944993349464, + "language_loss": 0.71432567, + "learning_rate": 2.01504216561474e-07, + "loss": 0.73299187, + "num_input_tokens_seen": 308582750, + "step": 14307, + "time_per_iteration": 2.7123961448669434 + }, + { + "auxiliary_loss_clip": 0.01096964, + "auxiliary_loss_mlp": 0.00771884, + "balance_loss_clip": 1.03435898, + "balance_loss_mlp": 1.00030386, + "epoch": 0.8602434991733052, + "flos": 25229760209280.0, + "grad_norm": 1.8399000779871275, + "language_loss": 0.636989, + "learning_rate": 2.0133388477760316e-07, + "loss": 0.6556775, + "num_input_tokens_seen": 308603770, + "step": 14308, + "time_per_iteration": 2.6409523487091064 + }, + { + "auxiliary_loss_clip": 0.01010709, + "auxiliary_loss_mlp": 0.01001248, + "balance_loss_clip": 1.00715673, + "balance_loss_mlp": 1.00013912, + "epoch": 0.8603036224259732, + "flos": 71015363107200.0, + "grad_norm": 0.6173812153983712, + "language_loss": 0.48415971, + "learning_rate": 2.0116362119933172e-07, + "loss": 0.50427926, + "num_input_tokens_seen": 308667735, + "step": 14309, + "time_per_iteration": 3.2728710174560547 + }, + { + "auxiliary_loss_clip": 0.01054401, + "auxiliary_loss_mlp": 0.01034995, + "balance_loss_clip": 1.03519821, + "balance_loss_mlp": 1.02176285, + "epoch": 0.8603637456786413, + "flos": 20300625578880.0, + "grad_norm": 1.8566830795066585, + "language_loss": 0.67076862, + "learning_rate": 2.0099342583311563e-07, + "loss": 0.69166255, + "num_input_tokens_seen": 308686300, + "step": 14310, + "time_per_iteration": 2.7875287532806396 + }, + { + "auxiliary_loss_clip": 0.01040328, + "auxiliary_loss_mlp": 0.01035443, + "balance_loss_clip": 1.02937603, + "balance_loss_mlp": 1.02352858, + "epoch": 0.8604238689313092, + "flos": 21835842819840.0, + "grad_norm": 1.7678336453099173, + "language_loss": 0.7815913, + "learning_rate": 2.0082329868540905e-07, + "loss": 0.80234909, + "num_input_tokens_seen": 308705825, + "step": 14311, + "time_per_iteration": 2.779208183288574 + }, + { + "auxiliary_loss_clip": 0.01096237, + "auxiliary_loss_mlp": 0.01031381, + "balance_loss_clip": 1.03626657, + "balance_loss_mlp": 1.0191319, + "epoch": 0.8604839921839772, + "flos": 18004210295040.0, + "grad_norm": 2.0823633926297087, + "language_loss": 0.72099596, + "learning_rate": 2.006532397626639e-07, + "loss": 0.74227214, + "num_input_tokens_seen": 308723340, + "step": 14312, + "time_per_iteration": 2.572300672531128 + }, + { + "auxiliary_loss_clip": 0.01079744, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.03377199, + "balance_loss_mlp": 1.02101254, + "epoch": 0.8605441154366451, + "flos": 16252164604800.0, + "grad_norm": 4.48770964436052, + "language_loss": 0.77972746, + "learning_rate": 2.0048324907132797e-07, + "loss": 0.80085838, + "num_input_tokens_seen": 308741280, + "step": 14313, + "time_per_iteration": 2.6455512046813965 + }, + { + "auxiliary_loss_clip": 0.01084267, + "auxiliary_loss_mlp": 0.0103529, + "balance_loss_clip": 1.03463316, + "balance_loss_mlp": 1.02147377, + "epoch": 0.8606042386893131, + "flos": 32267065921920.0, + "grad_norm": 1.4772487181933294, + "language_loss": 0.7305848, + "learning_rate": 2.003133266178474e-07, + "loss": 0.75178033, + "num_input_tokens_seen": 308762875, + "step": 14314, + "time_per_iteration": 2.760899782180786 + }, + { + "auxiliary_loss_clip": 0.01085045, + "auxiliary_loss_mlp": 0.01033377, + "balance_loss_clip": 1.03471231, + "balance_loss_mlp": 1.02096725, + "epoch": 0.860664361941981, + "flos": 20229774001920.0, + "grad_norm": 1.8071847940662549, + "language_loss": 0.68796486, + "learning_rate": 2.001434724086657e-07, + "loss": 0.70914906, + "num_input_tokens_seen": 308780315, + "step": 14315, + "time_per_iteration": 2.649801254272461 + }, + { + "auxiliary_loss_clip": 0.01096879, + "auxiliary_loss_mlp": 0.01032695, + "balance_loss_clip": 1.03672695, + "balance_loss_mlp": 1.02085114, + "epoch": 0.8607244851946491, + "flos": 25191622944000.0, + "grad_norm": 1.885182281921848, + "language_loss": 0.71844518, + "learning_rate": 1.9997368645022418e-07, + "loss": 0.73974097, + "num_input_tokens_seen": 308799435, + "step": 14316, + "time_per_iteration": 2.7529983520507812 + }, + { + "auxiliary_loss_clip": 0.01090676, + "auxiliary_loss_mlp": 0.01030269, + "balance_loss_clip": 1.04007196, + "balance_loss_mlp": 1.0183723, + "epoch": 0.860784608447317, + "flos": 20482082110080.0, + "grad_norm": 1.9094680545566136, + "language_loss": 0.82880986, + "learning_rate": 1.9980396874896056e-07, + "loss": 0.85001934, + "num_input_tokens_seen": 308817730, + "step": 14317, + "time_per_iteration": 2.6640453338623047 + }, + { + "auxiliary_loss_clip": 0.01090255, + "auxiliary_loss_mlp": 0.01030325, + "balance_loss_clip": 1.03797185, + "balance_loss_mlp": 1.01819539, + "epoch": 0.860844731699985, + "flos": 50476037696640.0, + "grad_norm": 1.6214847591514705, + "language_loss": 0.67348385, + "learning_rate": 1.996343193113108e-07, + "loss": 0.69468963, + "num_input_tokens_seen": 308841735, + "step": 14318, + "time_per_iteration": 2.869259834289551 + }, + { + "auxiliary_loss_clip": 0.01094097, + "auxiliary_loss_mlp": 0.01027928, + "balance_loss_clip": 1.0362227, + "balance_loss_mlp": 1.01671052, + "epoch": 0.8609048549526529, + "flos": 41172768455040.0, + "grad_norm": 1.558793225555784, + "language_loss": 0.71354842, + "learning_rate": 1.9946473814370911e-07, + "loss": 0.73476869, + "num_input_tokens_seen": 308865050, + "step": 14319, + "time_per_iteration": 2.844249963760376 + }, + { + "auxiliary_loss_clip": 0.0109006, + "auxiliary_loss_mlp": 0.00769912, + "balance_loss_clip": 1.03683519, + "balance_loss_mlp": 1.00023806, + "epoch": 0.8609649782053209, + "flos": 23951196622080.0, + "grad_norm": 1.8769046861773884, + "language_loss": 0.67780548, + "learning_rate": 1.992952252525839e-07, + "loss": 0.69640523, + "num_input_tokens_seen": 308885375, + "step": 14320, + "time_per_iteration": 2.6762452125549316 + }, + { + "auxiliary_loss_clip": 0.01080757, + "auxiliary_loss_mlp": 0.01037726, + "balance_loss_clip": 1.03380013, + "balance_loss_mlp": 1.02343893, + "epoch": 0.8610251014579888, + "flos": 23112574813440.0, + "grad_norm": 5.062268799214488, + "language_loss": 0.79499638, + "learning_rate": 1.9912578064436446e-07, + "loss": 0.81618118, + "num_input_tokens_seen": 308904700, + "step": 14321, + "time_per_iteration": 2.7844552993774414 + }, + { + "auxiliary_loss_clip": 0.01092256, + "auxiliary_loss_mlp": 0.00770223, + "balance_loss_clip": 1.03433347, + "balance_loss_mlp": 1.00014472, + "epoch": 0.8610852247106568, + "flos": 19426811420160.0, + "grad_norm": 1.8063677075547142, + "language_loss": 0.7084378, + "learning_rate": 1.9895640432547567e-07, + "loss": 0.72706258, + "num_input_tokens_seen": 308922985, + "step": 14322, + "time_per_iteration": 2.6614699363708496 + }, + { + "auxiliary_loss_clip": 0.01087983, + "auxiliary_loss_mlp": 0.01039264, + "balance_loss_clip": 1.03474808, + "balance_loss_mlp": 1.02575755, + "epoch": 0.8611453479633249, + "flos": 19312076401920.0, + "grad_norm": 1.9402477905188305, + "language_loss": 0.56338006, + "learning_rate": 1.9878709630234102e-07, + "loss": 0.58465254, + "num_input_tokens_seen": 308940765, + "step": 14323, + "time_per_iteration": 2.639302968978882 + }, + { + "auxiliary_loss_clip": 0.01071823, + "auxiliary_loss_mlp": 0.01026562, + "balance_loss_clip": 1.03276682, + "balance_loss_mlp": 1.01427782, + "epoch": 0.8612054712159928, + "flos": 23253667436160.0, + "grad_norm": 2.0929228827532413, + "language_loss": 0.75493181, + "learning_rate": 1.986178565813801e-07, + "loss": 0.77591568, + "num_input_tokens_seen": 308960110, + "step": 14324, + "time_per_iteration": 2.6960513591766357 + }, + { + "auxiliary_loss_clip": 0.01063342, + "auxiliary_loss_mlp": 0.01035964, + "balance_loss_clip": 1.03498292, + "balance_loss_mlp": 1.02134275, + "epoch": 0.8612655944686608, + "flos": 16028440744320.0, + "grad_norm": 2.114341094605167, + "language_loss": 0.66620868, + "learning_rate": 1.9844868516901036e-07, + "loss": 0.68720174, + "num_input_tokens_seen": 308976665, + "step": 14325, + "time_per_iteration": 2.704503297805786 + }, + { + "auxiliary_loss_clip": 0.01099873, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.03732955, + "balance_loss_mlp": 1.01800442, + "epoch": 0.8613257177213287, + "flos": 22492720788480.0, + "grad_norm": 1.7540053494594063, + "language_loss": 0.64823282, + "learning_rate": 1.982795820716472e-07, + "loss": 0.66953552, + "num_input_tokens_seen": 308997015, + "step": 14326, + "time_per_iteration": 2.634575843811035 + }, + { + "auxiliary_loss_clip": 0.01085647, + "auxiliary_loss_mlp": 0.01033577, + "balance_loss_clip": 1.03369999, + "balance_loss_mlp": 1.02078009, + "epoch": 0.8613858409739967, + "flos": 17238056175360.0, + "grad_norm": 1.9850234614136824, + "language_loss": 0.84380805, + "learning_rate": 1.9811054729570253e-07, + "loss": 0.86500031, + "num_input_tokens_seen": 309015250, + "step": 14327, + "time_per_iteration": 2.653275728225708 + }, + { + "auxiliary_loss_clip": 0.01098118, + "auxiliary_loss_mlp": 0.01031945, + "balance_loss_clip": 1.0356977, + "balance_loss_mlp": 1.01924908, + "epoch": 0.8614459642266646, + "flos": 22821123859200.0, + "grad_norm": 2.1125726227452186, + "language_loss": 0.7496419, + "learning_rate": 1.9794158084758661e-07, + "loss": 0.77094257, + "num_input_tokens_seen": 309034140, + "step": 14328, + "time_per_iteration": 2.644585132598877 + }, + { + "auxiliary_loss_clip": 0.01096938, + "auxiliary_loss_mlp": 0.01027149, + "balance_loss_clip": 1.03526139, + "balance_loss_mlp": 1.01539493, + "epoch": 0.8615060874793327, + "flos": 26504301473280.0, + "grad_norm": 1.8484016306146063, + "language_loss": 0.80306005, + "learning_rate": 1.9777268273370673e-07, + "loss": 0.82430089, + "num_input_tokens_seen": 309055075, + "step": 14329, + "time_per_iteration": 2.723478078842163 + }, + { + "auxiliary_loss_clip": 0.01083147, + "auxiliary_loss_mlp": 0.01031796, + "balance_loss_clip": 1.03760588, + "balance_loss_mlp": 1.01930857, + "epoch": 0.8615662107320006, + "flos": 24061011477120.0, + "grad_norm": 2.0863615030267937, + "language_loss": 0.76824546, + "learning_rate": 1.9760385296046757e-07, + "loss": 0.78939486, + "num_input_tokens_seen": 309074650, + "step": 14330, + "time_per_iteration": 2.812311887741089 + }, + { + "auxiliary_loss_clip": 0.01096755, + "auxiliary_loss_mlp": 0.01030872, + "balance_loss_clip": 1.03553391, + "balance_loss_mlp": 1.01855159, + "epoch": 0.8616263339846686, + "flos": 24165044242560.0, + "grad_norm": 1.8215281342853327, + "language_loss": 0.64920008, + "learning_rate": 1.974350915342702e-07, + "loss": 0.67047632, + "num_input_tokens_seen": 309094385, + "step": 14331, + "time_per_iteration": 2.6918468475341797 + }, + { + "auxiliary_loss_clip": 0.01086033, + "auxiliary_loss_mlp": 0.01032455, + "balance_loss_clip": 1.03811228, + "balance_loss_mlp": 1.02118349, + "epoch": 0.8616864572373365, + "flos": 21724340025600.0, + "grad_norm": 1.6141703486069339, + "language_loss": 0.760149, + "learning_rate": 1.9726639846151506e-07, + "loss": 0.7813338, + "num_input_tokens_seen": 309111815, + "step": 14332, + "time_per_iteration": 2.7376909255981445 + }, + { + "auxiliary_loss_clip": 0.01096761, + "auxiliary_loss_mlp": 0.01031207, + "balance_loss_clip": 1.03702247, + "balance_loss_mlp": 1.01777267, + "epoch": 0.8617465804900045, + "flos": 23766651521280.0, + "grad_norm": 1.7306536007075406, + "language_loss": 0.67241013, + "learning_rate": 1.9709777374859904e-07, + "loss": 0.69368982, + "num_input_tokens_seen": 309131385, + "step": 14333, + "time_per_iteration": 2.6434760093688965 + }, + { + "auxiliary_loss_clip": 0.01086243, + "auxiliary_loss_mlp": 0.0103825, + "balance_loss_clip": 1.03663921, + "balance_loss_mlp": 1.02411211, + "epoch": 0.8618067037426724, + "flos": 37703941251840.0, + "grad_norm": 1.6353598696173437, + "language_loss": 0.62017745, + "learning_rate": 1.969292174019157e-07, + "loss": 0.64142239, + "num_input_tokens_seen": 309155020, + "step": 14334, + "time_per_iteration": 2.758512258529663 + }, + { + "auxiliary_loss_clip": 0.01080188, + "auxiliary_loss_mlp": 0.01048728, + "balance_loss_clip": 1.0376997, + "balance_loss_mlp": 1.03463769, + "epoch": 0.8618668269953405, + "flos": 21471026336640.0, + "grad_norm": 4.004935288615531, + "language_loss": 0.69439906, + "learning_rate": 1.967607294278577e-07, + "loss": 0.71568823, + "num_input_tokens_seen": 309172865, + "step": 14335, + "time_per_iteration": 2.69771671295166 + }, + { + "auxiliary_loss_clip": 0.01100982, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.03802538, + "balance_loss_mlp": 1.02374029, + "epoch": 0.8619269502480085, + "flos": 22232691256320.0, + "grad_norm": 3.0287384377889297, + "language_loss": 0.82912672, + "learning_rate": 1.965923098328135e-07, + "loss": 0.85049564, + "num_input_tokens_seen": 309193575, + "step": 14336, + "time_per_iteration": 2.6209864616394043 + }, + { + "auxiliary_loss_clip": 0.01112224, + "auxiliary_loss_mlp": 0.010339, + "balance_loss_clip": 1.03766823, + "balance_loss_mlp": 1.02133584, + "epoch": 0.8619870735006764, + "flos": 22710626645760.0, + "grad_norm": 2.099074168500333, + "language_loss": 0.67489713, + "learning_rate": 1.9642395862316907e-07, + "loss": 0.69635832, + "num_input_tokens_seen": 309212680, + "step": 14337, + "time_per_iteration": 2.6033341884613037 + }, + { + "auxiliary_loss_clip": 0.01069511, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.03057778, + "balance_loss_mlp": 1.02058089, + "epoch": 0.8620471967533444, + "flos": 37520293991040.0, + "grad_norm": 1.5608583142668484, + "language_loss": 0.6694777, + "learning_rate": 1.962556758053089e-07, + "loss": 0.69050497, + "num_input_tokens_seen": 309234485, + "step": 14338, + "time_per_iteration": 2.775123119354248 + }, + { + "auxiliary_loss_clip": 0.01086678, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.03658581, + "balance_loss_mlp": 1.02030885, + "epoch": 0.8621073200060123, + "flos": 19682459493120.0, + "grad_norm": 1.9189965100666158, + "language_loss": 0.62008345, + "learning_rate": 1.9608746138561448e-07, + "loss": 0.64126867, + "num_input_tokens_seen": 309253630, + "step": 14339, + "time_per_iteration": 2.696450710296631 + }, + { + "auxiliary_loss_clip": 0.01086707, + "auxiliary_loss_mlp": 0.00770344, + "balance_loss_clip": 1.03489327, + "balance_loss_mlp": 1.00020528, + "epoch": 0.8621674432586803, + "flos": 14536855549440.0, + "grad_norm": 1.8496565464342125, + "language_loss": 0.62634254, + "learning_rate": 1.9591931537046458e-07, + "loss": 0.64491296, + "num_input_tokens_seen": 309270950, + "step": 14340, + "time_per_iteration": 4.219670295715332 + }, + { + "auxiliary_loss_clip": 0.01060496, + "auxiliary_loss_mlp": 0.0102529, + "balance_loss_clip": 1.03393662, + "balance_loss_mlp": 1.01384556, + "epoch": 0.8622275665113482, + "flos": 20740100480640.0, + "grad_norm": 1.5540537291722216, + "language_loss": 0.79882658, + "learning_rate": 1.9575123776623493e-07, + "loss": 0.81968445, + "num_input_tokens_seen": 309288780, + "step": 14341, + "time_per_iteration": 2.7992727756500244 + }, + { + "auxiliary_loss_clip": 0.01092904, + "auxiliary_loss_mlp": 0.01032082, + "balance_loss_clip": 1.0364188, + "balance_loss_mlp": 1.02028048, + "epoch": 0.8622876897640163, + "flos": 24715914197760.0, + "grad_norm": 1.6849671618732158, + "language_loss": 0.74542058, + "learning_rate": 1.9558322857929887e-07, + "loss": 0.76667047, + "num_input_tokens_seen": 309310875, + "step": 14342, + "time_per_iteration": 4.3738038539886475 + }, + { + "auxiliary_loss_clip": 0.01069834, + "auxiliary_loss_mlp": 0.01028747, + "balance_loss_clip": 1.03554666, + "balance_loss_mlp": 1.01579463, + "epoch": 0.8623478130166842, + "flos": 17457362663040.0, + "grad_norm": 1.6056166986401446, + "language_loss": 0.68522966, + "learning_rate": 1.95415287816028e-07, + "loss": 0.7062155, + "num_input_tokens_seen": 309329900, + "step": 14343, + "time_per_iteration": 4.237400770187378 + }, + { + "auxiliary_loss_clip": 0.01096424, + "auxiliary_loss_mlp": 0.01042074, + "balance_loss_clip": 1.03559923, + "balance_loss_mlp": 1.02879965, + "epoch": 0.8624079362693522, + "flos": 18109176814080.0, + "grad_norm": 1.6148942161800302, + "language_loss": 0.6802907, + "learning_rate": 1.9524741548278967e-07, + "loss": 0.70167565, + "num_input_tokens_seen": 309347870, + "step": 14344, + "time_per_iteration": 4.1997270584106445 + }, + { + "auxiliary_loss_clip": 0.01067509, + "auxiliary_loss_mlp": 0.01046204, + "balance_loss_clip": 1.0338335, + "balance_loss_mlp": 1.03233421, + "epoch": 0.8624680595220201, + "flos": 30666455971200.0, + "grad_norm": 1.5830249885479915, + "language_loss": 0.81282222, + "learning_rate": 1.9507961158595054e-07, + "loss": 0.83395934, + "num_input_tokens_seen": 309371695, + "step": 14345, + "time_per_iteration": 2.7645456790924072 + }, + { + "auxiliary_loss_clip": 0.01103951, + "auxiliary_loss_mlp": 0.01034874, + "balance_loss_clip": 1.03953946, + "balance_loss_mlp": 1.02208841, + "epoch": 0.8625281827746881, + "flos": 37998588516480.0, + "grad_norm": 1.9141588154194698, + "language_loss": 0.50585526, + "learning_rate": 1.9491187613187355e-07, + "loss": 0.52724349, + "num_input_tokens_seen": 309394645, + "step": 14346, + "time_per_iteration": 2.7219948768615723 + }, + { + "auxiliary_loss_clip": 0.01029718, + "auxiliary_loss_mlp": 0.01032771, + "balance_loss_clip": 1.03116322, + "balance_loss_mlp": 1.01971757, + "epoch": 0.862588306027356, + "flos": 26249730808320.0, + "grad_norm": 1.6259000305173057, + "language_loss": 0.75161147, + "learning_rate": 1.9474420912691913e-07, + "loss": 0.77223635, + "num_input_tokens_seen": 309413170, + "step": 14347, + "time_per_iteration": 2.8139262199401855 + }, + { + "auxiliary_loss_clip": 0.01082643, + "auxiliary_loss_mlp": 0.01030127, + "balance_loss_clip": 1.03561497, + "balance_loss_mlp": 1.01690078, + "epoch": 0.862648429280024, + "flos": 25878809013120.0, + "grad_norm": 2.1840928220647684, + "language_loss": 0.80749428, + "learning_rate": 1.945766105774449e-07, + "loss": 0.82862198, + "num_input_tokens_seen": 309431315, + "step": 14348, + "time_per_iteration": 2.656729221343994 + }, + { + "auxiliary_loss_clip": 0.01091404, + "auxiliary_loss_mlp": 0.01029077, + "balance_loss_clip": 1.03467631, + "balance_loss_mlp": 1.01720428, + "epoch": 0.862708552532692, + "flos": 37816413713280.0, + "grad_norm": 1.8503371551245635, + "language_loss": 0.66269898, + "learning_rate": 1.9440908048980665e-07, + "loss": 0.68390381, + "num_input_tokens_seen": 309453020, + "step": 14349, + "time_per_iteration": 2.799384832382202 + }, + { + "auxiliary_loss_clip": 0.0109691, + "auxiliary_loss_mlp": 0.01036094, + "balance_loss_clip": 1.03515387, + "balance_loss_mlp": 1.02395201, + "epoch": 0.86276867578536, + "flos": 19091800247040.0, + "grad_norm": 2.6246269667941906, + "language_loss": 0.7027539, + "learning_rate": 1.942416188703573e-07, + "loss": 0.7240839, + "num_input_tokens_seen": 309469780, + "step": 14350, + "time_per_iteration": 2.5943920612335205 + }, + { + "auxiliary_loss_clip": 0.0108035, + "auxiliary_loss_mlp": 0.01033768, + "balance_loss_clip": 1.03473318, + "balance_loss_mlp": 1.02111983, + "epoch": 0.862828799038028, + "flos": 22164281804160.0, + "grad_norm": 1.8551444377087964, + "language_loss": 0.76769114, + "learning_rate": 1.9407422572544618e-07, + "loss": 0.78883231, + "num_input_tokens_seen": 309489610, + "step": 14351, + "time_per_iteration": 2.6581666469573975 + }, + { + "auxiliary_loss_clip": 0.01096886, + "auxiliary_loss_mlp": 0.0103006, + "balance_loss_clip": 1.0370357, + "balance_loss_mlp": 1.01837111, + "epoch": 0.8628889222906959, + "flos": 23145576433920.0, + "grad_norm": 3.863289439771493, + "language_loss": 0.85162789, + "learning_rate": 1.9390690106142204e-07, + "loss": 0.87289739, + "num_input_tokens_seen": 309508295, + "step": 14352, + "time_per_iteration": 2.6280806064605713 + }, + { + "auxiliary_loss_clip": 0.01022246, + "auxiliary_loss_mlp": 0.0100272, + "balance_loss_clip": 1.0090481, + "balance_loss_mlp": 1.00167739, + "epoch": 0.8629490455433639, + "flos": 57817762151040.0, + "grad_norm": 0.7895499485816829, + "language_loss": 0.61935335, + "learning_rate": 1.9373964488462913e-07, + "loss": 0.63960302, + "num_input_tokens_seen": 309567960, + "step": 14353, + "time_per_iteration": 3.146935224533081 + }, + { + "auxiliary_loss_clip": 0.01107884, + "auxiliary_loss_mlp": 0.01030033, + "balance_loss_clip": 1.038095, + "balance_loss_mlp": 1.0188508, + "epoch": 0.8630091687960318, + "flos": 15919667383680.0, + "grad_norm": 1.6638505981636493, + "language_loss": 0.81754172, + "learning_rate": 1.9357245720140948e-07, + "loss": 0.83892089, + "num_input_tokens_seen": 309586050, + "step": 14354, + "time_per_iteration": 2.566462993621826 + }, + { + "auxiliary_loss_clip": 0.01086608, + "auxiliary_loss_mlp": 0.01027335, + "balance_loss_clip": 1.03349864, + "balance_loss_mlp": 1.01475871, + "epoch": 0.8630692920486999, + "flos": 17961691570560.0, + "grad_norm": 2.0513019933105827, + "language_loss": 0.85992026, + "learning_rate": 1.934053380181031e-07, + "loss": 0.88105971, + "num_input_tokens_seen": 309602910, + "step": 14355, + "time_per_iteration": 2.5831828117370605 + }, + { + "auxiliary_loss_clip": 0.01069864, + "auxiliary_loss_mlp": 0.01030488, + "balance_loss_clip": 1.03425539, + "balance_loss_mlp": 1.0177269, + "epoch": 0.8631294153013678, + "flos": 22455158140800.0, + "grad_norm": 4.854829851946411, + "language_loss": 0.58569849, + "learning_rate": 1.9323828734104763e-07, + "loss": 0.60670203, + "num_input_tokens_seen": 309621175, + "step": 14356, + "time_per_iteration": 2.65341854095459 + }, + { + "auxiliary_loss_clip": 0.01064009, + "auxiliary_loss_mlp": 0.01035736, + "balance_loss_clip": 1.03384709, + "balance_loss_mlp": 1.02203846, + "epoch": 0.8631895385540358, + "flos": 16837005847680.0, + "grad_norm": 1.8090879268972078, + "language_loss": 0.77420521, + "learning_rate": 1.9307130517657756e-07, + "loss": 0.79520273, + "num_input_tokens_seen": 309639395, + "step": 14357, + "time_per_iteration": 2.710195302963257 + }, + { + "auxiliary_loss_clip": 0.01098594, + "auxiliary_loss_mlp": 0.01033653, + "balance_loss_clip": 1.03671813, + "balance_loss_mlp": 1.02122521, + "epoch": 0.8632496618067037, + "flos": 18697214367360.0, + "grad_norm": 12.099648120671757, + "language_loss": 0.77500695, + "learning_rate": 1.9290439153102468e-07, + "loss": 0.79632944, + "num_input_tokens_seen": 309657265, + "step": 14358, + "time_per_iteration": 2.6657116413116455 + }, + { + "auxiliary_loss_clip": 0.01071096, + "auxiliary_loss_mlp": 0.0103447, + "balance_loss_clip": 1.03174829, + "balance_loss_mlp": 1.02037358, + "epoch": 0.8633097850593717, + "flos": 24279922915200.0, + "grad_norm": 1.4851174588982734, + "language_loss": 0.75020039, + "learning_rate": 1.9273754641071816e-07, + "loss": 0.77125597, + "num_input_tokens_seen": 309678610, + "step": 14359, + "time_per_iteration": 2.6872808933258057 + }, + { + "auxiliary_loss_clip": 0.01045653, + "auxiliary_loss_mlp": 0.01028999, + "balance_loss_clip": 1.03073585, + "balance_loss_mlp": 1.01629102, + "epoch": 0.8633699083120396, + "flos": 21178569801600.0, + "grad_norm": 1.864228118741394, + "language_loss": 0.70209599, + "learning_rate": 1.9257076982198517e-07, + "loss": 0.72284251, + "num_input_tokens_seen": 309697710, + "step": 14360, + "time_per_iteration": 2.8204243183135986 + }, + { + "auxiliary_loss_clip": 0.01079991, + "auxiliary_loss_mlp": 0.01034146, + "balance_loss_clip": 1.03886342, + "balance_loss_mlp": 1.02069938, + "epoch": 0.8634300315647077, + "flos": 19244888012160.0, + "grad_norm": 1.7674774133909552, + "language_loss": 0.7663061, + "learning_rate": 1.9240406177114953e-07, + "loss": 0.78744745, + "num_input_tokens_seen": 309715985, + "step": 14361, + "time_per_iteration": 2.7079758644104004 + }, + { + "auxiliary_loss_clip": 0.01028441, + "auxiliary_loss_mlp": 0.01002241, + "balance_loss_clip": 1.00602102, + "balance_loss_mlp": 1.00118601, + "epoch": 0.8634901548173756, + "flos": 66195648282240.0, + "grad_norm": 0.9560869661193441, + "language_loss": 0.58801341, + "learning_rate": 1.922374222645329e-07, + "loss": 0.60832024, + "num_input_tokens_seen": 309779930, + "step": 14362, + "time_per_iteration": 3.145829677581787 + }, + { + "auxiliary_loss_clip": 0.01042985, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.03692436, + "balance_loss_mlp": 1.01842105, + "epoch": 0.8635502780700436, + "flos": 24789531121920.0, + "grad_norm": 1.852310617760456, + "language_loss": 0.80515075, + "learning_rate": 1.9207085130845524e-07, + "loss": 0.82589483, + "num_input_tokens_seen": 309800580, + "step": 14363, + "time_per_iteration": 2.862398147583008 + }, + { + "auxiliary_loss_clip": 0.01082251, + "auxiliary_loss_mlp": 0.0104491, + "balance_loss_clip": 1.03282666, + "balance_loss_mlp": 1.02994919, + "epoch": 0.8636104013227116, + "flos": 25189970918400.0, + "grad_norm": 2.305599711448788, + "language_loss": 0.72819698, + "learning_rate": 1.9190434890923112e-07, + "loss": 0.74946856, + "num_input_tokens_seen": 309821725, + "step": 14364, + "time_per_iteration": 2.7694895267486572 + }, + { + "auxiliary_loss_clip": 0.01084893, + "auxiliary_loss_mlp": 0.01037595, + "balance_loss_clip": 1.033113, + "balance_loss_mlp": 1.02479792, + "epoch": 0.8636705245753795, + "flos": 23878441624320.0, + "grad_norm": 3.709270849116724, + "language_loss": 0.71231377, + "learning_rate": 1.917379150731755e-07, + "loss": 0.73353863, + "num_input_tokens_seen": 309841565, + "step": 14365, + "time_per_iteration": 2.6591691970825195 + }, + { + "auxiliary_loss_clip": 0.01084976, + "auxiliary_loss_mlp": 0.01048634, + "balance_loss_clip": 1.03588641, + "balance_loss_mlp": 1.03338158, + "epoch": 0.8637306478280475, + "flos": 23110455911040.0, + "grad_norm": 2.5553133795092853, + "language_loss": 0.7095083, + "learning_rate": 1.915715498065993e-07, + "loss": 0.73084438, + "num_input_tokens_seen": 309858635, + "step": 14366, + "time_per_iteration": 2.654860019683838 + }, + { + "auxiliary_loss_clip": 0.01080294, + "auxiliary_loss_mlp": 0.01025551, + "balance_loss_clip": 1.03619814, + "balance_loss_mlp": 1.01414287, + "epoch": 0.8637907710807154, + "flos": 21906802137600.0, + "grad_norm": 1.7096438755629864, + "language_loss": 0.81546772, + "learning_rate": 1.9140525311581146e-07, + "loss": 0.83652616, + "num_input_tokens_seen": 309877885, + "step": 14367, + "time_per_iteration": 2.658378839492798 + }, + { + "auxiliary_loss_clip": 0.01084703, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.03672993, + "balance_loss_mlp": 1.01633847, + "epoch": 0.8638508943333835, + "flos": 23580526222080.0, + "grad_norm": 1.893928917899102, + "language_loss": 0.61735493, + "learning_rate": 1.9123902500711743e-07, + "loss": 0.63849741, + "num_input_tokens_seen": 309893140, + "step": 14368, + "time_per_iteration": 2.7563858032226562 + }, + { + "auxiliary_loss_clip": 0.01100198, + "auxiliary_loss_mlp": 0.01032496, + "balance_loss_clip": 1.03874695, + "balance_loss_mlp": 1.02039003, + "epoch": 0.8639110175860514, + "flos": 25775853655680.0, + "grad_norm": 1.8793002534030256, + "language_loss": 0.76034266, + "learning_rate": 1.91072865486821e-07, + "loss": 0.78166956, + "num_input_tokens_seen": 309914175, + "step": 14369, + "time_per_iteration": 2.720898389816284 + }, + { + "auxiliary_loss_clip": 0.01084672, + "auxiliary_loss_mlp": 0.01036631, + "balance_loss_clip": 1.03559625, + "balance_loss_mlp": 1.02341676, + "epoch": 0.8639711408387194, + "flos": 23369443948800.0, + "grad_norm": 1.7853455922645574, + "language_loss": 0.64685416, + "learning_rate": 1.9090677456122294e-07, + "loss": 0.66806722, + "num_input_tokens_seen": 309932395, + "step": 14370, + "time_per_iteration": 2.7746939659118652 + }, + { + "auxiliary_loss_clip": 0.01051431, + "auxiliary_loss_mlp": 0.01034289, + "balance_loss_clip": 1.03813696, + "balance_loss_mlp": 1.02186131, + "epoch": 0.8640312640913873, + "flos": 22127221946880.0, + "grad_norm": 1.6691251892121577, + "language_loss": 0.66381669, + "learning_rate": 1.907407522366209e-07, + "loss": 0.68467391, + "num_input_tokens_seen": 309951720, + "step": 14371, + "time_per_iteration": 2.7832515239715576 + }, + { + "auxiliary_loss_clip": 0.01010679, + "auxiliary_loss_mlp": 0.0100181, + "balance_loss_clip": 1.00754333, + "balance_loss_mlp": 1.00070095, + "epoch": 0.8640913873440553, + "flos": 57571735944960.0, + "grad_norm": 0.8715418299752374, + "language_loss": 0.56873655, + "learning_rate": 1.905747985193107e-07, + "loss": 0.58886147, + "num_input_tokens_seen": 310006120, + "step": 14372, + "time_per_iteration": 3.080965042114258 + }, + { + "auxiliary_loss_clip": 0.01107085, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.03817725, + "balance_loss_mlp": 1.01909208, + "epoch": 0.8641515105967232, + "flos": 23987430466560.0, + "grad_norm": 1.722968636798083, + "language_loss": 0.79519123, + "learning_rate": 1.9040891341558597e-07, + "loss": 0.81657857, + "num_input_tokens_seen": 310026740, + "step": 14373, + "time_per_iteration": 2.635335683822632 + }, + { + "auxiliary_loss_clip": 0.01110837, + "auxiliary_loss_mlp": 0.01028883, + "balance_loss_clip": 1.03787744, + "balance_loss_mlp": 1.01607943, + "epoch": 0.8642116338493913, + "flos": 19062749122560.0, + "grad_norm": 1.6653536401221238, + "language_loss": 0.63377726, + "learning_rate": 1.9024309693173656e-07, + "loss": 0.65517449, + "num_input_tokens_seen": 310044135, + "step": 14374, + "time_per_iteration": 2.5494918823242188 + }, + { + "auxiliary_loss_clip": 0.01077851, + "auxiliary_loss_mlp": 0.01034525, + "balance_loss_clip": 1.03636634, + "balance_loss_mlp": 1.02211523, + "epoch": 0.8642717571020592, + "flos": 18254148105600.0, + "grad_norm": 1.7168843124571862, + "language_loss": 0.77189004, + "learning_rate": 1.9007734907404993e-07, + "loss": 0.79301381, + "num_input_tokens_seen": 310061560, + "step": 14375, + "time_per_iteration": 2.677976131439209 + }, + { + "auxiliary_loss_clip": 0.01064524, + "auxiliary_loss_mlp": 0.00770411, + "balance_loss_clip": 1.03405952, + "balance_loss_mlp": 1.00014496, + "epoch": 0.8643318803547272, + "flos": 57663270777600.0, + "grad_norm": 1.8714174217127035, + "language_loss": 0.60663325, + "learning_rate": 1.899116698488117e-07, + "loss": 0.6249826, + "num_input_tokens_seen": 310087310, + "step": 14376, + "time_per_iteration": 3.0315792560577393 + }, + { + "auxiliary_loss_clip": 0.01065318, + "auxiliary_loss_mlp": 0.01037856, + "balance_loss_clip": 1.0328449, + "balance_loss_mlp": 1.02571476, + "epoch": 0.8643920036073952, + "flos": 19609524927360.0, + "grad_norm": 1.4665083491596096, + "language_loss": 0.66321123, + "learning_rate": 1.8974605926230457e-07, + "loss": 0.68424296, + "num_input_tokens_seen": 310106260, + "step": 14377, + "time_per_iteration": 2.661478042602539 + }, + { + "auxiliary_loss_clip": 0.01082246, + "auxiliary_loss_mlp": 0.0104227, + "balance_loss_clip": 1.03249764, + "balance_loss_mlp": 1.02842414, + "epoch": 0.8644521268600631, + "flos": 20850346298880.0, + "grad_norm": 1.6699792562126987, + "language_loss": 0.70700777, + "learning_rate": 1.8958051732080804e-07, + "loss": 0.72825295, + "num_input_tokens_seen": 310125305, + "step": 14378, + "time_per_iteration": 2.6440517902374268 + }, + { + "auxiliary_loss_clip": 0.0101912, + "auxiliary_loss_mlp": 0.01001905, + "balance_loss_clip": 1.0065546, + "balance_loss_mlp": 1.00082636, + "epoch": 0.8645122501127311, + "flos": 66719550101760.0, + "grad_norm": 0.8082600022248976, + "language_loss": 0.60236883, + "learning_rate": 1.894150440305995e-07, + "loss": 0.6225791, + "num_input_tokens_seen": 310189270, + "step": 14379, + "time_per_iteration": 3.1792728900909424 + }, + { + "auxiliary_loss_clip": 0.01077548, + "auxiliary_loss_mlp": 0.01032454, + "balance_loss_clip": 1.03373933, + "balance_loss_mlp": 1.02031279, + "epoch": 0.864572373365399, + "flos": 21690009601920.0, + "grad_norm": 1.8837339678348841, + "language_loss": 0.74800771, + "learning_rate": 1.8924963939795478e-07, + "loss": 0.76910776, + "num_input_tokens_seen": 310208395, + "step": 14380, + "time_per_iteration": 4.324819803237915 + }, + { + "auxiliary_loss_clip": 0.01080903, + "auxiliary_loss_mlp": 0.01036416, + "balance_loss_clip": 1.03346038, + "balance_loss_mlp": 1.02307606, + "epoch": 0.8646324966180671, + "flos": 20266402896000.0, + "grad_norm": 1.9558839364360057, + "language_loss": 0.75436544, + "learning_rate": 1.8908430342914473e-07, + "loss": 0.77553868, + "num_input_tokens_seen": 310227415, + "step": 14381, + "time_per_iteration": 4.3003315925598145 + }, + { + "auxiliary_loss_clip": 0.01085169, + "auxiliary_loss_mlp": 0.01035093, + "balance_loss_clip": 1.03721309, + "balance_loss_mlp": 1.02337468, + "epoch": 0.864692619870735, + "flos": 11946188050560.0, + "grad_norm": 2.531870478420468, + "language_loss": 0.84684384, + "learning_rate": 1.8891903613043892e-07, + "loss": 0.86804652, + "num_input_tokens_seen": 310242625, + "step": 14382, + "time_per_iteration": 4.235616683959961 + }, + { + "auxiliary_loss_clip": 0.01101073, + "auxiliary_loss_mlp": 0.01035119, + "balance_loss_clip": 1.03812909, + "balance_loss_mlp": 1.02230954, + "epoch": 0.864752743123403, + "flos": 21470703114240.0, + "grad_norm": 2.020788387095791, + "language_loss": 0.75921559, + "learning_rate": 1.8875383750810504e-07, + "loss": 0.78057754, + "num_input_tokens_seen": 310260585, + "step": 14383, + "time_per_iteration": 2.743340015411377 + }, + { + "auxiliary_loss_clip": 0.01089565, + "auxiliary_loss_mlp": 0.0103368, + "balance_loss_clip": 1.03891516, + "balance_loss_mlp": 1.02116311, + "epoch": 0.8648128663760709, + "flos": 19530018172800.0, + "grad_norm": 1.8560596447894047, + "language_loss": 0.85428023, + "learning_rate": 1.8858870756840738e-07, + "loss": 0.87551272, + "num_input_tokens_seen": 310277210, + "step": 14384, + "time_per_iteration": 4.140477418899536 + }, + { + "auxiliary_loss_clip": 0.01093344, + "auxiliary_loss_mlp": 0.01030627, + "balance_loss_clip": 1.0340718, + "balance_loss_mlp": 1.01875997, + "epoch": 0.8648729896287389, + "flos": 21287953693440.0, + "grad_norm": 1.6613358165771401, + "language_loss": 0.8117463, + "learning_rate": 1.884236463176072e-07, + "loss": 0.832986, + "num_input_tokens_seen": 310296610, + "step": 14385, + "time_per_iteration": 2.563424825668335 + }, + { + "auxiliary_loss_clip": 0.01094427, + "auxiliary_loss_mlp": 0.01035484, + "balance_loss_clip": 1.040411, + "balance_loss_mlp": 1.02252555, + "epoch": 0.8649331128814068, + "flos": 24604483230720.0, + "grad_norm": 2.3388483586087303, + "language_loss": 0.72581172, + "learning_rate": 1.8825865376196437e-07, + "loss": 0.74711078, + "num_input_tokens_seen": 310316830, + "step": 14386, + "time_per_iteration": 2.667926549911499 + }, + { + "auxiliary_loss_clip": 0.01093992, + "auxiliary_loss_mlp": 0.01041396, + "balance_loss_clip": 1.03530121, + "balance_loss_mlp": 1.02797318, + "epoch": 0.8649932361340749, + "flos": 15377811742080.0, + "grad_norm": 4.510791763694996, + "language_loss": 0.81868196, + "learning_rate": 1.8809372990773476e-07, + "loss": 0.8400358, + "num_input_tokens_seen": 310334355, + "step": 14387, + "time_per_iteration": 2.660701036453247 + }, + { + "auxiliary_loss_clip": 0.01106932, + "auxiliary_loss_mlp": 0.01028302, + "balance_loss_clip": 1.0378803, + "balance_loss_mlp": 1.01641643, + "epoch": 0.8650533593867428, + "flos": 19901227276800.0, + "grad_norm": 2.104447554520212, + "language_loss": 0.68797326, + "learning_rate": 1.8792887476117224e-07, + "loss": 0.70932555, + "num_input_tokens_seen": 310352900, + "step": 14388, + "time_per_iteration": 2.5773561000823975 + }, + { + "auxiliary_loss_clip": 0.01073211, + "auxiliary_loss_mlp": 0.01036245, + "balance_loss_clip": 1.03666544, + "balance_loss_mlp": 1.02510452, + "epoch": 0.8651134826394108, + "flos": 25626931868160.0, + "grad_norm": 2.8952711176553043, + "language_loss": 0.90358889, + "learning_rate": 1.877640883285283e-07, + "loss": 0.92468345, + "num_input_tokens_seen": 310372855, + "step": 14389, + "time_per_iteration": 2.712479591369629 + }, + { + "auxiliary_loss_clip": 0.01065736, + "auxiliary_loss_mlp": 0.00769129, + "balance_loss_clip": 1.03819394, + "balance_loss_mlp": 1.0002389, + "epoch": 0.8651736058920788, + "flos": 18734525619840.0, + "grad_norm": 1.556328693404614, + "language_loss": 0.70784509, + "learning_rate": 1.8759937061605212e-07, + "loss": 0.72619373, + "num_input_tokens_seen": 310391595, + "step": 14390, + "time_per_iteration": 2.7250664234161377 + }, + { + "auxiliary_loss_clip": 0.0110984, + "auxiliary_loss_mlp": 0.01034761, + "balance_loss_clip": 1.03667974, + "balance_loss_mlp": 1.02206492, + "epoch": 0.8652337291447467, + "flos": 20776765288320.0, + "grad_norm": 3.0242900770440158, + "language_loss": 0.82031155, + "learning_rate": 1.8743472162998941e-07, + "loss": 0.84175754, + "num_input_tokens_seen": 310410090, + "step": 14391, + "time_per_iteration": 2.5874016284942627 + }, + { + "auxiliary_loss_clip": 0.00999016, + "auxiliary_loss_mlp": 0.00998272, + "balance_loss_clip": 1.00931406, + "balance_loss_mlp": 0.99692518, + "epoch": 0.8652938523974147, + "flos": 64227887464320.0, + "grad_norm": 0.800129032908664, + "language_loss": 0.67961007, + "learning_rate": 1.8727014137658337e-07, + "loss": 0.69958293, + "num_input_tokens_seen": 310470055, + "step": 14392, + "time_per_iteration": 3.141786813735962 + }, + { + "auxiliary_loss_clip": 0.01102797, + "auxiliary_loss_mlp": 0.01032477, + "balance_loss_clip": 1.03694808, + "balance_loss_mlp": 1.01924479, + "epoch": 0.8653539756500827, + "flos": 18040587793920.0, + "grad_norm": 1.8856474230308053, + "language_loss": 0.75999135, + "learning_rate": 1.8710562986207523e-07, + "loss": 0.78134412, + "num_input_tokens_seen": 310487665, + "step": 14393, + "time_per_iteration": 2.6403071880340576 + }, + { + "auxiliary_loss_clip": 0.01085265, + "auxiliary_loss_mlp": 0.01035656, + "balance_loss_clip": 1.03292179, + "balance_loss_mlp": 1.02319884, + "epoch": 0.8654140989027507, + "flos": 17382416935680.0, + "grad_norm": 1.8766276101061499, + "language_loss": 0.73443645, + "learning_rate": 1.8694118709270357e-07, + "loss": 0.75564563, + "num_input_tokens_seen": 310506130, + "step": 14394, + "time_per_iteration": 2.589737892150879 + }, + { + "auxiliary_loss_clip": 0.01098893, + "auxiliary_loss_mlp": 0.01029702, + "balance_loss_clip": 1.03559685, + "balance_loss_mlp": 1.01642823, + "epoch": 0.8654742221554186, + "flos": 53284862448000.0, + "grad_norm": 25.445187757651638, + "language_loss": 0.65340948, + "learning_rate": 1.867768130747036e-07, + "loss": 0.67469549, + "num_input_tokens_seen": 310532445, + "step": 14395, + "time_per_iteration": 2.8686017990112305 + }, + { + "auxiliary_loss_clip": 0.01091975, + "auxiliary_loss_mlp": 0.0103619, + "balance_loss_clip": 1.03594851, + "balance_loss_mlp": 1.02362514, + "epoch": 0.8655343454080866, + "flos": 23914711382400.0, + "grad_norm": 3.648513206821013, + "language_loss": 0.68270028, + "learning_rate": 1.8661250781430838e-07, + "loss": 0.70398188, + "num_input_tokens_seen": 310552300, + "step": 14396, + "time_per_iteration": 2.691372871398926 + }, + { + "auxiliary_loss_clip": 0.01102693, + "auxiliary_loss_mlp": 0.01036564, + "balance_loss_clip": 1.03977966, + "balance_loss_mlp": 1.02393413, + "epoch": 0.8655944686607545, + "flos": 24097209408000.0, + "grad_norm": 2.1296548078090067, + "language_loss": 0.6985743, + "learning_rate": 1.8644827131774954e-07, + "loss": 0.71996689, + "num_input_tokens_seen": 310572710, + "step": 14397, + "time_per_iteration": 2.6537063121795654 + }, + { + "auxiliary_loss_clip": 0.01092627, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.03830481, + "balance_loss_mlp": 1.01773953, + "epoch": 0.8656545919134225, + "flos": 23112718467840.0, + "grad_norm": 1.7708020135557936, + "language_loss": 0.63645488, + "learning_rate": 1.86284103591253e-07, + "loss": 0.65767658, + "num_input_tokens_seen": 310592460, + "step": 14398, + "time_per_iteration": 2.721609592437744 + }, + { + "auxiliary_loss_clip": 0.01072146, + "auxiliary_loss_mlp": 0.01040273, + "balance_loss_clip": 1.03550839, + "balance_loss_mlp": 1.02659369, + "epoch": 0.8657147151660904, + "flos": 21141761339520.0, + "grad_norm": 2.410679040433659, + "language_loss": 0.76115006, + "learning_rate": 1.8612000464104517e-07, + "loss": 0.78227425, + "num_input_tokens_seen": 310609375, + "step": 14399, + "time_per_iteration": 2.6792304515838623 + }, + { + "auxiliary_loss_clip": 0.01091264, + "auxiliary_loss_mlp": 0.0102886, + "balance_loss_clip": 1.03629327, + "balance_loss_mlp": 1.0173502, + "epoch": 0.8657748384187585, + "flos": 16289439943680.0, + "grad_norm": 2.1842250603302906, + "language_loss": 0.93539166, + "learning_rate": 1.8595597447334855e-07, + "loss": 0.95659292, + "num_input_tokens_seen": 310627405, + "step": 14400, + "time_per_iteration": 2.557438850402832 + }, + { + "auxiliary_loss_clip": 0.01044413, + "auxiliary_loss_mlp": 0.01038088, + "balance_loss_clip": 1.0341754, + "balance_loss_mlp": 1.02537465, + "epoch": 0.8658349616714264, + "flos": 30843890179200.0, + "grad_norm": 1.8571085521140969, + "language_loss": 0.67723525, + "learning_rate": 1.8579201309438353e-07, + "loss": 0.69806027, + "num_input_tokens_seen": 310649945, + "step": 14401, + "time_per_iteration": 2.8091368675231934 + }, + { + "auxiliary_loss_clip": 0.01099417, + "auxiliary_loss_mlp": 0.01031502, + "balance_loss_clip": 1.03662825, + "balance_loss_mlp": 1.01880038, + "epoch": 0.8658950849240944, + "flos": 18952862440320.0, + "grad_norm": 2.157466322300169, + "language_loss": 0.73613071, + "learning_rate": 1.8562812051036714e-07, + "loss": 0.75743997, + "num_input_tokens_seen": 310668285, + "step": 14402, + "time_per_iteration": 2.570737838745117 + }, + { + "auxiliary_loss_clip": 0.01036456, + "auxiliary_loss_mlp": 0.01033626, + "balance_loss_clip": 1.03347492, + "balance_loss_mlp": 1.02177048, + "epoch": 0.8659552081767624, + "flos": 23364344217600.0, + "grad_norm": 1.7804756809265996, + "language_loss": 0.74911118, + "learning_rate": 1.8546429672751397e-07, + "loss": 0.76981199, + "num_input_tokens_seen": 310687015, + "step": 14403, + "time_per_iteration": 2.8824269771575928 + }, + { + "auxiliary_loss_clip": 0.0108389, + "auxiliary_loss_mlp": 0.01034677, + "balance_loss_clip": 1.03559339, + "balance_loss_mlp": 1.02145052, + "epoch": 0.8660153314294303, + "flos": 23841992298240.0, + "grad_norm": 1.9785439757020915, + "language_loss": 0.73294771, + "learning_rate": 1.853005417520368e-07, + "loss": 0.75413334, + "num_input_tokens_seen": 310707580, + "step": 14404, + "time_per_iteration": 2.691854238510132 + }, + { + "auxiliary_loss_clip": 0.01070251, + "auxiliary_loss_mlp": 0.01036432, + "balance_loss_clip": 1.03529263, + "balance_loss_mlp": 1.02364087, + "epoch": 0.8660754546820983, + "flos": 23112467072640.0, + "grad_norm": 1.6230968808599193, + "language_loss": 0.70621324, + "learning_rate": 1.851368555901447e-07, + "loss": 0.72728002, + "num_input_tokens_seen": 310727300, + "step": 14405, + "time_per_iteration": 2.6545495986938477 + }, + { + "auxiliary_loss_clip": 0.01099979, + "auxiliary_loss_mlp": 0.0077033, + "balance_loss_clip": 1.03619599, + "balance_loss_mlp": 1.00023413, + "epoch": 0.8661355779347663, + "flos": 14391991998720.0, + "grad_norm": 1.8683678221955426, + "language_loss": 0.66598046, + "learning_rate": 1.8497323824804467e-07, + "loss": 0.68468356, + "num_input_tokens_seen": 310744935, + "step": 14406, + "time_per_iteration": 2.6244313716888428 + }, + { + "auxiliary_loss_clip": 0.01087721, + "auxiliary_loss_mlp": 0.01027006, + "balance_loss_clip": 1.0369488, + "balance_loss_mlp": 1.01565766, + "epoch": 0.8661957011874343, + "flos": 21870137329920.0, + "grad_norm": 1.713289909017667, + "language_loss": 0.82678503, + "learning_rate": 1.8480968973194177e-07, + "loss": 0.84793234, + "num_input_tokens_seen": 310765085, + "step": 14407, + "time_per_iteration": 2.7246527671813965 + }, + { + "auxiliary_loss_clip": 0.01097432, + "auxiliary_loss_mlp": 0.01038578, + "balance_loss_clip": 1.03706372, + "balance_loss_mlp": 1.02623403, + "epoch": 0.8662558244401022, + "flos": 21835160461440.0, + "grad_norm": 1.640288408492858, + "language_loss": 0.70144266, + "learning_rate": 1.8464621004803748e-07, + "loss": 0.72280276, + "num_input_tokens_seen": 310783260, + "step": 14408, + "time_per_iteration": 2.688714027404785 + }, + { + "auxiliary_loss_clip": 0.01088368, + "auxiliary_loss_mlp": 0.01034051, + "balance_loss_clip": 1.036026, + "balance_loss_mlp": 1.02254152, + "epoch": 0.8663159476927702, + "flos": 17384104874880.0, + "grad_norm": 1.9035272419543303, + "language_loss": 0.7693873, + "learning_rate": 1.844827992025304e-07, + "loss": 0.79061151, + "num_input_tokens_seen": 310801970, + "step": 14409, + "time_per_iteration": 2.668154239654541 + }, + { + "auxiliary_loss_clip": 0.01101925, + "auxiliary_loss_mlp": 0.01034118, + "balance_loss_clip": 1.03869689, + "balance_loss_mlp": 1.02009869, + "epoch": 0.8663760709454381, + "flos": 22747722416640.0, + "grad_norm": 1.696612134520476, + "language_loss": 0.77045894, + "learning_rate": 1.8431945720161757e-07, + "loss": 0.79181939, + "num_input_tokens_seen": 310822070, + "step": 14410, + "time_per_iteration": 2.6069350242614746 + }, + { + "auxiliary_loss_clip": 0.0106574, + "auxiliary_loss_mlp": 0.0103477, + "balance_loss_clip": 1.03402448, + "balance_loss_mlp": 1.02225292, + "epoch": 0.8664361941981061, + "flos": 17376850327680.0, + "grad_norm": 1.9481665792177514, + "language_loss": 0.77590597, + "learning_rate": 1.8415618405149315e-07, + "loss": 0.79691112, + "num_input_tokens_seen": 310838355, + "step": 14411, + "time_per_iteration": 2.6132922172546387 + }, + { + "auxiliary_loss_clip": 0.01078109, + "auxiliary_loss_mlp": 0.01035885, + "balance_loss_clip": 1.03366232, + "balance_loss_mlp": 1.02461982, + "epoch": 0.866496317450774, + "flos": 16034438315520.0, + "grad_norm": 1.750688188601547, + "language_loss": 0.74020624, + "learning_rate": 1.8399297975834794e-07, + "loss": 0.76134622, + "num_input_tokens_seen": 310856055, + "step": 14412, + "time_per_iteration": 2.6058592796325684 + }, + { + "auxiliary_loss_clip": 0.01090356, + "auxiliary_loss_mlp": 0.00771287, + "balance_loss_clip": 1.03415728, + "balance_loss_mlp": 1.0002377, + "epoch": 0.8665564407034421, + "flos": 20814830726400.0, + "grad_norm": 1.7730290452974458, + "language_loss": 0.6952216, + "learning_rate": 1.83829844328371e-07, + "loss": 0.71383798, + "num_input_tokens_seen": 310876695, + "step": 14413, + "time_per_iteration": 2.614438056945801 + }, + { + "auxiliary_loss_clip": 0.01098326, + "auxiliary_loss_mlp": 0.01035601, + "balance_loss_clip": 1.03807211, + "balance_loss_mlp": 1.02280378, + "epoch": 0.86661656395611, + "flos": 15815167741440.0, + "grad_norm": 2.2624919572268603, + "language_loss": 0.62299776, + "learning_rate": 1.8366677776774874e-07, + "loss": 0.64433706, + "num_input_tokens_seen": 310893880, + "step": 14414, + "time_per_iteration": 2.5781359672546387 + }, + { + "auxiliary_loss_clip": 0.01078873, + "auxiliary_loss_mlp": 0.00769848, + "balance_loss_clip": 1.03693521, + "balance_loss_mlp": 1.00018334, + "epoch": 0.866676687208778, + "flos": 23036910814080.0, + "grad_norm": 1.633402194861805, + "language_loss": 0.6382761, + "learning_rate": 1.8350378008266377e-07, + "loss": 0.65676332, + "num_input_tokens_seen": 310914145, + "step": 14415, + "time_per_iteration": 2.718871831893921 + }, + { + "auxiliary_loss_clip": 0.01001561, + "auxiliary_loss_mlp": 0.01003608, + "balance_loss_clip": 1.00817573, + "balance_loss_mlp": 1.00249326, + "epoch": 0.866736810461446, + "flos": 63802275212160.0, + "grad_norm": 0.7984060732990605, + "language_loss": 0.60386515, + "learning_rate": 1.8334085127929754e-07, + "loss": 0.62391675, + "num_input_tokens_seen": 310972825, + "step": 14416, + "time_per_iteration": 3.32995343208313 + }, + { + "auxiliary_loss_clip": 0.01101132, + "auxiliary_loss_mlp": 0.00771613, + "balance_loss_clip": 1.03657961, + "balance_loss_mlp": 1.00021935, + "epoch": 0.8667969337141139, + "flos": 20449367798400.0, + "grad_norm": 1.8418559136989974, + "language_loss": 0.74591923, + "learning_rate": 1.831779913638285e-07, + "loss": 0.76464671, + "num_input_tokens_seen": 310992050, + "step": 14417, + "time_per_iteration": 2.6240720748901367 + }, + { + "auxiliary_loss_clip": 0.0108446, + "auxiliary_loss_mlp": 0.01035619, + "balance_loss_clip": 1.03623867, + "balance_loss_mlp": 1.02401364, + "epoch": 0.866857056966782, + "flos": 21653703930240.0, + "grad_norm": 1.6010496631035476, + "language_loss": 0.75304806, + "learning_rate": 1.830152003424319e-07, + "loss": 0.77424884, + "num_input_tokens_seen": 311011105, + "step": 14418, + "time_per_iteration": 2.6442039012908936 + }, + { + "auxiliary_loss_clip": 0.01096633, + "auxiliary_loss_mlp": 0.01034851, + "balance_loss_clip": 1.0357796, + "balance_loss_mlp": 1.02292967, + "epoch": 0.8669171802194499, + "flos": 22852832590080.0, + "grad_norm": 1.669621966476557, + "language_loss": 0.68341649, + "learning_rate": 1.8285247822128126e-07, + "loss": 0.70473135, + "num_input_tokens_seen": 311032080, + "step": 14419, + "time_per_iteration": 2.623978853225708 + }, + { + "auxiliary_loss_clip": 0.01099318, + "auxiliary_loss_mlp": 0.01031681, + "balance_loss_clip": 1.03616405, + "balance_loss_mlp": 1.02020669, + "epoch": 0.8669773034721179, + "flos": 18734166483840.0, + "grad_norm": 1.6685720418473156, + "language_loss": 0.78522211, + "learning_rate": 1.826898250065465e-07, + "loss": 0.80653214, + "num_input_tokens_seen": 311049735, + "step": 14420, + "time_per_iteration": 4.198700189590454 + }, + { + "auxiliary_loss_clip": 0.01093862, + "auxiliary_loss_mlp": 0.01032049, + "balance_loss_clip": 1.03496552, + "balance_loss_mlp": 1.01974106, + "epoch": 0.8670374267247858, + "flos": 18916018064640.0, + "grad_norm": 1.5087342244931736, + "language_loss": 0.83599997, + "learning_rate": 1.8252724070439586e-07, + "loss": 0.85725909, + "num_input_tokens_seen": 311067675, + "step": 14421, + "time_per_iteration": 4.208746910095215 + }, + { + "auxiliary_loss_clip": 0.01006687, + "auxiliary_loss_mlp": 0.00999775, + "balance_loss_clip": 1.00802314, + "balance_loss_mlp": 0.99845761, + "epoch": 0.8670975499774538, + "flos": 48814527214080.0, + "grad_norm": 0.7509779369384021, + "language_loss": 0.49057785, + "learning_rate": 1.823647253209941e-07, + "loss": 0.51064241, + "num_input_tokens_seen": 311126605, + "step": 14422, + "time_per_iteration": 4.777186870574951 + }, + { + "auxiliary_loss_clip": 0.01087105, + "auxiliary_loss_mlp": 0.00769697, + "balance_loss_clip": 1.03720963, + "balance_loss_mlp": 1.00028849, + "epoch": 0.8671576732301217, + "flos": 26136145025280.0, + "grad_norm": 1.670233296430545, + "language_loss": 0.73442525, + "learning_rate": 1.8220227886250417e-07, + "loss": 0.75299329, + "num_input_tokens_seen": 311147325, + "step": 14423, + "time_per_iteration": 4.283585786819458 + }, + { + "auxiliary_loss_clip": 0.01061427, + "auxiliary_loss_mlp": 0.0103548, + "balance_loss_clip": 1.03110516, + "balance_loss_mlp": 1.02256989, + "epoch": 0.8672177964827897, + "flos": 18367446579840.0, + "grad_norm": 1.5662705117653968, + "language_loss": 0.76781297, + "learning_rate": 1.8203990133508684e-07, + "loss": 0.78878212, + "num_input_tokens_seen": 311165385, + "step": 14424, + "time_per_iteration": 2.645517110824585 + }, + { + "auxiliary_loss_clip": 0.01066724, + "auxiliary_loss_mlp": 0.01040643, + "balance_loss_clip": 1.03161621, + "balance_loss_mlp": 1.02800703, + "epoch": 0.8672779197354576, + "flos": 28545355992960.0, + "grad_norm": 1.9458194171790135, + "language_loss": 0.71327066, + "learning_rate": 1.8187759274489767e-07, + "loss": 0.73434436, + "num_input_tokens_seen": 311185860, + "step": 14425, + "time_per_iteration": 2.7444801330566406 + }, + { + "auxiliary_loss_clip": 0.01100034, + "auxiliary_loss_mlp": 0.01033916, + "balance_loss_clip": 1.03743434, + "balance_loss_mlp": 1.02065444, + "epoch": 0.8673380429881257, + "flos": 22382474970240.0, + "grad_norm": 1.755018176625315, + "language_loss": 0.6806134, + "learning_rate": 1.817153530980926e-07, + "loss": 0.70195293, + "num_input_tokens_seen": 311205810, + "step": 14426, + "time_per_iteration": 2.5805845260620117 + }, + { + "auxiliary_loss_clip": 0.01065339, + "auxiliary_loss_mlp": 0.01027987, + "balance_loss_clip": 1.03625464, + "balance_loss_mlp": 1.01546359, + "epoch": 0.8673981662407936, + "flos": 20996430912000.0, + "grad_norm": 1.8587393637126561, + "language_loss": 0.70647991, + "learning_rate": 1.815531824008234e-07, + "loss": 0.72741318, + "num_input_tokens_seen": 311226080, + "step": 14427, + "time_per_iteration": 2.685107469558716 + }, + { + "auxiliary_loss_clip": 0.01080277, + "auxiliary_loss_mlp": 0.0103208, + "balance_loss_clip": 1.03615725, + "balance_loss_mlp": 1.02000976, + "epoch": 0.8674582894934616, + "flos": 24426797627520.0, + "grad_norm": 1.894860167096284, + "language_loss": 0.68146193, + "learning_rate": 1.8139108065924004e-07, + "loss": 0.70258546, + "num_input_tokens_seen": 311246380, + "step": 14428, + "time_per_iteration": 2.7677488327026367 + }, + { + "auxiliary_loss_clip": 0.01080543, + "auxiliary_loss_mlp": 0.01029001, + "balance_loss_clip": 1.0359962, + "balance_loss_mlp": 1.01683569, + "epoch": 0.8675184127461296, + "flos": 20737514701440.0, + "grad_norm": 2.892495609398215, + "language_loss": 0.70616251, + "learning_rate": 1.812290478794889e-07, + "loss": 0.72725797, + "num_input_tokens_seen": 311266465, + "step": 14429, + "time_per_iteration": 2.624802827835083 + }, + { + "auxiliary_loss_clip": 0.010878, + "auxiliary_loss_mlp": 0.01030213, + "balance_loss_clip": 1.03670454, + "balance_loss_mlp": 1.01785088, + "epoch": 0.8675785359987975, + "flos": 19135647774720.0, + "grad_norm": 1.8760175406026705, + "language_loss": 0.66803014, + "learning_rate": 1.810670840677151e-07, + "loss": 0.6892103, + "num_input_tokens_seen": 311285075, + "step": 14430, + "time_per_iteration": 2.6141793727874756 + }, + { + "auxiliary_loss_clip": 0.01064719, + "auxiliary_loss_mlp": 0.01037459, + "balance_loss_clip": 1.03474712, + "balance_loss_mlp": 1.02360034, + "epoch": 0.8676386592514655, + "flos": 22710662559360.0, + "grad_norm": 1.8772851475850807, + "language_loss": 0.69439894, + "learning_rate": 1.8090518923005948e-07, + "loss": 0.71542072, + "num_input_tokens_seen": 311303230, + "step": 14431, + "time_per_iteration": 2.760996103286743 + }, + { + "auxiliary_loss_clip": 0.01097351, + "auxiliary_loss_mlp": 0.01040167, + "balance_loss_clip": 1.03582358, + "balance_loss_mlp": 1.02768576, + "epoch": 0.8676987825041335, + "flos": 14209853109120.0, + "grad_norm": 2.630424540057507, + "language_loss": 0.63210046, + "learning_rate": 1.8074336337266116e-07, + "loss": 0.65347564, + "num_input_tokens_seen": 311318070, + "step": 14432, + "time_per_iteration": 2.5565524101257324 + }, + { + "auxiliary_loss_clip": 0.0109965, + "auxiliary_loss_mlp": 0.01039814, + "balance_loss_clip": 1.03807235, + "balance_loss_mlp": 1.02821505, + "epoch": 0.8677589057568015, + "flos": 13589927256960.0, + "grad_norm": 1.9324335266361277, + "language_loss": 0.78167832, + "learning_rate": 1.8058160650165656e-07, + "loss": 0.80307293, + "num_input_tokens_seen": 311334885, + "step": 14433, + "time_per_iteration": 2.603163242340088 + }, + { + "auxiliary_loss_clip": 0.01010943, + "auxiliary_loss_mlp": 0.01002541, + "balance_loss_clip": 1.00770855, + "balance_loss_mlp": 1.00159311, + "epoch": 0.8678190290094694, + "flos": 68933657370240.0, + "grad_norm": 0.7061148841104811, + "language_loss": 0.5846473, + "learning_rate": 1.804199186231805e-07, + "loss": 0.6047821, + "num_input_tokens_seen": 311399780, + "step": 14434, + "time_per_iteration": 3.2711222171783447 + }, + { + "auxiliary_loss_clip": 0.01084546, + "auxiliary_loss_mlp": 0.01034123, + "balance_loss_clip": 1.03522635, + "balance_loss_mlp": 1.02258372, + "epoch": 0.8678791522621374, + "flos": 32557726776960.0, + "grad_norm": 1.9678570849349808, + "language_loss": 0.80160731, + "learning_rate": 1.802582997433628e-07, + "loss": 0.82279408, + "num_input_tokens_seen": 311419610, + "step": 14435, + "time_per_iteration": 2.729384660720825 + }, + { + "auxiliary_loss_clip": 0.0108652, + "auxiliary_loss_mlp": 0.00771159, + "balance_loss_clip": 1.03368807, + "balance_loss_mlp": 1.00019312, + "epoch": 0.8679392755148053, + "flos": 35042637657600.0, + "grad_norm": 2.323598256693539, + "language_loss": 0.62088466, + "learning_rate": 1.8009674986833322e-07, + "loss": 0.63946146, + "num_input_tokens_seen": 311440045, + "step": 14436, + "time_per_iteration": 2.7514889240264893 + }, + { + "auxiliary_loss_clip": 0.01084626, + "auxiliary_loss_mlp": 0.01030406, + "balance_loss_clip": 1.03650117, + "balance_loss_mlp": 1.01762128, + "epoch": 0.8679993987674733, + "flos": 18552494471040.0, + "grad_norm": 2.2152793164861477, + "language_loss": 0.70417553, + "learning_rate": 1.7993526900421706e-07, + "loss": 0.72532582, + "num_input_tokens_seen": 311456660, + "step": 14437, + "time_per_iteration": 2.682568311691284 + }, + { + "auxiliary_loss_clip": 0.01073964, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.03458905, + "balance_loss_mlp": 1.01451957, + "epoch": 0.8680595220201412, + "flos": 27454390162560.0, + "grad_norm": 1.9672371609341477, + "language_loss": 0.80644393, + "learning_rate": 1.797738571571381e-07, + "loss": 0.8274526, + "num_input_tokens_seen": 311475460, + "step": 14438, + "time_per_iteration": 2.7269651889801025 + }, + { + "auxiliary_loss_clip": 0.01089468, + "auxiliary_loss_mlp": 0.01024249, + "balance_loss_clip": 1.035025, + "balance_loss_mlp": 1.01237011, + "epoch": 0.8681196452728093, + "flos": 19208797822080.0, + "grad_norm": 1.7527538645260887, + "language_loss": 0.67584556, + "learning_rate": 1.7961251433321656e-07, + "loss": 0.69698274, + "num_input_tokens_seen": 311494575, + "step": 14439, + "time_per_iteration": 2.581627130508423 + }, + { + "auxiliary_loss_clip": 0.01096234, + "auxiliary_loss_mlp": 0.01034212, + "balance_loss_clip": 1.03661394, + "balance_loss_mlp": 1.02268469, + "epoch": 0.8681797685254772, + "flos": 37560442417920.0, + "grad_norm": 1.484819058237711, + "language_loss": 0.63649923, + "learning_rate": 1.7945124053857085e-07, + "loss": 0.65780365, + "num_input_tokens_seen": 311515805, + "step": 14440, + "time_per_iteration": 2.761298656463623 + }, + { + "auxiliary_loss_clip": 0.01095909, + "auxiliary_loss_mlp": 0.01034623, + "balance_loss_clip": 1.03644252, + "balance_loss_mlp": 1.02241611, + "epoch": 0.8682398917781452, + "flos": 23289937194240.0, + "grad_norm": 1.7310260750928266, + "language_loss": 0.66075879, + "learning_rate": 1.7929003577931722e-07, + "loss": 0.68206406, + "num_input_tokens_seen": 311536000, + "step": 14441, + "time_per_iteration": 2.5800838470458984 + }, + { + "auxiliary_loss_clip": 0.01091494, + "auxiliary_loss_mlp": 0.01025353, + "balance_loss_clip": 1.0385139, + "balance_loss_mlp": 1.0138557, + "epoch": 0.8683000150308132, + "flos": 21872794936320.0, + "grad_norm": 1.681496330113871, + "language_loss": 0.66083562, + "learning_rate": 1.7912890006156722e-07, + "loss": 0.68200409, + "num_input_tokens_seen": 311556220, + "step": 14442, + "time_per_iteration": 2.642595052719116 + }, + { + "auxiliary_loss_clip": 0.01084435, + "auxiliary_loss_mlp": 0.01033988, + "balance_loss_clip": 1.03615665, + "balance_loss_mlp": 1.02031493, + "epoch": 0.8683601382834811, + "flos": 14647209108480.0, + "grad_norm": 1.780014180776551, + "language_loss": 0.72400081, + "learning_rate": 1.7896783339143195e-07, + "loss": 0.74518502, + "num_input_tokens_seen": 311572530, + "step": 14443, + "time_per_iteration": 2.621661901473999 + }, + { + "auxiliary_loss_clip": 0.01109856, + "auxiliary_loss_mlp": 0.01028336, + "balance_loss_clip": 1.0374794, + "balance_loss_mlp": 1.01575971, + "epoch": 0.8684202615361492, + "flos": 26359904799360.0, + "grad_norm": 1.7034879908488254, + "language_loss": 0.83455396, + "learning_rate": 1.7880683577501877e-07, + "loss": 0.85593581, + "num_input_tokens_seen": 311591105, + "step": 14444, + "time_per_iteration": 2.5682990550994873 + }, + { + "auxiliary_loss_clip": 0.01071317, + "auxiliary_loss_mlp": 0.01030828, + "balance_loss_clip": 1.03839469, + "balance_loss_mlp": 1.0183413, + "epoch": 0.8684803847888171, + "flos": 20704010290560.0, + "grad_norm": 1.882585411960033, + "language_loss": 0.77276009, + "learning_rate": 1.7864590721843342e-07, + "loss": 0.79378152, + "num_input_tokens_seen": 311608350, + "step": 14445, + "time_per_iteration": 2.6933975219726562 + }, + { + "auxiliary_loss_clip": 0.01097793, + "auxiliary_loss_mlp": 0.01031892, + "balance_loss_clip": 1.0368073, + "balance_loss_mlp": 1.01954842, + "epoch": 0.8685405080414851, + "flos": 22638123043200.0, + "grad_norm": 1.8974232570725826, + "language_loss": 0.68224823, + "learning_rate": 1.7848504772777728e-07, + "loss": 0.70354509, + "num_input_tokens_seen": 311626380, + "step": 14446, + "time_per_iteration": 2.6505656242370605 + }, + { + "auxiliary_loss_clip": 0.01093238, + "auxiliary_loss_mlp": 0.01034173, + "balance_loss_clip": 1.03448546, + "balance_loss_mlp": 1.0214951, + "epoch": 0.868600631294153, + "flos": 24822065865600.0, + "grad_norm": 1.831558393609818, + "language_loss": 0.83143735, + "learning_rate": 1.7832425730915102e-07, + "loss": 0.85271144, + "num_input_tokens_seen": 311644345, + "step": 14447, + "time_per_iteration": 2.5855720043182373 + }, + { + "auxiliary_loss_clip": 0.01028885, + "auxiliary_loss_mlp": 0.01028809, + "balance_loss_clip": 1.03098965, + "balance_loss_mlp": 1.01697183, + "epoch": 0.868660754546821, + "flos": 25113983696640.0, + "grad_norm": 1.612042145706922, + "language_loss": 0.74218094, + "learning_rate": 1.781635359686515e-07, + "loss": 0.76275784, + "num_input_tokens_seen": 311663340, + "step": 14448, + "time_per_iteration": 2.75423002243042 + }, + { + "auxiliary_loss_clip": 0.01081834, + "auxiliary_loss_mlp": 0.01032637, + "balance_loss_clip": 1.03381288, + "balance_loss_mlp": 1.01907682, + "epoch": 0.8687208777994889, + "flos": 12677832178560.0, + "grad_norm": 1.9294306155040917, + "language_loss": 0.79997855, + "learning_rate": 1.7800288371237303e-07, + "loss": 0.82112324, + "num_input_tokens_seen": 311679860, + "step": 14449, + "time_per_iteration": 2.6481199264526367 + }, + { + "auxiliary_loss_clip": 0.0100162, + "auxiliary_loss_mlp": 0.01004017, + "balance_loss_clip": 1.00803828, + "balance_loss_mlp": 1.0030396, + "epoch": 0.8687810010521569, + "flos": 65617235573760.0, + "grad_norm": 0.811742362179789, + "language_loss": 0.60572553, + "learning_rate": 1.7784230054640758e-07, + "loss": 0.62578189, + "num_input_tokens_seen": 311738135, + "step": 14450, + "time_per_iteration": 3.1744225025177 + }, + { + "auxiliary_loss_clip": 0.01084674, + "auxiliary_loss_mlp": 0.01030905, + "balance_loss_clip": 1.03782833, + "balance_loss_mlp": 1.01882339, + "epoch": 0.8688411243048249, + "flos": 24244012293120.0, + "grad_norm": 1.7384604154685417, + "language_loss": 0.76132762, + "learning_rate": 1.7768178647684517e-07, + "loss": 0.78248346, + "num_input_tokens_seen": 311756975, + "step": 14451, + "time_per_iteration": 2.71647310256958 + }, + { + "auxiliary_loss_clip": 0.01093999, + "auxiliary_loss_mlp": 0.01027005, + "balance_loss_clip": 1.03542089, + "balance_loss_mlp": 1.01485705, + "epoch": 0.8689012475574929, + "flos": 18221828843520.0, + "grad_norm": 3.077369236554663, + "language_loss": 0.71929884, + "learning_rate": 1.7752134150977205e-07, + "loss": 0.74050885, + "num_input_tokens_seen": 311771830, + "step": 14452, + "time_per_iteration": 2.6421010494232178 + }, + { + "auxiliary_loss_clip": 0.01086249, + "auxiliary_loss_mlp": 0.00770837, + "balance_loss_clip": 1.03687978, + "balance_loss_mlp": 1.00033617, + "epoch": 0.8689613708101608, + "flos": 19646728439040.0, + "grad_norm": 1.4971300186991454, + "language_loss": 0.72101021, + "learning_rate": 1.7736096565127201e-07, + "loss": 0.73958105, + "num_input_tokens_seen": 311790130, + "step": 14453, + "time_per_iteration": 2.6629247665405273 + }, + { + "auxiliary_loss_clip": 0.01096295, + "auxiliary_loss_mlp": 0.01035101, + "balance_loss_clip": 1.03675365, + "balance_loss_mlp": 1.02261996, + "epoch": 0.8690214940628288, + "flos": 11728749070080.0, + "grad_norm": 3.182912447217293, + "language_loss": 0.73198676, + "learning_rate": 1.7720065890742664e-07, + "loss": 0.75330073, + "num_input_tokens_seen": 311808360, + "step": 14454, + "time_per_iteration": 2.6625709533691406 + }, + { + "auxiliary_loss_clip": 0.01109645, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.03889263, + "balance_loss_mlp": 1.01947582, + "epoch": 0.8690816173154968, + "flos": 34936450076160.0, + "grad_norm": 2.5283080783573615, + "language_loss": 0.59421092, + "learning_rate": 1.7704042128431552e-07, + "loss": 0.61562192, + "num_input_tokens_seen": 311831325, + "step": 14455, + "time_per_iteration": 2.716947078704834 + }, + { + "auxiliary_loss_clip": 0.01088564, + "auxiliary_loss_mlp": 0.01031079, + "balance_loss_clip": 1.03601408, + "balance_loss_mlp": 1.0188961, + "epoch": 0.8691417405681647, + "flos": 11614804151040.0, + "grad_norm": 2.476455717843228, + "language_loss": 0.80191058, + "learning_rate": 1.7688025278801378e-07, + "loss": 0.823107, + "num_input_tokens_seen": 311848090, + "step": 14456, + "time_per_iteration": 2.608692169189453 + }, + { + "auxiliary_loss_clip": 0.01050256, + "auxiliary_loss_mlp": 0.01043748, + "balance_loss_clip": 1.03250086, + "balance_loss_mlp": 1.02862024, + "epoch": 0.8692018638208328, + "flos": 24608038677120.0, + "grad_norm": 3.350924717538294, + "language_loss": 0.74652326, + "learning_rate": 1.7672015342459568e-07, + "loss": 0.76746327, + "num_input_tokens_seen": 311867855, + "step": 14457, + "time_per_iteration": 2.8124382495880127 + }, + { + "auxiliary_loss_clip": 0.0104746, + "auxiliary_loss_mlp": 0.0103056, + "balance_loss_clip": 1.03249383, + "balance_loss_mlp": 1.01879406, + "epoch": 0.8692619870735007, + "flos": 25995124229760.0, + "grad_norm": 1.6659706537885548, + "language_loss": 0.78279102, + "learning_rate": 1.765601232001328e-07, + "loss": 0.80357122, + "num_input_tokens_seen": 311888675, + "step": 14458, + "time_per_iteration": 2.7865068912506104 + }, + { + "auxiliary_loss_clip": 0.0109921, + "auxiliary_loss_mlp": 0.01034656, + "balance_loss_clip": 1.0370791, + "balance_loss_mlp": 1.02152491, + "epoch": 0.8693221103261687, + "flos": 18041808856320.0, + "grad_norm": 1.8944843149653803, + "language_loss": 0.70788461, + "learning_rate": 1.7640016212069187e-07, + "loss": 0.72922325, + "num_input_tokens_seen": 311907310, + "step": 14459, + "time_per_iteration": 4.2408952713012695 + }, + { + "auxiliary_loss_clip": 0.01082625, + "auxiliary_loss_mlp": 0.01030636, + "balance_loss_clip": 1.03549707, + "balance_loss_mlp": 1.01960313, + "epoch": 0.8693822335788366, + "flos": 27492347859840.0, + "grad_norm": 1.4467054831762125, + "language_loss": 0.73848921, + "learning_rate": 1.762402701923398e-07, + "loss": 0.75962174, + "num_input_tokens_seen": 311929635, + "step": 14460, + "time_per_iteration": 4.442849636077881 + }, + { + "auxiliary_loss_clip": 0.01092251, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.03765035, + "balance_loss_mlp": 1.02094245, + "epoch": 0.8694423568315046, + "flos": 24097712198400.0, + "grad_norm": 1.8288235329592715, + "language_loss": 0.64751619, + "learning_rate": 1.7608044742113947e-07, + "loss": 0.66877288, + "num_input_tokens_seen": 311948800, + "step": 14461, + "time_per_iteration": 2.68937087059021 + }, + { + "auxiliary_loss_clip": 0.01093111, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.03253245, + "balance_loss_mlp": 1.0203414, + "epoch": 0.8695024800841725, + "flos": 18362131367040.0, + "grad_norm": 2.5518242110711933, + "language_loss": 0.82737637, + "learning_rate": 1.7592069381315123e-07, + "loss": 0.84863782, + "num_input_tokens_seen": 311964090, + "step": 14462, + "time_per_iteration": 5.744420289993286 + }, + { + "auxiliary_loss_clip": 0.01096615, + "auxiliary_loss_mlp": 0.01033503, + "balance_loss_clip": 1.03401327, + "balance_loss_mlp": 1.02065229, + "epoch": 0.8695626033368405, + "flos": 14027750133120.0, + "grad_norm": 1.890249833404203, + "language_loss": 0.65323138, + "learning_rate": 1.757610093744335e-07, + "loss": 0.67453253, + "num_input_tokens_seen": 311981460, + "step": 14463, + "time_per_iteration": 2.601334810256958 + }, + { + "auxiliary_loss_clip": 0.01091864, + "auxiliary_loss_mlp": 0.01035596, + "balance_loss_clip": 1.03908527, + "balance_loss_mlp": 1.02291179, + "epoch": 0.8696227265895085, + "flos": 16836862193280.0, + "grad_norm": 2.1647226205532206, + "language_loss": 0.66890931, + "learning_rate": 1.7560139411104058e-07, + "loss": 0.690184, + "num_input_tokens_seen": 312000115, + "step": 14464, + "time_per_iteration": 2.6851119995117188 + }, + { + "auxiliary_loss_clip": 0.01090151, + "auxiliary_loss_mlp": 0.01034839, + "balance_loss_clip": 1.03739452, + "balance_loss_mlp": 1.02226293, + "epoch": 0.8696828498421765, + "flos": 21799070271360.0, + "grad_norm": 2.2457253344226245, + "language_loss": 0.62439811, + "learning_rate": 1.7544184802902607e-07, + "loss": 0.64564812, + "num_input_tokens_seen": 312020770, + "step": 14465, + "time_per_iteration": 2.79040265083313 + }, + { + "auxiliary_loss_clip": 0.01091695, + "auxiliary_loss_mlp": 0.01041479, + "balance_loss_clip": 1.03505969, + "balance_loss_mlp": 1.03027892, + "epoch": 0.8697429730948444, + "flos": 22894812610560.0, + "grad_norm": 1.5293603652202958, + "language_loss": 0.84881204, + "learning_rate": 1.7528237113443934e-07, + "loss": 0.87014377, + "num_input_tokens_seen": 312041870, + "step": 14466, + "time_per_iteration": 2.636146306991577 + }, + { + "auxiliary_loss_clip": 0.0108122, + "auxiliary_loss_mlp": 0.01044083, + "balance_loss_clip": 1.03755033, + "balance_loss_mlp": 1.02939653, + "epoch": 0.8698030963475124, + "flos": 24717458482560.0, + "grad_norm": 2.8453884595631846, + "language_loss": 0.61869633, + "learning_rate": 1.7512296343332779e-07, + "loss": 0.63994938, + "num_input_tokens_seen": 312058210, + "step": 14467, + "time_per_iteration": 2.6638076305389404 + }, + { + "auxiliary_loss_clip": 0.01103261, + "auxiliary_loss_mlp": 0.01028354, + "balance_loss_clip": 1.03525686, + "balance_loss_mlp": 1.01705909, + "epoch": 0.8698632196001803, + "flos": 28442221067520.0, + "grad_norm": 1.4153650067531596, + "language_loss": 0.68961638, + "learning_rate": 1.7496362493173655e-07, + "loss": 0.71093249, + "num_input_tokens_seen": 312082665, + "step": 14468, + "time_per_iteration": 2.6570017337799072 + }, + { + "auxiliary_loss_clip": 0.01083749, + "auxiliary_loss_mlp": 0.01030565, + "balance_loss_clip": 1.03446794, + "balance_loss_mlp": 1.01894248, + "epoch": 0.8699233428528483, + "flos": 27636457224960.0, + "grad_norm": 1.5754041648724575, + "language_loss": 0.71199894, + "learning_rate": 1.7480435563570773e-07, + "loss": 0.73314214, + "num_input_tokens_seen": 312101960, + "step": 14469, + "time_per_iteration": 2.6813437938690186 + }, + { + "auxiliary_loss_clip": 0.01091595, + "auxiliary_loss_mlp": 0.01032263, + "balance_loss_clip": 1.03561163, + "balance_loss_mlp": 1.0210638, + "epoch": 0.8699834661055164, + "flos": 20045659864320.0, + "grad_norm": 1.885135452961054, + "language_loss": 0.84151506, + "learning_rate": 1.7464515555128024e-07, + "loss": 0.86275363, + "num_input_tokens_seen": 312117125, + "step": 14470, + "time_per_iteration": 2.6702113151550293 + }, + { + "auxiliary_loss_clip": 0.01081371, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.03428483, + "balance_loss_mlp": 1.02214813, + "epoch": 0.8700435893581843, + "flos": 23732787974400.0, + "grad_norm": 1.7089523138026592, + "language_loss": 0.72859287, + "learning_rate": 1.7448602468449148e-07, + "loss": 0.74975377, + "num_input_tokens_seen": 312135775, + "step": 14471, + "time_per_iteration": 2.695295572280884 + }, + { + "auxiliary_loss_clip": 0.01107843, + "auxiliary_loss_mlp": 0.01025754, + "balance_loss_clip": 1.03751683, + "balance_loss_mlp": 1.01464319, + "epoch": 0.8701037126108523, + "flos": 23548422441600.0, + "grad_norm": 1.3968254989831368, + "language_loss": 0.78822994, + "learning_rate": 1.7432696304137573e-07, + "loss": 0.80956596, + "num_input_tokens_seen": 312156070, + "step": 14472, + "time_per_iteration": 2.570103883743286 + }, + { + "auxiliary_loss_clip": 0.01091602, + "auxiliary_loss_mlp": 0.00771146, + "balance_loss_clip": 1.03555846, + "balance_loss_mlp": 1.00026262, + "epoch": 0.8701638358635202, + "flos": 18843442634880.0, + "grad_norm": 2.053518578575987, + "language_loss": 0.72808838, + "learning_rate": 1.741679706279644e-07, + "loss": 0.74671578, + "num_input_tokens_seen": 312174380, + "step": 14473, + "time_per_iteration": 2.5629189014434814 + }, + { + "auxiliary_loss_clip": 0.01111529, + "auxiliary_loss_mlp": 0.01033322, + "balance_loss_clip": 1.03829002, + "balance_loss_mlp": 1.02074575, + "epoch": 0.8702239591161882, + "flos": 27928339142400.0, + "grad_norm": 1.5975251132862047, + "language_loss": 0.72459877, + "learning_rate": 1.7400904745028644e-07, + "loss": 0.74604738, + "num_input_tokens_seen": 312195130, + "step": 14474, + "time_per_iteration": 2.629110097885132 + }, + { + "auxiliary_loss_clip": 0.01084584, + "auxiliary_loss_mlp": 0.01037278, + "balance_loss_clip": 1.03387856, + "balance_loss_mlp": 1.02389669, + "epoch": 0.8702840823688561, + "flos": 17233997938560.0, + "grad_norm": 1.7683975899654203, + "language_loss": 0.67307568, + "learning_rate": 1.7385019351436925e-07, + "loss": 0.69429433, + "num_input_tokens_seen": 312212300, + "step": 14475, + "time_per_iteration": 2.7122128009796143 + }, + { + "auxiliary_loss_clip": 0.01107714, + "auxiliary_loss_mlp": 0.01025637, + "balance_loss_clip": 1.03506005, + "balance_loss_mlp": 1.01282167, + "epoch": 0.8703442056215241, + "flos": 19427565605760.0, + "grad_norm": 1.7492617051008474, + "language_loss": 0.77730834, + "learning_rate": 1.736914088262349e-07, + "loss": 0.79864192, + "num_input_tokens_seen": 312231735, + "step": 14476, + "time_per_iteration": 2.6359400749206543 + }, + { + "auxiliary_loss_clip": 0.01090317, + "auxiliary_loss_mlp": 0.01034999, + "balance_loss_clip": 1.03377438, + "balance_loss_mlp": 1.02168965, + "epoch": 0.8704043288741921, + "flos": 22273845264000.0, + "grad_norm": 1.9949328659253254, + "language_loss": 0.72224838, + "learning_rate": 1.7353269339190525e-07, + "loss": 0.74350154, + "num_input_tokens_seen": 312253060, + "step": 14477, + "time_per_iteration": 2.7253026962280273 + }, + { + "auxiliary_loss_clip": 0.01100703, + "auxiliary_loss_mlp": 0.01029682, + "balance_loss_clip": 1.03792751, + "balance_loss_mlp": 1.01752841, + "epoch": 0.8704644521268601, + "flos": 16648725732480.0, + "grad_norm": 1.8285670196603703, + "language_loss": 0.59689963, + "learning_rate": 1.7337404721739946e-07, + "loss": 0.61820352, + "num_input_tokens_seen": 312269460, + "step": 14478, + "time_per_iteration": 2.6406443119049072 + }, + { + "auxiliary_loss_clip": 0.01099279, + "auxiliary_loss_mlp": 0.01028306, + "balance_loss_clip": 1.04014349, + "balance_loss_mlp": 1.01780367, + "epoch": 0.870524575379528, + "flos": 24280210224000.0, + "grad_norm": 1.716825353140286, + "language_loss": 0.71369159, + "learning_rate": 1.732154703087323e-07, + "loss": 0.73496747, + "num_input_tokens_seen": 312289830, + "step": 14479, + "time_per_iteration": 2.6733837127685547 + }, + { + "auxiliary_loss_clip": 0.01084359, + "auxiliary_loss_mlp": 0.01031083, + "balance_loss_clip": 1.03538418, + "balance_loss_mlp": 1.01857221, + "epoch": 0.870584698632196, + "flos": 28768684803840.0, + "grad_norm": 1.4964038489812062, + "language_loss": 0.70916605, + "learning_rate": 1.7305696267191805e-07, + "loss": 0.73032045, + "num_input_tokens_seen": 312311320, + "step": 14480, + "time_per_iteration": 2.724393367767334 + }, + { + "auxiliary_loss_clip": 0.01056493, + "auxiliary_loss_mlp": 0.01033786, + "balance_loss_clip": 1.03123474, + "balance_loss_mlp": 1.0217936, + "epoch": 0.8706448218848639, + "flos": 32449635774720.0, + "grad_norm": 1.7419679363065612, + "language_loss": 0.70210093, + "learning_rate": 1.728985243129666e-07, + "loss": 0.72300369, + "num_input_tokens_seen": 312332095, + "step": 14481, + "time_per_iteration": 2.9082820415496826 + }, + { + "auxiliary_loss_clip": 0.01096033, + "auxiliary_loss_mlp": 0.0103073, + "balance_loss_clip": 1.03603554, + "balance_loss_mlp": 1.01895249, + "epoch": 0.8707049451375319, + "flos": 22748009725440.0, + "grad_norm": 1.9715155189450182, + "language_loss": 0.76938367, + "learning_rate": 1.7274015523788643e-07, + "loss": 0.79065132, + "num_input_tokens_seen": 312351225, + "step": 14482, + "time_per_iteration": 2.663579225540161 + }, + { + "auxiliary_loss_clip": 0.0108459, + "auxiliary_loss_mlp": 0.01033132, + "balance_loss_clip": 1.03461742, + "balance_loss_mlp": 1.02019787, + "epoch": 0.8707650683902, + "flos": 15851976203520.0, + "grad_norm": 1.9099743094346329, + "language_loss": 0.76708519, + "learning_rate": 1.7258185545268234e-07, + "loss": 0.78826237, + "num_input_tokens_seen": 312369730, + "step": 14483, + "time_per_iteration": 2.6323695182800293 + }, + { + "auxiliary_loss_clip": 0.01102699, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.03712118, + "balance_loss_mlp": 1.02540636, + "epoch": 0.8708251916428679, + "flos": 16468131127680.0, + "grad_norm": 2.2142588001680856, + "language_loss": 0.61881113, + "learning_rate": 1.7242362496335749e-07, + "loss": 0.64022452, + "num_input_tokens_seen": 312386780, + "step": 14484, + "time_per_iteration": 2.710033893585205 + }, + { + "auxiliary_loss_clip": 0.01108847, + "auxiliary_loss_mlp": 0.01031795, + "balance_loss_clip": 1.03816152, + "balance_loss_mlp": 1.01980281, + "epoch": 0.8708853148955359, + "flos": 15377847655680.0, + "grad_norm": 2.077574729557336, + "language_loss": 0.68238926, + "learning_rate": 1.7226546377591222e-07, + "loss": 0.70379567, + "num_input_tokens_seen": 312404875, + "step": 14485, + "time_per_iteration": 2.5754683017730713 + }, + { + "auxiliary_loss_clip": 0.01050138, + "auxiliary_loss_mlp": 0.00770399, + "balance_loss_clip": 1.03129363, + "balance_loss_mlp": 1.00021982, + "epoch": 0.8709454381482038, + "flos": 30551325903360.0, + "grad_norm": 1.7252030737684174, + "language_loss": 0.62990439, + "learning_rate": 1.7210737189634373e-07, + "loss": 0.64810973, + "num_input_tokens_seen": 312425280, + "step": 14486, + "time_per_iteration": 2.9066638946533203 + }, + { + "auxiliary_loss_clip": 0.01111488, + "auxiliary_loss_mlp": 0.01033225, + "balance_loss_clip": 1.03683174, + "balance_loss_mlp": 1.02015388, + "epoch": 0.8710055614008718, + "flos": 22601422321920.0, + "grad_norm": 1.8160916488481187, + "language_loss": 0.61385965, + "learning_rate": 1.7194934933064653e-07, + "loss": 0.63530672, + "num_input_tokens_seen": 312443835, + "step": 14487, + "time_per_iteration": 2.5739262104034424 + }, + { + "auxiliary_loss_clip": 0.01081023, + "auxiliary_loss_mlp": 0.00768637, + "balance_loss_clip": 1.03572392, + "balance_loss_mlp": 1.00022483, + "epoch": 0.8710656846535397, + "flos": 18443146492800.0, + "grad_norm": 2.0123613366122126, + "language_loss": 0.67942166, + "learning_rate": 1.7179139608481318e-07, + "loss": 0.6979183, + "num_input_tokens_seen": 312460830, + "step": 14488, + "time_per_iteration": 2.7428195476531982 + }, + { + "auxiliary_loss_clip": 0.01092486, + "auxiliary_loss_mlp": 0.007699, + "balance_loss_clip": 1.03904903, + "balance_loss_mlp": 1.00028038, + "epoch": 0.8711258079062077, + "flos": 16503862181760.0, + "grad_norm": 1.8864520858010565, + "language_loss": 0.85530466, + "learning_rate": 1.716335121648338e-07, + "loss": 0.87392855, + "num_input_tokens_seen": 312477575, + "step": 14489, + "time_per_iteration": 2.647411346435547 + }, + { + "auxiliary_loss_clip": 0.01102857, + "auxiliary_loss_mlp": 0.01030869, + "balance_loss_clip": 1.03787231, + "balance_loss_mlp": 1.01791716, + "epoch": 0.8711859311588757, + "flos": 15663336952320.0, + "grad_norm": 11.279745936995974, + "language_loss": 0.75571835, + "learning_rate": 1.7147569757669445e-07, + "loss": 0.77705562, + "num_input_tokens_seen": 312492140, + "step": 14490, + "time_per_iteration": 2.602102041244507 + }, + { + "auxiliary_loss_clip": 0.01100977, + "auxiliary_loss_mlp": 0.01029449, + "balance_loss_clip": 1.03637326, + "balance_loss_mlp": 1.01625216, + "epoch": 0.8712460544115437, + "flos": 15557544420480.0, + "grad_norm": 2.2840810833035157, + "language_loss": 0.7581045, + "learning_rate": 1.7131795232638012e-07, + "loss": 0.77940881, + "num_input_tokens_seen": 312508400, + "step": 14491, + "time_per_iteration": 2.600862503051758 + }, + { + "auxiliary_loss_clip": 0.01080925, + "auxiliary_loss_mlp": 0.01026616, + "balance_loss_clip": 1.04117799, + "balance_loss_mlp": 1.01437354, + "epoch": 0.8713061776642116, + "flos": 16763568491520.0, + "grad_norm": 1.774399528748011, + "language_loss": 0.67152178, + "learning_rate": 1.711602764198723e-07, + "loss": 0.69259721, + "num_input_tokens_seen": 312525915, + "step": 14492, + "time_per_iteration": 2.666191577911377 + }, + { + "auxiliary_loss_clip": 0.01095889, + "auxiliary_loss_mlp": 0.01032256, + "balance_loss_clip": 1.03753376, + "balance_loss_mlp": 1.02096081, + "epoch": 0.8713663009168796, + "flos": 24279887001600.0, + "grad_norm": 1.7247817112541417, + "language_loss": 0.6931386, + "learning_rate": 1.7100266986314992e-07, + "loss": 0.71442008, + "num_input_tokens_seen": 312544735, + "step": 14493, + "time_per_iteration": 2.6735992431640625 + }, + { + "auxiliary_loss_clip": 0.01112164, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.03958261, + "balance_loss_mlp": 1.02021742, + "epoch": 0.8714264241695475, + "flos": 23795594904960.0, + "grad_norm": 2.938022699932479, + "language_loss": 0.8914628, + "learning_rate": 1.7084513266218936e-07, + "loss": 0.91292143, + "num_input_tokens_seen": 312557910, + "step": 14494, + "time_per_iteration": 2.5774879455566406 + }, + { + "auxiliary_loss_clip": 0.01074718, + "auxiliary_loss_mlp": 0.01032805, + "balance_loss_clip": 1.03785324, + "balance_loss_mlp": 1.02117586, + "epoch": 0.8714865474222155, + "flos": 37997942071680.0, + "grad_norm": 1.9797291272398052, + "language_loss": 0.59116101, + "learning_rate": 1.7068766482296514e-07, + "loss": 0.61223626, + "num_input_tokens_seen": 312580360, + "step": 14495, + "time_per_iteration": 2.8289716243743896 + }, + { + "auxiliary_loss_clip": 0.01076759, + "auxiliary_loss_mlp": 0.01037611, + "balance_loss_clip": 1.03488982, + "balance_loss_mlp": 1.02474201, + "epoch": 0.8715466706748836, + "flos": 22455696844800.0, + "grad_norm": 2.176188158663058, + "language_loss": 0.80262101, + "learning_rate": 1.7053026635144762e-07, + "loss": 0.82376468, + "num_input_tokens_seen": 312597550, + "step": 14496, + "time_per_iteration": 2.6638436317443848 + }, + { + "auxiliary_loss_clip": 0.01083126, + "auxiliary_loss_mlp": 0.01037102, + "balance_loss_clip": 1.03796446, + "balance_loss_mlp": 1.02335715, + "epoch": 0.8716067939275515, + "flos": 21215126868480.0, + "grad_norm": 2.0021272743800536, + "language_loss": 0.78574479, + "learning_rate": 1.7037293725360624e-07, + "loss": 0.80694699, + "num_input_tokens_seen": 312616435, + "step": 14497, + "time_per_iteration": 2.6190896034240723 + }, + { + "auxiliary_loss_clip": 0.01111391, + "auxiliary_loss_mlp": 0.01030843, + "balance_loss_clip": 1.03765655, + "balance_loss_mlp": 1.01795101, + "epoch": 0.8716669171802195, + "flos": 22997732054400.0, + "grad_norm": 1.9670976270372313, + "language_loss": 0.67136586, + "learning_rate": 1.70215677535406e-07, + "loss": 0.69278824, + "num_input_tokens_seen": 312632770, + "step": 14498, + "time_per_iteration": 4.060052394866943 + }, + { + "auxiliary_loss_clip": 0.01070213, + "auxiliary_loss_mlp": 0.01031582, + "balance_loss_clip": 1.03320992, + "balance_loss_mlp": 1.01950634, + "epoch": 0.8717270404328874, + "flos": 29784058462080.0, + "grad_norm": 1.6975392941334262, + "language_loss": 0.57051951, + "learning_rate": 1.700584872028108e-07, + "loss": 0.59153748, + "num_input_tokens_seen": 312651900, + "step": 14499, + "time_per_iteration": 4.371240615844727 + }, + { + "auxiliary_loss_clip": 0.01067535, + "auxiliary_loss_mlp": 0.01035634, + "balance_loss_clip": 1.03329492, + "balance_loss_mlp": 1.02273571, + "epoch": 0.8717871636855554, + "flos": 22018125363840.0, + "grad_norm": 2.018070377597452, + "language_loss": 0.79869312, + "learning_rate": 1.6990136626178097e-07, + "loss": 0.8197248, + "num_input_tokens_seen": 312671380, + "step": 14500, + "time_per_iteration": 2.641244888305664 + }, + { + "auxiliary_loss_clip": 0.01093156, + "auxiliary_loss_mlp": 0.0103193, + "balance_loss_clip": 1.03767002, + "balance_loss_mlp": 1.01997352, + "epoch": 0.8718472869382233, + "flos": 16654256426880.0, + "grad_norm": 1.9117037031727822, + "language_loss": 0.72699761, + "learning_rate": 1.6974431471827466e-07, + "loss": 0.74824846, + "num_input_tokens_seen": 312689215, + "step": 14501, + "time_per_iteration": 5.817331552505493 + }, + { + "auxiliary_loss_clip": 0.01072933, + "auxiliary_loss_mlp": 0.0102922, + "balance_loss_clip": 1.03339136, + "balance_loss_mlp": 1.01612496, + "epoch": 0.8719074101908914, + "flos": 19495328613120.0, + "grad_norm": 2.7665364794887934, + "language_loss": 0.64852804, + "learning_rate": 1.695873325782482e-07, + "loss": 0.66954952, + "num_input_tokens_seen": 312706400, + "step": 14502, + "time_per_iteration": 2.730670690536499 + }, + { + "auxiliary_loss_clip": 0.01083793, + "auxiliary_loss_mlp": 0.01040001, + "balance_loss_clip": 1.03453636, + "balance_loss_mlp": 1.02594066, + "epoch": 0.8719675334435593, + "flos": 33070890430080.0, + "grad_norm": 1.7549915055892822, + "language_loss": 0.68897182, + "learning_rate": 1.6943041984765262e-07, + "loss": 0.71020973, + "num_input_tokens_seen": 312727985, + "step": 14503, + "time_per_iteration": 2.7599282264709473 + }, + { + "auxiliary_loss_clip": 0.01085187, + "auxiliary_loss_mlp": 0.01028495, + "balance_loss_clip": 1.03614664, + "balance_loss_mlp": 1.01606762, + "epoch": 0.8720276566962273, + "flos": 13626268842240.0, + "grad_norm": 2.4452833389757833, + "language_loss": 0.69641596, + "learning_rate": 1.6927357653243912e-07, + "loss": 0.71755278, + "num_input_tokens_seen": 312745025, + "step": 14504, + "time_per_iteration": 2.651085376739502 + }, + { + "auxiliary_loss_clip": 0.01095546, + "auxiliary_loss_mlp": 0.00770191, + "balance_loss_clip": 1.03598738, + "balance_loss_mlp": 1.00016737, + "epoch": 0.8720877799488952, + "flos": 23514163845120.0, + "grad_norm": 2.77338091149224, + "language_loss": 0.7014603, + "learning_rate": 1.691168026385552e-07, + "loss": 0.72011769, + "num_input_tokens_seen": 312764170, + "step": 14505, + "time_per_iteration": 2.6669936180114746 + }, + { + "auxiliary_loss_clip": 0.010867, + "auxiliary_loss_mlp": 0.01028087, + "balance_loss_clip": 1.03689265, + "balance_loss_mlp": 1.01638639, + "epoch": 0.8721479032015632, + "flos": 20814148368000.0, + "grad_norm": 2.005999921971975, + "language_loss": 0.78253883, + "learning_rate": 1.6896009817194545e-07, + "loss": 0.80368668, + "num_input_tokens_seen": 312783830, + "step": 14506, + "time_per_iteration": 2.657680034637451 + }, + { + "auxiliary_loss_clip": 0.01088712, + "auxiliary_loss_mlp": 0.01028485, + "balance_loss_clip": 1.03430939, + "balance_loss_mlp": 1.01588416, + "epoch": 0.8722080264542311, + "flos": 19463655795840.0, + "grad_norm": 2.6356366496590775, + "language_loss": 0.73982906, + "learning_rate": 1.6880346313855221e-07, + "loss": 0.76100105, + "num_input_tokens_seen": 312802015, + "step": 14507, + "time_per_iteration": 2.6549437046051025 + }, + { + "auxiliary_loss_clip": 0.01050345, + "auxiliary_loss_mlp": 0.01041227, + "balance_loss_clip": 1.03153408, + "balance_loss_mlp": 1.02601552, + "epoch": 0.8722681497068991, + "flos": 21761866759680.0, + "grad_norm": 2.186590491088002, + "language_loss": 0.72111464, + "learning_rate": 1.686468975443156e-07, + "loss": 0.74203038, + "num_input_tokens_seen": 312820650, + "step": 14508, + "time_per_iteration": 2.7782466411590576 + }, + { + "auxiliary_loss_clip": 0.01091843, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.03782344, + "balance_loss_mlp": 1.02198446, + "epoch": 0.8723282729595672, + "flos": 28877134942080.0, + "grad_norm": 9.271419619391889, + "language_loss": 0.68848205, + "learning_rate": 1.6849040139517202e-07, + "loss": 0.70975429, + "num_input_tokens_seen": 312841310, + "step": 14509, + "time_per_iteration": 2.729306221008301 + }, + { + "auxiliary_loss_clip": 0.01084143, + "auxiliary_loss_mlp": 0.01032677, + "balance_loss_clip": 1.03603458, + "balance_loss_mlp": 1.02049422, + "epoch": 0.8723883962122351, + "flos": 26469145036800.0, + "grad_norm": 1.83494283279599, + "language_loss": 0.58361018, + "learning_rate": 1.683339746970558e-07, + "loss": 0.60477841, + "num_input_tokens_seen": 312862100, + "step": 14510, + "time_per_iteration": 2.712592363357544 + }, + { + "auxiliary_loss_clip": 0.01115632, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.03837419, + "balance_loss_mlp": 1.01929033, + "epoch": 0.8724485194649031, + "flos": 20521476351360.0, + "grad_norm": 2.9455639532360003, + "language_loss": 0.67271483, + "learning_rate": 1.6817761745589865e-07, + "loss": 0.69419849, + "num_input_tokens_seen": 312880220, + "step": 14511, + "time_per_iteration": 2.6101818084716797 + }, + { + "auxiliary_loss_clip": 0.01066568, + "auxiliary_loss_mlp": 0.01035139, + "balance_loss_clip": 1.03755903, + "balance_loss_mlp": 1.02190125, + "epoch": 0.872508642717571, + "flos": 24353360271360.0, + "grad_norm": 1.5822238245751863, + "language_loss": 0.81579173, + "learning_rate": 1.6802132967763027e-07, + "loss": 0.8368088, + "num_input_tokens_seen": 312900765, + "step": 14512, + "time_per_iteration": 2.8737993240356445 + }, + { + "auxiliary_loss_clip": 0.01013613, + "auxiliary_loss_mlp": 0.01001982, + "balance_loss_clip": 1.01023149, + "balance_loss_mlp": 1.00103402, + "epoch": 0.872568765970239, + "flos": 61410012485760.0, + "grad_norm": 0.7938781120275261, + "language_loss": 0.58586168, + "learning_rate": 1.6786511136817617e-07, + "loss": 0.60601765, + "num_input_tokens_seen": 312955840, + "step": 14513, + "time_per_iteration": 3.0974059104919434 + }, + { + "auxiliary_loss_clip": 0.01099507, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.03738713, + "balance_loss_mlp": 1.02111769, + "epoch": 0.8726288892229069, + "flos": 22598046443520.0, + "grad_norm": 1.742471393477679, + "language_loss": 0.76562905, + "learning_rate": 1.6770896253346112e-07, + "loss": 0.78696269, + "num_input_tokens_seen": 312973565, + "step": 14514, + "time_per_iteration": 2.6420650482177734 + }, + { + "auxiliary_loss_clip": 0.01103565, + "auxiliary_loss_mlp": 0.0102728, + "balance_loss_clip": 1.03866398, + "balance_loss_mlp": 1.01560926, + "epoch": 0.872689012475575, + "flos": 25885201633920.0, + "grad_norm": 1.9498734403168592, + "language_loss": 0.6555599, + "learning_rate": 1.675528831794055e-07, + "loss": 0.67686838, + "num_input_tokens_seen": 312994660, + "step": 14515, + "time_per_iteration": 2.6264796257019043 + }, + { + "auxiliary_loss_clip": 0.01097256, + "auxiliary_loss_mlp": 0.01035195, + "balance_loss_clip": 1.03490353, + "balance_loss_mlp": 1.02188492, + "epoch": 0.8727491357282429, + "flos": 21506721477120.0, + "grad_norm": 2.001096470926363, + "language_loss": 0.79334152, + "learning_rate": 1.6739687331192842e-07, + "loss": 0.81466603, + "num_input_tokens_seen": 313009860, + "step": 14516, + "time_per_iteration": 2.620288133621216 + }, + { + "auxiliary_loss_clip": 0.01112304, + "auxiliary_loss_mlp": 0.01034072, + "balance_loss_clip": 1.0381372, + "balance_loss_mlp": 1.02127457, + "epoch": 0.8728092589809109, + "flos": 19207504932480.0, + "grad_norm": 2.0299342762070927, + "language_loss": 0.72229123, + "learning_rate": 1.672409329369453e-07, + "loss": 0.74375498, + "num_input_tokens_seen": 313027025, + "step": 14517, + "time_per_iteration": 2.5668914318084717 + }, + { + "auxiliary_loss_clip": 0.0106993, + "auxiliary_loss_mlp": 0.01024167, + "balance_loss_clip": 1.03314495, + "balance_loss_mlp": 1.01283014, + "epoch": 0.8728693822335788, + "flos": 20595308757120.0, + "grad_norm": 2.054216166652221, + "language_loss": 0.72725064, + "learning_rate": 1.6708506206036966e-07, + "loss": 0.74819165, + "num_input_tokens_seen": 313046830, + "step": 14518, + "time_per_iteration": 2.6475393772125244 + }, + { + "auxiliary_loss_clip": 0.01081214, + "auxiliary_loss_mlp": 0.01038057, + "balance_loss_clip": 1.03350496, + "balance_loss_mlp": 1.02506304, + "epoch": 0.8729295054862468, + "flos": 21728613744000.0, + "grad_norm": 1.3596830366410917, + "language_loss": 0.743572, + "learning_rate": 1.6692926068811275e-07, + "loss": 0.76476473, + "num_input_tokens_seen": 313067715, + "step": 14519, + "time_per_iteration": 2.6572721004486084 + }, + { + "auxiliary_loss_clip": 0.01099689, + "auxiliary_loss_mlp": 0.01031709, + "balance_loss_clip": 1.03680825, + "balance_loss_mlp": 1.0181669, + "epoch": 0.8729896287389147, + "flos": 17673436926720.0, + "grad_norm": 2.5396553116313205, + "language_loss": 0.76397449, + "learning_rate": 1.6677352882608142e-07, + "loss": 0.78528845, + "num_input_tokens_seen": 313082305, + "step": 14520, + "time_per_iteration": 2.5867063999176025 + }, + { + "auxiliary_loss_clip": 0.01086668, + "auxiliary_loss_mlp": 0.01036169, + "balance_loss_clip": 1.03518891, + "balance_loss_mlp": 1.02296638, + "epoch": 0.8730497519915827, + "flos": 24571804832640.0, + "grad_norm": 1.6038658913961292, + "language_loss": 0.82005751, + "learning_rate": 1.666178664801816e-07, + "loss": 0.84128582, + "num_input_tokens_seen": 313101190, + "step": 14521, + "time_per_iteration": 2.7092795372009277 + }, + { + "auxiliary_loss_clip": 0.01097676, + "auxiliary_loss_mlp": 0.01032217, + "balance_loss_clip": 1.03878248, + "balance_loss_mlp": 1.01914012, + "epoch": 0.8731098752442508, + "flos": 13443734903040.0, + "grad_norm": 1.8658353480537415, + "language_loss": 0.76242197, + "learning_rate": 1.6646227365631616e-07, + "loss": 0.78372091, + "num_input_tokens_seen": 313118965, + "step": 14522, + "time_per_iteration": 2.5802886486053467 + }, + { + "auxiliary_loss_clip": 0.01094482, + "auxiliary_loss_mlp": 0.00769289, + "balance_loss_clip": 1.03429079, + "balance_loss_mlp": 1.0001862, + "epoch": 0.8731699984969187, + "flos": 23474446381440.0, + "grad_norm": 3.16869295355315, + "language_loss": 0.75775874, + "learning_rate": 1.66306750360385e-07, + "loss": 0.77639639, + "num_input_tokens_seen": 313139280, + "step": 14523, + "time_per_iteration": 2.684039831161499 + }, + { + "auxiliary_loss_clip": 0.01097173, + "auxiliary_loss_mlp": 0.01032086, + "balance_loss_clip": 1.03595114, + "balance_loss_mlp": 1.01999831, + "epoch": 0.8732301217495867, + "flos": 17712651600000.0, + "grad_norm": 2.782713247138861, + "language_loss": 0.78118378, + "learning_rate": 1.6615129659828542e-07, + "loss": 0.80247641, + "num_input_tokens_seen": 313156655, + "step": 14524, + "time_per_iteration": 2.5906875133514404 + }, + { + "auxiliary_loss_clip": 0.01089545, + "auxiliary_loss_mlp": 0.01031376, + "balance_loss_clip": 1.03745615, + "balance_loss_mlp": 1.02009869, + "epoch": 0.8732902450022546, + "flos": 22054359208320.0, + "grad_norm": 4.924039303176845, + "language_loss": 0.77730787, + "learning_rate": 1.6599591237591272e-07, + "loss": 0.79851705, + "num_input_tokens_seen": 313174050, + "step": 14525, + "time_per_iteration": 2.6270298957824707 + }, + { + "auxiliary_loss_clip": 0.01020522, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.03363109, + "balance_loss_mlp": 1.02209401, + "epoch": 0.8733503682549226, + "flos": 22272983337600.0, + "grad_norm": 2.157402662097444, + "language_loss": 0.6920954, + "learning_rate": 1.6584059769915902e-07, + "loss": 0.71264577, + "num_input_tokens_seen": 313192765, + "step": 14526, + "time_per_iteration": 3.1794915199279785 + }, + { + "auxiliary_loss_clip": 0.01059512, + "auxiliary_loss_mlp": 0.01041504, + "balance_loss_clip": 1.03597927, + "balance_loss_mlp": 1.02804565, + "epoch": 0.8734104915075905, + "flos": 23364344217600.0, + "grad_norm": 2.126615018801638, + "language_loss": 0.6124419, + "learning_rate": 1.6568535257391326e-07, + "loss": 0.63345206, + "num_input_tokens_seen": 313210925, + "step": 14527, + "time_per_iteration": 2.93717098236084 + }, + { + "auxiliary_loss_clip": 0.01102101, + "auxiliary_loss_mlp": 0.01036741, + "balance_loss_clip": 1.04113436, + "balance_loss_mlp": 1.02263236, + "epoch": 0.8734706147602586, + "flos": 17712292464000.0, + "grad_norm": 1.9327506841110211, + "language_loss": 0.65617096, + "learning_rate": 1.6553017700606265e-07, + "loss": 0.67755938, + "num_input_tokens_seen": 313228250, + "step": 14528, + "time_per_iteration": 2.5247788429260254 + }, + { + "auxiliary_loss_clip": 0.01080324, + "auxiliary_loss_mlp": 0.01027224, + "balance_loss_clip": 1.03828454, + "balance_loss_mlp": 1.01499307, + "epoch": 0.8735307380129265, + "flos": 22049367217920.0, + "grad_norm": 2.128650528943947, + "language_loss": 0.89494413, + "learning_rate": 1.6537507100149205e-07, + "loss": 0.91601956, + "num_input_tokens_seen": 313247880, + "step": 14529, + "time_per_iteration": 2.800915241241455 + }, + { + "auxiliary_loss_clip": 0.01085933, + "auxiliary_loss_mlp": 0.01031322, + "balance_loss_clip": 1.03527832, + "balance_loss_mlp": 1.01898432, + "epoch": 0.8735908612655945, + "flos": 25338425829120.0, + "grad_norm": 1.740049553302022, + "language_loss": 0.84358543, + "learning_rate": 1.6522003456608258e-07, + "loss": 0.8647579, + "num_input_tokens_seen": 313266790, + "step": 14530, + "time_per_iteration": 2.7246882915496826 + }, + { + "auxiliary_loss_clip": 0.01086126, + "auxiliary_loss_mlp": 0.01038129, + "balance_loss_clip": 1.03533483, + "balance_loss_mlp": 1.02629161, + "epoch": 0.8736509845182624, + "flos": 21540908246400.0, + "grad_norm": 2.065068159593715, + "language_loss": 0.74541724, + "learning_rate": 1.650650677057128e-07, + "loss": 0.7666598, + "num_input_tokens_seen": 313286805, + "step": 14531, + "time_per_iteration": 2.7866251468658447 + }, + { + "auxiliary_loss_clip": 0.01094848, + "auxiliary_loss_mlp": 0.0103322, + "balance_loss_clip": 1.0341115, + "balance_loss_mlp": 1.02093542, + "epoch": 0.8737111077709304, + "flos": 22017227523840.0, + "grad_norm": 2.6296616466434655, + "language_loss": 0.6131202, + "learning_rate": 1.6491017042625966e-07, + "loss": 0.6344009, + "num_input_tokens_seen": 313305415, + "step": 14532, + "time_per_iteration": 2.677741289138794 + }, + { + "auxiliary_loss_clip": 0.01018177, + "auxiliary_loss_mlp": 0.01004849, + "balance_loss_clip": 1.005548, + "balance_loss_mlp": 1.00377011, + "epoch": 0.8737712310235983, + "flos": 70066315912320.0, + "grad_norm": 0.9206045969458919, + "language_loss": 0.58650947, + "learning_rate": 1.6475534273359704e-07, + "loss": 0.60673976, + "num_input_tokens_seen": 313369940, + "step": 14533, + "time_per_iteration": 4.089330434799194 + }, + { + "auxiliary_loss_clip": 0.01079874, + "auxiliary_loss_mlp": 0.01032746, + "balance_loss_clip": 1.03403592, + "balance_loss_mlp": 1.02048564, + "epoch": 0.8738313542762663, + "flos": 28658331244800.0, + "grad_norm": 1.49408783242758, + "language_loss": 0.76831782, + "learning_rate": 1.646005846335954e-07, + "loss": 0.78944403, + "num_input_tokens_seen": 313390965, + "step": 14534, + "time_per_iteration": 2.702711582183838 + }, + { + "auxiliary_loss_clip": 0.0108079, + "auxiliary_loss_mlp": 0.01033546, + "balance_loss_clip": 1.03330386, + "balance_loss_mlp": 1.02107036, + "epoch": 0.8738914775289344, + "flos": 22346384780160.0, + "grad_norm": 1.7135543711038013, + "language_loss": 0.75193512, + "learning_rate": 1.6444589613212357e-07, + "loss": 0.77307844, + "num_input_tokens_seen": 313409680, + "step": 14535, + "time_per_iteration": 2.6537675857543945 + }, + { + "auxiliary_loss_clip": 0.01107851, + "auxiliary_loss_mlp": 0.01033358, + "balance_loss_clip": 1.03563666, + "balance_loss_mlp": 1.02093053, + "epoch": 0.8739516007816023, + "flos": 31759648444800.0, + "grad_norm": 2.0846644532444625, + "language_loss": 0.74546909, + "learning_rate": 1.64291277235048e-07, + "loss": 0.76688123, + "num_input_tokens_seen": 313431335, + "step": 14536, + "time_per_iteration": 2.6706697940826416 + }, + { + "auxiliary_loss_clip": 0.01087464, + "auxiliary_loss_mlp": 0.01031248, + "balance_loss_clip": 1.03460896, + "balance_loss_mlp": 1.01939237, + "epoch": 0.8740117240342703, + "flos": 21211715076480.0, + "grad_norm": 1.8068501761157092, + "language_loss": 0.63835013, + "learning_rate": 1.641367279482304e-07, + "loss": 0.65953726, + "num_input_tokens_seen": 313449225, + "step": 14537, + "time_per_iteration": 4.280652761459351 + }, + { + "auxiliary_loss_clip": 0.01094433, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.03392243, + "balance_loss_mlp": 1.01478267, + "epoch": 0.8740718472869382, + "flos": 25186666867200.0, + "grad_norm": 1.8076510907949124, + "language_loss": 0.57990402, + "learning_rate": 1.6398224827753216e-07, + "loss": 0.60112923, + "num_input_tokens_seen": 313467715, + "step": 14538, + "time_per_iteration": 4.291844844818115 + }, + { + "auxiliary_loss_clip": 0.01096418, + "auxiliary_loss_mlp": 0.01025884, + "balance_loss_clip": 1.03719354, + "balance_loss_mlp": 1.0136714, + "epoch": 0.8741319705396062, + "flos": 19500931134720.0, + "grad_norm": 1.7388451814310184, + "language_loss": 0.68716401, + "learning_rate": 1.6382783822881142e-07, + "loss": 0.70838702, + "num_input_tokens_seen": 313486805, + "step": 14539, + "time_per_iteration": 2.5990817546844482 + }, + { + "auxiliary_loss_clip": 0.01101524, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.03593516, + "balance_loss_mlp": 1.01815796, + "epoch": 0.8741920937922741, + "flos": 14100900180480.0, + "grad_norm": 2.0449241273671355, + "language_loss": 0.74361241, + "learning_rate": 1.6367349780792262e-07, + "loss": 0.76493561, + "num_input_tokens_seen": 313504880, + "step": 14540, + "time_per_iteration": 2.6135077476501465 + }, + { + "auxiliary_loss_clip": 0.01082066, + "auxiliary_loss_mlp": 0.0103892, + "balance_loss_clip": 1.03429246, + "balance_loss_mlp": 1.02535379, + "epoch": 0.8742522170449422, + "flos": 27709858667520.0, + "grad_norm": 2.2042306692212947, + "language_loss": 0.78727126, + "learning_rate": 1.635192270207193e-07, + "loss": 0.8084811, + "num_input_tokens_seen": 313524995, + "step": 14541, + "time_per_iteration": 5.828189849853516 + }, + { + "auxiliary_loss_clip": 0.01068115, + "auxiliary_loss_mlp": 0.01034781, + "balance_loss_clip": 1.03299069, + "balance_loss_mlp": 1.02049947, + "epoch": 0.8743123402976101, + "flos": 21142587352320.0, + "grad_norm": 2.5163397271017724, + "language_loss": 0.66620183, + "learning_rate": 1.6336502587305035e-07, + "loss": 0.68723083, + "num_input_tokens_seen": 313541740, + "step": 14542, + "time_per_iteration": 2.7577908039093018 + }, + { + "auxiliary_loss_clip": 0.01027438, + "auxiliary_loss_mlp": 0.0100168, + "balance_loss_clip": 1.00493681, + "balance_loss_mlp": 1.00071454, + "epoch": 0.8743724635502781, + "flos": 60870024351360.0, + "grad_norm": 0.7818261146678972, + "language_loss": 0.54485422, + "learning_rate": 1.632108943707642e-07, + "loss": 0.56514537, + "num_input_tokens_seen": 313593445, + "step": 14543, + "time_per_iteration": 2.908863067626953 + }, + { + "auxiliary_loss_clip": 0.01084752, + "auxiliary_loss_mlp": 0.01035429, + "balance_loss_clip": 1.0375371, + "balance_loss_mlp": 1.02258444, + "epoch": 0.874432586802946, + "flos": 28109292883200.0, + "grad_norm": 2.3839087640585457, + "language_loss": 0.69428027, + "learning_rate": 1.6305683251970458e-07, + "loss": 0.71548212, + "num_input_tokens_seen": 313615640, + "step": 14544, + "time_per_iteration": 2.6920766830444336 + }, + { + "auxiliary_loss_clip": 0.01064253, + "auxiliary_loss_mlp": 0.01028954, + "balance_loss_clip": 1.03769612, + "balance_loss_mlp": 1.01798081, + "epoch": 0.874492710055614, + "flos": 23550289948800.0, + "grad_norm": 1.7246009574285497, + "language_loss": 0.75945365, + "learning_rate": 1.62902840325714e-07, + "loss": 0.78038573, + "num_input_tokens_seen": 313635550, + "step": 14545, + "time_per_iteration": 2.7786312103271484 + }, + { + "auxiliary_loss_clip": 0.01097234, + "auxiliary_loss_mlp": 0.00771469, + "balance_loss_clip": 1.03498626, + "balance_loss_mlp": 1.00026131, + "epoch": 0.8745528333082819, + "flos": 40915647924480.0, + "grad_norm": 10.499099096665093, + "language_loss": 0.66618592, + "learning_rate": 1.6274891779463217e-07, + "loss": 0.68487293, + "num_input_tokens_seen": 313659275, + "step": 14546, + "time_per_iteration": 2.8346989154815674 + }, + { + "auxiliary_loss_clip": 0.01109602, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.03745484, + "balance_loss_mlp": 1.01785886, + "epoch": 0.87461295656095, + "flos": 23622901292160.0, + "grad_norm": 1.5789135583569807, + "language_loss": 0.7296229, + "learning_rate": 1.6259506493229536e-07, + "loss": 0.75102079, + "num_input_tokens_seen": 313680595, + "step": 14547, + "time_per_iteration": 2.659517526626587 + }, + { + "auxiliary_loss_clip": 0.01115124, + "auxiliary_loss_mlp": 0.01040105, + "balance_loss_clip": 1.03795385, + "balance_loss_mlp": 1.02661061, + "epoch": 0.874673079813618, + "flos": 38794116983040.0, + "grad_norm": 3.3678360175538087, + "language_loss": 0.69317234, + "learning_rate": 1.6244128174453752e-07, + "loss": 0.71472466, + "num_input_tokens_seen": 313699730, + "step": 14548, + "time_per_iteration": 2.754931926727295 + }, + { + "auxiliary_loss_clip": 0.01090989, + "auxiliary_loss_mlp": 0.01033861, + "balance_loss_clip": 1.03693557, + "balance_loss_mlp": 1.02118921, + "epoch": 0.8747332030662859, + "flos": 23696159080320.0, + "grad_norm": 2.005045026903121, + "language_loss": 0.70676434, + "learning_rate": 1.6228756823719093e-07, + "loss": 0.72801286, + "num_input_tokens_seen": 313720090, + "step": 14549, + "time_per_iteration": 2.8153107166290283 + }, + { + "auxiliary_loss_clip": 0.01101259, + "auxiliary_loss_mlp": 0.00772545, + "balance_loss_clip": 1.0357511, + "balance_loss_mlp": 1.00031376, + "epoch": 0.8747933263189539, + "flos": 24462456854400.0, + "grad_norm": 2.512472286488796, + "language_loss": 0.84052968, + "learning_rate": 1.6213392441608352e-07, + "loss": 0.85926771, + "num_input_tokens_seen": 313736795, + "step": 14550, + "time_per_iteration": 2.6691277027130127 + }, + { + "auxiliary_loss_clip": 0.01100072, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.03686762, + "balance_loss_mlp": 1.02883005, + "epoch": 0.8748534495716218, + "flos": 13809161917440.0, + "grad_norm": 1.6392278362685582, + "language_loss": 0.71681327, + "learning_rate": 1.6198035028704183e-07, + "loss": 0.7382248, + "num_input_tokens_seen": 313754820, + "step": 14551, + "time_per_iteration": 2.6196999549865723 + }, + { + "auxiliary_loss_clip": 0.01098688, + "auxiliary_loss_mlp": 0.00770542, + "balance_loss_clip": 1.03751254, + "balance_loss_mlp": 1.00018144, + "epoch": 0.8749135728242898, + "flos": 29862092759040.0, + "grad_norm": 5.521178940955395, + "language_loss": 0.64576298, + "learning_rate": 1.6182684585588934e-07, + "loss": 0.66445529, + "num_input_tokens_seen": 313775830, + "step": 14552, + "time_per_iteration": 2.7710392475128174 + }, + { + "auxiliary_loss_clip": 0.01078604, + "auxiliary_loss_mlp": 0.01028395, + "balance_loss_clip": 1.03420365, + "balance_loss_mlp": 1.01439333, + "epoch": 0.8749736960769577, + "flos": 24133479166080.0, + "grad_norm": 5.011357337141667, + "language_loss": 0.79550266, + "learning_rate": 1.616734111284479e-07, + "loss": 0.81657255, + "num_input_tokens_seen": 313795745, + "step": 14553, + "time_per_iteration": 2.7544870376586914 + }, + { + "auxiliary_loss_clip": 0.01093009, + "auxiliary_loss_mlp": 0.01033697, + "balance_loss_clip": 1.03364944, + "balance_loss_mlp": 1.02119756, + "epoch": 0.8750338193296258, + "flos": 17202540602880.0, + "grad_norm": 2.1871328119231337, + "language_loss": 0.70039916, + "learning_rate": 1.6152004611053416e-07, + "loss": 0.72166622, + "num_input_tokens_seen": 313813895, + "step": 14554, + "time_per_iteration": 2.5449023246765137 + }, + { + "auxiliary_loss_clip": 0.01091308, + "auxiliary_loss_mlp": 0.00770366, + "balance_loss_clip": 1.03953791, + "balance_loss_mlp": 1.00012708, + "epoch": 0.8750939425822937, + "flos": 23733218937600.0, + "grad_norm": 1.5371757112217883, + "language_loss": 0.83528662, + "learning_rate": 1.6136675080796457e-07, + "loss": 0.85390329, + "num_input_tokens_seen": 313834225, + "step": 14555, + "time_per_iteration": 2.712270498275757 + }, + { + "auxiliary_loss_clip": 0.01097341, + "auxiliary_loss_mlp": 0.01034286, + "balance_loss_clip": 1.03663278, + "balance_loss_mlp": 1.02133369, + "epoch": 0.8751540658349617, + "flos": 26541684552960.0, + "grad_norm": 1.5869522480564469, + "language_loss": 0.71009433, + "learning_rate": 1.6121352522655252e-07, + "loss": 0.73141062, + "num_input_tokens_seen": 313854430, + "step": 14556, + "time_per_iteration": 2.626359462738037 + }, + { + "auxiliary_loss_clip": 0.01093494, + "auxiliary_loss_mlp": 0.01036101, + "balance_loss_clip": 1.03601527, + "balance_loss_mlp": 1.02195692, + "epoch": 0.8752141890876296, + "flos": 19386806647680.0, + "grad_norm": 1.8472844895882763, + "language_loss": 0.76663041, + "learning_rate": 1.6106036937210732e-07, + "loss": 0.78792638, + "num_input_tokens_seen": 313871600, + "step": 14557, + "time_per_iteration": 2.7687621116638184 + }, + { + "auxiliary_loss_clip": 0.01072231, + "auxiliary_loss_mlp": 0.01039476, + "balance_loss_clip": 1.03658962, + "balance_loss_mlp": 1.02650011, + "epoch": 0.8752743123402976, + "flos": 25374408278400.0, + "grad_norm": 1.8980752716365015, + "language_loss": 0.83232927, + "learning_rate": 1.6090728325043767e-07, + "loss": 0.85344636, + "num_input_tokens_seen": 313891570, + "step": 14558, + "time_per_iteration": 2.7216644287109375 + }, + { + "auxiliary_loss_clip": 0.01027546, + "auxiliary_loss_mlp": 0.01003435, + "balance_loss_clip": 1.00482631, + "balance_loss_mlp": 1.00239205, + "epoch": 0.8753344355929655, + "flos": 59952398578560.0, + "grad_norm": 0.8156616177259552, + "language_loss": 0.56093448, + "learning_rate": 1.6075426686734784e-07, + "loss": 0.58124429, + "num_input_tokens_seen": 313951290, + "step": 14559, + "time_per_iteration": 3.1608095169067383 + }, + { + "auxiliary_loss_clip": 0.01099027, + "auxiliary_loss_mlp": 0.01035007, + "balance_loss_clip": 1.03775668, + "balance_loss_mlp": 1.02299678, + "epoch": 0.8753945588456336, + "flos": 17894646835200.0, + "grad_norm": 2.769429121490499, + "language_loss": 0.66112006, + "learning_rate": 1.606013202286407e-07, + "loss": 0.68246031, + "num_input_tokens_seen": 313968645, + "step": 14560, + "time_per_iteration": 2.62923526763916 + }, + { + "auxiliary_loss_clip": 0.011089, + "auxiliary_loss_mlp": 0.01030691, + "balance_loss_clip": 1.03712916, + "balance_loss_mlp": 1.0187583, + "epoch": 0.8754546820983016, + "flos": 30914885410560.0, + "grad_norm": 3.865819478454591, + "language_loss": 0.78949714, + "learning_rate": 1.6044844334011541e-07, + "loss": 0.810893, + "num_input_tokens_seen": 313987580, + "step": 14561, + "time_per_iteration": 2.6706154346466064 + }, + { + "auxiliary_loss_clip": 0.01109674, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.03582835, + "balance_loss_mlp": 1.01984262, + "epoch": 0.8755148053509695, + "flos": 20631075724800.0, + "grad_norm": 1.9781083362240712, + "language_loss": 0.77276206, + "learning_rate": 1.6029563620756982e-07, + "loss": 0.79419112, + "num_input_tokens_seen": 314004460, + "step": 14562, + "time_per_iteration": 2.5154237747192383 + }, + { + "auxiliary_loss_clip": 0.01103173, + "auxiliary_loss_mlp": 0.01027848, + "balance_loss_clip": 1.03530455, + "balance_loss_mlp": 1.0163027, + "epoch": 0.8755749286036375, + "flos": 34969739005440.0, + "grad_norm": 1.5352533116826146, + "language_loss": 0.71789098, + "learning_rate": 1.601428988367981e-07, + "loss": 0.73920125, + "num_input_tokens_seen": 314026855, + "step": 14563, + "time_per_iteration": 2.743906021118164 + }, + { + "auxiliary_loss_clip": 0.01114581, + "auxiliary_loss_mlp": 0.01034001, + "balance_loss_clip": 1.04004955, + "balance_loss_mlp": 1.0215075, + "epoch": 0.8756350518563054, + "flos": 18186456925440.0, + "grad_norm": 2.1781284121642304, + "language_loss": 0.65630162, + "learning_rate": 1.5999023123359235e-07, + "loss": 0.67778742, + "num_input_tokens_seen": 314042830, + "step": 14564, + "time_per_iteration": 2.601315498352051 + }, + { + "auxiliary_loss_clip": 0.01095159, + "auxiliary_loss_mlp": 0.01036435, + "balance_loss_clip": 1.03489327, + "balance_loss_mlp": 1.02443063, + "epoch": 0.8756951751089734, + "flos": 20084012611200.0, + "grad_norm": 1.7268939160144312, + "language_loss": 0.7091375, + "learning_rate": 1.598376334037408e-07, + "loss": 0.73045349, + "num_input_tokens_seen": 314062225, + "step": 14565, + "time_per_iteration": 2.67029070854187 + }, + { + "auxiliary_loss_clip": 0.01092949, + "auxiliary_loss_mlp": 0.01036021, + "balance_loss_clip": 1.03708506, + "balance_loss_mlp": 1.02246666, + "epoch": 0.8757552983616413, + "flos": 27525241739520.0, + "grad_norm": 1.5872462776777525, + "language_loss": 0.77823293, + "learning_rate": 1.5968510535303102e-07, + "loss": 0.79952264, + "num_input_tokens_seen": 314082325, + "step": 14566, + "time_per_iteration": 2.728349447250366 + }, + { + "auxiliary_loss_clip": 0.01087655, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.03929698, + "balance_loss_mlp": 1.01946163, + "epoch": 0.8758154216143094, + "flos": 18073014796800.0, + "grad_norm": 1.606606930203952, + "language_loss": 0.71347201, + "learning_rate": 1.5953264708724624e-07, + "loss": 0.73466635, + "num_input_tokens_seen": 314100310, + "step": 14567, + "time_per_iteration": 2.6560468673706055 + }, + { + "auxiliary_loss_clip": 0.01089483, + "auxiliary_loss_mlp": 0.00770872, + "balance_loss_clip": 1.0368377, + "balance_loss_mlp": 1.00015092, + "epoch": 0.8758755448669773, + "flos": 25045681985280.0, + "grad_norm": 1.924193327132232, + "language_loss": 0.74096954, + "learning_rate": 1.5938025861216776e-07, + "loss": 0.7595731, + "num_input_tokens_seen": 314121330, + "step": 14568, + "time_per_iteration": 2.669600248336792 + }, + { + "auxiliary_loss_clip": 0.0106924, + "auxiliary_loss_mlp": 0.01031283, + "balance_loss_clip": 1.03213978, + "balance_loss_mlp": 1.01898623, + "epoch": 0.8759356681196453, + "flos": 22856818999680.0, + "grad_norm": 2.753044994093851, + "language_loss": 0.86606205, + "learning_rate": 1.5922793993357475e-07, + "loss": 0.88706732, + "num_input_tokens_seen": 314139875, + "step": 14569, + "time_per_iteration": 2.7353930473327637 + }, + { + "auxiliary_loss_clip": 0.01069957, + "auxiliary_loss_mlp": 0.01032444, + "balance_loss_clip": 1.0342617, + "balance_loss_mlp": 1.02065396, + "epoch": 0.8759957913723132, + "flos": 21032521102080.0, + "grad_norm": 1.818630760471602, + "language_loss": 0.74142909, + "learning_rate": 1.5907569105724284e-07, + "loss": 0.76245314, + "num_input_tokens_seen": 314157850, + "step": 14570, + "time_per_iteration": 2.699028253555298 + }, + { + "auxiliary_loss_clip": 0.01100775, + "auxiliary_loss_mlp": 0.00770915, + "balance_loss_clip": 1.03732276, + "balance_loss_mlp": 1.00026119, + "epoch": 0.8760559146249812, + "flos": 20010467514240.0, + "grad_norm": 1.5893457614137378, + "language_loss": 0.67510492, + "learning_rate": 1.5892351198894472e-07, + "loss": 0.69382179, + "num_input_tokens_seen": 314176720, + "step": 14571, + "time_per_iteration": 2.617493152618408 + }, + { + "auxiliary_loss_clip": 0.01069948, + "auxiliary_loss_mlp": 0.01027892, + "balance_loss_clip": 1.03497171, + "balance_loss_mlp": 1.01635253, + "epoch": 0.8761160378776491, + "flos": 19974161842560.0, + "grad_norm": 2.1723550236606486, + "language_loss": 0.62609088, + "learning_rate": 1.5877140273445156e-07, + "loss": 0.64706922, + "num_input_tokens_seen": 314196645, + "step": 14572, + "time_per_iteration": 2.80468487739563 + }, + { + "auxiliary_loss_clip": 0.01095539, + "auxiliary_loss_mlp": 0.01029157, + "balance_loss_clip": 1.03618896, + "balance_loss_mlp": 1.01790953, + "epoch": 0.8761761611303172, + "flos": 28804415857920.0, + "grad_norm": 1.6603874349444352, + "language_loss": 0.73751938, + "learning_rate": 1.5861936329953162e-07, + "loss": 0.75876629, + "num_input_tokens_seen": 314217430, + "step": 14573, + "time_per_iteration": 2.8996636867523193 + }, + { + "auxiliary_loss_clip": 0.01058502, + "auxiliary_loss_mlp": 0.0076881, + "balance_loss_clip": 1.03545105, + "balance_loss_mlp": 1.00015557, + "epoch": 0.8762362843829851, + "flos": 18332505624960.0, + "grad_norm": 1.9024608944750214, + "language_loss": 0.72550857, + "learning_rate": 1.5846739368994966e-07, + "loss": 0.74378169, + "num_input_tokens_seen": 314235310, + "step": 14574, + "time_per_iteration": 2.7545413970947266 + }, + { + "auxiliary_loss_clip": 0.01095926, + "auxiliary_loss_mlp": 0.01036229, + "balance_loss_clip": 1.03621411, + "balance_loss_mlp": 1.02418888, + "epoch": 0.8762964076356531, + "flos": 15779149378560.0, + "grad_norm": 1.8502793872644558, + "language_loss": 0.76065028, + "learning_rate": 1.5831549391146903e-07, + "loss": 0.78197181, + "num_input_tokens_seen": 314252355, + "step": 14575, + "time_per_iteration": 2.5257208347320557 + }, + { + "auxiliary_loss_clip": 0.01081299, + "auxiliary_loss_mlp": 0.01038665, + "balance_loss_clip": 1.03579473, + "balance_loss_mlp": 1.02677417, + "epoch": 0.8763565308883211, + "flos": 33176754789120.0, + "grad_norm": 1.9305081146362895, + "language_loss": 0.66477948, + "learning_rate": 1.5816366396984916e-07, + "loss": 0.68597913, + "num_input_tokens_seen": 314272755, + "step": 14576, + "time_per_iteration": 2.7134413719177246 + }, + { + "auxiliary_loss_clip": 0.01078146, + "auxiliary_loss_mlp": 0.01034504, + "balance_loss_clip": 1.03182101, + "balance_loss_mlp": 1.02249372, + "epoch": 0.876416654140989, + "flos": 15888102307200.0, + "grad_norm": 2.865595040791599, + "language_loss": 0.668244, + "learning_rate": 1.5801190387084806e-07, + "loss": 0.68937051, + "num_input_tokens_seen": 314291365, + "step": 14577, + "time_per_iteration": 5.730209589004517 + }, + { + "auxiliary_loss_clip": 0.01099421, + "auxiliary_loss_mlp": 0.01031703, + "balance_loss_clip": 1.03849435, + "balance_loss_mlp": 1.01906085, + "epoch": 0.876476777393657, + "flos": 25885237547520.0, + "grad_norm": 2.277451139554719, + "language_loss": 0.71319246, + "learning_rate": 1.5786021362021962e-07, + "loss": 0.73450363, + "num_input_tokens_seen": 314310075, + "step": 14578, + "time_per_iteration": 2.6785285472869873 + }, + { + "auxiliary_loss_clip": 0.01110348, + "auxiliary_loss_mlp": 0.01034332, + "balance_loss_clip": 1.03671813, + "balance_loss_mlp": 1.02167737, + "epoch": 0.876536900646325, + "flos": 13589675861760.0, + "grad_norm": 2.477066541201799, + "language_loss": 0.7168777, + "learning_rate": 1.5770859322371676e-07, + "loss": 0.73832452, + "num_input_tokens_seen": 314325695, + "step": 14579, + "time_per_iteration": 4.083740472793579 + }, + { + "auxiliary_loss_clip": 0.01075998, + "auxiliary_loss_mlp": 0.01036896, + "balance_loss_clip": 1.0316453, + "balance_loss_mlp": 1.02358592, + "epoch": 0.876597023898993, + "flos": 12203344494720.0, + "grad_norm": 1.7087182635635378, + "language_loss": 0.70119214, + "learning_rate": 1.5755704268708912e-07, + "loss": 0.72232103, + "num_input_tokens_seen": 314343605, + "step": 14580, + "time_per_iteration": 4.30855393409729 + }, + { + "auxiliary_loss_clip": 0.01105953, + "auxiliary_loss_mlp": 0.00769599, + "balance_loss_clip": 1.03692436, + "balance_loss_mlp": 1.00017405, + "epoch": 0.8766571471516609, + "flos": 25336773803520.0, + "grad_norm": 1.6590992493321417, + "language_loss": 0.65825737, + "learning_rate": 1.5740556201608256e-07, + "loss": 0.67701292, + "num_input_tokens_seen": 314364275, + "step": 14581, + "time_per_iteration": 2.6293153762817383 + }, + { + "auxiliary_loss_clip": 0.0108123, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.03592646, + "balance_loss_mlp": 1.0222156, + "epoch": 0.8767172704043289, + "flos": 30113287545600.0, + "grad_norm": 1.6719712227937835, + "language_loss": 0.7391513, + "learning_rate": 1.572541512164416e-07, + "loss": 0.76030058, + "num_input_tokens_seen": 314385140, + "step": 14582, + "time_per_iteration": 2.8180127143859863 + }, + { + "auxiliary_loss_clip": 0.01106807, + "auxiliary_loss_mlp": 0.00770216, + "balance_loss_clip": 1.03510261, + "balance_loss_mlp": 1.00013459, + "epoch": 0.8767773936569968, + "flos": 19281157770240.0, + "grad_norm": 1.8898145602887721, + "language_loss": 0.66737789, + "learning_rate": 1.5710281029390826e-07, + "loss": 0.68614811, + "num_input_tokens_seen": 314403715, + "step": 14583, + "time_per_iteration": 2.68875789642334 + }, + { + "auxiliary_loss_clip": 0.011013, + "auxiliary_loss_mlp": 0.00770347, + "balance_loss_clip": 1.03735173, + "balance_loss_mlp": 1.00024498, + "epoch": 0.8768375169096648, + "flos": 21247230648960.0, + "grad_norm": 1.7289325254896564, + "language_loss": 0.7945081, + "learning_rate": 1.5695153925422067e-07, + "loss": 0.81322455, + "num_input_tokens_seen": 314421880, + "step": 14584, + "time_per_iteration": 2.6574294567108154 + }, + { + "auxiliary_loss_clip": 0.01078304, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.03573895, + "balance_loss_mlp": 1.01824355, + "epoch": 0.8768976401623327, + "flos": 23295539715840.0, + "grad_norm": 2.4589506169652147, + "language_loss": 0.72250307, + "learning_rate": 1.5680033810311555e-07, + "loss": 0.74359208, + "num_input_tokens_seen": 314441585, + "step": 14585, + "time_per_iteration": 2.755363702774048 + }, + { + "auxiliary_loss_clip": 0.01087385, + "auxiliary_loss_mlp": 0.01030085, + "balance_loss_clip": 1.03488159, + "balance_loss_mlp": 1.01720476, + "epoch": 0.8769577634150008, + "flos": 21361247395200.0, + "grad_norm": 1.8198293013572575, + "language_loss": 0.74285269, + "learning_rate": 1.5664920684632654e-07, + "loss": 0.76402736, + "num_input_tokens_seen": 314459020, + "step": 14586, + "time_per_iteration": 2.7154970169067383 + }, + { + "auxiliary_loss_clip": 0.01107064, + "auxiliary_loss_mlp": 0.01030626, + "balance_loss_clip": 1.0354507, + "balance_loss_mlp": 1.0183115, + "epoch": 0.8770178866676687, + "flos": 23514056104320.0, + "grad_norm": 1.7048029370407318, + "language_loss": 0.78917754, + "learning_rate": 1.564981454895844e-07, + "loss": 0.81055439, + "num_input_tokens_seen": 314478935, + "step": 14587, + "time_per_iteration": 2.659623384475708 + }, + { + "auxiliary_loss_clip": 0.010977, + "auxiliary_loss_mlp": 0.01033091, + "balance_loss_clip": 1.0367806, + "balance_loss_mlp": 1.01905441, + "epoch": 0.8770780099203367, + "flos": 19719052473600.0, + "grad_norm": 1.5723021986517474, + "language_loss": 0.73480511, + "learning_rate": 1.5634715403861697e-07, + "loss": 0.75611293, + "num_input_tokens_seen": 314497635, + "step": 14588, + "time_per_iteration": 2.6490304470062256 + }, + { + "auxiliary_loss_clip": 0.01042159, + "auxiliary_loss_mlp": 0.0077056, + "balance_loss_clip": 1.03166127, + "balance_loss_mlp": 1.00015152, + "epoch": 0.8771381331730047, + "flos": 21395901041280.0, + "grad_norm": 1.8014093436247518, + "language_loss": 0.66976607, + "learning_rate": 1.5619623249915016e-07, + "loss": 0.68789327, + "num_input_tokens_seen": 314515445, + "step": 14589, + "time_per_iteration": 2.7724153995513916 + }, + { + "auxiliary_loss_clip": 0.01098134, + "auxiliary_loss_mlp": 0.0103223, + "balance_loss_clip": 1.03780675, + "balance_loss_mlp": 1.01989174, + "epoch": 0.8771982564256726, + "flos": 20261770041600.0, + "grad_norm": 2.6258083956029776, + "language_loss": 0.70362616, + "learning_rate": 1.5604538087690732e-07, + "loss": 0.72492981, + "num_input_tokens_seen": 314533040, + "step": 14590, + "time_per_iteration": 2.6688060760498047 + }, + { + "auxiliary_loss_clip": 0.01085853, + "auxiliary_loss_mlp": 0.01041073, + "balance_loss_clip": 1.03592718, + "balance_loss_mlp": 1.0268271, + "epoch": 0.8772583796783406, + "flos": 12489372495360.0, + "grad_norm": 2.102125056445036, + "language_loss": 0.74291348, + "learning_rate": 1.558945991776086e-07, + "loss": 0.76418269, + "num_input_tokens_seen": 314548280, + "step": 14591, + "time_per_iteration": 2.644625425338745 + }, + { + "auxiliary_loss_clip": 0.01104605, + "auxiliary_loss_mlp": 0.01027114, + "balance_loss_clip": 1.03682637, + "balance_loss_mlp": 1.01522839, + "epoch": 0.8773185029310085, + "flos": 15921103927680.0, + "grad_norm": 1.6170050772781672, + "language_loss": 0.79845113, + "learning_rate": 1.5574388740697096e-07, + "loss": 0.81976831, + "num_input_tokens_seen": 314565345, + "step": 14592, + "time_per_iteration": 2.604241132736206 + }, + { + "auxiliary_loss_clip": 0.01106487, + "auxiliary_loss_mlp": 0.0103183, + "balance_loss_clip": 1.03708172, + "balance_loss_mlp": 1.02006376, + "epoch": 0.8773786261836766, + "flos": 21504530747520.0, + "grad_norm": 1.5930198030485112, + "language_loss": 0.82747221, + "learning_rate": 1.5559324557071052e-07, + "loss": 0.84885532, + "num_input_tokens_seen": 314584190, + "step": 14593, + "time_per_iteration": 2.5794694423675537 + }, + { + "auxiliary_loss_clip": 0.0109194, + "auxiliary_loss_mlp": 0.01028175, + "balance_loss_clip": 1.03585052, + "balance_loss_mlp": 1.0158962, + "epoch": 0.8774387494363445, + "flos": 26761493831040.0, + "grad_norm": 1.3623421288990831, + "language_loss": 0.76057625, + "learning_rate": 1.5544267367453845e-07, + "loss": 0.78177738, + "num_input_tokens_seen": 314605625, + "step": 14594, + "time_per_iteration": 2.66890025138855 + }, + { + "auxiliary_loss_clip": 0.01057614, + "auxiliary_loss_mlp": 0.01040841, + "balance_loss_clip": 1.03001809, + "balance_loss_mlp": 1.02620232, + "epoch": 0.8774988726890125, + "flos": 18478841633280.0, + "grad_norm": 2.047444074315711, + "language_loss": 0.77807617, + "learning_rate": 1.552921717241651e-07, + "loss": 0.7990607, + "num_input_tokens_seen": 314622630, + "step": 14595, + "time_per_iteration": 2.8318984508514404 + }, + { + "auxiliary_loss_clip": 0.01075529, + "auxiliary_loss_mlp": 0.01033164, + "balance_loss_clip": 1.03548956, + "balance_loss_mlp": 1.02087939, + "epoch": 0.8775589959416804, + "flos": 24426366664320.0, + "grad_norm": 1.536994143649266, + "language_loss": 0.70930111, + "learning_rate": 1.5514173972529743e-07, + "loss": 0.7303881, + "num_input_tokens_seen": 314642460, + "step": 14596, + "time_per_iteration": 2.7869088649749756 + }, + { + "auxiliary_loss_clip": 0.01074468, + "auxiliary_loss_mlp": 0.01024808, + "balance_loss_clip": 1.03594506, + "balance_loss_mlp": 1.01340532, + "epoch": 0.8776191191943484, + "flos": 23440151871360.0, + "grad_norm": 1.7123074266942537, + "language_loss": 0.85920203, + "learning_rate": 1.5499137768364067e-07, + "loss": 0.88019478, + "num_input_tokens_seen": 314659875, + "step": 14597, + "time_per_iteration": 2.741469383239746 + }, + { + "auxiliary_loss_clip": 0.01095944, + "auxiliary_loss_mlp": 0.01030022, + "balance_loss_clip": 1.0365026, + "balance_loss_mlp": 1.01824403, + "epoch": 0.8776792424470163, + "flos": 26830872950400.0, + "grad_norm": 1.6418502548107807, + "language_loss": 0.72893673, + "learning_rate": 1.5484108560489494e-07, + "loss": 0.7501964, + "num_input_tokens_seen": 314680260, + "step": 14598, + "time_per_iteration": 2.679743766784668 + }, + { + "auxiliary_loss_clip": 0.01093166, + "auxiliary_loss_mlp": 0.00771018, + "balance_loss_clip": 1.03571749, + "balance_loss_mlp": 1.00025177, + "epoch": 0.8777393656996844, + "flos": 15626169354240.0, + "grad_norm": 2.1344366739736915, + "language_loss": 0.77418303, + "learning_rate": 1.5469086349476036e-07, + "loss": 0.79282486, + "num_input_tokens_seen": 314696260, + "step": 14599, + "time_per_iteration": 2.645653486251831 + }, + { + "auxiliary_loss_clip": 0.01077971, + "auxiliary_loss_mlp": 0.01030468, + "balance_loss_clip": 1.03576493, + "balance_loss_mlp": 1.0187732, + "epoch": 0.8777994889523523, + "flos": 18879999701760.0, + "grad_norm": 2.045708434317711, + "language_loss": 0.67680991, + "learning_rate": 1.545407113589332e-07, + "loss": 0.69789433, + "num_input_tokens_seen": 314714215, + "step": 14600, + "time_per_iteration": 2.67521333694458 + }, + { + "auxiliary_loss_clip": 0.01098236, + "auxiliary_loss_mlp": 0.01040238, + "balance_loss_clip": 1.03573418, + "balance_loss_mlp": 1.02782202, + "epoch": 0.8778596122050203, + "flos": 48826516400640.0, + "grad_norm": 1.696137650912348, + "language_loss": 0.69482052, + "learning_rate": 1.543906292031072e-07, + "loss": 0.71620524, + "num_input_tokens_seen": 314735700, + "step": 14601, + "time_per_iteration": 2.852067708969116 + }, + { + "auxiliary_loss_clip": 0.01102467, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.03806257, + "balance_loss_mlp": 1.0211978, + "epoch": 0.8779197354576883, + "flos": 25660184883840.0, + "grad_norm": 1.8150997518302137, + "language_loss": 0.72907132, + "learning_rate": 1.542406170329733e-07, + "loss": 0.75043446, + "num_input_tokens_seen": 314753335, + "step": 14602, + "time_per_iteration": 2.666530132293701 + }, + { + "auxiliary_loss_clip": 0.01106598, + "auxiliary_loss_mlp": 0.01033593, + "balance_loss_clip": 1.03583145, + "balance_loss_mlp": 1.02214909, + "epoch": 0.8779798587103562, + "flos": 18843227153280.0, + "grad_norm": 2.0286896900141103, + "language_loss": 0.70824677, + "learning_rate": 1.5409067485422056e-07, + "loss": 0.72964865, + "num_input_tokens_seen": 314770800, + "step": 14603, + "time_per_iteration": 2.6004815101623535 + }, + { + "auxiliary_loss_clip": 0.01011292, + "auxiliary_loss_mlp": 0.01001925, + "balance_loss_clip": 1.00817752, + "balance_loss_mlp": 1.00094128, + "epoch": 0.8780399819630242, + "flos": 68613119377920.0, + "grad_norm": 0.7394752492120941, + "language_loss": 0.54153609, + "learning_rate": 1.539408026725344e-07, + "loss": 0.56166828, + "num_input_tokens_seen": 314837275, + "step": 14604, + "time_per_iteration": 3.240145683288574 + }, + { + "auxiliary_loss_clip": 0.01001285, + "auxiliary_loss_mlp": 0.01016546, + "balance_loss_clip": 1.00654078, + "balance_loss_mlp": 1.01528251, + "epoch": 0.8781001052156922, + "flos": 65734807766400.0, + "grad_norm": 0.7095982216693757, + "language_loss": 0.59140944, + "learning_rate": 1.537910004935976e-07, + "loss": 0.61158776, + "num_input_tokens_seen": 314902220, + "step": 14605, + "time_per_iteration": 3.193176507949829 + }, + { + "auxiliary_loss_clip": 0.01068364, + "auxiliary_loss_mlp": 0.01034464, + "balance_loss_clip": 1.03649735, + "balance_loss_mlp": 1.02195311, + "epoch": 0.8781602284683602, + "flos": 22049654526720.0, + "grad_norm": 1.640207436482767, + "language_loss": 0.85104489, + "learning_rate": 1.536412683230912e-07, + "loss": 0.87207323, + "num_input_tokens_seen": 314921645, + "step": 14606, + "time_per_iteration": 2.7456490993499756 + }, + { + "auxiliary_loss_clip": 0.01111634, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.03835487, + "balance_loss_mlp": 1.01814675, + "epoch": 0.8782203517210281, + "flos": 17562939713280.0, + "grad_norm": 2.106006869176157, + "language_loss": 0.70568335, + "learning_rate": 1.534916061666931e-07, + "loss": 0.72711432, + "num_input_tokens_seen": 314939390, + "step": 14607, + "time_per_iteration": 2.5896804332733154 + }, + { + "auxiliary_loss_clip": 0.01086458, + "auxiliary_loss_mlp": 0.01041468, + "balance_loss_clip": 1.03582692, + "balance_loss_mlp": 1.03008974, + "epoch": 0.8782804749736961, + "flos": 25520421064320.0, + "grad_norm": 1.8237368142749963, + "language_loss": 0.72306776, + "learning_rate": 1.533420140300785e-07, + "loss": 0.74434698, + "num_input_tokens_seen": 314959205, + "step": 14608, + "time_per_iteration": 2.741672992706299 + }, + { + "auxiliary_loss_clip": 0.01099239, + "auxiliary_loss_mlp": 0.01037651, + "balance_loss_clip": 1.03510618, + "balance_loss_mlp": 1.0248003, + "epoch": 0.878340598226364, + "flos": 21798747048960.0, + "grad_norm": 1.955140517106729, + "language_loss": 0.87650675, + "learning_rate": 1.5319249191891936e-07, + "loss": 0.89787567, + "num_input_tokens_seen": 314977485, + "step": 14609, + "time_per_iteration": 2.7044589519500732 + }, + { + "auxiliary_loss_clip": 0.01064019, + "auxiliary_loss_mlp": 0.01031489, + "balance_loss_clip": 1.03733373, + "balance_loss_mlp": 1.01938868, + "epoch": 0.878400721479032, + "flos": 21102403011840.0, + "grad_norm": 1.6056637569887062, + "language_loss": 0.70521188, + "learning_rate": 1.5304303983888643e-07, + "loss": 0.72616696, + "num_input_tokens_seen": 314997830, + "step": 14610, + "time_per_iteration": 2.803408145904541 + }, + { + "auxiliary_loss_clip": 0.01090443, + "auxiliary_loss_mlp": 0.00770344, + "balance_loss_clip": 1.03708553, + "balance_loss_mlp": 1.0002346, + "epoch": 0.8784608447316999, + "flos": 20923532259840.0, + "grad_norm": 5.657745869325684, + "language_loss": 0.80772901, + "learning_rate": 1.5289365779564612e-07, + "loss": 0.82633686, + "num_input_tokens_seen": 315016480, + "step": 14611, + "time_per_iteration": 2.660065174102783 + }, + { + "auxiliary_loss_clip": 0.01108968, + "auxiliary_loss_mlp": 0.01032438, + "balance_loss_clip": 1.03722143, + "balance_loss_mlp": 1.01999247, + "epoch": 0.878520967984368, + "flos": 23330660238720.0, + "grad_norm": 1.533059433053689, + "language_loss": 0.76187742, + "learning_rate": 1.5274434579486338e-07, + "loss": 0.78329152, + "num_input_tokens_seen": 315036135, + "step": 14612, + "time_per_iteration": 2.6014697551727295 + }, + { + "auxiliary_loss_clip": 0.01056207, + "auxiliary_loss_mlp": 0.010328, + "balance_loss_clip": 1.03447223, + "balance_loss_mlp": 1.02104592, + "epoch": 0.8785810912370359, + "flos": 25518984520320.0, + "grad_norm": 1.467098748610033, + "language_loss": 0.72364843, + "learning_rate": 1.525951038422002e-07, + "loss": 0.74453855, + "num_input_tokens_seen": 315057995, + "step": 14613, + "time_per_iteration": 2.865140676498413 + }, + { + "auxiliary_loss_clip": 0.0100752, + "auxiliary_loss_mlp": 0.01000964, + "balance_loss_clip": 1.01305175, + "balance_loss_mlp": 0.9998787, + "epoch": 0.8786412144897039, + "flos": 61841047691520.0, + "grad_norm": 1.0274738596365884, + "language_loss": 0.64512694, + "learning_rate": 1.5244593194331667e-07, + "loss": 0.6652118, + "num_input_tokens_seen": 315104010, + "step": 14614, + "time_per_iteration": 3.0442123413085938 + }, + { + "auxiliary_loss_clip": 0.01027601, + "auxiliary_loss_mlp": 0.01004471, + "balance_loss_clip": 1.00515628, + "balance_loss_mlp": 1.00352311, + "epoch": 0.8787013377423719, + "flos": 70989364638720.0, + "grad_norm": 0.6570239019291962, + "language_loss": 0.58545709, + "learning_rate": 1.5229683010386762e-07, + "loss": 0.6057778, + "num_input_tokens_seen": 315174550, + "step": 14615, + "time_per_iteration": 3.2077083587646484 + }, + { + "auxiliary_loss_clip": 0.01059951, + "auxiliary_loss_mlp": 0.01028954, + "balance_loss_clip": 1.03297782, + "balance_loss_mlp": 1.01675916, + "epoch": 0.8787614609950398, + "flos": 17347404153600.0, + "grad_norm": 3.0128650645072503, + "language_loss": 0.72307193, + "learning_rate": 1.5214779832950807e-07, + "loss": 0.74396092, + "num_input_tokens_seen": 315191825, + "step": 14616, + "time_per_iteration": 4.491776704788208 + }, + { + "auxiliary_loss_clip": 0.01028184, + "auxiliary_loss_mlp": 0.01002892, + "balance_loss_clip": 1.00566876, + "balance_loss_mlp": 1.00189614, + "epoch": 0.8788215842477078, + "flos": 72511401588480.0, + "grad_norm": 0.8039335760257915, + "language_loss": 0.5797807, + "learning_rate": 1.5199883662588953e-07, + "loss": 0.60009146, + "num_input_tokens_seen": 315255075, + "step": 14617, + "time_per_iteration": 3.238430976867676 + }, + { + "auxiliary_loss_clip": 0.01081125, + "auxiliary_loss_mlp": 0.01037319, + "balance_loss_clip": 1.03397489, + "balance_loss_mlp": 1.02404451, + "epoch": 0.8788817075003758, + "flos": 24827452905600.0, + "grad_norm": 1.7430695626814152, + "language_loss": 0.83371663, + "learning_rate": 1.5184994499865987e-07, + "loss": 0.85490113, + "num_input_tokens_seen": 315273995, + "step": 14618, + "time_per_iteration": 2.6718552112579346 + }, + { + "auxiliary_loss_clip": 0.01081904, + "auxiliary_loss_mlp": 0.01028474, + "balance_loss_clip": 1.03612018, + "balance_loss_mlp": 1.0165534, + "epoch": 0.8789418307530438, + "flos": 22638769488000.0, + "grad_norm": 1.5514378708700263, + "language_loss": 0.69016528, + "learning_rate": 1.5170112345346598e-07, + "loss": 0.71126908, + "num_input_tokens_seen": 315294485, + "step": 14619, + "time_per_iteration": 5.7080559730529785 + }, + { + "auxiliary_loss_clip": 0.01067003, + "auxiliary_loss_mlp": 0.01037225, + "balance_loss_clip": 1.03445745, + "balance_loss_mlp": 1.02513099, + "epoch": 0.8790019540057117, + "flos": 19785738072960.0, + "grad_norm": 3.788287535500063, + "language_loss": 0.77142107, + "learning_rate": 1.5155237199595016e-07, + "loss": 0.79246336, + "num_input_tokens_seen": 315310420, + "step": 14620, + "time_per_iteration": 2.7002434730529785 + }, + { + "auxiliary_loss_clip": 0.0108692, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.03867853, + "balance_loss_mlp": 1.01823735, + "epoch": 0.8790620772583797, + "flos": 20229774001920.0, + "grad_norm": 1.6265722719383797, + "language_loss": 0.79121077, + "learning_rate": 1.514036906317542e-07, + "loss": 0.81239492, + "num_input_tokens_seen": 315330110, + "step": 14621, + "time_per_iteration": 2.706190824508667 + }, + { + "auxiliary_loss_clip": 0.01088315, + "auxiliary_loss_mlp": 0.01033697, + "balance_loss_clip": 1.03491962, + "balance_loss_mlp": 1.02115011, + "epoch": 0.8791222005110476, + "flos": 24130785646080.0, + "grad_norm": 1.9310015183922709, + "language_loss": 0.66529369, + "learning_rate": 1.5125507936651506e-07, + "loss": 0.68651378, + "num_input_tokens_seen": 315350080, + "step": 14622, + "time_per_iteration": 2.7165491580963135 + }, + { + "auxiliary_loss_clip": 0.01082524, + "auxiliary_loss_mlp": 0.01036818, + "balance_loss_clip": 1.03749692, + "balance_loss_mlp": 1.02424717, + "epoch": 0.8791823237637156, + "flos": 21614201948160.0, + "grad_norm": 1.9313985868431403, + "language_loss": 0.72802383, + "learning_rate": 1.511065382058687e-07, + "loss": 0.74921727, + "num_input_tokens_seen": 315366360, + "step": 14623, + "time_per_iteration": 2.747246026992798 + }, + { + "auxiliary_loss_clip": 0.01055452, + "auxiliary_loss_mlp": 0.01032326, + "balance_loss_clip": 1.03043795, + "balance_loss_mlp": 1.02029753, + "epoch": 0.8792424470163835, + "flos": 24243401761920.0, + "grad_norm": 1.9152762624748565, + "language_loss": 0.78623891, + "learning_rate": 1.5095806715544801e-07, + "loss": 0.80711675, + "num_input_tokens_seen": 315385890, + "step": 14624, + "time_per_iteration": 2.8343048095703125 + }, + { + "auxiliary_loss_clip": 0.01097982, + "auxiliary_loss_mlp": 0.01037468, + "balance_loss_clip": 1.03468323, + "balance_loss_mlp": 1.02431333, + "epoch": 0.8793025702690516, + "flos": 24893204751360.0, + "grad_norm": 1.7619469810650616, + "language_loss": 0.79745495, + "learning_rate": 1.5080966622088265e-07, + "loss": 0.81880945, + "num_input_tokens_seen": 315403400, + "step": 14625, + "time_per_iteration": 2.660099983215332 + }, + { + "auxiliary_loss_clip": 0.01083648, + "auxiliary_loss_mlp": 0.01039128, + "balance_loss_clip": 1.03540492, + "balance_loss_mlp": 1.02714157, + "epoch": 0.8793626935217195, + "flos": 25373115388800.0, + "grad_norm": 1.6785159518142898, + "language_loss": 0.74372435, + "learning_rate": 1.5066133540779967e-07, + "loss": 0.76495212, + "num_input_tokens_seen": 315423670, + "step": 14626, + "time_per_iteration": 2.676588535308838 + }, + { + "auxiliary_loss_clip": 0.01098546, + "auxiliary_loss_mlp": 0.01032578, + "balance_loss_clip": 1.03614759, + "balance_loss_mlp": 1.019876, + "epoch": 0.8794228167743875, + "flos": 34678000742400.0, + "grad_norm": 3.563179851520993, + "language_loss": 0.71319157, + "learning_rate": 1.505130747218246e-07, + "loss": 0.73450279, + "num_input_tokens_seen": 315446265, + "step": 14627, + "time_per_iteration": 2.7192656993865967 + }, + { + "auxiliary_loss_clip": 0.01081037, + "auxiliary_loss_mlp": 0.01032231, + "balance_loss_clip": 1.04066432, + "balance_loss_mlp": 1.01920116, + "epoch": 0.8794829400270555, + "flos": 19464014931840.0, + "grad_norm": 1.8203006608438008, + "language_loss": 0.72041732, + "learning_rate": 1.5036488416857873e-07, + "loss": 0.74155003, + "num_input_tokens_seen": 315464655, + "step": 14628, + "time_per_iteration": 2.6673803329467773 + }, + { + "auxiliary_loss_clip": 0.01077339, + "auxiliary_loss_mlp": 0.01034101, + "balance_loss_clip": 1.03383875, + "balance_loss_mlp": 1.02086902, + "epoch": 0.8795430632797234, + "flos": 15231403906560.0, + "grad_norm": 2.5577360809446312, + "language_loss": 0.69041932, + "learning_rate": 1.5021676375368175e-07, + "loss": 0.71153378, + "num_input_tokens_seen": 315481090, + "step": 14629, + "time_per_iteration": 2.6587491035461426 + }, + { + "auxiliary_loss_clip": 0.01082842, + "auxiliary_loss_mlp": 0.0103309, + "balance_loss_clip": 1.03334451, + "balance_loss_mlp": 1.02162218, + "epoch": 0.8796031865323914, + "flos": 27744727795200.0, + "grad_norm": 1.5244181147754692, + "language_loss": 0.68586159, + "learning_rate": 1.5006871348275053e-07, + "loss": 0.70702088, + "num_input_tokens_seen": 315502010, + "step": 14630, + "time_per_iteration": 2.6706295013427734 + }, + { + "auxiliary_loss_clip": 0.01081928, + "auxiliary_loss_mlp": 0.01033864, + "balance_loss_clip": 1.03443193, + "balance_loss_mlp": 1.02096558, + "epoch": 0.8796633097850594, + "flos": 31285412156160.0, + "grad_norm": 1.7384017818460198, + "language_loss": 0.74517637, + "learning_rate": 1.499207333613999e-07, + "loss": 0.7663343, + "num_input_tokens_seen": 315523040, + "step": 14631, + "time_per_iteration": 2.7020559310913086 + }, + { + "auxiliary_loss_clip": 0.01085004, + "auxiliary_loss_mlp": 0.00769583, + "balance_loss_clip": 1.03570437, + "balance_loss_mlp": 1.00020719, + "epoch": 0.8797234330377274, + "flos": 24243150366720.0, + "grad_norm": 2.2960657953969434, + "language_loss": 0.69393373, + "learning_rate": 1.4977282339523954e-07, + "loss": 0.71247965, + "num_input_tokens_seen": 315541865, + "step": 14632, + "time_per_iteration": 2.75093674659729 + }, + { + "auxiliary_loss_clip": 0.01087331, + "auxiliary_loss_mlp": 0.01027739, + "balance_loss_clip": 1.03704596, + "balance_loss_mlp": 1.01637244, + "epoch": 0.8797835562903953, + "flos": 24167414540160.0, + "grad_norm": 1.8690741277115708, + "language_loss": 0.65338004, + "learning_rate": 1.4962498358987929e-07, + "loss": 0.67453068, + "num_input_tokens_seen": 315561470, + "step": 14633, + "time_per_iteration": 2.6868348121643066 + }, + { + "auxiliary_loss_clip": 0.01075776, + "auxiliary_loss_mlp": 0.01034988, + "balance_loss_clip": 1.03406906, + "balance_loss_mlp": 1.0226382, + "epoch": 0.8798436795430633, + "flos": 19284677303040.0, + "grad_norm": 1.4189442310597726, + "language_loss": 0.84372133, + "learning_rate": 1.4947721395092528e-07, + "loss": 0.864829, + "num_input_tokens_seen": 315583140, + "step": 14634, + "time_per_iteration": 2.711578845977783 + }, + { + "auxiliary_loss_clip": 0.01085532, + "auxiliary_loss_mlp": 0.00770557, + "balance_loss_clip": 1.03607786, + "balance_loss_mlp": 1.00022292, + "epoch": 0.8799038027957312, + "flos": 28179390274560.0, + "grad_norm": 1.6380725692975024, + "language_loss": 0.79907227, + "learning_rate": 1.4932951448398056e-07, + "loss": 0.81763315, + "num_input_tokens_seen": 315601935, + "step": 14635, + "time_per_iteration": 2.7726967334747314 + }, + { + "auxiliary_loss_clip": 0.01081031, + "auxiliary_loss_mlp": 0.01025598, + "balance_loss_clip": 1.03709126, + "balance_loss_mlp": 1.01331937, + "epoch": 0.8799639260483992, + "flos": 24644703484800.0, + "grad_norm": 1.9658310023555117, + "language_loss": 0.65064734, + "learning_rate": 1.4918188519464648e-07, + "loss": 0.67171359, + "num_input_tokens_seen": 315619995, + "step": 14636, + "time_per_iteration": 2.686582565307617 + }, + { + "auxiliary_loss_clip": 0.01082702, + "auxiliary_loss_mlp": 0.01038782, + "balance_loss_clip": 1.03411579, + "balance_loss_mlp": 1.02539492, + "epoch": 0.8800240493010671, + "flos": 22200479735040.0, + "grad_norm": 1.4477537955972881, + "language_loss": 0.70313036, + "learning_rate": 1.4903432608852074e-07, + "loss": 0.72434527, + "num_input_tokens_seen": 315637895, + "step": 14637, + "time_per_iteration": 2.6938488483428955 + }, + { + "auxiliary_loss_clip": 0.01087054, + "auxiliary_loss_mlp": 0.01029647, + "balance_loss_clip": 1.03786731, + "balance_loss_mlp": 1.01791048, + "epoch": 0.8800841725537352, + "flos": 14246086953600.0, + "grad_norm": 2.359329981837555, + "language_loss": 0.66048372, + "learning_rate": 1.4888683717119843e-07, + "loss": 0.6816507, + "num_input_tokens_seen": 315655520, + "step": 14638, + "time_per_iteration": 2.633389472961426 + }, + { + "auxiliary_loss_clip": 0.01097569, + "auxiliary_loss_mlp": 0.01029703, + "balance_loss_clip": 1.03652537, + "balance_loss_mlp": 1.01738858, + "epoch": 0.8801442958064031, + "flos": 37415794348800.0, + "grad_norm": 2.0860441545932247, + "language_loss": 0.57805324, + "learning_rate": 1.4873941844827286e-07, + "loss": 0.59932595, + "num_input_tokens_seen": 315678955, + "step": 14639, + "time_per_iteration": 2.762080669403076 + }, + { + "auxiliary_loss_clip": 0.01081797, + "auxiliary_loss_mlp": 0.0103561, + "balance_loss_clip": 1.03559208, + "balance_loss_mlp": 1.0227828, + "epoch": 0.8802044190590711, + "flos": 25047334010880.0, + "grad_norm": 1.6274947606267138, + "language_loss": 0.74253106, + "learning_rate": 1.4859206992533402e-07, + "loss": 0.76370513, + "num_input_tokens_seen": 315700360, + "step": 14640, + "time_per_iteration": 2.6815481185913086 + }, + { + "auxiliary_loss_clip": 0.010844, + "auxiliary_loss_mlp": 0.01043439, + "balance_loss_clip": 1.03346467, + "balance_loss_mlp": 1.03030789, + "epoch": 0.8802645423117391, + "flos": 24133874215680.0, + "grad_norm": 2.333395940952266, + "language_loss": 0.69967985, + "learning_rate": 1.4844479160796985e-07, + "loss": 0.72095823, + "num_input_tokens_seen": 315719270, + "step": 14641, + "time_per_iteration": 2.749075174331665 + }, + { + "auxiliary_loss_clip": 0.01095024, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.03647685, + "balance_loss_mlp": 1.01772094, + "epoch": 0.880324665564407, + "flos": 17931203902080.0, + "grad_norm": 2.1703882572052837, + "language_loss": 0.84749234, + "learning_rate": 1.4829758350176457e-07, + "loss": 0.86875057, + "num_input_tokens_seen": 315737425, + "step": 14642, + "time_per_iteration": 2.5922858715057373 + }, + { + "auxiliary_loss_clip": 0.0107106, + "auxiliary_loss_mlp": 0.01034269, + "balance_loss_clip": 1.04185271, + "balance_loss_mlp": 1.02141285, + "epoch": 0.880384788817075, + "flos": 21287630471040.0, + "grad_norm": 1.7146284056948287, + "language_loss": 0.78968871, + "learning_rate": 1.4815044561230038e-07, + "loss": 0.81074202, + "num_input_tokens_seen": 315755725, + "step": 14643, + "time_per_iteration": 2.7133426666259766 + }, + { + "auxiliary_loss_clip": 0.01091961, + "auxiliary_loss_mlp": 0.010299, + "balance_loss_clip": 1.03380251, + "balance_loss_mlp": 1.01829529, + "epoch": 0.880444912069743, + "flos": 12458489777280.0, + "grad_norm": 1.637601444546806, + "language_loss": 0.72898597, + "learning_rate": 1.4800337794515705e-07, + "loss": 0.75020456, + "num_input_tokens_seen": 315773835, + "step": 14644, + "time_per_iteration": 2.644477367401123 + }, + { + "auxiliary_loss_clip": 0.01111824, + "auxiliary_loss_mlp": 0.00770434, + "balance_loss_clip": 1.03767347, + "balance_loss_mlp": 1.00029421, + "epoch": 0.880505035322411, + "flos": 13625945619840.0, + "grad_norm": 1.899004626318215, + "language_loss": 0.79560626, + "learning_rate": 1.47856380505911e-07, + "loss": 0.81442887, + "num_input_tokens_seen": 315790615, + "step": 14645, + "time_per_iteration": 2.5354764461517334 + }, + { + "auxiliary_loss_clip": 0.01092346, + "auxiliary_loss_mlp": 0.01037836, + "balance_loss_clip": 1.03347158, + "balance_loss_mlp": 1.02530098, + "epoch": 0.8805651585750789, + "flos": 23183067254400.0, + "grad_norm": 1.7052158673760782, + "language_loss": 0.64392948, + "learning_rate": 1.477094533001364e-07, + "loss": 0.66523129, + "num_input_tokens_seen": 315811010, + "step": 14646, + "time_per_iteration": 2.579423427581787 + }, + { + "auxiliary_loss_clip": 0.01080209, + "auxiliary_loss_mlp": 0.01036788, + "balance_loss_clip": 1.03776836, + "balance_loss_mlp": 1.02298915, + "epoch": 0.8806252818277469, + "flos": 14903000835840.0, + "grad_norm": 2.7067451127953874, + "language_loss": 0.77432781, + "learning_rate": 1.475625963334055e-07, + "loss": 0.79549778, + "num_input_tokens_seen": 315828130, + "step": 14647, + "time_per_iteration": 2.6500446796417236 + }, + { + "auxiliary_loss_clip": 0.01106216, + "auxiliary_loss_mlp": 0.01031301, + "balance_loss_clip": 1.03662324, + "balance_loss_mlp": 1.01965976, + "epoch": 0.8806854050804148, + "flos": 17639178330240.0, + "grad_norm": 2.4139145058404976, + "language_loss": 0.75048065, + "learning_rate": 1.4741580961128652e-07, + "loss": 0.77185583, + "num_input_tokens_seen": 315844900, + "step": 14648, + "time_per_iteration": 2.5998997688293457 + }, + { + "auxiliary_loss_clip": 0.01087799, + "auxiliary_loss_mlp": 0.01031948, + "balance_loss_clip": 1.03425181, + "balance_loss_mlp": 1.01994991, + "epoch": 0.8807455283330828, + "flos": 25332392344320.0, + "grad_norm": 1.6348853786721524, + "language_loss": 0.65398651, + "learning_rate": 1.4726909313934522e-07, + "loss": 0.67518401, + "num_input_tokens_seen": 315863745, + "step": 14649, + "time_per_iteration": 2.7652242183685303 + }, + { + "auxiliary_loss_clip": 0.010729, + "auxiliary_loss_mlp": 0.01033475, + "balance_loss_clip": 1.03727496, + "balance_loss_mlp": 1.02036798, + "epoch": 0.8808056515857507, + "flos": 25265168040960.0, + "grad_norm": 1.3476131678952612, + "language_loss": 0.62504375, + "learning_rate": 1.4712244692314578e-07, + "loss": 0.64610744, + "num_input_tokens_seen": 315885765, + "step": 14650, + "time_per_iteration": 2.77528715133667 + }, + { + "auxiliary_loss_clip": 0.01081061, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.03366303, + "balance_loss_mlp": 1.02105451, + "epoch": 0.8808657748384188, + "flos": 26578852151040.0, + "grad_norm": 1.497639019636266, + "language_loss": 0.72776234, + "learning_rate": 1.4697587096824914e-07, + "loss": 0.74890018, + "num_input_tokens_seen": 315907340, + "step": 14651, + "time_per_iteration": 2.755974769592285 + }, + { + "auxiliary_loss_clip": 0.01102624, + "auxiliary_loss_mlp": 0.01034556, + "balance_loss_clip": 1.03813457, + "balance_loss_mlp": 1.0211333, + "epoch": 0.8809258980910867, + "flos": 18661231918080.0, + "grad_norm": 1.7947734047574024, + "language_loss": 0.71671438, + "learning_rate": 1.4682936528021284e-07, + "loss": 0.73808622, + "num_input_tokens_seen": 315924935, + "step": 14652, + "time_per_iteration": 2.6478350162506104 + }, + { + "auxiliary_loss_clip": 0.01088537, + "auxiliary_loss_mlp": 0.01029586, + "balance_loss_clip": 1.03456485, + "balance_loss_mlp": 1.01757014, + "epoch": 0.8809860213437547, + "flos": 19792274348160.0, + "grad_norm": 2.741119431069501, + "language_loss": 0.74593818, + "learning_rate": 1.4668292986459286e-07, + "loss": 0.76711941, + "num_input_tokens_seen": 315943165, + "step": 14653, + "time_per_iteration": 2.657860517501831 + }, + { + "auxiliary_loss_clip": 0.01111355, + "auxiliary_loss_mlp": 0.01031409, + "balance_loss_clip": 1.03685915, + "balance_loss_mlp": 1.01822495, + "epoch": 0.8810461445964227, + "flos": 17894467267200.0, + "grad_norm": 1.7692800722324005, + "language_loss": 0.71231246, + "learning_rate": 1.465365647269421e-07, + "loss": 0.73374015, + "num_input_tokens_seen": 315961340, + "step": 14654, + "time_per_iteration": 2.6377742290496826 + }, + { + "auxiliary_loss_clip": 0.01062842, + "auxiliary_loss_mlp": 0.01038906, + "balance_loss_clip": 1.03567505, + "balance_loss_mlp": 1.02497637, + "epoch": 0.8811062678490906, + "flos": 29163917128320.0, + "grad_norm": 1.6194615705289337, + "language_loss": 0.71497536, + "learning_rate": 1.4639026987281012e-07, + "loss": 0.73599279, + "num_input_tokens_seen": 315981335, + "step": 14655, + "time_per_iteration": 4.449506044387817 + }, + { + "auxiliary_loss_clip": 0.01059688, + "auxiliary_loss_mlp": 0.01035269, + "balance_loss_clip": 1.03264832, + "balance_loss_mlp": 1.02179229, + "epoch": 0.8811663911017587, + "flos": 20338834671360.0, + "grad_norm": 2.1016384343696246, + "language_loss": 0.81381142, + "learning_rate": 1.462440453077449e-07, + "loss": 0.83476096, + "num_input_tokens_seen": 316001325, + "step": 14656, + "time_per_iteration": 4.342563629150391 + }, + { + "auxiliary_loss_clip": 0.01084679, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.03799617, + "balance_loss_mlp": 1.02292371, + "epoch": 0.8812265143544266, + "flos": 25885704424320.0, + "grad_norm": 1.9594168695096041, + "language_loss": 0.68740302, + "learning_rate": 1.460978910372914e-07, + "loss": 0.70859885, + "num_input_tokens_seen": 316022540, + "step": 14657, + "time_per_iteration": 2.75775408744812 + }, + { + "auxiliary_loss_clip": 0.01086792, + "auxiliary_loss_mlp": 0.01036523, + "balance_loss_clip": 1.03888392, + "balance_loss_mlp": 1.02426791, + "epoch": 0.8812866376070946, + "flos": 27195509865600.0, + "grad_norm": 2.309045431146604, + "language_loss": 0.84054673, + "learning_rate": 1.4595180706699207e-07, + "loss": 0.86177993, + "num_input_tokens_seen": 316037735, + "step": 14658, + "time_per_iteration": 4.1529762744903564 + }, + { + "auxiliary_loss_clip": 0.01094436, + "auxiliary_loss_mlp": 0.01035928, + "balance_loss_clip": 1.03857708, + "balance_loss_mlp": 1.02275574, + "epoch": 0.8813467608597625, + "flos": 23807194997760.0, + "grad_norm": 1.9486638108186574, + "language_loss": 0.77363259, + "learning_rate": 1.4580579340238554e-07, + "loss": 0.79493624, + "num_input_tokens_seen": 316058105, + "step": 14659, + "time_per_iteration": 4.211735010147095 + }, + { + "auxiliary_loss_clip": 0.01085864, + "auxiliary_loss_mlp": 0.01034435, + "balance_loss_clip": 1.03627634, + "balance_loss_mlp": 1.0214169, + "epoch": 0.8814068841124305, + "flos": 21105455667840.0, + "grad_norm": 2.1180822282078235, + "language_loss": 0.60540521, + "learning_rate": 1.4565985004900894e-07, + "loss": 0.62660819, + "num_input_tokens_seen": 316074415, + "step": 14660, + "time_per_iteration": 2.6319613456726074 + }, + { + "auxiliary_loss_clip": 0.01094829, + "auxiliary_loss_mlp": 0.01038386, + "balance_loss_clip": 1.04060745, + "balance_loss_mlp": 1.02493942, + "epoch": 0.8814670073650984, + "flos": 24716991605760.0, + "grad_norm": 1.6496161205890179, + "language_loss": 0.77789259, + "learning_rate": 1.455139770123972e-07, + "loss": 0.79922473, + "num_input_tokens_seen": 316094405, + "step": 14661, + "time_per_iteration": 2.633333444595337 + }, + { + "auxiliary_loss_clip": 0.01068997, + "auxiliary_loss_mlp": 0.01045562, + "balance_loss_clip": 1.03819084, + "balance_loss_mlp": 1.03196073, + "epoch": 0.8815271306177664, + "flos": 22966274718720.0, + "grad_norm": 2.4670359855209374, + "language_loss": 0.76707077, + "learning_rate": 1.45368174298081e-07, + "loss": 0.78821635, + "num_input_tokens_seen": 316113390, + "step": 14662, + "time_per_iteration": 2.645803451538086 + }, + { + "auxiliary_loss_clip": 0.01059478, + "auxiliary_loss_mlp": 0.01029767, + "balance_loss_clip": 1.03322673, + "balance_loss_mlp": 1.01856136, + "epoch": 0.8815872538704344, + "flos": 19460064435840.0, + "grad_norm": 2.046618055728614, + "language_loss": 0.73941565, + "learning_rate": 1.4522244191158929e-07, + "loss": 0.76030809, + "num_input_tokens_seen": 316131085, + "step": 14663, + "time_per_iteration": 2.7289090156555176 + }, + { + "auxiliary_loss_clip": 0.01099377, + "auxiliary_loss_mlp": 0.00769769, + "balance_loss_clip": 1.03778672, + "balance_loss_mlp": 1.00022185, + "epoch": 0.8816473771231024, + "flos": 32156604622080.0, + "grad_norm": 2.211377651108035, + "language_loss": 0.69977838, + "learning_rate": 1.450767798584489e-07, + "loss": 0.71846986, + "num_input_tokens_seen": 316151440, + "step": 14664, + "time_per_iteration": 2.679704427719116 + }, + { + "auxiliary_loss_clip": 0.01028116, + "auxiliary_loss_mlp": 0.01040404, + "balance_loss_clip": 1.03083181, + "balance_loss_mlp": 1.02833962, + "epoch": 0.8817075003757703, + "flos": 19682279925120.0, + "grad_norm": 1.499474682944125, + "language_loss": 0.80967414, + "learning_rate": 1.449311881441828e-07, + "loss": 0.83035928, + "num_input_tokens_seen": 316170750, + "step": 14665, + "time_per_iteration": 2.818871021270752 + }, + { + "auxiliary_loss_clip": 0.01085891, + "auxiliary_loss_mlp": 0.01035568, + "balance_loss_clip": 1.03590584, + "balance_loss_mlp": 1.0237484, + "epoch": 0.8817676236284383, + "flos": 15668616251520.0, + "grad_norm": 2.192576285565641, + "language_loss": 0.5833683, + "learning_rate": 1.447856667743117e-07, + "loss": 0.60458285, + "num_input_tokens_seen": 316187265, + "step": 14666, + "time_per_iteration": 2.6670124530792236 + }, + { + "auxiliary_loss_clip": 0.01101515, + "auxiliary_loss_mlp": 0.01031699, + "balance_loss_clip": 1.03911185, + "balance_loss_mlp": 1.01791823, + "epoch": 0.8818277468811063, + "flos": 17895185539200.0, + "grad_norm": 2.486999206259205, + "language_loss": 0.83586216, + "learning_rate": 1.4464021575435403e-07, + "loss": 0.8571943, + "num_input_tokens_seen": 316206555, + "step": 14667, + "time_per_iteration": 2.6268537044525146 + }, + { + "auxiliary_loss_clip": 0.01109075, + "auxiliary_loss_mlp": 0.01032153, + "balance_loss_clip": 1.03729033, + "balance_loss_mlp": 1.01920688, + "epoch": 0.8818878701337742, + "flos": 18770508069120.0, + "grad_norm": 1.817207647136482, + "language_loss": 0.62429118, + "learning_rate": 1.4449483508982563e-07, + "loss": 0.64570343, + "num_input_tokens_seen": 316225210, + "step": 14668, + "time_per_iteration": 2.552854061126709 + }, + { + "auxiliary_loss_clip": 0.01095167, + "auxiliary_loss_mlp": 0.01031398, + "balance_loss_clip": 1.03637564, + "balance_loss_mlp": 1.02023387, + "epoch": 0.8819479933864423, + "flos": 17712292464000.0, + "grad_norm": 2.79196460175423, + "language_loss": 0.57027191, + "learning_rate": 1.4434952478623918e-07, + "loss": 0.59153754, + "num_input_tokens_seen": 316242685, + "step": 14669, + "time_per_iteration": 2.565288782119751 + }, + { + "auxiliary_loss_clip": 0.0110705, + "auxiliary_loss_mlp": 0.01032118, + "balance_loss_clip": 1.03566611, + "balance_loss_mlp": 1.01975608, + "epoch": 0.8820081166391102, + "flos": 11728749070080.0, + "grad_norm": 1.8986730900413675, + "language_loss": 0.71354139, + "learning_rate": 1.442042848491043e-07, + "loss": 0.73493308, + "num_input_tokens_seen": 316260935, + "step": 14670, + "time_per_iteration": 2.563056707382202 + }, + { + "auxiliary_loss_clip": 0.01090236, + "auxiliary_loss_mlp": 0.01032669, + "balance_loss_clip": 1.03279638, + "balance_loss_mlp": 1.02009296, + "epoch": 0.8820682398917782, + "flos": 27490372611840.0, + "grad_norm": 1.913343870820924, + "language_loss": 0.73558605, + "learning_rate": 1.44059115283929e-07, + "loss": 0.75681508, + "num_input_tokens_seen": 316281190, + "step": 14671, + "time_per_iteration": 2.648991346359253 + }, + { + "auxiliary_loss_clip": 0.0108854, + "auxiliary_loss_mlp": 0.01032215, + "balance_loss_clip": 1.03446448, + "balance_loss_mlp": 1.01891685, + "epoch": 0.8821283631444461, + "flos": 16873850223360.0, + "grad_norm": 2.5015746427003878, + "language_loss": 0.84854722, + "learning_rate": 1.43914016096218e-07, + "loss": 0.86975479, + "num_input_tokens_seen": 316297115, + "step": 14672, + "time_per_iteration": 2.582524061203003 + }, + { + "auxiliary_loss_clip": 0.01071178, + "auxiliary_loss_mlp": 0.01030273, + "balance_loss_clip": 1.03337216, + "balance_loss_mlp": 1.01805353, + "epoch": 0.8821884863971141, + "flos": 24280964409600.0, + "grad_norm": 1.630028849005291, + "language_loss": 0.7247709, + "learning_rate": 1.4376898729147336e-07, + "loss": 0.74578547, + "num_input_tokens_seen": 316318235, + "step": 14673, + "time_per_iteration": 2.7013115882873535 + }, + { + "auxiliary_loss_clip": 0.01008529, + "auxiliary_loss_mlp": 0.01000562, + "balance_loss_clip": 1.00525308, + "balance_loss_mlp": 0.99949533, + "epoch": 0.882248609649782, + "flos": 59432342492160.0, + "grad_norm": 0.8079833493209833, + "language_loss": 0.49358672, + "learning_rate": 1.4362402887519487e-07, + "loss": 0.5136776, + "num_input_tokens_seen": 316384705, + "step": 14674, + "time_per_iteration": 3.268969774246216 + }, + { + "auxiliary_loss_clip": 0.01083711, + "auxiliary_loss_mlp": 0.00770966, + "balance_loss_clip": 1.03282237, + "balance_loss_mlp": 1.00024939, + "epoch": 0.88230873290245, + "flos": 19937784343680.0, + "grad_norm": 2.0037273036642578, + "language_loss": 0.76279628, + "learning_rate": 1.4347914085287971e-07, + "loss": 0.78134304, + "num_input_tokens_seen": 316401165, + "step": 14675, + "time_per_iteration": 2.6139438152313232 + }, + { + "auxiliary_loss_clip": 0.01083195, + "auxiliary_loss_mlp": 0.01035716, + "balance_loss_clip": 1.03536808, + "balance_loss_mlp": 1.02411079, + "epoch": 0.882368856155118, + "flos": 16362769559040.0, + "grad_norm": 1.8400865500932602, + "language_loss": 0.79260898, + "learning_rate": 1.4333432323002105e-07, + "loss": 0.81379807, + "num_input_tokens_seen": 316418780, + "step": 14676, + "time_per_iteration": 2.6346635818481445 + }, + { + "auxiliary_loss_clip": 0.00997838, + "auxiliary_loss_mlp": 0.01005545, + "balance_loss_clip": 1.01021266, + "balance_loss_mlp": 1.00431693, + "epoch": 0.882428979407786, + "flos": 70594563277440.0, + "grad_norm": 0.7902692138186003, + "language_loss": 0.54692107, + "learning_rate": 1.431895760121109e-07, + "loss": 0.56695491, + "num_input_tokens_seen": 316482030, + "step": 14677, + "time_per_iteration": 3.293663501739502 + }, + { + "auxiliary_loss_clip": 0.01105406, + "auxiliary_loss_mlp": 0.0103004, + "balance_loss_clip": 1.03478503, + "balance_loss_mlp": 1.01775551, + "epoch": 0.8824891026604539, + "flos": 18150294908160.0, + "grad_norm": 2.2487393421780673, + "language_loss": 0.64326406, + "learning_rate": 1.4304489920463847e-07, + "loss": 0.66461849, + "num_input_tokens_seen": 316499175, + "step": 14678, + "time_per_iteration": 2.5656399726867676 + }, + { + "auxiliary_loss_clip": 0.01087368, + "auxiliary_loss_mlp": 0.01031922, + "balance_loss_clip": 1.03426218, + "balance_loss_mlp": 1.01929188, + "epoch": 0.8825492259131219, + "flos": 27232713377280.0, + "grad_norm": 1.973421047113739, + "language_loss": 0.71194983, + "learning_rate": 1.4290029281308936e-07, + "loss": 0.73314273, + "num_input_tokens_seen": 316519495, + "step": 14679, + "time_per_iteration": 2.717034339904785 + }, + { + "auxiliary_loss_clip": 0.01084094, + "auxiliary_loss_mlp": 0.01031717, + "balance_loss_clip": 1.03604484, + "balance_loss_mlp": 1.02098179, + "epoch": 0.8826093491657898, + "flos": 22274419881600.0, + "grad_norm": 1.9561596241675088, + "language_loss": 0.63978046, + "learning_rate": 1.4275575684294694e-07, + "loss": 0.66093856, + "num_input_tokens_seen": 316538180, + "step": 14680, + "time_per_iteration": 2.6951301097869873 + }, + { + "auxiliary_loss_clip": 0.01107228, + "auxiliary_loss_mlp": 0.01033016, + "balance_loss_clip": 1.03680277, + "balance_loss_mlp": 1.0208087, + "epoch": 0.8826694724184578, + "flos": 14204753377920.0, + "grad_norm": 2.3967020475044767, + "language_loss": 0.77099824, + "learning_rate": 1.4261129129969328e-07, + "loss": 0.79240072, + "num_input_tokens_seen": 316551750, + "step": 14681, + "time_per_iteration": 2.5262744426727295 + }, + { + "auxiliary_loss_clip": 0.01087034, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.03454781, + "balance_loss_mlp": 1.02127814, + "epoch": 0.8827295956711259, + "flos": 20631686256000.0, + "grad_norm": 1.7532857738520948, + "language_loss": 0.72604549, + "learning_rate": 1.424668961888047e-07, + "loss": 0.74726152, + "num_input_tokens_seen": 316570680, + "step": 14682, + "time_per_iteration": 2.632432699203491 + }, + { + "auxiliary_loss_clip": 0.01069185, + "auxiliary_loss_mlp": 0.0103165, + "balance_loss_clip": 1.03995907, + "balance_loss_mlp": 1.01723146, + "epoch": 0.8827897189237938, + "flos": 18513064316160.0, + "grad_norm": 1.9501054227353172, + "language_loss": 0.74376327, + "learning_rate": 1.4232257151575765e-07, + "loss": 0.76477158, + "num_input_tokens_seen": 316588635, + "step": 14683, + "time_per_iteration": 2.7173256874084473 + }, + { + "auxiliary_loss_clip": 0.01074481, + "auxiliary_loss_mlp": 0.01032656, + "balance_loss_clip": 1.03458118, + "balance_loss_mlp": 1.01993001, + "epoch": 0.8828498421764618, + "flos": 22747399194240.0, + "grad_norm": 1.85393754134711, + "language_loss": 0.65667385, + "learning_rate": 1.4217831728602492e-07, + "loss": 0.67774516, + "num_input_tokens_seen": 316607550, + "step": 14684, + "time_per_iteration": 2.7330434322357178 + }, + { + "auxiliary_loss_clip": 0.0109236, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.03487706, + "balance_loss_mlp": 1.01779604, + "epoch": 0.8829099654291297, + "flos": 15012384727680.0, + "grad_norm": 1.9479646224303804, + "language_loss": 0.69623429, + "learning_rate": 1.4203413350507677e-07, + "loss": 0.71745217, + "num_input_tokens_seen": 316624460, + "step": 14685, + "time_per_iteration": 2.5940215587615967 + }, + { + "auxiliary_loss_clip": 0.01057757, + "auxiliary_loss_mlp": 0.0103887, + "balance_loss_clip": 1.03562188, + "balance_loss_mlp": 1.02445173, + "epoch": 0.8829700886817977, + "flos": 16720546976640.0, + "grad_norm": 1.9726343446445405, + "language_loss": 0.74293447, + "learning_rate": 1.418900201783806e-07, + "loss": 0.76390076, + "num_input_tokens_seen": 316640765, + "step": 14686, + "time_per_iteration": 2.724073886871338 + }, + { + "auxiliary_loss_clip": 0.01055836, + "auxiliary_loss_mlp": 0.01028261, + "balance_loss_clip": 1.03210068, + "balance_loss_mlp": 1.01602983, + "epoch": 0.8830302119344656, + "flos": 15263256291840.0, + "grad_norm": 1.8765198803907357, + "language_loss": 0.63015836, + "learning_rate": 1.417459773114007e-07, + "loss": 0.65099931, + "num_input_tokens_seen": 316656120, + "step": 14687, + "time_per_iteration": 2.707498550415039 + }, + { + "auxiliary_loss_clip": 0.01100271, + "auxiliary_loss_mlp": 0.01038773, + "balance_loss_clip": 1.03685296, + "balance_loss_mlp": 1.02611268, + "epoch": 0.8830903351871336, + "flos": 28617751854720.0, + "grad_norm": 1.78215533171273, + "language_loss": 0.69295615, + "learning_rate": 1.4160200490959984e-07, + "loss": 0.71434665, + "num_input_tokens_seen": 316676095, + "step": 14688, + "time_per_iteration": 2.6418840885162354 + }, + { + "auxiliary_loss_clip": 0.0109326, + "auxiliary_loss_mlp": 0.01027423, + "balance_loss_clip": 1.03498924, + "balance_loss_mlp": 1.01532912, + "epoch": 0.8831504584398016, + "flos": 28001632844160.0, + "grad_norm": 1.920117351658533, + "language_loss": 0.66948056, + "learning_rate": 1.4145810297843697e-07, + "loss": 0.69068736, + "num_input_tokens_seen": 316696235, + "step": 14689, + "time_per_iteration": 2.572154998779297 + }, + { + "auxiliary_loss_clip": 0.01082065, + "auxiliary_loss_mlp": 0.01027763, + "balance_loss_clip": 1.03897274, + "balance_loss_mlp": 1.01591396, + "epoch": 0.8832105816924696, + "flos": 26579642250240.0, + "grad_norm": 1.390214083666347, + "language_loss": 0.74641317, + "learning_rate": 1.4131427152336905e-07, + "loss": 0.76751149, + "num_input_tokens_seen": 316719680, + "step": 14690, + "time_per_iteration": 2.7160091400146484 + }, + { + "auxiliary_loss_clip": 0.0108565, + "auxiliary_loss_mlp": 0.01037391, + "balance_loss_clip": 1.0344497, + "balance_loss_mlp": 1.02380705, + "epoch": 0.8832707049451375, + "flos": 24898771359360.0, + "grad_norm": 1.4286133557095182, + "language_loss": 0.72746867, + "learning_rate": 1.4117051054985018e-07, + "loss": 0.74869907, + "num_input_tokens_seen": 316739830, + "step": 14691, + "time_per_iteration": 2.650376558303833 + }, + { + "auxiliary_loss_clip": 0.01076966, + "auxiliary_loss_mlp": 0.0102843, + "balance_loss_clip": 1.03778577, + "balance_loss_mlp": 1.01508439, + "epoch": 0.8833308281978055, + "flos": 15451141357440.0, + "grad_norm": 2.0965604291100277, + "language_loss": 0.51753283, + "learning_rate": 1.4102682006333243e-07, + "loss": 0.53858674, + "num_input_tokens_seen": 316758105, + "step": 14692, + "time_per_iteration": 2.656104564666748 + }, + { + "auxiliary_loss_clip": 0.01072794, + "auxiliary_loss_mlp": 0.01033738, + "balance_loss_clip": 1.03685403, + "balance_loss_mlp": 1.02114379, + "epoch": 0.8833909514504734, + "flos": 20301523418880.0, + "grad_norm": 2.5366757871087264, + "language_loss": 0.60396338, + "learning_rate": 1.4088320006926346e-07, + "loss": 0.62502873, + "num_input_tokens_seen": 316777455, + "step": 14693, + "time_per_iteration": 2.6937055587768555 + }, + { + "auxiliary_loss_clip": 0.01104793, + "auxiliary_loss_mlp": 0.01027886, + "balance_loss_clip": 1.03680062, + "balance_loss_mlp": 1.01657307, + "epoch": 0.8834510747031414, + "flos": 20374027021440.0, + "grad_norm": 1.6196331469074723, + "language_loss": 0.75283146, + "learning_rate": 1.407396505730898e-07, + "loss": 0.77415824, + "num_input_tokens_seen": 316796300, + "step": 14694, + "time_per_iteration": 2.577456474304199 + }, + { + "auxiliary_loss_clip": 0.01092067, + "auxiliary_loss_mlp": 0.01031516, + "balance_loss_clip": 1.03433728, + "balance_loss_mlp": 1.02011991, + "epoch": 0.8835111979558095, + "flos": 29752026508800.0, + "grad_norm": 1.8847177158288673, + "language_loss": 0.72582275, + "learning_rate": 1.4059617158025527e-07, + "loss": 0.74705863, + "num_input_tokens_seen": 316819090, + "step": 14695, + "time_per_iteration": 5.806610822677612 + }, + { + "auxiliary_loss_clip": 0.01092613, + "auxiliary_loss_mlp": 0.01026382, + "balance_loss_clip": 1.03546548, + "balance_loss_mlp": 1.01503897, + "epoch": 0.8835713212084774, + "flos": 24134556574080.0, + "grad_norm": 1.7805903977771496, + "language_loss": 0.80249125, + "learning_rate": 1.404527630961998e-07, + "loss": 0.82368124, + "num_input_tokens_seen": 316839250, + "step": 14696, + "time_per_iteration": 2.6262238025665283 + }, + { + "auxiliary_loss_clip": 0.01070594, + "auxiliary_loss_mlp": 0.01033129, + "balance_loss_clip": 1.03721249, + "balance_loss_mlp": 1.02114844, + "epoch": 0.8836314444611454, + "flos": 27672331933440.0, + "grad_norm": 1.3961231216590477, + "language_loss": 0.74813706, + "learning_rate": 1.4030942512636236e-07, + "loss": 0.76917428, + "num_input_tokens_seen": 316861315, + "step": 14697, + "time_per_iteration": 4.375631809234619 + }, + { + "auxiliary_loss_clip": 0.01087263, + "auxiliary_loss_mlp": 0.01030892, + "balance_loss_clip": 1.03708208, + "balance_loss_mlp": 1.01885819, + "epoch": 0.8836915677138133, + "flos": 16836969934080.0, + "grad_norm": 2.288430876034272, + "language_loss": 0.72242546, + "learning_rate": 1.401661576761779e-07, + "loss": 0.74360704, + "num_input_tokens_seen": 316879325, + "step": 14698, + "time_per_iteration": 4.223493576049805 + }, + { + "auxiliary_loss_clip": 0.01018409, + "auxiliary_loss_mlp": 0.00999712, + "balance_loss_clip": 1.00626993, + "balance_loss_mlp": 0.99860901, + "epoch": 0.8837516909664813, + "flos": 69310540823040.0, + "grad_norm": 0.8057459170171036, + "language_loss": 0.53683382, + "learning_rate": 1.4002296075107856e-07, + "loss": 0.55701506, + "num_input_tokens_seen": 316936425, + "step": 14699, + "time_per_iteration": 3.2273147106170654 + }, + { + "auxiliary_loss_clip": 0.01087948, + "auxiliary_loss_mlp": 0.01031005, + "balance_loss_clip": 1.03542256, + "balance_loss_mlp": 1.01808274, + "epoch": 0.8838118142191492, + "flos": 21324726241920.0, + "grad_norm": 1.773127577183959, + "language_loss": 0.76996839, + "learning_rate": 1.3987983435649508e-07, + "loss": 0.79115796, + "num_input_tokens_seen": 316956360, + "step": 14700, + "time_per_iteration": 2.7143490314483643 + }, + { + "auxiliary_loss_clip": 0.01074827, + "auxiliary_loss_mlp": 0.01031532, + "balance_loss_clip": 1.03586185, + "balance_loss_mlp": 1.01926565, + "epoch": 0.8838719374718172, + "flos": 21470559459840.0, + "grad_norm": 1.7340083630316034, + "language_loss": 0.72865736, + "learning_rate": 1.3973677849785494e-07, + "loss": 0.74972093, + "num_input_tokens_seen": 316975295, + "step": 14701, + "time_per_iteration": 2.6882786750793457 + }, + { + "auxiliary_loss_clip": 0.01086251, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.03455663, + "balance_loss_mlp": 1.02270126, + "epoch": 0.8839320607244852, + "flos": 26468929555200.0, + "grad_norm": 1.8625463465240368, + "language_loss": 0.71503305, + "learning_rate": 1.3959379318058262e-07, + "loss": 0.73626214, + "num_input_tokens_seen": 316994520, + "step": 14702, + "time_per_iteration": 2.72592830657959 + }, + { + "auxiliary_loss_clip": 0.01071764, + "auxiliary_loss_mlp": 0.01044197, + "balance_loss_clip": 1.03413224, + "balance_loss_mlp": 1.02983212, + "epoch": 0.8839921839771532, + "flos": 45222270923520.0, + "grad_norm": 1.7559603641053307, + "language_loss": 0.71454448, + "learning_rate": 1.3945087841010006e-07, + "loss": 0.73570406, + "num_input_tokens_seen": 317018095, + "step": 14703, + "time_per_iteration": 2.9277431964874268 + }, + { + "auxiliary_loss_clip": 0.01065783, + "auxiliary_loss_mlp": 0.01031109, + "balance_loss_clip": 1.03791165, + "balance_loss_mlp": 1.01922941, + "epoch": 0.8840523072298211, + "flos": 20006876154240.0, + "grad_norm": 2.1820279654831474, + "language_loss": 0.6694417, + "learning_rate": 1.3930803419182645e-07, + "loss": 0.69041061, + "num_input_tokens_seen": 317035755, + "step": 14704, + "time_per_iteration": 2.8294484615325928 + }, + { + "auxiliary_loss_clip": 0.01087087, + "auxiliary_loss_mlp": 0.0102583, + "balance_loss_clip": 1.03454638, + "balance_loss_mlp": 1.0141418, + "epoch": 0.8841124304824891, + "flos": 24426007528320.0, + "grad_norm": 1.887740201159673, + "language_loss": 0.70546407, + "learning_rate": 1.3916526053117905e-07, + "loss": 0.72659326, + "num_input_tokens_seen": 317055765, + "step": 14705, + "time_per_iteration": 2.7231884002685547 + }, + { + "auxiliary_loss_clip": 0.01086994, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.03693652, + "balance_loss_mlp": 1.02126944, + "epoch": 0.884172553735157, + "flos": 31284622056960.0, + "grad_norm": 1.4798383584085324, + "language_loss": 0.70781028, + "learning_rate": 1.3902255743357104e-07, + "loss": 0.72900212, + "num_input_tokens_seen": 317077955, + "step": 14706, + "time_per_iteration": 2.817166805267334 + }, + { + "auxiliary_loss_clip": 0.0109745, + "auxiliary_loss_mlp": 0.01031235, + "balance_loss_clip": 1.03596139, + "balance_loss_mlp": 1.0189985, + "epoch": 0.884232676987825, + "flos": 21391160446080.0, + "grad_norm": 1.9611948964387604, + "language_loss": 0.74633074, + "learning_rate": 1.3887992490441413e-07, + "loss": 0.76761764, + "num_input_tokens_seen": 317095825, + "step": 14707, + "time_per_iteration": 2.692667007446289 + }, + { + "auxiliary_loss_clip": 0.01001598, + "auxiliary_loss_mlp": 0.01000676, + "balance_loss_clip": 1.00856423, + "balance_loss_mlp": 0.99968618, + "epoch": 0.8842928002404931, + "flos": 57911451799680.0, + "grad_norm": 0.8872244282469403, + "language_loss": 0.60417277, + "learning_rate": 1.387373629491173e-07, + "loss": 0.62419552, + "num_input_tokens_seen": 317152875, + "step": 14708, + "time_per_iteration": 3.083991765975952 + }, + { + "auxiliary_loss_clip": 0.01077587, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.03236675, + "balance_loss_mlp": 1.02057397, + "epoch": 0.884352923493161, + "flos": 41463896186880.0, + "grad_norm": 4.7609896272216305, + "language_loss": 0.67469186, + "learning_rate": 1.3859487157308625e-07, + "loss": 0.6957885, + "num_input_tokens_seen": 317176725, + "step": 14709, + "time_per_iteration": 2.8194525241851807 + }, + { + "auxiliary_loss_clip": 0.01091628, + "auxiliary_loss_mlp": 0.01036852, + "balance_loss_clip": 1.03700888, + "balance_loss_mlp": 1.02251136, + "epoch": 0.884413046745829, + "flos": 46541234332800.0, + "grad_norm": 1.664665419544956, + "language_loss": 0.62438279, + "learning_rate": 1.3845245078172373e-07, + "loss": 0.64566755, + "num_input_tokens_seen": 317206880, + "step": 14710, + "time_per_iteration": 2.9080650806427 + }, + { + "auxiliary_loss_clip": 0.01074046, + "auxiliary_loss_mlp": 0.01025213, + "balance_loss_clip": 1.03497434, + "balance_loss_mlp": 1.01367342, + "epoch": 0.8844731699984969, + "flos": 19135324552320.0, + "grad_norm": 5.507655560622358, + "language_loss": 0.63936687, + "learning_rate": 1.38310100580431e-07, + "loss": 0.66035938, + "num_input_tokens_seen": 317224135, + "step": 14711, + "time_per_iteration": 2.7565457820892334 + }, + { + "auxiliary_loss_clip": 0.0107192, + "auxiliary_loss_mlp": 0.01033004, + "balance_loss_clip": 1.03220356, + "balance_loss_mlp": 1.01972961, + "epoch": 0.8845332932511649, + "flos": 23260634674560.0, + "grad_norm": 2.576105371894639, + "language_loss": 0.76215911, + "learning_rate": 1.38167820974606e-07, + "loss": 0.78320837, + "num_input_tokens_seen": 317244505, + "step": 14712, + "time_per_iteration": 2.7664034366607666 + }, + { + "auxiliary_loss_clip": 0.01048291, + "auxiliary_loss_mlp": 0.01029784, + "balance_loss_clip": 1.02974892, + "balance_loss_mlp": 1.01695108, + "epoch": 0.8845934165038328, + "flos": 17564591738880.0, + "grad_norm": 2.26538239437818, + "language_loss": 0.80963331, + "learning_rate": 1.3802561196964368e-07, + "loss": 0.83041406, + "num_input_tokens_seen": 317257830, + "step": 14713, + "time_per_iteration": 2.7584569454193115 + }, + { + "auxiliary_loss_clip": 0.01084824, + "auxiliary_loss_mlp": 0.01028599, + "balance_loss_clip": 1.03427589, + "balance_loss_mlp": 1.01581335, + "epoch": 0.8846535397565009, + "flos": 27485739757440.0, + "grad_norm": 1.3779261727690353, + "language_loss": 0.55363518, + "learning_rate": 1.3788347357093688e-07, + "loss": 0.57476938, + "num_input_tokens_seen": 317278430, + "step": 14714, + "time_per_iteration": 2.733762502670288 + }, + { + "auxiliary_loss_clip": 0.01053317, + "auxiliary_loss_mlp": 0.01038776, + "balance_loss_clip": 1.03666592, + "balance_loss_mlp": 1.02476311, + "epoch": 0.8847136630091688, + "flos": 28761430256640.0, + "grad_norm": 1.7611265846696629, + "language_loss": 0.74193525, + "learning_rate": 1.377414057838755e-07, + "loss": 0.76285625, + "num_input_tokens_seen": 317295970, + "step": 14715, + "time_per_iteration": 2.841095447540283 + }, + { + "auxiliary_loss_clip": 0.01098367, + "auxiliary_loss_mlp": 0.01030945, + "balance_loss_clip": 1.03592944, + "balance_loss_mlp": 1.0190537, + "epoch": 0.8847737862618368, + "flos": 23476924419840.0, + "grad_norm": 2.015334930490414, + "language_loss": 0.75194365, + "learning_rate": 1.375994086138461e-07, + "loss": 0.77323675, + "num_input_tokens_seen": 317316185, + "step": 14716, + "time_per_iteration": 2.661020517349243 + }, + { + "auxiliary_loss_clip": 0.01075664, + "auxiliary_loss_mlp": 0.0103605, + "balance_loss_clip": 1.03667819, + "balance_loss_mlp": 1.02395606, + "epoch": 0.8848339095145047, + "flos": 18660872782080.0, + "grad_norm": 1.993706294910503, + "language_loss": 0.71433997, + "learning_rate": 1.3745748206623397e-07, + "loss": 0.73545712, + "num_input_tokens_seen": 317333275, + "step": 14717, + "time_per_iteration": 2.7001688480377197 + }, + { + "auxiliary_loss_clip": 0.01093455, + "auxiliary_loss_mlp": 0.01033322, + "balance_loss_clip": 1.0350275, + "balance_loss_mlp": 1.02174115, + "epoch": 0.8848940327671727, + "flos": 32270298145920.0, + "grad_norm": 2.3327665948166643, + "language_loss": 0.73770732, + "learning_rate": 1.373156261464208e-07, + "loss": 0.75897503, + "num_input_tokens_seen": 317351245, + "step": 14718, + "time_per_iteration": 2.677098274230957 + }, + { + "auxiliary_loss_clip": 0.01058475, + "auxiliary_loss_mlp": 0.01029814, + "balance_loss_clip": 1.03630209, + "balance_loss_mlp": 1.01655793, + "epoch": 0.8849541560198406, + "flos": 24021832717440.0, + "grad_norm": 2.0713842614778755, + "language_loss": 0.78531897, + "learning_rate": 1.3717384085978602e-07, + "loss": 0.80620188, + "num_input_tokens_seen": 317370740, + "step": 14719, + "time_per_iteration": 2.8046772480010986 + }, + { + "auxiliary_loss_clip": 0.01108831, + "auxiliary_loss_mlp": 0.01026154, + "balance_loss_clip": 1.03627968, + "balance_loss_mlp": 1.01376843, + "epoch": 0.8850142792725086, + "flos": 16873060124160.0, + "grad_norm": 1.562689851566494, + "language_loss": 0.71582258, + "learning_rate": 1.3703212621170579e-07, + "loss": 0.73717248, + "num_input_tokens_seen": 317388370, + "step": 14720, + "time_per_iteration": 2.6795947551727295 + }, + { + "auxiliary_loss_clip": 0.01087567, + "auxiliary_loss_mlp": 0.01032607, + "balance_loss_clip": 1.03469348, + "balance_loss_mlp": 1.02011943, + "epoch": 0.8850744025251767, + "flos": 24024059360640.0, + "grad_norm": 1.9018606695741462, + "language_loss": 0.82328093, + "learning_rate": 1.3689048220755383e-07, + "loss": 0.84448266, + "num_input_tokens_seen": 317407390, + "step": 14721, + "time_per_iteration": 2.7234106063842773 + }, + { + "auxiliary_loss_clip": 0.01087774, + "auxiliary_loss_mlp": 0.01030218, + "balance_loss_clip": 1.03554928, + "balance_loss_mlp": 1.01725388, + "epoch": 0.8851345257778446, + "flos": 47955575329920.0, + "grad_norm": 2.0019899609402994, + "language_loss": 0.6242708, + "learning_rate": 1.3674890885270186e-07, + "loss": 0.64545077, + "num_input_tokens_seen": 317430825, + "step": 14722, + "time_per_iteration": 2.94286847114563 + }, + { + "auxiliary_loss_clip": 0.01098996, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.03618419, + "balance_loss_mlp": 1.01827097, + "epoch": 0.8851946490305126, + "flos": 36611000173440.0, + "grad_norm": 2.1418673941566815, + "language_loss": 0.68605435, + "learning_rate": 1.3660740615251754e-07, + "loss": 0.70735055, + "num_input_tokens_seen": 317451905, + "step": 14723, + "time_per_iteration": 2.733093023300171 + }, + { + "auxiliary_loss_clip": 0.01073469, + "auxiliary_loss_mlp": 0.01037418, + "balance_loss_clip": 1.03269005, + "balance_loss_mlp": 1.02493691, + "epoch": 0.8852547722831805, + "flos": 21544248211200.0, + "grad_norm": 1.6603204159034268, + "language_loss": 0.77997786, + "learning_rate": 1.3646597411236703e-07, + "loss": 0.80108678, + "num_input_tokens_seen": 317470030, + "step": 14724, + "time_per_iteration": 2.667952299118042 + }, + { + "auxiliary_loss_clip": 0.01018949, + "auxiliary_loss_mlp": 0.01000573, + "balance_loss_clip": 1.00656819, + "balance_loss_mlp": 0.9996435, + "epoch": 0.8853148955358485, + "flos": 63059246472960.0, + "grad_norm": 0.79872504573919, + "language_loss": 0.58856809, + "learning_rate": 1.363246127376143e-07, + "loss": 0.60876334, + "num_input_tokens_seen": 317527460, + "step": 14725, + "time_per_iteration": 3.0969929695129395 + }, + { + "auxiliary_loss_clip": 0.010877, + "auxiliary_loss_mlp": 0.00772122, + "balance_loss_clip": 1.0332129, + "balance_loss_mlp": 1.00029242, + "epoch": 0.8853750187885164, + "flos": 18149828031360.0, + "grad_norm": 1.9516180183005214, + "language_loss": 0.69201702, + "learning_rate": 1.3618332203361837e-07, + "loss": 0.71061528, + "num_input_tokens_seen": 317544070, + "step": 14726, + "time_per_iteration": 2.6915600299835205 + }, + { + "auxiliary_loss_clip": 0.01095197, + "auxiliary_loss_mlp": 0.00770245, + "balance_loss_clip": 1.03544807, + "balance_loss_mlp": 1.00021529, + "epoch": 0.8854351420411845, + "flos": 39570542392320.0, + "grad_norm": 1.2107511197334673, + "language_loss": 0.69623214, + "learning_rate": 1.3604210200573785e-07, + "loss": 0.71488655, + "num_input_tokens_seen": 317570275, + "step": 14727, + "time_per_iteration": 2.7665956020355225 + }, + { + "auxiliary_loss_clip": 0.01088033, + "auxiliary_loss_mlp": 0.01032809, + "balance_loss_clip": 1.03910947, + "balance_loss_mlp": 1.02020836, + "epoch": 0.8854952652938524, + "flos": 23769309127680.0, + "grad_norm": 1.5740836195645216, + "language_loss": 0.69980741, + "learning_rate": 1.3590095265932733e-07, + "loss": 0.72101581, + "num_input_tokens_seen": 317590160, + "step": 14728, + "time_per_iteration": 2.765291929244995 + }, + { + "auxiliary_loss_clip": 0.0107448, + "auxiliary_loss_mlp": 0.01027952, + "balance_loss_clip": 1.03473592, + "balance_loss_mlp": 1.01644814, + "epoch": 0.8855553885465204, + "flos": 18290310122880.0, + "grad_norm": 2.332652743923133, + "language_loss": 0.66558629, + "learning_rate": 1.3575987399973987e-07, + "loss": 0.68661064, + "num_input_tokens_seen": 317608340, + "step": 14729, + "time_per_iteration": 2.7198948860168457 + }, + { + "auxiliary_loss_clip": 0.01079258, + "auxiliary_loss_mlp": 0.01037721, + "balance_loss_clip": 1.03742743, + "balance_loss_mlp": 1.02642, + "epoch": 0.8856155117991883, + "flos": 36867402432000.0, + "grad_norm": 1.6722891950677918, + "language_loss": 0.62810826, + "learning_rate": 1.3561886603232453e-07, + "loss": 0.64927804, + "num_input_tokens_seen": 317629910, + "step": 14730, + "time_per_iteration": 2.8442556858062744 + }, + { + "auxiliary_loss_clip": 0.01071976, + "auxiliary_loss_mlp": 0.01031648, + "balance_loss_clip": 1.03443062, + "balance_loss_mlp": 1.01946437, + "epoch": 0.8856756350518563, + "flos": 22163886754560.0, + "grad_norm": 1.401332014115865, + "language_loss": 0.79437548, + "learning_rate": 1.3547792876242904e-07, + "loss": 0.81541169, + "num_input_tokens_seen": 317650265, + "step": 14731, + "time_per_iteration": 2.762430429458618 + }, + { + "auxiliary_loss_clip": 0.01072107, + "auxiliary_loss_mlp": 0.01033343, + "balance_loss_clip": 1.0311588, + "balance_loss_mlp": 1.02106476, + "epoch": 0.8857357583045242, + "flos": 20740962407040.0, + "grad_norm": 1.5976657601317488, + "language_loss": 0.82999492, + "learning_rate": 1.3533706219539708e-07, + "loss": 0.85104942, + "num_input_tokens_seen": 317669045, + "step": 14732, + "time_per_iteration": 2.7181379795074463 + }, + { + "auxiliary_loss_clip": 0.01009214, + "auxiliary_loss_mlp": 0.01003697, + "balance_loss_clip": 1.00654268, + "balance_loss_mlp": 1.00273728, + "epoch": 0.8857958815571922, + "flos": 69892329409920.0, + "grad_norm": 0.9009578672979854, + "language_loss": 0.5992915, + "learning_rate": 1.3519626633657045e-07, + "loss": 0.61942059, + "num_input_tokens_seen": 317728065, + "step": 14733, + "time_per_iteration": 4.828664064407349 + }, + { + "auxiliary_loss_clip": 0.01109749, + "auxiliary_loss_mlp": 0.0077073, + "balance_loss_clip": 1.03790414, + "balance_loss_mlp": 1.00016737, + "epoch": 0.8858560048098603, + "flos": 15121948187520.0, + "grad_norm": 4.085770877577171, + "language_loss": 0.66732299, + "learning_rate": 1.3505554119128838e-07, + "loss": 0.68612778, + "num_input_tokens_seen": 317746120, + "step": 14734, + "time_per_iteration": 4.0870819091796875 + }, + { + "auxiliary_loss_clip": 0.01081595, + "auxiliary_loss_mlp": 0.01037825, + "balance_loss_clip": 1.03617239, + "balance_loss_mlp": 1.02644062, + "epoch": 0.8859161280625282, + "flos": 16611019430400.0, + "grad_norm": 1.9769334143757535, + "language_loss": 0.75267172, + "learning_rate": 1.3491488676488682e-07, + "loss": 0.77386594, + "num_input_tokens_seen": 317762280, + "step": 14735, + "time_per_iteration": 2.596672534942627 + }, + { + "auxiliary_loss_clip": 0.01070336, + "auxiliary_loss_mlp": 0.0103395, + "balance_loss_clip": 1.03347635, + "balance_loss_mlp": 1.02087295, + "epoch": 0.8859762513151962, + "flos": 18694484933760.0, + "grad_norm": 1.9172644356964386, + "language_loss": 0.70264298, + "learning_rate": 1.3477430306270066e-07, + "loss": 0.72368586, + "num_input_tokens_seen": 317780615, + "step": 14736, + "time_per_iteration": 4.219033479690552 + }, + { + "auxiliary_loss_clip": 0.01077332, + "auxiliary_loss_mlp": 0.01031715, + "balance_loss_clip": 1.03754532, + "balance_loss_mlp": 1.01955533, + "epoch": 0.8860363745678641, + "flos": 19536877670400.0, + "grad_norm": 5.918141742658791, + "language_loss": 0.84637642, + "learning_rate": 1.3463379009005892e-07, + "loss": 0.86746687, + "num_input_tokens_seen": 317798830, + "step": 14737, + "time_per_iteration": 4.119691848754883 + }, + { + "auxiliary_loss_clip": 0.01084938, + "auxiliary_loss_mlp": 0.01034567, + "balance_loss_clip": 1.03715491, + "balance_loss_mlp": 1.02060747, + "epoch": 0.8860964978205321, + "flos": 35954912304000.0, + "grad_norm": 2.9176785944040087, + "language_loss": 0.67942357, + "learning_rate": 1.3449334785229093e-07, + "loss": 0.70061862, + "num_input_tokens_seen": 317819235, + "step": 14738, + "time_per_iteration": 2.865959882736206 + }, + { + "auxiliary_loss_clip": 0.01101518, + "auxiliary_loss_mlp": 0.01030491, + "balance_loss_clip": 1.03650808, + "balance_loss_mlp": 1.0173068, + "epoch": 0.8861566210732, + "flos": 21212577002880.0, + "grad_norm": 1.8188122899172712, + "language_loss": 0.75242293, + "learning_rate": 1.343529763547222e-07, + "loss": 0.77374303, + "num_input_tokens_seen": 317836785, + "step": 14739, + "time_per_iteration": 2.6084749698638916 + }, + { + "auxiliary_loss_clip": 0.01096641, + "auxiliary_loss_mlp": 0.01033392, + "balance_loss_clip": 1.03646207, + "balance_loss_mlp": 1.02176273, + "epoch": 0.886216744325868, + "flos": 14609071843200.0, + "grad_norm": 2.373250938513307, + "language_loss": 0.87370729, + "learning_rate": 1.3421267560267559e-07, + "loss": 0.89500761, + "num_input_tokens_seen": 317854225, + "step": 14740, + "time_per_iteration": 2.6357059478759766 + }, + { + "auxiliary_loss_clip": 0.01058963, + "auxiliary_loss_mlp": 0.01034429, + "balance_loss_clip": 1.03261304, + "balance_loss_mlp": 1.02202511, + "epoch": 0.886276867578536, + "flos": 26651643062400.0, + "grad_norm": 1.7903686918676003, + "language_loss": 0.63587701, + "learning_rate": 1.34072445601471e-07, + "loss": 0.656811, + "num_input_tokens_seen": 317874865, + "step": 14741, + "time_per_iteration": 2.7529678344726562 + }, + { + "auxiliary_loss_clip": 0.01108743, + "auxiliary_loss_mlp": 0.01029702, + "balance_loss_clip": 1.03720188, + "balance_loss_mlp": 1.01766753, + "epoch": 0.886336990831204, + "flos": 16764071281920.0, + "grad_norm": 1.7833239303064403, + "language_loss": 0.72917497, + "learning_rate": 1.3393228635642717e-07, + "loss": 0.75055945, + "num_input_tokens_seen": 317892830, + "step": 14742, + "time_per_iteration": 2.5617966651916504 + }, + { + "auxiliary_loss_clip": 0.01097185, + "auxiliary_loss_mlp": 0.00770206, + "balance_loss_clip": 1.0359509, + "balance_loss_mlp": 1.00016761, + "epoch": 0.8863971140838719, + "flos": 25265275781760.0, + "grad_norm": 1.894504945703206, + "language_loss": 0.59785163, + "learning_rate": 1.3379219787285733e-07, + "loss": 0.61652559, + "num_input_tokens_seen": 317911780, + "step": 14743, + "time_per_iteration": 2.7500805854797363 + }, + { + "auxiliary_loss_clip": 0.01079179, + "auxiliary_loss_mlp": 0.01033862, + "balance_loss_clip": 1.03562689, + "balance_loss_mlp": 1.02005744, + "epoch": 0.8864572373365399, + "flos": 23404313076480.0, + "grad_norm": 1.5564571259362694, + "language_loss": 0.60083222, + "learning_rate": 1.3365218015607437e-07, + "loss": 0.62196267, + "num_input_tokens_seen": 317932855, + "step": 14744, + "time_per_iteration": 2.770298957824707 + }, + { + "auxiliary_loss_clip": 0.01092438, + "auxiliary_loss_mlp": 0.0077049, + "balance_loss_clip": 1.03708875, + "balance_loss_mlp": 1.00017428, + "epoch": 0.8865173605892078, + "flos": 18548759456640.0, + "grad_norm": 1.674319681826978, + "language_loss": 0.76905382, + "learning_rate": 1.3351223321138762e-07, + "loss": 0.78768307, + "num_input_tokens_seen": 317952090, + "step": 14745, + "time_per_iteration": 2.5852930545806885 + }, + { + "auxiliary_loss_clip": 0.01107283, + "auxiliary_loss_mlp": 0.00770198, + "balance_loss_clip": 1.03665972, + "balance_loss_mlp": 1.00020969, + "epoch": 0.8865774838418758, + "flos": 19025868833280.0, + "grad_norm": 2.096197494867565, + "language_loss": 0.77457786, + "learning_rate": 1.3337235704410454e-07, + "loss": 0.79335266, + "num_input_tokens_seen": 317970370, + "step": 14746, + "time_per_iteration": 2.573580026626587 + }, + { + "auxiliary_loss_clip": 0.01086009, + "auxiliary_loss_mlp": 0.01035282, + "balance_loss_clip": 1.0389545, + "balance_loss_mlp": 1.02199042, + "epoch": 0.8866376070945439, + "flos": 22163168482560.0, + "grad_norm": 2.0671047150936674, + "language_loss": 0.76368475, + "learning_rate": 1.3323255165952873e-07, + "loss": 0.78489769, + "num_input_tokens_seen": 317989125, + "step": 14747, + "time_per_iteration": 2.624581813812256 + }, + { + "auxiliary_loss_clip": 0.01082631, + "auxiliary_loss_mlp": 0.007697, + "balance_loss_clip": 1.03356695, + "balance_loss_mlp": 1.00016332, + "epoch": 0.8866977303472118, + "flos": 20704261685760.0, + "grad_norm": 1.7191098225964694, + "language_loss": 0.82627869, + "learning_rate": 1.3309281706296127e-07, + "loss": 0.84480202, + "num_input_tokens_seen": 318007820, + "step": 14748, + "time_per_iteration": 2.67641282081604 + }, + { + "auxiliary_loss_clip": 0.01099108, + "auxiliary_loss_mlp": 0.01035329, + "balance_loss_clip": 1.03823555, + "balance_loss_mlp": 1.02254343, + "epoch": 0.8867578535998798, + "flos": 48794448533760.0, + "grad_norm": 1.734559291294961, + "language_loss": 0.77452302, + "learning_rate": 1.3295315325970148e-07, + "loss": 0.79586738, + "num_input_tokens_seen": 318030435, + "step": 14749, + "time_per_iteration": 2.84780216217041 + }, + { + "auxiliary_loss_clip": 0.01044507, + "auxiliary_loss_mlp": 0.00770609, + "balance_loss_clip": 1.0361824, + "balance_loss_mlp": 1.0002166, + "epoch": 0.8868179768525477, + "flos": 21105312013440.0, + "grad_norm": 1.9998873550656093, + "language_loss": 0.69549012, + "learning_rate": 1.328135602550451e-07, + "loss": 0.71364129, + "num_input_tokens_seen": 318049465, + "step": 14750, + "time_per_iteration": 2.714163064956665 + }, + { + "auxiliary_loss_clip": 0.01097015, + "auxiliary_loss_mlp": 0.01037086, + "balance_loss_clip": 1.03601015, + "balance_loss_mlp": 1.02531457, + "epoch": 0.8868781001052157, + "flos": 21830922656640.0, + "grad_norm": 1.7739396110359793, + "language_loss": 0.59205437, + "learning_rate": 1.3267403805428546e-07, + "loss": 0.61339533, + "num_input_tokens_seen": 318067760, + "step": 14751, + "time_per_iteration": 2.627380609512329 + }, + { + "auxiliary_loss_clip": 0.01109091, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.03742659, + "balance_loss_mlp": 1.01954198, + "epoch": 0.8869382233578836, + "flos": 13516418073600.0, + "grad_norm": 2.24908964745291, + "language_loss": 0.81063259, + "learning_rate": 1.3253458666271344e-07, + "loss": 0.83204532, + "num_input_tokens_seen": 318082785, + "step": 14752, + "time_per_iteration": 2.548123836517334 + }, + { + "auxiliary_loss_clip": 0.01090623, + "auxiliary_loss_mlp": 0.01030836, + "balance_loss_clip": 1.0372467, + "balance_loss_mlp": 1.01752663, + "epoch": 0.8869983466105517, + "flos": 22704988210560.0, + "grad_norm": 2.2048651718571963, + "language_loss": 0.80242121, + "learning_rate": 1.3239520608561793e-07, + "loss": 0.82363582, + "num_input_tokens_seen": 318101925, + "step": 14753, + "time_per_iteration": 2.634328842163086 + }, + { + "auxiliary_loss_clip": 0.01106619, + "auxiliary_loss_mlp": 0.01033372, + "balance_loss_clip": 1.03586936, + "balance_loss_mlp": 1.02094483, + "epoch": 0.8870584698632196, + "flos": 15340751884800.0, + "grad_norm": 1.7988782645876313, + "language_loss": 0.65307128, + "learning_rate": 1.3225589632828248e-07, + "loss": 0.67447126, + "num_input_tokens_seen": 318119945, + "step": 14754, + "time_per_iteration": 2.5431594848632812 + }, + { + "auxiliary_loss_clip": 0.01110421, + "auxiliary_loss_mlp": 0.01031473, + "balance_loss_clip": 1.03804612, + "balance_loss_mlp": 1.01891458, + "epoch": 0.8871185931158876, + "flos": 26615624699520.0, + "grad_norm": 2.066769305763262, + "language_loss": 0.7433095, + "learning_rate": 1.3211665739599065e-07, + "loss": 0.76472843, + "num_input_tokens_seen": 318139685, + "step": 14755, + "time_per_iteration": 2.5941274166107178 + }, + { + "auxiliary_loss_clip": 0.01084027, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.03191829, + "balance_loss_mlp": 1.01927161, + "epoch": 0.8871787163685555, + "flos": 21799034357760.0, + "grad_norm": 1.4611791846416269, + "language_loss": 0.77831644, + "learning_rate": 1.3197748929402262e-07, + "loss": 0.79948902, + "num_input_tokens_seen": 318160375, + "step": 14756, + "time_per_iteration": 2.7859320640563965 + }, + { + "auxiliary_loss_clip": 0.01089134, + "auxiliary_loss_mlp": 0.01034258, + "balance_loss_clip": 1.0377419, + "balance_loss_mlp": 1.02150822, + "epoch": 0.8872388396212235, + "flos": 14902964922240.0, + "grad_norm": 2.1242136336639414, + "language_loss": 0.76514637, + "learning_rate": 1.3183839202765535e-07, + "loss": 0.78638029, + "num_input_tokens_seen": 318177995, + "step": 14757, + "time_per_iteration": 2.637052059173584 + }, + { + "auxiliary_loss_clip": 0.01048992, + "auxiliary_loss_mlp": 0.01036807, + "balance_loss_clip": 1.03180897, + "balance_loss_mlp": 1.02424812, + "epoch": 0.8872989628738914, + "flos": 26432157006720.0, + "grad_norm": 1.8638847565120873, + "language_loss": 0.68011022, + "learning_rate": 1.316993656021632e-07, + "loss": 0.70096827, + "num_input_tokens_seen": 318197030, + "step": 14758, + "time_per_iteration": 2.852785348892212 + }, + { + "auxiliary_loss_clip": 0.01108807, + "auxiliary_loss_mlp": 0.01035484, + "balance_loss_clip": 1.03694987, + "balance_loss_mlp": 1.02170336, + "epoch": 0.8873590861265594, + "flos": 48142562555520.0, + "grad_norm": 3.8430422864269356, + "language_loss": 0.69252694, + "learning_rate": 1.3156041002281915e-07, + "loss": 0.71396983, + "num_input_tokens_seen": 318221780, + "step": 14759, + "time_per_iteration": 2.795743942260742 + }, + { + "auxiliary_loss_clip": 0.01106874, + "auxiliary_loss_mlp": 0.01033127, + "balance_loss_clip": 1.03578842, + "balance_loss_mlp": 1.0204078, + "epoch": 0.8874192093792275, + "flos": 18332972501760.0, + "grad_norm": 1.7718328909299519, + "language_loss": 0.74552894, + "learning_rate": 1.3142152529489092e-07, + "loss": 0.76692903, + "num_input_tokens_seen": 318239710, + "step": 14760, + "time_per_iteration": 2.5467581748962402 + }, + { + "auxiliary_loss_clip": 0.01090454, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.03724909, + "balance_loss_mlp": 1.02152801, + "epoch": 0.8874793326318954, + "flos": 17894215872000.0, + "grad_norm": 2.98069622772717, + "language_loss": 0.76240933, + "learning_rate": 1.3128271142364565e-07, + "loss": 0.78365493, + "num_input_tokens_seen": 318257425, + "step": 14761, + "time_per_iteration": 2.641578197479248 + }, + { + "auxiliary_loss_clip": 0.01110247, + "auxiliary_loss_mlp": 0.01036705, + "balance_loss_clip": 1.03677571, + "balance_loss_mlp": 1.02415276, + "epoch": 0.8875394558845634, + "flos": 31102231772160.0, + "grad_norm": 1.7387210735055314, + "language_loss": 0.61797994, + "learning_rate": 1.3114396841434717e-07, + "loss": 0.63944948, + "num_input_tokens_seen": 318278485, + "step": 14762, + "time_per_iteration": 2.6031076908111572 + }, + { + "auxiliary_loss_clip": 0.0109514, + "auxiliary_loss_mlp": 0.01034017, + "balance_loss_clip": 1.03448653, + "balance_loss_mlp": 1.02041471, + "epoch": 0.8875995791372313, + "flos": 21142048648320.0, + "grad_norm": 1.7775042808478863, + "language_loss": 0.63881463, + "learning_rate": 1.3100529627225697e-07, + "loss": 0.66010618, + "num_input_tokens_seen": 318297560, + "step": 14763, + "time_per_iteration": 2.5757639408111572 + }, + { + "auxiliary_loss_clip": 0.01082921, + "auxiliary_loss_mlp": 0.00770724, + "balance_loss_clip": 1.03658664, + "balance_loss_mlp": 1.00031114, + "epoch": 0.8876597023898993, + "flos": 17455136019840.0, + "grad_norm": 2.009886034280031, + "language_loss": 0.71068102, + "learning_rate": 1.3086669500263335e-07, + "loss": 0.72921747, + "num_input_tokens_seen": 318313060, + "step": 14764, + "time_per_iteration": 2.6084272861480713 + }, + { + "auxiliary_loss_clip": 0.01113096, + "auxiliary_loss_mlp": 0.01036581, + "balance_loss_clip": 1.03770447, + "balance_loss_mlp": 1.02350986, + "epoch": 0.8877198256425672, + "flos": 22707933125760.0, + "grad_norm": 2.026362447668411, + "language_loss": 0.66558039, + "learning_rate": 1.3072816461073166e-07, + "loss": 0.68707716, + "num_input_tokens_seen": 318332030, + "step": 14765, + "time_per_iteration": 2.547609806060791 + }, + { + "auxiliary_loss_clip": 0.01068364, + "auxiliary_loss_mlp": 0.01027425, + "balance_loss_clip": 1.0361414, + "balance_loss_mlp": 1.01615393, + "epoch": 0.8877799488952353, + "flos": 24535104111360.0, + "grad_norm": 1.776562659783939, + "language_loss": 0.7677201, + "learning_rate": 1.3058970510180568e-07, + "loss": 0.78867799, + "num_input_tokens_seen": 318351090, + "step": 14766, + "time_per_iteration": 2.6800858974456787 + }, + { + "auxiliary_loss_clip": 0.01076267, + "auxiliary_loss_mlp": 0.01031608, + "balance_loss_clip": 1.03301024, + "balance_loss_mlp": 1.01951444, + "epoch": 0.8878400721479032, + "flos": 20959191486720.0, + "grad_norm": 1.9295075111293745, + "language_loss": 0.73348194, + "learning_rate": 1.3045131648110496e-07, + "loss": 0.75456071, + "num_input_tokens_seen": 318372000, + "step": 14767, + "time_per_iteration": 2.605175256729126 + }, + { + "auxiliary_loss_clip": 0.0110506, + "auxiliary_loss_mlp": 0.0103329, + "balance_loss_clip": 1.03636575, + "balance_loss_mlp": 1.02166677, + "epoch": 0.8879001954005712, + "flos": 25295260659840.0, + "grad_norm": 1.7081054740283463, + "language_loss": 0.70993221, + "learning_rate": 1.303129987538778e-07, + "loss": 0.73131573, + "num_input_tokens_seen": 318391530, + "step": 14768, + "time_per_iteration": 2.5900521278381348 + }, + { + "auxiliary_loss_clip": 0.01093069, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.03431153, + "balance_loss_mlp": 1.01872909, + "epoch": 0.8879603186532391, + "flos": 23185329811200.0, + "grad_norm": 1.9932230097119548, + "language_loss": 0.70054102, + "learning_rate": 1.3017475192536932e-07, + "loss": 0.72178221, + "num_input_tokens_seen": 318410690, + "step": 14769, + "time_per_iteration": 2.5676157474517822 + }, + { + "auxiliary_loss_clip": 0.01080083, + "auxiliary_loss_mlp": 0.01031718, + "balance_loss_clip": 1.03361869, + "balance_loss_mlp": 1.01996374, + "epoch": 0.8880204419059071, + "flos": 13655427707520.0, + "grad_norm": 2.022851777751632, + "language_loss": 0.67168438, + "learning_rate": 1.3003657600082174e-07, + "loss": 0.69280243, + "num_input_tokens_seen": 318427380, + "step": 14770, + "time_per_iteration": 2.6081535816192627 + }, + { + "auxiliary_loss_clip": 0.01094329, + "auxiliary_loss_mlp": 0.0103224, + "balance_loss_clip": 1.03698134, + "balance_loss_mlp": 1.01974094, + "epoch": 0.888080565158575, + "flos": 20631865824000.0, + "grad_norm": 1.758420734888644, + "language_loss": 0.65032512, + "learning_rate": 1.2989847098547424e-07, + "loss": 0.67159081, + "num_input_tokens_seen": 318448530, + "step": 14771, + "time_per_iteration": 2.6046431064605713 + }, + { + "auxiliary_loss_clip": 0.01084735, + "auxiliary_loss_mlp": 0.01028682, + "balance_loss_clip": 1.03336012, + "balance_loss_mlp": 1.01646304, + "epoch": 0.888140688411243, + "flos": 28620014411520.0, + "grad_norm": 1.5574826798475248, + "language_loss": 0.82247543, + "learning_rate": 1.2976043688456396e-07, + "loss": 0.84360957, + "num_input_tokens_seen": 318468655, + "step": 14772, + "time_per_iteration": 2.7616019248962402 + }, + { + "auxiliary_loss_clip": 0.01079313, + "auxiliary_loss_mlp": 0.01024388, + "balance_loss_clip": 1.03151107, + "balance_loss_mlp": 1.01318812, + "epoch": 0.8882008116639111, + "flos": 25520241496320.0, + "grad_norm": 1.4234953861106903, + "language_loss": 0.76511365, + "learning_rate": 1.296224737033258e-07, + "loss": 0.78615069, + "num_input_tokens_seen": 318488740, + "step": 14773, + "time_per_iteration": 6.201860427856445 + }, + { + "auxiliary_loss_clip": 0.01083069, + "auxiliary_loss_mlp": 0.01026892, + "balance_loss_clip": 1.03498697, + "balance_loss_mlp": 1.01539993, + "epoch": 0.888260934916579, + "flos": 27673696650240.0, + "grad_norm": 1.9867965850384985, + "language_loss": 0.75016356, + "learning_rate": 1.294845814469907e-07, + "loss": 0.77126318, + "num_input_tokens_seen": 318508810, + "step": 14774, + "time_per_iteration": 2.675410270690918 + }, + { + "auxiliary_loss_clip": 0.0106342, + "auxiliary_loss_mlp": 0.00770109, + "balance_loss_clip": 1.03600156, + "balance_loss_mlp": 1.0002929, + "epoch": 0.888321058169247, + "flos": 21611077464960.0, + "grad_norm": 2.763852995715363, + "language_loss": 0.72647572, + "learning_rate": 1.2934676012078783e-07, + "loss": 0.74481106, + "num_input_tokens_seen": 318526860, + "step": 14775, + "time_per_iteration": 2.768602132797241 + }, + { + "auxiliary_loss_clip": 0.01106903, + "auxiliary_loss_mlp": 0.01032645, + "balance_loss_clip": 1.03619862, + "balance_loss_mlp": 1.02074754, + "epoch": 0.8883811814219149, + "flos": 18149109759360.0, + "grad_norm": 1.6801831073555262, + "language_loss": 0.79828447, + "learning_rate": 1.292090097299432e-07, + "loss": 0.81967992, + "num_input_tokens_seen": 318545180, + "step": 14776, + "time_per_iteration": 5.694887399673462 + }, + { + "auxiliary_loss_clip": 0.01103137, + "auxiliary_loss_mlp": 0.01037272, + "balance_loss_clip": 1.03596711, + "balance_loss_mlp": 1.02439141, + "epoch": 0.8884413046745829, + "flos": 28324648874880.0, + "grad_norm": 2.2946403260680746, + "language_loss": 0.69125223, + "learning_rate": 1.290713302796802e-07, + "loss": 0.71265632, + "num_input_tokens_seen": 318564350, + "step": 14777, + "time_per_iteration": 2.6711583137512207 + }, + { + "auxiliary_loss_clip": 0.01091804, + "auxiliary_loss_mlp": 0.01033294, + "balance_loss_clip": 1.0316937, + "balance_loss_mlp": 1.0206517, + "epoch": 0.8885014279272508, + "flos": 15158756649600.0, + "grad_norm": 1.744756226696034, + "language_loss": 0.71044743, + "learning_rate": 1.2893372177522e-07, + "loss": 0.73169839, + "num_input_tokens_seen": 318582275, + "step": 14778, + "time_per_iteration": 2.5861656665802 + }, + { + "auxiliary_loss_clip": 0.01107976, + "auxiliary_loss_mlp": 0.01029742, + "balance_loss_clip": 1.035954, + "balance_loss_mlp": 1.01773167, + "epoch": 0.8885615511799189, + "flos": 19099593498240.0, + "grad_norm": 3.336773779105202, + "language_loss": 0.77618229, + "learning_rate": 1.287961842217804e-07, + "loss": 0.79755944, + "num_input_tokens_seen": 318601230, + "step": 14779, + "time_per_iteration": 2.5533976554870605 + }, + { + "auxiliary_loss_clip": 0.01005115, + "auxiliary_loss_mlp": 0.00999201, + "balance_loss_clip": 1.00668931, + "balance_loss_mlp": 0.99814647, + "epoch": 0.8886216744325868, + "flos": 51186567605760.0, + "grad_norm": 0.8737021693090686, + "language_loss": 0.56777793, + "learning_rate": 1.2865871762457747e-07, + "loss": 0.58782107, + "num_input_tokens_seen": 318645595, + "step": 14780, + "time_per_iteration": 2.964052438735962 + }, + { + "auxiliary_loss_clip": 0.01028008, + "auxiliary_loss_mlp": 0.01000581, + "balance_loss_clip": 1.00549233, + "balance_loss_mlp": 0.99967527, + "epoch": 0.8886817976852548, + "flos": 61612981263360.0, + "grad_norm": 0.7941416089367529, + "language_loss": 0.62353128, + "learning_rate": 1.2852132198882326e-07, + "loss": 0.64381719, + "num_input_tokens_seen": 318707850, + "step": 14781, + "time_per_iteration": 3.181043863296509 + }, + { + "auxiliary_loss_clip": 0.00963643, + "auxiliary_loss_mlp": 0.01006454, + "balance_loss_clip": 1.01169443, + "balance_loss_mlp": 1.00542259, + "epoch": 0.8887419209379227, + "flos": 60646946935680.0, + "grad_norm": 0.7977280372936163, + "language_loss": 0.58126575, + "learning_rate": 1.2838399731972805e-07, + "loss": 0.60096675, + "num_input_tokens_seen": 318764915, + "step": 14782, + "time_per_iteration": 3.2847399711608887 + }, + { + "auxiliary_loss_clip": 0.01106535, + "auxiliary_loss_mlp": 0.01029737, + "balance_loss_clip": 1.03703415, + "balance_loss_mlp": 1.01808405, + "epoch": 0.8888020441905907, + "flos": 29205861235200.0, + "grad_norm": 1.567088080659984, + "language_loss": 0.65746784, + "learning_rate": 1.2824674362249922e-07, + "loss": 0.67883062, + "num_input_tokens_seen": 318785660, + "step": 14783, + "time_per_iteration": 2.841909646987915 + }, + { + "auxiliary_loss_clip": 0.0111198, + "auxiliary_loss_mlp": 0.01035797, + "balance_loss_clip": 1.03731346, + "balance_loss_mlp": 1.02278554, + "epoch": 0.8888621674432586, + "flos": 22162701605760.0, + "grad_norm": 1.5542908685815622, + "language_loss": 0.77494425, + "learning_rate": 1.281095609023415e-07, + "loss": 0.796422, + "num_input_tokens_seen": 318806080, + "step": 14784, + "time_per_iteration": 2.597027540206909 + }, + { + "auxiliary_loss_clip": 0.01083474, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.03568983, + "balance_loss_mlp": 1.02146983, + "epoch": 0.8889222906959267, + "flos": 27672834723840.0, + "grad_norm": 15.751964718050344, + "language_loss": 0.6070298, + "learning_rate": 1.279724491644565e-07, + "loss": 0.6282084, + "num_input_tokens_seen": 318826445, + "step": 14785, + "time_per_iteration": 2.7380104064941406 + }, + { + "auxiliary_loss_clip": 0.01073801, + "auxiliary_loss_mlp": 0.01035055, + "balance_loss_clip": 1.03463125, + "balance_loss_mlp": 1.02198935, + "epoch": 0.8889824139485947, + "flos": 14168627274240.0, + "grad_norm": 1.8296614273320466, + "language_loss": 0.65093189, + "learning_rate": 1.278354084140445e-07, + "loss": 0.67202044, + "num_input_tokens_seen": 318843915, + "step": 14786, + "time_per_iteration": 2.774667978286743 + }, + { + "auxiliary_loss_clip": 0.01076771, + "auxiliary_loss_mlp": 0.00771472, + "balance_loss_clip": 1.03597903, + "balance_loss_mlp": 1.00018907, + "epoch": 0.8890425372012626, + "flos": 12853003829760.0, + "grad_norm": 2.7089037879672624, + "language_loss": 0.85490113, + "learning_rate": 1.276984386563009e-07, + "loss": 0.87338352, + "num_input_tokens_seen": 318859670, + "step": 14787, + "time_per_iteration": 2.6649672985076904 + }, + { + "auxiliary_loss_clip": 0.01084573, + "auxiliary_loss_mlp": 0.01030017, + "balance_loss_clip": 1.03664386, + "balance_loss_mlp": 1.01775646, + "epoch": 0.8891026604539306, + "flos": 21689291329920.0, + "grad_norm": 2.1717922675442094, + "language_loss": 0.70967633, + "learning_rate": 1.2756153989642027e-07, + "loss": 0.73082221, + "num_input_tokens_seen": 318877855, + "step": 14788, + "time_per_iteration": 2.832113027572632 + }, + { + "auxiliary_loss_clip": 0.01105551, + "auxiliary_loss_mlp": 0.01029542, + "balance_loss_clip": 1.03675187, + "balance_loss_mlp": 1.01768684, + "epoch": 0.8891627837065985, + "flos": 21871430219520.0, + "grad_norm": 1.719821366869133, + "language_loss": 0.69946039, + "learning_rate": 1.274247121395935e-07, + "loss": 0.72081137, + "num_input_tokens_seen": 318896045, + "step": 14789, + "time_per_iteration": 2.6089062690734863 + }, + { + "auxiliary_loss_clip": 0.01100862, + "auxiliary_loss_mlp": 0.01030753, + "balance_loss_clip": 1.03966713, + "balance_loss_mlp": 1.01853967, + "epoch": 0.8892229069592665, + "flos": 21580230660480.0, + "grad_norm": 1.4843336736816757, + "language_loss": 0.70594078, + "learning_rate": 1.2728795539100956e-07, + "loss": 0.72725689, + "num_input_tokens_seen": 318915515, + "step": 14790, + "time_per_iteration": 2.6216959953308105 + }, + { + "auxiliary_loss_clip": 0.01088486, + "auxiliary_loss_mlp": 0.01027627, + "balance_loss_clip": 1.03701544, + "balance_loss_mlp": 1.01623666, + "epoch": 0.8892830302119344, + "flos": 23075981832960.0, + "grad_norm": 1.8356781695474516, + "language_loss": 0.72947121, + "learning_rate": 1.2715126965585387e-07, + "loss": 0.75063235, + "num_input_tokens_seen": 318934305, + "step": 14791, + "time_per_iteration": 2.7145907878875732 + }, + { + "auxiliary_loss_clip": 0.01078142, + "auxiliary_loss_mlp": 0.01033106, + "balance_loss_clip": 1.03699768, + "balance_loss_mlp": 1.02080894, + "epoch": 0.8893431534646025, + "flos": 23072139077760.0, + "grad_norm": 1.7972192952628998, + "language_loss": 0.74159795, + "learning_rate": 1.2701465493931008e-07, + "loss": 0.76271045, + "num_input_tokens_seen": 318953880, + "step": 14792, + "time_per_iteration": 2.689258575439453 + }, + { + "auxiliary_loss_clip": 0.01041593, + "auxiliary_loss_mlp": 0.01037472, + "balance_loss_clip": 1.03244281, + "balance_loss_mlp": 1.02338743, + "epoch": 0.8894032767172704, + "flos": 22454978572800.0, + "grad_norm": 1.9651444821716726, + "language_loss": 0.66043746, + "learning_rate": 1.2687811124655801e-07, + "loss": 0.68122816, + "num_input_tokens_seen": 318971395, + "step": 14793, + "time_per_iteration": 2.73183012008667 + }, + { + "auxiliary_loss_clip": 0.01079264, + "auxiliary_loss_mlp": 0.01031003, + "balance_loss_clip": 1.03588605, + "balance_loss_mlp": 1.01774693, + "epoch": 0.8894633999699384, + "flos": 25338246261120.0, + "grad_norm": 1.671450826366533, + "language_loss": 0.71594059, + "learning_rate": 1.2674163858277552e-07, + "loss": 0.73704326, + "num_input_tokens_seen": 318990580, + "step": 14794, + "time_per_iteration": 2.7042224407196045 + }, + { + "auxiliary_loss_clip": 0.01099154, + "auxiliary_loss_mlp": 0.01034078, + "balance_loss_clip": 1.03871417, + "balance_loss_mlp": 1.02107775, + "epoch": 0.8895235232226063, + "flos": 20994096528000.0, + "grad_norm": 1.7160866842792333, + "language_loss": 0.75350553, + "learning_rate": 1.2660523695313785e-07, + "loss": 0.77483785, + "num_input_tokens_seen": 319010040, + "step": 14795, + "time_per_iteration": 2.5956714153289795 + }, + { + "auxiliary_loss_clip": 0.01003077, + "auxiliary_loss_mlp": 0.00999947, + "balance_loss_clip": 1.00997567, + "balance_loss_mlp": 0.99892819, + "epoch": 0.8895836464752743, + "flos": 69732956764800.0, + "grad_norm": 0.7671992564200865, + "language_loss": 0.56051087, + "learning_rate": 1.2646890636281727e-07, + "loss": 0.58054101, + "num_input_tokens_seen": 319063860, + "step": 14796, + "time_per_iteration": 3.078346014022827 + }, + { + "auxiliary_loss_clip": 0.01111208, + "auxiliary_loss_mlp": 0.01030403, + "balance_loss_clip": 1.03733194, + "balance_loss_mlp": 1.01666403, + "epoch": 0.8896437697279422, + "flos": 23221815050880.0, + "grad_norm": 1.7603233439489925, + "language_loss": 0.70576537, + "learning_rate": 1.263326468169843e-07, + "loss": 0.72718143, + "num_input_tokens_seen": 319082335, + "step": 14797, + "time_per_iteration": 2.576277017593384 + }, + { + "auxiliary_loss_clip": 0.01017004, + "auxiliary_loss_mlp": 0.01002028, + "balance_loss_clip": 1.01229072, + "balance_loss_mlp": 1.00102699, + "epoch": 0.8897038929806103, + "flos": 70752711882240.0, + "grad_norm": 0.7794431422590221, + "language_loss": 0.5794524, + "learning_rate": 1.2619645832080417e-07, + "loss": 0.59964275, + "num_input_tokens_seen": 319147075, + "step": 14798, + "time_per_iteration": 3.218555212020874 + }, + { + "auxiliary_loss_clip": 0.01097846, + "auxiliary_loss_mlp": 0.01029628, + "balance_loss_clip": 1.03578901, + "balance_loss_mlp": 1.01621103, + "epoch": 0.8897640162332782, + "flos": 19245103493760.0, + "grad_norm": 1.822201303291812, + "language_loss": 0.7947073, + "learning_rate": 1.2606034087944251e-07, + "loss": 0.81598198, + "num_input_tokens_seen": 319166630, + "step": 14799, + "time_per_iteration": 2.6169159412384033 + }, + { + "auxiliary_loss_clip": 0.01018703, + "auxiliary_loss_mlp": 0.01003426, + "balance_loss_clip": 1.0060674, + "balance_loss_mlp": 1.00247824, + "epoch": 0.8898241394859462, + "flos": 41356275039360.0, + "grad_norm": 0.8879772067683966, + "language_loss": 0.58123994, + "learning_rate": 1.2592429449806053e-07, + "loss": 0.60146117, + "num_input_tokens_seen": 319221865, + "step": 14800, + "time_per_iteration": 3.090841054916382 + }, + { + "auxiliary_loss_clip": 0.01099994, + "auxiliary_loss_mlp": 0.01034134, + "balance_loss_clip": 1.03885424, + "balance_loss_mlp": 1.02245724, + "epoch": 0.8898842627386142, + "flos": 18986295024000.0, + "grad_norm": 1.5949008184751121, + "language_loss": 0.66234601, + "learning_rate": 1.2578831918181698e-07, + "loss": 0.68368721, + "num_input_tokens_seen": 319240710, + "step": 14801, + "time_per_iteration": 2.5842556953430176 + }, + { + "auxiliary_loss_clip": 0.01073781, + "auxiliary_loss_mlp": 0.01036561, + "balance_loss_clip": 1.03613853, + "balance_loss_mlp": 1.02248251, + "epoch": 0.8899443859912821, + "flos": 13217173868160.0, + "grad_norm": 2.7903408199323496, + "language_loss": 0.7563743, + "learning_rate": 1.256524149358682e-07, + "loss": 0.77747774, + "num_input_tokens_seen": 319256495, + "step": 14802, + "time_per_iteration": 2.6613779067993164 + }, + { + "auxiliary_loss_clip": 0.01091905, + "auxiliary_loss_mlp": 0.01030893, + "balance_loss_clip": 1.03725505, + "balance_loss_mlp": 1.01900768, + "epoch": 0.8900045092439501, + "flos": 22674680110080.0, + "grad_norm": 1.8379867635089826, + "language_loss": 0.73482311, + "learning_rate": 1.2551658176536805e-07, + "loss": 0.75605106, + "num_input_tokens_seen": 319273620, + "step": 14803, + "time_per_iteration": 2.675278425216675 + }, + { + "auxiliary_loss_clip": 0.01081084, + "auxiliary_loss_mlp": 0.01036017, + "balance_loss_clip": 1.03560674, + "balance_loss_mlp": 1.02347028, + "epoch": 0.890064632496618, + "flos": 21141617685120.0, + "grad_norm": 1.8905881524985035, + "language_loss": 0.71867836, + "learning_rate": 1.2538081967546664e-07, + "loss": 0.73984939, + "num_input_tokens_seen": 319291720, + "step": 14804, + "time_per_iteration": 2.637640953063965 + }, + { + "auxiliary_loss_clip": 0.01093595, + "auxiliary_loss_mlp": 0.0103033, + "balance_loss_clip": 1.03525758, + "balance_loss_mlp": 1.01756275, + "epoch": 0.8901247557492861, + "flos": 23397058529280.0, + "grad_norm": 1.8040298487747064, + "language_loss": 0.81148362, + "learning_rate": 1.252451286713123e-07, + "loss": 0.8327229, + "num_input_tokens_seen": 319310380, + "step": 14805, + "time_per_iteration": 2.6288270950317383 + }, + { + "auxiliary_loss_clip": 0.01100196, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.03652012, + "balance_loss_mlp": 1.01704848, + "epoch": 0.890184879001954, + "flos": 29169591477120.0, + "grad_norm": 2.314720607634321, + "language_loss": 0.67655379, + "learning_rate": 1.251095087580505e-07, + "loss": 0.69785488, + "num_input_tokens_seen": 319331765, + "step": 14806, + "time_per_iteration": 2.701447010040283 + }, + { + "auxiliary_loss_clip": 0.01082875, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.03327703, + "balance_loss_mlp": 1.0191853, + "epoch": 0.890245002254622, + "flos": 14427830793600.0, + "grad_norm": 1.860806449184193, + "language_loss": 0.6715759, + "learning_rate": 1.2497395994082438e-07, + "loss": 0.6927194, + "num_input_tokens_seen": 319349135, + "step": 14807, + "time_per_iteration": 2.6722195148468018 + }, + { + "auxiliary_loss_clip": 0.01082528, + "auxiliary_loss_mlp": 0.01030238, + "balance_loss_clip": 1.03432024, + "balance_loss_mlp": 1.01869881, + "epoch": 0.8903051255072899, + "flos": 22382187661440.0, + "grad_norm": 1.7718809355965226, + "language_loss": 0.75224829, + "learning_rate": 1.248384822247732e-07, + "loss": 0.77337593, + "num_input_tokens_seen": 319368410, + "step": 14808, + "time_per_iteration": 2.640336036682129 + }, + { + "auxiliary_loss_clip": 0.0107632, + "auxiliary_loss_mlp": 0.01030363, + "balance_loss_clip": 1.0358547, + "balance_loss_mlp": 1.01811981, + "epoch": 0.8903652487599579, + "flos": 20777375819520.0, + "grad_norm": 12.27562140526227, + "language_loss": 0.81525707, + "learning_rate": 1.2470307561503513e-07, + "loss": 0.83632386, + "num_input_tokens_seen": 319387535, + "step": 14809, + "time_per_iteration": 2.6590049266815186 + }, + { + "auxiliary_loss_clip": 0.01099147, + "auxiliary_loss_mlp": 0.0103237, + "balance_loss_clip": 1.03634763, + "balance_loss_mlp": 1.02048481, + "epoch": 0.8904253720126258, + "flos": 24424499157120.0, + "grad_norm": 2.1293350080998747, + "language_loss": 0.68579054, + "learning_rate": 1.2456774011674442e-07, + "loss": 0.70710576, + "num_input_tokens_seen": 319407210, + "step": 14810, + "time_per_iteration": 2.601858139038086 + }, + { + "auxiliary_loss_clip": 0.01074787, + "auxiliary_loss_mlp": 0.01028878, + "balance_loss_clip": 1.03349328, + "balance_loss_mlp": 1.01603925, + "epoch": 0.8904854952652939, + "flos": 19463871277440.0, + "grad_norm": 2.1159045694256124, + "language_loss": 0.70389724, + "learning_rate": 1.2443247573503257e-07, + "loss": 0.72493392, + "num_input_tokens_seen": 319425340, + "step": 14811, + "time_per_iteration": 2.652963876724243 + }, + { + "auxiliary_loss_clip": 0.01077147, + "auxiliary_loss_mlp": 0.00770711, + "balance_loss_clip": 1.03590763, + "balance_loss_mlp": 1.00018835, + "epoch": 0.8905456185179618, + "flos": 50800741666560.0, + "grad_norm": 2.4983735528249182, + "language_loss": 0.66081208, + "learning_rate": 1.2429728247502924e-07, + "loss": 0.67929065, + "num_input_tokens_seen": 319448150, + "step": 14812, + "time_per_iteration": 4.636792182922363 + }, + { + "auxiliary_loss_clip": 0.01060766, + "auxiliary_loss_mlp": 0.01031191, + "balance_loss_clip": 1.03516841, + "balance_loss_mlp": 1.01957428, + "epoch": 0.8906057417706298, + "flos": 17784867893760.0, + "grad_norm": 1.7995676770850613, + "language_loss": 0.68747163, + "learning_rate": 1.24162160341861e-07, + "loss": 0.70839119, + "num_input_tokens_seen": 319466115, + "step": 14813, + "time_per_iteration": 4.193687200546265 + }, + { + "auxiliary_loss_clip": 0.01084515, + "auxiliary_loss_mlp": 0.01040982, + "balance_loss_clip": 1.03238082, + "balance_loss_mlp": 1.02447116, + "epoch": 0.8906658650232978, + "flos": 21944867575680.0, + "grad_norm": 3.876084846753058, + "language_loss": 0.75659066, + "learning_rate": 1.2402710934065198e-07, + "loss": 0.77784562, + "num_input_tokens_seen": 319485255, + "step": 14814, + "time_per_iteration": 2.6463520526885986 + }, + { + "auxiliary_loss_clip": 0.01100125, + "auxiliary_loss_mlp": 0.01030982, + "balance_loss_clip": 1.03604758, + "balance_loss_mlp": 1.01783299, + "epoch": 0.8907259882759657, + "flos": 21287810039040.0, + "grad_norm": 2.0688857636131734, + "language_loss": 0.74374747, + "learning_rate": 1.2389212947652229e-07, + "loss": 0.76505852, + "num_input_tokens_seen": 319501800, + "step": 14815, + "time_per_iteration": 4.110440492630005 + }, + { + "auxiliary_loss_clip": 0.01068212, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.03145206, + "balance_loss_mlp": 1.01870036, + "epoch": 0.8907861115286337, + "flos": 20120426023680.0, + "grad_norm": 2.123609537525354, + "language_loss": 0.75087738, + "learning_rate": 1.237572207545914e-07, + "loss": 0.77187324, + "num_input_tokens_seen": 319520415, + "step": 14816, + "time_per_iteration": 4.275893926620483 + }, + { + "auxiliary_loss_clip": 0.01086936, + "auxiliary_loss_mlp": 0.01031641, + "balance_loss_clip": 1.03456652, + "balance_loss_mlp": 1.01913631, + "epoch": 0.8908462347813016, + "flos": 20084156265600.0, + "grad_norm": 1.7646380277651805, + "language_loss": 0.77968502, + "learning_rate": 1.2362238317997476e-07, + "loss": 0.80087078, + "num_input_tokens_seen": 319538410, + "step": 14817, + "time_per_iteration": 2.694972515106201 + }, + { + "auxiliary_loss_clip": 0.01001525, + "auxiliary_loss_mlp": 0.01001251, + "balance_loss_clip": 1.00726986, + "balance_loss_mlp": 1.00008297, + "epoch": 0.8909063580339697, + "flos": 65503649790720.0, + "grad_norm": 0.7456782467309502, + "language_loss": 0.56431699, + "learning_rate": 1.2348761675778517e-07, + "loss": 0.58434474, + "num_input_tokens_seen": 319602565, + "step": 14818, + "time_per_iteration": 3.234703540802002 + }, + { + "auxiliary_loss_clip": 0.01059509, + "auxiliary_loss_mlp": 0.01034162, + "balance_loss_clip": 1.0355022, + "balance_loss_mlp": 1.02152014, + "epoch": 0.8909664812866376, + "flos": 29863062426240.0, + "grad_norm": 1.7646144877343908, + "language_loss": 0.64705229, + "learning_rate": 1.2335292149313325e-07, + "loss": 0.66798902, + "num_input_tokens_seen": 319624645, + "step": 14819, + "time_per_iteration": 2.7950870990753174 + }, + { + "auxiliary_loss_clip": 0.01097653, + "auxiliary_loss_mlp": 0.01030768, + "balance_loss_clip": 1.03588057, + "balance_loss_mlp": 1.01794112, + "epoch": 0.8910266045393056, + "flos": 25447127362560.0, + "grad_norm": 2.2154062344071312, + "language_loss": 0.78340304, + "learning_rate": 1.2321829739112731e-07, + "loss": 0.80468726, + "num_input_tokens_seen": 319644040, + "step": 14820, + "time_per_iteration": 2.6286323070526123 + }, + { + "auxiliary_loss_clip": 0.01070015, + "auxiliary_loss_mlp": 0.00769687, + "balance_loss_clip": 1.03580856, + "balance_loss_mlp": 1.00026464, + "epoch": 0.8910867277919735, + "flos": 24499121662080.0, + "grad_norm": 1.856207333364825, + "language_loss": 0.76575708, + "learning_rate": 1.2308374445687087e-07, + "loss": 0.78415406, + "num_input_tokens_seen": 319663930, + "step": 14821, + "time_per_iteration": 2.710040330886841 + }, + { + "auxiliary_loss_clip": 0.01014485, + "auxiliary_loss_mlp": 0.00751361, + "balance_loss_clip": 1.00564671, + "balance_loss_mlp": 0.99960148, + "epoch": 0.8911468510446415, + "flos": 60688136856960.0, + "grad_norm": 0.7925502121917717, + "language_loss": 0.59283942, + "learning_rate": 1.2294926269546712e-07, + "loss": 0.61049783, + "num_input_tokens_seen": 319721245, + "step": 14822, + "time_per_iteration": 3.042881727218628 + }, + { + "auxiliary_loss_clip": 0.0109278, + "auxiliary_loss_mlp": 0.0103595, + "balance_loss_clip": 1.03620601, + "balance_loss_mlp": 1.02346885, + "epoch": 0.8912069742973094, + "flos": 25337492075520.0, + "grad_norm": 2.0091476458751845, + "language_loss": 0.69135273, + "learning_rate": 1.2281485211201515e-07, + "loss": 0.71263999, + "num_input_tokens_seen": 319741200, + "step": 14823, + "time_per_iteration": 2.6208603382110596 + }, + { + "auxiliary_loss_clip": 0.01089302, + "auxiliary_loss_mlp": 0.01035298, + "balance_loss_clip": 1.03342748, + "balance_loss_mlp": 1.02241755, + "epoch": 0.8912670975499775, + "flos": 18223516782720.0, + "grad_norm": 1.5978850394355568, + "language_loss": 0.69198072, + "learning_rate": 1.2268051271161262e-07, + "loss": 0.71322668, + "num_input_tokens_seen": 319759265, + "step": 14824, + "time_per_iteration": 2.508863687515259 + }, + { + "auxiliary_loss_clip": 0.01058099, + "auxiliary_loss_mlp": 0.0103706, + "balance_loss_clip": 1.03319716, + "balance_loss_mlp": 1.02307105, + "epoch": 0.8913272208026454, + "flos": 26504481041280.0, + "grad_norm": 1.9932021748393736, + "language_loss": 0.70705098, + "learning_rate": 1.2254624449935303e-07, + "loss": 0.72800255, + "num_input_tokens_seen": 319777560, + "step": 14825, + "time_per_iteration": 2.654224157333374 + }, + { + "auxiliary_loss_clip": 0.01085791, + "auxiliary_loss_mlp": 0.01032779, + "balance_loss_clip": 1.03422439, + "balance_loss_mlp": 1.01951671, + "epoch": 0.8913873440553134, + "flos": 18802324540800.0, + "grad_norm": 1.827676363511503, + "language_loss": 0.71464586, + "learning_rate": 1.2241204748032786e-07, + "loss": 0.7358315, + "num_input_tokens_seen": 319794125, + "step": 14826, + "time_per_iteration": 2.5119738578796387 + }, + { + "auxiliary_loss_clip": 0.0109572, + "auxiliary_loss_mlp": 0.01028162, + "balance_loss_clip": 1.03623497, + "balance_loss_mlp": 1.01646793, + "epoch": 0.8914474673079814, + "flos": 20884892204160.0, + "grad_norm": 2.0101591277509243, + "language_loss": 0.75315851, + "learning_rate": 1.2227792165962615e-07, + "loss": 0.77439737, + "num_input_tokens_seen": 319810310, + "step": 14827, + "time_per_iteration": 2.4767954349517822 + }, + { + "auxiliary_loss_clip": 0.01100376, + "auxiliary_loss_mlp": 0.0103277, + "balance_loss_clip": 1.03736746, + "balance_loss_mlp": 1.02037859, + "epoch": 0.8915075905606493, + "flos": 20952439729920.0, + "grad_norm": 2.25546419546836, + "language_loss": 0.78480828, + "learning_rate": 1.221438670423336e-07, + "loss": 0.80613977, + "num_input_tokens_seen": 319828505, + "step": 14828, + "time_per_iteration": 2.4681639671325684 + }, + { + "auxiliary_loss_clip": 0.01068483, + "auxiliary_loss_mlp": 0.01032448, + "balance_loss_clip": 1.0356158, + "balance_loss_mlp": 1.01987755, + "epoch": 0.8915677138133173, + "flos": 23076305055360.0, + "grad_norm": 1.7213243632049227, + "language_loss": 0.75276792, + "learning_rate": 1.2200988363353392e-07, + "loss": 0.77377725, + "num_input_tokens_seen": 319848680, + "step": 14829, + "time_per_iteration": 2.6100480556488037 + }, + { + "auxiliary_loss_clip": 0.01108879, + "auxiliary_loss_mlp": 0.01034466, + "balance_loss_clip": 1.03637552, + "balance_loss_mlp": 1.02299213, + "epoch": 0.8916278370659853, + "flos": 23440259612160.0, + "grad_norm": 1.5513516933839315, + "language_loss": 0.84576946, + "learning_rate": 1.2187597143830773e-07, + "loss": 0.867203, + "num_input_tokens_seen": 319868835, + "step": 14830, + "time_per_iteration": 2.5831005573272705 + }, + { + "auxiliary_loss_clip": 0.01093236, + "auxiliary_loss_mlp": 0.01030146, + "balance_loss_clip": 1.03435206, + "balance_loss_mlp": 1.01864195, + "epoch": 0.8916879603186533, + "flos": 25160488830720.0, + "grad_norm": 1.3477965843038384, + "language_loss": 0.74875772, + "learning_rate": 1.2174213046173299e-07, + "loss": 0.76999158, + "num_input_tokens_seen": 319891585, + "step": 14831, + "time_per_iteration": 2.7232887744903564 + }, + { + "auxiliary_loss_clip": 0.01100471, + "auxiliary_loss_mlp": 0.01029141, + "balance_loss_clip": 1.03624547, + "balance_loss_mlp": 1.01663041, + "epoch": 0.8917480835713212, + "flos": 20229845829120.0, + "grad_norm": 1.8265908258617016, + "language_loss": 0.72934276, + "learning_rate": 1.216083607088847e-07, + "loss": 0.75063884, + "num_input_tokens_seen": 319910315, + "step": 14832, + "time_per_iteration": 2.616689443588257 + }, + { + "auxiliary_loss_clip": 0.01045927, + "auxiliary_loss_mlp": 0.00770458, + "balance_loss_clip": 1.03222537, + "balance_loss_mlp": 1.00019884, + "epoch": 0.8918082068239892, + "flos": 26101922342400.0, + "grad_norm": 3.1162015685797972, + "language_loss": 0.66912735, + "learning_rate": 1.214746621848355e-07, + "loss": 0.68729126, + "num_input_tokens_seen": 319932275, + "step": 14833, + "time_per_iteration": 2.8316352367401123 + }, + { + "auxiliary_loss_clip": 0.01106023, + "auxiliary_loss_mlp": 0.01034649, + "balance_loss_clip": 1.03974128, + "balance_loss_mlp": 1.02139854, + "epoch": 0.8918683300766571, + "flos": 24831439315200.0, + "grad_norm": 1.9997597617659202, + "language_loss": 0.73976004, + "learning_rate": 1.2134103489465575e-07, + "loss": 0.76116675, + "num_input_tokens_seen": 319955335, + "step": 14834, + "time_per_iteration": 2.7026243209838867 + }, + { + "auxiliary_loss_clip": 0.01065475, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.03502977, + "balance_loss_mlp": 1.02165282, + "epoch": 0.8919284533293251, + "flos": 22305158945280.0, + "grad_norm": 1.9340437806838273, + "language_loss": 0.78773081, + "learning_rate": 1.2120747884341188e-07, + "loss": 0.80872452, + "num_input_tokens_seen": 319973990, + "step": 14835, + "time_per_iteration": 2.64371395111084 + }, + { + "auxiliary_loss_clip": 0.01103945, + "auxiliary_loss_mlp": 0.0103108, + "balance_loss_clip": 1.03464007, + "balance_loss_mlp": 1.01960659, + "epoch": 0.891988576581993, + "flos": 30373532559360.0, + "grad_norm": 1.6176322749361627, + "language_loss": 0.74194962, + "learning_rate": 1.210739940361689e-07, + "loss": 0.76329982, + "num_input_tokens_seen": 319995555, + "step": 14836, + "time_per_iteration": 2.6271843910217285 + }, + { + "auxiliary_loss_clip": 0.01087557, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.03471708, + "balance_loss_mlp": 1.01970363, + "epoch": 0.8920486998346611, + "flos": 15552947479680.0, + "grad_norm": 3.2025172292231625, + "language_loss": 0.68644428, + "learning_rate": 1.2094058047798838e-07, + "loss": 0.7076422, + "num_input_tokens_seen": 320012385, + "step": 14837, + "time_per_iteration": 2.612969160079956 + }, + { + "auxiliary_loss_clip": 0.01050841, + "auxiliary_loss_mlp": 0.0103232, + "balance_loss_clip": 1.03323007, + "balance_loss_mlp": 1.01922536, + "epoch": 0.892108823087329, + "flos": 21214983214080.0, + "grad_norm": 1.653711357861068, + "language_loss": 0.67707741, + "learning_rate": 1.2080723817392913e-07, + "loss": 0.697909, + "num_input_tokens_seen": 320032390, + "step": 14838, + "time_per_iteration": 2.7335948944091797 + }, + { + "auxiliary_loss_clip": 0.01096545, + "auxiliary_loss_mlp": 0.01031606, + "balance_loss_clip": 1.03442597, + "balance_loss_mlp": 1.0184747, + "epoch": 0.892168946339997, + "flos": 21978982517760.0, + "grad_norm": 2.2756639024172722, + "language_loss": 0.76234394, + "learning_rate": 1.2067396712904777e-07, + "loss": 0.78362542, + "num_input_tokens_seen": 320052885, + "step": 14839, + "time_per_iteration": 2.6222732067108154 + }, + { + "auxiliary_loss_clip": 0.00999654, + "auxiliary_loss_mlp": 0.00751271, + "balance_loss_clip": 1.00644863, + "balance_loss_mlp": 0.99961644, + "epoch": 0.892229069592665, + "flos": 67475289277440.0, + "grad_norm": 0.6958789552427521, + "language_loss": 0.49386242, + "learning_rate": 1.205407673483978e-07, + "loss": 0.51137161, + "num_input_tokens_seen": 320113685, + "step": 14840, + "time_per_iteration": 3.1971607208251953 + }, + { + "auxiliary_loss_clip": 0.0111346, + "auxiliary_loss_mlp": 0.01031899, + "balance_loss_clip": 1.03685474, + "balance_loss_mlp": 1.01813662, + "epoch": 0.8922891928453329, + "flos": 19459561645440.0, + "grad_norm": 2.2275620590123575, + "language_loss": 0.64040601, + "learning_rate": 1.2040763883703074e-07, + "loss": 0.66185963, + "num_input_tokens_seen": 320130810, + "step": 14841, + "time_per_iteration": 2.5630903244018555 + }, + { + "auxiliary_loss_clip": 0.01073374, + "auxiliary_loss_mlp": 0.00768866, + "balance_loss_clip": 1.03585565, + "balance_loss_mlp": 1.00014949, + "epoch": 0.8923493160980009, + "flos": 23367396873600.0, + "grad_norm": 1.4260666189370539, + "language_loss": 0.68198895, + "learning_rate": 1.2027458159999438e-07, + "loss": 0.70041138, + "num_input_tokens_seen": 320152170, + "step": 14842, + "time_per_iteration": 2.7487709522247314 + }, + { + "auxiliary_loss_clip": 0.01107456, + "auxiliary_loss_mlp": 0.01036165, + "balance_loss_clip": 1.03805566, + "balance_loss_mlp": 1.02464318, + "epoch": 0.8924094393506689, + "flos": 26177047637760.0, + "grad_norm": 2.0828434512728387, + "language_loss": 0.80424309, + "learning_rate": 1.2014159564233373e-07, + "loss": 0.8256793, + "num_input_tokens_seen": 320172360, + "step": 14843, + "time_per_iteration": 2.6367337703704834 + }, + { + "auxiliary_loss_clip": 0.01084909, + "auxiliary_loss_mlp": 0.01033483, + "balance_loss_clip": 1.03361225, + "balance_loss_mlp": 1.01991701, + "epoch": 0.8924695626033369, + "flos": 22018520413440.0, + "grad_norm": 2.382089830168308, + "language_loss": 0.68838096, + "learning_rate": 1.2000868096909257e-07, + "loss": 0.70956492, + "num_input_tokens_seen": 320192130, + "step": 14844, + "time_per_iteration": 2.6400132179260254 + }, + { + "auxiliary_loss_clip": 0.01064131, + "auxiliary_loss_mlp": 0.0103004, + "balance_loss_clip": 1.03404808, + "balance_loss_mlp": 1.01779175, + "epoch": 0.8925296858560048, + "flos": 14793940166400.0, + "grad_norm": 2.2436852387053134, + "language_loss": 0.91622436, + "learning_rate": 1.1987583758531038e-07, + "loss": 0.93716609, + "num_input_tokens_seen": 320207760, + "step": 14845, + "time_per_iteration": 2.74336314201355 + }, + { + "auxiliary_loss_clip": 0.01089634, + "auxiliary_loss_mlp": 0.01031578, + "balance_loss_clip": 1.03469348, + "balance_loss_mlp": 1.01985955, + "epoch": 0.8925898091086728, + "flos": 22346636175360.0, + "grad_norm": 1.8155448981855211, + "language_loss": 0.72219133, + "learning_rate": 1.1974306549602476e-07, + "loss": 0.74340343, + "num_input_tokens_seen": 320225325, + "step": 14846, + "time_per_iteration": 2.628924608230591 + }, + { + "auxiliary_loss_clip": 0.01084746, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.03907979, + "balance_loss_mlp": 1.02118051, + "epoch": 0.8926499323613407, + "flos": 45806322067200.0, + "grad_norm": 2.129136173165777, + "language_loss": 0.56949878, + "learning_rate": 1.1961036470627094e-07, + "loss": 0.5906803, + "num_input_tokens_seen": 320247645, + "step": 14847, + "time_per_iteration": 2.8942604064941406 + }, + { + "auxiliary_loss_clip": 0.01071094, + "auxiliary_loss_mlp": 0.01034217, + "balance_loss_clip": 1.03545833, + "balance_loss_mlp": 1.0223918, + "epoch": 0.8927100556140087, + "flos": 22127042378880.0, + "grad_norm": 2.417347097790333, + "language_loss": 0.76218295, + "learning_rate": 1.1947773522108052e-07, + "loss": 0.78323603, + "num_input_tokens_seen": 320266005, + "step": 14848, + "time_per_iteration": 2.703596830368042 + }, + { + "auxiliary_loss_clip": 0.01043101, + "auxiliary_loss_mlp": 0.01046178, + "balance_loss_clip": 1.0295552, + "balance_loss_mlp": 1.03208137, + "epoch": 0.8927701788666766, + "flos": 28330143655680.0, + "grad_norm": 2.5994238973384554, + "language_loss": 0.69254899, + "learning_rate": 1.1934517704548251e-07, + "loss": 0.71344179, + "num_input_tokens_seen": 320285555, + "step": 14849, + "time_per_iteration": 2.7903876304626465 + }, + { + "auxiliary_loss_clip": 0.01099654, + "auxiliary_loss_mlp": 0.01032823, + "balance_loss_clip": 1.03864908, + "balance_loss_mlp": 1.02075911, + "epoch": 0.8928303021193447, + "flos": 25294973351040.0, + "grad_norm": 1.9684228103737367, + "language_loss": 0.80747259, + "learning_rate": 1.1921269018450364e-07, + "loss": 0.8287974, + "num_input_tokens_seen": 320305395, + "step": 14850, + "time_per_iteration": 2.6187615394592285 + }, + { + "auxiliary_loss_clip": 0.01087788, + "auxiliary_loss_mlp": 0.01037651, + "balance_loss_clip": 1.03636372, + "balance_loss_mlp": 1.02547944, + "epoch": 0.8928904253720126, + "flos": 22236713579520.0, + "grad_norm": 1.6645229603446685, + "language_loss": 0.74605459, + "learning_rate": 1.1908027464316872e-07, + "loss": 0.76730895, + "num_input_tokens_seen": 320324220, + "step": 14851, + "time_per_iteration": 2.6631858348846436 + }, + { + "auxiliary_loss_clip": 0.0108452, + "auxiliary_loss_mlp": 0.01029205, + "balance_loss_clip": 1.03504527, + "balance_loss_mlp": 1.01692009, + "epoch": 0.8929505486246806, + "flos": 27092374940160.0, + "grad_norm": 1.5560164927466833, + "language_loss": 0.78718781, + "learning_rate": 1.1894793042649775e-07, + "loss": 0.80832505, + "num_input_tokens_seen": 320347195, + "step": 14852, + "time_per_iteration": 5.973539113998413 + }, + { + "auxiliary_loss_clip": 0.01091326, + "auxiliary_loss_mlp": 0.01033169, + "balance_loss_clip": 1.03806448, + "balance_loss_mlp": 1.0212301, + "epoch": 0.8930106718773486, + "flos": 23039352938880.0, + "grad_norm": 2.4577931840380596, + "language_loss": 0.69120765, + "learning_rate": 1.1881565753951006e-07, + "loss": 0.71245253, + "num_input_tokens_seen": 320366850, + "step": 14853, + "time_per_iteration": 2.6630473136901855 + }, + { + "auxiliary_loss_clip": 0.01060947, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.03697348, + "balance_loss_mlp": 1.01997924, + "epoch": 0.8930707951300165, + "flos": 35626652887680.0, + "grad_norm": 1.537977130569083, + "language_loss": 0.67207319, + "learning_rate": 1.1868345598722118e-07, + "loss": 0.69300473, + "num_input_tokens_seen": 320388895, + "step": 14854, + "time_per_iteration": 4.400064945220947 + }, + { + "auxiliary_loss_clip": 0.01081067, + "auxiliary_loss_mlp": 0.01040836, + "balance_loss_clip": 1.03309155, + "balance_loss_mlp": 1.02784824, + "epoch": 0.8931309183826845, + "flos": 23039891642880.0, + "grad_norm": 1.6742794068707105, + "language_loss": 0.74868983, + "learning_rate": 1.1855132577464399e-07, + "loss": 0.76990891, + "num_input_tokens_seen": 320408520, + "step": 14855, + "time_per_iteration": 4.200139284133911 + }, + { + "auxiliary_loss_clip": 0.01086542, + "auxiliary_loss_mlp": 0.01032762, + "balance_loss_clip": 1.03601122, + "balance_loss_mlp": 1.02056086, + "epoch": 0.8931910416353525, + "flos": 26504624695680.0, + "grad_norm": 2.5885445984861613, + "language_loss": 0.64431441, + "learning_rate": 1.1841926690678893e-07, + "loss": 0.6655075, + "num_input_tokens_seen": 320427400, + "step": 14856, + "time_per_iteration": 2.657810926437378 + }, + { + "auxiliary_loss_clip": 0.0110682, + "auxiliary_loss_mlp": 0.0102884, + "balance_loss_clip": 1.03531027, + "balance_loss_mlp": 1.01715207, + "epoch": 0.8932511648880205, + "flos": 24973609345920.0, + "grad_norm": 1.6750308846874502, + "language_loss": 0.66575366, + "learning_rate": 1.1828727938866378e-07, + "loss": 0.68711025, + "num_input_tokens_seen": 320447570, + "step": 14857, + "time_per_iteration": 2.644740343093872 + }, + { + "auxiliary_loss_clip": 0.01068637, + "auxiliary_loss_mlp": 0.01038826, + "balance_loss_clip": 1.04051542, + "balance_loss_mlp": 1.02599871, + "epoch": 0.8933112881406884, + "flos": 24460733001600.0, + "grad_norm": 2.232767512472365, + "language_loss": 0.75065112, + "learning_rate": 1.1815536322527408e-07, + "loss": 0.77172571, + "num_input_tokens_seen": 320464405, + "step": 14858, + "time_per_iteration": 2.7609682083129883 + }, + { + "auxiliary_loss_clip": 0.01096177, + "auxiliary_loss_mlp": 0.0102937, + "balance_loss_clip": 1.03594685, + "balance_loss_mlp": 1.01651311, + "epoch": 0.8933714113933564, + "flos": 28293083798400.0, + "grad_norm": 1.825199606882533, + "language_loss": 0.69551903, + "learning_rate": 1.1802351842162139e-07, + "loss": 0.71677446, + "num_input_tokens_seen": 320485525, + "step": 14859, + "time_per_iteration": 2.6836822032928467 + }, + { + "auxiliary_loss_clip": 0.01056346, + "auxiliary_loss_mlp": 0.01028474, + "balance_loss_clip": 1.03371429, + "balance_loss_mlp": 1.0170536, + "epoch": 0.8934315346460243, + "flos": 21434864319360.0, + "grad_norm": 1.6309895409207762, + "language_loss": 0.75540131, + "learning_rate": 1.1789174498270526e-07, + "loss": 0.77624959, + "num_input_tokens_seen": 320506725, + "step": 14860, + "time_per_iteration": 2.76859450340271 + }, + { + "auxiliary_loss_clip": 0.01086873, + "auxiliary_loss_mlp": 0.01033583, + "balance_loss_clip": 1.03512859, + "balance_loss_mlp": 1.02008855, + "epoch": 0.8934916578986923, + "flos": 23769596436480.0, + "grad_norm": 4.01916529302481, + "language_loss": 0.57677805, + "learning_rate": 1.1776004291352303e-07, + "loss": 0.59798259, + "num_input_tokens_seen": 320525425, + "step": 14861, + "time_per_iteration": 2.661344289779663 + }, + { + "auxiliary_loss_clip": 0.01078056, + "auxiliary_loss_mlp": 0.01033158, + "balance_loss_clip": 1.03267503, + "balance_loss_mlp": 1.02077198, + "epoch": 0.8935517811513602, + "flos": 18916161719040.0, + "grad_norm": 1.9140763695424603, + "language_loss": 0.63545376, + "learning_rate": 1.176284122190685e-07, + "loss": 0.6565659, + "num_input_tokens_seen": 320543010, + "step": 14862, + "time_per_iteration": 2.5856823921203613 + }, + { + "auxiliary_loss_clip": 0.01092562, + "auxiliary_loss_mlp": 0.01026666, + "balance_loss_clip": 1.03338671, + "balance_loss_mlp": 1.01455998, + "epoch": 0.8936119044040283, + "flos": 24061370613120.0, + "grad_norm": 2.1334167708433323, + "language_loss": 0.78088272, + "learning_rate": 1.1749685290433298e-07, + "loss": 0.80207497, + "num_input_tokens_seen": 320562180, + "step": 14863, + "time_per_iteration": 2.611900806427002 + }, + { + "auxiliary_loss_clip": 0.01080768, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.03352034, + "balance_loss_mlp": 1.02448487, + "epoch": 0.8936720276566962, + "flos": 21324079797120.0, + "grad_norm": 1.7735911629661039, + "language_loss": 0.71075487, + "learning_rate": 1.1736536497430627e-07, + "loss": 0.73193425, + "num_input_tokens_seen": 320580395, + "step": 14864, + "time_per_iteration": 2.691619873046875 + }, + { + "auxiliary_loss_clip": 0.01101616, + "auxiliary_loss_mlp": 0.01037124, + "balance_loss_clip": 1.03658938, + "balance_loss_mlp": 1.02402878, + "epoch": 0.8937321509093642, + "flos": 18406122549120.0, + "grad_norm": 2.399528351176047, + "language_loss": 0.76093769, + "learning_rate": 1.1723394843397283e-07, + "loss": 0.78232509, + "num_input_tokens_seen": 320599505, + "step": 14865, + "time_per_iteration": 2.6147727966308594 + }, + { + "auxiliary_loss_clip": 0.01069542, + "auxiliary_loss_mlp": 0.01032163, + "balance_loss_clip": 1.03533304, + "balance_loss_mlp": 1.02058804, + "epoch": 0.8937922741620322, + "flos": 22054754257920.0, + "grad_norm": 1.8011765216812077, + "language_loss": 0.72078204, + "learning_rate": 1.1710260328831668e-07, + "loss": 0.74179912, + "num_input_tokens_seen": 320619825, + "step": 14866, + "time_per_iteration": 2.7329297065734863 + }, + { + "auxiliary_loss_clip": 0.01100829, + "auxiliary_loss_mlp": 0.01029076, + "balance_loss_clip": 1.0382688, + "balance_loss_mlp": 1.01533771, + "epoch": 0.8938523974147001, + "flos": 25664386775040.0, + "grad_norm": 1.830929850281708, + "language_loss": 0.83762133, + "learning_rate": 1.1697132954231869e-07, + "loss": 0.8589204, + "num_input_tokens_seen": 320638515, + "step": 14867, + "time_per_iteration": 2.668128728866577 + }, + { + "auxiliary_loss_clip": 0.0109843, + "auxiliary_loss_mlp": 0.01029062, + "balance_loss_clip": 1.03669333, + "balance_loss_mlp": 1.01795816, + "epoch": 0.8939125206673681, + "flos": 25742852035200.0, + "grad_norm": 1.586495908389307, + "language_loss": 0.80449593, + "learning_rate": 1.168401272009567e-07, + "loss": 0.82577085, + "num_input_tokens_seen": 320659430, + "step": 14868, + "time_per_iteration": 2.680034637451172 + }, + { + "auxiliary_loss_clip": 0.01083053, + "auxiliary_loss_mlp": 0.01033575, + "balance_loss_clip": 1.03728485, + "balance_loss_mlp": 1.0209384, + "epoch": 0.8939726439200361, + "flos": 27344503480320.0, + "grad_norm": 1.8649016797962312, + "language_loss": 0.7731384, + "learning_rate": 1.167089962692056e-07, + "loss": 0.79430467, + "num_input_tokens_seen": 320679295, + "step": 14869, + "time_per_iteration": 2.745805263519287 + }, + { + "auxiliary_loss_clip": 0.01097268, + "auxiliary_loss_mlp": 0.00769412, + "balance_loss_clip": 1.03609347, + "balance_loss_mlp": 1.00023556, + "epoch": 0.8940327671727041, + "flos": 20338834671360.0, + "grad_norm": 1.4278023080407176, + "language_loss": 0.65314829, + "learning_rate": 1.1657793675203853e-07, + "loss": 0.67181504, + "num_input_tokens_seen": 320697535, + "step": 14870, + "time_per_iteration": 2.6284589767456055 + }, + { + "auxiliary_loss_clip": 0.00993024, + "auxiliary_loss_mlp": 0.0102124, + "balance_loss_clip": 1.00702477, + "balance_loss_mlp": 1.01970196, + "epoch": 0.894092890425372, + "flos": 58410573235200.0, + "grad_norm": 0.7966327834428544, + "language_loss": 0.55929744, + "learning_rate": 1.1644694865442461e-07, + "loss": 0.57944012, + "num_input_tokens_seen": 320758635, + "step": 14871, + "time_per_iteration": 3.3122901916503906 + }, + { + "auxiliary_loss_clip": 0.01091917, + "auxiliary_loss_mlp": 0.0103181, + "balance_loss_clip": 1.03682566, + "balance_loss_mlp": 1.02005625, + "epoch": 0.89415301367804, + "flos": 19829657427840.0, + "grad_norm": 1.9266754384359623, + "language_loss": 0.76406336, + "learning_rate": 1.16316031981331e-07, + "loss": 0.78530067, + "num_input_tokens_seen": 320777175, + "step": 14872, + "time_per_iteration": 2.6247551441192627 + }, + { + "auxiliary_loss_clip": 0.01094372, + "auxiliary_loss_mlp": 0.01031313, + "balance_loss_clip": 1.03704429, + "balance_loss_mlp": 1.02015495, + "epoch": 0.8942131369307079, + "flos": 25775781828480.0, + "grad_norm": 1.648018727425323, + "language_loss": 0.67068255, + "learning_rate": 1.1618518673772215e-07, + "loss": 0.69193947, + "num_input_tokens_seen": 320797670, + "step": 14873, + "time_per_iteration": 2.6552417278289795 + }, + { + "auxiliary_loss_clip": 0.01105979, + "auxiliary_loss_mlp": 0.01034727, + "balance_loss_clip": 1.03645134, + "balance_loss_mlp": 1.02241898, + "epoch": 0.8942732601833759, + "flos": 23149024139520.0, + "grad_norm": 1.5958829385063367, + "language_loss": 0.59345031, + "learning_rate": 1.1605441292856033e-07, + "loss": 0.61485744, + "num_input_tokens_seen": 320817410, + "step": 14874, + "time_per_iteration": 2.5860843658447266 + }, + { + "auxiliary_loss_clip": 0.01078313, + "auxiliary_loss_mlp": 0.01032167, + "balance_loss_clip": 1.03743887, + "balance_loss_mlp": 1.01903629, + "epoch": 0.8943333834360438, + "flos": 27855548231040.0, + "grad_norm": 1.8290237697003595, + "language_loss": 0.75576758, + "learning_rate": 1.1592371055880356e-07, + "loss": 0.7768724, + "num_input_tokens_seen": 320836745, + "step": 14875, + "time_per_iteration": 2.7420151233673096 + }, + { + "auxiliary_loss_clip": 0.01079183, + "auxiliary_loss_mlp": 0.0103557, + "balance_loss_clip": 1.03446269, + "balance_loss_mlp": 1.0205195, + "epoch": 0.8943935066887119, + "flos": 22163958581760.0, + "grad_norm": 2.3429928427333926, + "language_loss": 0.77405798, + "learning_rate": 1.1579307963340857e-07, + "loss": 0.79520553, + "num_input_tokens_seen": 320853305, + "step": 14876, + "time_per_iteration": 2.816397190093994 + }, + { + "auxiliary_loss_clip": 0.01096244, + "auxiliary_loss_mlp": 0.01025808, + "balance_loss_clip": 1.0358882, + "balance_loss_mlp": 1.01482916, + "epoch": 0.8944536299413798, + "flos": 21470056669440.0, + "grad_norm": 1.6703549010755179, + "language_loss": 0.78432184, + "learning_rate": 1.156625201573287e-07, + "loss": 0.80554235, + "num_input_tokens_seen": 320872885, + "step": 14877, + "time_per_iteration": 2.7098886966705322 + }, + { + "auxiliary_loss_clip": 0.01059905, + "auxiliary_loss_mlp": 0.01039763, + "balance_loss_clip": 1.03192687, + "balance_loss_mlp": 1.02515423, + "epoch": 0.8945137531940478, + "flos": 17748777703680.0, + "grad_norm": 2.0737748491478465, + "language_loss": 0.7512145, + "learning_rate": 1.155320321355151e-07, + "loss": 0.77221119, + "num_input_tokens_seen": 320889755, + "step": 14878, + "time_per_iteration": 2.6619186401367188 + }, + { + "auxiliary_loss_clip": 0.01094053, + "auxiliary_loss_mlp": 0.01030234, + "balance_loss_clip": 1.03389883, + "balance_loss_mlp": 1.01564312, + "epoch": 0.8945738764467158, + "flos": 21142264129920.0, + "grad_norm": 1.682076176326582, + "language_loss": 0.76145089, + "learning_rate": 1.1540161557291539e-07, + "loss": 0.78269374, + "num_input_tokens_seen": 320907860, + "step": 14879, + "time_per_iteration": 2.5775701999664307 + }, + { + "auxiliary_loss_clip": 0.01078076, + "auxiliary_loss_mlp": 0.0103149, + "balance_loss_clip": 1.03829026, + "balance_loss_mlp": 1.01952147, + "epoch": 0.8946339996993837, + "flos": 14903000835840.0, + "grad_norm": 1.842392268931871, + "language_loss": 0.74446988, + "learning_rate": 1.1527127047447538e-07, + "loss": 0.76556557, + "num_input_tokens_seen": 320925825, + "step": 14880, + "time_per_iteration": 2.665179967880249 + }, + { + "auxiliary_loss_clip": 0.0109132, + "auxiliary_loss_mlp": 0.01029879, + "balance_loss_clip": 1.03410816, + "balance_loss_mlp": 1.01687312, + "epoch": 0.8946941229520518, + "flos": 27382173868800.0, + "grad_norm": 1.5269173028094163, + "language_loss": 0.82799721, + "learning_rate": 1.1514099684513822e-07, + "loss": 0.84920919, + "num_input_tokens_seen": 320946165, + "step": 14881, + "time_per_iteration": 2.6503562927246094 + }, + { + "auxiliary_loss_clip": 0.01067605, + "auxiliary_loss_mlp": 0.00770988, + "balance_loss_clip": 1.03390133, + "balance_loss_mlp": 1.00022626, + "epoch": 0.8947542462047197, + "flos": 31796277338880.0, + "grad_norm": 1.614884288144251, + "language_loss": 0.67639142, + "learning_rate": 1.1501079468984287e-07, + "loss": 0.69477737, + "num_input_tokens_seen": 320969330, + "step": 14882, + "time_per_iteration": 2.7693512439727783 + }, + { + "auxiliary_loss_clip": 0.01085159, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.03287458, + "balance_loss_mlp": 1.0205431, + "epoch": 0.8948143694573877, + "flos": 20883599314560.0, + "grad_norm": 2.045453824962206, + "language_loss": 0.74976206, + "learning_rate": 1.1488066401352691e-07, + "loss": 0.77096784, + "num_input_tokens_seen": 320985055, + "step": 14883, + "time_per_iteration": 2.6624233722686768 + }, + { + "auxiliary_loss_clip": 0.01080827, + "auxiliary_loss_mlp": 0.01033735, + "balance_loss_clip": 1.03383732, + "balance_loss_mlp": 1.02153993, + "epoch": 0.8948744927100556, + "flos": 28215552291840.0, + "grad_norm": 1.5810148244458424, + "language_loss": 0.72292316, + "learning_rate": 1.147506048211253e-07, + "loss": 0.74406874, + "num_input_tokens_seen": 321004720, + "step": 14884, + "time_per_iteration": 2.6995975971221924 + }, + { + "auxiliary_loss_clip": 0.01076203, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.03102303, + "balance_loss_mlp": 1.0188036, + "epoch": 0.8949346159627236, + "flos": 21902672073600.0, + "grad_norm": 1.6922147897555293, + "language_loss": 0.75564313, + "learning_rate": 1.1462061711756987e-07, + "loss": 0.77671194, + "num_input_tokens_seen": 321022350, + "step": 14885, + "time_per_iteration": 2.628843069076538 + }, + { + "auxiliary_loss_clip": 0.01081812, + "auxiliary_loss_mlp": 0.01031169, + "balance_loss_clip": 1.03561521, + "balance_loss_mlp": 1.01841331, + "epoch": 0.8949947392153915, + "flos": 21359128492800.0, + "grad_norm": 1.9911058650536606, + "language_loss": 0.81962872, + "learning_rate": 1.1449070090778911e-07, + "loss": 0.84075844, + "num_input_tokens_seen": 321040450, + "step": 14886, + "time_per_iteration": 2.6610560417175293 + }, + { + "auxiliary_loss_clip": 0.01047486, + "auxiliary_loss_mlp": 0.01027777, + "balance_loss_clip": 1.03327608, + "balance_loss_mlp": 1.01596951, + "epoch": 0.8950548624680595, + "flos": 52445342799360.0, + "grad_norm": 1.5558434759275688, + "language_loss": 0.63781691, + "learning_rate": 1.1436085619671043e-07, + "loss": 0.65856951, + "num_input_tokens_seen": 321063970, + "step": 14887, + "time_per_iteration": 3.0324647426605225 + }, + { + "auxiliary_loss_clip": 0.01088528, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_clip": 1.03487492, + "balance_loss_mlp": 1.02251327, + "epoch": 0.8951149857207275, + "flos": 20121323863680.0, + "grad_norm": 1.8589921868531927, + "language_loss": 0.60964525, + "learning_rate": 1.1423108298925698e-07, + "loss": 0.63088268, + "num_input_tokens_seen": 321083840, + "step": 14888, + "time_per_iteration": 2.745520830154419 + }, + { + "auxiliary_loss_clip": 0.01110592, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.0367682, + "balance_loss_mlp": 1.01834917, + "epoch": 0.8951751089733955, + "flos": 29862631463040.0, + "grad_norm": 2.002662666178723, + "language_loss": 0.70275199, + "learning_rate": 1.1410138129034952e-07, + "loss": 0.72416401, + "num_input_tokens_seen": 321104165, + "step": 14889, + "time_per_iteration": 2.6176459789276123 + }, + { + "auxiliary_loss_clip": 0.01096532, + "auxiliary_loss_mlp": 0.00770989, + "balance_loss_clip": 1.03800106, + "balance_loss_mlp": 1.00024951, + "epoch": 0.8952352322260634, + "flos": 15262789415040.0, + "grad_norm": 2.7797851150305615, + "language_loss": 0.71586537, + "learning_rate": 1.1397175110490676e-07, + "loss": 0.73454058, + "num_input_tokens_seen": 321117290, + "step": 14890, + "time_per_iteration": 2.5783839225769043 + }, + { + "auxiliary_loss_clip": 0.0102349, + "auxiliary_loss_mlp": 0.00773622, + "balance_loss_clip": 1.02805948, + "balance_loss_mlp": 1.00013435, + "epoch": 0.8952953554787314, + "flos": 26798338206720.0, + "grad_norm": 1.483485143798382, + "language_loss": 0.75744319, + "learning_rate": 1.1384219243784454e-07, + "loss": 0.77541423, + "num_input_tokens_seen": 321137115, + "step": 14891, + "time_per_iteration": 6.244478225708008 + }, + { + "auxiliary_loss_clip": 0.01051483, + "auxiliary_loss_mlp": 0.01035049, + "balance_loss_clip": 1.03069568, + "balance_loss_mlp": 1.02226329, + "epoch": 0.8953554787313994, + "flos": 14137205852160.0, + "grad_norm": 2.0123273105882586, + "language_loss": 0.76453358, + "learning_rate": 1.1371270529407517e-07, + "loss": 0.7853989, + "num_input_tokens_seen": 321154490, + "step": 14892, + "time_per_iteration": 3.087535858154297 + }, + { + "auxiliary_loss_clip": 0.01093667, + "auxiliary_loss_mlp": 0.01032529, + "balance_loss_clip": 1.03796649, + "balance_loss_mlp": 1.02048337, + "epoch": 0.8954156019840673, + "flos": 25703314139520.0, + "grad_norm": 3.335988726881917, + "language_loss": 0.81619698, + "learning_rate": 1.1358328967850895e-07, + "loss": 0.83745897, + "num_input_tokens_seen": 321175625, + "step": 14893, + "time_per_iteration": 4.313986778259277 + }, + { + "auxiliary_loss_clip": 0.01061423, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.03349638, + "balance_loss_mlp": 1.02072382, + "epoch": 0.8954757252367354, + "flos": 21907987286400.0, + "grad_norm": 1.880542691848622, + "language_loss": 0.74994141, + "learning_rate": 1.1345394559605348e-07, + "loss": 0.77088118, + "num_input_tokens_seen": 321193895, + "step": 14894, + "time_per_iteration": 2.9463634490966797 + }, + { + "auxiliary_loss_clip": 0.01097897, + "auxiliary_loss_mlp": 0.0103303, + "balance_loss_clip": 1.03915453, + "balance_loss_mlp": 1.01995826, + "epoch": 0.8955358484894033, + "flos": 12970396454400.0, + "grad_norm": 1.9665552489767175, + "language_loss": 0.66606176, + "learning_rate": 1.1332467305161352e-07, + "loss": 0.68737108, + "num_input_tokens_seen": 321211610, + "step": 14895, + "time_per_iteration": 4.159812927246094 + }, + { + "auxiliary_loss_clip": 0.01099951, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.03752875, + "balance_loss_mlp": 1.01608145, + "epoch": 0.8955959717420713, + "flos": 17273966797440.0, + "grad_norm": 1.671045590451987, + "language_loss": 0.67131901, + "learning_rate": 1.1319547205009094e-07, + "loss": 0.69261479, + "num_input_tokens_seen": 321229805, + "step": 14896, + "time_per_iteration": 2.5856170654296875 + }, + { + "auxiliary_loss_clip": 0.01099928, + "auxiliary_loss_mlp": 0.01033038, + "balance_loss_clip": 1.03831029, + "balance_loss_mlp": 1.0207119, + "epoch": 0.8956560949947392, + "flos": 14793868339200.0, + "grad_norm": 1.8809584975485838, + "language_loss": 0.75465834, + "learning_rate": 1.1306634259638492e-07, + "loss": 0.77598798, + "num_input_tokens_seen": 321247165, + "step": 14897, + "time_per_iteration": 2.657931089401245 + }, + { + "auxiliary_loss_clip": 0.00994794, + "auxiliary_loss_mlp": 0.00751908, + "balance_loss_clip": 1.00807071, + "balance_loss_mlp": 0.99958485, + "epoch": 0.8957162182474072, + "flos": 63607817957760.0, + "grad_norm": 0.7439840356253357, + "language_loss": 0.55338937, + "learning_rate": 1.129372846953931e-07, + "loss": 0.57085639, + "num_input_tokens_seen": 321308425, + "step": 14898, + "time_per_iteration": 3.3162429332733154 + }, + { + "auxiliary_loss_clip": 0.01109726, + "auxiliary_loss_mlp": 0.00771113, + "balance_loss_clip": 1.03748989, + "balance_loss_mlp": 1.00012457, + "epoch": 0.8957763415000751, + "flos": 25009843190400.0, + "grad_norm": 1.500591280857772, + "language_loss": 0.70237386, + "learning_rate": 1.12808298352008e-07, + "loss": 0.72118223, + "num_input_tokens_seen": 321329295, + "step": 14899, + "time_per_iteration": 2.6604552268981934 + }, + { + "auxiliary_loss_clip": 0.01054108, + "auxiliary_loss_mlp": 0.0103587, + "balance_loss_clip": 1.03760815, + "balance_loss_mlp": 1.02217865, + "epoch": 0.8958364647527431, + "flos": 19828615933440.0, + "grad_norm": 1.672513533456995, + "language_loss": 0.73965251, + "learning_rate": 1.1267938357112106e-07, + "loss": 0.76055229, + "num_input_tokens_seen": 321347580, + "step": 14900, + "time_per_iteration": 2.7858917713165283 + }, + { + "auxiliary_loss_clip": 0.00999101, + "auxiliary_loss_mlp": 0.01000333, + "balance_loss_clip": 1.01374125, + "balance_loss_mlp": 0.99923056, + "epoch": 0.895896588005411, + "flos": 65537190115200.0, + "grad_norm": 0.793037706766976, + "language_loss": 0.61771894, + "learning_rate": 1.1255054035762124e-07, + "loss": 0.63771325, + "num_input_tokens_seen": 321407820, + "step": 14901, + "time_per_iteration": 3.225350856781006 + }, + { + "auxiliary_loss_clip": 0.01099179, + "auxiliary_loss_mlp": 0.01029055, + "balance_loss_clip": 1.03669286, + "balance_loss_mlp": 1.01680589, + "epoch": 0.8959567112580791, + "flos": 25591021246080.0, + "grad_norm": 1.6768583776386496, + "language_loss": 0.70434642, + "learning_rate": 1.1242176871639441e-07, + "loss": 0.72562879, + "num_input_tokens_seen": 321426745, + "step": 14902, + "time_per_iteration": 2.629722833633423 + }, + { + "auxiliary_loss_clip": 0.01080163, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.03510499, + "balance_loss_mlp": 1.01877761, + "epoch": 0.896016834510747, + "flos": 24201780877440.0, + "grad_norm": 1.8587409033889455, + "language_loss": 0.78276879, + "learning_rate": 1.1229306865232313e-07, + "loss": 0.80387414, + "num_input_tokens_seen": 321446165, + "step": 14903, + "time_per_iteration": 2.6630077362060547 + }, + { + "auxiliary_loss_clip": 0.01085975, + "auxiliary_loss_mlp": 0.01034269, + "balance_loss_clip": 1.03611159, + "balance_loss_mlp": 1.02067935, + "epoch": 0.896076957763415, + "flos": 23075945919360.0, + "grad_norm": 1.7273682997495312, + "language_loss": 0.73095953, + "learning_rate": 1.121644401702877e-07, + "loss": 0.75216204, + "num_input_tokens_seen": 321465285, + "step": 14904, + "time_per_iteration": 2.656641721725464 + }, + { + "auxiliary_loss_clip": 0.01097461, + "auxiliary_loss_mlp": 0.01028056, + "balance_loss_clip": 1.03512216, + "balance_loss_mlp": 1.0144484, + "epoch": 0.8961370810160829, + "flos": 22236605838720.0, + "grad_norm": 1.972644186412881, + "language_loss": 0.74508619, + "learning_rate": 1.12035883275166e-07, + "loss": 0.76634133, + "num_input_tokens_seen": 321483670, + "step": 14905, + "time_per_iteration": 2.5795624256134033 + }, + { + "auxiliary_loss_clip": 0.01096538, + "auxiliary_loss_mlp": 0.01032548, + "balance_loss_clip": 1.03503621, + "balance_loss_mlp": 1.02032363, + "epoch": 0.8961972042687509, + "flos": 23072318645760.0, + "grad_norm": 2.276578769172911, + "language_loss": 0.76414752, + "learning_rate": 1.1190739797183279e-07, + "loss": 0.78543842, + "num_input_tokens_seen": 321501190, + "step": 14906, + "time_per_iteration": 2.608065605163574 + }, + { + "auxiliary_loss_clip": 0.0109916, + "auxiliary_loss_mlp": 0.01034258, + "balance_loss_clip": 1.03820026, + "balance_loss_mlp": 1.02151513, + "epoch": 0.896257327521419, + "flos": 18185882307840.0, + "grad_norm": 1.6230699036233884, + "language_loss": 0.7410239, + "learning_rate": 1.1177898426515996e-07, + "loss": 0.76235807, + "num_input_tokens_seen": 321518540, + "step": 14907, + "time_per_iteration": 2.5740091800689697 + }, + { + "auxiliary_loss_clip": 0.01098288, + "auxiliary_loss_mlp": 0.0103583, + "balance_loss_clip": 1.03720152, + "balance_loss_mlp": 1.0237062, + "epoch": 0.8963174507740869, + "flos": 17895472848000.0, + "grad_norm": 3.741927314180935, + "language_loss": 0.82670319, + "learning_rate": 1.1165064216001785e-07, + "loss": 0.84804434, + "num_input_tokens_seen": 321536555, + "step": 14908, + "time_per_iteration": 2.5786521434783936 + }, + { + "auxiliary_loss_clip": 0.01086384, + "auxiliary_loss_mlp": 0.01031261, + "balance_loss_clip": 1.0361675, + "balance_loss_mlp": 1.01765347, + "epoch": 0.8963775740267549, + "flos": 21032269706880.0, + "grad_norm": 2.161346134185943, + "language_loss": 0.70245093, + "learning_rate": 1.1152237166127232e-07, + "loss": 0.72362739, + "num_input_tokens_seen": 321557655, + "step": 14909, + "time_per_iteration": 2.652540445327759 + }, + { + "auxiliary_loss_clip": 0.01076255, + "auxiliary_loss_mlp": 0.01036306, + "balance_loss_clip": 1.03869569, + "balance_loss_mlp": 1.02353907, + "epoch": 0.8964376972794228, + "flos": 23179619548800.0, + "grad_norm": 16.444537313084084, + "language_loss": 0.7209096, + "learning_rate": 1.113941727737877e-07, + "loss": 0.74203527, + "num_input_tokens_seen": 321576160, + "step": 14910, + "time_per_iteration": 2.6874682903289795 + }, + { + "auxiliary_loss_clip": 0.01095164, + "auxiliary_loss_mlp": 0.01028278, + "balance_loss_clip": 1.03482211, + "balance_loss_mlp": 1.01633346, + "epoch": 0.8964978205320908, + "flos": 24972998814720.0, + "grad_norm": 2.519588986898142, + "language_loss": 0.6361804, + "learning_rate": 1.1126604550242502e-07, + "loss": 0.65741479, + "num_input_tokens_seen": 321596205, + "step": 14911, + "time_per_iteration": 2.594196081161499 + }, + { + "auxiliary_loss_clip": 0.01082355, + "auxiliary_loss_mlp": 0.00770688, + "balance_loss_clip": 1.03677964, + "balance_loss_mlp": 1.00020111, + "epoch": 0.8965579437847587, + "flos": 19172025273600.0, + "grad_norm": 1.6361676394072804, + "language_loss": 0.74929178, + "learning_rate": 1.111379898520437e-07, + "loss": 0.76782227, + "num_input_tokens_seen": 321614800, + "step": 14912, + "time_per_iteration": 2.620948076248169 + }, + { + "auxiliary_loss_clip": 0.01083336, + "auxiliary_loss_mlp": 0.01037867, + "balance_loss_clip": 1.03508806, + "balance_loss_mlp": 1.02545714, + "epoch": 0.8966180670374267, + "flos": 24276690691200.0, + "grad_norm": 1.791048209942099, + "language_loss": 0.81890047, + "learning_rate": 1.1101000582749876e-07, + "loss": 0.84011245, + "num_input_tokens_seen": 321633445, + "step": 14913, + "time_per_iteration": 2.6343531608581543 + }, + { + "auxiliary_loss_clip": 0.01101255, + "auxiliary_loss_mlp": 0.01035811, + "balance_loss_clip": 1.03797901, + "balance_loss_mlp": 1.02259076, + "epoch": 0.8966781902900947, + "flos": 13553190622080.0, + "grad_norm": 3.5493075869596176, + "language_loss": 0.61391163, + "learning_rate": 1.1088209343364407e-07, + "loss": 0.63528228, + "num_input_tokens_seen": 321650890, + "step": 14914, + "time_per_iteration": 2.611363649368286 + }, + { + "auxiliary_loss_clip": 0.01005981, + "auxiliary_loss_mlp": 0.00999937, + "balance_loss_clip": 1.00627279, + "balance_loss_mlp": 0.99880487, + "epoch": 0.8967383135427627, + "flos": 65066114223360.0, + "grad_norm": 2.6237376103475905, + "language_loss": 0.5505228, + "learning_rate": 1.1075425267532956e-07, + "loss": 0.57058197, + "num_input_tokens_seen": 321710960, + "step": 14915, + "time_per_iteration": 3.191149950027466 + }, + { + "auxiliary_loss_clip": 0.01068433, + "auxiliary_loss_mlp": 0.01032694, + "balance_loss_clip": 1.03356564, + "balance_loss_mlp": 1.02107704, + "epoch": 0.8967984367954306, + "flos": 29713027317120.0, + "grad_norm": 1.506591711885427, + "language_loss": 0.71458489, + "learning_rate": 1.1062648355740289e-07, + "loss": 0.73559618, + "num_input_tokens_seen": 321733290, + "step": 14916, + "time_per_iteration": 2.7623350620269775 + }, + { + "auxiliary_loss_clip": 0.01087907, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.03692842, + "balance_loss_mlp": 1.01904035, + "epoch": 0.8968585600480986, + "flos": 25702488126720.0, + "grad_norm": 1.8194206716370875, + "language_loss": 0.77866107, + "learning_rate": 1.1049878608470931e-07, + "loss": 0.79984742, + "num_input_tokens_seen": 321753120, + "step": 14917, + "time_per_iteration": 2.6854681968688965 + }, + { + "auxiliary_loss_clip": 0.01102374, + "auxiliary_loss_mlp": 0.01041532, + "balance_loss_clip": 1.03805685, + "balance_loss_mlp": 1.02815735, + "epoch": 0.8969186833007665, + "flos": 30044698525440.0, + "grad_norm": 1.9817396257364666, + "language_loss": 0.6853829, + "learning_rate": 1.1037116026209137e-07, + "loss": 0.70682192, + "num_input_tokens_seen": 321772840, + "step": 14918, + "time_per_iteration": 2.6850335597991943 + }, + { + "auxiliary_loss_clip": 0.01059733, + "auxiliary_loss_mlp": 0.010324, + "balance_loss_clip": 1.0353421, + "balance_loss_mlp": 1.02087831, + "epoch": 0.8969788065534345, + "flos": 22818143030400.0, + "grad_norm": 1.8968334913567422, + "language_loss": 0.83584672, + "learning_rate": 1.102436060943881e-07, + "loss": 0.85676813, + "num_input_tokens_seen": 321791020, + "step": 14919, + "time_per_iteration": 2.7944953441619873 + }, + { + "auxiliary_loss_clip": 0.0110904, + "auxiliary_loss_mlp": 0.00771505, + "balance_loss_clip": 1.03592348, + "balance_loss_mlp": 1.00021255, + "epoch": 0.8970389298061026, + "flos": 13261488272640.0, + "grad_norm": 6.002098471284828, + "language_loss": 0.72274148, + "learning_rate": 1.1011612358643696e-07, + "loss": 0.74154693, + "num_input_tokens_seen": 321810075, + "step": 14920, + "time_per_iteration": 2.641122579574585 + }, + { + "auxiliary_loss_clip": 0.01096514, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.03508401, + "balance_loss_mlp": 1.01865947, + "epoch": 0.8970990530587705, + "flos": 10266071345280.0, + "grad_norm": 2.2258437639369753, + "language_loss": 0.90893173, + "learning_rate": 1.0998871274307164e-07, + "loss": 0.93021685, + "num_input_tokens_seen": 321822635, + "step": 14921, + "time_per_iteration": 2.5753695964813232 + }, + { + "auxiliary_loss_clip": 0.0105106, + "auxiliary_loss_mlp": 0.0103667, + "balance_loss_clip": 1.03290153, + "balance_loss_mlp": 1.02218616, + "epoch": 0.8971591763114385, + "flos": 20302708567680.0, + "grad_norm": 1.7221269856692987, + "language_loss": 0.73712015, + "learning_rate": 1.0986137356912384e-07, + "loss": 0.75799739, + "num_input_tokens_seen": 321841130, + "step": 14922, + "time_per_iteration": 2.796809673309326 + }, + { + "auxiliary_loss_clip": 0.01059125, + "auxiliary_loss_mlp": 0.01039549, + "balance_loss_clip": 1.0326159, + "balance_loss_mlp": 1.02563119, + "epoch": 0.8972192995641064, + "flos": 23257043314560.0, + "grad_norm": 1.7526261778537016, + "language_loss": 0.70386976, + "learning_rate": 1.097341060694219e-07, + "loss": 0.7248565, + "num_input_tokens_seen": 321859855, + "step": 14923, + "time_per_iteration": 2.716149091720581 + }, + { + "auxiliary_loss_clip": 0.01087701, + "auxiliary_loss_mlp": 0.01029185, + "balance_loss_clip": 1.03695786, + "balance_loss_mlp": 1.01585746, + "epoch": 0.8972794228167744, + "flos": 18369601395840.0, + "grad_norm": 2.5800290587382606, + "language_loss": 0.7121672, + "learning_rate": 1.0960691024879221e-07, + "loss": 0.73333609, + "num_input_tokens_seen": 321877990, + "step": 14924, + "time_per_iteration": 2.6310861110687256 + }, + { + "auxiliary_loss_clip": 0.01094366, + "auxiliary_loss_mlp": 0.01035713, + "balance_loss_clip": 1.03357565, + "balance_loss_mlp": 1.02411425, + "epoch": 0.8973395460694423, + "flos": 23952058548480.0, + "grad_norm": 1.524405249104344, + "language_loss": 0.720016, + "learning_rate": 1.0947978611205844e-07, + "loss": 0.7413168, + "num_input_tokens_seen": 321898120, + "step": 14925, + "time_per_iteration": 2.665548324584961 + }, + { + "auxiliary_loss_clip": 0.01087294, + "auxiliary_loss_mlp": 0.00773098, + "balance_loss_clip": 1.03590477, + "balance_loss_mlp": 1.0001936, + "epoch": 0.8973996693221103, + "flos": 24970843998720.0, + "grad_norm": 1.8138493402848186, + "language_loss": 0.82518828, + "learning_rate": 1.0935273366404008e-07, + "loss": 0.84379226, + "num_input_tokens_seen": 321918140, + "step": 14926, + "time_per_iteration": 2.6425201892852783 + }, + { + "auxiliary_loss_clip": 0.01054597, + "auxiliary_loss_mlp": 0.01030973, + "balance_loss_clip": 1.03193653, + "balance_loss_mlp": 1.0189445, + "epoch": 0.8974597925747783, + "flos": 25738937452800.0, + "grad_norm": 1.4975243359696364, + "language_loss": 0.7919172, + "learning_rate": 1.092257529095555e-07, + "loss": 0.81277287, + "num_input_tokens_seen": 321938580, + "step": 14927, + "time_per_iteration": 2.760615825653076 + }, + { + "auxiliary_loss_clip": 0.01081394, + "auxiliary_loss_mlp": 0.01029342, + "balance_loss_clip": 1.03361082, + "balance_loss_mlp": 1.01776123, + "epoch": 0.8975199158274463, + "flos": 38071918131840.0, + "grad_norm": 1.6317289116194253, + "language_loss": 0.66483474, + "learning_rate": 1.0909884385341994e-07, + "loss": 0.68594205, + "num_input_tokens_seen": 321961135, + "step": 14928, + "time_per_iteration": 2.778822898864746 + }, + { + "auxiliary_loss_clip": 0.01087431, + "auxiliary_loss_mlp": 0.01043461, + "balance_loss_clip": 1.03568482, + "balance_loss_mlp": 1.02808905, + "epoch": 0.8975800390801142, + "flos": 25411683617280.0, + "grad_norm": 5.313736387639944, + "language_loss": 0.70643723, + "learning_rate": 1.0897200650044602e-07, + "loss": 0.72774613, + "num_input_tokens_seen": 321980945, + "step": 14929, + "time_per_iteration": 2.7232232093811035 + }, + { + "auxiliary_loss_clip": 0.01089831, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.03910744, + "balance_loss_mlp": 1.01977956, + "epoch": 0.8976401623327822, + "flos": 21759604202880.0, + "grad_norm": 1.7936229016193426, + "language_loss": 0.68214059, + "learning_rate": 1.0884524085544256e-07, + "loss": 0.70335329, + "num_input_tokens_seen": 322000350, + "step": 14930, + "time_per_iteration": 4.204017162322998 + }, + { + "auxiliary_loss_clip": 0.01078251, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.03327012, + "balance_loss_mlp": 1.01617682, + "epoch": 0.8977002855854501, + "flos": 13845323934720.0, + "grad_norm": 3.3144307660697994, + "language_loss": 0.74537098, + "learning_rate": 1.0871854692321769e-07, + "loss": 0.76643896, + "num_input_tokens_seen": 322018980, + "step": 14931, + "time_per_iteration": 4.21280837059021 + }, + { + "auxiliary_loss_clip": 0.01098516, + "auxiliary_loss_mlp": 0.0102899, + "balance_loss_clip": 1.0380764, + "balance_loss_mlp": 1.01730156, + "epoch": 0.8977604088381181, + "flos": 19427529692160.0, + "grad_norm": 1.8135529971721605, + "language_loss": 0.62872756, + "learning_rate": 1.0859192470857492e-07, + "loss": 0.6500026, + "num_input_tokens_seen": 322037675, + "step": 14932, + "time_per_iteration": 2.5633347034454346 + }, + { + "auxiliary_loss_clip": 0.01091207, + "auxiliary_loss_mlp": 0.01028215, + "balance_loss_clip": 1.03397417, + "balance_loss_mlp": 1.01719403, + "epoch": 0.8978205320907862, + "flos": 22742083981440.0, + "grad_norm": 1.6627829242988799, + "language_loss": 0.7173481, + "learning_rate": 1.0846537421631552e-07, + "loss": 0.73854238, + "num_input_tokens_seen": 322055130, + "step": 14933, + "time_per_iteration": 5.648598909378052 + }, + { + "auxiliary_loss_clip": 0.01061803, + "auxiliary_loss_mlp": 0.01036521, + "balance_loss_clip": 1.03099751, + "balance_loss_mlp": 1.02254987, + "epoch": 0.8978806553434541, + "flos": 21360529123200.0, + "grad_norm": 1.8892940748793305, + "language_loss": 0.74708331, + "learning_rate": 1.0833889545123898e-07, + "loss": 0.76806653, + "num_input_tokens_seen": 322074850, + "step": 14934, + "time_per_iteration": 2.7452452182769775 + }, + { + "auxiliary_loss_clip": 0.01063828, + "auxiliary_loss_mlp": 0.01038833, + "balance_loss_clip": 1.03115392, + "balance_loss_mlp": 1.02425992, + "epoch": 0.8979407785961221, + "flos": 20924178704640.0, + "grad_norm": 1.7395229013410125, + "language_loss": 0.60459125, + "learning_rate": 1.0821248841814123e-07, + "loss": 0.62561786, + "num_input_tokens_seen": 322093315, + "step": 14935, + "time_per_iteration": 2.6802937984466553 + }, + { + "auxiliary_loss_clip": 0.01067049, + "auxiliary_loss_mlp": 0.0102824, + "balance_loss_clip": 1.03403175, + "balance_loss_mlp": 1.01568127, + "epoch": 0.89800090184879, + "flos": 25228934196480.0, + "grad_norm": 2.3833073137139773, + "language_loss": 0.76938522, + "learning_rate": 1.0808615312181512e-07, + "loss": 0.79033804, + "num_input_tokens_seen": 322112555, + "step": 14936, + "time_per_iteration": 2.6882402896881104 + }, + { + "auxiliary_loss_clip": 0.01084705, + "auxiliary_loss_mlp": 0.01033293, + "balance_loss_clip": 1.0342505, + "balance_loss_mlp": 1.02111554, + "epoch": 0.898061025101458, + "flos": 22562674525440.0, + "grad_norm": 1.7261485222993433, + "language_loss": 0.74040693, + "learning_rate": 1.0795988956705193e-07, + "loss": 0.7615869, + "num_input_tokens_seen": 322130440, + "step": 14937, + "time_per_iteration": 2.6710762977600098 + }, + { + "auxiliary_loss_clip": 0.01000999, + "auxiliary_loss_mlp": 0.00999075, + "balance_loss_clip": 1.00671172, + "balance_loss_mlp": 0.9980852, + "epoch": 0.8981211483541259, + "flos": 56192551384320.0, + "grad_norm": 0.843865572085313, + "language_loss": 0.63512671, + "learning_rate": 1.0783369775863915e-07, + "loss": 0.65512741, + "num_input_tokens_seen": 322187295, + "step": 14938, + "time_per_iteration": 3.0942494869232178 + }, + { + "auxiliary_loss_clip": 0.01085887, + "auxiliary_loss_mlp": 0.01026506, + "balance_loss_clip": 1.03574538, + "balance_loss_mlp": 1.0140903, + "epoch": 0.898181271606794, + "flos": 16392718523520.0, + "grad_norm": 2.1860479541490268, + "language_loss": 0.79759568, + "learning_rate": 1.0770757770136251e-07, + "loss": 0.81871951, + "num_input_tokens_seen": 322202965, + "step": 14939, + "time_per_iteration": 2.663742780685425 + }, + { + "auxiliary_loss_clip": 0.01000054, + "auxiliary_loss_mlp": 0.01000102, + "balance_loss_clip": 1.00716364, + "balance_loss_mlp": 0.99917239, + "epoch": 0.8982413948594619, + "flos": 63440259989760.0, + "grad_norm": 0.7229819252676494, + "language_loss": 0.52847624, + "learning_rate": 1.0758152940000375e-07, + "loss": 0.54847777, + "num_input_tokens_seen": 322269490, + "step": 14940, + "time_per_iteration": 3.3590850830078125 + }, + { + "auxiliary_loss_clip": 0.01109001, + "auxiliary_loss_mlp": 0.01032379, + "balance_loss_clip": 1.03646505, + "balance_loss_mlp": 1.01890242, + "epoch": 0.8983015181121299, + "flos": 21835340029440.0, + "grad_norm": 1.950102596930943, + "language_loss": 0.77829498, + "learning_rate": 1.0745555285934327e-07, + "loss": 0.79970872, + "num_input_tokens_seen": 322288060, + "step": 14941, + "time_per_iteration": 2.744305372238159 + }, + { + "auxiliary_loss_clip": 0.01098003, + "auxiliary_loss_mlp": 0.01036176, + "balance_loss_clip": 1.0353359, + "balance_loss_mlp": 1.02308095, + "epoch": 0.8983616413647978, + "flos": 28949961767040.0, + "grad_norm": 2.416025895078288, + "language_loss": 0.73365378, + "learning_rate": 1.0732964808415834e-07, + "loss": 0.75499552, + "num_input_tokens_seen": 322307930, + "step": 14942, + "time_per_iteration": 2.754950523376465 + }, + { + "auxiliary_loss_clip": 0.01087926, + "auxiliary_loss_mlp": 0.01039089, + "balance_loss_clip": 1.03435743, + "balance_loss_mlp": 1.0259459, + "epoch": 0.8984217646174658, + "flos": 17785083375360.0, + "grad_norm": 3.391273382759864, + "language_loss": 0.79918504, + "learning_rate": 1.0720381507922205e-07, + "loss": 0.82045519, + "num_input_tokens_seen": 322326155, + "step": 14943, + "time_per_iteration": 2.7248191833496094 + }, + { + "auxiliary_loss_clip": 0.01085525, + "auxiliary_loss_mlp": 0.01032768, + "balance_loss_clip": 1.0354557, + "balance_loss_mlp": 1.01938701, + "epoch": 0.8984818878701337, + "flos": 23404528558080.0, + "grad_norm": 1.5192187964233135, + "language_loss": 0.71140742, + "learning_rate": 1.0707805384930701e-07, + "loss": 0.73259044, + "num_input_tokens_seen": 322345850, + "step": 14944, + "time_per_iteration": 2.6967763900756836 + }, + { + "auxiliary_loss_clip": 0.01069595, + "auxiliary_loss_mlp": 0.01033426, + "balance_loss_clip": 1.03214753, + "balance_loss_mlp": 1.01949, + "epoch": 0.8985420111228017, + "flos": 22346061557760.0, + "grad_norm": 2.216725804590017, + "language_loss": 0.76311302, + "learning_rate": 1.0695236439918187e-07, + "loss": 0.78414327, + "num_input_tokens_seen": 322364715, + "step": 14945, + "time_per_iteration": 2.6679043769836426 + }, + { + "auxiliary_loss_clip": 0.01114813, + "auxiliary_loss_mlp": 0.01031111, + "balance_loss_clip": 1.03778219, + "balance_loss_mlp": 1.01778316, + "epoch": 0.8986021343754698, + "flos": 21392776558080.0, + "grad_norm": 2.148577693611771, + "language_loss": 0.73464406, + "learning_rate": 1.0682674673361302e-07, + "loss": 0.75610334, + "num_input_tokens_seen": 322383570, + "step": 14946, + "time_per_iteration": 2.5922229290008545 + }, + { + "auxiliary_loss_clip": 0.0105656, + "auxiliary_loss_mlp": 0.01032137, + "balance_loss_clip": 1.03178859, + "balance_loss_mlp": 1.01898205, + "epoch": 0.8986622576281377, + "flos": 21325372686720.0, + "grad_norm": 1.8845669239623069, + "language_loss": 0.64757031, + "learning_rate": 1.0670120085736334e-07, + "loss": 0.66845727, + "num_input_tokens_seen": 322401375, + "step": 14947, + "time_per_iteration": 2.7270290851593018 + }, + { + "auxiliary_loss_clip": 0.01087566, + "auxiliary_loss_mlp": 0.01034127, + "balance_loss_clip": 1.03707767, + "balance_loss_mlp": 1.02179492, + "epoch": 0.8987223808808057, + "flos": 23988292392960.0, + "grad_norm": 2.018840894039702, + "language_loss": 0.70409435, + "learning_rate": 1.0657572677519411e-07, + "loss": 0.72531128, + "num_input_tokens_seen": 322421890, + "step": 14948, + "time_per_iteration": 2.712301254272461 + }, + { + "auxiliary_loss_clip": 0.01076508, + "auxiliary_loss_mlp": 0.01028981, + "balance_loss_clip": 1.03520036, + "balance_loss_mlp": 1.01646447, + "epoch": 0.8987825041334736, + "flos": 41500956044160.0, + "grad_norm": 1.7555603952219132, + "language_loss": 0.7477864, + "learning_rate": 1.0645032449186309e-07, + "loss": 0.76884139, + "num_input_tokens_seen": 322445730, + "step": 14949, + "time_per_iteration": 2.8739330768585205 + }, + { + "auxiliary_loss_clip": 0.01067975, + "auxiliary_loss_mlp": 0.01032509, + "balance_loss_clip": 1.03525853, + "balance_loss_mlp": 1.01840615, + "epoch": 0.8988426273861416, + "flos": 27564276844800.0, + "grad_norm": 1.6443346508458696, + "language_loss": 0.75822496, + "learning_rate": 1.0632499401212513e-07, + "loss": 0.77922982, + "num_input_tokens_seen": 322464595, + "step": 14950, + "time_per_iteration": 2.801135301589966 + }, + { + "auxiliary_loss_clip": 0.01082227, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.03504586, + "balance_loss_mlp": 1.01929891, + "epoch": 0.8989027506388095, + "flos": 17092653920640.0, + "grad_norm": 1.910819021087814, + "language_loss": 0.66423386, + "learning_rate": 1.0619973534073334e-07, + "loss": 0.68537182, + "num_input_tokens_seen": 322483305, + "step": 14951, + "time_per_iteration": 2.722646951675415 + }, + { + "auxiliary_loss_clip": 0.01110481, + "auxiliary_loss_mlp": 0.01030196, + "balance_loss_clip": 1.03482461, + "balance_loss_mlp": 1.01835823, + "epoch": 0.8989628738914776, + "flos": 20555124416640.0, + "grad_norm": 1.9405005215432696, + "language_loss": 0.73878247, + "learning_rate": 1.0607454848243769e-07, + "loss": 0.76018929, + "num_input_tokens_seen": 322501905, + "step": 14952, + "time_per_iteration": 2.638542413711548 + }, + { + "auxiliary_loss_clip": 0.01108749, + "auxiliary_loss_mlp": 0.01033676, + "balance_loss_clip": 1.03708589, + "balance_loss_mlp": 1.02110505, + "epoch": 0.8990229971441455, + "flos": 16251087196800.0, + "grad_norm": 2.2604855154768595, + "language_loss": 0.56825626, + "learning_rate": 1.0594943344198481e-07, + "loss": 0.58968055, + "num_input_tokens_seen": 322518135, + "step": 14953, + "time_per_iteration": 2.674570083618164 + }, + { + "auxiliary_loss_clip": 0.01083928, + "auxiliary_loss_mlp": 0.01033798, + "balance_loss_clip": 1.03378558, + "balance_loss_mlp": 1.021281, + "epoch": 0.8990831203968135, + "flos": 21981316901760.0, + "grad_norm": 2.264909455658383, + "language_loss": 0.82036901, + "learning_rate": 1.0582439022411915e-07, + "loss": 0.8415463, + "num_input_tokens_seen": 322537905, + "step": 14954, + "time_per_iteration": 2.6860923767089844 + }, + { + "auxiliary_loss_clip": 0.01107036, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.0373497, + "balance_loss_mlp": 1.01929116, + "epoch": 0.8991432436494814, + "flos": 27447171528960.0, + "grad_norm": 1.822158313950773, + "language_loss": 0.59985012, + "learning_rate": 1.0569941883358224e-07, + "loss": 0.621243, + "num_input_tokens_seen": 322557945, + "step": 14955, + "time_per_iteration": 2.645461082458496 + }, + { + "auxiliary_loss_clip": 0.01097918, + "auxiliary_loss_mlp": 0.01032904, + "balance_loss_clip": 1.03783774, + "balance_loss_mlp": 1.02125764, + "epoch": 0.8992033669021494, + "flos": 21579835610880.0, + "grad_norm": 2.0084560499241486, + "language_loss": 0.54700983, + "learning_rate": 1.0557451927511341e-07, + "loss": 0.56831801, + "num_input_tokens_seen": 322575765, + "step": 14956, + "time_per_iteration": 2.6565489768981934 + }, + { + "auxiliary_loss_clip": 0.01063944, + "auxiliary_loss_mlp": 0.01036733, + "balance_loss_clip": 1.03451157, + "balance_loss_mlp": 1.02390623, + "epoch": 0.8992634901548173, + "flos": 28584211530240.0, + "grad_norm": 1.8401685244545993, + "language_loss": 0.79821646, + "learning_rate": 1.0544969155344863e-07, + "loss": 0.81922328, + "num_input_tokens_seen": 322595665, + "step": 14957, + "time_per_iteration": 2.797804117202759 + }, + { + "auxiliary_loss_clip": 0.0111253, + "auxiliary_loss_mlp": 0.01031569, + "balance_loss_clip": 1.03749204, + "balance_loss_mlp": 1.01827729, + "epoch": 0.8993236134074853, + "flos": 19867435557120.0, + "grad_norm": 2.4531476671988663, + "language_loss": 0.78357041, + "learning_rate": 1.0532493567332123e-07, + "loss": 0.80501139, + "num_input_tokens_seen": 322614755, + "step": 14958, + "time_per_iteration": 2.6688661575317383 + }, + { + "auxiliary_loss_clip": 0.0104078, + "auxiliary_loss_mlp": 0.01030342, + "balance_loss_clip": 1.03928471, + "balance_loss_mlp": 1.01852262, + "epoch": 0.8993837366601534, + "flos": 19390649402880.0, + "grad_norm": 2.9878038930671362, + "language_loss": 0.74742228, + "learning_rate": 1.0520025163946277e-07, + "loss": 0.76813352, + "num_input_tokens_seen": 322633425, + "step": 14959, + "time_per_iteration": 2.8125593662261963 + }, + { + "auxiliary_loss_clip": 0.01103112, + "auxiliary_loss_mlp": 0.0103209, + "balance_loss_clip": 1.03359628, + "balance_loss_mlp": 1.01983559, + "epoch": 0.8994438599128213, + "flos": 18551740285440.0, + "grad_norm": 2.0035831193239684, + "language_loss": 0.68275356, + "learning_rate": 1.0507563945660015e-07, + "loss": 0.70410562, + "num_input_tokens_seen": 322652065, + "step": 14960, + "time_per_iteration": 2.5540730953216553 + }, + { + "auxiliary_loss_clip": 0.01084725, + "auxiliary_loss_mlp": 0.010279, + "balance_loss_clip": 1.03622437, + "balance_loss_mlp": 1.01650357, + "epoch": 0.8995039831654893, + "flos": 24427587726720.0, + "grad_norm": 1.431119232973545, + "language_loss": 0.65543896, + "learning_rate": 1.049510991294591e-07, + "loss": 0.67656523, + "num_input_tokens_seen": 322673275, + "step": 14961, + "time_per_iteration": 2.7903378009796143 + }, + { + "auxiliary_loss_clip": 0.01084623, + "auxiliary_loss_mlp": 0.01027938, + "balance_loss_clip": 1.03403842, + "balance_loss_mlp": 1.01648808, + "epoch": 0.8995641064181572, + "flos": 21251324799360.0, + "grad_norm": 1.6157800679699814, + "language_loss": 0.83261824, + "learning_rate": 1.0482663066276254e-07, + "loss": 0.85374379, + "num_input_tokens_seen": 322693375, + "step": 14962, + "time_per_iteration": 2.640796661376953 + }, + { + "auxiliary_loss_clip": 0.01090281, + "auxiliary_loss_mlp": 0.01030459, + "balance_loss_clip": 1.0377152, + "balance_loss_mlp": 1.01667809, + "epoch": 0.8996242296708252, + "flos": 23513661054720.0, + "grad_norm": 2.0695727885892095, + "language_loss": 0.76674181, + "learning_rate": 1.047022340612298e-07, + "loss": 0.7879492, + "num_input_tokens_seen": 322712615, + "step": 14963, + "time_per_iteration": 2.6461212635040283 + }, + { + "auxiliary_loss_clip": 0.00991703, + "auxiliary_loss_mlp": 0.01005224, + "balance_loss_clip": 1.01595902, + "balance_loss_mlp": 1.00418079, + "epoch": 0.8996843529234931, + "flos": 62403230430720.0, + "grad_norm": 0.7797202356654998, + "language_loss": 0.57483667, + "learning_rate": 1.0457790932957867e-07, + "loss": 0.59480596, + "num_input_tokens_seen": 322766855, + "step": 14964, + "time_per_iteration": 3.1848866939544678 + }, + { + "auxiliary_loss_clip": 0.0110498, + "auxiliary_loss_mlp": 0.01033929, + "balance_loss_clip": 1.03950953, + "balance_loss_mlp": 1.02064347, + "epoch": 0.8997444761761612, + "flos": 24236829573120.0, + "grad_norm": 3.314723962162985, + "language_loss": 0.6772269, + "learning_rate": 1.0445365647252269e-07, + "loss": 0.69861603, + "num_input_tokens_seen": 322781130, + "step": 14965, + "time_per_iteration": 2.6162235736846924 + }, + { + "auxiliary_loss_clip": 0.01110984, + "auxiliary_loss_mlp": 0.01030286, + "balance_loss_clip": 1.03775227, + "balance_loss_mlp": 1.01822209, + "epoch": 0.8998045994288291, + "flos": 21361103740800.0, + "grad_norm": 2.9087273995519136, + "language_loss": 0.71626663, + "learning_rate": 1.0432947549477433e-07, + "loss": 0.73767936, + "num_input_tokens_seen": 322800310, + "step": 14966, + "time_per_iteration": 2.5625483989715576 + }, + { + "auxiliary_loss_clip": 0.01076915, + "auxiliary_loss_mlp": 0.0103174, + "balance_loss_clip": 1.0351249, + "balance_loss_mlp": 1.01843047, + "epoch": 0.8998647226814971, + "flos": 28986159697920.0, + "grad_norm": 1.8489899153137084, + "language_loss": 0.73536384, + "learning_rate": 1.0420536640104205e-07, + "loss": 0.75645041, + "num_input_tokens_seen": 322820955, + "step": 14967, + "time_per_iteration": 2.785755157470703 + }, + { + "auxiliary_loss_clip": 0.01064386, + "auxiliary_loss_mlp": 0.00770622, + "balance_loss_clip": 1.03535485, + "balance_loss_mlp": 1.00016153, + "epoch": 0.899924845934165, + "flos": 13625909706240.0, + "grad_norm": 1.9571169995533768, + "language_loss": 0.72163457, + "learning_rate": 1.040813291960323e-07, + "loss": 0.73998475, + "num_input_tokens_seen": 322838780, + "step": 14968, + "time_per_iteration": 2.7936058044433594 + }, + { + "auxiliary_loss_clip": 0.01093703, + "auxiliary_loss_mlp": 0.01032931, + "balance_loss_clip": 1.03627658, + "balance_loss_mlp": 1.02080774, + "epoch": 0.899984969186833, + "flos": 20882629647360.0, + "grad_norm": 1.942509479538182, + "language_loss": 0.71323812, + "learning_rate": 1.0395736388444864e-07, + "loss": 0.73450446, + "num_input_tokens_seen": 322856710, + "step": 14969, + "time_per_iteration": 4.1407389640808105 + }, + { + "auxiliary_loss_clip": 0.01111967, + "auxiliary_loss_mlp": 0.01031075, + "balance_loss_clip": 1.039024, + "balance_loss_mlp": 1.01810515, + "epoch": 0.9000450924395009, + "flos": 20921808407040.0, + "grad_norm": 1.978725901368175, + "language_loss": 0.75983673, + "learning_rate": 1.0383347047099201e-07, + "loss": 0.78126717, + "num_input_tokens_seen": 322876070, + "step": 14970, + "time_per_iteration": 4.195037603378296 + }, + { + "auxiliary_loss_clip": 0.01101891, + "auxiliary_loss_mlp": 0.01032608, + "balance_loss_clip": 1.036654, + "balance_loss_mlp": 1.0206151, + "epoch": 0.900105215692169, + "flos": 17165049782400.0, + "grad_norm": 1.6764087084105503, + "language_loss": 0.73020303, + "learning_rate": 1.0370964896035972e-07, + "loss": 0.75154805, + "num_input_tokens_seen": 322895095, + "step": 14971, + "time_per_iteration": 2.5875184535980225 + }, + { + "auxiliary_loss_clip": 0.0107201, + "auxiliary_loss_mlp": 0.01031462, + "balance_loss_clip": 1.03537893, + "balance_loss_mlp": 1.01815248, + "epoch": 0.900165338944837, + "flos": 19931930426880.0, + "grad_norm": 2.0581551062194703, + "language_loss": 0.8157441, + "learning_rate": 1.035858993572476e-07, + "loss": 0.83677888, + "num_input_tokens_seen": 322911845, + "step": 14972, + "time_per_iteration": 4.170926094055176 + }, + { + "auxiliary_loss_clip": 0.01080845, + "auxiliary_loss_mlp": 0.01030543, + "balance_loss_clip": 1.03386259, + "balance_loss_mlp": 1.01756763, + "epoch": 0.9002254621975049, + "flos": 16107085572480.0, + "grad_norm": 5.44647111318727, + "language_loss": 0.8157503, + "learning_rate": 1.0346222166634855e-07, + "loss": 0.83686423, + "num_input_tokens_seen": 322928170, + "step": 14973, + "time_per_iteration": 4.245764493942261 + }, + { + "auxiliary_loss_clip": 0.01108859, + "auxiliary_loss_mlp": 0.01033665, + "balance_loss_clip": 1.03745437, + "balance_loss_mlp": 1.02064097, + "epoch": 0.9002855854501729, + "flos": 28476120528000.0, + "grad_norm": 1.8931986793937958, + "language_loss": 0.58183479, + "learning_rate": 1.0333861589235193e-07, + "loss": 0.60326004, + "num_input_tokens_seen": 322948165, + "step": 14974, + "time_per_iteration": 2.6841914653778076 + }, + { + "auxiliary_loss_clip": 0.01112242, + "auxiliary_loss_mlp": 0.01034905, + "balance_loss_clip": 1.04007757, + "balance_loss_mlp": 1.02229297, + "epoch": 0.9003457087028408, + "flos": 25630307746560.0, + "grad_norm": 1.7431363980937327, + "language_loss": 0.63522345, + "learning_rate": 1.0321508203994489e-07, + "loss": 0.65669495, + "num_input_tokens_seen": 322968880, + "step": 14975, + "time_per_iteration": 2.620419979095459 + }, + { + "auxiliary_loss_clip": 0.01098045, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.03662229, + "balance_loss_mlp": 1.02011395, + "epoch": 0.9004058319555088, + "flos": 24389414547840.0, + "grad_norm": 1.7931871131687724, + "language_loss": 0.73011506, + "learning_rate": 1.0309162011381257e-07, + "loss": 0.75142372, + "num_input_tokens_seen": 322989395, + "step": 14976, + "time_per_iteration": 2.6519412994384766 + }, + { + "auxiliary_loss_clip": 0.01092647, + "auxiliary_loss_mlp": 0.01031748, + "balance_loss_clip": 1.03749645, + "balance_loss_mlp": 1.01954126, + "epoch": 0.9004659552081767, + "flos": 29059345658880.0, + "grad_norm": 1.7534579820157172, + "language_loss": 0.69598532, + "learning_rate": 1.0296823011863565e-07, + "loss": 0.71722925, + "num_input_tokens_seen": 323009060, + "step": 14977, + "time_per_iteration": 2.6647446155548096 + }, + { + "auxiliary_loss_clip": 0.01082206, + "auxiliary_loss_mlp": 0.00771481, + "balance_loss_clip": 1.03483725, + "balance_loss_mlp": 1.00023878, + "epoch": 0.9005260784608448, + "flos": 16763855800320.0, + "grad_norm": 2.4242637443808603, + "language_loss": 0.65483779, + "learning_rate": 1.0284491205909351e-07, + "loss": 0.67337465, + "num_input_tokens_seen": 323027530, + "step": 14978, + "time_per_iteration": 2.6061410903930664 + }, + { + "auxiliary_loss_clip": 0.01078235, + "auxiliary_loss_mlp": 0.01038613, + "balance_loss_clip": 1.03480434, + "balance_loss_mlp": 1.02505875, + "epoch": 0.9005862017135127, + "flos": 20376002269440.0, + "grad_norm": 1.675464257364332, + "language_loss": 0.78981739, + "learning_rate": 1.0272166593986286e-07, + "loss": 0.81098592, + "num_input_tokens_seen": 323045370, + "step": 14979, + "time_per_iteration": 2.6818509101867676 + }, + { + "auxiliary_loss_clip": 0.01008335, + "auxiliary_loss_mlp": 0.01001784, + "balance_loss_clip": 1.00541806, + "balance_loss_mlp": 1.00071073, + "epoch": 0.9006463249661807, + "flos": 67580255796480.0, + "grad_norm": 0.7182102286721572, + "language_loss": 0.535707, + "learning_rate": 1.0259849176561642e-07, + "loss": 0.55580819, + "num_input_tokens_seen": 323105660, + "step": 14980, + "time_per_iteration": 3.2093987464904785 + }, + { + "auxiliary_loss_clip": 0.01103101, + "auxiliary_loss_mlp": 0.01041025, + "balance_loss_clip": 1.03967285, + "balance_loss_mlp": 1.0275898, + "epoch": 0.9007064482188486, + "flos": 28293335193600.0, + "grad_norm": 1.66637606590706, + "language_loss": 0.82372773, + "learning_rate": 1.0247538954102553e-07, + "loss": 0.84516907, + "num_input_tokens_seen": 323126365, + "step": 14981, + "time_per_iteration": 2.650113582611084 + }, + { + "auxiliary_loss_clip": 0.01066706, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.03680325, + "balance_loss_mlp": 1.02073503, + "epoch": 0.9007665714715166, + "flos": 21616320850560.0, + "grad_norm": 2.004462394579591, + "language_loss": 0.81781876, + "learning_rate": 1.0235235927075758e-07, + "loss": 0.83881551, + "num_input_tokens_seen": 323145655, + "step": 14982, + "time_per_iteration": 2.7423040866851807 + }, + { + "auxiliary_loss_clip": 0.01075107, + "auxiliary_loss_mlp": 0.01040244, + "balance_loss_clip": 1.03167033, + "balance_loss_mlp": 1.0271908, + "epoch": 0.9008266947241845, + "flos": 26541864120960.0, + "grad_norm": 1.9584785414964334, + "language_loss": 0.71540499, + "learning_rate": 1.0222940095947885e-07, + "loss": 0.73655844, + "num_input_tokens_seen": 323164540, + "step": 14983, + "time_per_iteration": 2.7024779319763184 + }, + { + "auxiliary_loss_clip": 0.01097308, + "auxiliary_loss_mlp": 0.0102736, + "balance_loss_clip": 1.03790069, + "balance_loss_mlp": 1.01611233, + "epoch": 0.9008868179768525, + "flos": 23110527738240.0, + "grad_norm": 1.3086245920828656, + "language_loss": 0.74951446, + "learning_rate": 1.0210651461185115e-07, + "loss": 0.77076113, + "num_input_tokens_seen": 323186960, + "step": 14984, + "time_per_iteration": 2.813418388366699 + }, + { + "auxiliary_loss_clip": 0.01104396, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.03461576, + "balance_loss_mlp": 1.02068746, + "epoch": 0.9009469412295206, + "flos": 19060809788160.0, + "grad_norm": 1.7050072282727156, + "language_loss": 0.70293552, + "learning_rate": 1.0198370023253456e-07, + "loss": 0.72430742, + "num_input_tokens_seen": 323206135, + "step": 14985, + "time_per_iteration": 2.767937183380127 + }, + { + "auxiliary_loss_clip": 0.01087695, + "auxiliary_loss_mlp": 0.01033923, + "balance_loss_clip": 1.03466606, + "balance_loss_mlp": 1.02110791, + "epoch": 0.9010070644821885, + "flos": 23222281927680.0, + "grad_norm": 2.1137974022402575, + "language_loss": 0.70276654, + "learning_rate": 1.0186095782618643e-07, + "loss": 0.72398281, + "num_input_tokens_seen": 323225980, + "step": 14986, + "time_per_iteration": 2.7246689796447754 + }, + { + "auxiliary_loss_clip": 0.01096893, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.03352499, + "balance_loss_mlp": 1.01991105, + "epoch": 0.9010671877348565, + "flos": 17384823146880.0, + "grad_norm": 1.658677803041605, + "language_loss": 0.76861989, + "learning_rate": 1.0173828739746104e-07, + "loss": 0.78991318, + "num_input_tokens_seen": 323243700, + "step": 14987, + "time_per_iteration": 2.5764570236206055 + }, + { + "auxiliary_loss_clip": 0.01092941, + "auxiliary_loss_mlp": 0.0103351, + "balance_loss_clip": 1.03674459, + "balance_loss_mlp": 1.02103519, + "epoch": 0.9011273109875244, + "flos": 21908166854400.0, + "grad_norm": 1.956004475384015, + "language_loss": 0.73540664, + "learning_rate": 1.0161568895100981e-07, + "loss": 0.75667119, + "num_input_tokens_seen": 323261535, + "step": 14988, + "time_per_iteration": 2.558128595352173 + }, + { + "auxiliary_loss_clip": 0.01086646, + "auxiliary_loss_mlp": 0.01032746, + "balance_loss_clip": 1.03845191, + "balance_loss_mlp": 1.01903129, + "epoch": 0.9011874342401924, + "flos": 24060831909120.0, + "grad_norm": 1.6803307482650078, + "language_loss": 0.69135392, + "learning_rate": 1.0149316249148188e-07, + "loss": 0.7125479, + "num_input_tokens_seen": 323281855, + "step": 14989, + "time_per_iteration": 2.650520086288452 + }, + { + "auxiliary_loss_clip": 0.01109667, + "auxiliary_loss_mlp": 0.0102853, + "balance_loss_clip": 1.03716099, + "balance_loss_mlp": 1.01638818, + "epoch": 0.9012475574928603, + "flos": 16758791982720.0, + "grad_norm": 1.8703364751087568, + "language_loss": 0.79935807, + "learning_rate": 1.0137070802352376e-07, + "loss": 0.8207401, + "num_input_tokens_seen": 323299505, + "step": 14990, + "time_per_iteration": 2.5482540130615234 + }, + { + "auxiliary_loss_clip": 0.0107379, + "auxiliary_loss_mlp": 0.01031402, + "balance_loss_clip": 1.03743267, + "balance_loss_mlp": 1.01825941, + "epoch": 0.9013076807455284, + "flos": 19971109186560.0, + "grad_norm": 1.999297573895168, + "language_loss": 0.78150022, + "learning_rate": 1.0124832555177842e-07, + "loss": 0.8025521, + "num_input_tokens_seen": 323318365, + "step": 14991, + "time_per_iteration": 2.7104129791259766 + }, + { + "auxiliary_loss_clip": 0.00995246, + "auxiliary_loss_mlp": 0.00751703, + "balance_loss_clip": 1.00523067, + "balance_loss_mlp": 0.9995659, + "epoch": 0.9013678039981963, + "flos": 65180274624000.0, + "grad_norm": 0.7792478224468473, + "language_loss": 0.60261661, + "learning_rate": 1.0112601508088726e-07, + "loss": 0.62008613, + "num_input_tokens_seen": 323371835, + "step": 14992, + "time_per_iteration": 3.123297691345215 + }, + { + "auxiliary_loss_clip": 0.0109359, + "auxiliary_loss_mlp": 0.0102822, + "balance_loss_clip": 1.03499448, + "balance_loss_mlp": 1.01605463, + "epoch": 0.9014279272508643, + "flos": 20521224956160.0, + "grad_norm": 2.279288260696507, + "language_loss": 0.82825989, + "learning_rate": 1.0100377661548764e-07, + "loss": 0.84947795, + "num_input_tokens_seen": 323388495, + "step": 14993, + "time_per_iteration": 2.574572801589966 + }, + { + "auxiliary_loss_clip": 0.01107431, + "auxiliary_loss_mlp": 0.01034277, + "balance_loss_clip": 1.03556728, + "balance_loss_mlp": 1.02142596, + "epoch": 0.9014880505035322, + "flos": 17309051406720.0, + "grad_norm": 2.2716926772447286, + "language_loss": 0.73481464, + "learning_rate": 1.0088161016021502e-07, + "loss": 0.75623167, + "num_input_tokens_seen": 323405280, + "step": 14994, + "time_per_iteration": 2.538275957107544 + }, + { + "auxiliary_loss_clip": 0.01093439, + "auxiliary_loss_mlp": 0.01026115, + "balance_loss_clip": 1.03458691, + "balance_loss_mlp": 1.01476073, + "epoch": 0.9015481737562002, + "flos": 28402862739840.0, + "grad_norm": 1.830419215860498, + "language_loss": 0.64486498, + "learning_rate": 1.0075951571970187e-07, + "loss": 0.66606051, + "num_input_tokens_seen": 323425310, + "step": 14995, + "time_per_iteration": 2.623666286468506 + }, + { + "auxiliary_loss_clip": 0.01069201, + "auxiliary_loss_mlp": 0.01033584, + "balance_loss_clip": 1.03037524, + "balance_loss_mlp": 1.01956463, + "epoch": 0.9016082970088681, + "flos": 29752672953600.0, + "grad_norm": 1.8771395079815063, + "language_loss": 0.66334212, + "learning_rate": 1.0063749329857873e-07, + "loss": 0.68436992, + "num_input_tokens_seen": 323447805, + "step": 14996, + "time_per_iteration": 2.781064510345459 + }, + { + "auxiliary_loss_clip": 0.01095585, + "auxiliary_loss_mlp": 0.01028339, + "balance_loss_clip": 1.03407955, + "balance_loss_mlp": 1.01678181, + "epoch": 0.9016684202615362, + "flos": 23513230091520.0, + "grad_norm": 1.8246548425287972, + "language_loss": 0.66247928, + "learning_rate": 1.0051554290147168e-07, + "loss": 0.68371856, + "num_input_tokens_seen": 323467150, + "step": 14997, + "time_per_iteration": 2.71907114982605 + }, + { + "auxiliary_loss_clip": 0.01080625, + "auxiliary_loss_mlp": 0.01037629, + "balance_loss_clip": 1.03254914, + "balance_loss_mlp": 1.02483201, + "epoch": 0.9017285435142042, + "flos": 16979247705600.0, + "grad_norm": 1.8261489433850353, + "language_loss": 0.77650619, + "learning_rate": 1.0039366453300613e-07, + "loss": 0.79768866, + "num_input_tokens_seen": 323484250, + "step": 14998, + "time_per_iteration": 2.6528589725494385 + }, + { + "auxiliary_loss_clip": 0.0110937, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.03644896, + "balance_loss_mlp": 1.0174706, + "epoch": 0.9017886667668721, + "flos": 21393351175680.0, + "grad_norm": 1.6740643670552307, + "language_loss": 0.75225437, + "learning_rate": 1.0027185819780281e-07, + "loss": 0.77364427, + "num_input_tokens_seen": 323502910, + "step": 14999, + "time_per_iteration": 2.5951831340789795 + }, + { + "auxiliary_loss_clip": 0.01045283, + "auxiliary_loss_mlp": 0.01030998, + "balance_loss_clip": 1.03599596, + "balance_loss_mlp": 1.01742625, + "epoch": 0.9018487900195401, + "flos": 20996574566400.0, + "grad_norm": 8.557112820549731, + "language_loss": 0.75833976, + "learning_rate": 1.0015012390048117e-07, + "loss": 0.77910256, + "num_input_tokens_seen": 323521820, + "step": 15000, + "time_per_iteration": 2.7700390815734863 + }, + { + "auxiliary_loss_clip": 0.01090367, + "auxiliary_loss_mlp": 0.01028007, + "balance_loss_clip": 1.03579473, + "balance_loss_mlp": 1.01666379, + "epoch": 0.901908913272208, + "flos": 53358443458560.0, + "grad_norm": 2.1350900173970153, + "language_loss": 0.80694187, + "learning_rate": 1.0002846164565704e-07, + "loss": 0.8281256, + "num_input_tokens_seen": 323543200, + "step": 15001, + "time_per_iteration": 2.914686918258667 + }, + { + "auxiliary_loss_clip": 0.01076218, + "auxiliary_loss_mlp": 0.01028009, + "balance_loss_clip": 1.03696334, + "balance_loss_mlp": 1.01640391, + "epoch": 0.901969036524876, + "flos": 22089838867200.0, + "grad_norm": 1.603585496644282, + "language_loss": 0.78372264, + "learning_rate": 9.990687143794407e-08, + "loss": 0.80476493, + "num_input_tokens_seen": 323563075, + "step": 15002, + "time_per_iteration": 2.7641050815582275 + }, + { + "auxiliary_loss_clip": 0.01082464, + "auxiliary_loss_mlp": 0.01045075, + "balance_loss_clip": 1.03580308, + "balance_loss_mlp": 1.03049612, + "epoch": 0.9020291597775439, + "flos": 23835025059840.0, + "grad_norm": 2.0404977120917147, + "language_loss": 0.68748105, + "learning_rate": 9.978535328195347e-08, + "loss": 0.70875645, + "num_input_tokens_seen": 323579065, + "step": 15003, + "time_per_iteration": 2.771782875061035 + }, + { + "auxiliary_loss_clip": 0.01085032, + "auxiliary_loss_mlp": 0.0103817, + "balance_loss_clip": 1.03330088, + "balance_loss_mlp": 1.02506328, + "epoch": 0.902089283030212, + "flos": 18326005263360.0, + "grad_norm": 1.8107770462670902, + "language_loss": 0.85949785, + "learning_rate": 9.9663907182292e-08, + "loss": 0.88072991, + "num_input_tokens_seen": 323594835, + "step": 15004, + "time_per_iteration": 2.666977882385254 + }, + { + "auxiliary_loss_clip": 0.0107511, + "auxiliary_loss_mlp": 0.01035534, + "balance_loss_clip": 1.03480256, + "balance_loss_mlp": 1.02221882, + "epoch": 0.9021494062828799, + "flos": 24170359455360.0, + "grad_norm": 2.562101889878063, + "language_loss": 0.71954483, + "learning_rate": 9.954253314356575e-08, + "loss": 0.74065125, + "num_input_tokens_seen": 323611475, + "step": 15005, + "time_per_iteration": 2.759964942932129 + }, + { + "auxiliary_loss_clip": 0.01100393, + "auxiliary_loss_mlp": 0.01030904, + "balance_loss_clip": 1.03423667, + "balance_loss_mlp": 1.01778543, + "epoch": 0.9022095295355479, + "flos": 21616859554560.0, + "grad_norm": 1.9300274914184496, + "language_loss": 0.70556152, + "learning_rate": 9.942123117037748e-08, + "loss": 0.72687459, + "num_input_tokens_seen": 323629730, + "step": 15006, + "time_per_iteration": 2.6384735107421875 + }, + { + "auxiliary_loss_clip": 0.01086555, + "auxiliary_loss_mlp": 0.0102873, + "balance_loss_clip": 1.03485382, + "balance_loss_mlp": 1.01679754, + "epoch": 0.9022696527882158, + "flos": 18726229578240.0, + "grad_norm": 3.0054319156686264, + "language_loss": 0.84866273, + "learning_rate": 9.930000126732618e-08, + "loss": 0.86981565, + "num_input_tokens_seen": 323646000, + "step": 15007, + "time_per_iteration": 2.648921489715576 + }, + { + "auxiliary_loss_clip": 0.01079211, + "auxiliary_loss_mlp": 0.01030937, + "balance_loss_clip": 1.03299296, + "balance_loss_mlp": 1.01809239, + "epoch": 0.9023297760408838, + "flos": 26761206522240.0, + "grad_norm": 1.9317011784213973, + "language_loss": 0.7883476, + "learning_rate": 9.917884343900928e-08, + "loss": 0.80944914, + "num_input_tokens_seen": 323667250, + "step": 15008, + "time_per_iteration": 4.242715120315552 + }, + { + "auxiliary_loss_clip": 0.01063806, + "auxiliary_loss_mlp": 0.01033642, + "balance_loss_clip": 1.03497434, + "balance_loss_mlp": 1.0214889, + "epoch": 0.9023898992935517, + "flos": 20522553759360.0, + "grad_norm": 1.866627762329264, + "language_loss": 0.73153245, + "learning_rate": 9.905775769002156e-08, + "loss": 0.75250691, + "num_input_tokens_seen": 323687150, + "step": 15009, + "time_per_iteration": 2.691822052001953 + }, + { + "auxiliary_loss_clip": 0.01107314, + "auxiliary_loss_mlp": 0.01035667, + "balance_loss_clip": 1.0361793, + "balance_loss_mlp": 1.02314413, + "epoch": 0.9024500225462198, + "flos": 17456644391040.0, + "grad_norm": 1.76387616724559, + "language_loss": 0.73348868, + "learning_rate": 9.893674402495399e-08, + "loss": 0.75491852, + "num_input_tokens_seen": 323703660, + "step": 15010, + "time_per_iteration": 4.291422128677368 + }, + { + "auxiliary_loss_clip": 0.0108209, + "auxiliary_loss_mlp": 0.01035862, + "balance_loss_clip": 1.03634191, + "balance_loss_mlp": 1.02284431, + "epoch": 0.9025101457988878, + "flos": 20813609664000.0, + "grad_norm": 2.097335794667479, + "language_loss": 0.74242449, + "learning_rate": 9.881580244839538e-08, + "loss": 0.76360393, + "num_input_tokens_seen": 323722060, + "step": 15011, + "time_per_iteration": 4.15416693687439 + }, + { + "auxiliary_loss_clip": 0.01101836, + "auxiliary_loss_mlp": 0.01031616, + "balance_loss_clip": 1.03616571, + "balance_loss_mlp": 1.01824617, + "epoch": 0.9025702690515557, + "flos": 19026371623680.0, + "grad_norm": 10.830412851776218, + "language_loss": 0.72975504, + "learning_rate": 9.869493296493204e-08, + "loss": 0.75108945, + "num_input_tokens_seen": 323740645, + "step": 15012, + "time_per_iteration": 4.172262668609619 + }, + { + "auxiliary_loss_clip": 0.01073966, + "auxiliary_loss_mlp": 0.01038584, + "balance_loss_clip": 1.03479862, + "balance_loss_mlp": 1.02719402, + "epoch": 0.9026303923042237, + "flos": 19682818629120.0, + "grad_norm": 1.6805885971222159, + "language_loss": 0.69541949, + "learning_rate": 9.857413557914763e-08, + "loss": 0.71654499, + "num_input_tokens_seen": 323758905, + "step": 15013, + "time_per_iteration": 2.6801204681396484 + }, + { + "auxiliary_loss_clip": 0.01092922, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.03412437, + "balance_loss_mlp": 1.01987374, + "epoch": 0.9026905155568916, + "flos": 24608110504320.0, + "grad_norm": 1.451081928504829, + "language_loss": 0.73157448, + "learning_rate": 9.845341029562249e-08, + "loss": 0.75282216, + "num_input_tokens_seen": 323780595, + "step": 15014, + "time_per_iteration": 2.6699087619781494 + }, + { + "auxiliary_loss_clip": 0.01107905, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.03593612, + "balance_loss_mlp": 1.01995111, + "epoch": 0.9027506388095596, + "flos": 20521799573760.0, + "grad_norm": 1.9727005096909034, + "language_loss": 0.72401255, + "learning_rate": 9.833275711893474e-08, + "loss": 0.74541688, + "num_input_tokens_seen": 323798160, + "step": 15015, + "time_per_iteration": 2.536134958267212 + }, + { + "auxiliary_loss_clip": 0.01083409, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.03356743, + "balance_loss_mlp": 1.02245855, + "epoch": 0.9028107620622275, + "flos": 22784494965120.0, + "grad_norm": 2.2967609307485213, + "language_loss": 0.6894691, + "learning_rate": 9.821217605365895e-08, + "loss": 0.71064925, + "num_input_tokens_seen": 323816810, + "step": 15016, + "time_per_iteration": 2.696544647216797 + }, + { + "auxiliary_loss_clip": 0.01105993, + "auxiliary_loss_mlp": 0.01027953, + "balance_loss_clip": 1.03578448, + "balance_loss_mlp": 1.0165025, + "epoch": 0.9028708853148956, + "flos": 25410534382080.0, + "grad_norm": 1.8623697779544957, + "language_loss": 0.7037698, + "learning_rate": 9.809166710436855e-08, + "loss": 0.72510922, + "num_input_tokens_seen": 323836900, + "step": 15017, + "time_per_iteration": 2.595538377761841 + }, + { + "auxiliary_loss_clip": 0.01086858, + "auxiliary_loss_mlp": 0.01033508, + "balance_loss_clip": 1.03965449, + "balance_loss_mlp": 1.02197492, + "epoch": 0.9029310085675635, + "flos": 21871322478720.0, + "grad_norm": 1.936832914018773, + "language_loss": 0.69448954, + "learning_rate": 9.797123027563237e-08, + "loss": 0.71569324, + "num_input_tokens_seen": 323855325, + "step": 15018, + "time_per_iteration": 2.6294448375701904 + }, + { + "auxiliary_loss_clip": 0.01097184, + "auxiliary_loss_mlp": 0.01030964, + "balance_loss_clip": 1.0363183, + "balance_loss_mlp": 1.01848841, + "epoch": 0.9029911318202315, + "flos": 26214394803840.0, + "grad_norm": 2.670057075172495, + "language_loss": 0.68977821, + "learning_rate": 9.785086557201782e-08, + "loss": 0.71105969, + "num_input_tokens_seen": 323875650, + "step": 15019, + "time_per_iteration": 2.7993857860565186 + }, + { + "auxiliary_loss_clip": 0.01105429, + "auxiliary_loss_mlp": 0.0103295, + "balance_loss_clip": 1.03574717, + "balance_loss_mlp": 1.02123153, + "epoch": 0.9030512550728994, + "flos": 15961360095360.0, + "grad_norm": 1.9111353110117102, + "language_loss": 0.72140992, + "learning_rate": 9.773057299808951e-08, + "loss": 0.74279368, + "num_input_tokens_seen": 323892920, + "step": 15020, + "time_per_iteration": 2.588925361633301 + }, + { + "auxiliary_loss_clip": 0.01094641, + "auxiliary_loss_mlp": 0.01030177, + "balance_loss_clip": 1.03369665, + "balance_loss_mlp": 1.01739788, + "epoch": 0.9031113783255674, + "flos": 23987610034560.0, + "grad_norm": 1.5881960753658597, + "language_loss": 0.74275625, + "learning_rate": 9.7610352558408e-08, + "loss": 0.76400447, + "num_input_tokens_seen": 323913835, + "step": 15021, + "time_per_iteration": 2.588358163833618 + }, + { + "auxiliary_loss_clip": 0.01112744, + "auxiliary_loss_mlp": 0.01031351, + "balance_loss_clip": 1.03767323, + "balance_loss_mlp": 1.01803565, + "epoch": 0.9031715015782353, + "flos": 22237216369920.0, + "grad_norm": 2.206963784071178, + "language_loss": 0.7280935, + "learning_rate": 9.749020425753251e-08, + "loss": 0.74953449, + "num_input_tokens_seen": 323933440, + "step": 15022, + "time_per_iteration": 2.536369562149048 + }, + { + "auxiliary_loss_clip": 0.01068128, + "auxiliary_loss_mlp": 0.01027903, + "balance_loss_clip": 1.03312647, + "balance_loss_mlp": 1.01603556, + "epoch": 0.9032316248309034, + "flos": 26323168164480.0, + "grad_norm": 2.626283652398094, + "language_loss": 0.72459871, + "learning_rate": 9.737012810001943e-08, + "loss": 0.74555898, + "num_input_tokens_seen": 323954090, + "step": 15023, + "time_per_iteration": 2.7086663246154785 + }, + { + "auxiliary_loss_clip": 0.01095012, + "auxiliary_loss_mlp": 0.01033688, + "balance_loss_clip": 1.03661966, + "balance_loss_mlp": 1.02148056, + "epoch": 0.9032917480835713, + "flos": 22636686499200.0, + "grad_norm": 1.615390594189699, + "language_loss": 0.82334167, + "learning_rate": 9.725012409042155e-08, + "loss": 0.84462869, + "num_input_tokens_seen": 323974040, + "step": 15024, + "time_per_iteration": 2.6185879707336426 + }, + { + "auxiliary_loss_clip": 0.01099161, + "auxiliary_loss_mlp": 0.01028549, + "balance_loss_clip": 1.03624964, + "balance_loss_mlp": 1.01650262, + "epoch": 0.9033518713362393, + "flos": 23878764846720.0, + "grad_norm": 1.6458847486672181, + "language_loss": 0.69518673, + "learning_rate": 9.713019223328966e-08, + "loss": 0.7164638, + "num_input_tokens_seen": 323996125, + "step": 15025, + "time_per_iteration": 2.6076362133026123 + }, + { + "auxiliary_loss_clip": 0.01073996, + "auxiliary_loss_mlp": 0.01035637, + "balance_loss_clip": 1.03491449, + "balance_loss_mlp": 1.02332294, + "epoch": 0.9034119945889073, + "flos": 26905279973760.0, + "grad_norm": 1.5601899591487556, + "language_loss": 0.76521379, + "learning_rate": 9.70103325331717e-08, + "loss": 0.78631014, + "num_input_tokens_seen": 324017645, + "step": 15026, + "time_per_iteration": 2.7674145698547363 + }, + { + "auxiliary_loss_clip": 0.01098222, + "auxiliary_loss_mlp": 0.01030884, + "balance_loss_clip": 1.03840423, + "balance_loss_mlp": 1.01899886, + "epoch": 0.9034721178415752, + "flos": 20850166730880.0, + "grad_norm": 2.0222752747400192, + "language_loss": 0.68377501, + "learning_rate": 9.68905449946129e-08, + "loss": 0.70506608, + "num_input_tokens_seen": 324036875, + "step": 15027, + "time_per_iteration": 2.6653904914855957 + }, + { + "auxiliary_loss_clip": 0.01052551, + "auxiliary_loss_mlp": 0.01041128, + "balance_loss_clip": 1.03196084, + "balance_loss_mlp": 1.02769923, + "epoch": 0.9035322410942432, + "flos": 22234307368320.0, + "grad_norm": 1.6548540409634305, + "language_loss": 0.75698447, + "learning_rate": 9.677082962215477e-08, + "loss": 0.7779212, + "num_input_tokens_seen": 324057045, + "step": 15028, + "time_per_iteration": 2.7179388999938965 + }, + { + "auxiliary_loss_clip": 0.01052919, + "auxiliary_loss_mlp": 0.0103746, + "balance_loss_clip": 1.03367233, + "balance_loss_mlp": 1.02507436, + "epoch": 0.9035923643469111, + "flos": 25923410726400.0, + "grad_norm": 1.805593039358967, + "language_loss": 0.69399357, + "learning_rate": 9.665118642033765e-08, + "loss": 0.71489739, + "num_input_tokens_seen": 324079735, + "step": 15029, + "time_per_iteration": 2.813114643096924 + }, + { + "auxiliary_loss_clip": 0.01096672, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.03852797, + "balance_loss_mlp": 1.02123141, + "epoch": 0.9036524875995792, + "flos": 20339804338560.0, + "grad_norm": 1.8345501751502649, + "language_loss": 0.7369951, + "learning_rate": 9.653161539369858e-08, + "loss": 0.75830793, + "num_input_tokens_seen": 324097785, + "step": 15030, + "time_per_iteration": 2.696516990661621 + }, + { + "auxiliary_loss_clip": 0.01101797, + "auxiliary_loss_mlp": 0.01030355, + "balance_loss_clip": 1.03739715, + "balance_loss_mlp": 1.01790965, + "epoch": 0.9037126108522471, + "flos": 40114624677120.0, + "grad_norm": 2.451430150859209, + "language_loss": 0.6831615, + "learning_rate": 9.641211654677151e-08, + "loss": 0.70448303, + "num_input_tokens_seen": 324121625, + "step": 15031, + "time_per_iteration": 2.776313543319702 + }, + { + "auxiliary_loss_clip": 0.01085756, + "auxiliary_loss_mlp": 0.01028456, + "balance_loss_clip": 1.03706944, + "balance_loss_mlp": 1.01662993, + "epoch": 0.9037727341049151, + "flos": 23332024955520.0, + "grad_norm": 1.492349301530935, + "language_loss": 0.76186407, + "learning_rate": 9.629268988408723e-08, + "loss": 0.78300619, + "num_input_tokens_seen": 324142535, + "step": 15032, + "time_per_iteration": 2.722729206085205 + }, + { + "auxiliary_loss_clip": 0.01110023, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.03756511, + "balance_loss_mlp": 1.02144957, + "epoch": 0.903832857357583, + "flos": 12822659815680.0, + "grad_norm": 1.761761043861274, + "language_loss": 0.75420368, + "learning_rate": 9.617333541017502e-08, + "loss": 0.77564085, + "num_input_tokens_seen": 324159610, + "step": 15033, + "time_per_iteration": 2.6790883541107178 + }, + { + "auxiliary_loss_clip": 0.01074569, + "auxiliary_loss_mlp": 0.01038477, + "balance_loss_clip": 1.03108501, + "balance_loss_mlp": 1.02516127, + "epoch": 0.903892980610251, + "flos": 25703026830720.0, + "grad_norm": 1.9648590511752269, + "language_loss": 0.73957044, + "learning_rate": 9.605405312956105e-08, + "loss": 0.76070094, + "num_input_tokens_seen": 324182510, + "step": 15034, + "time_per_iteration": 2.7564845085144043 + }, + { + "auxiliary_loss_clip": 0.01076984, + "auxiliary_loss_mlp": 0.01032868, + "balance_loss_clip": 1.03676867, + "balance_loss_mlp": 1.02031517, + "epoch": 0.9039531038629189, + "flos": 14684089397760.0, + "grad_norm": 2.177722949634339, + "language_loss": 0.6356231, + "learning_rate": 9.593484304676791e-08, + "loss": 0.65672159, + "num_input_tokens_seen": 324200555, + "step": 15035, + "time_per_iteration": 2.714242935180664 + }, + { + "auxiliary_loss_clip": 0.01109298, + "auxiliary_loss_mlp": 0.01032355, + "balance_loss_clip": 1.0378021, + "balance_loss_mlp": 1.01890254, + "epoch": 0.904013227115587, + "flos": 24024921287040.0, + "grad_norm": 2.5713675612269897, + "language_loss": 0.61697221, + "learning_rate": 9.581570516631643e-08, + "loss": 0.63838875, + "num_input_tokens_seen": 324220255, + "step": 15036, + "time_per_iteration": 2.6531126499176025 + }, + { + "auxiliary_loss_clip": 0.01057116, + "auxiliary_loss_mlp": 0.01033081, + "balance_loss_clip": 1.03590751, + "balance_loss_mlp": 1.02079058, + "epoch": 0.9040733503682549, + "flos": 22856459863680.0, + "grad_norm": 1.6688110224130346, + "language_loss": 0.82059491, + "learning_rate": 9.569663949272455e-08, + "loss": 0.84149683, + "num_input_tokens_seen": 324237855, + "step": 15037, + "time_per_iteration": 2.667306661605835 + }, + { + "auxiliary_loss_clip": 0.01111291, + "auxiliary_loss_mlp": 0.01029139, + "balance_loss_clip": 1.03720188, + "balance_loss_mlp": 1.01677668, + "epoch": 0.9041334736209229, + "flos": 19974951941760.0, + "grad_norm": 1.9034264024294631, + "language_loss": 0.67595971, + "learning_rate": 9.557764603050667e-08, + "loss": 0.69736397, + "num_input_tokens_seen": 324257050, + "step": 15038, + "time_per_iteration": 2.546713352203369 + }, + { + "auxiliary_loss_clip": 0.01085126, + "auxiliary_loss_mlp": 0.0103871, + "balance_loss_clip": 1.03417087, + "balance_loss_mlp": 1.02606213, + "epoch": 0.9041935968735909, + "flos": 17530548624000.0, + "grad_norm": 2.007069946827801, + "language_loss": 0.7516647, + "learning_rate": 9.545872478417494e-08, + "loss": 0.77290308, + "num_input_tokens_seen": 324275510, + "step": 15039, + "time_per_iteration": 2.6198740005493164 + }, + { + "auxiliary_loss_clip": 0.01082867, + "auxiliary_loss_mlp": 0.01030216, + "balance_loss_clip": 1.03606772, + "balance_loss_mlp": 1.01828885, + "epoch": 0.9042537201262588, + "flos": 22780149419520.0, + "grad_norm": 1.4865254834014996, + "language_loss": 0.70274264, + "learning_rate": 9.533987575823977e-08, + "loss": 0.7238735, + "num_input_tokens_seen": 324295150, + "step": 15040, + "time_per_iteration": 2.6253907680511475 + }, + { + "auxiliary_loss_clip": 0.01073575, + "auxiliary_loss_mlp": 0.01031021, + "balance_loss_clip": 1.03373194, + "balance_loss_mlp": 1.01905835, + "epoch": 0.9043138433789268, + "flos": 20595416497920.0, + "grad_norm": 1.5884049488424423, + "language_loss": 0.67547166, + "learning_rate": 9.522109895720709e-08, + "loss": 0.69651759, + "num_input_tokens_seen": 324313855, + "step": 15041, + "time_per_iteration": 2.6538193225860596 + }, + { + "auxiliary_loss_clip": 0.01096511, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.0354538, + "balance_loss_mlp": 1.02016878, + "epoch": 0.9043739666315948, + "flos": 32962978995840.0, + "grad_norm": 1.757404597325935, + "language_loss": 0.57556689, + "learning_rate": 9.510239438558155e-08, + "loss": 0.59686273, + "num_input_tokens_seen": 324338465, + "step": 15042, + "time_per_iteration": 2.7718114852905273 + }, + { + "auxiliary_loss_clip": 0.01010523, + "auxiliary_loss_mlp": 0.00751383, + "balance_loss_clip": 1.00739527, + "balance_loss_mlp": 0.99962682, + "epoch": 0.9044340898842628, + "flos": 67296418525440.0, + "grad_norm": 0.79646583953312, + "language_loss": 0.56897914, + "learning_rate": 9.498376204786351e-08, + "loss": 0.58659816, + "num_input_tokens_seen": 324398740, + "step": 15043, + "time_per_iteration": 3.1866395473480225 + }, + { + "auxiliary_loss_clip": 0.01086756, + "auxiliary_loss_mlp": 0.01031593, + "balance_loss_clip": 1.03518081, + "balance_loss_mlp": 1.01791954, + "epoch": 0.9044942131369307, + "flos": 17713154390400.0, + "grad_norm": 2.037927105640118, + "language_loss": 0.69802731, + "learning_rate": 9.486520194855274e-08, + "loss": 0.71921074, + "num_input_tokens_seen": 324417335, + "step": 15044, + "time_per_iteration": 2.6936917304992676 + }, + { + "auxiliary_loss_clip": 0.01089873, + "auxiliary_loss_mlp": 0.01040675, + "balance_loss_clip": 1.03643358, + "balance_loss_mlp": 1.02699018, + "epoch": 0.9045543363895987, + "flos": 17820563034240.0, + "grad_norm": 2.361452153722679, + "language_loss": 0.69954962, + "learning_rate": 9.474671409214407e-08, + "loss": 0.72085512, + "num_input_tokens_seen": 324433240, + "step": 15045, + "time_per_iteration": 2.655958414077759 + }, + { + "auxiliary_loss_clip": 0.01077221, + "auxiliary_loss_mlp": 0.01037261, + "balance_loss_clip": 1.0350486, + "balance_loss_mlp": 1.02417183, + "epoch": 0.9046144596422666, + "flos": 21872723109120.0, + "grad_norm": 1.816781294987572, + "language_loss": 0.65513825, + "learning_rate": 9.462829848313081e-08, + "loss": 0.67628312, + "num_input_tokens_seen": 324452675, + "step": 15046, + "time_per_iteration": 2.704993963241577 + }, + { + "auxiliary_loss_clip": 0.01077406, + "auxiliary_loss_mlp": 0.01039336, + "balance_loss_clip": 1.03620148, + "balance_loss_mlp": 1.02714109, + "epoch": 0.9046745828949346, + "flos": 17672646827520.0, + "grad_norm": 2.027120160637291, + "language_loss": 0.62039495, + "learning_rate": 9.450995512600379e-08, + "loss": 0.6415624, + "num_input_tokens_seen": 324467865, + "step": 15047, + "time_per_iteration": 2.731316089630127 + }, + { + "auxiliary_loss_clip": 0.01109878, + "auxiliary_loss_mlp": 0.00770221, + "balance_loss_clip": 1.03869438, + "balance_loss_mlp": 1.00023651, + "epoch": 0.9047347061476025, + "flos": 25702559953920.0, + "grad_norm": 1.5037316307134132, + "language_loss": 0.71319842, + "learning_rate": 9.439168402525032e-08, + "loss": 0.73199946, + "num_input_tokens_seen": 324490430, + "step": 15048, + "time_per_iteration": 5.092748403549194 + }, + { + "auxiliary_loss_clip": 0.01098767, + "auxiliary_loss_mlp": 0.0103572, + "balance_loss_clip": 1.03479016, + "balance_loss_mlp": 1.02233887, + "epoch": 0.9047948294002706, + "flos": 15158146118400.0, + "grad_norm": 2.1618818731676637, + "language_loss": 0.748658, + "learning_rate": 9.427348518535483e-08, + "loss": 0.7700029, + "num_input_tokens_seen": 324506620, + "step": 15049, + "time_per_iteration": 4.3224146366119385 + }, + { + "auxiliary_loss_clip": 0.01095393, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.0372622, + "balance_loss_mlp": 1.02072453, + "epoch": 0.9048549526529385, + "flos": 21872292145920.0, + "grad_norm": 2.5225470406592105, + "language_loss": 0.75863099, + "learning_rate": 9.415535861079993e-08, + "loss": 0.77991724, + "num_input_tokens_seen": 324525505, + "step": 15050, + "time_per_iteration": 4.230266094207764 + }, + { + "auxiliary_loss_clip": 0.01109636, + "auxiliary_loss_mlp": 0.00769663, + "balance_loss_clip": 1.03721118, + "balance_loss_mlp": 1.00019288, + "epoch": 0.9049150759056065, + "flos": 23546626761600.0, + "grad_norm": 1.8328342559703832, + "language_loss": 0.81820488, + "learning_rate": 9.403730430606472e-08, + "loss": 0.83699787, + "num_input_tokens_seen": 324544415, + "step": 15051, + "time_per_iteration": 4.13810133934021 + }, + { + "auxiliary_loss_clip": 0.0109796, + "auxiliary_loss_mlp": 0.01030797, + "balance_loss_clip": 1.03711987, + "balance_loss_mlp": 1.01926315, + "epoch": 0.9049751991582745, + "flos": 19645902426240.0, + "grad_norm": 2.063226238004681, + "language_loss": 0.89144683, + "learning_rate": 9.391932227562582e-08, + "loss": 0.91273439, + "num_input_tokens_seen": 324562555, + "step": 15052, + "time_per_iteration": 2.5994207859039307 + }, + { + "auxiliary_loss_clip": 0.01101275, + "auxiliary_loss_mlp": 0.01032827, + "balance_loss_clip": 1.03616786, + "balance_loss_mlp": 1.02020848, + "epoch": 0.9050353224109424, + "flos": 15596220389760.0, + "grad_norm": 3.6081086448903616, + "language_loss": 0.77183485, + "learning_rate": 9.380141252395724e-08, + "loss": 0.79317588, + "num_input_tokens_seen": 324580865, + "step": 15053, + "time_per_iteration": 2.546614170074463 + }, + { + "auxiliary_loss_clip": 0.01095283, + "auxiliary_loss_mlp": 0.01033259, + "balance_loss_clip": 1.03654027, + "balance_loss_mlp": 1.02096224, + "epoch": 0.9050954456636104, + "flos": 28183592165760.0, + "grad_norm": 2.4176866972554927, + "language_loss": 0.73160625, + "learning_rate": 9.368357505553049e-08, + "loss": 0.75289166, + "num_input_tokens_seen": 324600665, + "step": 15054, + "time_per_iteration": 2.658132553100586 + }, + { + "auxiliary_loss_clip": 0.01054009, + "auxiliary_loss_mlp": 0.01034334, + "balance_loss_clip": 1.03092122, + "balance_loss_mlp": 1.0217638, + "epoch": 0.9051555689162784, + "flos": 25731611078400.0, + "grad_norm": 1.6662566471194642, + "language_loss": 0.83386469, + "learning_rate": 9.356580987481333e-08, + "loss": 0.85474813, + "num_input_tokens_seen": 324618145, + "step": 15055, + "time_per_iteration": 2.7756059169769287 + }, + { + "auxiliary_loss_clip": 0.01094483, + "auxiliary_loss_mlp": 0.01034211, + "balance_loss_clip": 1.03571832, + "balance_loss_mlp": 1.02193809, + "epoch": 0.9052156921689464, + "flos": 23257258796160.0, + "grad_norm": 1.7590583279943084, + "language_loss": 0.85093272, + "learning_rate": 9.344811698627176e-08, + "loss": 0.87221962, + "num_input_tokens_seen": 324638165, + "step": 15056, + "time_per_iteration": 2.6432409286499023 + }, + { + "auxiliary_loss_clip": 0.01079366, + "auxiliary_loss_mlp": 0.01028685, + "balance_loss_clip": 1.03504348, + "balance_loss_mlp": 1.01706791, + "epoch": 0.9052758154216143, + "flos": 29564285097600.0, + "grad_norm": 2.874678812458683, + "language_loss": 0.72274697, + "learning_rate": 9.333049639436863e-08, + "loss": 0.74382746, + "num_input_tokens_seen": 324658560, + "step": 15057, + "time_per_iteration": 2.729560613632202 + }, + { + "auxiliary_loss_clip": 0.0109434, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.03419363, + "balance_loss_mlp": 1.02033675, + "epoch": 0.9053359386742823, + "flos": 22127688823680.0, + "grad_norm": 1.7504023555736803, + "language_loss": 0.80625844, + "learning_rate": 9.321294810356418e-08, + "loss": 0.82752472, + "num_input_tokens_seen": 324679185, + "step": 15058, + "time_per_iteration": 2.7866742610931396 + }, + { + "auxiliary_loss_clip": 0.01016738, + "auxiliary_loss_mlp": 0.01001155, + "balance_loss_clip": 1.00546241, + "balance_loss_mlp": 1.00027263, + "epoch": 0.9053960619269502, + "flos": 67090112760960.0, + "grad_norm": 0.6742645002897684, + "language_loss": 0.51343101, + "learning_rate": 9.309547211831592e-08, + "loss": 0.53360993, + "num_input_tokens_seen": 324744830, + "step": 15059, + "time_per_iteration": 3.2885544300079346 + }, + { + "auxiliary_loss_clip": 0.01072001, + "auxiliary_loss_mlp": 0.01030876, + "balance_loss_clip": 1.04110169, + "balance_loss_mlp": 1.0184902, + "epoch": 0.9054561851796182, + "flos": 15815419136640.0, + "grad_norm": 1.7140803408550112, + "language_loss": 0.67263991, + "learning_rate": 9.297806844307831e-08, + "loss": 0.69366872, + "num_input_tokens_seen": 324762905, + "step": 15060, + "time_per_iteration": 2.8112542629241943 + }, + { + "auxiliary_loss_clip": 0.01089234, + "auxiliary_loss_mlp": 0.01032459, + "balance_loss_clip": 1.03664804, + "balance_loss_mlp": 1.01979876, + "epoch": 0.9055163084322861, + "flos": 17566997950080.0, + "grad_norm": 2.3975546753010915, + "language_loss": 0.64229333, + "learning_rate": 9.286073708230357e-08, + "loss": 0.66351026, + "num_input_tokens_seen": 324781905, + "step": 15061, + "time_per_iteration": 2.6348559856414795 + }, + { + "auxiliary_loss_clip": 0.01083114, + "auxiliary_loss_mlp": 0.01038728, + "balance_loss_clip": 1.03490663, + "balance_loss_mlp": 1.02568662, + "epoch": 0.9055764316849542, + "flos": 17639573379840.0, + "grad_norm": 1.6952248050793448, + "language_loss": 0.71770173, + "learning_rate": 9.274347804044058e-08, + "loss": 0.73892021, + "num_input_tokens_seen": 324799260, + "step": 15062, + "time_per_iteration": 2.889420986175537 + }, + { + "auxiliary_loss_clip": 0.01106793, + "auxiliary_loss_mlp": 0.01031859, + "balance_loss_clip": 1.03594065, + "balance_loss_mlp": 1.01968181, + "epoch": 0.9056365549376221, + "flos": 20120856986880.0, + "grad_norm": 2.4465454482745534, + "language_loss": 0.71081591, + "learning_rate": 9.2626291321936e-08, + "loss": 0.73220247, + "num_input_tokens_seen": 324817800, + "step": 15063, + "time_per_iteration": 2.5845255851745605 + }, + { + "auxiliary_loss_clip": 0.01066505, + "auxiliary_loss_mlp": 0.01033382, + "balance_loss_clip": 1.03441405, + "balance_loss_mlp": 1.02137733, + "epoch": 0.9056966781902901, + "flos": 27598786836480.0, + "grad_norm": 1.6140764840552748, + "language_loss": 0.72192168, + "learning_rate": 9.250917693123406e-08, + "loss": 0.74292052, + "num_input_tokens_seen": 324838445, + "step": 15064, + "time_per_iteration": 2.711472511291504 + }, + { + "auxiliary_loss_clip": 0.01099676, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.0358665, + "balance_loss_mlp": 1.01976943, + "epoch": 0.9057568014429581, + "flos": 25920106675200.0, + "grad_norm": 1.9283380616790378, + "language_loss": 0.69733697, + "learning_rate": 9.23921348727752e-08, + "loss": 0.71865511, + "num_input_tokens_seen": 324859895, + "step": 15065, + "time_per_iteration": 2.6254019737243652 + }, + { + "auxiliary_loss_clip": 0.01076646, + "auxiliary_loss_mlp": 0.01034346, + "balance_loss_clip": 1.03431368, + "balance_loss_mlp": 1.02240729, + "epoch": 0.905816924695626, + "flos": 22930364096640.0, + "grad_norm": 1.5639103383265116, + "language_loss": 0.62895906, + "learning_rate": 9.227516515099743e-08, + "loss": 0.65006894, + "num_input_tokens_seen": 324879580, + "step": 15066, + "time_per_iteration": 2.7154438495635986 + }, + { + "auxiliary_loss_clip": 0.01035849, + "auxiliary_loss_mlp": 0.01033037, + "balance_loss_clip": 1.02947974, + "balance_loss_mlp": 1.01869655, + "epoch": 0.905877047948294, + "flos": 22157422306560.0, + "grad_norm": 1.934180125308043, + "language_loss": 0.80121052, + "learning_rate": 9.215826777033675e-08, + "loss": 0.82189941, + "num_input_tokens_seen": 324898950, + "step": 15067, + "time_per_iteration": 2.7812981605529785 + }, + { + "auxiliary_loss_clip": 0.0108924, + "auxiliary_loss_mlp": 0.01033309, + "balance_loss_clip": 1.0376116, + "balance_loss_mlp": 1.02020836, + "epoch": 0.905937171200962, + "flos": 15304805349120.0, + "grad_norm": 1.6228923811634084, + "language_loss": 0.70006502, + "learning_rate": 9.204144273522563e-08, + "loss": 0.72129059, + "num_input_tokens_seen": 324917455, + "step": 15068, + "time_per_iteration": 2.865957021713257 + }, + { + "auxiliary_loss_clip": 0.01104355, + "auxiliary_loss_mlp": 0.01028972, + "balance_loss_clip": 1.03523481, + "balance_loss_mlp": 1.01681864, + "epoch": 0.90599729445363, + "flos": 19462973437440.0, + "grad_norm": 2.0548899338022064, + "language_loss": 0.85366511, + "learning_rate": 9.19246900500943e-08, + "loss": 0.87499845, + "num_input_tokens_seen": 324934495, + "step": 15069, + "time_per_iteration": 2.5832648277282715 + }, + { + "auxiliary_loss_clip": 0.01100336, + "auxiliary_loss_mlp": 0.01032536, + "balance_loss_clip": 1.03515148, + "balance_loss_mlp": 1.01913118, + "epoch": 0.9060574177062979, + "flos": 23732967542400.0, + "grad_norm": 1.7734674553578826, + "language_loss": 0.59089136, + "learning_rate": 9.180800971936987e-08, + "loss": 0.61222005, + "num_input_tokens_seen": 324953230, + "step": 15070, + "time_per_iteration": 2.6578190326690674 + }, + { + "auxiliary_loss_clip": 0.01073063, + "auxiliary_loss_mlp": 0.01029432, + "balance_loss_clip": 1.03542089, + "balance_loss_mlp": 1.01644397, + "epoch": 0.9061175409589659, + "flos": 17311134395520.0, + "grad_norm": 2.114967180727135, + "language_loss": 0.81690538, + "learning_rate": 9.169140174747724e-08, + "loss": 0.83793026, + "num_input_tokens_seen": 324969880, + "step": 15071, + "time_per_iteration": 2.677042245864868 + }, + { + "auxiliary_loss_clip": 0.0111224, + "auxiliary_loss_mlp": 0.01041359, + "balance_loss_clip": 1.03753805, + "balance_loss_mlp": 1.02798986, + "epoch": 0.9061776642116338, + "flos": 17778439359360.0, + "grad_norm": 1.8991196777690924, + "language_loss": 0.61947775, + "learning_rate": 9.157486613883758e-08, + "loss": 0.64101374, + "num_input_tokens_seen": 324987005, + "step": 15072, + "time_per_iteration": 2.5581016540527344 + }, + { + "auxiliary_loss_clip": 0.01088368, + "auxiliary_loss_mlp": 0.01035581, + "balance_loss_clip": 1.03575015, + "balance_loss_mlp": 1.02321947, + "epoch": 0.9062377874643018, + "flos": 42777688037760.0, + "grad_norm": 1.883547115522317, + "language_loss": 0.73039377, + "learning_rate": 9.145840289787021e-08, + "loss": 0.75163323, + "num_input_tokens_seen": 325010700, + "step": 15073, + "time_per_iteration": 2.933929681777954 + }, + { + "auxiliary_loss_clip": 0.01094334, + "auxiliary_loss_mlp": 0.01027115, + "balance_loss_clip": 1.0359031, + "balance_loss_mlp": 1.01591563, + "epoch": 0.9062979107169697, + "flos": 16361620323840.0, + "grad_norm": 1.8214785876499617, + "language_loss": 0.8087334, + "learning_rate": 9.134201202899161e-08, + "loss": 0.82994789, + "num_input_tokens_seen": 325028760, + "step": 15074, + "time_per_iteration": 2.6201162338256836 + }, + { + "auxiliary_loss_clip": 0.00984336, + "auxiliary_loss_mlp": 0.00752175, + "balance_loss_clip": 1.00953913, + "balance_loss_mlp": 0.99961203, + "epoch": 0.9063580339696378, + "flos": 69313988528640.0, + "grad_norm": 0.7424455220001136, + "language_loss": 0.52306926, + "learning_rate": 9.122569353661513e-08, + "loss": 0.54043436, + "num_input_tokens_seen": 325093545, + "step": 15075, + "time_per_iteration": 3.318652391433716 + }, + { + "auxiliary_loss_clip": 0.00997512, + "auxiliary_loss_mlp": 0.00998485, + "balance_loss_clip": 1.0082109, + "balance_loss_mlp": 0.99731654, + "epoch": 0.9064181572223057, + "flos": 58794747148800.0, + "grad_norm": 0.7354115302623626, + "language_loss": 0.62038195, + "learning_rate": 9.11094474251517e-08, + "loss": 0.640342, + "num_input_tokens_seen": 325152295, + "step": 15076, + "time_per_iteration": 3.1302971839904785 + }, + { + "auxiliary_loss_clip": 0.01095732, + "auxiliary_loss_mlp": 0.01035747, + "balance_loss_clip": 1.0357511, + "balance_loss_mlp": 1.0237844, + "epoch": 0.9064782804749737, + "flos": 21762692772480.0, + "grad_norm": 1.7300331938520934, + "language_loss": 0.81917107, + "learning_rate": 9.09932736990091e-08, + "loss": 0.84048593, + "num_input_tokens_seen": 325169705, + "step": 15077, + "time_per_iteration": 2.6315958499908447 + }, + { + "auxiliary_loss_clip": 0.01081763, + "auxiliary_loss_mlp": 0.00769991, + "balance_loss_clip": 1.03210878, + "balance_loss_mlp": 1.00007868, + "epoch": 0.9065384037276417, + "flos": 21397373498880.0, + "grad_norm": 1.5468663255290942, + "language_loss": 0.83872044, + "learning_rate": 9.08771723625934e-08, + "loss": 0.85723794, + "num_input_tokens_seen": 325189175, + "step": 15078, + "time_per_iteration": 2.727109670639038 + }, + { + "auxiliary_loss_clip": 0.01093852, + "auxiliary_loss_mlp": 0.00770079, + "balance_loss_clip": 1.03619432, + "balance_loss_mlp": 1.00015736, + "epoch": 0.9065985269803096, + "flos": 38283646849920.0, + "grad_norm": 1.6827515544701097, + "language_loss": 0.65606648, + "learning_rate": 9.076114342030617e-08, + "loss": 0.67470574, + "num_input_tokens_seen": 325211020, + "step": 15079, + "time_per_iteration": 2.771944999694824 + }, + { + "auxiliary_loss_clip": 0.01028805, + "auxiliary_loss_mlp": 0.01027589, + "balance_loss_clip": 1.03047419, + "balance_loss_mlp": 1.0151794, + "epoch": 0.9066586502329776, + "flos": 44818562989440.0, + "grad_norm": 1.7893675619126004, + "language_loss": 0.70638371, + "learning_rate": 9.064518687654765e-08, + "loss": 0.72694761, + "num_input_tokens_seen": 325236970, + "step": 15080, + "time_per_iteration": 2.9839913845062256 + }, + { + "auxiliary_loss_clip": 0.01096514, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.03848863, + "balance_loss_mlp": 1.01837301, + "epoch": 0.9067187734856456, + "flos": 18623992492800.0, + "grad_norm": 2.4819155827069452, + "language_loss": 0.71019328, + "learning_rate": 9.052930273571547e-08, + "loss": 0.73147219, + "num_input_tokens_seen": 325252670, + "step": 15081, + "time_per_iteration": 2.5639331340789795 + }, + { + "auxiliary_loss_clip": 0.01082423, + "auxiliary_loss_mlp": 0.01034663, + "balance_loss_clip": 1.03733432, + "balance_loss_mlp": 1.02240872, + "epoch": 0.9067788967383136, + "flos": 22747578762240.0, + "grad_norm": 5.90815505153055, + "language_loss": 0.7437706, + "learning_rate": 9.04134910022032e-08, + "loss": 0.76494145, + "num_input_tokens_seen": 325273860, + "step": 15082, + "time_per_iteration": 2.6862359046936035 + }, + { + "auxiliary_loss_clip": 0.01073569, + "auxiliary_loss_mlp": 0.0103586, + "balance_loss_clip": 1.03576851, + "balance_loss_mlp": 1.02364099, + "epoch": 0.9068390199909815, + "flos": 27670787648640.0, + "grad_norm": 2.0228960329106904, + "language_loss": 0.78056735, + "learning_rate": 9.029775168040266e-08, + "loss": 0.80166161, + "num_input_tokens_seen": 325294140, + "step": 15083, + "time_per_iteration": 2.7631537914276123 + }, + { + "auxiliary_loss_clip": 0.01082943, + "auxiliary_loss_mlp": 0.0076928, + "balance_loss_clip": 1.03722239, + "balance_loss_mlp": 1.00023723, + "epoch": 0.9068991432436495, + "flos": 24244012293120.0, + "grad_norm": 1.5997317426680842, + "language_loss": 0.68783748, + "learning_rate": 9.01820847747028e-08, + "loss": 0.70635974, + "num_input_tokens_seen": 325313130, + "step": 15084, + "time_per_iteration": 2.720623731613159 + }, + { + "auxiliary_loss_clip": 0.01108826, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.03775597, + "balance_loss_mlp": 1.01930761, + "epoch": 0.9069592664963174, + "flos": 28033305661440.0, + "grad_norm": 7.23400764704548, + "language_loss": 0.67128915, + "learning_rate": 9.006649028948965e-08, + "loss": 0.69269109, + "num_input_tokens_seen": 325334880, + "step": 15085, + "time_per_iteration": 2.6862213611602783 + }, + { + "auxiliary_loss_clip": 0.00998184, + "auxiliary_loss_mlp": 0.01017743, + "balance_loss_clip": 1.00960755, + "balance_loss_mlp": 1.01620471, + "epoch": 0.9070193897489854, + "flos": 68778414789120.0, + "grad_norm": 0.7963063657697701, + "language_loss": 0.61316264, + "learning_rate": 8.995096822914638e-08, + "loss": 0.63332188, + "num_input_tokens_seen": 325394175, + "step": 15086, + "time_per_iteration": 3.2537643909454346 + }, + { + "auxiliary_loss_clip": 0.01093775, + "auxiliary_loss_mlp": 0.01038417, + "balance_loss_clip": 1.03427684, + "balance_loss_mlp": 1.02487493, + "epoch": 0.9070795130016533, + "flos": 23441624328960.0, + "grad_norm": 1.436388517862248, + "language_loss": 0.72142053, + "learning_rate": 8.983551859805416e-08, + "loss": 0.74274248, + "num_input_tokens_seen": 325415020, + "step": 15087, + "time_per_iteration": 4.312045335769653 + }, + { + "auxiliary_loss_clip": 0.01084735, + "auxiliary_loss_mlp": 0.01027139, + "balance_loss_clip": 1.03397894, + "balance_loss_mlp": 1.01522434, + "epoch": 0.9071396362543214, + "flos": 18916413114240.0, + "grad_norm": 1.949639239308053, + "language_loss": 0.76511991, + "learning_rate": 8.972014140059058e-08, + "loss": 0.78623861, + "num_input_tokens_seen": 325433595, + "step": 15088, + "time_per_iteration": 4.274383783340454 + }, + { + "auxiliary_loss_clip": 0.01073577, + "auxiliary_loss_mlp": 0.01032164, + "balance_loss_clip": 1.03376746, + "balance_loss_mlp": 1.02019525, + "epoch": 0.9071997595069893, + "flos": 25228646887680.0, + "grad_norm": 1.7650984011067665, + "language_loss": 0.73451883, + "learning_rate": 8.960483664113038e-08, + "loss": 0.75557625, + "num_input_tokens_seen": 325451605, + "step": 15089, + "time_per_iteration": 4.142383575439453 + }, + { + "auxiliary_loss_clip": 0.01103445, + "auxiliary_loss_mlp": 0.01034573, + "balance_loss_clip": 1.03631544, + "balance_loss_mlp": 1.02313471, + "epoch": 0.9072598827596573, + "flos": 24346608514560.0, + "grad_norm": 1.785554810845489, + "language_loss": 0.75460756, + "learning_rate": 8.948960432404628e-08, + "loss": 0.77598774, + "num_input_tokens_seen": 325470645, + "step": 15090, + "time_per_iteration": 4.125551462173462 + }, + { + "auxiliary_loss_clip": 0.01081669, + "auxiliary_loss_mlp": 0.01030269, + "balance_loss_clip": 1.03531027, + "balance_loss_mlp": 1.0168643, + "epoch": 0.9073200060123253, + "flos": 22674967418880.0, + "grad_norm": 2.644042732321969, + "language_loss": 0.7796579, + "learning_rate": 8.93744444537079e-08, + "loss": 0.8007772, + "num_input_tokens_seen": 325488070, + "step": 15091, + "time_per_iteration": 2.611660957336426 + }, + { + "auxiliary_loss_clip": 0.01080451, + "auxiliary_loss_mlp": 0.01025973, + "balance_loss_clip": 1.03320861, + "balance_loss_mlp": 1.01513076, + "epoch": 0.9073801292649932, + "flos": 23695476721920.0, + "grad_norm": 1.8611765559863347, + "language_loss": 0.85915703, + "learning_rate": 8.925935703448217e-08, + "loss": 0.88022125, + "num_input_tokens_seen": 325509285, + "step": 15092, + "time_per_iteration": 2.6740128993988037 + }, + { + "auxiliary_loss_clip": 0.01084789, + "auxiliary_loss_mlp": 0.0103167, + "balance_loss_clip": 1.03747833, + "balance_loss_mlp": 1.0196414, + "epoch": 0.9074402525176612, + "flos": 25375413859200.0, + "grad_norm": 1.5044941360603252, + "language_loss": 0.78849494, + "learning_rate": 8.914434207073296e-08, + "loss": 0.80965954, + "num_input_tokens_seen": 325529360, + "step": 15093, + "time_per_iteration": 2.680701494216919 + }, + { + "auxiliary_loss_clip": 0.01019381, + "auxiliary_loss_mlp": 0.01002565, + "balance_loss_clip": 1.00606823, + "balance_loss_mlp": 1.00151622, + "epoch": 0.9075003757703292, + "flos": 67649024384640.0, + "grad_norm": 0.7360353686888242, + "language_loss": 0.56958818, + "learning_rate": 8.902939956682188e-08, + "loss": 0.58980775, + "num_input_tokens_seen": 325583565, + "step": 15094, + "time_per_iteration": 3.086918592453003 + }, + { + "auxiliary_loss_clip": 0.01099075, + "auxiliary_loss_mlp": 0.01034812, + "balance_loss_clip": 1.03545427, + "balance_loss_mlp": 1.02190804, + "epoch": 0.9075604990229972, + "flos": 22453649769600.0, + "grad_norm": 1.9406797492354237, + "language_loss": 0.71160638, + "learning_rate": 8.891452952710742e-08, + "loss": 0.73294526, + "num_input_tokens_seen": 325603690, + "step": 15095, + "time_per_iteration": 2.6372621059417725 + }, + { + "auxiliary_loss_clip": 0.01066408, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.0342015, + "balance_loss_mlp": 1.02175641, + "epoch": 0.9076206222756651, + "flos": 19536662188800.0, + "grad_norm": 2.201890556865997, + "language_loss": 0.7416867, + "learning_rate": 8.879973195594526e-08, + "loss": 0.76269424, + "num_input_tokens_seen": 325622255, + "step": 15096, + "time_per_iteration": 2.7420341968536377 + }, + { + "auxiliary_loss_clip": 0.01109715, + "auxiliary_loss_mlp": 0.01038367, + "balance_loss_clip": 1.03712845, + "balance_loss_mlp": 1.02484858, + "epoch": 0.9076807455283331, + "flos": 30116914819200.0, + "grad_norm": 1.8892024302552053, + "language_loss": 0.56777847, + "learning_rate": 8.868500685768898e-08, + "loss": 0.58925933, + "num_input_tokens_seen": 325640165, + "step": 15097, + "time_per_iteration": 2.66786527633667 + }, + { + "auxiliary_loss_clip": 0.01085602, + "auxiliary_loss_mlp": 0.01024669, + "balance_loss_clip": 1.03317809, + "balance_loss_mlp": 1.01340389, + "epoch": 0.907740868781001, + "flos": 18697537589760.0, + "grad_norm": 1.7446964488150043, + "language_loss": 0.79539967, + "learning_rate": 8.857035423668935e-08, + "loss": 0.81650233, + "num_input_tokens_seen": 325659455, + "step": 15098, + "time_per_iteration": 2.6101489067077637 + }, + { + "auxiliary_loss_clip": 0.010671, + "auxiliary_loss_mlp": 0.00771611, + "balance_loss_clip": 1.03485239, + "balance_loss_mlp": 1.00026011, + "epoch": 0.907800992033669, + "flos": 22638805401600.0, + "grad_norm": 18.550819833404994, + "language_loss": 0.66001773, + "learning_rate": 8.845577409729266e-08, + "loss": 0.67840481, + "num_input_tokens_seen": 325678095, + "step": 15099, + "time_per_iteration": 2.782886266708374 + }, + { + "auxiliary_loss_clip": 0.01089093, + "auxiliary_loss_mlp": 0.0103633, + "balance_loss_clip": 1.03569531, + "balance_loss_mlp": 1.02341413, + "epoch": 0.907861115286337, + "flos": 21287666384640.0, + "grad_norm": 2.095035959000706, + "language_loss": 0.70761675, + "learning_rate": 8.834126644384477e-08, + "loss": 0.72887093, + "num_input_tokens_seen": 325695825, + "step": 15100, + "time_per_iteration": 2.718719482421875 + }, + { + "auxiliary_loss_clip": 0.01018547, + "auxiliary_loss_mlp": 0.01002357, + "balance_loss_clip": 1.00599432, + "balance_loss_mlp": 1.00136185, + "epoch": 0.907921238539005, + "flos": 69739493040000.0, + "grad_norm": 0.6221166311541254, + "language_loss": 0.5336588, + "learning_rate": 8.822683128068775e-08, + "loss": 0.55386788, + "num_input_tokens_seen": 325764515, + "step": 15101, + "time_per_iteration": 3.2601447105407715 + }, + { + "auxiliary_loss_clip": 0.01074173, + "auxiliary_loss_mlp": 0.0103007, + "balance_loss_clip": 1.03405142, + "balance_loss_mlp": 1.01715326, + "epoch": 0.9079813617916729, + "flos": 23477391296640.0, + "grad_norm": 1.6841110565912183, + "language_loss": 0.68209207, + "learning_rate": 8.811246861216081e-08, + "loss": 0.70313448, + "num_input_tokens_seen": 325783235, + "step": 15102, + "time_per_iteration": 2.6863279342651367 + }, + { + "auxiliary_loss_clip": 0.01094848, + "auxiliary_loss_mlp": 0.0103185, + "balance_loss_clip": 1.03587008, + "balance_loss_mlp": 1.01915479, + "epoch": 0.9080414850443409, + "flos": 22929933133440.0, + "grad_norm": 1.7674184353723423, + "language_loss": 0.79133558, + "learning_rate": 8.799817844260049e-08, + "loss": 0.81260264, + "num_input_tokens_seen": 325800195, + "step": 15103, + "time_per_iteration": 2.672898054122925 + }, + { + "auxiliary_loss_clip": 0.0108183, + "auxiliary_loss_mlp": 0.01033096, + "balance_loss_clip": 1.03430343, + "balance_loss_mlp": 1.02016127, + "epoch": 0.9081016082970089, + "flos": 26177083551360.0, + "grad_norm": 1.7434121737063208, + "language_loss": 0.71796912, + "learning_rate": 8.78839607763413e-08, + "loss": 0.73911834, + "num_input_tokens_seen": 325820215, + "step": 15104, + "time_per_iteration": 2.6979503631591797 + }, + { + "auxiliary_loss_clip": 0.01083633, + "auxiliary_loss_mlp": 0.01026392, + "balance_loss_clip": 1.03431463, + "balance_loss_mlp": 1.01508558, + "epoch": 0.9081617315496768, + "flos": 24462169545600.0, + "grad_norm": 1.7433195469593918, + "language_loss": 0.77697951, + "learning_rate": 8.77698156177138e-08, + "loss": 0.79807979, + "num_input_tokens_seen": 325838415, + "step": 15105, + "time_per_iteration": 2.693650722503662 + }, + { + "auxiliary_loss_clip": 0.01106144, + "auxiliary_loss_mlp": 0.00770719, + "balance_loss_clip": 1.03435302, + "balance_loss_mlp": 1.00018311, + "epoch": 0.9082218548023449, + "flos": 24746868743040.0, + "grad_norm": 2.4921159969268625, + "language_loss": 0.73882461, + "learning_rate": 8.765574297104628e-08, + "loss": 0.75759327, + "num_input_tokens_seen": 325855580, + "step": 15106, + "time_per_iteration": 2.6928508281707764 + }, + { + "auxiliary_loss_clip": 0.01059785, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.02983892, + "balance_loss_mlp": 1.02212226, + "epoch": 0.9082819780550128, + "flos": 24421302846720.0, + "grad_norm": 1.956694255658617, + "language_loss": 0.80682945, + "learning_rate": 8.754174284066462e-08, + "loss": 0.82778394, + "num_input_tokens_seen": 325874890, + "step": 15107, + "time_per_iteration": 2.8211913108825684 + }, + { + "auxiliary_loss_clip": 0.01003818, + "auxiliary_loss_mlp": 0.01000224, + "balance_loss_clip": 1.00530005, + "balance_loss_mlp": 0.99906158, + "epoch": 0.9083421013076808, + "flos": 59609704872960.0, + "grad_norm": 0.8163194562351376, + "language_loss": 0.59763622, + "learning_rate": 8.742781523089205e-08, + "loss": 0.61767673, + "num_input_tokens_seen": 325935835, + "step": 15108, + "time_per_iteration": 3.176673173904419 + }, + { + "auxiliary_loss_clip": 0.01085744, + "auxiliary_loss_mlp": 0.0102493, + "balance_loss_clip": 1.03396034, + "balance_loss_mlp": 1.01259756, + "epoch": 0.9084022245603487, + "flos": 33620216100480.0, + "grad_norm": 1.5754460951726812, + "language_loss": 0.73228884, + "learning_rate": 8.73139601460482e-08, + "loss": 0.75339556, + "num_input_tokens_seen": 325958035, + "step": 15109, + "time_per_iteration": 2.744368314743042 + }, + { + "auxiliary_loss_clip": 0.01072978, + "auxiliary_loss_mlp": 0.01028525, + "balance_loss_clip": 1.03370285, + "balance_loss_mlp": 1.01687264, + "epoch": 0.9084623478130167, + "flos": 24971705925120.0, + "grad_norm": 4.290837775967832, + "language_loss": 0.71557301, + "learning_rate": 8.720017759045073e-08, + "loss": 0.736588, + "num_input_tokens_seen": 325979870, + "step": 15110, + "time_per_iteration": 2.7875888347625732 + }, + { + "auxiliary_loss_clip": 0.0107739, + "auxiliary_loss_mlp": 0.01035324, + "balance_loss_clip": 1.03073955, + "balance_loss_mlp": 1.0219785, + "epoch": 0.9085224710656846, + "flos": 31461804869760.0, + "grad_norm": 1.8448389320189542, + "language_loss": 0.69122839, + "learning_rate": 8.708646756841421e-08, + "loss": 0.71235561, + "num_input_tokens_seen": 325998245, + "step": 15111, + "time_per_iteration": 2.7633275985717773 + }, + { + "auxiliary_loss_clip": 0.00998747, + "auxiliary_loss_mlp": 0.01004801, + "balance_loss_clip": 1.00629544, + "balance_loss_mlp": 1.00380516, + "epoch": 0.9085825943183526, + "flos": 64917012867840.0, + "grad_norm": 0.6888629438196041, + "language_loss": 0.51703209, + "learning_rate": 8.697283008425026e-08, + "loss": 0.53706759, + "num_input_tokens_seen": 326061770, + "step": 15112, + "time_per_iteration": 3.2464187145233154 + }, + { + "auxiliary_loss_clip": 0.0109824, + "auxiliary_loss_mlp": 0.01033068, + "balance_loss_clip": 1.03503668, + "balance_loss_mlp": 1.02022314, + "epoch": 0.9086427175710206, + "flos": 18953221576320.0, + "grad_norm": 1.723855201970508, + "language_loss": 0.7027775, + "learning_rate": 8.685926514226837e-08, + "loss": 0.72409058, + "num_input_tokens_seen": 326080945, + "step": 15113, + "time_per_iteration": 2.615265130996704 + }, + { + "auxiliary_loss_clip": 0.01098496, + "auxiliary_loss_mlp": 0.01030884, + "balance_loss_clip": 1.03785408, + "balance_loss_mlp": 1.0189271, + "epoch": 0.9087028408236886, + "flos": 34014873807360.0, + "grad_norm": 2.0973757596387004, + "language_loss": 0.78994763, + "learning_rate": 8.674577274677508e-08, + "loss": 0.81124145, + "num_input_tokens_seen": 326100630, + "step": 15114, + "time_per_iteration": 2.7337305545806885 + }, + { + "auxiliary_loss_clip": 0.01070616, + "auxiliary_loss_mlp": 0.01033895, + "balance_loss_clip": 1.03684914, + "balance_loss_mlp": 1.02053201, + "epoch": 0.9087629640763565, + "flos": 21944580266880.0, + "grad_norm": 3.929458307617432, + "language_loss": 0.70178634, + "learning_rate": 8.663235290207405e-08, + "loss": 0.72283143, + "num_input_tokens_seen": 326120145, + "step": 15115, + "time_per_iteration": 2.751361131668091 + }, + { + "auxiliary_loss_clip": 0.01086218, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.03923655, + "balance_loss_mlp": 1.01895118, + "epoch": 0.9088230873290245, + "flos": 21762908254080.0, + "grad_norm": 2.3483964506042603, + "language_loss": 0.65777099, + "learning_rate": 8.651900561246561e-08, + "loss": 0.67895895, + "num_input_tokens_seen": 326140715, + "step": 15116, + "time_per_iteration": 2.715759754180908 + }, + { + "auxiliary_loss_clip": 0.01106542, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.0372858, + "balance_loss_mlp": 1.02119398, + "epoch": 0.9088832105816925, + "flos": 21541267382400.0, + "grad_norm": 6.342744698991267, + "language_loss": 0.69591606, + "learning_rate": 8.640573088224812e-08, + "loss": 0.71732175, + "num_input_tokens_seen": 326159130, + "step": 15117, + "time_per_iteration": 2.582552433013916 + }, + { + "auxiliary_loss_clip": 0.01066284, + "auxiliary_loss_mlp": 0.01026525, + "balance_loss_clip": 1.03425217, + "balance_loss_mlp": 1.01489568, + "epoch": 0.9089433338343604, + "flos": 25996704428160.0, + "grad_norm": 3.6808698691711856, + "language_loss": 0.74660701, + "learning_rate": 8.629252871571745e-08, + "loss": 0.76753509, + "num_input_tokens_seen": 326181375, + "step": 15118, + "time_per_iteration": 2.751481056213379 + }, + { + "auxiliary_loss_clip": 0.01083211, + "auxiliary_loss_mlp": 0.01034962, + "balance_loss_clip": 1.03344107, + "balance_loss_mlp": 1.02128291, + "epoch": 0.9090034570870285, + "flos": 21178426147200.0, + "grad_norm": 2.13733826102676, + "language_loss": 0.73172134, + "learning_rate": 8.617939911716554e-08, + "loss": 0.75290304, + "num_input_tokens_seen": 326199740, + "step": 15119, + "time_per_iteration": 2.7050302028656006 + }, + { + "auxiliary_loss_clip": 0.01073499, + "auxiliary_loss_mlp": 0.0103193, + "balance_loss_clip": 1.03588152, + "balance_loss_mlp": 1.01727891, + "epoch": 0.9090635803396964, + "flos": 16141811045760.0, + "grad_norm": 2.3309233232368376, + "language_loss": 0.71525586, + "learning_rate": 8.60663420908827e-08, + "loss": 0.73631012, + "num_input_tokens_seen": 326214350, + "step": 15120, + "time_per_iteration": 2.748596429824829 + }, + { + "auxiliary_loss_clip": 0.01109717, + "auxiliary_loss_mlp": 0.00770513, + "balance_loss_clip": 1.03689528, + "balance_loss_mlp": 1.0002079, + "epoch": 0.9091237035923644, + "flos": 20591537829120.0, + "grad_norm": 2.1685106805534002, + "language_loss": 0.65576839, + "learning_rate": 8.595335764115596e-08, + "loss": 0.67457068, + "num_input_tokens_seen": 326234580, + "step": 15121, + "time_per_iteration": 2.6541824340820312 + }, + { + "auxiliary_loss_clip": 0.01098528, + "auxiliary_loss_mlp": 0.01035932, + "balance_loss_clip": 1.03610933, + "balance_loss_mlp": 1.02343321, + "epoch": 0.9091838268450323, + "flos": 52227760164480.0, + "grad_norm": 2.467654081114951, + "language_loss": 0.70642388, + "learning_rate": 8.58404457722699e-08, + "loss": 0.72776842, + "num_input_tokens_seen": 326259080, + "step": 15122, + "time_per_iteration": 2.925644636154175 + }, + { + "auxiliary_loss_clip": 0.01052109, + "auxiliary_loss_mlp": 0.01030913, + "balance_loss_clip": 1.03168774, + "balance_loss_mlp": 1.01879561, + "epoch": 0.9092439500977003, + "flos": 20559613616640.0, + "grad_norm": 1.4208742035415944, + "language_loss": 0.74525023, + "learning_rate": 8.572760648850575e-08, + "loss": 0.76608044, + "num_input_tokens_seen": 326280175, + "step": 15123, + "time_per_iteration": 2.734441041946411 + }, + { + "auxiliary_loss_clip": 0.0109521, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.03593588, + "balance_loss_mlp": 1.02159882, + "epoch": 0.9093040733503682, + "flos": 28617859595520.0, + "grad_norm": 1.8896970450570774, + "language_loss": 0.7576673, + "learning_rate": 8.561483979414253e-08, + "loss": 0.77895033, + "num_input_tokens_seen": 326297990, + "step": 15124, + "time_per_iteration": 2.6362528800964355 + }, + { + "auxiliary_loss_clip": 0.01090802, + "auxiliary_loss_mlp": 0.01032412, + "balance_loss_clip": 1.03465593, + "balance_loss_mlp": 1.02002668, + "epoch": 0.9093641966030362, + "flos": 23440187784960.0, + "grad_norm": 1.8805276614968602, + "language_loss": 0.71919298, + "learning_rate": 8.55021456934566e-08, + "loss": 0.74042511, + "num_input_tokens_seen": 326316735, + "step": 15125, + "time_per_iteration": 2.5915915966033936 + }, + { + "auxiliary_loss_clip": 0.01068085, + "auxiliary_loss_mlp": 0.01035917, + "balance_loss_clip": 1.03625441, + "balance_loss_mlp": 1.02334034, + "epoch": 0.9094243198557042, + "flos": 16800197385600.0, + "grad_norm": 1.6292066230001099, + "language_loss": 0.79466188, + "learning_rate": 8.538952419072143e-08, + "loss": 0.8157019, + "num_input_tokens_seen": 326334370, + "step": 15126, + "time_per_iteration": 4.219731569290161 + }, + { + "auxiliary_loss_clip": 0.01065083, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.03633046, + "balance_loss_mlp": 1.02255654, + "epoch": 0.9094844431083722, + "flos": 24273278899200.0, + "grad_norm": 1.707765126078796, + "language_loss": 0.75641441, + "learning_rate": 8.527697529020694e-08, + "loss": 0.77741325, + "num_input_tokens_seen": 326353435, + "step": 15127, + "time_per_iteration": 2.7128138542175293 + }, + { + "auxiliary_loss_clip": 0.01027678, + "auxiliary_loss_mlp": 0.01033923, + "balance_loss_clip": 1.03002882, + "balance_loss_mlp": 1.02145934, + "epoch": 0.9095445663610401, + "flos": 21944652094080.0, + "grad_norm": 1.8998405875281965, + "language_loss": 0.62571168, + "learning_rate": 8.516449899618173e-08, + "loss": 0.64632773, + "num_input_tokens_seen": 326371810, + "step": 15128, + "time_per_iteration": 4.432798385620117 + }, + { + "auxiliary_loss_clip": 0.01075251, + "auxiliary_loss_mlp": 0.01024187, + "balance_loss_clip": 1.03530467, + "balance_loss_mlp": 1.01223636, + "epoch": 0.9096046896137081, + "flos": 19792848965760.0, + "grad_norm": 1.7664774928724292, + "language_loss": 0.76836801, + "learning_rate": 8.505209531291013e-08, + "loss": 0.78936237, + "num_input_tokens_seen": 326391380, + "step": 15129, + "time_per_iteration": 4.206790447235107 + }, + { + "auxiliary_loss_clip": 0.01096669, + "auxiliary_loss_mlp": 0.01027809, + "balance_loss_clip": 1.03541172, + "balance_loss_mlp": 1.01559019, + "epoch": 0.909664812866376, + "flos": 22638087129600.0, + "grad_norm": 1.9024505356481058, + "language_loss": 0.83078182, + "learning_rate": 8.49397642446552e-08, + "loss": 0.85202664, + "num_input_tokens_seen": 326408800, + "step": 15130, + "time_per_iteration": 2.6001152992248535 + }, + { + "auxiliary_loss_clip": 0.0108696, + "auxiliary_loss_mlp": 0.01032978, + "balance_loss_clip": 1.0359422, + "balance_loss_mlp": 1.01988339, + "epoch": 0.909724936119044, + "flos": 39852153020160.0, + "grad_norm": 1.6192884326083825, + "language_loss": 0.75177467, + "learning_rate": 8.482750579567644e-08, + "loss": 0.77297407, + "num_input_tokens_seen": 326431565, + "step": 15131, + "time_per_iteration": 2.848465919494629 + }, + { + "auxiliary_loss_clip": 0.01083451, + "auxiliary_loss_mlp": 0.01035354, + "balance_loss_clip": 1.03611147, + "balance_loss_mlp": 1.02193737, + "epoch": 0.9097850593717121, + "flos": 35071616954880.0, + "grad_norm": 1.8781997333884533, + "language_loss": 0.599832, + "learning_rate": 8.471531997023085e-08, + "loss": 0.62102008, + "num_input_tokens_seen": 326451715, + "step": 15132, + "time_per_iteration": 2.7317306995391846 + }, + { + "auxiliary_loss_clip": 0.01068526, + "auxiliary_loss_mlp": 0.01031798, + "balance_loss_clip": 1.0371139, + "balance_loss_mlp": 1.02007413, + "epoch": 0.90984518262438, + "flos": 23367468700800.0, + "grad_norm": 1.7969799110161846, + "language_loss": 0.82646108, + "learning_rate": 8.460320677257193e-08, + "loss": 0.84746432, + "num_input_tokens_seen": 326470855, + "step": 15133, + "time_per_iteration": 2.666724920272827 + }, + { + "auxiliary_loss_clip": 0.01084851, + "auxiliary_loss_mlp": 0.01033684, + "balance_loss_clip": 1.03276467, + "balance_loss_mlp": 1.0209645, + "epoch": 0.909905305877048, + "flos": 27523302405120.0, + "grad_norm": 1.9696626904627623, + "language_loss": 0.74180704, + "learning_rate": 8.449116620695118e-08, + "loss": 0.76299238, + "num_input_tokens_seen": 326490480, + "step": 15134, + "time_per_iteration": 2.7024521827697754 + }, + { + "auxiliary_loss_clip": 0.01081442, + "auxiliary_loss_mlp": 0.01032002, + "balance_loss_clip": 1.03796458, + "balance_loss_mlp": 1.01934206, + "epoch": 0.9099654291297159, + "flos": 24347865490560.0, + "grad_norm": 1.5144614886506496, + "language_loss": 0.72592616, + "learning_rate": 8.437919827761786e-08, + "loss": 0.74706054, + "num_input_tokens_seen": 326509445, + "step": 15135, + "time_per_iteration": 2.7127246856689453 + }, + { + "auxiliary_loss_clip": 0.01096766, + "auxiliary_loss_mlp": 0.01030937, + "balance_loss_clip": 1.03764153, + "balance_loss_mlp": 1.01891482, + "epoch": 0.9100255523823839, + "flos": 21215234609280.0, + "grad_norm": 1.683349330463744, + "language_loss": 0.70137173, + "learning_rate": 8.426730298881702e-08, + "loss": 0.72264874, + "num_input_tokens_seen": 326528380, + "step": 15136, + "time_per_iteration": 2.6193113327026367 + }, + { + "auxiliary_loss_clip": 0.00990412, + "auxiliary_loss_mlp": 0.01005783, + "balance_loss_clip": 1.00657475, + "balance_loss_mlp": 1.00484753, + "epoch": 0.9100856756350518, + "flos": 46052276446080.0, + "grad_norm": 0.825688175716241, + "language_loss": 0.59235996, + "learning_rate": 8.415548034479214e-08, + "loss": 0.61232191, + "num_input_tokens_seen": 326576940, + "step": 15137, + "time_per_iteration": 3.083552837371826 + }, + { + "auxiliary_loss_clip": 0.01098465, + "auxiliary_loss_mlp": 0.01035683, + "balance_loss_clip": 1.03574395, + "balance_loss_mlp": 1.02372026, + "epoch": 0.9101457988877198, + "flos": 20229917656320.0, + "grad_norm": 2.33052803483979, + "language_loss": 0.82487237, + "learning_rate": 8.40437303497834e-08, + "loss": 0.84621382, + "num_input_tokens_seen": 326596100, + "step": 15138, + "time_per_iteration": 2.674602508544922 + }, + { + "auxiliary_loss_clip": 0.01094423, + "auxiliary_loss_mlp": 0.01026368, + "balance_loss_clip": 1.037696, + "balance_loss_mlp": 1.01526928, + "epoch": 0.9102059221403878, + "flos": 26615157822720.0, + "grad_norm": 1.5741555664538536, + "language_loss": 0.81272125, + "learning_rate": 8.39320530080283e-08, + "loss": 0.83392918, + "num_input_tokens_seen": 326615700, + "step": 15139, + "time_per_iteration": 2.694201946258545 + }, + { + "auxiliary_loss_clip": 0.01076496, + "auxiliary_loss_mlp": 0.01033648, + "balance_loss_clip": 1.03743947, + "balance_loss_mlp": 1.02160764, + "epoch": 0.9102660453930558, + "flos": 21908561904000.0, + "grad_norm": 2.050091798744291, + "language_loss": 0.77814442, + "learning_rate": 8.382044832376167e-08, + "loss": 0.79924583, + "num_input_tokens_seen": 326635905, + "step": 15140, + "time_per_iteration": 2.722778558731079 + }, + { + "auxiliary_loss_clip": 0.01106393, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.0352447, + "balance_loss_mlp": 1.01943445, + "epoch": 0.9103261686457237, + "flos": 36176660916480.0, + "grad_norm": 1.7205881923201032, + "language_loss": 0.66666603, + "learning_rate": 8.370891630121569e-08, + "loss": 0.68804365, + "num_input_tokens_seen": 326661855, + "step": 15141, + "time_per_iteration": 2.7130444049835205 + }, + { + "auxiliary_loss_clip": 0.01095941, + "auxiliary_loss_mlp": 0.01037339, + "balance_loss_clip": 1.03542638, + "balance_loss_mlp": 1.02527499, + "epoch": 0.9103862918983917, + "flos": 23878549365120.0, + "grad_norm": 1.8944850892633267, + "language_loss": 0.75325441, + "learning_rate": 8.359745694462005e-08, + "loss": 0.77458721, + "num_input_tokens_seen": 326679320, + "step": 15142, + "time_per_iteration": 2.6429429054260254 + }, + { + "auxiliary_loss_clip": 0.01069268, + "auxiliary_loss_mlp": 0.01042478, + "balance_loss_clip": 1.03122544, + "balance_loss_mlp": 1.02982378, + "epoch": 0.9104464151510596, + "flos": 14939521989120.0, + "grad_norm": 1.6543746947107703, + "language_loss": 0.64361405, + "learning_rate": 8.348607025820076e-08, + "loss": 0.6647315, + "num_input_tokens_seen": 326698110, + "step": 15143, + "time_per_iteration": 2.669706344604492 + }, + { + "auxiliary_loss_clip": 0.01110746, + "auxiliary_loss_mlp": 0.0103534, + "balance_loss_clip": 1.03664672, + "balance_loss_mlp": 1.02197671, + "epoch": 0.9105065384037276, + "flos": 33655803500160.0, + "grad_norm": 1.826138803106712, + "language_loss": 0.61111665, + "learning_rate": 8.337475624618152e-08, + "loss": 0.63257754, + "num_input_tokens_seen": 326718370, + "step": 15144, + "time_per_iteration": 2.659849166870117 + }, + { + "auxiliary_loss_clip": 0.01065641, + "auxiliary_loss_mlp": 0.01027587, + "balance_loss_clip": 1.0301441, + "balance_loss_mlp": 1.01508248, + "epoch": 0.9105666616563957, + "flos": 24316695463680.0, + "grad_norm": 1.5990370313133018, + "language_loss": 0.70864612, + "learning_rate": 8.326351491278382e-08, + "loss": 0.72957838, + "num_input_tokens_seen": 326738445, + "step": 15145, + "time_per_iteration": 2.685203790664673 + }, + { + "auxiliary_loss_clip": 0.01047743, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.03243327, + "balance_loss_mlp": 1.02036476, + "epoch": 0.9106267849090636, + "flos": 29971692132480.0, + "grad_norm": 1.5060644265455205, + "language_loss": 0.70642048, + "learning_rate": 8.315234626222545e-08, + "loss": 0.72722512, + "num_input_tokens_seen": 326758855, + "step": 15146, + "time_per_iteration": 2.7676496505737305 + }, + { + "auxiliary_loss_clip": 0.01085776, + "auxiliary_loss_mlp": 0.01032662, + "balance_loss_clip": 1.03418636, + "balance_loss_mlp": 1.02068782, + "epoch": 0.9106869081617316, + "flos": 25337743470720.0, + "grad_norm": 1.8260905410066164, + "language_loss": 0.72899806, + "learning_rate": 8.304125029872233e-08, + "loss": 0.75018245, + "num_input_tokens_seen": 326777140, + "step": 15147, + "time_per_iteration": 2.6421234607696533 + }, + { + "auxiliary_loss_clip": 0.01081187, + "auxiliary_loss_mlp": 0.01031004, + "balance_loss_clip": 1.0361135, + "balance_loss_mlp": 1.01835012, + "epoch": 0.9107470314143995, + "flos": 18187031543040.0, + "grad_norm": 1.914291203586608, + "language_loss": 0.80780458, + "learning_rate": 8.293022702648711e-08, + "loss": 0.82892644, + "num_input_tokens_seen": 326794070, + "step": 15148, + "time_per_iteration": 2.6653599739074707 + }, + { + "auxiliary_loss_clip": 0.01076044, + "auxiliary_loss_mlp": 0.01039047, + "balance_loss_clip": 1.03479314, + "balance_loss_mlp": 1.02636874, + "epoch": 0.9108071546670675, + "flos": 23550828652800.0, + "grad_norm": 2.087055328388918, + "language_loss": 0.67585528, + "learning_rate": 8.281927644972996e-08, + "loss": 0.69700611, + "num_input_tokens_seen": 326814695, + "step": 15149, + "time_per_iteration": 2.758857011795044 + }, + { + "auxiliary_loss_clip": 0.01108552, + "auxiliary_loss_mlp": 0.01030478, + "balance_loss_clip": 1.03687429, + "balance_loss_mlp": 1.01744866, + "epoch": 0.9108672779197354, + "flos": 25630307746560.0, + "grad_norm": 1.9181044295268432, + "language_loss": 0.63203096, + "learning_rate": 8.270839857265776e-08, + "loss": 0.65342128, + "num_input_tokens_seen": 326835295, + "step": 15150, + "time_per_iteration": 2.650240898132324 + }, + { + "auxiliary_loss_clip": 0.01066309, + "auxiliary_loss_mlp": 0.01031448, + "balance_loss_clip": 1.03402328, + "balance_loss_mlp": 1.01881194, + "epoch": 0.9109274011724035, + "flos": 22339094319360.0, + "grad_norm": 2.2733833539943333, + "language_loss": 0.72643161, + "learning_rate": 8.259759339947514e-08, + "loss": 0.74740922, + "num_input_tokens_seen": 326853350, + "step": 15151, + "time_per_iteration": 2.706934690475464 + }, + { + "auxiliary_loss_clip": 0.01095436, + "auxiliary_loss_mlp": 0.01029482, + "balance_loss_clip": 1.03496432, + "balance_loss_mlp": 1.01727509, + "epoch": 0.9109875244250714, + "flos": 26688200129280.0, + "grad_norm": 1.648582433866266, + "language_loss": 0.64558387, + "learning_rate": 8.248686093438429e-08, + "loss": 0.66683304, + "num_input_tokens_seen": 326873425, + "step": 15152, + "time_per_iteration": 2.699647903442383 + }, + { + "auxiliary_loss_clip": 0.0108822, + "auxiliary_loss_mlp": 0.00770055, + "balance_loss_clip": 1.03658628, + "balance_loss_mlp": 1.00032091, + "epoch": 0.9110476476777394, + "flos": 22930112701440.0, + "grad_norm": 1.8488661615298092, + "language_loss": 0.73683035, + "learning_rate": 8.23762011815834e-08, + "loss": 0.75541312, + "num_input_tokens_seen": 326893455, + "step": 15153, + "time_per_iteration": 2.6998884677886963 + }, + { + "auxiliary_loss_clip": 0.01067073, + "auxiliary_loss_mlp": 0.01050891, + "balance_loss_clip": 1.03213048, + "balance_loss_mlp": 1.03591788, + "epoch": 0.9111077709304073, + "flos": 13472857854720.0, + "grad_norm": 2.3413318457237775, + "language_loss": 0.72122753, + "learning_rate": 8.226561414526956e-08, + "loss": 0.74240714, + "num_input_tokens_seen": 326910210, + "step": 15154, + "time_per_iteration": 2.683474540710449 + }, + { + "auxiliary_loss_clip": 0.01088157, + "auxiliary_loss_mlp": 0.01032363, + "balance_loss_clip": 1.03857923, + "balance_loss_mlp": 1.02037024, + "epoch": 0.9111678941830753, + "flos": 20850561780480.0, + "grad_norm": 1.7345027920232028, + "language_loss": 0.82108957, + "learning_rate": 8.215509982963564e-08, + "loss": 0.84229481, + "num_input_tokens_seen": 326929350, + "step": 15155, + "time_per_iteration": 2.7106335163116455 + }, + { + "auxiliary_loss_clip": 0.01096529, + "auxiliary_loss_mlp": 0.01031415, + "balance_loss_clip": 1.03773642, + "balance_loss_mlp": 1.01885629, + "epoch": 0.9112280174357432, + "flos": 19682244011520.0, + "grad_norm": 1.8052926393059447, + "language_loss": 0.5958488, + "learning_rate": 8.204465823887252e-08, + "loss": 0.61712825, + "num_input_tokens_seen": 326949060, + "step": 15156, + "time_per_iteration": 2.6679844856262207 + }, + { + "auxiliary_loss_clip": 0.01099444, + "auxiliary_loss_mlp": 0.01028039, + "balance_loss_clip": 1.03477848, + "balance_loss_mlp": 1.01498008, + "epoch": 0.9112881406884112, + "flos": 25447163276160.0, + "grad_norm": 2.321869813201265, + "language_loss": 0.74290884, + "learning_rate": 8.193428937716796e-08, + "loss": 0.76418364, + "num_input_tokens_seen": 326968950, + "step": 15157, + "time_per_iteration": 2.6687350273132324 + }, + { + "auxiliary_loss_clip": 0.01063031, + "auxiliary_loss_mlp": 0.01033754, + "balance_loss_clip": 1.03153825, + "balance_loss_mlp": 1.02228022, + "epoch": 0.9113482639410793, + "flos": 33066975847680.0, + "grad_norm": 1.6132914581945528, + "language_loss": 0.59553444, + "learning_rate": 8.182399324870747e-08, + "loss": 0.61650229, + "num_input_tokens_seen": 326989455, + "step": 15158, + "time_per_iteration": 2.8011231422424316 + }, + { + "auxiliary_loss_clip": 0.01050049, + "auxiliary_loss_mlp": 0.01031146, + "balance_loss_clip": 1.03574824, + "balance_loss_mlp": 1.01942158, + "epoch": 0.9114083871937472, + "flos": 21835591424640.0, + "grad_norm": 2.2386737595671047, + "language_loss": 0.68004364, + "learning_rate": 8.171376985767375e-08, + "loss": 0.70085549, + "num_input_tokens_seen": 327009640, + "step": 15159, + "time_per_iteration": 2.772341251373291 + }, + { + "auxiliary_loss_clip": 0.01087373, + "auxiliary_loss_mlp": 0.01029488, + "balance_loss_clip": 1.03617239, + "balance_loss_mlp": 1.0176506, + "epoch": 0.9114685104464152, + "flos": 27088999061760.0, + "grad_norm": 2.7938055787234015, + "language_loss": 0.78473425, + "learning_rate": 8.160361920824588e-08, + "loss": 0.8059029, + "num_input_tokens_seen": 327027690, + "step": 15160, + "time_per_iteration": 2.7388458251953125 + }, + { + "auxiliary_loss_clip": 0.01111531, + "auxiliary_loss_mlp": 0.01029053, + "balance_loss_clip": 1.03913951, + "balance_loss_mlp": 1.01570201, + "epoch": 0.9115286336990831, + "flos": 17967042696960.0, + "grad_norm": 1.6224624723660812, + "language_loss": 0.69028407, + "learning_rate": 8.149354130460073e-08, + "loss": 0.71168995, + "num_input_tokens_seen": 327045915, + "step": 15161, + "time_per_iteration": 2.6148221492767334 + }, + { + "auxiliary_loss_clip": 0.01060884, + "auxiliary_loss_mlp": 0.01040252, + "balance_loss_clip": 1.03252292, + "balance_loss_mlp": 1.02619767, + "epoch": 0.9115887569517511, + "flos": 22929861306240.0, + "grad_norm": 1.7334002472530148, + "language_loss": 0.7660948, + "learning_rate": 8.138353615091321e-08, + "loss": 0.78710622, + "num_input_tokens_seen": 327066355, + "step": 15162, + "time_per_iteration": 2.938532590866089 + }, + { + "auxiliary_loss_clip": 0.01082027, + "auxiliary_loss_mlp": 0.01032242, + "balance_loss_clip": 1.03714919, + "balance_loss_mlp": 1.01954055, + "epoch": 0.911648880204419, + "flos": 23988436047360.0, + "grad_norm": 1.8353414047586432, + "language_loss": 0.66910523, + "learning_rate": 8.127360375135395e-08, + "loss": 0.69024795, + "num_input_tokens_seen": 327086735, + "step": 15163, + "time_per_iteration": 2.6603245735168457 + }, + { + "auxiliary_loss_clip": 0.01066197, + "auxiliary_loss_mlp": 0.01033842, + "balance_loss_clip": 1.03512335, + "balance_loss_mlp": 1.0209856, + "epoch": 0.911709003457087, + "flos": 17055306754560.0, + "grad_norm": 7.686069864980859, + "language_loss": 0.70642608, + "learning_rate": 8.116374411009186e-08, + "loss": 0.72742647, + "num_input_tokens_seen": 327104035, + "step": 15164, + "time_per_iteration": 2.7450454235076904 + }, + { + "auxiliary_loss_clip": 0.01108615, + "auxiliary_loss_mlp": 0.01031494, + "balance_loss_clip": 1.03994727, + "balance_loss_mlp": 1.01950121, + "epoch": 0.911769126709755, + "flos": 21653344794240.0, + "grad_norm": 1.5696959903057297, + "language_loss": 0.76052606, + "learning_rate": 8.105395723129315e-08, + "loss": 0.78192717, + "num_input_tokens_seen": 327124370, + "step": 15165, + "time_per_iteration": 2.588705062866211 + }, + { + "auxiliary_loss_clip": 0.01093363, + "auxiliary_loss_mlp": 0.01033622, + "balance_loss_clip": 1.03510165, + "balance_loss_mlp": 1.02148008, + "epoch": 0.911829249962423, + "flos": 24790321221120.0, + "grad_norm": 2.0749050237393423, + "language_loss": 0.72525322, + "learning_rate": 8.094424311912074e-08, + "loss": 0.74652308, + "num_input_tokens_seen": 327140915, + "step": 15166, + "time_per_iteration": 4.245245933532715 + }, + { + "auxiliary_loss_clip": 0.01060198, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.03464365, + "balance_loss_mlp": 1.02491355, + "epoch": 0.9118893732150909, + "flos": 20959406968320.0, + "grad_norm": 1.8141703562808917, + "language_loss": 0.73241115, + "learning_rate": 8.083460177773482e-08, + "loss": 0.75339532, + "num_input_tokens_seen": 327158940, + "step": 15167, + "time_per_iteration": 5.897623062133789 + }, + { + "auxiliary_loss_clip": 0.0101816, + "auxiliary_loss_mlp": 0.00998888, + "balance_loss_clip": 1.01390624, + "balance_loss_mlp": 0.99787515, + "epoch": 0.9119494964677589, + "flos": 67917385872000.0, + "grad_norm": 0.7753194150086553, + "language_loss": 0.65546739, + "learning_rate": 8.072503321129298e-08, + "loss": 0.67563796, + "num_input_tokens_seen": 327217450, + "step": 15168, + "time_per_iteration": 3.210770845413208 + }, + { + "auxiliary_loss_clip": 0.01078881, + "auxiliary_loss_mlp": 0.01031567, + "balance_loss_clip": 1.03501606, + "balance_loss_mlp": 1.01959848, + "epoch": 0.9120096197204268, + "flos": 18551524803840.0, + "grad_norm": 1.9364628157336585, + "language_loss": 0.78129464, + "learning_rate": 8.061553742395033e-08, + "loss": 0.80239916, + "num_input_tokens_seen": 327233905, + "step": 15169, + "time_per_iteration": 4.273360729217529 + }, + { + "auxiliary_loss_clip": 0.01097706, + "auxiliary_loss_mlp": 0.0103039, + "balance_loss_clip": 1.03744388, + "balance_loss_mlp": 1.01839125, + "epoch": 0.9120697429730948, + "flos": 19025725178880.0, + "grad_norm": 1.8060821353354455, + "language_loss": 0.81748688, + "learning_rate": 8.05061144198591e-08, + "loss": 0.83876789, + "num_input_tokens_seen": 327252430, + "step": 15170, + "time_per_iteration": 2.6498122215270996 + }, + { + "auxiliary_loss_clip": 0.01100439, + "auxiliary_loss_mlp": 0.01030541, + "balance_loss_clip": 1.03837538, + "balance_loss_mlp": 1.01746333, + "epoch": 0.9121298662257629, + "flos": 17163685065600.0, + "grad_norm": 2.097374036278885, + "language_loss": 0.76902175, + "learning_rate": 8.039676420316799e-08, + "loss": 0.79033154, + "num_input_tokens_seen": 327269215, + "step": 15171, + "time_per_iteration": 2.6777992248535156 + }, + { + "auxiliary_loss_clip": 0.01025503, + "auxiliary_loss_mlp": 0.01038378, + "balance_loss_clip": 1.03109252, + "balance_loss_mlp": 1.02510428, + "epoch": 0.9121899894784308, + "flos": 19682710888320.0, + "grad_norm": 1.2924384179927062, + "language_loss": 0.66694897, + "learning_rate": 8.02874867780241e-08, + "loss": 0.68758774, + "num_input_tokens_seen": 327290320, + "step": 15172, + "time_per_iteration": 2.851702928543091 + }, + { + "auxiliary_loss_clip": 0.01079756, + "auxiliary_loss_mlp": 0.01033459, + "balance_loss_clip": 1.03634048, + "balance_loss_mlp": 1.02087665, + "epoch": 0.9122501127310988, + "flos": 22235743912320.0, + "grad_norm": 1.6696295487638473, + "language_loss": 0.74975204, + "learning_rate": 8.017828214857103e-08, + "loss": 0.77088416, + "num_input_tokens_seen": 327310150, + "step": 15173, + "time_per_iteration": 2.6567437648773193 + }, + { + "auxiliary_loss_clip": 0.01093131, + "auxiliary_loss_mlp": 0.01034759, + "balance_loss_clip": 1.03830385, + "balance_loss_mlp": 1.02032316, + "epoch": 0.9123102359837667, + "flos": 15957122290560.0, + "grad_norm": 5.127558879454518, + "language_loss": 0.6578263, + "learning_rate": 8.00691503189499e-08, + "loss": 0.67910528, + "num_input_tokens_seen": 327326660, + "step": 15174, + "time_per_iteration": 2.6690120697021484 + }, + { + "auxiliary_loss_clip": 0.01096653, + "auxiliary_loss_mlp": 0.0103167, + "balance_loss_clip": 1.03507042, + "balance_loss_mlp": 1.01747251, + "epoch": 0.9123703592364347, + "flos": 25155784149120.0, + "grad_norm": 1.9591497521426535, + "language_loss": 0.74826527, + "learning_rate": 7.996009129329894e-08, + "loss": 0.76954854, + "num_input_tokens_seen": 327346700, + "step": 15175, + "time_per_iteration": 2.6358284950256348 + }, + { + "auxiliary_loss_clip": 0.01017603, + "auxiliary_loss_mlp": 0.01002357, + "balance_loss_clip": 1.00503564, + "balance_loss_mlp": 1.00146246, + "epoch": 0.9124304824891026, + "flos": 60801650812800.0, + "grad_norm": 0.9602139847905503, + "language_loss": 0.58486784, + "learning_rate": 7.985110507575421e-08, + "loss": 0.60506743, + "num_input_tokens_seen": 327403050, + "step": 15176, + "time_per_iteration": 3.1978743076324463 + }, + { + "auxiliary_loss_clip": 0.01083812, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.03273082, + "balance_loss_mlp": 1.02405846, + "epoch": 0.9124906057417707, + "flos": 18150941352960.0, + "grad_norm": 1.6481113085508423, + "language_loss": 0.65639609, + "learning_rate": 7.97421916704475e-08, + "loss": 0.67760354, + "num_input_tokens_seen": 327422225, + "step": 15177, + "time_per_iteration": 2.6916801929473877 + }, + { + "auxiliary_loss_clip": 0.0107591, + "auxiliary_loss_mlp": 0.01028967, + "balance_loss_clip": 1.03282464, + "balance_loss_mlp": 1.01652193, + "epoch": 0.9125507289944386, + "flos": 11686769049600.0, + "grad_norm": 2.237261929253729, + "language_loss": 0.81215572, + "learning_rate": 7.963335108150926e-08, + "loss": 0.83320451, + "num_input_tokens_seen": 327437025, + "step": 15178, + "time_per_iteration": 2.6279830932617188 + }, + { + "auxiliary_loss_clip": 0.01049012, + "auxiliary_loss_mlp": 0.01039158, + "balance_loss_clip": 1.03083158, + "balance_loss_mlp": 1.02516901, + "epoch": 0.9126108522471066, + "flos": 17748813617280.0, + "grad_norm": 2.000734331425356, + "language_loss": 0.79079652, + "learning_rate": 7.952458331306711e-08, + "loss": 0.81167829, + "num_input_tokens_seen": 327453915, + "step": 15179, + "time_per_iteration": 2.675297737121582 + }, + { + "auxiliary_loss_clip": 0.01084629, + "auxiliary_loss_mlp": 0.01031574, + "balance_loss_clip": 1.0357244, + "balance_loss_mlp": 1.02000451, + "epoch": 0.9126709754997745, + "flos": 27635738952960.0, + "grad_norm": 1.5039394152550116, + "language_loss": 0.67973173, + "learning_rate": 7.941588836924507e-08, + "loss": 0.70089382, + "num_input_tokens_seen": 327474415, + "step": 15180, + "time_per_iteration": 2.697028875350952 + }, + { + "auxiliary_loss_clip": 0.0109496, + "auxiliary_loss_mlp": 0.01027912, + "balance_loss_clip": 1.03393316, + "balance_loss_mlp": 1.01655757, + "epoch": 0.9127310987524425, + "flos": 15924982596480.0, + "grad_norm": 1.6922587349839364, + "language_loss": 0.75127202, + "learning_rate": 7.930726625416495e-08, + "loss": 0.77250075, + "num_input_tokens_seen": 327492750, + "step": 15181, + "time_per_iteration": 2.6039087772369385 + }, + { + "auxiliary_loss_clip": 0.01113895, + "auxiliary_loss_mlp": 0.01031043, + "balance_loss_clip": 1.03893065, + "balance_loss_mlp": 1.01871705, + "epoch": 0.9127912220051104, + "flos": 21536885923200.0, + "grad_norm": 4.263529248122138, + "language_loss": 0.74789053, + "learning_rate": 7.919871697194614e-08, + "loss": 0.76933992, + "num_input_tokens_seen": 327509470, + "step": 15182, + "time_per_iteration": 2.5808985233306885 + }, + { + "auxiliary_loss_clip": 0.01109967, + "auxiliary_loss_mlp": 0.01030307, + "balance_loss_clip": 1.036412, + "balance_loss_mlp": 1.01767075, + "epoch": 0.9128513452577784, + "flos": 24063561342720.0, + "grad_norm": 1.4783992665801426, + "language_loss": 0.7637254, + "learning_rate": 7.909024052670421e-08, + "loss": 0.78512818, + "num_input_tokens_seen": 327530520, + "step": 15183, + "time_per_iteration": 2.690436601638794 + }, + { + "auxiliary_loss_clip": 0.0109821, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.03847337, + "balance_loss_mlp": 1.01838112, + "epoch": 0.9129114685104465, + "flos": 16216469464320.0, + "grad_norm": 2.432279077679038, + "language_loss": 0.76472139, + "learning_rate": 7.898183692255256e-08, + "loss": 0.78601527, + "num_input_tokens_seen": 327546960, + "step": 15184, + "time_per_iteration": 2.643298864364624 + }, + { + "auxiliary_loss_clip": 0.01093284, + "auxiliary_loss_mlp": 0.01035355, + "balance_loss_clip": 1.03755832, + "balance_loss_mlp": 1.02360058, + "epoch": 0.9129715917631144, + "flos": 19384364522880.0, + "grad_norm": 1.6196695380751174, + "language_loss": 0.74525392, + "learning_rate": 7.887350616360233e-08, + "loss": 0.76654035, + "num_input_tokens_seen": 327564830, + "step": 15185, + "time_per_iteration": 2.5846035480499268 + }, + { + "auxiliary_loss_clip": 0.0108538, + "auxiliary_loss_mlp": 0.01031798, + "balance_loss_clip": 1.03683412, + "balance_loss_mlp": 1.01925135, + "epoch": 0.9130317150157824, + "flos": 20590460421120.0, + "grad_norm": 2.0406191594638257, + "language_loss": 0.68331826, + "learning_rate": 7.876524825396158e-08, + "loss": 0.70449007, + "num_input_tokens_seen": 327583675, + "step": 15186, + "time_per_iteration": 2.6857335567474365 + }, + { + "auxiliary_loss_clip": 0.01089556, + "auxiliary_loss_mlp": 0.01041285, + "balance_loss_clip": 1.03548872, + "balance_loss_mlp": 1.02558517, + "epoch": 0.9130918382684503, + "flos": 20189230525440.0, + "grad_norm": 2.094200267173926, + "language_loss": 0.77826124, + "learning_rate": 7.865706319773502e-08, + "loss": 0.79956973, + "num_input_tokens_seen": 327602280, + "step": 15187, + "time_per_iteration": 2.707458972930908 + }, + { + "auxiliary_loss_clip": 0.01108019, + "auxiliary_loss_mlp": 0.007702, + "balance_loss_clip": 1.03599858, + "balance_loss_mlp": 1.00022209, + "epoch": 0.9131519615211183, + "flos": 25556870390400.0, + "grad_norm": 6.79519157361436, + "language_loss": 0.65794706, + "learning_rate": 7.854895099902515e-08, + "loss": 0.6767292, + "num_input_tokens_seen": 327623515, + "step": 15188, + "time_per_iteration": 2.6106925010681152 + }, + { + "auxiliary_loss_clip": 0.0103354, + "auxiliary_loss_mlp": 0.01035627, + "balance_loss_clip": 1.02865291, + "balance_loss_mlp": 1.02201962, + "epoch": 0.9132120847737862, + "flos": 17931563038080.0, + "grad_norm": 1.7656682346209025, + "language_loss": 0.76258671, + "learning_rate": 7.844091166193157e-08, + "loss": 0.78327841, + "num_input_tokens_seen": 327642875, + "step": 15189, + "time_per_iteration": 2.8081729412078857 + }, + { + "auxiliary_loss_clip": 0.0109744, + "auxiliary_loss_mlp": 0.01029243, + "balance_loss_clip": 1.03559053, + "balance_loss_mlp": 1.0180254, + "epoch": 0.9132722080264543, + "flos": 20047635112320.0, + "grad_norm": 1.7520638649774822, + "language_loss": 0.75371557, + "learning_rate": 7.8332945190551e-08, + "loss": 0.77498239, + "num_input_tokens_seen": 327662450, + "step": 15190, + "time_per_iteration": 2.6704981327056885 + }, + { + "auxiliary_loss_clip": 0.01019225, + "auxiliary_loss_mlp": 0.01003714, + "balance_loss_clip": 1.00641704, + "balance_loss_mlp": 1.00264728, + "epoch": 0.9133323312791222, + "flos": 70439967141120.0, + "grad_norm": 0.7014520418780014, + "language_loss": 0.57308424, + "learning_rate": 7.822505158897797e-08, + "loss": 0.59331357, + "num_input_tokens_seen": 327723845, + "step": 15191, + "time_per_iteration": 3.21588134765625 + }, + { + "auxiliary_loss_clip": 0.01113051, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.03901196, + "balance_loss_mlp": 1.02014041, + "epoch": 0.9133924545317902, + "flos": 25483792170240.0, + "grad_norm": 1.7022640616397489, + "language_loss": 0.74351078, + "learning_rate": 7.81172308613034e-08, + "loss": 0.76497352, + "num_input_tokens_seen": 327742590, + "step": 15192, + "time_per_iteration": 2.615525245666504 + }, + { + "auxiliary_loss_clip": 0.01096745, + "auxiliary_loss_mlp": 0.01028743, + "balance_loss_clip": 1.03728342, + "balance_loss_mlp": 1.01645255, + "epoch": 0.9134525777844581, + "flos": 39930690107520.0, + "grad_norm": 1.536018225691407, + "language_loss": 0.69412756, + "learning_rate": 7.800948301161647e-08, + "loss": 0.71538246, + "num_input_tokens_seen": 327764350, + "step": 15193, + "time_per_iteration": 2.774912118911743 + }, + { + "auxiliary_loss_clip": 0.01095342, + "auxiliary_loss_mlp": 0.0103875, + "balance_loss_clip": 1.03767395, + "balance_loss_mlp": 1.02737117, + "epoch": 0.9135127010371261, + "flos": 20886723797760.0, + "grad_norm": 1.712567345292954, + "language_loss": 0.73434842, + "learning_rate": 7.790180804400215e-08, + "loss": 0.75568932, + "num_input_tokens_seen": 327783120, + "step": 15194, + "time_per_iteration": 2.581974983215332 + }, + { + "auxiliary_loss_clip": 0.01063051, + "auxiliary_loss_mlp": 0.01041182, + "balance_loss_clip": 1.03309762, + "balance_loss_mlp": 1.02517855, + "epoch": 0.913572824289794, + "flos": 20813250528000.0, + "grad_norm": 1.8550488642948777, + "language_loss": 0.61682135, + "learning_rate": 7.779420596254383e-08, + "loss": 0.63786364, + "num_input_tokens_seen": 327801960, + "step": 15195, + "time_per_iteration": 2.881197929382324 + }, + { + "auxiliary_loss_clip": 0.01098691, + "auxiliary_loss_mlp": 0.01034267, + "balance_loss_clip": 1.03617358, + "balance_loss_mlp": 1.02182126, + "epoch": 0.913632947542462, + "flos": 25703278225920.0, + "grad_norm": 1.4758121064048373, + "language_loss": 0.71160495, + "learning_rate": 7.768667677132201e-08, + "loss": 0.73293453, + "num_input_tokens_seen": 327823795, + "step": 15196, + "time_per_iteration": 2.6203744411468506 + }, + { + "auxiliary_loss_clip": 0.01084959, + "auxiliary_loss_mlp": 0.01035793, + "balance_loss_clip": 1.034657, + "balance_loss_mlp": 1.02372885, + "epoch": 0.9136930707951301, + "flos": 26286216048000.0, + "grad_norm": 1.471790705908436, + "language_loss": 0.71344984, + "learning_rate": 7.757922047441411e-08, + "loss": 0.73465735, + "num_input_tokens_seen": 327845175, + "step": 15197, + "time_per_iteration": 2.6849207878112793 + }, + { + "auxiliary_loss_clip": 0.01088436, + "auxiliary_loss_mlp": 0.01027135, + "balance_loss_clip": 1.03387213, + "balance_loss_mlp": 1.01404572, + "epoch": 0.913753194047798, + "flos": 22091885942400.0, + "grad_norm": 1.7806883440096042, + "language_loss": 0.7787807, + "learning_rate": 7.747183707589489e-08, + "loss": 0.79993641, + "num_input_tokens_seen": 327863150, + "step": 15198, + "time_per_iteration": 2.629854202270508 + }, + { + "auxiliary_loss_clip": 0.01089748, + "auxiliary_loss_mlp": 0.01030529, + "balance_loss_clip": 1.03545046, + "balance_loss_mlp": 1.01816726, + "epoch": 0.913813317300466, + "flos": 23587206151680.0, + "grad_norm": 1.509412256528269, + "language_loss": 0.67781103, + "learning_rate": 7.736452657983616e-08, + "loss": 0.69901383, + "num_input_tokens_seen": 327883445, + "step": 15199, + "time_per_iteration": 2.6181437969207764 + }, + { + "auxiliary_loss_clip": 0.01097631, + "auxiliary_loss_mlp": 0.00769993, + "balance_loss_clip": 1.03525543, + "balance_loss_mlp": 1.00025439, + "epoch": 0.9138734405531339, + "flos": 28876452583680.0, + "grad_norm": 1.5467213284534869, + "language_loss": 0.67587829, + "learning_rate": 7.725728899030714e-08, + "loss": 0.69455445, + "num_input_tokens_seen": 327905745, + "step": 15200, + "time_per_iteration": 2.768298387527466 + }, + { + "auxiliary_loss_clip": 0.0109491, + "auxiliary_loss_mlp": 0.01031874, + "balance_loss_clip": 1.03708506, + "balance_loss_mlp": 1.020787, + "epoch": 0.9139335638058019, + "flos": 22821087945600.0, + "grad_norm": 1.5631891180078048, + "language_loss": 0.71305549, + "learning_rate": 7.715012431137435e-08, + "loss": 0.73432332, + "num_input_tokens_seen": 327925435, + "step": 15201, + "time_per_iteration": 2.6898791790008545 + }, + { + "auxiliary_loss_clip": 0.01096112, + "auxiliary_loss_mlp": 0.01027534, + "balance_loss_clip": 1.03487992, + "balance_loss_mlp": 1.01640594, + "epoch": 0.9139936870584698, + "flos": 18004174381440.0, + "grad_norm": 1.9050793527303824, + "language_loss": 0.70880222, + "learning_rate": 7.704303254710165e-08, + "loss": 0.73003864, + "num_input_tokens_seen": 327944145, + "step": 15202, + "time_per_iteration": 2.645087718963623 + }, + { + "auxiliary_loss_clip": 0.01107696, + "auxiliary_loss_mlp": 0.01031158, + "balance_loss_clip": 1.03578544, + "balance_loss_mlp": 1.01858711, + "epoch": 0.9140538103111379, + "flos": 15813767111040.0, + "grad_norm": 5.183790549538575, + "language_loss": 0.66272342, + "learning_rate": 7.693601370155001e-08, + "loss": 0.68411195, + "num_input_tokens_seen": 327960565, + "step": 15203, + "time_per_iteration": 2.5569849014282227 + }, + { + "auxiliary_loss_clip": 0.01099433, + "auxiliary_loss_mlp": 0.0102949, + "balance_loss_clip": 1.03735852, + "balance_loss_mlp": 1.01664543, + "epoch": 0.9141139335638058, + "flos": 23987035416960.0, + "grad_norm": 1.5350852350505626, + "language_loss": 0.68632525, + "learning_rate": 7.682906777877751e-08, + "loss": 0.70761448, + "num_input_tokens_seen": 327981180, + "step": 15204, + "time_per_iteration": 2.609595537185669 + }, + { + "auxiliary_loss_clip": 0.01096665, + "auxiliary_loss_mlp": 0.01024903, + "balance_loss_clip": 1.03312159, + "balance_loss_mlp": 1.01215935, + "epoch": 0.9141740568164738, + "flos": 24024418496640.0, + "grad_norm": 1.940906740500505, + "language_loss": 0.59392846, + "learning_rate": 7.672219478283915e-08, + "loss": 0.61514413, + "num_input_tokens_seen": 328001500, + "step": 15205, + "time_per_iteration": 4.150220632553101 + }, + { + "auxiliary_loss_clip": 0.01065472, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.03354537, + "balance_loss_mlp": 1.01977837, + "epoch": 0.9142341800691417, + "flos": 27018291139200.0, + "grad_norm": 1.7151218871860374, + "language_loss": 0.81336343, + "learning_rate": 7.661539471778811e-08, + "loss": 0.83434355, + "num_input_tokens_seen": 328023025, + "step": 15206, + "time_per_iteration": 4.417832612991333 + }, + { + "auxiliary_loss_clip": 0.01062676, + "auxiliary_loss_mlp": 0.0102896, + "balance_loss_clip": 1.03224123, + "balance_loss_mlp": 1.01588321, + "epoch": 0.9142943033218097, + "flos": 20412487509120.0, + "grad_norm": 2.7859643949116695, + "language_loss": 0.73940361, + "learning_rate": 7.650866758767382e-08, + "loss": 0.76031995, + "num_input_tokens_seen": 328041410, + "step": 15207, + "time_per_iteration": 2.729606866836548 + }, + { + "auxiliary_loss_clip": 0.01068037, + "auxiliary_loss_mlp": 0.01037257, + "balance_loss_clip": 1.04014826, + "balance_loss_mlp": 1.02391171, + "epoch": 0.9143544265744776, + "flos": 19755322231680.0, + "grad_norm": 1.6574585771542836, + "language_loss": 0.7323935, + "learning_rate": 7.640201339654373e-08, + "loss": 0.75344646, + "num_input_tokens_seen": 328060495, + "step": 15208, + "time_per_iteration": 4.227857351303101 + }, + { + "auxiliary_loss_clip": 0.01091165, + "auxiliary_loss_mlp": 0.01027772, + "balance_loss_clip": 1.03750086, + "balance_loss_mlp": 1.01647067, + "epoch": 0.9144145498271457, + "flos": 17165444832000.0, + "grad_norm": 2.0923542291564545, + "language_loss": 0.8601079, + "learning_rate": 7.629543214844237e-08, + "loss": 0.88129735, + "num_input_tokens_seen": 328076905, + "step": 15209, + "time_per_iteration": 2.590949058532715 + }, + { + "auxiliary_loss_clip": 0.01091262, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.04051423, + "balance_loss_mlp": 1.0222261, + "epoch": 0.9144746730798137, + "flos": 23726072131200.0, + "grad_norm": 1.9387336499719838, + "language_loss": 0.75063741, + "learning_rate": 7.618892384741093e-08, + "loss": 0.77189153, + "num_input_tokens_seen": 328096960, + "step": 15210, + "time_per_iteration": 2.6469690799713135 + }, + { + "auxiliary_loss_clip": 0.01083487, + "auxiliary_loss_mlp": 0.01032807, + "balance_loss_clip": 1.03146422, + "balance_loss_mlp": 1.02025414, + "epoch": 0.9145347963324816, + "flos": 25847854467840.0, + "grad_norm": 2.0189583543818994, + "language_loss": 0.78215957, + "learning_rate": 7.6082488497488e-08, + "loss": 0.80332255, + "num_input_tokens_seen": 328115445, + "step": 15211, + "time_per_iteration": 2.6844332218170166 + }, + { + "auxiliary_loss_clip": 0.01100808, + "auxiliary_loss_mlp": 0.01026537, + "balance_loss_clip": 1.03790462, + "balance_loss_mlp": 1.01447928, + "epoch": 0.9145949195851496, + "flos": 19242769109760.0, + "grad_norm": 1.6970166038949297, + "language_loss": 0.82861638, + "learning_rate": 7.597612610270986e-08, + "loss": 0.84988987, + "num_input_tokens_seen": 328133965, + "step": 15212, + "time_per_iteration": 2.670666217803955 + }, + { + "auxiliary_loss_clip": 0.01095988, + "auxiliary_loss_mlp": 0.01029094, + "balance_loss_clip": 1.03628695, + "balance_loss_mlp": 1.01744699, + "epoch": 0.9146550428378175, + "flos": 18296379521280.0, + "grad_norm": 1.816708490158756, + "language_loss": 0.83801937, + "learning_rate": 7.586983666711022e-08, + "loss": 0.85927022, + "num_input_tokens_seen": 328151520, + "step": 15213, + "time_per_iteration": 2.5807952880859375 + }, + { + "auxiliary_loss_clip": 0.01092484, + "auxiliary_loss_mlp": 0.0102752, + "balance_loss_clip": 1.03717518, + "balance_loss_mlp": 1.01593268, + "epoch": 0.9147151660904855, + "flos": 20084264006400.0, + "grad_norm": 1.7762329213074084, + "language_loss": 0.70716697, + "learning_rate": 7.576362019471894e-08, + "loss": 0.72836697, + "num_input_tokens_seen": 328171275, + "step": 15214, + "time_per_iteration": 2.606302499771118 + }, + { + "auxiliary_loss_clip": 0.01100282, + "auxiliary_loss_mlp": 0.0103609, + "balance_loss_clip": 1.03756428, + "balance_loss_mlp": 1.02288795, + "epoch": 0.9147752893431534, + "flos": 24389127239040.0, + "grad_norm": 2.6763056235741876, + "language_loss": 0.62738419, + "learning_rate": 7.565747668956413e-08, + "loss": 0.64874792, + "num_input_tokens_seen": 328192115, + "step": 15215, + "time_per_iteration": 2.624128580093384 + }, + { + "auxiliary_loss_clip": 0.01083257, + "auxiliary_loss_mlp": 0.01031489, + "balance_loss_clip": 1.04120791, + "balance_loss_mlp": 1.0186621, + "epoch": 0.9148354125958215, + "flos": 18150402648960.0, + "grad_norm": 2.856196608513459, + "language_loss": 0.75838691, + "learning_rate": 7.555140615567058e-08, + "loss": 0.77953434, + "num_input_tokens_seen": 328208990, + "step": 15216, + "time_per_iteration": 2.683112144470215 + }, + { + "auxiliary_loss_clip": 0.01082061, + "auxiliary_loss_mlp": 0.0104043, + "balance_loss_clip": 1.0344038, + "balance_loss_mlp": 1.02597594, + "epoch": 0.9148955358484894, + "flos": 23367540528000.0, + "grad_norm": 2.1556116302861223, + "language_loss": 0.679968, + "learning_rate": 7.544540859706062e-08, + "loss": 0.70119286, + "num_input_tokens_seen": 328227840, + "step": 15217, + "time_per_iteration": 2.7583320140838623 + }, + { + "auxiliary_loss_clip": 0.01096251, + "auxiliary_loss_mlp": 0.01034398, + "balance_loss_clip": 1.03755021, + "balance_loss_mlp": 1.02222061, + "epoch": 0.9149556591011574, + "flos": 18076498416000.0, + "grad_norm": 1.7866598830816114, + "language_loss": 0.79880273, + "learning_rate": 7.533948401775347e-08, + "loss": 0.82010925, + "num_input_tokens_seen": 328246250, + "step": 15218, + "time_per_iteration": 2.5897185802459717 + }, + { + "auxiliary_loss_clip": 0.0099941, + "auxiliary_loss_mlp": 0.00999986, + "balance_loss_clip": 1.00879896, + "balance_loss_mlp": 0.99891329, + "epoch": 0.9150157823538253, + "flos": 54586374825600.0, + "grad_norm": 0.8465659506320653, + "language_loss": 0.59200621, + "learning_rate": 7.523363242176595e-08, + "loss": 0.61200017, + "num_input_tokens_seen": 328303625, + "step": 15219, + "time_per_iteration": 3.1801815032958984 + }, + { + "auxiliary_loss_clip": 0.01096152, + "auxiliary_loss_mlp": 0.01034464, + "balance_loss_clip": 1.03535295, + "balance_loss_mlp": 1.0223403, + "epoch": 0.9150759056064933, + "flos": 17893102550400.0, + "grad_norm": 2.4543943314261063, + "language_loss": 0.78340375, + "learning_rate": 7.512785381311216e-08, + "loss": 0.80470991, + "num_input_tokens_seen": 328322135, + "step": 15220, + "time_per_iteration": 2.595521926879883 + }, + { + "auxiliary_loss_clip": 0.01057387, + "auxiliary_loss_mlp": 0.01042337, + "balance_loss_clip": 1.03327441, + "balance_loss_mlp": 1.02777517, + "epoch": 0.9151360288591612, + "flos": 18073517587200.0, + "grad_norm": 2.0267534769754683, + "language_loss": 0.66091788, + "learning_rate": 7.50221481958031e-08, + "loss": 0.68191504, + "num_input_tokens_seen": 328340750, + "step": 15221, + "time_per_iteration": 2.7066280841827393 + }, + { + "auxiliary_loss_clip": 0.01086188, + "auxiliary_loss_mlp": 0.01031232, + "balance_loss_clip": 1.0361774, + "balance_loss_mlp": 1.01978827, + "epoch": 0.9151961521118293, + "flos": 19354523299200.0, + "grad_norm": 1.6413784171664523, + "language_loss": 0.84243524, + "learning_rate": 7.491651557384692e-08, + "loss": 0.86360949, + "num_input_tokens_seen": 328359995, + "step": 15222, + "time_per_iteration": 2.6501386165618896 + }, + { + "auxiliary_loss_clip": 0.01014171, + "auxiliary_loss_mlp": 0.0100656, + "balance_loss_clip": 1.01053584, + "balance_loss_mlp": 1.00542736, + "epoch": 0.9152562753644973, + "flos": 72146621018880.0, + "grad_norm": 0.7238738726338669, + "language_loss": 0.49580848, + "learning_rate": 7.481095595124953e-08, + "loss": 0.51601577, + "num_input_tokens_seen": 328426865, + "step": 15223, + "time_per_iteration": 3.214282751083374 + }, + { + "auxiliary_loss_clip": 0.01078844, + "auxiliary_loss_mlp": 0.01037006, + "balance_loss_clip": 1.03739119, + "balance_loss_mlp": 1.02367282, + "epoch": 0.9153163986171652, + "flos": 20777016683520.0, + "grad_norm": 2.2306023467876175, + "language_loss": 0.72199959, + "learning_rate": 7.470546933201349e-08, + "loss": 0.7431581, + "num_input_tokens_seen": 328445970, + "step": 15224, + "time_per_iteration": 2.673509359359741 + }, + { + "auxiliary_loss_clip": 0.01093298, + "auxiliary_loss_mlp": 0.01029185, + "balance_loss_clip": 1.03519857, + "balance_loss_mlp": 1.01645935, + "epoch": 0.9153765218698332, + "flos": 23040107124480.0, + "grad_norm": 1.873148880683522, + "language_loss": 0.81030774, + "learning_rate": 7.460005572013895e-08, + "loss": 0.83153254, + "num_input_tokens_seen": 328464585, + "step": 15225, + "time_per_iteration": 2.5755882263183594 + }, + { + "auxiliary_loss_clip": 0.01105808, + "auxiliary_loss_mlp": 0.01023692, + "balance_loss_clip": 1.03513598, + "balance_loss_mlp": 1.01225948, + "epoch": 0.9154366451225011, + "flos": 28990900293120.0, + "grad_norm": 1.4093561745696859, + "language_loss": 0.71350908, + "learning_rate": 7.44947151196238e-08, + "loss": 0.73480415, + "num_input_tokens_seen": 328490155, + "step": 15226, + "time_per_iteration": 2.658024787902832 + }, + { + "auxiliary_loss_clip": 0.01038791, + "auxiliary_loss_mlp": 0.01029628, + "balance_loss_clip": 1.03364909, + "balance_loss_mlp": 1.01687872, + "epoch": 0.9154967683751691, + "flos": 22309504490880.0, + "grad_norm": 2.8076014846483166, + "language_loss": 0.74480593, + "learning_rate": 7.43894475344613e-08, + "loss": 0.76549006, + "num_input_tokens_seen": 328508275, + "step": 15227, + "time_per_iteration": 2.8204689025878906 + }, + { + "auxiliary_loss_clip": 0.01084535, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.03527713, + "balance_loss_mlp": 1.01924694, + "epoch": 0.915556891627837, + "flos": 24571481610240.0, + "grad_norm": 1.973795210729136, + "language_loss": 0.74037504, + "learning_rate": 7.428425296864404e-08, + "loss": 0.7615304, + "num_input_tokens_seen": 328529425, + "step": 15228, + "time_per_iteration": 2.745267152786255 + }, + { + "auxiliary_loss_clip": 0.0106924, + "auxiliary_loss_mlp": 0.01029082, + "balance_loss_clip": 1.03406215, + "balance_loss_mlp": 1.01733994, + "epoch": 0.9156170148805051, + "flos": 22164676853760.0, + "grad_norm": 1.4512437253894719, + "language_loss": 0.71928173, + "learning_rate": 7.417913142616106e-08, + "loss": 0.74026489, + "num_input_tokens_seen": 328550200, + "step": 15229, + "time_per_iteration": 2.8107035160064697 + }, + { + "auxiliary_loss_clip": 0.01111837, + "auxiliary_loss_mlp": 0.01035356, + "balance_loss_clip": 1.03959012, + "balance_loss_mlp": 1.0219506, + "epoch": 0.915677138133173, + "flos": 20920659171840.0, + "grad_norm": 1.9803845760849772, + "language_loss": 0.83079779, + "learning_rate": 7.407408291099848e-08, + "loss": 0.85226971, + "num_input_tokens_seen": 328568540, + "step": 15230, + "time_per_iteration": 2.5778980255126953 + }, + { + "auxiliary_loss_clip": 0.01068692, + "auxiliary_loss_mlp": 0.0102912, + "balance_loss_clip": 1.03630972, + "balance_loss_mlp": 1.01733065, + "epoch": 0.915737261385841, + "flos": 24345136056960.0, + "grad_norm": 1.5638994000303916, + "language_loss": 0.83665484, + "learning_rate": 7.396910742713957e-08, + "loss": 0.85763288, + "num_input_tokens_seen": 328587300, + "step": 15231, + "time_per_iteration": 2.757667303085327 + }, + { + "auxiliary_loss_clip": 0.0109037, + "auxiliary_loss_mlp": 0.0102554, + "balance_loss_clip": 1.03120708, + "balance_loss_mlp": 1.01339293, + "epoch": 0.9157973846385089, + "flos": 26761386090240.0, + "grad_norm": 1.4862156687145838, + "language_loss": 0.72474539, + "learning_rate": 7.386420497856516e-08, + "loss": 0.74590445, + "num_input_tokens_seen": 328610055, + "step": 15232, + "time_per_iteration": 2.65309739112854 + }, + { + "auxiliary_loss_clip": 0.01110021, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.03648698, + "balance_loss_mlp": 1.02338552, + "epoch": 0.9158575078911769, + "flos": 18478733892480.0, + "grad_norm": 2.186963867327178, + "language_loss": 0.67672479, + "learning_rate": 7.375937556925338e-08, + "loss": 0.69818151, + "num_input_tokens_seen": 328626815, + "step": 15233, + "time_per_iteration": 2.5290985107421875 + }, + { + "auxiliary_loss_clip": 0.01084574, + "auxiliary_loss_mlp": 0.01037951, + "balance_loss_clip": 1.03832459, + "balance_loss_mlp": 1.02474308, + "epoch": 0.9159176311438448, + "flos": 21798926616960.0, + "grad_norm": 1.9371619126619564, + "language_loss": 0.69512558, + "learning_rate": 7.365461920317861e-08, + "loss": 0.71635091, + "num_input_tokens_seen": 328643995, + "step": 15234, + "time_per_iteration": 2.6468849182128906 + }, + { + "auxiliary_loss_clip": 0.01086822, + "auxiliary_loss_mlp": 0.01034886, + "balance_loss_clip": 1.0372566, + "balance_loss_mlp": 1.02233958, + "epoch": 0.9159777543965129, + "flos": 24783749032320.0, + "grad_norm": 1.9164787678121122, + "language_loss": 0.88101876, + "learning_rate": 7.354993588431391e-08, + "loss": 0.90223587, + "num_input_tokens_seen": 328659565, + "step": 15235, + "time_per_iteration": 2.681330919265747 + }, + { + "auxiliary_loss_clip": 0.0104198, + "auxiliary_loss_mlp": 0.01037221, + "balance_loss_clip": 1.03242683, + "balance_loss_mlp": 1.0227077, + "epoch": 0.9160378776491809, + "flos": 26868758820480.0, + "grad_norm": 1.7189420130737911, + "language_loss": 0.77287024, + "learning_rate": 7.344532561662853e-08, + "loss": 0.79366231, + "num_input_tokens_seen": 328679045, + "step": 15236, + "time_per_iteration": 2.7985198497772217 + }, + { + "auxiliary_loss_clip": 0.00988696, + "auxiliary_loss_mlp": 0.01006396, + "balance_loss_clip": 1.01333547, + "balance_loss_mlp": 1.00522804, + "epoch": 0.9160980009018488, + "flos": 70578222589440.0, + "grad_norm": 0.6745147326692066, + "language_loss": 0.62227875, + "learning_rate": 7.334078840409019e-08, + "loss": 0.64222974, + "num_input_tokens_seen": 328744565, + "step": 15237, + "time_per_iteration": 3.2159206867218018 + }, + { + "auxiliary_loss_clip": 0.0111032, + "auxiliary_loss_mlp": 0.00770462, + "balance_loss_clip": 1.03761566, + "balance_loss_mlp": 1.00039566, + "epoch": 0.9161581241545168, + "flos": 16289332202880.0, + "grad_norm": 2.2962314429529638, + "language_loss": 0.75145757, + "learning_rate": 7.323632425066151e-08, + "loss": 0.77026534, + "num_input_tokens_seen": 328762455, + "step": 15238, + "time_per_iteration": 2.5952906608581543 + }, + { + "auxiliary_loss_clip": 0.01108796, + "auxiliary_loss_mlp": 0.01025665, + "balance_loss_clip": 1.03680956, + "balance_loss_mlp": 1.01369047, + "epoch": 0.9162182474071847, + "flos": 18438154502400.0, + "grad_norm": 2.6834766833849693, + "language_loss": 0.7463975, + "learning_rate": 7.313193316030464e-08, + "loss": 0.76774204, + "num_input_tokens_seen": 328780320, + "step": 15239, + "time_per_iteration": 2.5366570949554443 + }, + { + "auxiliary_loss_clip": 0.01078699, + "auxiliary_loss_mlp": 0.01035056, + "balance_loss_clip": 1.03494883, + "balance_loss_mlp": 1.02270627, + "epoch": 0.9162783706598527, + "flos": 19167248764800.0, + "grad_norm": 3.1115685298181797, + "language_loss": 0.63496542, + "learning_rate": 7.302761513697819e-08, + "loss": 0.65610296, + "num_input_tokens_seen": 328797570, + "step": 15240, + "time_per_iteration": 2.654343366622925 + }, + { + "auxiliary_loss_clip": 0.01084597, + "auxiliary_loss_mlp": 0.00769911, + "balance_loss_clip": 1.0354557, + "balance_loss_mlp": 1.00024796, + "epoch": 0.9163384939125206, + "flos": 20412990299520.0, + "grad_norm": 1.818210522630089, + "language_loss": 0.7633701, + "learning_rate": 7.292337018463746e-08, + "loss": 0.78191519, + "num_input_tokens_seen": 328814075, + "step": 15241, + "time_per_iteration": 2.681783676147461 + }, + { + "auxiliary_loss_clip": 0.01103855, + "auxiliary_loss_mlp": 0.01030564, + "balance_loss_clip": 1.03727055, + "balance_loss_mlp": 1.01654494, + "epoch": 0.9163986171651887, + "flos": 19645902426240.0, + "grad_norm": 2.120916469746568, + "language_loss": 0.67877054, + "learning_rate": 7.281919830723549e-08, + "loss": 0.70011473, + "num_input_tokens_seen": 328831990, + "step": 15242, + "time_per_iteration": 2.695181131362915 + }, + { + "auxiliary_loss_clip": 0.01095195, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.03303313, + "balance_loss_mlp": 1.02215111, + "epoch": 0.9164587404178566, + "flos": 12823054865280.0, + "grad_norm": 2.0974216325015944, + "language_loss": 0.80733311, + "learning_rate": 7.271509950872334e-08, + "loss": 0.8286351, + "num_input_tokens_seen": 328849105, + "step": 15243, + "time_per_iteration": 2.634120464324951 + }, + { + "auxiliary_loss_clip": 0.01082905, + "auxiliary_loss_mlp": 0.0103102, + "balance_loss_clip": 1.03140903, + "balance_loss_mlp": 1.01816344, + "epoch": 0.9165188636705246, + "flos": 22309396750080.0, + "grad_norm": 1.8693748825507899, + "language_loss": 0.82145083, + "learning_rate": 7.261107379304721e-08, + "loss": 0.84259009, + "num_input_tokens_seen": 328866810, + "step": 15244, + "time_per_iteration": 4.170153617858887 + }, + { + "auxiliary_loss_clip": 0.01113607, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.0378207, + "balance_loss_mlp": 1.02204251, + "epoch": 0.9165789869231925, + "flos": 18223337214720.0, + "grad_norm": 3.40014047465237, + "language_loss": 0.71937442, + "learning_rate": 7.250712116415214e-08, + "loss": 0.74086428, + "num_input_tokens_seen": 328885325, + "step": 15245, + "time_per_iteration": 4.17969822883606 + }, + { + "auxiliary_loss_clip": 0.01083804, + "auxiliary_loss_mlp": 0.01029232, + "balance_loss_clip": 1.03430676, + "balance_loss_mlp": 1.01741219, + "epoch": 0.9166391101758605, + "flos": 13691553811200.0, + "grad_norm": 1.6435574883208541, + "language_loss": 0.74527669, + "learning_rate": 7.240324162598033e-08, + "loss": 0.76640707, + "num_input_tokens_seen": 328902655, + "step": 15246, + "time_per_iteration": 4.363448858261108 + }, + { + "auxiliary_loss_clip": 0.01080629, + "auxiliary_loss_mlp": 0.01034328, + "balance_loss_clip": 1.03388071, + "balance_loss_mlp": 1.02122653, + "epoch": 0.9166992334285284, + "flos": 17346793622400.0, + "grad_norm": 1.9577914656696735, + "language_loss": 0.75327551, + "learning_rate": 7.229943518247106e-08, + "loss": 0.77442503, + "num_input_tokens_seen": 328918440, + "step": 15247, + "time_per_iteration": 2.664409637451172 + }, + { + "auxiliary_loss_clip": 0.01101374, + "auxiliary_loss_mlp": 0.01027057, + "balance_loss_clip": 1.03908849, + "balance_loss_mlp": 1.01476669, + "epoch": 0.9167593566811965, + "flos": 23731135948800.0, + "grad_norm": 1.669323682742113, + "language_loss": 0.76257682, + "learning_rate": 7.219570183756052e-08, + "loss": 0.7838611, + "num_input_tokens_seen": 328938055, + "step": 15248, + "time_per_iteration": 4.128343820571899 + }, + { + "auxiliary_loss_clip": 0.01097593, + "auxiliary_loss_mlp": 0.01037777, + "balance_loss_clip": 1.03494728, + "balance_loss_mlp": 1.02446711, + "epoch": 0.9168194799338644, + "flos": 27818201064960.0, + "grad_norm": 2.2661509072382424, + "language_loss": 0.72809201, + "learning_rate": 7.209204159518178e-08, + "loss": 0.74944574, + "num_input_tokens_seen": 328957895, + "step": 15249, + "time_per_iteration": 2.67682147026062 + }, + { + "auxiliary_loss_clip": 0.01060539, + "auxiliary_loss_mlp": 0.01029442, + "balance_loss_clip": 1.03332496, + "balance_loss_mlp": 1.01615024, + "epoch": 0.9168796031865324, + "flos": 21717552355200.0, + "grad_norm": 3.1939184411772406, + "language_loss": 0.75809246, + "learning_rate": 7.198845445926616e-08, + "loss": 0.7789923, + "num_input_tokens_seen": 328971365, + "step": 15250, + "time_per_iteration": 2.738577365875244 + }, + { + "auxiliary_loss_clip": 0.01066866, + "auxiliary_loss_mlp": 0.01026181, + "balance_loss_clip": 1.03519356, + "balance_loss_mlp": 1.01423001, + "epoch": 0.9169397264392004, + "flos": 23404420817280.0, + "grad_norm": 1.6784135757036345, + "language_loss": 0.75771379, + "learning_rate": 7.188494043374138e-08, + "loss": 0.77864426, + "num_input_tokens_seen": 328990830, + "step": 15251, + "time_per_iteration": 2.7864675521850586 + }, + { + "auxiliary_loss_clip": 0.01084617, + "auxiliary_loss_mlp": 0.01033127, + "balance_loss_clip": 1.03682351, + "balance_loss_mlp": 1.01889396, + "epoch": 0.9169998496918683, + "flos": 23950981140480.0, + "grad_norm": 3.1809452254911896, + "language_loss": 0.79785126, + "learning_rate": 7.178149952253298e-08, + "loss": 0.81902874, + "num_input_tokens_seen": 329008345, + "step": 15252, + "time_per_iteration": 2.67496395111084 + }, + { + "auxiliary_loss_clip": 0.01108344, + "auxiliary_loss_mlp": 0.01034651, + "balance_loss_clip": 1.03633821, + "balance_loss_mlp": 1.02253342, + "epoch": 0.9170599729445363, + "flos": 18332469711360.0, + "grad_norm": 1.6979211858236058, + "language_loss": 0.77028179, + "learning_rate": 7.167813172956316e-08, + "loss": 0.79171169, + "num_input_tokens_seen": 329027440, + "step": 15253, + "time_per_iteration": 2.5820562839508057 + }, + { + "auxiliary_loss_clip": 0.01099567, + "auxiliary_loss_mlp": 0.01026712, + "balance_loss_clip": 1.03753924, + "balance_loss_mlp": 1.0148387, + "epoch": 0.9171200961972042, + "flos": 22674859678080.0, + "grad_norm": 1.9944636420338524, + "language_loss": 0.73225999, + "learning_rate": 7.157483705875256e-08, + "loss": 0.75352275, + "num_input_tokens_seen": 329046445, + "step": 15254, + "time_per_iteration": 2.66645884513855 + }, + { + "auxiliary_loss_clip": 0.01069043, + "auxiliary_loss_mlp": 0.01024866, + "balance_loss_clip": 1.03459096, + "balance_loss_mlp": 1.01324344, + "epoch": 0.9171802194498723, + "flos": 26719298328960.0, + "grad_norm": 1.757918865833482, + "language_loss": 0.79068267, + "learning_rate": 7.14716155140167e-08, + "loss": 0.81162179, + "num_input_tokens_seen": 329065555, + "step": 15255, + "time_per_iteration": 2.791233539581299 + }, + { + "auxiliary_loss_clip": 0.01099583, + "auxiliary_loss_mlp": 0.01032665, + "balance_loss_clip": 1.0360918, + "balance_loss_mlp": 1.01973057, + "epoch": 0.9172403427025402, + "flos": 37889240538240.0, + "grad_norm": 2.1163696122590228, + "language_loss": 0.68610239, + "learning_rate": 7.136846709927047e-08, + "loss": 0.70742488, + "num_input_tokens_seen": 329087515, + "step": 15256, + "time_per_iteration": 2.8768861293792725 + }, + { + "auxiliary_loss_clip": 0.0109198, + "auxiliary_loss_mlp": 0.01039298, + "balance_loss_clip": 1.03456831, + "balance_loss_mlp": 1.02614951, + "epoch": 0.9173004659552082, + "flos": 17055163100160.0, + "grad_norm": 1.585817342288342, + "language_loss": 0.83782554, + "learning_rate": 7.126539181842561e-08, + "loss": 0.85913831, + "num_input_tokens_seen": 329106820, + "step": 15257, + "time_per_iteration": 2.65502667427063 + }, + { + "auxiliary_loss_clip": 0.01082945, + "auxiliary_loss_mlp": 0.01033799, + "balance_loss_clip": 1.03255379, + "balance_loss_mlp": 1.0220809, + "epoch": 0.9173605892078761, + "flos": 22201593056640.0, + "grad_norm": 1.60833944396701, + "language_loss": 0.7756505, + "learning_rate": 7.116238967539012e-08, + "loss": 0.79681796, + "num_input_tokens_seen": 329126515, + "step": 15258, + "time_per_iteration": 2.6512203216552734 + }, + { + "auxiliary_loss_clip": 0.01093895, + "auxiliary_loss_mlp": 0.01030397, + "balance_loss_clip": 1.03959584, + "balance_loss_mlp": 1.01836896, + "epoch": 0.9174207124605441, + "flos": 16507776764160.0, + "grad_norm": 2.0334925748000794, + "language_loss": 0.78772163, + "learning_rate": 7.105946067406999e-08, + "loss": 0.80896461, + "num_input_tokens_seen": 329142660, + "step": 15259, + "time_per_iteration": 2.5838210582733154 + }, + { + "auxiliary_loss_clip": 0.01059246, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.03190184, + "balance_loss_mlp": 1.02319837, + "epoch": 0.917480835713212, + "flos": 24535606901760.0, + "grad_norm": 1.6551719766080486, + "language_loss": 0.76302671, + "learning_rate": 7.095660481836895e-08, + "loss": 0.7839697, + "num_input_tokens_seen": 329162575, + "step": 15260, + "time_per_iteration": 2.682069778442383 + }, + { + "auxiliary_loss_clip": 0.01066153, + "auxiliary_loss_mlp": 0.01028553, + "balance_loss_clip": 1.03227329, + "balance_loss_mlp": 1.0160774, + "epoch": 0.9175409589658801, + "flos": 20880726226560.0, + "grad_norm": 1.5511805000911754, + "language_loss": 0.61173445, + "learning_rate": 7.085382211218637e-08, + "loss": 0.63268149, + "num_input_tokens_seen": 329182090, + "step": 15261, + "time_per_iteration": 2.681443929672241 + }, + { + "auxiliary_loss_clip": 0.01080586, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.03192782, + "balance_loss_mlp": 1.01745868, + "epoch": 0.917601082218548, + "flos": 14276035918080.0, + "grad_norm": 1.8788230361145468, + "language_loss": 0.73716688, + "learning_rate": 7.075111255942002e-08, + "loss": 0.75826824, + "num_input_tokens_seen": 329196535, + "step": 15262, + "time_per_iteration": 2.6560230255126953 + }, + { + "auxiliary_loss_clip": 0.01110257, + "auxiliary_loss_mlp": 0.01038053, + "balance_loss_clip": 1.03490496, + "balance_loss_mlp": 1.0255841, + "epoch": 0.917661205471216, + "flos": 19099234362240.0, + "grad_norm": 1.8199175016949676, + "language_loss": 0.77784705, + "learning_rate": 7.064847616396496e-08, + "loss": 0.79933017, + "num_input_tokens_seen": 329215135, + "step": 15263, + "time_per_iteration": 2.5552515983581543 + }, + { + "auxiliary_loss_clip": 0.01110998, + "auxiliary_loss_mlp": 0.01029572, + "balance_loss_clip": 1.03634441, + "balance_loss_mlp": 1.017097, + "epoch": 0.917721328723884, + "flos": 21106568989440.0, + "grad_norm": 2.03433288811874, + "language_loss": 0.75501031, + "learning_rate": 7.054591292971324e-08, + "loss": 0.776416, + "num_input_tokens_seen": 329235150, + "step": 15264, + "time_per_iteration": 2.5273077487945557 + }, + { + "auxiliary_loss_clip": 0.01085288, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.03628254, + "balance_loss_mlp": 1.02340412, + "epoch": 0.9177814519765519, + "flos": 21943215550080.0, + "grad_norm": 1.6751862272881284, + "language_loss": 0.83633941, + "learning_rate": 7.044342286055394e-08, + "loss": 0.8575418, + "num_input_tokens_seen": 329254365, + "step": 15265, + "time_per_iteration": 2.6066534519195557 + }, + { + "auxiliary_loss_clip": 0.01114086, + "auxiliary_loss_mlp": 0.01040959, + "balance_loss_clip": 1.03847134, + "balance_loss_mlp": 1.02778673, + "epoch": 0.9178415752292199, + "flos": 24205982768640.0, + "grad_norm": 1.6645706894370145, + "language_loss": 0.7328164, + "learning_rate": 7.034100596037306e-08, + "loss": 0.75436687, + "num_input_tokens_seen": 329274385, + "step": 15266, + "time_per_iteration": 2.5833418369293213 + }, + { + "auxiliary_loss_clip": 0.01108539, + "auxiliary_loss_mlp": 0.01030083, + "balance_loss_clip": 1.03649783, + "balance_loss_mlp": 1.01844811, + "epoch": 0.9179016984818879, + "flos": 20042068504320.0, + "grad_norm": 1.59558924604592, + "language_loss": 0.77707624, + "learning_rate": 7.023866223305486e-08, + "loss": 0.79846251, + "num_input_tokens_seen": 329292160, + "step": 15267, + "time_per_iteration": 2.551771879196167 + }, + { + "auxiliary_loss_clip": 0.01017255, + "auxiliary_loss_mlp": 0.00751276, + "balance_loss_clip": 1.00686395, + "balance_loss_mlp": 0.99959415, + "epoch": 0.9179618217345559, + "flos": 65555901100800.0, + "grad_norm": 0.7374753112947235, + "language_loss": 0.56223977, + "learning_rate": 7.013639168247975e-08, + "loss": 0.57992506, + "num_input_tokens_seen": 329351870, + "step": 15268, + "time_per_iteration": 3.2256064414978027 + }, + { + "auxiliary_loss_clip": 0.01110226, + "auxiliary_loss_mlp": 0.00770103, + "balance_loss_clip": 1.03661978, + "balance_loss_mlp": 1.00023341, + "epoch": 0.9180219449872238, + "flos": 21324618501120.0, + "grad_norm": 1.9824828423996201, + "language_loss": 0.76052523, + "learning_rate": 7.0034194312526e-08, + "loss": 0.77932847, + "num_input_tokens_seen": 329370930, + "step": 15269, + "time_per_iteration": 2.571711540222168 + }, + { + "auxiliary_loss_clip": 0.01074295, + "auxiliary_loss_mlp": 0.0103616, + "balance_loss_clip": 1.03202271, + "balance_loss_mlp": 1.02265382, + "epoch": 0.9180820682398918, + "flos": 41060008684800.0, + "grad_norm": 1.7451151800168656, + "language_loss": 0.72839332, + "learning_rate": 6.993207012706936e-08, + "loss": 0.74949783, + "num_input_tokens_seen": 329391275, + "step": 15270, + "time_per_iteration": 2.877145290374756 + }, + { + "auxiliary_loss_clip": 0.01105632, + "auxiliary_loss_mlp": 0.01032058, + "balance_loss_clip": 1.03500867, + "balance_loss_mlp": 1.0196898, + "epoch": 0.9181421914925597, + "flos": 28072915384320.0, + "grad_norm": 1.5262987533233972, + "language_loss": 0.80171967, + "learning_rate": 6.98300191299821e-08, + "loss": 0.82309657, + "num_input_tokens_seen": 329412775, + "step": 15271, + "time_per_iteration": 2.696314573287964 + }, + { + "auxiliary_loss_clip": 0.0106623, + "auxiliary_loss_mlp": 0.01035218, + "balance_loss_clip": 1.03281534, + "balance_loss_mlp": 1.02193236, + "epoch": 0.9182023147452277, + "flos": 29169411909120.0, + "grad_norm": 2.0203873157492387, + "language_loss": 0.72958052, + "learning_rate": 6.972804132513355e-08, + "loss": 0.75059497, + "num_input_tokens_seen": 329432440, + "step": 15272, + "time_per_iteration": 2.7418758869171143 + }, + { + "auxiliary_loss_clip": 0.01080541, + "auxiliary_loss_mlp": 0.01034356, + "balance_loss_clip": 1.03587949, + "balance_loss_mlp": 1.0225302, + "epoch": 0.9182624379978956, + "flos": 24060831909120.0, + "grad_norm": 1.8761263587608576, + "language_loss": 0.72443533, + "learning_rate": 6.962613671639105e-08, + "loss": 0.74558425, + "num_input_tokens_seen": 329450605, + "step": 15273, + "time_per_iteration": 2.5915794372558594 + }, + { + "auxiliary_loss_clip": 0.01068999, + "auxiliary_loss_mlp": 0.01026814, + "balance_loss_clip": 1.033952, + "balance_loss_mlp": 1.01544738, + "epoch": 0.9183225612505637, + "flos": 23293528554240.0, + "grad_norm": 1.6815527096411953, + "language_loss": 0.74486136, + "learning_rate": 6.952430530761933e-08, + "loss": 0.76581949, + "num_input_tokens_seen": 329470550, + "step": 15274, + "time_per_iteration": 2.757570266723633 + }, + { + "auxiliary_loss_clip": 0.01095676, + "auxiliary_loss_mlp": 0.01038928, + "balance_loss_clip": 1.03320456, + "balance_loss_mlp": 1.02651846, + "epoch": 0.9183826845032316, + "flos": 19609237618560.0, + "grad_norm": 1.4749833825049345, + "language_loss": 0.68892634, + "learning_rate": 6.942254710267902e-08, + "loss": 0.71027237, + "num_input_tokens_seen": 329489765, + "step": 15275, + "time_per_iteration": 2.5961973667144775 + }, + { + "auxiliary_loss_clip": 0.01094254, + "auxiliary_loss_mlp": 0.0103149, + "balance_loss_clip": 1.03530109, + "balance_loss_mlp": 1.01921117, + "epoch": 0.9184428077558996, + "flos": 18479057114880.0, + "grad_norm": 1.9188925482656494, + "language_loss": 0.72735369, + "learning_rate": 6.932086210542953e-08, + "loss": 0.74861109, + "num_input_tokens_seen": 329507040, + "step": 15276, + "time_per_iteration": 2.557286024093628 + }, + { + "auxiliary_loss_clip": 0.01086791, + "auxiliary_loss_mlp": 0.01032139, + "balance_loss_clip": 1.03626883, + "balance_loss_mlp": 1.02049232, + "epoch": 0.9185029310085676, + "flos": 20741034234240.0, + "grad_norm": 1.5932066455164264, + "language_loss": 0.73415935, + "learning_rate": 6.921925031972642e-08, + "loss": 0.75534868, + "num_input_tokens_seen": 329525540, + "step": 15277, + "time_per_iteration": 2.6720054149627686 + }, + { + "auxiliary_loss_clip": 0.01000655, + "auxiliary_loss_mlp": 0.00999523, + "balance_loss_clip": 1.00764501, + "balance_loss_mlp": 0.99853915, + "epoch": 0.9185630542612355, + "flos": 68209231875840.0, + "grad_norm": 0.7136545127762665, + "language_loss": 0.59176219, + "learning_rate": 6.91177117494226e-08, + "loss": 0.61176395, + "num_input_tokens_seen": 329592905, + "step": 15278, + "time_per_iteration": 3.3310906887054443 + }, + { + "auxiliary_loss_clip": 0.01068097, + "auxiliary_loss_mlp": 0.01030448, + "balance_loss_clip": 1.03167319, + "balance_loss_mlp": 1.01953995, + "epoch": 0.9186231775139035, + "flos": 12239470598400.0, + "grad_norm": 1.6780534662903475, + "language_loss": 0.63930976, + "learning_rate": 6.901624639836879e-08, + "loss": 0.66029525, + "num_input_tokens_seen": 329610150, + "step": 15279, + "time_per_iteration": 2.6621286869049072 + }, + { + "auxiliary_loss_clip": 0.0102767, + "auxiliary_loss_mlp": 0.00751159, + "balance_loss_clip": 1.0052371, + "balance_loss_mlp": 0.99961108, + "epoch": 0.9186833007665715, + "flos": 63939237770880.0, + "grad_norm": 0.8547221489704414, + "language_loss": 0.60236037, + "learning_rate": 6.891485427041211e-08, + "loss": 0.62014866, + "num_input_tokens_seen": 329673650, + "step": 15280, + "time_per_iteration": 3.122877836227417 + }, + { + "auxiliary_loss_clip": 0.01090206, + "auxiliary_loss_mlp": 0.01032261, + "balance_loss_clip": 1.03693056, + "balance_loss_mlp": 1.01968455, + "epoch": 0.9187434240192395, + "flos": 19974700546560.0, + "grad_norm": 2.122988890708145, + "language_loss": 0.69674432, + "learning_rate": 6.881353536939815e-08, + "loss": 0.717969, + "num_input_tokens_seen": 329692520, + "step": 15281, + "time_per_iteration": 2.6311352252960205 + }, + { + "auxiliary_loss_clip": 0.01086175, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.03539133, + "balance_loss_mlp": 1.01567471, + "epoch": 0.9188035472719074, + "flos": 25227820874880.0, + "grad_norm": 1.7776435853136854, + "language_loss": 0.84506124, + "learning_rate": 6.871228969916831e-08, + "loss": 0.86621511, + "num_input_tokens_seen": 329713750, + "step": 15282, + "time_per_iteration": 2.6757116317749023 + }, + { + "auxiliary_loss_clip": 0.01082882, + "auxiliary_loss_mlp": 0.01031746, + "balance_loss_clip": 1.03398228, + "balance_loss_mlp": 1.01928234, + "epoch": 0.9188636705245754, + "flos": 18405547931520.0, + "grad_norm": 1.9199915461067039, + "language_loss": 0.60464978, + "learning_rate": 6.861111726356194e-08, + "loss": 0.62579608, + "num_input_tokens_seen": 329730960, + "step": 15283, + "time_per_iteration": 2.666703224182129 + }, + { + "auxiliary_loss_clip": 0.01100933, + "auxiliary_loss_mlp": 0.00770887, + "balance_loss_clip": 1.03721941, + "balance_loss_mlp": 1.00024927, + "epoch": 0.9189237937772433, + "flos": 23769129559680.0, + "grad_norm": 1.5468834987808995, + "language_loss": 0.65656137, + "learning_rate": 6.851001806641554e-08, + "loss": 0.67527962, + "num_input_tokens_seen": 329750975, + "step": 15284, + "time_per_iteration": 5.761394023895264 + }, + { + "auxiliary_loss_clip": 0.01106112, + "auxiliary_loss_mlp": 0.01032597, + "balance_loss_clip": 1.03494716, + "balance_loss_mlp": 1.02003229, + "epoch": 0.9189839170299113, + "flos": 21214624078080.0, + "grad_norm": 2.0145798278900164, + "language_loss": 0.73759109, + "learning_rate": 6.840899211156292e-08, + "loss": 0.75897819, + "num_input_tokens_seen": 329769645, + "step": 15285, + "time_per_iteration": 2.5861620903015137 + }, + { + "auxiliary_loss_clip": 0.0110641, + "auxiliary_loss_mlp": 0.01035061, + "balance_loss_clip": 1.03556252, + "balance_loss_mlp": 1.02236581, + "epoch": 0.9190440402825792, + "flos": 16727370560640.0, + "grad_norm": 1.8392842036391315, + "language_loss": 0.71751177, + "learning_rate": 6.830803940283458e-08, + "loss": 0.73892653, + "num_input_tokens_seen": 329788185, + "step": 15286, + "time_per_iteration": 4.326793193817139 + }, + { + "auxiliary_loss_clip": 0.01109819, + "auxiliary_loss_mlp": 0.01033357, + "balance_loss_clip": 1.03742743, + "balance_loss_mlp": 1.02026165, + "epoch": 0.9191041635352473, + "flos": 23441193365760.0, + "grad_norm": 1.8870763932590424, + "language_loss": 0.73988366, + "learning_rate": 6.820715994405945e-08, + "loss": 0.76131546, + "num_input_tokens_seen": 329806780, + "step": 15287, + "time_per_iteration": 2.582787275314331 + }, + { + "auxiliary_loss_clip": 0.01110857, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.03881836, + "balance_loss_mlp": 1.01651573, + "epoch": 0.9191642867879152, + "flos": 18807532012800.0, + "grad_norm": 2.0747808934421883, + "language_loss": 0.65521705, + "learning_rate": 6.810635373906226e-08, + "loss": 0.67662358, + "num_input_tokens_seen": 329826350, + "step": 15288, + "time_per_iteration": 4.104849338531494 + }, + { + "auxiliary_loss_clip": 0.01112827, + "auxiliary_loss_mlp": 0.010354, + "balance_loss_clip": 1.04050171, + "balance_loss_mlp": 1.02340722, + "epoch": 0.9192244100405832, + "flos": 32160950167680.0, + "grad_norm": 1.8366679912878503, + "language_loss": 0.71489662, + "learning_rate": 6.800562079166549e-08, + "loss": 0.73637891, + "num_input_tokens_seen": 329846160, + "step": 15289, + "time_per_iteration": 2.628432273864746 + }, + { + "auxiliary_loss_clip": 0.01067852, + "auxiliary_loss_mlp": 0.01037164, + "balance_loss_clip": 1.03277981, + "balance_loss_mlp": 1.02398539, + "epoch": 0.9192845332932512, + "flos": 16357669827840.0, + "grad_norm": 2.022083421674923, + "language_loss": 0.7447117, + "learning_rate": 6.790496110568921e-08, + "loss": 0.76576185, + "num_input_tokens_seen": 329862020, + "step": 15290, + "time_per_iteration": 2.6732118129730225 + }, + { + "auxiliary_loss_clip": 0.01067483, + "auxiliary_loss_mlp": 0.01027695, + "balance_loss_clip": 1.03620386, + "balance_loss_mlp": 1.01607156, + "epoch": 0.9193446565459191, + "flos": 26614475464320.0, + "grad_norm": 1.914747567902696, + "language_loss": 0.72083873, + "learning_rate": 6.78043746849506e-08, + "loss": 0.74179053, + "num_input_tokens_seen": 329880185, + "step": 15291, + "time_per_iteration": 2.72456431388855 + }, + { + "auxiliary_loss_clip": 0.01083225, + "auxiliary_loss_mlp": 0.01026967, + "balance_loss_clip": 1.03504729, + "balance_loss_mlp": 1.0149684, + "epoch": 0.9194047797985871, + "flos": 22492182084480.0, + "grad_norm": 1.6500392247637397, + "language_loss": 0.71124983, + "learning_rate": 6.770386153326346e-08, + "loss": 0.73235166, + "num_input_tokens_seen": 329900255, + "step": 15292, + "time_per_iteration": 2.6152868270874023 + }, + { + "auxiliary_loss_clip": 0.01087602, + "auxiliary_loss_mlp": 0.01029375, + "balance_loss_clip": 1.03518999, + "balance_loss_mlp": 1.01654267, + "epoch": 0.9194649030512551, + "flos": 25078791346560.0, + "grad_norm": 2.1012892949543454, + "language_loss": 0.72765195, + "learning_rate": 6.760342165443988e-08, + "loss": 0.74882174, + "num_input_tokens_seen": 329919095, + "step": 15293, + "time_per_iteration": 2.7014577388763428 + }, + { + "auxiliary_loss_clip": 0.01106702, + "auxiliary_loss_mlp": 0.01026876, + "balance_loss_clip": 1.03621578, + "balance_loss_mlp": 1.01458549, + "epoch": 0.9195250263039231, + "flos": 11911139354880.0, + "grad_norm": 1.8656934281191482, + "language_loss": 0.78315026, + "learning_rate": 6.750305505228837e-08, + "loss": 0.80448604, + "num_input_tokens_seen": 329936505, + "step": 15294, + "time_per_iteration": 2.547825813293457 + }, + { + "auxiliary_loss_clip": 0.01088089, + "auxiliary_loss_mlp": 0.01036683, + "balance_loss_clip": 1.0347265, + "balance_loss_mlp": 1.02261615, + "epoch": 0.919585149556591, + "flos": 21834154880640.0, + "grad_norm": 1.8102816220705245, + "language_loss": 0.77170849, + "learning_rate": 6.74027617306141e-08, + "loss": 0.79295617, + "num_input_tokens_seen": 329956795, + "step": 15295, + "time_per_iteration": 2.7039098739624023 + }, + { + "auxiliary_loss_clip": 0.01106989, + "auxiliary_loss_mlp": 0.01029958, + "balance_loss_clip": 1.03723979, + "balance_loss_mlp": 1.01890755, + "epoch": 0.919645272809259, + "flos": 28184059042560.0, + "grad_norm": 2.3118295307682066, + "language_loss": 0.7140969, + "learning_rate": 6.730254169322114e-08, + "loss": 0.73546642, + "num_input_tokens_seen": 329977195, + "step": 15296, + "time_per_iteration": 2.6299383640289307 + }, + { + "auxiliary_loss_clip": 0.01109705, + "auxiliary_loss_mlp": 0.01039854, + "balance_loss_clip": 1.03783691, + "balance_loss_mlp": 1.02766538, + "epoch": 0.9197053960619269, + "flos": 18332828847360.0, + "grad_norm": 2.0214476637003567, + "language_loss": 0.75176775, + "learning_rate": 6.720239494390912e-08, + "loss": 0.77326334, + "num_input_tokens_seen": 329992095, + "step": 15297, + "time_per_iteration": 2.5208096504211426 + }, + { + "auxiliary_loss_clip": 0.01093577, + "auxiliary_loss_mlp": 0.00770462, + "balance_loss_clip": 1.03651249, + "balance_loss_mlp": 1.00015736, + "epoch": 0.9197655193145949, + "flos": 28183448511360.0, + "grad_norm": 1.600708869843347, + "language_loss": 0.73453987, + "learning_rate": 6.710232148647676e-08, + "loss": 0.75318027, + "num_input_tokens_seen": 330011490, + "step": 15298, + "time_per_iteration": 2.5899410247802734 + }, + { + "auxiliary_loss_clip": 0.01084548, + "auxiliary_loss_mlp": 0.01035002, + "balance_loss_clip": 1.03919554, + "balance_loss_mlp": 1.02254462, + "epoch": 0.9198256425672628, + "flos": 17306321973120.0, + "grad_norm": 1.9381032663604973, + "language_loss": 0.79355192, + "learning_rate": 6.70023213247175e-08, + "loss": 0.81474739, + "num_input_tokens_seen": 330027885, + "step": 15299, + "time_per_iteration": 2.618654251098633 + }, + { + "auxiliary_loss_clip": 0.01078356, + "auxiliary_loss_mlp": 0.01023938, + "balance_loss_clip": 1.03582788, + "balance_loss_mlp": 1.01230943, + "epoch": 0.9198857658199309, + "flos": 17858520731520.0, + "grad_norm": 2.334922484548837, + "language_loss": 0.63701689, + "learning_rate": 6.690239446242385e-08, + "loss": 0.65803981, + "num_input_tokens_seen": 330046230, + "step": 15300, + "time_per_iteration": 2.6653809547424316 + }, + { + "auxiliary_loss_clip": 0.01079487, + "auxiliary_loss_mlp": 0.00768011, + "balance_loss_clip": 1.03474522, + "balance_loss_mlp": 1.00012684, + "epoch": 0.9199458890725988, + "flos": 22127545169280.0, + "grad_norm": 1.7470881044851607, + "language_loss": 0.69722879, + "learning_rate": 6.680254090338545e-08, + "loss": 0.71570385, + "num_input_tokens_seen": 330065535, + "step": 15301, + "time_per_iteration": 2.6812119483947754 + }, + { + "auxiliary_loss_clip": 0.01096515, + "auxiliary_loss_mlp": 0.01040071, + "balance_loss_clip": 1.0358305, + "balance_loss_mlp": 1.02490699, + "epoch": 0.9200060123252668, + "flos": 16034043265920.0, + "grad_norm": 1.711835493107777, + "language_loss": 0.71127915, + "learning_rate": 6.670276065138814e-08, + "loss": 0.73264498, + "num_input_tokens_seen": 330082920, + "step": 15302, + "time_per_iteration": 2.5945441722869873 + }, + { + "auxiliary_loss_clip": 0.01110029, + "auxiliary_loss_mlp": 0.01030513, + "balance_loss_clip": 1.03716493, + "balance_loss_mlp": 1.0183115, + "epoch": 0.9200661355779348, + "flos": 26864521015680.0, + "grad_norm": 2.8681928190187556, + "language_loss": 0.76527154, + "learning_rate": 6.660305371021579e-08, + "loss": 0.78667694, + "num_input_tokens_seen": 330101165, + "step": 15303, + "time_per_iteration": 2.641113519668579 + }, + { + "auxiliary_loss_clip": 0.01088214, + "auxiliary_loss_mlp": 0.01031045, + "balance_loss_clip": 1.03663945, + "balance_loss_mlp": 1.01886749, + "epoch": 0.9201262588306027, + "flos": 12786749193600.0, + "grad_norm": 3.150146783563773, + "language_loss": 0.88236862, + "learning_rate": 6.650342008365006e-08, + "loss": 0.90356123, + "num_input_tokens_seen": 330118775, + "step": 15304, + "time_per_iteration": 2.6956560611724854 + }, + { + "auxiliary_loss_clip": 0.0104635, + "auxiliary_loss_mlp": 0.01037753, + "balance_loss_clip": 1.03275561, + "balance_loss_mlp": 1.02204168, + "epoch": 0.9201863820832707, + "flos": 20631614428800.0, + "grad_norm": 2.036135949691738, + "language_loss": 0.77178156, + "learning_rate": 6.64038597754677e-08, + "loss": 0.79262257, + "num_input_tokens_seen": 330135570, + "step": 15305, + "time_per_iteration": 2.817863941192627 + }, + { + "auxiliary_loss_clip": 0.01091635, + "auxiliary_loss_mlp": 0.01036183, + "balance_loss_clip": 1.03597045, + "balance_loss_mlp": 1.02348161, + "epoch": 0.9202465053359387, + "flos": 26395815421440.0, + "grad_norm": 6.064835951868583, + "language_loss": 0.8149547, + "learning_rate": 6.630437278944501e-08, + "loss": 0.8362329, + "num_input_tokens_seen": 330152840, + "step": 15306, + "time_per_iteration": 2.6748034954071045 + }, + { + "auxiliary_loss_clip": 0.01067915, + "auxiliary_loss_mlp": 0.01030425, + "balance_loss_clip": 1.03378415, + "balance_loss_mlp": 1.01910639, + "epoch": 0.9203066285886067, + "flos": 10488179093760.0, + "grad_norm": 1.9090343566843708, + "language_loss": 0.72313774, + "learning_rate": 6.62049591293541e-08, + "loss": 0.74412113, + "num_input_tokens_seen": 330168605, + "step": 15307, + "time_per_iteration": 2.707096815109253 + }, + { + "auxiliary_loss_clip": 0.01100301, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.03705478, + "balance_loss_mlp": 1.01726282, + "epoch": 0.9203667518412746, + "flos": 19390721230080.0, + "grad_norm": 2.092849830568689, + "language_loss": 0.78399515, + "learning_rate": 6.610561879896526e-08, + "loss": 0.80529916, + "num_input_tokens_seen": 330186160, + "step": 15308, + "time_per_iteration": 2.606255531311035 + }, + { + "auxiliary_loss_clip": 0.01084659, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.03293347, + "balance_loss_mlp": 1.01967311, + "epoch": 0.9204268750939426, + "flos": 15924982596480.0, + "grad_norm": 2.276895603959481, + "language_loss": 0.77885747, + "learning_rate": 6.600635180204484e-08, + "loss": 0.80003333, + "num_input_tokens_seen": 330201780, + "step": 15309, + "time_per_iteration": 2.637420654296875 + }, + { + "auxiliary_loss_clip": 0.01054204, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.03081393, + "balance_loss_mlp": 1.01686096, + "epoch": 0.9204869983466105, + "flos": 16471758401280.0, + "grad_norm": 1.8296999045819267, + "language_loss": 0.66413641, + "learning_rate": 6.590715814235781e-08, + "loss": 0.68498123, + "num_input_tokens_seen": 330219165, + "step": 15310, + "time_per_iteration": 2.7335994243621826 + }, + { + "auxiliary_loss_clip": 0.01044089, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.03122044, + "balance_loss_mlp": 1.01953483, + "epoch": 0.9205471215992785, + "flos": 21539220307200.0, + "grad_norm": 1.6521926444868564, + "language_loss": 0.66375726, + "learning_rate": 6.580803782366495e-08, + "loss": 0.6845206, + "num_input_tokens_seen": 330238975, + "step": 15311, + "time_per_iteration": 2.8604588508605957 + }, + { + "auxiliary_loss_clip": 0.01097174, + "auxiliary_loss_mlp": 0.01034831, + "balance_loss_clip": 1.03502798, + "balance_loss_mlp": 1.02240396, + "epoch": 0.9206072448519464, + "flos": 25005892694400.0, + "grad_norm": 1.6158209988301302, + "language_loss": 0.7622931, + "learning_rate": 6.570899084972503e-08, + "loss": 0.78361315, + "num_input_tokens_seen": 330259755, + "step": 15312, + "time_per_iteration": 2.664778232574463 + }, + { + "auxiliary_loss_clip": 0.01095599, + "auxiliary_loss_mlp": 0.01038065, + "balance_loss_clip": 1.03726745, + "balance_loss_mlp": 1.02628684, + "epoch": 0.9206673681046145, + "flos": 20522661500160.0, + "grad_norm": 1.6943388606397072, + "language_loss": 0.79487884, + "learning_rate": 6.561001722429394e-08, + "loss": 0.81621552, + "num_input_tokens_seen": 330277660, + "step": 15313, + "time_per_iteration": 2.5808446407318115 + }, + { + "auxiliary_loss_clip": 0.01100191, + "auxiliary_loss_mlp": 0.01030615, + "balance_loss_clip": 1.03598011, + "balance_loss_mlp": 1.01823509, + "epoch": 0.9207274913572824, + "flos": 20883455660160.0, + "grad_norm": 2.6484489133321976, + "language_loss": 0.78395313, + "learning_rate": 6.55111169511251e-08, + "loss": 0.80526119, + "num_input_tokens_seen": 330295455, + "step": 15314, + "time_per_iteration": 2.6530680656433105 + }, + { + "auxiliary_loss_clip": 0.01093159, + "auxiliary_loss_mlp": 0.01034883, + "balance_loss_clip": 1.0372566, + "balance_loss_mlp": 1.02071548, + "epoch": 0.9207876146099504, + "flos": 22708256348160.0, + "grad_norm": 1.9276699965768014, + "language_loss": 0.79122138, + "learning_rate": 6.541229003396864e-08, + "loss": 0.81250179, + "num_input_tokens_seen": 330315310, + "step": 15315, + "time_per_iteration": 2.6690027713775635 + }, + { + "auxiliary_loss_clip": 0.01089675, + "auxiliary_loss_mlp": 0.01032551, + "balance_loss_clip": 1.03612041, + "balance_loss_mlp": 1.01993239, + "epoch": 0.9208477378626184, + "flos": 18507354053760.0, + "grad_norm": 1.761446107604308, + "language_loss": 0.75961876, + "learning_rate": 6.531353647657156e-08, + "loss": 0.78084099, + "num_input_tokens_seen": 330333260, + "step": 15316, + "time_per_iteration": 2.5912938117980957 + }, + { + "auxiliary_loss_clip": 0.01108895, + "auxiliary_loss_mlp": 0.01034715, + "balance_loss_clip": 1.03550375, + "balance_loss_mlp": 1.02175713, + "epoch": 0.9209078611152863, + "flos": 22999635475200.0, + "grad_norm": 1.6108706295980322, + "language_loss": 0.69277954, + "learning_rate": 6.521485628267931e-08, + "loss": 0.71421564, + "num_input_tokens_seen": 330352465, + "step": 15317, + "time_per_iteration": 2.5787100791931152 + }, + { + "auxiliary_loss_clip": 0.01098793, + "auxiliary_loss_mlp": 0.01031182, + "balance_loss_clip": 1.03747189, + "balance_loss_mlp": 1.01845622, + "epoch": 0.9209679843679544, + "flos": 24061514267520.0, + "grad_norm": 1.6422186031600345, + "language_loss": 0.8337481, + "learning_rate": 6.511624945603378e-08, + "loss": 0.85504782, + "num_input_tokens_seen": 330372685, + "step": 15318, + "time_per_iteration": 2.655625820159912 + }, + { + "auxiliary_loss_clip": 0.01087423, + "auxiliary_loss_mlp": 0.01030908, + "balance_loss_clip": 1.03772366, + "balance_loss_mlp": 1.01855183, + "epoch": 0.9210281076206223, + "flos": 13553370190080.0, + "grad_norm": 1.8520706427370603, + "language_loss": 0.85584986, + "learning_rate": 6.501771600037354e-08, + "loss": 0.87703317, + "num_input_tokens_seen": 330388860, + "step": 15319, + "time_per_iteration": 2.62506103515625 + }, + { + "auxiliary_loss_clip": 0.01027307, + "auxiliary_loss_mlp": 0.01001328, + "balance_loss_clip": 1.00478411, + "balance_loss_mlp": 1.0003742, + "epoch": 0.9210882308732903, + "flos": 71426289674880.0, + "grad_norm": 0.7696536988394306, + "language_loss": 0.56245381, + "learning_rate": 6.491925591943559e-08, + "loss": 0.58274013, + "num_input_tokens_seen": 330448735, + "step": 15320, + "time_per_iteration": 3.1641623973846436 + }, + { + "auxiliary_loss_clip": 0.01060714, + "auxiliary_loss_mlp": 0.01048122, + "balance_loss_clip": 1.03641021, + "balance_loss_mlp": 1.03252339, + "epoch": 0.9211483541259582, + "flos": 18509113820160.0, + "grad_norm": 3.2582738862572156, + "language_loss": 0.63959485, + "learning_rate": 6.482086921695384e-08, + "loss": 0.66068316, + "num_input_tokens_seen": 330465600, + "step": 15321, + "time_per_iteration": 2.677826404571533 + }, + { + "auxiliary_loss_clip": 0.01068249, + "auxiliary_loss_mlp": 0.01028212, + "balance_loss_clip": 1.03475666, + "balance_loss_mlp": 1.01626706, + "epoch": 0.9212084773786262, + "flos": 23258228463360.0, + "grad_norm": 1.6685688646331795, + "language_loss": 0.71651804, + "learning_rate": 6.47225558966582e-08, + "loss": 0.73748261, + "num_input_tokens_seen": 330485770, + "step": 15322, + "time_per_iteration": 2.740342855453491 + }, + { + "auxiliary_loss_clip": 0.01058964, + "auxiliary_loss_mlp": 0.01032827, + "balance_loss_clip": 1.03519404, + "balance_loss_mlp": 1.02108479, + "epoch": 0.9212686006312941, + "flos": 16289511770880.0, + "grad_norm": 1.8646866235028916, + "language_loss": 0.69607079, + "learning_rate": 6.462431596227725e-08, + "loss": 0.71698868, + "num_input_tokens_seen": 330504255, + "step": 15323, + "time_per_iteration": 4.281275987625122 + }, + { + "auxiliary_loss_clip": 0.010823, + "auxiliary_loss_mlp": 0.01039009, + "balance_loss_clip": 1.03287673, + "balance_loss_mlp": 1.02479923, + "epoch": 0.9213287238839621, + "flos": 19785773986560.0, + "grad_norm": 1.8454161764247499, + "language_loss": 0.74490941, + "learning_rate": 6.452614941753597e-08, + "loss": 0.76612252, + "num_input_tokens_seen": 330520705, + "step": 15324, + "time_per_iteration": 4.212737798690796 + }, + { + "auxiliary_loss_clip": 0.01099326, + "auxiliary_loss_mlp": 0.01041809, + "balance_loss_clip": 1.0375061, + "balance_loss_mlp": 1.02970934, + "epoch": 0.92138884713663, + "flos": 21030402199680.0, + "grad_norm": 1.8170423555389452, + "language_loss": 0.71340334, + "learning_rate": 6.442805626615744e-08, + "loss": 0.73481476, + "num_input_tokens_seen": 330539245, + "step": 15325, + "time_per_iteration": 4.3058435916900635 + }, + { + "auxiliary_loss_clip": 0.01081418, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.03530788, + "balance_loss_mlp": 1.0195992, + "epoch": 0.9214489703892981, + "flos": 28587264186240.0, + "grad_norm": 1.763186738417038, + "language_loss": 0.78558946, + "learning_rate": 6.433003651186109e-08, + "loss": 0.80672109, + "num_input_tokens_seen": 330561815, + "step": 15326, + "time_per_iteration": 2.703559160232544 + }, + { + "auxiliary_loss_clip": 0.01101844, + "auxiliary_loss_mlp": 0.01033367, + "balance_loss_clip": 1.03805542, + "balance_loss_mlp": 1.02046824, + "epoch": 0.921509093641966, + "flos": 16361476669440.0, + "grad_norm": 2.8495287751902754, + "language_loss": 0.71737856, + "learning_rate": 6.42320901583635e-08, + "loss": 0.73873067, + "num_input_tokens_seen": 330579760, + "step": 15327, + "time_per_iteration": 4.265162706375122 + }, + { + "auxiliary_loss_clip": 0.01101192, + "auxiliary_loss_mlp": 0.0104188, + "balance_loss_clip": 1.03807735, + "balance_loss_mlp": 1.02861834, + "epoch": 0.921569216894634, + "flos": 26830837036800.0, + "grad_norm": 1.806043843779226, + "language_loss": 0.77786517, + "learning_rate": 6.413421720937906e-08, + "loss": 0.79929584, + "num_input_tokens_seen": 330598545, + "step": 15328, + "time_per_iteration": 2.7142398357391357 + }, + { + "auxiliary_loss_clip": 0.01088664, + "auxiliary_loss_mlp": 0.01032833, + "balance_loss_clip": 1.03698349, + "balance_loss_mlp": 1.02065539, + "epoch": 0.921629340147302, + "flos": 24645134448000.0, + "grad_norm": 3.3382204523213455, + "language_loss": 0.71625078, + "learning_rate": 6.4036417668619e-08, + "loss": 0.73746574, + "num_input_tokens_seen": 330616700, + "step": 15329, + "time_per_iteration": 2.8545138835906982 + }, + { + "auxiliary_loss_clip": 0.01095503, + "auxiliary_loss_mlp": 0.01028111, + "balance_loss_clip": 1.03497839, + "balance_loss_mlp": 1.01688147, + "epoch": 0.9216894633999699, + "flos": 15086504442240.0, + "grad_norm": 2.2459771654219067, + "language_loss": 0.86542726, + "learning_rate": 6.393869153979192e-08, + "loss": 0.88666344, + "num_input_tokens_seen": 330633355, + "step": 15330, + "time_per_iteration": 2.5924322605133057 + }, + { + "auxiliary_loss_clip": 0.01074582, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.03277349, + "balance_loss_mlp": 1.0190115, + "epoch": 0.921749586652638, + "flos": 19204524103680.0, + "grad_norm": 2.041503026001501, + "language_loss": 0.75815696, + "learning_rate": 6.384103882660397e-08, + "loss": 0.77922112, + "num_input_tokens_seen": 330651470, + "step": 15331, + "time_per_iteration": 2.6607861518859863 + }, + { + "auxiliary_loss_clip": 0.01096924, + "auxiliary_loss_mlp": 0.01029082, + "balance_loss_clip": 1.03500032, + "balance_loss_mlp": 1.01668429, + "epoch": 0.9218097099053059, + "flos": 20522446018560.0, + "grad_norm": 1.901322595674086, + "language_loss": 0.75386262, + "learning_rate": 6.374345953275794e-08, + "loss": 0.7751227, + "num_input_tokens_seen": 330669170, + "step": 15332, + "time_per_iteration": 2.682168483734131 + }, + { + "auxiliary_loss_clip": 0.01055682, + "auxiliary_loss_mlp": 0.01030851, + "balance_loss_clip": 1.03246427, + "balance_loss_mlp": 1.0191865, + "epoch": 0.9218698331579739, + "flos": 17348625216000.0, + "grad_norm": 1.7775108010679808, + "language_loss": 0.74603796, + "learning_rate": 6.364595366195358e-08, + "loss": 0.76690328, + "num_input_tokens_seen": 330686635, + "step": 15333, + "time_per_iteration": 2.7291512489318848 + }, + { + "auxiliary_loss_clip": 0.01017268, + "auxiliary_loss_mlp": 0.01001133, + "balance_loss_clip": 1.00694776, + "balance_loss_mlp": 1.00006628, + "epoch": 0.9219299564106418, + "flos": 61958332575360.0, + "grad_norm": 0.8092949717587729, + "language_loss": 0.52865499, + "learning_rate": 6.354852121788879e-08, + "loss": 0.54883903, + "num_input_tokens_seen": 330749160, + "step": 15334, + "time_per_iteration": 3.11421275138855 + }, + { + "auxiliary_loss_clip": 0.01080248, + "auxiliary_loss_mlp": 0.01032803, + "balance_loss_clip": 1.03543484, + "balance_loss_mlp": 1.02087057, + "epoch": 0.9219900796633098, + "flos": 15701761526400.0, + "grad_norm": 1.9555030178553923, + "language_loss": 0.62425917, + "learning_rate": 6.345116220425839e-08, + "loss": 0.64538974, + "num_input_tokens_seen": 330766840, + "step": 15335, + "time_per_iteration": 2.64497971534729 + }, + { + "auxiliary_loss_clip": 0.01055617, + "auxiliary_loss_mlp": 0.0103028, + "balance_loss_clip": 1.03126609, + "balance_loss_mlp": 1.01756644, + "epoch": 0.9220502029159777, + "flos": 24932670819840.0, + "grad_norm": 1.6552756447627857, + "language_loss": 0.71621144, + "learning_rate": 6.335387662475366e-08, + "loss": 0.73707038, + "num_input_tokens_seen": 330785585, + "step": 15336, + "time_per_iteration": 2.7646801471710205 + }, + { + "auxiliary_loss_clip": 0.01083887, + "auxiliary_loss_mlp": 0.01032421, + "balance_loss_clip": 1.03509367, + "balance_loss_mlp": 1.02121532, + "epoch": 0.9221103261686457, + "flos": 15667215621120.0, + "grad_norm": 1.8250492219467316, + "language_loss": 0.71701425, + "learning_rate": 6.325666448306433e-08, + "loss": 0.7381773, + "num_input_tokens_seen": 330800750, + "step": 15337, + "time_per_iteration": 2.6583242416381836 + }, + { + "auxiliary_loss_clip": 0.01020516, + "auxiliary_loss_mlp": 0.01000329, + "balance_loss_clip": 1.00723362, + "balance_loss_mlp": 0.99938756, + "epoch": 0.9221704494213137, + "flos": 67516299630720.0, + "grad_norm": 0.8846440580369678, + "language_loss": 0.65341711, + "learning_rate": 6.31595257828763e-08, + "loss": 0.67362559, + "num_input_tokens_seen": 330863640, + "step": 15338, + "time_per_iteration": 3.1719980239868164 + }, + { + "auxiliary_loss_clip": 0.01101462, + "auxiliary_loss_mlp": 0.01033959, + "balance_loss_clip": 1.0384047, + "balance_loss_mlp": 1.02131093, + "epoch": 0.9222305726739817, + "flos": 30226945155840.0, + "grad_norm": 1.9711725803511775, + "language_loss": 0.66986012, + "learning_rate": 6.306246052787289e-08, + "loss": 0.69121432, + "num_input_tokens_seen": 330884675, + "step": 15339, + "time_per_iteration": 2.7261481285095215 + }, + { + "auxiliary_loss_clip": 0.01109081, + "auxiliary_loss_mlp": 0.01029578, + "balance_loss_clip": 1.03689742, + "balance_loss_mlp": 1.01729918, + "epoch": 0.9222906959266496, + "flos": 25337204766720.0, + "grad_norm": 2.2637051134502015, + "language_loss": 0.71722078, + "learning_rate": 6.296546872173513e-08, + "loss": 0.73860735, + "num_input_tokens_seen": 330904125, + "step": 15340, + "time_per_iteration": 2.571516275405884 + }, + { + "auxiliary_loss_clip": 0.01074794, + "auxiliary_loss_mlp": 0.01031494, + "balance_loss_clip": 1.03479934, + "balance_loss_mlp": 1.01920938, + "epoch": 0.9223508191793176, + "flos": 27599864244480.0, + "grad_norm": 1.6254811741615818, + "language_loss": 0.70379698, + "learning_rate": 6.286855036814098e-08, + "loss": 0.72485995, + "num_input_tokens_seen": 330925140, + "step": 15341, + "time_per_iteration": 2.8622758388519287 + }, + { + "auxiliary_loss_clip": 0.01056229, + "auxiliary_loss_mlp": 0.01028103, + "balance_loss_clip": 1.03552556, + "balance_loss_mlp": 1.01709414, + "epoch": 0.9224109424319856, + "flos": 27307587277440.0, + "grad_norm": 1.6316629656961243, + "language_loss": 0.67473853, + "learning_rate": 6.277170547076571e-08, + "loss": 0.69558185, + "num_input_tokens_seen": 330946625, + "step": 15342, + "time_per_iteration": 2.9130048751831055 + }, + { + "auxiliary_loss_clip": 0.01059826, + "auxiliary_loss_mlp": 0.01031139, + "balance_loss_clip": 1.03590834, + "balance_loss_mlp": 1.01951027, + "epoch": 0.9224710656846535, + "flos": 48208314401280.0, + "grad_norm": 2.4312862834547175, + "language_loss": 0.6953969, + "learning_rate": 6.26749340332815e-08, + "loss": 0.71630651, + "num_input_tokens_seen": 330967795, + "step": 15343, + "time_per_iteration": 3.0083987712860107 + }, + { + "auxiliary_loss_clip": 0.01011696, + "auxiliary_loss_mlp": 0.0100494, + "balance_loss_clip": 1.008178, + "balance_loss_mlp": 1.00394428, + "epoch": 0.9225311889373216, + "flos": 66722171794560.0, + "grad_norm": 0.7265525500100153, + "language_loss": 0.51983988, + "learning_rate": 6.257823605935786e-08, + "loss": 0.54000616, + "num_input_tokens_seen": 331040850, + "step": 15344, + "time_per_iteration": 3.4099650382995605 + }, + { + "auxiliary_loss_clip": 0.01104022, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.03630853, + "balance_loss_mlp": 1.0211345, + "epoch": 0.9225913121899895, + "flos": 22271295398400.0, + "grad_norm": 1.703377825859211, + "language_loss": 0.70327353, + "learning_rate": 6.248161155266162e-08, + "loss": 0.7246387, + "num_input_tokens_seen": 331060595, + "step": 15345, + "time_per_iteration": 2.576371431350708 + }, + { + "auxiliary_loss_clip": 0.01087623, + "auxiliary_loss_mlp": 0.01037657, + "balance_loss_clip": 1.03598809, + "balance_loss_mlp": 1.02505088, + "epoch": 0.9226514354426575, + "flos": 20082719721600.0, + "grad_norm": 2.157686246893833, + "language_loss": 0.77242136, + "learning_rate": 6.238506051685677e-08, + "loss": 0.79367411, + "num_input_tokens_seen": 331080195, + "step": 15346, + "time_per_iteration": 2.6608493328094482 + }, + { + "auxiliary_loss_clip": 0.01089778, + "auxiliary_loss_mlp": 0.01037854, + "balance_loss_clip": 1.03755546, + "balance_loss_mlp": 1.02469873, + "epoch": 0.9227115586953254, + "flos": 16070851728000.0, + "grad_norm": 1.7988632334787429, + "language_loss": 0.76320672, + "learning_rate": 6.228858295560457e-08, + "loss": 0.78448308, + "num_input_tokens_seen": 331097645, + "step": 15347, + "time_per_iteration": 2.6887784004211426 + }, + { + "auxiliary_loss_clip": 0.01095866, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.03849506, + "balance_loss_mlp": 1.01933718, + "epoch": 0.9227716819479934, + "flos": 20446027833600.0, + "grad_norm": 1.7976243281525446, + "language_loss": 0.76849055, + "learning_rate": 6.219217887256367e-08, + "loss": 0.78975642, + "num_input_tokens_seen": 331116830, + "step": 15348, + "time_per_iteration": 2.6028568744659424 + }, + { + "auxiliary_loss_clip": 0.01087325, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.03495049, + "balance_loss_mlp": 1.02063584, + "epoch": 0.9228318052006613, + "flos": 25007401065600.0, + "grad_norm": 1.9643980003377204, + "language_loss": 0.67811698, + "learning_rate": 6.209584827138959e-08, + "loss": 0.69932616, + "num_input_tokens_seen": 331137235, + "step": 15349, + "time_per_iteration": 2.6671433448791504 + }, + { + "auxiliary_loss_clip": 0.01067448, + "auxiliary_loss_mlp": 0.01030854, + "balance_loss_clip": 1.03284109, + "balance_loss_mlp": 1.01793194, + "epoch": 0.9228919284533293, + "flos": 12677257560960.0, + "grad_norm": 5.153703084259653, + "language_loss": 0.86846638, + "learning_rate": 6.199959115573495e-08, + "loss": 0.88944942, + "num_input_tokens_seen": 331153155, + "step": 15350, + "time_per_iteration": 2.703225612640381 + }, + { + "auxiliary_loss_clip": 0.01009812, + "auxiliary_loss_mlp": 0.01000808, + "balance_loss_clip": 1.00661051, + "balance_loss_mlp": 0.9998011, + "epoch": 0.9229520517059973, + "flos": 69986162712960.0, + "grad_norm": 0.7762360061430656, + "language_loss": 0.60365206, + "learning_rate": 6.190340752924994e-08, + "loss": 0.62375826, + "num_input_tokens_seen": 331214895, + "step": 15351, + "time_per_iteration": 3.158869504928589 + }, + { + "auxiliary_loss_clip": 0.01083781, + "auxiliary_loss_mlp": 0.01026683, + "balance_loss_clip": 1.03323722, + "balance_loss_mlp": 1.01475024, + "epoch": 0.9230121749586653, + "flos": 14793832425600.0, + "grad_norm": 1.832472265730707, + "language_loss": 0.77387846, + "learning_rate": 6.180729739558233e-08, + "loss": 0.79498303, + "num_input_tokens_seen": 331232185, + "step": 15352, + "time_per_iteration": 2.627760171890259 + }, + { + "auxiliary_loss_clip": 0.0107378, + "auxiliary_loss_mlp": 0.01042996, + "balance_loss_clip": 1.03285944, + "balance_loss_mlp": 1.0284164, + "epoch": 0.9230722982113332, + "flos": 22967208472320.0, + "grad_norm": 1.8679415758316251, + "language_loss": 0.59430194, + "learning_rate": 6.171126075837585e-08, + "loss": 0.61546969, + "num_input_tokens_seen": 331251065, + "step": 15353, + "time_per_iteration": 2.7041702270507812 + }, + { + "auxiliary_loss_clip": 0.01083679, + "auxiliary_loss_mlp": 0.01026903, + "balance_loss_clip": 1.034688, + "balance_loss_mlp": 1.01505327, + "epoch": 0.9231324214640012, + "flos": 18551452976640.0, + "grad_norm": 1.711390419205093, + "language_loss": 0.7429471, + "learning_rate": 6.161529762127293e-08, + "loss": 0.76405293, + "num_input_tokens_seen": 331269110, + "step": 15354, + "time_per_iteration": 2.6137607097625732 + }, + { + "auxiliary_loss_clip": 0.01112951, + "auxiliary_loss_mlp": 0.01036765, + "balance_loss_clip": 1.03797793, + "balance_loss_mlp": 1.02363443, + "epoch": 0.9231925447166691, + "flos": 22082727974400.0, + "grad_norm": 2.2506024709408345, + "language_loss": 0.64660299, + "learning_rate": 6.1519407987912e-08, + "loss": 0.66810012, + "num_input_tokens_seen": 331286555, + "step": 15355, + "time_per_iteration": 2.562422275543213 + }, + { + "auxiliary_loss_clip": 0.0108125, + "auxiliary_loss_mlp": 0.01041248, + "balance_loss_clip": 1.03451049, + "balance_loss_mlp": 1.02839768, + "epoch": 0.9232526679693371, + "flos": 26541145848960.0, + "grad_norm": 1.5394884585282018, + "language_loss": 0.7420373, + "learning_rate": 6.142359186192947e-08, + "loss": 0.76326227, + "num_input_tokens_seen": 331307660, + "step": 15356, + "time_per_iteration": 2.6385319232940674 + }, + { + "auxiliary_loss_clip": 0.01084284, + "auxiliary_loss_mlp": 0.01035835, + "balance_loss_clip": 1.03417146, + "balance_loss_mlp": 1.02270436, + "epoch": 0.9233127912220052, + "flos": 14756664827520.0, + "grad_norm": 1.6931372093662804, + "language_loss": 0.60944784, + "learning_rate": 6.132784924695844e-08, + "loss": 0.63064903, + "num_input_tokens_seen": 331324885, + "step": 15357, + "time_per_iteration": 2.6290340423583984 + }, + { + "auxiliary_loss_clip": 0.01082317, + "auxiliary_loss_mlp": 0.01033403, + "balance_loss_clip": 1.03603005, + "balance_loss_mlp": 1.01992083, + "epoch": 0.9233729144746731, + "flos": 25261792162560.0, + "grad_norm": 1.453584491070713, + "language_loss": 0.70108932, + "learning_rate": 6.123218014662956e-08, + "loss": 0.72224653, + "num_input_tokens_seen": 331345885, + "step": 15358, + "time_per_iteration": 2.752317190170288 + }, + { + "auxiliary_loss_clip": 0.01108354, + "auxiliary_loss_mlp": 0.01033314, + "balance_loss_clip": 1.03619421, + "balance_loss_mlp": 1.02111292, + "epoch": 0.9234330377273411, + "flos": 27849837968640.0, + "grad_norm": 2.186505896470512, + "language_loss": 0.73299533, + "learning_rate": 6.113658456457104e-08, + "loss": 0.754412, + "num_input_tokens_seen": 331364320, + "step": 15359, + "time_per_iteration": 2.597811460494995 + }, + { + "auxiliary_loss_clip": 0.01047199, + "auxiliary_loss_mlp": 0.01033574, + "balance_loss_clip": 1.03515124, + "balance_loss_mlp": 1.02113438, + "epoch": 0.923493160980009, + "flos": 24608361899520.0, + "grad_norm": 1.822606379106128, + "language_loss": 0.64573818, + "learning_rate": 6.104106250440732e-08, + "loss": 0.66654599, + "num_input_tokens_seen": 331384135, + "step": 15360, + "time_per_iteration": 2.8328487873077393 + }, + { + "auxiliary_loss_clip": 0.01017958, + "auxiliary_loss_mlp": 0.00751388, + "balance_loss_clip": 1.00556254, + "balance_loss_mlp": 0.99968225, + "epoch": 0.923553284232677, + "flos": 67700916558720.0, + "grad_norm": 0.7601562180978135, + "language_loss": 0.5516786, + "learning_rate": 6.094561396976083e-08, + "loss": 0.56937212, + "num_input_tokens_seen": 331440645, + "step": 15361, + "time_per_iteration": 3.0788414478302 + }, + { + "auxiliary_loss_clip": 0.01075936, + "auxiliary_loss_mlp": 0.01031297, + "balance_loss_clip": 1.03308797, + "balance_loss_mlp": 1.01755273, + "epoch": 0.9236134074853449, + "flos": 18807244704000.0, + "grad_norm": 1.8816264544050445, + "language_loss": 0.69994414, + "learning_rate": 6.085023896425112e-08, + "loss": 0.72101647, + "num_input_tokens_seen": 331459580, + "step": 15362, + "time_per_iteration": 4.193416118621826 + }, + { + "auxiliary_loss_clip": 0.01094932, + "auxiliary_loss_mlp": 0.0103223, + "balance_loss_clip": 1.0347358, + "balance_loss_mlp": 1.01748371, + "epoch": 0.923673530738013, + "flos": 27782362270080.0, + "grad_norm": 1.488556622948845, + "language_loss": 0.75529814, + "learning_rate": 6.075493749149463e-08, + "loss": 0.77656972, + "num_input_tokens_seen": 331481560, + "step": 15363, + "time_per_iteration": 2.6561429500579834 + }, + { + "auxiliary_loss_clip": 0.01109631, + "auxiliary_loss_mlp": 0.01029622, + "balance_loss_clip": 1.03737402, + "balance_loss_mlp": 1.01739717, + "epoch": 0.9237336539906809, + "flos": 26797117144320.0, + "grad_norm": 1.930031039204044, + "language_loss": 0.82993495, + "learning_rate": 6.065970955510514e-08, + "loss": 0.85132754, + "num_input_tokens_seen": 331499090, + "step": 15364, + "time_per_iteration": 5.842444181442261 + }, + { + "auxiliary_loss_clip": 0.01074668, + "auxiliary_loss_mlp": 0.0102538, + "balance_loss_clip": 1.03544402, + "balance_loss_mlp": 1.01388252, + "epoch": 0.9237937772433489, + "flos": 23587708942080.0, + "grad_norm": 1.4281985355444542, + "language_loss": 0.67964804, + "learning_rate": 6.056455515869419e-08, + "loss": 0.70064855, + "num_input_tokens_seen": 331519420, + "step": 15365, + "time_per_iteration": 2.7319743633270264 + }, + { + "auxiliary_loss_clip": 0.01109561, + "auxiliary_loss_mlp": 0.01030475, + "balance_loss_clip": 1.03753567, + "balance_loss_mlp": 1.01805329, + "epoch": 0.9238539004960168, + "flos": 26140562398080.0, + "grad_norm": 2.080228129033925, + "language_loss": 0.62466252, + "learning_rate": 6.046947430586913e-08, + "loss": 0.64606285, + "num_input_tokens_seen": 331538720, + "step": 15366, + "time_per_iteration": 4.141804456710815 + }, + { + "auxiliary_loss_clip": 0.01076799, + "auxiliary_loss_mlp": 0.01028677, + "balance_loss_clip": 1.03669524, + "balance_loss_mlp": 1.01587987, + "epoch": 0.9239140237486848, + "flos": 21068000760960.0, + "grad_norm": 1.4710054055259818, + "language_loss": 0.74650168, + "learning_rate": 6.037446700023619e-08, + "loss": 0.76755643, + "num_input_tokens_seen": 331558505, + "step": 15367, + "time_per_iteration": 2.6937508583068848 + }, + { + "auxiliary_loss_clip": 0.01083975, + "auxiliary_loss_mlp": 0.00768707, + "balance_loss_clip": 1.03666592, + "balance_loss_mlp": 1.0002172, + "epoch": 0.9239741470013527, + "flos": 24607930936320.0, + "grad_norm": 2.0965464238234857, + "language_loss": 0.65042406, + "learning_rate": 6.027953324539759e-08, + "loss": 0.66895086, + "num_input_tokens_seen": 331578440, + "step": 15368, + "time_per_iteration": 2.7437262535095215 + }, + { + "auxiliary_loss_clip": 0.01101382, + "auxiliary_loss_mlp": 0.01034709, + "balance_loss_clip": 1.03610659, + "balance_loss_mlp": 1.02171481, + "epoch": 0.9240342702540207, + "flos": 24718248581760.0, + "grad_norm": 1.7086931963123835, + "language_loss": 0.74773824, + "learning_rate": 6.018467304495401e-08, + "loss": 0.76909912, + "num_input_tokens_seen": 331598945, + "step": 15369, + "time_per_iteration": 2.6743035316467285 + }, + { + "auxiliary_loss_clip": 0.01104923, + "auxiliary_loss_mlp": 0.01037418, + "balance_loss_clip": 1.04013598, + "balance_loss_mlp": 1.02334499, + "epoch": 0.9240943935066888, + "flos": 20849987162880.0, + "grad_norm": 2.038986001331123, + "language_loss": 0.76338404, + "learning_rate": 6.008988640250145e-08, + "loss": 0.78480744, + "num_input_tokens_seen": 331616700, + "step": 15370, + "time_per_iteration": 2.607760429382324 + }, + { + "auxiliary_loss_clip": 0.01109143, + "auxiliary_loss_mlp": 0.01031875, + "balance_loss_clip": 1.03663468, + "balance_loss_mlp": 1.0196085, + "epoch": 0.9241545167593567, + "flos": 24462313200000.0, + "grad_norm": 2.4044373841458495, + "language_loss": 0.66958445, + "learning_rate": 5.999517332163528e-08, + "loss": 0.69099462, + "num_input_tokens_seen": 331635625, + "step": 15371, + "time_per_iteration": 2.6164920330047607 + }, + { + "auxiliary_loss_clip": 0.01011382, + "auxiliary_loss_mlp": 0.01002865, + "balance_loss_clip": 1.00752091, + "balance_loss_mlp": 1.00176203, + "epoch": 0.9242146400120247, + "flos": 61827259847040.0, + "grad_norm": 0.7346104185663305, + "language_loss": 0.57681966, + "learning_rate": 5.99005338059464e-08, + "loss": 0.59696221, + "num_input_tokens_seen": 331698595, + "step": 15372, + "time_per_iteration": 3.1782453060150146 + }, + { + "auxiliary_loss_clip": 0.01108932, + "auxiliary_loss_mlp": 0.01030295, + "balance_loss_clip": 1.03912938, + "balance_loss_mlp": 1.01884508, + "epoch": 0.9242747632646926, + "flos": 22048397550720.0, + "grad_norm": 2.204946411354503, + "language_loss": 0.70037317, + "learning_rate": 5.98059678590237e-08, + "loss": 0.72176552, + "num_input_tokens_seen": 331717975, + "step": 15373, + "time_per_iteration": 2.5716300010681152 + }, + { + "auxiliary_loss_clip": 0.01093668, + "auxiliary_loss_mlp": 0.01037655, + "balance_loss_clip": 1.03443408, + "balance_loss_mlp": 1.02547812, + "epoch": 0.9243348865173606, + "flos": 18478338842880.0, + "grad_norm": 2.311286050873559, + "language_loss": 0.75668836, + "learning_rate": 5.971147548445299e-08, + "loss": 0.77800161, + "num_input_tokens_seen": 331737220, + "step": 15374, + "time_per_iteration": 2.6773972511291504 + }, + { + "auxiliary_loss_clip": 0.01071113, + "auxiliary_loss_mlp": 0.01034847, + "balance_loss_clip": 1.03411102, + "balance_loss_mlp": 1.02240705, + "epoch": 0.9243950097700285, + "flos": 23258767167360.0, + "grad_norm": 1.61997040297718, + "language_loss": 0.64933169, + "learning_rate": 5.961705668581784e-08, + "loss": 0.67039132, + "num_input_tokens_seen": 331757300, + "step": 15375, + "time_per_iteration": 2.724712371826172 + }, + { + "auxiliary_loss_clip": 0.01080494, + "auxiliary_loss_mlp": 0.01034023, + "balance_loss_clip": 1.03776979, + "balance_loss_mlp": 1.02213168, + "epoch": 0.9244551330226966, + "flos": 29749081593600.0, + "grad_norm": 1.73726236759537, + "language_loss": 0.66592222, + "learning_rate": 5.952271146669829e-08, + "loss": 0.68706739, + "num_input_tokens_seen": 331776995, + "step": 15376, + "time_per_iteration": 2.7318432331085205 + }, + { + "auxiliary_loss_clip": 0.01027325, + "auxiliary_loss_mlp": 0.01000699, + "balance_loss_clip": 1.004807, + "balance_loss_mlp": 0.99974555, + "epoch": 0.9245152562753645, + "flos": 68864960609280.0, + "grad_norm": 0.6503040166791668, + "language_loss": 0.61148441, + "learning_rate": 5.94284398306717e-08, + "loss": 0.63176465, + "num_input_tokens_seen": 331845015, + "step": 15377, + "time_per_iteration": 3.3028318881988525 + }, + { + "auxiliary_loss_clip": 0.01066627, + "auxiliary_loss_mlp": 0.01036109, + "balance_loss_clip": 1.03230667, + "balance_loss_mlp": 1.02378881, + "epoch": 0.9245753795280325, + "flos": 21579260993280.0, + "grad_norm": 1.8441759941724984, + "language_loss": 0.74442959, + "learning_rate": 5.933424178131341e-08, + "loss": 0.76545691, + "num_input_tokens_seen": 331862795, + "step": 15378, + "time_per_iteration": 2.7058117389678955 + }, + { + "auxiliary_loss_clip": 0.01111214, + "auxiliary_loss_mlp": 0.0103357, + "balance_loss_clip": 1.03782296, + "balance_loss_mlp": 1.02046919, + "epoch": 0.9246355027807004, + "flos": 34496077334400.0, + "grad_norm": 2.4430826807179122, + "language_loss": 0.62603706, + "learning_rate": 5.924011732219503e-08, + "loss": 0.6474849, + "num_input_tokens_seen": 331882535, + "step": 15379, + "time_per_iteration": 2.672102928161621 + }, + { + "auxiliary_loss_clip": 0.01027241, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.03090858, + "balance_loss_mlp": 1.02008975, + "epoch": 0.9246956260333684, + "flos": 15953854152960.0, + "grad_norm": 2.0533190004869133, + "language_loss": 0.83975178, + "learning_rate": 5.914606645688591e-08, + "loss": 0.86036825, + "num_input_tokens_seen": 331899335, + "step": 15380, + "time_per_iteration": 2.8909592628479004 + }, + { + "auxiliary_loss_clip": 0.01110328, + "auxiliary_loss_mlp": 0.01034939, + "balance_loss_clip": 1.03606331, + "balance_loss_mlp": 1.02165866, + "epoch": 0.9247557492860363, + "flos": 23368366540800.0, + "grad_norm": 1.485445137788739, + "language_loss": 0.73372233, + "learning_rate": 5.905208918895233e-08, + "loss": 0.75517505, + "num_input_tokens_seen": 331919030, + "step": 15381, + "time_per_iteration": 2.6360280513763428 + }, + { + "auxiliary_loss_clip": 0.01093808, + "auxiliary_loss_mlp": 0.01032177, + "balance_loss_clip": 1.03822911, + "balance_loss_mlp": 1.01991057, + "epoch": 0.9248158725387043, + "flos": 23039855729280.0, + "grad_norm": 1.7916241982506211, + "language_loss": 0.78368294, + "learning_rate": 5.8958185521958524e-08, + "loss": 0.80494279, + "num_input_tokens_seen": 331936465, + "step": 15382, + "time_per_iteration": 2.6322009563446045 + }, + { + "auxiliary_loss_clip": 0.01085867, + "auxiliary_loss_mlp": 0.01035748, + "balance_loss_clip": 1.03582263, + "balance_loss_mlp": 1.02334988, + "epoch": 0.9248759957913724, + "flos": 22522418357760.0, + "grad_norm": 1.7508606986515263, + "language_loss": 0.75239515, + "learning_rate": 5.886435545946455e-08, + "loss": 0.77361131, + "num_input_tokens_seen": 331954625, + "step": 15383, + "time_per_iteration": 2.6861977577209473 + }, + { + "auxiliary_loss_clip": 0.01084507, + "auxiliary_loss_mlp": 0.01026683, + "balance_loss_clip": 1.03370142, + "balance_loss_mlp": 1.0149585, + "epoch": 0.9249361190440403, + "flos": 25447271016960.0, + "grad_norm": 1.6935679354612814, + "language_loss": 0.75976408, + "learning_rate": 5.8770599005028456e-08, + "loss": 0.78087592, + "num_input_tokens_seen": 331975865, + "step": 15384, + "time_per_iteration": 2.7150700092315674 + }, + { + "auxiliary_loss_clip": 0.01075864, + "auxiliary_loss_mlp": 0.01031764, + "balance_loss_clip": 1.03653836, + "balance_loss_mlp": 1.01944935, + "epoch": 0.9249962422967083, + "flos": 12378623886720.0, + "grad_norm": 3.7142425878175223, + "language_loss": 0.66128278, + "learning_rate": 5.8676916162206045e-08, + "loss": 0.68235904, + "num_input_tokens_seen": 331992760, + "step": 15385, + "time_per_iteration": 2.7027785778045654 + }, + { + "auxiliary_loss_clip": 0.01106721, + "auxiliary_loss_mlp": 0.01033952, + "balance_loss_clip": 1.03535783, + "balance_loss_mlp": 1.02156663, + "epoch": 0.9250563655493762, + "flos": 22929430343040.0, + "grad_norm": 1.9713792057532418, + "language_loss": 0.81076729, + "learning_rate": 5.85833069345496e-08, + "loss": 0.83217394, + "num_input_tokens_seen": 332011890, + "step": 15386, + "time_per_iteration": 2.5687849521636963 + }, + { + "auxiliary_loss_clip": 0.01094924, + "auxiliary_loss_mlp": 0.0103698, + "balance_loss_clip": 1.03508687, + "balance_loss_mlp": 1.02440369, + "epoch": 0.9251164888020442, + "flos": 18478662065280.0, + "grad_norm": 1.617640933108123, + "language_loss": 0.75817406, + "learning_rate": 5.8489771325608504e-08, + "loss": 0.77949309, + "num_input_tokens_seen": 332029485, + "step": 15387, + "time_per_iteration": 2.6368582248687744 + }, + { + "auxiliary_loss_clip": 0.0109213, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.03527617, + "balance_loss_mlp": 1.02329326, + "epoch": 0.9251766120547121, + "flos": 33037062796800.0, + "grad_norm": 1.4014991534530432, + "language_loss": 0.700683, + "learning_rate": 5.839630933893014e-08, + "loss": 0.72195256, + "num_input_tokens_seen": 332052970, + "step": 15388, + "time_per_iteration": 2.754608392715454 + }, + { + "auxiliary_loss_clip": 0.01097095, + "auxiliary_loss_mlp": 0.01030709, + "balance_loss_clip": 1.03522015, + "balance_loss_mlp": 1.01837683, + "epoch": 0.9252367353073802, + "flos": 24387906176640.0, + "grad_norm": 2.101057031092642, + "language_loss": 0.82379329, + "learning_rate": 5.8302920978058115e-08, + "loss": 0.84507132, + "num_input_tokens_seen": 332070395, + "step": 15389, + "time_per_iteration": 2.6602766513824463 + }, + { + "auxiliary_loss_clip": 0.01104924, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.03799844, + "balance_loss_mlp": 1.01822543, + "epoch": 0.9252968585600481, + "flos": 18916844077440.0, + "grad_norm": 1.6739329388921937, + "language_loss": 0.79294932, + "learning_rate": 5.820960624653381e-08, + "loss": 0.81431639, + "num_input_tokens_seen": 332090185, + "step": 15390, + "time_per_iteration": 2.623624563217163 + }, + { + "auxiliary_loss_clip": 0.01076005, + "auxiliary_loss_mlp": 0.01039788, + "balance_loss_clip": 1.03474212, + "balance_loss_mlp": 1.02668691, + "epoch": 0.9253569818127161, + "flos": 21725345606400.0, + "grad_norm": 1.640565709280766, + "language_loss": 0.75278354, + "learning_rate": 5.811636514789597e-08, + "loss": 0.77394152, + "num_input_tokens_seen": 332109050, + "step": 15391, + "time_per_iteration": 2.6627962589263916 + }, + { + "auxiliary_loss_clip": 0.0108717, + "auxiliary_loss_mlp": 0.01033982, + "balance_loss_clip": 1.03444612, + "balance_loss_mlp": 1.02011776, + "epoch": 0.925417105065384, + "flos": 34240357434240.0, + "grad_norm": 2.199534306893137, + "language_loss": 0.52898717, + "learning_rate": 5.80231976856802e-08, + "loss": 0.55019867, + "num_input_tokens_seen": 332131180, + "step": 15392, + "time_per_iteration": 2.780339479446411 + }, + { + "auxiliary_loss_clip": 0.01106895, + "auxiliary_loss_mlp": 0.01032462, + "balance_loss_clip": 1.03467309, + "balance_loss_mlp": 1.02046967, + "epoch": 0.925477228318052, + "flos": 25959536830080.0, + "grad_norm": 2.305545825428345, + "language_loss": 0.77058631, + "learning_rate": 5.7930103863419454e-08, + "loss": 0.79197991, + "num_input_tokens_seen": 332149555, + "step": 15393, + "time_per_iteration": 2.602755069732666 + }, + { + "auxiliary_loss_clip": 0.01078205, + "auxiliary_loss_mlp": 0.01032904, + "balance_loss_clip": 1.03438604, + "balance_loss_mlp": 1.02044106, + "epoch": 0.9255373515707199, + "flos": 11838240702720.0, + "grad_norm": 3.40082651052626, + "language_loss": 0.69679272, + "learning_rate": 5.783708368464357e-08, + "loss": 0.71790373, + "num_input_tokens_seen": 332165830, + "step": 15394, + "time_per_iteration": 2.6185452938079834 + }, + { + "auxiliary_loss_clip": 0.0110989, + "auxiliary_loss_mlp": 0.01030861, + "balance_loss_clip": 1.0379678, + "balance_loss_mlp": 1.01829052, + "epoch": 0.925597474823388, + "flos": 21434325615360.0, + "grad_norm": 1.7473180632451621, + "language_loss": 0.72795445, + "learning_rate": 5.7744137152879956e-08, + "loss": 0.74936193, + "num_input_tokens_seen": 332185130, + "step": 15395, + "time_per_iteration": 2.6088504791259766 + }, + { + "auxiliary_loss_clip": 0.01057286, + "auxiliary_loss_mlp": 0.01032502, + "balance_loss_clip": 1.03256583, + "balance_loss_mlp": 1.02103376, + "epoch": 0.925657598076056, + "flos": 22857573185280.0, + "grad_norm": 1.8404815888609334, + "language_loss": 0.71465933, + "learning_rate": 5.7651264271653785e-08, + "loss": 0.7355572, + "num_input_tokens_seen": 332203695, + "step": 15396, + "time_per_iteration": 2.7053260803222656 + }, + { + "auxiliary_loss_clip": 0.01106531, + "auxiliary_loss_mlp": 0.01030786, + "balance_loss_clip": 1.03571641, + "balance_loss_mlp": 1.01807809, + "epoch": 0.9257177213287239, + "flos": 25704032411520.0, + "grad_norm": 1.6857437416132761, + "language_loss": 0.87266874, + "learning_rate": 5.755846504448603e-08, + "loss": 0.8940419, + "num_input_tokens_seen": 332224850, + "step": 15397, + "time_per_iteration": 2.5987706184387207 + }, + { + "auxiliary_loss_clip": 0.01027242, + "auxiliary_loss_mlp": 0.00998861, + "balance_loss_clip": 1.00477362, + "balance_loss_mlp": 0.9978596, + "epoch": 0.9257778445813919, + "flos": 59592933221760.0, + "grad_norm": 0.80472899949222, + "language_loss": 0.55124933, + "learning_rate": 5.746573947489586e-08, + "loss": 0.57151037, + "num_input_tokens_seen": 332278085, + "step": 15398, + "time_per_iteration": 3.022796869277954 + }, + { + "auxiliary_loss_clip": 0.01088846, + "auxiliary_loss_mlp": 0.01032308, + "balance_loss_clip": 1.03451252, + "balance_loss_mlp": 1.01805639, + "epoch": 0.9258379678340598, + "flos": 27709427704320.0, + "grad_norm": 1.9788973951742164, + "language_loss": 0.76231545, + "learning_rate": 5.7373087566400025e-08, + "loss": 0.78352696, + "num_input_tokens_seen": 332297875, + "step": 15399, + "time_per_iteration": 2.6782071590423584 + }, + { + "auxiliary_loss_clip": 0.01077436, + "auxiliary_loss_mlp": 0.01029493, + "balance_loss_clip": 1.03120267, + "balance_loss_mlp": 1.01828766, + "epoch": 0.9258980910867278, + "flos": 24863543095680.0, + "grad_norm": 1.6019049009837816, + "language_loss": 0.78070617, + "learning_rate": 5.7280509322510826e-08, + "loss": 0.8017754, + "num_input_tokens_seen": 332318500, + "step": 15400, + "time_per_iteration": 2.6918084621429443 + }, + { + "auxiliary_loss_clip": 0.01019125, + "auxiliary_loss_mlp": 0.01002511, + "balance_loss_clip": 1.00581372, + "balance_loss_mlp": 1.00144386, + "epoch": 0.9259582143393957, + "flos": 63134587249920.0, + "grad_norm": 0.7229223052902047, + "language_loss": 0.51348114, + "learning_rate": 5.718800474673946e-08, + "loss": 0.53369749, + "num_input_tokens_seen": 332381980, + "step": 15401, + "time_per_iteration": 4.655211448669434 + }, + { + "auxiliary_loss_clip": 0.01095608, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.03721333, + "balance_loss_mlp": 1.02316141, + "epoch": 0.9260183375920638, + "flos": 24127122458880.0, + "grad_norm": 1.7293202030573633, + "language_loss": 0.8252185, + "learning_rate": 5.709557384259378e-08, + "loss": 0.84652448, + "num_input_tokens_seen": 332399510, + "step": 15402, + "time_per_iteration": 2.7125723361968994 + }, + { + "auxiliary_loss_clip": 0.01027546, + "auxiliary_loss_mlp": 0.01001395, + "balance_loss_clip": 1.00508666, + "balance_loss_mlp": 1.0004828, + "epoch": 0.9260784608447317, + "flos": 63042872849280.0, + "grad_norm": 0.7337152983858821, + "language_loss": 0.51039803, + "learning_rate": 5.700321661357876e-08, + "loss": 0.53068745, + "num_input_tokens_seen": 332459130, + "step": 15403, + "time_per_iteration": 4.670861005783081 + }, + { + "auxiliary_loss_clip": 0.01007604, + "auxiliary_loss_mlp": 0.01001265, + "balance_loss_clip": 1.00503612, + "balance_loss_mlp": 1.0004487, + "epoch": 0.9261385840973997, + "flos": 70585979927040.0, + "grad_norm": 0.6850090598665447, + "language_loss": 0.58746749, + "learning_rate": 5.69109330631965e-08, + "loss": 0.60755622, + "num_input_tokens_seen": 332526555, + "step": 15404, + "time_per_iteration": 4.747823238372803 + }, + { + "auxiliary_loss_clip": 0.01083395, + "auxiliary_loss_mlp": 0.01035934, + "balance_loss_clip": 1.03603053, + "balance_loss_mlp": 1.02242208, + "epoch": 0.9261987073500676, + "flos": 20229917656320.0, + "grad_norm": 2.2086830252227903, + "language_loss": 0.71290517, + "learning_rate": 5.681872319494596e-08, + "loss": 0.73409843, + "num_input_tokens_seen": 332544005, + "step": 15405, + "time_per_iteration": 2.6791491508483887 + }, + { + "auxiliary_loss_clip": 0.01061911, + "auxiliary_loss_mlp": 0.01037759, + "balance_loss_clip": 1.03471172, + "balance_loss_mlp": 1.02468121, + "epoch": 0.9262588306027356, + "flos": 20954163582720.0, + "grad_norm": 1.7027063262346462, + "language_loss": 0.68240035, + "learning_rate": 5.672658701232458e-08, + "loss": 0.70339704, + "num_input_tokens_seen": 332563070, + "step": 15406, + "time_per_iteration": 4.413249731063843 + }, + { + "auxiliary_loss_clip": 0.01056836, + "auxiliary_loss_mlp": 0.01046779, + "balance_loss_clip": 1.03164291, + "balance_loss_mlp": 1.03166914, + "epoch": 0.9263189538554035, + "flos": 22158679282560.0, + "grad_norm": 3.2449194361819234, + "language_loss": 0.76197219, + "learning_rate": 5.663452451882555e-08, + "loss": 0.78300834, + "num_input_tokens_seen": 332579620, + "step": 15407, + "time_per_iteration": 2.7800557613372803 + }, + { + "auxiliary_loss_clip": 0.01076765, + "auxiliary_loss_mlp": 0.01037766, + "balance_loss_clip": 1.03282285, + "balance_loss_mlp": 1.02446783, + "epoch": 0.9263790771080715, + "flos": 18187211111040.0, + "grad_norm": 1.9744947993410575, + "language_loss": 0.72376311, + "learning_rate": 5.6542535717940096e-08, + "loss": 0.74490839, + "num_input_tokens_seen": 332597795, + "step": 15408, + "time_per_iteration": 2.654872179031372 + }, + { + "auxiliary_loss_clip": 0.01077908, + "auxiliary_loss_mlp": 0.01028464, + "balance_loss_clip": 1.03418171, + "balance_loss_mlp": 1.01781821, + "epoch": 0.9264392003607396, + "flos": 48178545004800.0, + "grad_norm": 1.6454364766131493, + "language_loss": 0.68587399, + "learning_rate": 5.645062061315675e-08, + "loss": 0.70693767, + "num_input_tokens_seen": 332620375, + "step": 15409, + "time_per_iteration": 2.850269317626953 + }, + { + "auxiliary_loss_clip": 0.01074672, + "auxiliary_loss_mlp": 0.01030839, + "balance_loss_clip": 1.03626847, + "balance_loss_mlp": 1.01791096, + "epoch": 0.9264993236134075, + "flos": 26389458714240.0, + "grad_norm": 2.1360446991021416, + "language_loss": 0.75711519, + "learning_rate": 5.6358779207960506e-08, + "loss": 0.77817023, + "num_input_tokens_seen": 332639510, + "step": 15410, + "time_per_iteration": 2.7220871448516846 + }, + { + "auxiliary_loss_clip": 0.01057013, + "auxiliary_loss_mlp": 0.01030911, + "balance_loss_clip": 1.03571475, + "balance_loss_mlp": 1.01858473, + "epoch": 0.9265594468660755, + "flos": 20920084554240.0, + "grad_norm": 1.538032014217413, + "language_loss": 0.82166702, + "learning_rate": 5.6267011505833905e-08, + "loss": 0.84254622, + "num_input_tokens_seen": 332658350, + "step": 15411, + "time_per_iteration": 2.7539865970611572 + }, + { + "auxiliary_loss_clip": 0.01085605, + "auxiliary_loss_mlp": 0.01037973, + "balance_loss_clip": 1.03824568, + "balance_loss_mlp": 1.02592707, + "epoch": 0.9266195701187434, + "flos": 17525017929600.0, + "grad_norm": 1.9373145629003894, + "language_loss": 0.75171757, + "learning_rate": 5.617531751025728e-08, + "loss": 0.77295339, + "num_input_tokens_seen": 332676715, + "step": 15412, + "time_per_iteration": 2.6214661598205566 + }, + { + "auxiliary_loss_clip": 0.0110647, + "auxiliary_loss_mlp": 0.01029847, + "balance_loss_clip": 1.03467417, + "balance_loss_mlp": 1.01769996, + "epoch": 0.9266796933714114, + "flos": 33688733293440.0, + "grad_norm": 1.8589935579070962, + "language_loss": 0.66795665, + "learning_rate": 5.6083697224707406e-08, + "loss": 0.68931985, + "num_input_tokens_seen": 332701470, + "step": 15413, + "time_per_iteration": 2.690272808074951 + }, + { + "auxiliary_loss_clip": 0.0105034, + "auxiliary_loss_mlp": 0.0103768, + "balance_loss_clip": 1.0341413, + "balance_loss_mlp": 1.0243938, + "epoch": 0.9267398166240793, + "flos": 18916520855040.0, + "grad_norm": 2.0080072670412794, + "language_loss": 0.76213551, + "learning_rate": 5.5992150652658167e-08, + "loss": 0.78301573, + "num_input_tokens_seen": 332719060, + "step": 15414, + "time_per_iteration": 2.858206033706665 + }, + { + "auxiliary_loss_clip": 0.01094062, + "auxiliary_loss_mlp": 0.01028504, + "balance_loss_clip": 1.03724313, + "balance_loss_mlp": 1.01658273, + "epoch": 0.9267999398767474, + "flos": 20478957626880.0, + "grad_norm": 3.161190355469832, + "language_loss": 0.81600469, + "learning_rate": 5.59006777975819e-08, + "loss": 0.83723032, + "num_input_tokens_seen": 332736345, + "step": 15415, + "time_per_iteration": 2.6205687522888184 + }, + { + "auxiliary_loss_clip": 0.01086858, + "auxiliary_loss_mlp": 0.01034472, + "balance_loss_clip": 1.03274202, + "balance_loss_mlp": 1.02163887, + "epoch": 0.9268600631294153, + "flos": 24789351553920.0, + "grad_norm": 1.3966462489262188, + "language_loss": 0.54340827, + "learning_rate": 5.580927866294671e-08, + "loss": 0.56462157, + "num_input_tokens_seen": 332756270, + "step": 15416, + "time_per_iteration": 2.73563814163208 + }, + { + "auxiliary_loss_clip": 0.01067608, + "auxiliary_loss_mlp": 0.01035609, + "balance_loss_clip": 1.03344822, + "balance_loss_mlp": 1.02311552, + "epoch": 0.9269201863820833, + "flos": 18697178453760.0, + "grad_norm": 1.5074603678728897, + "language_loss": 0.72186983, + "learning_rate": 5.571795325221807e-08, + "loss": 0.74290192, + "num_input_tokens_seen": 332775185, + "step": 15417, + "time_per_iteration": 2.715012788772583 + }, + { + "auxiliary_loss_clip": 0.01094578, + "auxiliary_loss_mlp": 0.01033031, + "balance_loss_clip": 1.03809214, + "balance_loss_mlp": 1.02029991, + "epoch": 0.9269803096347512, + "flos": 20923999136640.0, + "grad_norm": 4.4376132167371365, + "language_loss": 0.7579149, + "learning_rate": 5.5626701568859624e-08, + "loss": 0.77919102, + "num_input_tokens_seen": 332794320, + "step": 15418, + "time_per_iteration": 2.6377668380737305 + }, + { + "auxiliary_loss_clip": 0.01095083, + "auxiliary_loss_mlp": 0.01030369, + "balance_loss_clip": 1.03478622, + "balance_loss_mlp": 1.01755381, + "epoch": 0.9270404328874192, + "flos": 28002710252160.0, + "grad_norm": 1.4381586641081634, + "language_loss": 0.76076263, + "learning_rate": 5.553552361633174e-08, + "loss": 0.78201711, + "num_input_tokens_seen": 332818095, + "step": 15419, + "time_per_iteration": 2.7292606830596924 + }, + { + "auxiliary_loss_clip": 0.01104418, + "auxiliary_loss_mlp": 0.0103305, + "balance_loss_clip": 1.03501427, + "balance_loss_mlp": 1.02151632, + "epoch": 0.9271005561400871, + "flos": 25889870401920.0, + "grad_norm": 1.6372901887386972, + "language_loss": 0.75610423, + "learning_rate": 5.5444419398091636e-08, + "loss": 0.77747887, + "num_input_tokens_seen": 332839860, + "step": 15420, + "time_per_iteration": 2.6438181400299072 + }, + { + "auxiliary_loss_clip": 0.01099967, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.03629184, + "balance_loss_mlp": 1.01973283, + "epoch": 0.9271606793927551, + "flos": 27053914452480.0, + "grad_norm": 1.6035461883801339, + "language_loss": 0.77056849, + "learning_rate": 5.535338891759389e-08, + "loss": 0.79189527, + "num_input_tokens_seen": 332861155, + "step": 15421, + "time_per_iteration": 2.6203770637512207 + }, + { + "auxiliary_loss_clip": 0.0108251, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.03615677, + "balance_loss_mlp": 1.0196619, + "epoch": 0.9272208026454232, + "flos": 26209869690240.0, + "grad_norm": 2.078534179324168, + "language_loss": 0.72883129, + "learning_rate": 5.526243217829041e-08, + "loss": 0.74997723, + "num_input_tokens_seen": 332881110, + "step": 15422, + "time_per_iteration": 2.700307607650757 + }, + { + "auxiliary_loss_clip": 0.01099345, + "auxiliary_loss_mlp": 0.01039891, + "balance_loss_clip": 1.03540778, + "balance_loss_mlp": 1.0265038, + "epoch": 0.9272809258980911, + "flos": 12458453863680.0, + "grad_norm": 1.8814873155718879, + "language_loss": 0.77395117, + "learning_rate": 5.517154918363065e-08, + "loss": 0.79534352, + "num_input_tokens_seen": 332899350, + "step": 15423, + "time_per_iteration": 2.7268893718719482 + }, + { + "auxiliary_loss_clip": 0.01099209, + "auxiliary_loss_mlp": 0.01033057, + "balance_loss_clip": 1.03573775, + "balance_loss_mlp": 1.01999736, + "epoch": 0.9273410491507591, + "flos": 22856890826880.0, + "grad_norm": 2.3478977381899364, + "language_loss": 0.75240654, + "learning_rate": 5.508073993706053e-08, + "loss": 0.77372921, + "num_input_tokens_seen": 332918105, + "step": 15424, + "time_per_iteration": 2.6554524898529053 + }, + { + "auxiliary_loss_clip": 0.01019493, + "auxiliary_loss_mlp": 0.01002831, + "balance_loss_clip": 1.006253, + "balance_loss_mlp": 1.00180626, + "epoch": 0.927401172403427, + "flos": 47665384329600.0, + "grad_norm": 0.7785886890233412, + "language_loss": 0.60644341, + "learning_rate": 5.499000444202351e-08, + "loss": 0.62666667, + "num_input_tokens_seen": 332969490, + "step": 15425, + "time_per_iteration": 2.9746127128601074 + }, + { + "auxiliary_loss_clip": 0.01086691, + "auxiliary_loss_mlp": 0.00770701, + "balance_loss_clip": 1.03668869, + "balance_loss_mlp": 1.00019503, + "epoch": 0.927461295656095, + "flos": 29972374490880.0, + "grad_norm": 1.4170561273174695, + "language_loss": 0.70912516, + "learning_rate": 5.489934270196106e-08, + "loss": 0.72769904, + "num_input_tokens_seen": 332988805, + "step": 15426, + "time_per_iteration": 2.7353477478027344 + }, + { + "auxiliary_loss_clip": 0.01083876, + "auxiliary_loss_mlp": 0.01027564, + "balance_loss_clip": 1.03567636, + "balance_loss_mlp": 1.01585722, + "epoch": 0.9275214189087629, + "flos": 20375427651840.0, + "grad_norm": 1.8095946188152212, + "language_loss": 0.82924026, + "learning_rate": 5.480875472030977e-08, + "loss": 0.85035467, + "num_input_tokens_seen": 333007960, + "step": 15427, + "time_per_iteration": 2.6290063858032227 + }, + { + "auxiliary_loss_clip": 0.01074923, + "auxiliary_loss_mlp": 0.01034079, + "balance_loss_clip": 1.03522468, + "balance_loss_mlp": 1.02114439, + "epoch": 0.927581542161431, + "flos": 22383193242240.0, + "grad_norm": 1.5641364814856114, + "language_loss": 0.77063322, + "learning_rate": 5.471824050050555e-08, + "loss": 0.79172319, + "num_input_tokens_seen": 333026035, + "step": 15428, + "time_per_iteration": 2.7724509239196777 + }, + { + "auxiliary_loss_clip": 0.01068711, + "auxiliary_loss_mlp": 0.01034095, + "balance_loss_clip": 1.03291845, + "balance_loss_mlp": 1.02194142, + "epoch": 0.9276416654140989, + "flos": 23952453598080.0, + "grad_norm": 1.8224763848392078, + "language_loss": 0.74805522, + "learning_rate": 5.4627800045980555e-08, + "loss": 0.76908326, + "num_input_tokens_seen": 333045590, + "step": 15429, + "time_per_iteration": 2.859591245651245 + }, + { + "auxiliary_loss_clip": 0.01070146, + "auxiliary_loss_mlp": 0.01033725, + "balance_loss_clip": 1.03224564, + "balance_loss_mlp": 1.02171516, + "epoch": 0.9277017886667669, + "flos": 13917719796480.0, + "grad_norm": 1.7936478622179974, + "language_loss": 0.74859536, + "learning_rate": 5.45374333601647e-08, + "loss": 0.76963401, + "num_input_tokens_seen": 333063355, + "step": 15430, + "time_per_iteration": 2.7022671699523926 + }, + { + "auxiliary_loss_clip": 0.01097528, + "auxiliary_loss_mlp": 0.0103505, + "balance_loss_clip": 1.03492427, + "balance_loss_mlp": 1.02224135, + "epoch": 0.9277619119194348, + "flos": 35666478092160.0, + "grad_norm": 1.3597069220305837, + "language_loss": 0.76239693, + "learning_rate": 5.444714044648391e-08, + "loss": 0.78372276, + "num_input_tokens_seen": 333088045, + "step": 15431, + "time_per_iteration": 2.7746591567993164 + }, + { + "auxiliary_loss_clip": 0.01095653, + "auxiliary_loss_mlp": 0.01030326, + "balance_loss_clip": 1.03667474, + "balance_loss_mlp": 1.01806593, + "epoch": 0.9278220351721028, + "flos": 23841238112640.0, + "grad_norm": 1.8615596661189457, + "language_loss": 0.70812196, + "learning_rate": 5.4356921308363e-08, + "loss": 0.72938174, + "num_input_tokens_seen": 333108005, + "step": 15432, + "time_per_iteration": 2.6617555618286133 + }, + { + "auxiliary_loss_clip": 0.01063577, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.03703666, + "balance_loss_mlp": 1.02040219, + "epoch": 0.9278821584247707, + "flos": 15228135768960.0, + "grad_norm": 2.1745322103620044, + "language_loss": 0.81965214, + "learning_rate": 5.4266775949222354e-08, + "loss": 0.84061331, + "num_input_tokens_seen": 333124335, + "step": 15433, + "time_per_iteration": 2.669423818588257 + }, + { + "auxiliary_loss_clip": 0.01104445, + "auxiliary_loss_mlp": 0.01028126, + "balance_loss_clip": 1.03630841, + "balance_loss_mlp": 1.01734364, + "epoch": 0.9279422816774388, + "flos": 24681404206080.0, + "grad_norm": 1.9129971221663375, + "language_loss": 0.66100991, + "learning_rate": 5.417670437248056e-08, + "loss": 0.68233562, + "num_input_tokens_seen": 333143995, + "step": 15434, + "time_per_iteration": 2.5970053672790527 + }, + { + "auxiliary_loss_clip": 0.01077405, + "auxiliary_loss_mlp": 0.01030104, + "balance_loss_clip": 1.03205669, + "balance_loss_mlp": 1.0184691, + "epoch": 0.9280024049301068, + "flos": 19169188099200.0, + "grad_norm": 1.6749341071635755, + "language_loss": 0.68276256, + "learning_rate": 5.40867065815529e-08, + "loss": 0.70383763, + "num_input_tokens_seen": 333162805, + "step": 15435, + "time_per_iteration": 2.6586270332336426 + }, + { + "auxiliary_loss_clip": 0.0110958, + "auxiliary_loss_mlp": 0.01031502, + "balance_loss_clip": 1.03709114, + "balance_loss_mlp": 1.01851392, + "epoch": 0.9280625281827747, + "flos": 11393701983360.0, + "grad_norm": 1.9176254237254773, + "language_loss": 0.72329485, + "learning_rate": 5.399678257985263e-08, + "loss": 0.74470568, + "num_input_tokens_seen": 333175770, + "step": 15436, + "time_per_iteration": 2.5913889408111572 + }, + { + "auxiliary_loss_clip": 0.01083394, + "auxiliary_loss_mlp": 0.01031424, + "balance_loss_clip": 1.03532553, + "balance_loss_mlp": 1.01925242, + "epoch": 0.9281226514354427, + "flos": 24785616539520.0, + "grad_norm": 1.984161101089214, + "language_loss": 0.66967964, + "learning_rate": 5.390693237078925e-08, + "loss": 0.69082779, + "num_input_tokens_seen": 333194775, + "step": 15437, + "time_per_iteration": 2.681321144104004 + }, + { + "auxiliary_loss_clip": 0.01098237, + "auxiliary_loss_mlp": 0.01033718, + "balance_loss_clip": 1.03667545, + "balance_loss_mlp": 1.02040851, + "epoch": 0.9281827746881106, + "flos": 15083128563840.0, + "grad_norm": 1.9610204855405788, + "language_loss": 0.71449178, + "learning_rate": 5.3817155957770254e-08, + "loss": 0.73581135, + "num_input_tokens_seen": 333208920, + "step": 15438, + "time_per_iteration": 2.6930477619171143 + }, + { + "auxiliary_loss_clip": 0.01108794, + "auxiliary_loss_mlp": 0.01029773, + "balance_loss_clip": 1.03654385, + "balance_loss_mlp": 1.01757753, + "epoch": 0.9282428979407786, + "flos": 24135059364480.0, + "grad_norm": 1.5819700006479365, + "language_loss": 0.64896679, + "learning_rate": 5.3727453344199366e-08, + "loss": 0.67035246, + "num_input_tokens_seen": 333229350, + "step": 15439, + "time_per_iteration": 2.6033389568328857 + }, + { + "auxiliary_loss_clip": 0.01085049, + "auxiliary_loss_mlp": 0.01030625, + "balance_loss_clip": 1.0345124, + "balance_loss_mlp": 1.01829338, + "epoch": 0.9283030211934465, + "flos": 24823215100800.0, + "grad_norm": 1.6494013213160663, + "language_loss": 0.70283854, + "learning_rate": 5.363782453347876e-08, + "loss": 0.72399533, + "num_input_tokens_seen": 333246125, + "step": 15440, + "time_per_iteration": 4.2000510692596436 + }, + { + "auxiliary_loss_clip": 0.01072935, + "auxiliary_loss_mlp": 0.00771755, + "balance_loss_clip": 1.03401041, + "balance_loss_mlp": 1.00015736, + "epoch": 0.9283631444461146, + "flos": 23981037845760.0, + "grad_norm": 1.6732819714869454, + "language_loss": 0.76933122, + "learning_rate": 5.354826952900682e-08, + "loss": 0.78777814, + "num_input_tokens_seen": 333263685, + "step": 15441, + "time_per_iteration": 2.6747872829437256 + }, + { + "auxiliary_loss_clip": 0.01091447, + "auxiliary_loss_mlp": 0.01029564, + "balance_loss_clip": 1.03517056, + "balance_loss_mlp": 1.01912725, + "epoch": 0.9284232676987825, + "flos": 22784530878720.0, + "grad_norm": 2.5725907020773073, + "language_loss": 0.64269817, + "learning_rate": 5.345878833417949e-08, + "loss": 0.6639083, + "num_input_tokens_seen": 333282435, + "step": 15442, + "time_per_iteration": 4.172094106674194 + }, + { + "auxiliary_loss_clip": 0.01064932, + "auxiliary_loss_mlp": 0.01047387, + "balance_loss_clip": 1.03302109, + "balance_loss_mlp": 1.03327239, + "epoch": 0.9284833909514505, + "flos": 19500500171520.0, + "grad_norm": 1.9244730176476685, + "language_loss": 0.80415273, + "learning_rate": 5.3369380952390295e-08, + "loss": 0.8252759, + "num_input_tokens_seen": 333300400, + "step": 15443, + "time_per_iteration": 4.384172201156616 + }, + { + "auxiliary_loss_clip": 0.01098927, + "auxiliary_loss_mlp": 0.00770098, + "balance_loss_clip": 1.03640699, + "balance_loss_mlp": 1.00019741, + "epoch": 0.9285435142041184, + "flos": 23185976256000.0, + "grad_norm": 2.3934730189580278, + "language_loss": 0.6569308, + "learning_rate": 5.328004738702896e-08, + "loss": 0.67562109, + "num_input_tokens_seen": 333318980, + "step": 15444, + "time_per_iteration": 2.6536576747894287 + }, + { + "auxiliary_loss_clip": 0.01066958, + "auxiliary_loss_mlp": 0.01030149, + "balance_loss_clip": 1.03394997, + "balance_loss_mlp": 1.01776958, + "epoch": 0.9286036374567864, + "flos": 17675519915520.0, + "grad_norm": 1.9812585053043275, + "language_loss": 0.73909259, + "learning_rate": 5.3190787641483215e-08, + "loss": 0.76006365, + "num_input_tokens_seen": 333334135, + "step": 15445, + "time_per_iteration": 4.150733947753906 + }, + { + "auxiliary_loss_clip": 0.0109372, + "auxiliary_loss_mlp": 0.01039239, + "balance_loss_clip": 1.03644156, + "balance_loss_mlp": 1.02563095, + "epoch": 0.9286637607094543, + "flos": 20886687884160.0, + "grad_norm": 1.737150069171567, + "language_loss": 0.71495092, + "learning_rate": 5.3101601719138135e-08, + "loss": 0.7362805, + "num_input_tokens_seen": 333353325, + "step": 15446, + "time_per_iteration": 2.6059980392456055 + }, + { + "auxiliary_loss_clip": 0.0105076, + "auxiliary_loss_mlp": 0.01033839, + "balance_loss_clip": 1.0349431, + "balance_loss_mlp": 1.02085745, + "epoch": 0.9287238839621224, + "flos": 19026012487680.0, + "grad_norm": 1.7396993406776888, + "language_loss": 0.69318455, + "learning_rate": 5.301248962337523e-08, + "loss": 0.71403056, + "num_input_tokens_seen": 333371110, + "step": 15447, + "time_per_iteration": 2.785006046295166 + }, + { + "auxiliary_loss_clip": 0.01101911, + "auxiliary_loss_mlp": 0.01030242, + "balance_loss_clip": 1.03475642, + "balance_loss_mlp": 1.01898217, + "epoch": 0.9287840072147904, + "flos": 20557027837440.0, + "grad_norm": 1.5415290454981314, + "language_loss": 0.72406214, + "learning_rate": 5.292345135757403e-08, + "loss": 0.74538368, + "num_input_tokens_seen": 333391420, + "step": 15448, + "time_per_iteration": 2.5804696083068848 + }, + { + "auxiliary_loss_clip": 0.01108235, + "auxiliary_loss_mlp": 0.01028889, + "balance_loss_clip": 1.03618634, + "balance_loss_mlp": 1.01485801, + "epoch": 0.9288441304674583, + "flos": 21250822008960.0, + "grad_norm": 1.5546568234640936, + "language_loss": 0.74195588, + "learning_rate": 5.283448692511072e-08, + "loss": 0.76332712, + "num_input_tokens_seen": 333410365, + "step": 15449, + "time_per_iteration": 2.5949409008026123 + }, + { + "auxiliary_loss_clip": 0.01108056, + "auxiliary_loss_mlp": 0.00770321, + "balance_loss_clip": 1.03558385, + "balance_loss_mlp": 1.00032389, + "epoch": 0.9289042537201263, + "flos": 27669853895040.0, + "grad_norm": 1.737092405076848, + "language_loss": 0.67934465, + "learning_rate": 5.27455963293586e-08, + "loss": 0.69812846, + "num_input_tokens_seen": 333430000, + "step": 15450, + "time_per_iteration": 2.666686773300171 + }, + { + "auxiliary_loss_clip": 0.01076756, + "auxiliary_loss_mlp": 0.01028132, + "balance_loss_clip": 1.03465009, + "balance_loss_mlp": 1.01593733, + "epoch": 0.9289643769727942, + "flos": 19317750750720.0, + "grad_norm": 2.158761151214465, + "language_loss": 0.71885049, + "learning_rate": 5.265677957368875e-08, + "loss": 0.73989934, + "num_input_tokens_seen": 333445800, + "step": 15451, + "time_per_iteration": 2.7205255031585693 + }, + { + "auxiliary_loss_clip": 0.01083407, + "auxiliary_loss_mlp": 0.010432, + "balance_loss_clip": 1.033409, + "balance_loss_mlp": 1.03013515, + "epoch": 0.9290245002254622, + "flos": 14058058233600.0, + "grad_norm": 1.931438285562406, + "language_loss": 0.732077, + "learning_rate": 5.25680366614687e-08, + "loss": 0.75334305, + "num_input_tokens_seen": 333461550, + "step": 15452, + "time_per_iteration": 2.7509524822235107 + }, + { + "auxiliary_loss_clip": 0.01089897, + "auxiliary_loss_mlp": 0.01029462, + "balance_loss_clip": 1.03840566, + "balance_loss_mlp": 1.01675391, + "epoch": 0.9290846234781301, + "flos": 20047132321920.0, + "grad_norm": 2.107575184826188, + "language_loss": 0.74144852, + "learning_rate": 5.2479367596064196e-08, + "loss": 0.76264215, + "num_input_tokens_seen": 333478835, + "step": 15453, + "time_per_iteration": 2.7099177837371826 + }, + { + "auxiliary_loss_clip": 0.00992131, + "auxiliary_loss_mlp": 0.0100121, + "balance_loss_clip": 1.00884318, + "balance_loss_mlp": 1.00022078, + "epoch": 0.9291447467307982, + "flos": 61227514460160.0, + "grad_norm": 0.8159194207787102, + "language_loss": 0.60648072, + "learning_rate": 5.2390772380837226e-08, + "loss": 0.62641418, + "num_input_tokens_seen": 333535250, + "step": 15454, + "time_per_iteration": 3.143502950668335 + }, + { + "auxiliary_loss_clip": 0.01082676, + "auxiliary_loss_mlp": 0.01042707, + "balance_loss_clip": 1.03245759, + "balance_loss_mlp": 1.02956378, + "epoch": 0.9292048699834661, + "flos": 20553328736640.0, + "grad_norm": 1.6995805540725026, + "language_loss": 0.68820882, + "learning_rate": 5.230225101914709e-08, + "loss": 0.70946264, + "num_input_tokens_seen": 333553805, + "step": 15455, + "time_per_iteration": 2.6724045276641846 + }, + { + "auxiliary_loss_clip": 0.01063528, + "auxiliary_loss_mlp": 0.01034362, + "balance_loss_clip": 1.03471339, + "balance_loss_mlp": 1.02136862, + "epoch": 0.9292649932361341, + "flos": 23623655477760.0, + "grad_norm": 1.7980946522964238, + "language_loss": 0.64908248, + "learning_rate": 5.22138035143509e-08, + "loss": 0.67006135, + "num_input_tokens_seen": 333572800, + "step": 15456, + "time_per_iteration": 2.6736927032470703 + }, + { + "auxiliary_loss_clip": 0.01061601, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.03250432, + "balance_loss_mlp": 1.01845431, + "epoch": 0.929325116488802, + "flos": 15009942602880.0, + "grad_norm": 2.326561847657641, + "language_loss": 0.68176067, + "learning_rate": 5.2125429869802615e-08, + "loss": 0.70268989, + "num_input_tokens_seen": 333588520, + "step": 15457, + "time_per_iteration": 2.722505807876587 + }, + { + "auxiliary_loss_clip": 0.01086966, + "auxiliary_loss_mlp": 0.01029179, + "balance_loss_clip": 1.03466225, + "balance_loss_mlp": 1.01685286, + "epoch": 0.92938523974147, + "flos": 17967365919360.0, + "grad_norm": 3.4374144419000388, + "language_loss": 0.80816442, + "learning_rate": 5.203713008885291e-08, + "loss": 0.82932585, + "num_input_tokens_seen": 333603435, + "step": 15458, + "time_per_iteration": 2.7122104167938232 + }, + { + "auxiliary_loss_clip": 0.01100699, + "auxiliary_loss_mlp": 0.01034056, + "balance_loss_clip": 1.03775263, + "balance_loss_mlp": 1.02139056, + "epoch": 0.9294453629941379, + "flos": 23003047267200.0, + "grad_norm": 1.5848358931339006, + "language_loss": 0.72326326, + "learning_rate": 5.194890417485065e-08, + "loss": 0.74461079, + "num_input_tokens_seen": 333623305, + "step": 15459, + "time_per_iteration": 2.6949429512023926 + }, + { + "auxiliary_loss_clip": 0.01070452, + "auxiliary_loss_mlp": 0.01035576, + "balance_loss_clip": 1.03244114, + "balance_loss_mlp": 1.02314806, + "epoch": 0.929505486246806, + "flos": 17055234927360.0, + "grad_norm": 2.206929984070201, + "language_loss": 0.58746719, + "learning_rate": 5.1860752131141384e-08, + "loss": 0.60852748, + "num_input_tokens_seen": 333641205, + "step": 15460, + "time_per_iteration": 2.7503440380096436 + }, + { + "auxiliary_loss_clip": 0.01057483, + "auxiliary_loss_mlp": 0.01033599, + "balance_loss_clip": 1.03144884, + "balance_loss_mlp": 1.02011657, + "epoch": 0.9295656094994739, + "flos": 27340409329920.0, + "grad_norm": 1.8867552786807409, + "language_loss": 0.80609381, + "learning_rate": 5.177267396106733e-08, + "loss": 0.82700461, + "num_input_tokens_seen": 333659615, + "step": 15461, + "time_per_iteration": 2.773244857788086 + }, + { + "auxiliary_loss_clip": 0.01083444, + "auxiliary_loss_mlp": 0.01025336, + "balance_loss_clip": 1.03467631, + "balance_loss_mlp": 1.01330769, + "epoch": 0.9296257327521419, + "flos": 21470954509440.0, + "grad_norm": 2.177903881361931, + "language_loss": 0.78115839, + "learning_rate": 5.168466966796869e-08, + "loss": 0.80224615, + "num_input_tokens_seen": 333678985, + "step": 15462, + "time_per_iteration": 2.6601200103759766 + }, + { + "auxiliary_loss_clip": 0.01065181, + "auxiliary_loss_mlp": 0.0102857, + "balance_loss_clip": 1.02944088, + "balance_loss_mlp": 1.01573753, + "epoch": 0.9296858560048099, + "flos": 16362661818240.0, + "grad_norm": 1.838020182070701, + "language_loss": 0.62704271, + "learning_rate": 5.159673925518282e-08, + "loss": 0.64798021, + "num_input_tokens_seen": 333696410, + "step": 15463, + "time_per_iteration": 2.65974760055542 + }, + { + "auxiliary_loss_clip": 0.01082053, + "auxiliary_loss_mlp": 0.0103221, + "balance_loss_clip": 1.03119493, + "balance_loss_mlp": 1.020522, + "epoch": 0.9297459792574778, + "flos": 29858609139840.0, + "grad_norm": 1.45056589452118, + "language_loss": 0.70977533, + "learning_rate": 5.15088827260437e-08, + "loss": 0.73091793, + "num_input_tokens_seen": 333716615, + "step": 15464, + "time_per_iteration": 2.7430808544158936 + }, + { + "auxiliary_loss_clip": 0.01082227, + "auxiliary_loss_mlp": 0.01031071, + "balance_loss_clip": 1.0332849, + "balance_loss_mlp": 1.01884627, + "epoch": 0.9298061025101458, + "flos": 15924838942080.0, + "grad_norm": 1.7994510129894437, + "language_loss": 0.77051908, + "learning_rate": 5.1421100083883115e-08, + "loss": 0.79165208, + "num_input_tokens_seen": 333732800, + "step": 15465, + "time_per_iteration": 2.645766019821167 + }, + { + "auxiliary_loss_clip": 0.0098002, + "auxiliary_loss_mlp": 0.01003378, + "balance_loss_clip": 1.01005721, + "balance_loss_mlp": 1.00234056, + "epoch": 0.9298662257628137, + "flos": 64096994304000.0, + "grad_norm": 0.6965140655325198, + "language_loss": 0.56410694, + "learning_rate": 5.133339133202952e-08, + "loss": 0.58394086, + "num_input_tokens_seen": 333799300, + "step": 15466, + "time_per_iteration": 3.565849781036377 + }, + { + "auxiliary_loss_clip": 0.01085072, + "auxiliary_loss_mlp": 0.01039957, + "balance_loss_clip": 1.03284919, + "balance_loss_mlp": 1.02618241, + "epoch": 0.9299263490154818, + "flos": 24280210224000.0, + "grad_norm": 1.7308849012204341, + "language_loss": 0.72874355, + "learning_rate": 5.1245756473809355e-08, + "loss": 0.7499938, + "num_input_tokens_seen": 333820360, + "step": 15467, + "time_per_iteration": 3.236931800842285 + }, + { + "auxiliary_loss_clip": 0.01080183, + "auxiliary_loss_mlp": 0.01034565, + "balance_loss_clip": 1.03504908, + "balance_loss_mlp": 1.02171993, + "epoch": 0.9299864722681497, + "flos": 23294354567040.0, + "grad_norm": 1.5735167762659585, + "language_loss": 0.7158711, + "learning_rate": 5.1158195512545076e-08, + "loss": 0.73701859, + "num_input_tokens_seen": 333840415, + "step": 15468, + "time_per_iteration": 2.7365386486053467 + }, + { + "auxiliary_loss_clip": 0.01094813, + "auxiliary_loss_mlp": 0.01036159, + "balance_loss_clip": 1.0341078, + "balance_loss_mlp": 1.02179384, + "epoch": 0.9300465955208177, + "flos": 21395972868480.0, + "grad_norm": 1.7470237335941396, + "language_loss": 0.75426078, + "learning_rate": 5.107070845155737e-08, + "loss": 0.77557051, + "num_input_tokens_seen": 333859910, + "step": 15469, + "time_per_iteration": 2.7027781009674072 + }, + { + "auxiliary_loss_clip": 0.01082382, + "auxiliary_loss_mlp": 0.01034783, + "balance_loss_clip": 1.03710604, + "balance_loss_mlp": 1.02252209, + "epoch": 0.9301067187734856, + "flos": 24571445696640.0, + "grad_norm": 2.784639154051725, + "language_loss": 0.75578332, + "learning_rate": 5.098329529416379e-08, + "loss": 0.77695501, + "num_input_tokens_seen": 333880495, + "step": 15470, + "time_per_iteration": 2.7347373962402344 + }, + { + "auxiliary_loss_clip": 0.01067813, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.03560543, + "balance_loss_mlp": 1.02088356, + "epoch": 0.9301668420261536, + "flos": 22196960202240.0, + "grad_norm": 2.387489989885237, + "language_loss": 0.74822462, + "learning_rate": 5.089595604367902e-08, + "loss": 0.76922727, + "num_input_tokens_seen": 333897640, + "step": 15471, + "time_per_iteration": 2.756758213043213 + }, + { + "auxiliary_loss_clip": 0.01093505, + "auxiliary_loss_mlp": 0.01030868, + "balance_loss_clip": 1.03590918, + "balance_loss_mlp": 1.01813686, + "epoch": 0.9302269652788215, + "flos": 17747628468480.0, + "grad_norm": 3.1400649028733345, + "language_loss": 0.68857515, + "learning_rate": 5.080869070341487e-08, + "loss": 0.70981896, + "num_input_tokens_seen": 333913670, + "step": 15472, + "time_per_iteration": 2.6190030574798584 + }, + { + "auxiliary_loss_clip": 0.01078893, + "auxiliary_loss_mlp": 0.01028134, + "balance_loss_clip": 1.03282297, + "balance_loss_mlp": 1.01614726, + "epoch": 0.9302870885314896, + "flos": 19390793057280.0, + "grad_norm": 1.7830315106807422, + "language_loss": 0.88541853, + "learning_rate": 5.0721499276680233e-08, + "loss": 0.90648878, + "num_input_tokens_seen": 333934105, + "step": 15473, + "time_per_iteration": 2.5981593132019043 + }, + { + "auxiliary_loss_clip": 0.01087498, + "auxiliary_loss_mlp": 0.01036678, + "balance_loss_clip": 1.03732419, + "balance_loss_mlp": 1.0225395, + "epoch": 0.9303472117841575, + "flos": 21760286561280.0, + "grad_norm": 1.8849274973342631, + "language_loss": 0.64160311, + "learning_rate": 5.063438176678203e-08, + "loss": 0.6628449, + "num_input_tokens_seen": 333953635, + "step": 15474, + "time_per_iteration": 2.6480371952056885 + }, + { + "auxiliary_loss_clip": 0.01109387, + "auxiliary_loss_mlp": 0.01034944, + "balance_loss_clip": 1.03766966, + "balance_loss_mlp": 1.0225817, + "epoch": 0.9304073350368255, + "flos": 19609740408960.0, + "grad_norm": 1.7431064439867472, + "language_loss": 0.74580079, + "learning_rate": 5.054733817702339e-08, + "loss": 0.7672441, + "num_input_tokens_seen": 333971825, + "step": 15475, + "time_per_iteration": 2.9433109760284424 + }, + { + "auxiliary_loss_clip": 0.01094883, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.03424644, + "balance_loss_mlp": 1.01804042, + "epoch": 0.9304674582894935, + "flos": 30441582875520.0, + "grad_norm": 1.8594741529837064, + "language_loss": 0.66352129, + "learning_rate": 5.0460368510704786e-08, + "loss": 0.68476784, + "num_input_tokens_seen": 333990120, + "step": 15476, + "time_per_iteration": 2.669774293899536 + }, + { + "auxiliary_loss_clip": 0.01066383, + "auxiliary_loss_mlp": 0.01033281, + "balance_loss_clip": 1.03680027, + "balance_loss_mlp": 1.02047777, + "epoch": 0.9305275815421614, + "flos": 17785693906560.0, + "grad_norm": 1.928617647812536, + "language_loss": 0.68966222, + "learning_rate": 5.0373472771124914e-08, + "loss": 0.71065891, + "num_input_tokens_seen": 334007970, + "step": 15477, + "time_per_iteration": 2.7191553115844727 + }, + { + "auxiliary_loss_clip": 0.01087769, + "auxiliary_loss_mlp": 0.01030039, + "balance_loss_clip": 1.03722644, + "balance_loss_mlp": 1.01820755, + "epoch": 0.9305877047948294, + "flos": 25298456970240.0, + "grad_norm": 3.581472725351638, + "language_loss": 0.58644545, + "learning_rate": 5.0286650961578027e-08, + "loss": 0.60762358, + "num_input_tokens_seen": 334027120, + "step": 15478, + "time_per_iteration": 2.6942026615142822 + }, + { + "auxiliary_loss_clip": 0.01089048, + "auxiliary_loss_mlp": 0.01030623, + "balance_loss_clip": 1.03869212, + "balance_loss_mlp": 1.0165447, + "epoch": 0.9306478280474973, + "flos": 16977236544000.0, + "grad_norm": 2.150126266839501, + "language_loss": 0.78858852, + "learning_rate": 5.01999030853566e-08, + "loss": 0.80978525, + "num_input_tokens_seen": 334042785, + "step": 15479, + "time_per_iteration": 4.2738165855407715 + }, + { + "auxiliary_loss_clip": 0.01109061, + "auxiliary_loss_mlp": 0.01031407, + "balance_loss_clip": 1.03678465, + "balance_loss_mlp": 1.0195874, + "epoch": 0.9307079513001654, + "flos": 35663353608960.0, + "grad_norm": 1.63393685516193, + "language_loss": 0.6846534, + "learning_rate": 5.0113229145750445e-08, + "loss": 0.70605814, + "num_input_tokens_seen": 334063480, + "step": 15480, + "time_per_iteration": 2.7149746417999268 + }, + { + "auxiliary_loss_clip": 0.01109905, + "auxiliary_loss_mlp": 0.01031927, + "balance_loss_clip": 1.0378406, + "balance_loss_mlp": 1.01929665, + "epoch": 0.9307680745528333, + "flos": 19208151377280.0, + "grad_norm": 1.6956958650454903, + "language_loss": 0.67578673, + "learning_rate": 5.002662914604583e-08, + "loss": 0.69720507, + "num_input_tokens_seen": 334082005, + "step": 15481, + "time_per_iteration": 4.039956331253052 + }, + { + "auxiliary_loss_clip": 0.01080993, + "auxiliary_loss_mlp": 0.01032175, + "balance_loss_clip": 1.033005, + "balance_loss_mlp": 1.0192585, + "epoch": 0.9308281978055013, + "flos": 19062641381760.0, + "grad_norm": 1.8467240460111785, + "language_loss": 0.74883473, + "learning_rate": 4.994010308952701e-08, + "loss": 0.76996636, + "num_input_tokens_seen": 334101375, + "step": 15482, + "time_per_iteration": 2.6518447399139404 + }, + { + "auxiliary_loss_clip": 0.01094658, + "auxiliary_loss_mlp": 0.01028267, + "balance_loss_clip": 1.03394866, + "balance_loss_mlp": 1.01649547, + "epoch": 0.9308883210581692, + "flos": 20521548178560.0, + "grad_norm": 1.9180378851164899, + "language_loss": 0.80164105, + "learning_rate": 4.985365097947469e-08, + "loss": 0.82287037, + "num_input_tokens_seen": 334119460, + "step": 15483, + "time_per_iteration": 4.348911285400391 + }, + { + "auxiliary_loss_clip": 0.01082688, + "auxiliary_loss_mlp": 0.01033349, + "balance_loss_clip": 1.03657448, + "balance_loss_mlp": 1.02083826, + "epoch": 0.9309484443108372, + "flos": 13001422826880.0, + "grad_norm": 1.8698210896088554, + "language_loss": 0.74462926, + "learning_rate": 4.976727281916782e-08, + "loss": 0.76578963, + "num_input_tokens_seen": 334136065, + "step": 15484, + "time_per_iteration": 2.664431095123291 + }, + { + "auxiliary_loss_clip": 0.01086381, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.03691006, + "balance_loss_mlp": 1.01994252, + "epoch": 0.9310085675635051, + "flos": 12567765928320.0, + "grad_norm": 2.155944343427052, + "language_loss": 0.76539695, + "learning_rate": 4.968096861188087e-08, + "loss": 0.78658748, + "num_input_tokens_seen": 334153690, + "step": 15485, + "time_per_iteration": 4.154724597930908 + }, + { + "auxiliary_loss_clip": 0.01063188, + "auxiliary_loss_mlp": 0.01035653, + "balance_loss_clip": 1.03246868, + "balance_loss_mlp": 1.02108002, + "epoch": 0.9310686908161732, + "flos": 23477570864640.0, + "grad_norm": 2.136554797063734, + "language_loss": 0.78668422, + "learning_rate": 4.959473836088723e-08, + "loss": 0.80767262, + "num_input_tokens_seen": 334171880, + "step": 15486, + "time_per_iteration": 2.7599616050720215 + }, + { + "auxiliary_loss_clip": 0.01079287, + "auxiliary_loss_mlp": 0.01030569, + "balance_loss_clip": 1.03827739, + "balance_loss_mlp": 1.01740253, + "epoch": 0.9311288140688411, + "flos": 24170287628160.0, + "grad_norm": 1.753625770007885, + "language_loss": 0.7688942, + "learning_rate": 4.950858206945674e-08, + "loss": 0.78999281, + "num_input_tokens_seen": 334190005, + "step": 15487, + "time_per_iteration": 2.6973049640655518 + }, + { + "auxiliary_loss_clip": 0.01080553, + "auxiliary_loss_mlp": 0.01029283, + "balance_loss_clip": 1.0376873, + "balance_loss_mlp": 1.01600909, + "epoch": 0.9311889373215091, + "flos": 35590203561600.0, + "grad_norm": 2.2700820281078213, + "language_loss": 0.67102247, + "learning_rate": 4.942249974085633e-08, + "loss": 0.69212085, + "num_input_tokens_seen": 334209545, + "step": 15488, + "time_per_iteration": 2.7322824001312256 + }, + { + "auxiliary_loss_clip": 0.01083742, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.03624427, + "balance_loss_mlp": 1.01787865, + "epoch": 0.9312490605741771, + "flos": 20230528187520.0, + "grad_norm": 1.926714614555793, + "language_loss": 0.75009143, + "learning_rate": 4.933649137834983e-08, + "loss": 0.77123499, + "num_input_tokens_seen": 334228900, + "step": 15489, + "time_per_iteration": 2.5786027908325195 + }, + { + "auxiliary_loss_clip": 0.01111206, + "auxiliary_loss_mlp": 0.01032778, + "balance_loss_clip": 1.03734827, + "balance_loss_mlp": 1.01991534, + "epoch": 0.931309183826845, + "flos": 13950577762560.0, + "grad_norm": 3.9683320091837406, + "language_loss": 0.80892265, + "learning_rate": 4.925055698519931e-08, + "loss": 0.83036256, + "num_input_tokens_seen": 334245500, + "step": 15490, + "time_per_iteration": 2.5186619758605957 + }, + { + "auxiliary_loss_clip": 0.01061842, + "auxiliary_loss_mlp": 0.0103459, + "balance_loss_clip": 1.03452861, + "balance_loss_mlp": 1.02170372, + "epoch": 0.931369307079513, + "flos": 20156731695360.0, + "grad_norm": 1.6571168923434456, + "language_loss": 0.72082543, + "learning_rate": 4.9164696564663264e-08, + "loss": 0.74178976, + "num_input_tokens_seen": 334264370, + "step": 15491, + "time_per_iteration": 2.6573195457458496 + }, + { + "auxiliary_loss_clip": 0.0108057, + "auxiliary_loss_mlp": 0.0076884, + "balance_loss_clip": 1.03255057, + "balance_loss_mlp": 1.00006044, + "epoch": 0.931429430332181, + "flos": 25338569483520.0, + "grad_norm": 1.8544048731654292, + "language_loss": 0.74552429, + "learning_rate": 4.9078910119997096e-08, + "loss": 0.7640183, + "num_input_tokens_seen": 334283905, + "step": 15492, + "time_per_iteration": 2.5493483543395996 + }, + { + "auxiliary_loss_clip": 0.01019334, + "auxiliary_loss_mlp": 0.01002201, + "balance_loss_clip": 1.0056994, + "balance_loss_mlp": 1.00118196, + "epoch": 0.931489553584849, + "flos": 71226193985280.0, + "grad_norm": 0.707578892585582, + "language_loss": 0.53487962, + "learning_rate": 4.899319765445442e-08, + "loss": 0.55509502, + "num_input_tokens_seen": 334339925, + "step": 15493, + "time_per_iteration": 2.947209358215332 + }, + { + "auxiliary_loss_clip": 0.01097309, + "auxiliary_loss_mlp": 0.01033315, + "balance_loss_clip": 1.03523576, + "balance_loss_mlp": 1.02147126, + "epoch": 0.9315496768375169, + "flos": 14643653662080.0, + "grad_norm": 1.8350932820938002, + "language_loss": 0.70629972, + "learning_rate": 4.890755917128531e-08, + "loss": 0.72760594, + "num_input_tokens_seen": 334357225, + "step": 15494, + "time_per_iteration": 2.721458673477173 + }, + { + "auxiliary_loss_clip": 0.01093067, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.03598893, + "balance_loss_mlp": 1.01683998, + "epoch": 0.9316098000901849, + "flos": 28329928174080.0, + "grad_norm": 2.829925163289834, + "language_loss": 0.68261808, + "learning_rate": 4.882199467373671e-08, + "loss": 0.70384157, + "num_input_tokens_seen": 334375945, + "step": 15495, + "time_per_iteration": 2.6126840114593506 + }, + { + "auxiliary_loss_clip": 0.01104588, + "auxiliary_loss_mlp": 0.01034305, + "balance_loss_clip": 1.03474832, + "balance_loss_mlp": 1.02263463, + "epoch": 0.9316699233428528, + "flos": 28512677594880.0, + "grad_norm": 2.116367071751547, + "language_loss": 0.61655867, + "learning_rate": 4.8736504165053815e-08, + "loss": 0.63794762, + "num_input_tokens_seen": 334395310, + "step": 15496, + "time_per_iteration": 2.5984606742858887 + }, + { + "auxiliary_loss_clip": 0.01099753, + "auxiliary_loss_mlp": 0.01032832, + "balance_loss_clip": 1.03712618, + "balance_loss_mlp": 1.0196712, + "epoch": 0.9317300465955208, + "flos": 33693402061440.0, + "grad_norm": 1.4743797033954789, + "language_loss": 0.76852232, + "learning_rate": 4.865108764847825e-08, + "loss": 0.78984821, + "num_input_tokens_seen": 334416965, + "step": 15497, + "time_per_iteration": 2.694024085998535 + }, + { + "auxiliary_loss_clip": 0.01102298, + "auxiliary_loss_mlp": 0.00771221, + "balance_loss_clip": 1.0387435, + "balance_loss_mlp": 1.00019717, + "epoch": 0.9317901698481887, + "flos": 23658237296640.0, + "grad_norm": 1.6171461718999425, + "language_loss": 0.66427922, + "learning_rate": 4.856574512724898e-08, + "loss": 0.68301445, + "num_input_tokens_seen": 334435620, + "step": 15498, + "time_per_iteration": 2.617232084274292 + }, + { + "auxiliary_loss_clip": 0.01087695, + "auxiliary_loss_mlp": 0.01035624, + "balance_loss_clip": 1.03662407, + "balance_loss_mlp": 1.02294576, + "epoch": 0.9318502931008568, + "flos": 20960017499520.0, + "grad_norm": 1.6376617462499037, + "language_loss": 0.79631472, + "learning_rate": 4.8480476604602305e-08, + "loss": 0.81754798, + "num_input_tokens_seen": 334456210, + "step": 15499, + "time_per_iteration": 2.663939952850342 + }, + { + "auxiliary_loss_clip": 0.01065124, + "auxiliary_loss_mlp": 0.01033992, + "balance_loss_clip": 1.03444028, + "balance_loss_mlp": 1.02104545, + "epoch": 0.9319104163535247, + "flos": 23441049711360.0, + "grad_norm": 1.9165014592612015, + "language_loss": 0.76588839, + "learning_rate": 4.8395282083771196e-08, + "loss": 0.78687954, + "num_input_tokens_seen": 334475485, + "step": 15500, + "time_per_iteration": 2.8950650691986084 + }, + { + "auxiliary_loss_clip": 0.01075294, + "auxiliary_loss_mlp": 0.01026196, + "balance_loss_clip": 1.03517914, + "balance_loss_mlp": 1.01429939, + "epoch": 0.9319705396061927, + "flos": 22347426274560.0, + "grad_norm": 1.8007520102301227, + "language_loss": 0.72160745, + "learning_rate": 4.8310161567987064e-08, + "loss": 0.74262238, + "num_input_tokens_seen": 334494740, + "step": 15501, + "time_per_iteration": 2.671736240386963 + }, + { + "auxiliary_loss_clip": 0.0111059, + "auxiliary_loss_mlp": 0.01035234, + "balance_loss_clip": 1.0367043, + "balance_loss_mlp": 1.02227604, + "epoch": 0.9320306628588607, + "flos": 20993557824000.0, + "grad_norm": 1.6678005947570964, + "language_loss": 0.66245615, + "learning_rate": 4.822511506047666e-08, + "loss": 0.68391442, + "num_input_tokens_seen": 334511910, + "step": 15502, + "time_per_iteration": 2.640803098678589 + }, + { + "auxiliary_loss_clip": 0.01100429, + "auxiliary_loss_mlp": 0.00770206, + "balance_loss_clip": 1.03718948, + "balance_loss_mlp": 1.00017929, + "epoch": 0.9320907861115286, + "flos": 24538300421760.0, + "grad_norm": 1.5006777821326498, + "language_loss": 0.65834957, + "learning_rate": 4.814014256446586e-08, + "loss": 0.67705584, + "num_input_tokens_seen": 334533150, + "step": 15503, + "time_per_iteration": 2.6871988773345947 + }, + { + "auxiliary_loss_clip": 0.01070897, + "auxiliary_loss_mlp": 0.01037527, + "balance_loss_clip": 1.03008294, + "balance_loss_mlp": 1.02359104, + "epoch": 0.9321509093641966, + "flos": 19785414850560.0, + "grad_norm": 1.5576125560522005, + "language_loss": 0.75215459, + "learning_rate": 4.805524408317652e-08, + "loss": 0.77323884, + "num_input_tokens_seen": 334550940, + "step": 15504, + "time_per_iteration": 2.7060647010803223 + }, + { + "auxiliary_loss_clip": 0.01099259, + "auxiliary_loss_mlp": 0.00770405, + "balance_loss_clip": 1.03809631, + "balance_loss_mlp": 1.00028038, + "epoch": 0.9322110326168646, + "flos": 24972675592320.0, + "grad_norm": 2.4285521975478472, + "language_loss": 0.70985043, + "learning_rate": 4.797041961982762e-08, + "loss": 0.7285471, + "num_input_tokens_seen": 334570935, + "step": 15505, + "time_per_iteration": 2.632615089416504 + }, + { + "auxiliary_loss_clip": 0.01089757, + "auxiliary_loss_mlp": 0.01032109, + "balance_loss_clip": 1.03672528, + "balance_loss_mlp": 1.01909173, + "epoch": 0.9322711558695326, + "flos": 16143642639360.0, + "grad_norm": 2.685326437023756, + "language_loss": 0.75406563, + "learning_rate": 4.788566917763614e-08, + "loss": 0.77528429, + "num_input_tokens_seen": 334589315, + "step": 15506, + "time_per_iteration": 2.6244046688079834 + }, + { + "auxiliary_loss_clip": 0.01069315, + "auxiliary_loss_mlp": 0.01031235, + "balance_loss_clip": 1.03417552, + "balance_loss_mlp": 1.01864064, + "epoch": 0.9323312791222005, + "flos": 23732428838400.0, + "grad_norm": 1.7967505184283636, + "language_loss": 0.830755, + "learning_rate": 4.780099275981597e-08, + "loss": 0.85176057, + "num_input_tokens_seen": 334608990, + "step": 15507, + "time_per_iteration": 2.7944211959838867 + }, + { + "auxiliary_loss_clip": 0.01110233, + "auxiliary_loss_mlp": 0.01033256, + "balance_loss_clip": 1.03728533, + "balance_loss_mlp": 1.02054238, + "epoch": 0.9323914023748685, + "flos": 20777914523520.0, + "grad_norm": 1.7255820052461415, + "language_loss": 0.68139851, + "learning_rate": 4.771639036957742e-08, + "loss": 0.70283341, + "num_input_tokens_seen": 334628655, + "step": 15508, + "time_per_iteration": 2.6024084091186523 + }, + { + "auxiliary_loss_clip": 0.01074834, + "auxiliary_loss_mlp": 0.01031451, + "balance_loss_clip": 1.03604794, + "balance_loss_mlp": 1.01885068, + "epoch": 0.9324515256275364, + "flos": 23915178259200.0, + "grad_norm": 1.6572638063256202, + "language_loss": 0.72395205, + "learning_rate": 4.7631862010129033e-08, + "loss": 0.74501491, + "num_input_tokens_seen": 334648295, + "step": 15509, + "time_per_iteration": 2.7021539211273193 + }, + { + "auxiliary_loss_clip": 0.01097551, + "auxiliary_loss_mlp": 0.01032405, + "balance_loss_clip": 1.03582215, + "balance_loss_mlp": 1.02004337, + "epoch": 0.9325116488802044, + "flos": 18005215875840.0, + "grad_norm": 1.9028125229589103, + "language_loss": 0.73969936, + "learning_rate": 4.754740768467624e-08, + "loss": 0.7609989, + "num_input_tokens_seen": 334666280, + "step": 15510, + "time_per_iteration": 2.5746052265167236 + }, + { + "auxiliary_loss_clip": 0.0109828, + "auxiliary_loss_mlp": 0.01029114, + "balance_loss_clip": 1.03353786, + "balance_loss_mlp": 1.01676393, + "epoch": 0.9325717721328723, + "flos": 29021603443200.0, + "grad_norm": 2.4238247125248304, + "language_loss": 0.70348555, + "learning_rate": 4.746302739642161e-08, + "loss": 0.72475946, + "num_input_tokens_seen": 334688830, + "step": 15511, + "time_per_iteration": 2.6687567234039307 + }, + { + "auxiliary_loss_clip": 0.01080656, + "auxiliary_loss_mlp": 0.01040972, + "balance_loss_clip": 1.03440976, + "balance_loss_mlp": 1.02819276, + "epoch": 0.9326318953855404, + "flos": 21646341642240.0, + "grad_norm": 1.8803327029828805, + "language_loss": 0.78425443, + "learning_rate": 4.737872114856412e-08, + "loss": 0.80547071, + "num_input_tokens_seen": 334705205, + "step": 15512, + "time_per_iteration": 2.614408016204834 + }, + { + "auxiliary_loss_clip": 0.01107297, + "auxiliary_loss_mlp": 0.01028882, + "balance_loss_clip": 1.03561831, + "balance_loss_mlp": 1.01595938, + "epoch": 0.9326920186382083, + "flos": 26065724411520.0, + "grad_norm": 2.094641259738662, + "language_loss": 0.80607069, + "learning_rate": 4.7294488944301436e-08, + "loss": 0.82743245, + "num_input_tokens_seen": 334723830, + "step": 15513, + "time_per_iteration": 2.6240689754486084 + }, + { + "auxiliary_loss_clip": 0.01086027, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.03791964, + "balance_loss_mlp": 1.01878834, + "epoch": 0.9327521418908763, + "flos": 12057116227200.0, + "grad_norm": 1.8470318509254438, + "language_loss": 0.80033004, + "learning_rate": 4.721033078682768e-08, + "loss": 0.82151413, + "num_input_tokens_seen": 334740825, + "step": 15514, + "time_per_iteration": 2.6301167011260986 + }, + { + "auxiliary_loss_clip": 0.01074823, + "auxiliary_loss_mlp": 0.01037395, + "balance_loss_clip": 1.0366869, + "balance_loss_mlp": 1.02556312, + "epoch": 0.9328122651435443, + "flos": 43834395271680.0, + "grad_norm": 1.7635259328932469, + "language_loss": 0.71414572, + "learning_rate": 4.7126246679333626e-08, + "loss": 0.73526788, + "num_input_tokens_seen": 334765825, + "step": 15515, + "time_per_iteration": 2.9563915729522705 + }, + { + "auxiliary_loss_clip": 0.01093417, + "auxiliary_loss_mlp": 0.01033132, + "balance_loss_clip": 1.03784752, + "balance_loss_mlp": 1.02009666, + "epoch": 0.9328723883962122, + "flos": 15194954580480.0, + "grad_norm": 3.7303999678347153, + "language_loss": 0.80836952, + "learning_rate": 4.704223662500806e-08, + "loss": 0.82963496, + "num_input_tokens_seen": 334782680, + "step": 15516, + "time_per_iteration": 2.6183342933654785 + }, + { + "auxiliary_loss_clip": 0.01070452, + "auxiliary_loss_mlp": 0.01039697, + "balance_loss_clip": 1.03149724, + "balance_loss_mlp": 1.02574384, + "epoch": 0.9329325116488802, + "flos": 20261770041600.0, + "grad_norm": 1.6405205567812482, + "language_loss": 0.80559999, + "learning_rate": 4.695830062703643e-08, + "loss": 0.82670152, + "num_input_tokens_seen": 334800160, + "step": 15517, + "time_per_iteration": 2.6957180500030518 + }, + { + "auxiliary_loss_clip": 0.0108811, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.03524601, + "balance_loss_mlp": 1.01821351, + "epoch": 0.9329926349015482, + "flos": 13115008609920.0, + "grad_norm": 2.5144620557591364, + "language_loss": 0.74485952, + "learning_rate": 4.687443868860219e-08, + "loss": 0.76605237, + "num_input_tokens_seen": 334815840, + "step": 15518, + "time_per_iteration": 2.6164944171905518 + }, + { + "auxiliary_loss_clip": 0.01083865, + "auxiliary_loss_mlp": 0.01042053, + "balance_loss_clip": 1.03354347, + "balance_loss_mlp": 1.02916634, + "epoch": 0.9330527581542162, + "flos": 23040250778880.0, + "grad_norm": 2.0399988917234904, + "language_loss": 0.76014853, + "learning_rate": 4.679065081288458e-08, + "loss": 0.78140771, + "num_input_tokens_seen": 334834735, + "step": 15519, + "time_per_iteration": 4.253001689910889 + }, + { + "auxiliary_loss_clip": 0.01053866, + "auxiliary_loss_mlp": 0.01036997, + "balance_loss_clip": 1.0320313, + "balance_loss_mlp": 1.02326381, + "epoch": 0.9331128814068841, + "flos": 15559627409280.0, + "grad_norm": 2.031373785693728, + "language_loss": 0.83167887, + "learning_rate": 4.6706937003061275e-08, + "loss": 0.85258746, + "num_input_tokens_seen": 334853490, + "step": 15520, + "time_per_iteration": 4.2622270584106445 + }, + { + "auxiliary_loss_clip": 0.01096231, + "auxiliary_loss_mlp": 0.01030529, + "balance_loss_clip": 1.03505969, + "balance_loss_mlp": 1.01851249, + "epoch": 0.9331730046595521, + "flos": 22271762275200.0, + "grad_norm": 1.641224021105838, + "language_loss": 0.76203525, + "learning_rate": 4.6623297262306846e-08, + "loss": 0.78330284, + "num_input_tokens_seen": 334873675, + "step": 15521, + "time_per_iteration": 2.6855099201202393 + }, + { + "auxiliary_loss_clip": 0.01098694, + "auxiliary_loss_mlp": 0.01030692, + "balance_loss_clip": 1.03746796, + "balance_loss_mlp": 1.01878297, + "epoch": 0.93323312791222, + "flos": 15777641007360.0, + "grad_norm": 1.8982902543298203, + "language_loss": 0.77620465, + "learning_rate": 4.6539731593792545e-08, + "loss": 0.79749846, + "num_input_tokens_seen": 334890970, + "step": 15522, + "time_per_iteration": 4.228564977645874 + }, + { + "auxiliary_loss_clip": 0.01075483, + "auxiliary_loss_mlp": 0.00770903, + "balance_loss_clip": 1.03559947, + "balance_loss_mlp": 1.00036263, + "epoch": 0.933293251164888, + "flos": 22010978557440.0, + "grad_norm": 2.1529045189336076, + "language_loss": 0.62858284, + "learning_rate": 4.6456240000687373e-08, + "loss": 0.64704674, + "num_input_tokens_seen": 334906635, + "step": 15523, + "time_per_iteration": 2.720323324203491 + }, + { + "auxiliary_loss_clip": 0.01085105, + "auxiliary_loss_mlp": 0.01031084, + "balance_loss_clip": 1.0346992, + "balance_loss_mlp": 1.0190022, + "epoch": 0.933353374417556, + "flos": 26031358074240.0, + "grad_norm": 2.0555270470512035, + "language_loss": 0.6804812, + "learning_rate": 4.63728224861577e-08, + "loss": 0.70164317, + "num_input_tokens_seen": 334926230, + "step": 15524, + "time_per_iteration": 4.186232805252075 + }, + { + "auxiliary_loss_clip": 0.01065918, + "auxiliary_loss_mlp": 0.01036023, + "balance_loss_clip": 1.03418577, + "balance_loss_mlp": 1.02345872, + "epoch": 0.933413497670224, + "flos": 24900100162560.0, + "grad_norm": 1.5473346576932632, + "language_loss": 0.73752666, + "learning_rate": 4.628947905336589e-08, + "loss": 0.75854605, + "num_input_tokens_seen": 334946680, + "step": 15525, + "time_per_iteration": 2.740737199783325 + }, + { + "auxiliary_loss_clip": 0.01054757, + "auxiliary_loss_mlp": 0.01041762, + "balance_loss_clip": 1.03350389, + "balance_loss_mlp": 1.02915573, + "epoch": 0.9334736209228919, + "flos": 23688689051520.0, + "grad_norm": 1.733935635799957, + "language_loss": 0.83531857, + "learning_rate": 4.6206209705473175e-08, + "loss": 0.85628378, + "num_input_tokens_seen": 334964785, + "step": 15526, + "time_per_iteration": 2.6978201866149902 + }, + { + "auxiliary_loss_clip": 0.01062457, + "auxiliary_loss_mlp": 0.01033558, + "balance_loss_clip": 1.03334141, + "balance_loss_mlp": 1.02088642, + "epoch": 0.9335337441755599, + "flos": 15377344865280.0, + "grad_norm": 1.8748787634103812, + "language_loss": 0.69386899, + "learning_rate": 4.61230144456366e-08, + "loss": 0.71482921, + "num_input_tokens_seen": 334982400, + "step": 15527, + "time_per_iteration": 2.7441039085388184 + }, + { + "auxiliary_loss_clip": 0.01110964, + "auxiliary_loss_mlp": 0.01030308, + "balance_loss_clip": 1.03706026, + "balance_loss_mlp": 1.01640248, + "epoch": 0.9335938674282279, + "flos": 16106726436480.0, + "grad_norm": 2.0901783734577535, + "language_loss": 0.65065324, + "learning_rate": 4.603989327701141e-08, + "loss": 0.67206597, + "num_input_tokens_seen": 334999685, + "step": 15528, + "time_per_iteration": 2.643949270248413 + }, + { + "auxiliary_loss_clip": 0.01110618, + "auxiliary_loss_mlp": 0.01029994, + "balance_loss_clip": 1.03654647, + "balance_loss_mlp": 1.01733375, + "epoch": 0.9336539906808958, + "flos": 18952898353920.0, + "grad_norm": 1.8767297797851592, + "language_loss": 0.74917662, + "learning_rate": 4.5956846202748867e-08, + "loss": 0.77058274, + "num_input_tokens_seen": 335019160, + "step": 15529, + "time_per_iteration": 2.634995698928833 + }, + { + "auxiliary_loss_clip": 0.01062705, + "auxiliary_loss_mlp": 0.01032394, + "balance_loss_clip": 1.03274131, + "balance_loss_mlp": 1.0203954, + "epoch": 0.9337141139335638, + "flos": 18109104986880.0, + "grad_norm": 1.748892845656801, + "language_loss": 0.62968796, + "learning_rate": 4.5873873225998674e-08, + "loss": 0.65063894, + "num_input_tokens_seen": 335037350, + "step": 15530, + "time_per_iteration": 2.7044005393981934 + }, + { + "auxiliary_loss_clip": 0.01088546, + "auxiliary_loss_mlp": 0.01028529, + "balance_loss_clip": 1.03820205, + "balance_loss_mlp": 1.01650035, + "epoch": 0.9337742371862318, + "flos": 17345716214400.0, + "grad_norm": 1.6832056860999263, + "language_loss": 0.72579157, + "learning_rate": 4.5790974349907194e-08, + "loss": 0.74696231, + "num_input_tokens_seen": 335056060, + "step": 15531, + "time_per_iteration": 2.6301660537719727 + }, + { + "auxiliary_loss_clip": 0.01085265, + "auxiliary_loss_mlp": 0.01030468, + "balance_loss_clip": 1.0341419, + "balance_loss_mlp": 1.01802897, + "epoch": 0.9338343604388998, + "flos": 29058986522880.0, + "grad_norm": 1.6070200509837302, + "language_loss": 0.7085079, + "learning_rate": 4.5708149577617925e-08, + "loss": 0.72966528, + "num_input_tokens_seen": 335075410, + "step": 15532, + "time_per_iteration": 2.6982882022857666 + }, + { + "auxiliary_loss_clip": 0.01110813, + "auxiliary_loss_mlp": 0.00770577, + "balance_loss_clip": 1.03735983, + "balance_loss_mlp": 1.00021112, + "epoch": 0.9338944836915677, + "flos": 18660908695680.0, + "grad_norm": 1.6762698781307237, + "language_loss": 0.73232746, + "learning_rate": 4.5625398912271016e-08, + "loss": 0.75114131, + "num_input_tokens_seen": 335095190, + "step": 15533, + "time_per_iteration": 2.570868730545044 + }, + { + "auxiliary_loss_clip": 0.0107118, + "auxiliary_loss_mlp": 0.01029018, + "balance_loss_clip": 1.03274035, + "balance_loss_mlp": 1.01729369, + "epoch": 0.9339546069442357, + "flos": 16617735273600.0, + "grad_norm": 1.7512837189908117, + "language_loss": 0.7965132, + "learning_rate": 4.554272235700507e-08, + "loss": 0.81751519, + "num_input_tokens_seen": 335113825, + "step": 15534, + "time_per_iteration": 2.659533977508545 + }, + { + "auxiliary_loss_clip": 0.01104268, + "auxiliary_loss_mlp": 0.0102812, + "balance_loss_clip": 1.03758454, + "balance_loss_mlp": 1.01707494, + "epoch": 0.9340147301969036, + "flos": 23693106424320.0, + "grad_norm": 1.785516509672164, + "language_loss": 0.74561787, + "learning_rate": 4.546011991495513e-08, + "loss": 0.76694173, + "num_input_tokens_seen": 335136425, + "step": 15535, + "time_per_iteration": 2.615487575531006 + }, + { + "auxiliary_loss_clip": 0.01094475, + "auxiliary_loss_mlp": 0.0102863, + "balance_loss_clip": 1.03818846, + "balance_loss_mlp": 1.0162499, + "epoch": 0.9340748534495716, + "flos": 28654452576000.0, + "grad_norm": 2.554180895365387, + "language_loss": 0.77858245, + "learning_rate": 4.537759158925292e-08, + "loss": 0.79981351, + "num_input_tokens_seen": 335157925, + "step": 15536, + "time_per_iteration": 2.6514716148376465 + }, + { + "auxiliary_loss_clip": 0.01078909, + "auxiliary_loss_mlp": 0.01027656, + "balance_loss_clip": 1.0358901, + "balance_loss_mlp": 1.01566911, + "epoch": 0.9341349767022396, + "flos": 24899633285760.0, + "grad_norm": 1.5285441088297342, + "language_loss": 0.80702901, + "learning_rate": 4.5295137383028593e-08, + "loss": 0.82809466, + "num_input_tokens_seen": 335177840, + "step": 15537, + "time_per_iteration": 2.71079683303833 + }, + { + "auxiliary_loss_clip": 0.01089177, + "auxiliary_loss_mlp": 0.01033529, + "balance_loss_clip": 1.03725171, + "balance_loss_mlp": 1.02132761, + "epoch": 0.9341950999549076, + "flos": 29059525226880.0, + "grad_norm": 2.04950932055524, + "language_loss": 0.77909076, + "learning_rate": 4.5212757299408764e-08, + "loss": 0.80031782, + "num_input_tokens_seen": 335199470, + "step": 15538, + "time_per_iteration": 2.7233970165252686 + }, + { + "auxiliary_loss_clip": 0.01080561, + "auxiliary_loss_mlp": 0.01028684, + "balance_loss_clip": 1.03509426, + "balance_loss_mlp": 1.01653659, + "epoch": 0.9342552232075755, + "flos": 23587062497280.0, + "grad_norm": 1.7693540282121059, + "language_loss": 0.73224825, + "learning_rate": 4.513045134151672e-08, + "loss": 0.75334066, + "num_input_tokens_seen": 335218885, + "step": 15539, + "time_per_iteration": 2.7510504722595215 + }, + { + "auxiliary_loss_clip": 0.01063064, + "auxiliary_loss_mlp": 0.01030385, + "balance_loss_clip": 1.03815532, + "balance_loss_mlp": 1.01935768, + "epoch": 0.9343153464602435, + "flos": 36721389646080.0, + "grad_norm": 1.501458356905732, + "language_loss": 0.64681369, + "learning_rate": 4.504821951247373e-08, + "loss": 0.66774815, + "num_input_tokens_seen": 335239485, + "step": 15540, + "time_per_iteration": 2.8745980262756348 + }, + { + "auxiliary_loss_clip": 0.01095708, + "auxiliary_loss_mlp": 0.0103328, + "balance_loss_clip": 1.03505111, + "balance_loss_mlp": 1.02149022, + "epoch": 0.9343754697129115, + "flos": 22236498097920.0, + "grad_norm": 1.8557589560760206, + "language_loss": 0.76556802, + "learning_rate": 4.496606181539864e-08, + "loss": 0.78685796, + "num_input_tokens_seen": 335258355, + "step": 15541, + "time_per_iteration": 2.651571035385132 + }, + { + "auxiliary_loss_clip": 0.01096825, + "auxiliary_loss_mlp": 0.01033897, + "balance_loss_clip": 1.03985929, + "balance_loss_mlp": 1.02145147, + "epoch": 0.9344355929655794, + "flos": 29710333797120.0, + "grad_norm": 1.9744538682632873, + "language_loss": 0.66810614, + "learning_rate": 4.4883978253406066e-08, + "loss": 0.68941331, + "num_input_tokens_seen": 335276835, + "step": 15542, + "time_per_iteration": 2.760667085647583 + }, + { + "auxiliary_loss_clip": 0.01065848, + "auxiliary_loss_mlp": 0.01029314, + "balance_loss_clip": 1.03444171, + "balance_loss_mlp": 1.01672554, + "epoch": 0.9344957162182475, + "flos": 18880394751360.0, + "grad_norm": 1.8654482805757453, + "language_loss": 0.69444913, + "learning_rate": 4.480196882960907e-08, + "loss": 0.71540076, + "num_input_tokens_seen": 335296220, + "step": 15543, + "time_per_iteration": 2.7620866298675537 + }, + { + "auxiliary_loss_clip": 0.0109899, + "auxiliary_loss_mlp": 0.01029553, + "balance_loss_clip": 1.03395653, + "balance_loss_mlp": 1.01592147, + "epoch": 0.9345558394709154, + "flos": 27417761268480.0, + "grad_norm": 2.0257017035114493, + "language_loss": 0.69519067, + "learning_rate": 4.4720033547117394e-08, + "loss": 0.71647608, + "num_input_tokens_seen": 335316335, + "step": 15544, + "time_per_iteration": 2.7088634967803955 + }, + { + "auxiliary_loss_clip": 0.01094451, + "auxiliary_loss_mlp": 0.01046236, + "balance_loss_clip": 1.03500128, + "balance_loss_mlp": 1.03233039, + "epoch": 0.9346159627235834, + "flos": 20741285629440.0, + "grad_norm": 1.7227662917872677, + "language_loss": 0.77327919, + "learning_rate": 4.463817240903789e-08, + "loss": 0.79468608, + "num_input_tokens_seen": 335335545, + "step": 15545, + "time_per_iteration": 2.630438804626465 + }, + { + "auxiliary_loss_clip": 0.0109898, + "auxiliary_loss_mlp": 0.01026698, + "balance_loss_clip": 1.03614378, + "balance_loss_mlp": 1.01519418, + "epoch": 0.9346760859762513, + "flos": 21069221823360.0, + "grad_norm": 1.7337615176350853, + "language_loss": 0.68626702, + "learning_rate": 4.455638541847495e-08, + "loss": 0.70752382, + "num_input_tokens_seen": 335355350, + "step": 15546, + "time_per_iteration": 2.5841619968414307 + }, + { + "auxiliary_loss_clip": 0.01066558, + "auxiliary_loss_mlp": 0.01029596, + "balance_loss_clip": 1.03200841, + "balance_loss_mlp": 1.01754951, + "epoch": 0.9347362092289193, + "flos": 29204927481600.0, + "grad_norm": 2.0917837457460466, + "language_loss": 0.82409191, + "learning_rate": 4.447467257852966e-08, + "loss": 0.84505343, + "num_input_tokens_seen": 335375160, + "step": 15547, + "time_per_iteration": 2.737194538116455 + }, + { + "auxiliary_loss_clip": 0.01089071, + "auxiliary_loss_mlp": 0.01038533, + "balance_loss_clip": 1.03189087, + "balance_loss_mlp": 1.02542627, + "epoch": 0.9347963324815872, + "flos": 19427350124160.0, + "grad_norm": 1.9497482945485352, + "language_loss": 0.83475363, + "learning_rate": 4.439303389230087e-08, + "loss": 0.85602963, + "num_input_tokens_seen": 335394080, + "step": 15548, + "time_per_iteration": 2.550107479095459 + }, + { + "auxiliary_loss_clip": 0.01101099, + "auxiliary_loss_mlp": 0.01037896, + "balance_loss_clip": 1.03632116, + "balance_loss_mlp": 1.0238775, + "epoch": 0.9348564557342552, + "flos": 36901840596480.0, + "grad_norm": 1.5421911327365105, + "language_loss": 0.65587002, + "learning_rate": 4.4311469362884326e-08, + "loss": 0.67725998, + "num_input_tokens_seen": 335414230, + "step": 15549, + "time_per_iteration": 2.7219295501708984 + }, + { + "auxiliary_loss_clip": 0.01101825, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.03933716, + "balance_loss_mlp": 1.02354288, + "epoch": 0.9349165789869232, + "flos": 21690117342720.0, + "grad_norm": 2.010328548001079, + "language_loss": 0.80039644, + "learning_rate": 4.4229978993372665e-08, + "loss": 0.82178628, + "num_input_tokens_seen": 335432890, + "step": 15550, + "time_per_iteration": 2.640012741088867 + }, + { + "auxiliary_loss_clip": 0.01096493, + "auxiliary_loss_mlp": 0.01032466, + "balance_loss_clip": 1.0383265, + "balance_loss_mlp": 1.02041388, + "epoch": 0.9349767022395912, + "flos": 18844053166080.0, + "grad_norm": 1.681452605729496, + "language_loss": 0.75687659, + "learning_rate": 4.4148562786856524e-08, + "loss": 0.77816617, + "num_input_tokens_seen": 335452085, + "step": 15551, + "time_per_iteration": 2.584329843521118 + }, + { + "auxiliary_loss_clip": 0.01051893, + "auxiliary_loss_mlp": 0.01030912, + "balance_loss_clip": 1.03308678, + "balance_loss_mlp": 1.02025425, + "epoch": 0.9350368254922591, + "flos": 24973429777920.0, + "grad_norm": 1.499035355144879, + "language_loss": 0.73651052, + "learning_rate": 4.406722074642255e-08, + "loss": 0.75733852, + "num_input_tokens_seen": 335472130, + "step": 15552, + "time_per_iteration": 2.7739923000335693 + }, + { + "auxiliary_loss_clip": 0.01059946, + "auxiliary_loss_mlp": 0.01040283, + "balance_loss_clip": 1.03191781, + "balance_loss_mlp": 1.02666366, + "epoch": 0.9350969487449271, + "flos": 23070594792960.0, + "grad_norm": 1.5949406765998282, + "language_loss": 0.77295089, + "learning_rate": 4.3985952875155386e-08, + "loss": 0.79395318, + "num_input_tokens_seen": 335489970, + "step": 15553, + "time_per_iteration": 2.7346534729003906 + }, + { + "auxiliary_loss_clip": 0.01074123, + "auxiliary_loss_mlp": 0.01033891, + "balance_loss_clip": 1.03367734, + "balance_loss_mlp": 1.02047396, + "epoch": 0.9351570719975951, + "flos": 18625177641600.0, + "grad_norm": 1.630847703889005, + "language_loss": 0.78214866, + "learning_rate": 4.390475917613723e-08, + "loss": 0.8032288, + "num_input_tokens_seen": 335509125, + "step": 15554, + "time_per_iteration": 2.6710941791534424 + }, + { + "auxiliary_loss_clip": 0.01077218, + "auxiliary_loss_mlp": 0.01037117, + "balance_loss_clip": 1.03197753, + "balance_loss_mlp": 1.02502322, + "epoch": 0.935217195250263, + "flos": 15888353702400.0, + "grad_norm": 2.49632757150129, + "language_loss": 0.69451249, + "learning_rate": 4.382363965244695e-08, + "loss": 0.7156558, + "num_input_tokens_seen": 335525620, + "step": 15555, + "time_per_iteration": 2.6385841369628906 + }, + { + "auxiliary_loss_clip": 0.01014929, + "auxiliary_loss_mlp": 0.01045967, + "balance_loss_clip": 1.02853274, + "balance_loss_mlp": 1.0316503, + "epoch": 0.935277318502931, + "flos": 24390312387840.0, + "grad_norm": 1.5017504373533854, + "language_loss": 0.75400025, + "learning_rate": 4.374259430715965e-08, + "loss": 0.77460921, + "num_input_tokens_seen": 335547565, + "step": 15556, + "time_per_iteration": 3.059551477432251 + }, + { + "auxiliary_loss_clip": 0.01085152, + "auxiliary_loss_mlp": 0.01031847, + "balance_loss_clip": 1.03423309, + "balance_loss_mlp": 1.02010441, + "epoch": 0.935337441755599, + "flos": 27600259294080.0, + "grad_norm": 1.4976349419869153, + "language_loss": 0.72337437, + "learning_rate": 4.366162314334953e-08, + "loss": 0.74454439, + "num_input_tokens_seen": 335570285, + "step": 15557, + "time_per_iteration": 4.6448400020599365 + }, + { + "auxiliary_loss_clip": 0.01108474, + "auxiliary_loss_mlp": 0.01033173, + "balance_loss_clip": 1.0365355, + "balance_loss_mlp": 1.01982188, + "epoch": 0.935397565008267, + "flos": 20482872209280.0, + "grad_norm": 1.660550178775489, + "language_loss": 0.63404226, + "learning_rate": 4.358072616408681e-08, + "loss": 0.65545875, + "num_input_tokens_seen": 335588600, + "step": 15558, + "time_per_iteration": 2.6054418087005615 + }, + { + "auxiliary_loss_clip": 0.01087055, + "auxiliary_loss_mlp": 0.01030987, + "balance_loss_clip": 1.03696275, + "balance_loss_mlp": 1.01723039, + "epoch": 0.9354576882609349, + "flos": 23654394541440.0, + "grad_norm": 1.8757208660532867, + "language_loss": 0.72988653, + "learning_rate": 4.34999033724388e-08, + "loss": 0.75106692, + "num_input_tokens_seen": 335606235, + "step": 15559, + "time_per_iteration": 2.6042425632476807 + }, + { + "auxiliary_loss_clip": 0.01053197, + "auxiliary_loss_mlp": 0.00769565, + "balance_loss_clip": 1.03214157, + "balance_loss_mlp": 1.00029421, + "epoch": 0.9355178115136029, + "flos": 36684904406400.0, + "grad_norm": 2.2075476861746526, + "language_loss": 0.63396823, + "learning_rate": 4.341915477147062e-08, + "loss": 0.65219581, + "num_input_tokens_seen": 335628240, + "step": 15560, + "time_per_iteration": 4.612861633300781 + }, + { + "auxiliary_loss_clip": 0.01049187, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.03704762, + "balance_loss_mlp": 1.02041054, + "epoch": 0.9355779347662708, + "flos": 14460401450880.0, + "grad_norm": 2.3052566052568193, + "language_loss": 0.64168519, + "learning_rate": 4.3338480364244034e-08, + "loss": 0.66252398, + "num_input_tokens_seen": 335643755, + "step": 15561, + "time_per_iteration": 4.437899827957153 + }, + { + "auxiliary_loss_clip": 0.0110932, + "auxiliary_loss_mlp": 0.01036594, + "balance_loss_clip": 1.03827786, + "balance_loss_mlp": 1.02375484, + "epoch": 0.9356380580189388, + "flos": 23185976256000.0, + "grad_norm": 1.6937389446463813, + "language_loss": 0.75591785, + "learning_rate": 4.325788015381859e-08, + "loss": 0.77737701, + "num_input_tokens_seen": 335665160, + "step": 15562, + "time_per_iteration": 2.7620413303375244 + }, + { + "auxiliary_loss_clip": 0.01016437, + "auxiliary_loss_mlp": 0.01002066, + "balance_loss_clip": 1.00517988, + "balance_loss_mlp": 1.0011481, + "epoch": 0.9356981812716068, + "flos": 67471626090240.0, + "grad_norm": 0.9484711819717426, + "language_loss": 0.62294793, + "learning_rate": 4.31773541432503e-08, + "loss": 0.64313298, + "num_input_tokens_seen": 335715240, + "step": 15563, + "time_per_iteration": 4.560046672821045 + }, + { + "auxiliary_loss_clip": 0.01059821, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.0360657, + "balance_loss_mlp": 1.02043045, + "epoch": 0.9357583045242748, + "flos": 24681619687680.0, + "grad_norm": 1.6275464297875282, + "language_loss": 0.78383303, + "learning_rate": 4.3096902335592714e-08, + "loss": 0.80475569, + "num_input_tokens_seen": 335734970, + "step": 15564, + "time_per_iteration": 2.7581684589385986 + }, + { + "auxiliary_loss_clip": 0.01111071, + "auxiliary_loss_mlp": 0.0103029, + "balance_loss_clip": 1.0369916, + "balance_loss_mlp": 1.0166707, + "epoch": 0.9358184277769427, + "flos": 19463727623040.0, + "grad_norm": 2.0197933120923164, + "language_loss": 0.78051835, + "learning_rate": 4.301652473389694e-08, + "loss": 0.80193192, + "num_input_tokens_seen": 335753435, + "step": 15565, + "time_per_iteration": 2.6100919246673584 + }, + { + "auxiliary_loss_clip": 0.01094214, + "auxiliary_loss_mlp": 0.01031155, + "balance_loss_clip": 1.03455317, + "balance_loss_mlp": 1.01927018, + "epoch": 0.9358785510296107, + "flos": 18916987731840.0, + "grad_norm": 3.310186857599268, + "language_loss": 0.72122169, + "learning_rate": 4.2936221341210774e-08, + "loss": 0.74247533, + "num_input_tokens_seen": 335772105, + "step": 15566, + "time_per_iteration": 2.5962870121002197 + }, + { + "auxiliary_loss_clip": 0.0106957, + "auxiliary_loss_mlp": 0.00771396, + "balance_loss_clip": 1.03290153, + "balance_loss_mlp": 1.00026023, + "epoch": 0.9359386742822787, + "flos": 23441265192960.0, + "grad_norm": 1.9305985811175064, + "language_loss": 0.67621976, + "learning_rate": 4.285599216057889e-08, + "loss": 0.69462943, + "num_input_tokens_seen": 335789125, + "step": 15567, + "time_per_iteration": 2.6770172119140625 + }, + { + "auxiliary_loss_clip": 0.01078108, + "auxiliary_loss_mlp": 0.0103394, + "balance_loss_clip": 1.03551221, + "balance_loss_mlp": 1.02124989, + "epoch": 0.9359987975349466, + "flos": 32744067557760.0, + "grad_norm": 3.1950642417815778, + "language_loss": 0.62192923, + "learning_rate": 4.277583719504418e-08, + "loss": 0.64304972, + "num_input_tokens_seen": 335810995, + "step": 15568, + "time_per_iteration": 2.7253639698028564 + }, + { + "auxiliary_loss_clip": 0.01082433, + "auxiliary_loss_mlp": 0.01037861, + "balance_loss_clip": 1.03121352, + "balance_loss_mlp": 1.02551699, + "epoch": 0.9360589207876147, + "flos": 22819651401600.0, + "grad_norm": 1.8305652991188055, + "language_loss": 0.7874766, + "learning_rate": 4.269575644764556e-08, + "loss": 0.80867952, + "num_input_tokens_seen": 335830580, + "step": 15569, + "time_per_iteration": 2.648876905441284 + }, + { + "auxiliary_loss_clip": 0.01090445, + "auxiliary_loss_mlp": 0.01033037, + "balance_loss_clip": 1.03811383, + "balance_loss_mlp": 1.02041864, + "epoch": 0.9361190440402826, + "flos": 20885251340160.0, + "grad_norm": 3.2418615597680263, + "language_loss": 0.697613, + "learning_rate": 4.261574992142014e-08, + "loss": 0.71884787, + "num_input_tokens_seen": 335846515, + "step": 15570, + "time_per_iteration": 2.695789337158203 + }, + { + "auxiliary_loss_clip": 0.0109347, + "auxiliary_loss_mlp": 0.01030501, + "balance_loss_clip": 1.03646827, + "balance_loss_mlp": 1.0180912, + "epoch": 0.9361791672929506, + "flos": 19317822577920.0, + "grad_norm": 3.942506001346151, + "language_loss": 0.78369403, + "learning_rate": 4.2535817619401726e-08, + "loss": 0.80493373, + "num_input_tokens_seen": 335863350, + "step": 15571, + "time_per_iteration": 2.613274335861206 + }, + { + "auxiliary_loss_clip": 0.01076748, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.03460646, + "balance_loss_mlp": 1.01874197, + "epoch": 0.9362392905456185, + "flos": 15158182032000.0, + "grad_norm": 2.841657798727435, + "language_loss": 0.77677691, + "learning_rate": 4.2455959544621224e-08, + "loss": 0.79785693, + "num_input_tokens_seen": 335880510, + "step": 15572, + "time_per_iteration": 2.803063154220581 + }, + { + "auxiliary_loss_clip": 0.01082647, + "auxiliary_loss_mlp": 0.01041561, + "balance_loss_clip": 1.03344643, + "balance_loss_mlp": 1.0294075, + "epoch": 0.9362994137982865, + "flos": 22085888371200.0, + "grad_norm": 1.8952922672820693, + "language_loss": 0.78173578, + "learning_rate": 4.237617570010688e-08, + "loss": 0.80297786, + "num_input_tokens_seen": 335899440, + "step": 15573, + "time_per_iteration": 2.64709734916687 + }, + { + "auxiliary_loss_clip": 0.01072731, + "auxiliary_loss_mlp": 0.01028381, + "balance_loss_clip": 1.03296494, + "balance_loss_mlp": 1.01635885, + "epoch": 0.9363595370509544, + "flos": 23512260424320.0, + "grad_norm": 2.4715293938233316, + "language_loss": 0.74473417, + "learning_rate": 4.2296466088884044e-08, + "loss": 0.76574528, + "num_input_tokens_seen": 335919540, + "step": 15574, + "time_per_iteration": 2.8169214725494385 + }, + { + "auxiliary_loss_clip": 0.01050172, + "auxiliary_loss_mlp": 0.01035605, + "balance_loss_clip": 1.03272486, + "balance_loss_mlp": 1.02266467, + "epoch": 0.9364196603036224, + "flos": 27123473139840.0, + "grad_norm": 1.920556373302248, + "language_loss": 0.68192244, + "learning_rate": 4.221683071397564e-08, + "loss": 0.70278013, + "num_input_tokens_seen": 335939665, + "step": 15575, + "time_per_iteration": 2.798386573791504 + }, + { + "auxiliary_loss_clip": 0.01078254, + "auxiliary_loss_mlp": 0.01033935, + "balance_loss_clip": 1.03272521, + "balance_loss_mlp": 1.02136481, + "epoch": 0.9364797835562904, + "flos": 18479057114880.0, + "grad_norm": 1.7600184524514564, + "language_loss": 0.65193367, + "learning_rate": 4.2137269578401026e-08, + "loss": 0.67305553, + "num_input_tokens_seen": 335958580, + "step": 15576, + "time_per_iteration": 2.6554365158081055 + }, + { + "auxiliary_loss_clip": 0.01093147, + "auxiliary_loss_mlp": 0.01030045, + "balance_loss_clip": 1.03174019, + "balance_loss_mlp": 1.0161159, + "epoch": 0.9365399068089584, + "flos": 13005552890880.0, + "grad_norm": 2.420160931511476, + "language_loss": 0.76176679, + "learning_rate": 4.2057782685177566e-08, + "loss": 0.78299868, + "num_input_tokens_seen": 335974965, + "step": 15577, + "time_per_iteration": 2.5658376216888428 + }, + { + "auxiliary_loss_clip": 0.01062399, + "auxiliary_loss_mlp": 0.01030216, + "balance_loss_clip": 1.03228045, + "balance_loss_mlp": 1.01722205, + "epoch": 0.9366000300616263, + "flos": 25666433850240.0, + "grad_norm": 3.270982347260187, + "language_loss": 0.52259952, + "learning_rate": 4.1978370037318855e-08, + "loss": 0.5435257, + "num_input_tokens_seen": 335996575, + "step": 15578, + "time_per_iteration": 2.753800392150879 + }, + { + "auxiliary_loss_clip": 0.01044474, + "auxiliary_loss_mlp": 0.01035654, + "balance_loss_clip": 1.03016138, + "balance_loss_mlp": 1.02336335, + "epoch": 0.9366601533142943, + "flos": 21433355948160.0, + "grad_norm": 1.5769540357516516, + "language_loss": 0.70730215, + "learning_rate": 4.189903163783692e-08, + "loss": 0.7281034, + "num_input_tokens_seen": 336017265, + "step": 15579, + "time_per_iteration": 2.776789903640747 + }, + { + "auxiliary_loss_clip": 0.01081419, + "auxiliary_loss_mlp": 0.01027227, + "balance_loss_clip": 1.03318858, + "balance_loss_mlp": 1.01544309, + "epoch": 0.9367202765669622, + "flos": 24093222998400.0, + "grad_norm": 1.8470459873947023, + "language_loss": 0.76132309, + "learning_rate": 4.181976748973959e-08, + "loss": 0.78240955, + "num_input_tokens_seen": 336035905, + "step": 15580, + "time_per_iteration": 2.685457468032837 + }, + { + "auxiliary_loss_clip": 0.01097941, + "auxiliary_loss_mlp": 0.01031636, + "balance_loss_clip": 1.03599906, + "balance_loss_mlp": 1.01848698, + "epoch": 0.9367803998196302, + "flos": 20888842700160.0, + "grad_norm": 1.6988000782542536, + "language_loss": 0.66216934, + "learning_rate": 4.1740577596033114e-08, + "loss": 0.68346512, + "num_input_tokens_seen": 336055585, + "step": 15581, + "time_per_iteration": 2.642705202102661 + }, + { + "auxiliary_loss_clip": 0.01099156, + "auxiliary_loss_mlp": 0.01028235, + "balance_loss_clip": 1.03769445, + "balance_loss_mlp": 1.01575327, + "epoch": 0.9368405230722983, + "flos": 22564362464640.0, + "grad_norm": 1.6283591696925621, + "language_loss": 0.76962942, + "learning_rate": 4.166146195972042e-08, + "loss": 0.79090333, + "num_input_tokens_seen": 336076695, + "step": 15582, + "time_per_iteration": 2.6836650371551514 + }, + { + "auxiliary_loss_clip": 0.01033952, + "auxiliary_loss_mlp": 0.01033194, + "balance_loss_clip": 1.03131258, + "balance_loss_mlp": 1.02007508, + "epoch": 0.9369006463249662, + "flos": 18880215183360.0, + "grad_norm": 1.9959612516768654, + "language_loss": 0.73610139, + "learning_rate": 4.1582420583800905e-08, + "loss": 0.75677288, + "num_input_tokens_seen": 336094740, + "step": 15583, + "time_per_iteration": 2.9247751235961914 + }, + { + "auxiliary_loss_clip": 0.01113025, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.03807962, + "balance_loss_mlp": 1.01861954, + "epoch": 0.9369607695776342, + "flos": 26432516142720.0, + "grad_norm": 2.0019759787362417, + "language_loss": 0.84050167, + "learning_rate": 4.1503453471272376e-08, + "loss": 0.86195087, + "num_input_tokens_seen": 336113985, + "step": 15584, + "time_per_iteration": 2.7832884788513184 + }, + { + "auxiliary_loss_clip": 0.01098693, + "auxiliary_loss_mlp": 0.00771025, + "balance_loss_clip": 1.03800797, + "balance_loss_mlp": 1.00032699, + "epoch": 0.9370208928303021, + "flos": 39567346081920.0, + "grad_norm": 1.4532418436154226, + "language_loss": 0.72163695, + "learning_rate": 4.1424560625129334e-08, + "loss": 0.74033409, + "num_input_tokens_seen": 336136395, + "step": 15585, + "time_per_iteration": 2.81025767326355 + }, + { + "auxiliary_loss_clip": 0.01073011, + "auxiliary_loss_mlp": 0.01021827, + "balance_loss_clip": 1.03393424, + "balance_loss_mlp": 1.01078236, + "epoch": 0.9370810160829701, + "flos": 22963114321920.0, + "grad_norm": 1.7172742978336988, + "language_loss": 0.8027873, + "learning_rate": 4.134574204836316e-08, + "loss": 0.82373559, + "num_input_tokens_seen": 336156345, + "step": 15586, + "time_per_iteration": 2.66705322265625 + }, + { + "auxiliary_loss_clip": 0.01068881, + "auxiliary_loss_mlp": 0.01036223, + "balance_loss_clip": 1.03491676, + "balance_loss_mlp": 1.0236938, + "epoch": 0.937141139335638, + "flos": 23075048079360.0, + "grad_norm": 1.5900972808595355, + "language_loss": 0.76568019, + "learning_rate": 4.126699774396258e-08, + "loss": 0.78673124, + "num_input_tokens_seen": 336176760, + "step": 15587, + "time_per_iteration": 2.696638345718384 + }, + { + "auxiliary_loss_clip": 0.01089529, + "auxiliary_loss_mlp": 0.01037342, + "balance_loss_clip": 1.03515196, + "balance_loss_mlp": 1.02427721, + "epoch": 0.937201262588306, + "flos": 16356664247040.0, + "grad_norm": 1.8569642874741914, + "language_loss": 0.87623429, + "learning_rate": 4.118832771491387e-08, + "loss": 0.89750302, + "num_input_tokens_seen": 336193285, + "step": 15588, + "time_per_iteration": 2.6571919918060303 + }, + { + "auxiliary_loss_clip": 0.01106178, + "auxiliary_loss_mlp": 0.00770286, + "balance_loss_clip": 1.03689957, + "balance_loss_mlp": 1.0001812, + "epoch": 0.937261385840974, + "flos": 20194078861440.0, + "grad_norm": 1.9442823727757126, + "language_loss": 0.78136659, + "learning_rate": 4.11097319642002e-08, + "loss": 0.80013114, + "num_input_tokens_seen": 336211425, + "step": 15589, + "time_per_iteration": 2.5364420413970947 + }, + { + "auxiliary_loss_clip": 0.01106688, + "auxiliary_loss_mlp": 0.0103402, + "balance_loss_clip": 1.03706598, + "balance_loss_mlp": 1.02196836, + "epoch": 0.937321509093642, + "flos": 18295948558080.0, + "grad_norm": 1.7833559240011974, + "language_loss": 0.77980852, + "learning_rate": 4.103121049480163e-08, + "loss": 0.80121559, + "num_input_tokens_seen": 336230205, + "step": 15590, + "time_per_iteration": 2.5236690044403076 + }, + { + "auxiliary_loss_clip": 0.01079152, + "auxiliary_loss_mlp": 0.01038917, + "balance_loss_clip": 1.03359151, + "balance_loss_mlp": 1.02445698, + "epoch": 0.9373816323463099, + "flos": 25884662929920.0, + "grad_norm": 1.8039863283037736, + "language_loss": 0.71324873, + "learning_rate": 4.095276330969577e-08, + "loss": 0.73442948, + "num_input_tokens_seen": 336252440, + "step": 15591, + "time_per_iteration": 2.675104856491089 + }, + { + "auxiliary_loss_clip": 0.01097841, + "auxiliary_loss_mlp": 0.00771749, + "balance_loss_clip": 1.03754783, + "balance_loss_mlp": 1.00026131, + "epoch": 0.9374417555989779, + "flos": 27198849830400.0, + "grad_norm": 2.2483638992357844, + "language_loss": 0.53910917, + "learning_rate": 4.0874390411857804e-08, + "loss": 0.55780506, + "num_input_tokens_seen": 336273845, + "step": 15592, + "time_per_iteration": 2.620513439178467 + }, + { + "auxiliary_loss_clip": 0.01092328, + "auxiliary_loss_mlp": 0.01027667, + "balance_loss_clip": 1.03775334, + "balance_loss_mlp": 1.01602066, + "epoch": 0.9375018788516458, + "flos": 23621249266560.0, + "grad_norm": 1.5680593734812756, + "language_loss": 0.67480534, + "learning_rate": 4.0796091804259136e-08, + "loss": 0.69600528, + "num_input_tokens_seen": 336292790, + "step": 15593, + "time_per_iteration": 2.606893301010132 + }, + { + "auxiliary_loss_clip": 0.01086764, + "auxiliary_loss_mlp": 0.01028451, + "balance_loss_clip": 1.03426361, + "balance_loss_mlp": 1.01641703, + "epoch": 0.9375620021043138, + "flos": 22678774260480.0, + "grad_norm": 1.5375149732930165, + "language_loss": 0.74182671, + "learning_rate": 4.0717867489868715e-08, + "loss": 0.76297885, + "num_input_tokens_seen": 336312600, + "step": 15594, + "time_per_iteration": 2.6576709747314453 + }, + { + "auxiliary_loss_clip": 0.01093114, + "auxiliary_loss_mlp": 0.01027231, + "balance_loss_clip": 1.03431714, + "balance_loss_mlp": 1.01590586, + "epoch": 0.9376221253569819, + "flos": 27560254521600.0, + "grad_norm": 1.6954995365401158, + "language_loss": 0.74231362, + "learning_rate": 4.063971747165351e-08, + "loss": 0.76351708, + "num_input_tokens_seen": 336332770, + "step": 15595, + "time_per_iteration": 2.6582190990448 + }, + { + "auxiliary_loss_clip": 0.01080536, + "auxiliary_loss_mlp": 0.01029991, + "balance_loss_clip": 1.03524542, + "balance_loss_mlp": 1.01823688, + "epoch": 0.9376822486096498, + "flos": 24129887806080.0, + "grad_norm": 1.8418600900837818, + "language_loss": 0.75974333, + "learning_rate": 4.056164175257626e-08, + "loss": 0.78084862, + "num_input_tokens_seen": 336351445, + "step": 15596, + "time_per_iteration": 2.6803321838378906 + }, + { + "auxiliary_loss_clip": 0.01079836, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.03544092, + "balance_loss_mlp": 1.01825309, + "epoch": 0.9377423718623178, + "flos": 22784028088320.0, + "grad_norm": 1.7137269862110038, + "language_loss": 0.78881788, + "learning_rate": 4.0483640335597926e-08, + "loss": 0.80992472, + "num_input_tokens_seen": 336368690, + "step": 15597, + "time_per_iteration": 4.308673143386841 + }, + { + "auxiliary_loss_clip": 0.01113389, + "auxiliary_loss_mlp": 0.01033675, + "balance_loss_clip": 1.03775406, + "balance_loss_mlp": 1.02094936, + "epoch": 0.9378024951149857, + "flos": 19168900790400.0, + "grad_norm": 1.564327070136616, + "language_loss": 0.81037343, + "learning_rate": 4.0405713223676363e-08, + "loss": 0.83184403, + "num_input_tokens_seen": 336388165, + "step": 15598, + "time_per_iteration": 2.5458343029022217 + }, + { + "auxiliary_loss_clip": 0.01077427, + "auxiliary_loss_mlp": 0.01031736, + "balance_loss_clip": 1.03376913, + "balance_loss_mlp": 1.01846755, + "epoch": 0.9378626183676537, + "flos": 23505508667520.0, + "grad_norm": 2.005294265343008, + "language_loss": 0.62860727, + "learning_rate": 4.0327860419766994e-08, + "loss": 0.64969885, + "num_input_tokens_seen": 336406475, + "step": 15599, + "time_per_iteration": 2.638820171356201 + }, + { + "auxiliary_loss_clip": 0.01068952, + "auxiliary_loss_mlp": 0.01033888, + "balance_loss_clip": 1.03511238, + "balance_loss_mlp": 1.0210557, + "epoch": 0.9379227416203216, + "flos": 18405655672320.0, + "grad_norm": 1.8480598201397724, + "language_loss": 0.73232383, + "learning_rate": 4.0250081926821e-08, + "loss": 0.75335222, + "num_input_tokens_seen": 336424690, + "step": 15600, + "time_per_iteration": 6.016250848770142 + }, + { + "auxiliary_loss_clip": 0.01083039, + "auxiliary_loss_mlp": 0.01031732, + "balance_loss_clip": 1.03592873, + "balance_loss_mlp": 1.02013838, + "epoch": 0.9379828648729897, + "flos": 17821855923840.0, + "grad_norm": 1.7892851032269996, + "language_loss": 0.69339931, + "learning_rate": 4.0172377747788474e-08, + "loss": 0.71454704, + "num_input_tokens_seen": 336443055, + "step": 15601, + "time_per_iteration": 2.6296818256378174 + }, + { + "auxiliary_loss_clip": 0.01019215, + "auxiliary_loss_mlp": 0.01003727, + "balance_loss_clip": 1.00596642, + "balance_loss_mlp": 1.00267816, + "epoch": 0.9380429881256576, + "flos": 68024399466240.0, + "grad_norm": 0.7524579876237703, + "language_loss": 0.58074123, + "learning_rate": 4.009474788561573e-08, + "loss": 0.60097063, + "num_input_tokens_seen": 336510190, + "step": 15602, + "time_per_iteration": 3.3650712966918945 + }, + { + "auxiliary_loss_clip": 0.01035142, + "auxiliary_loss_mlp": 0.01039086, + "balance_loss_clip": 1.03298295, + "balance_loss_mlp": 1.02651513, + "epoch": 0.9381031113783256, + "flos": 20776980769920.0, + "grad_norm": 2.016134606608171, + "language_loss": 0.71942216, + "learning_rate": 4.001719234324663e-08, + "loss": 0.7401644, + "num_input_tokens_seen": 336529250, + "step": 15603, + "time_per_iteration": 4.292678356170654 + }, + { + "auxiliary_loss_clip": 0.01100161, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.03342152, + "balance_loss_mlp": 1.01796222, + "epoch": 0.9381632346309935, + "flos": 19025078734080.0, + "grad_norm": 1.630988444834905, + "language_loss": 0.76084709, + "learning_rate": 3.993971112362171e-08, + "loss": 0.78214121, + "num_input_tokens_seen": 336548530, + "step": 15604, + "time_per_iteration": 2.5863354206085205 + }, + { + "auxiliary_loss_clip": 0.01083382, + "auxiliary_loss_mlp": 0.01039836, + "balance_loss_clip": 1.03308749, + "balance_loss_mlp": 1.02494097, + "epoch": 0.9382233578836615, + "flos": 23513840622720.0, + "grad_norm": 2.0761522756468005, + "language_loss": 0.65524292, + "learning_rate": 3.9862304229679734e-08, + "loss": 0.67647505, + "num_input_tokens_seen": 336568510, + "step": 15605, + "time_per_iteration": 2.7903220653533936 + }, + { + "auxiliary_loss_clip": 0.01075306, + "auxiliary_loss_mlp": 0.00770626, + "balance_loss_clip": 1.03514504, + "balance_loss_mlp": 1.00017333, + "epoch": 0.9382834811363294, + "flos": 43067882016000.0, + "grad_norm": 2.098655820983203, + "language_loss": 0.67783493, + "learning_rate": 3.9784971664355683e-08, + "loss": 0.69629425, + "num_input_tokens_seen": 336592020, + "step": 15606, + "time_per_iteration": 2.8691816329956055 + }, + { + "auxiliary_loss_clip": 0.01091361, + "auxiliary_loss_mlp": 0.01027503, + "balance_loss_clip": 1.03324687, + "balance_loss_mlp": 1.01593983, + "epoch": 0.9383436043889974, + "flos": 16436242828800.0, + "grad_norm": 1.7643369071420325, + "language_loss": 0.77210492, + "learning_rate": 3.970771343058166e-08, + "loss": 0.7932936, + "num_input_tokens_seen": 336610010, + "step": 15607, + "time_per_iteration": 2.670970916748047 + }, + { + "auxiliary_loss_clip": 0.01098186, + "auxiliary_loss_mlp": 0.01027702, + "balance_loss_clip": 1.03540564, + "balance_loss_mlp": 1.01609111, + "epoch": 0.9384037276416655, + "flos": 20740603271040.0, + "grad_norm": 2.3923436535832927, + "language_loss": 0.82524753, + "learning_rate": 3.963052953128776e-08, + "loss": 0.84650642, + "num_input_tokens_seen": 336628520, + "step": 15608, + "time_per_iteration": 2.6184029579162598 + }, + { + "auxiliary_loss_clip": 0.01099685, + "auxiliary_loss_mlp": 0.01035669, + "balance_loss_clip": 1.0386703, + "balance_loss_mlp": 1.02291393, + "epoch": 0.9384638508943334, + "flos": 19062677295360.0, + "grad_norm": 1.6462700950548765, + "language_loss": 0.68830276, + "learning_rate": 3.9553419969400536e-08, + "loss": 0.7096563, + "num_input_tokens_seen": 336647365, + "step": 15609, + "time_per_iteration": 2.5987517833709717 + }, + { + "auxiliary_loss_clip": 0.01080403, + "auxiliary_loss_mlp": 0.01031835, + "balance_loss_clip": 1.03563523, + "balance_loss_mlp": 1.01835871, + "epoch": 0.9385239741470014, + "flos": 23404887694080.0, + "grad_norm": 2.499558460038554, + "language_loss": 0.75453949, + "learning_rate": 3.9476384747844316e-08, + "loss": 0.77566183, + "num_input_tokens_seen": 336667165, + "step": 15610, + "time_per_iteration": 2.7642691135406494 + }, + { + "auxiliary_loss_clip": 0.01044401, + "auxiliary_loss_mlp": 0.01027529, + "balance_loss_clip": 1.03432107, + "balance_loss_mlp": 1.0161804, + "epoch": 0.9385840973996693, + "flos": 12824742804480.0, + "grad_norm": 2.318341323536946, + "language_loss": 0.75083077, + "learning_rate": 3.939942386953987e-08, + "loss": 0.77155006, + "num_input_tokens_seen": 336684130, + "step": 15611, + "time_per_iteration": 2.753612518310547 + }, + { + "auxiliary_loss_clip": 0.01069021, + "auxiliary_loss_mlp": 0.01029199, + "balance_loss_clip": 1.03686237, + "balance_loss_mlp": 1.01732564, + "epoch": 0.9386442206523373, + "flos": 15486980152320.0, + "grad_norm": 1.8818734447956798, + "language_loss": 0.6593554, + "learning_rate": 3.9322537337405756e-08, + "loss": 0.68033767, + "num_input_tokens_seen": 336701520, + "step": 15612, + "time_per_iteration": 2.637763738632202 + }, + { + "auxiliary_loss_clip": 0.01095795, + "auxiliary_loss_mlp": 0.01028771, + "balance_loss_clip": 1.03593373, + "balance_loss_mlp": 1.01703501, + "epoch": 0.9387043439050052, + "flos": 21178821196800.0, + "grad_norm": 2.000722743721445, + "language_loss": 0.57039118, + "learning_rate": 3.924572515435742e-08, + "loss": 0.59163684, + "num_input_tokens_seen": 336720675, + "step": 15613, + "time_per_iteration": 2.733313798904419 + }, + { + "auxiliary_loss_clip": 0.01084485, + "auxiliary_loss_mlp": 0.01036056, + "balance_loss_clip": 1.03319824, + "balance_loss_mlp": 1.02405143, + "epoch": 0.9387644671576733, + "flos": 27668273696640.0, + "grad_norm": 2.367003168945266, + "language_loss": 0.70944715, + "learning_rate": 3.916898732330764e-08, + "loss": 0.73065257, + "num_input_tokens_seen": 336741005, + "step": 15614, + "time_per_iteration": 2.706362009048462 + }, + { + "auxiliary_loss_clip": 0.01101068, + "auxiliary_loss_mlp": 0.01031027, + "balance_loss_clip": 1.03795266, + "balance_loss_mlp": 1.018224, + "epoch": 0.9388245904103412, + "flos": 18836331742080.0, + "grad_norm": 1.9586081993753126, + "language_loss": 0.81213439, + "learning_rate": 3.9092323847166544e-08, + "loss": 0.83345532, + "num_input_tokens_seen": 336757990, + "step": 15615, + "time_per_iteration": 2.5509698390960693 + }, + { + "auxiliary_loss_clip": 0.01078844, + "auxiliary_loss_mlp": 0.01030958, + "balance_loss_clip": 1.03181601, + "balance_loss_mlp": 1.01881695, + "epoch": 0.9388847136630092, + "flos": 25483828083840.0, + "grad_norm": 1.8203668140159897, + "language_loss": 0.71924144, + "learning_rate": 3.901573472884134e-08, + "loss": 0.7403394, + "num_input_tokens_seen": 336777705, + "step": 15616, + "time_per_iteration": 2.6393303871154785 + }, + { + "auxiliary_loss_clip": 0.01108573, + "auxiliary_loss_mlp": 0.01029358, + "balance_loss_clip": 1.03755164, + "balance_loss_mlp": 1.01691222, + "epoch": 0.9389448369156771, + "flos": 18734992496640.0, + "grad_norm": 2.3142633085226536, + "language_loss": 0.66507453, + "learning_rate": 3.89392199712355e-08, + "loss": 0.68645382, + "num_input_tokens_seen": 336798275, + "step": 15617, + "time_per_iteration": 2.5801546573638916 + }, + { + "auxiliary_loss_clip": 0.01100466, + "auxiliary_loss_mlp": 0.01036181, + "balance_loss_clip": 1.03689265, + "balance_loss_mlp": 1.02243066, + "epoch": 0.9390049601683451, + "flos": 21717839664000.0, + "grad_norm": 2.370004086672154, + "language_loss": 0.73481232, + "learning_rate": 3.886277957725092e-08, + "loss": 0.7561788, + "num_input_tokens_seen": 336813835, + "step": 15618, + "time_per_iteration": 2.6102712154388428 + }, + { + "auxiliary_loss_clip": 0.01114877, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.03841376, + "balance_loss_mlp": 1.01817656, + "epoch": 0.939065083421013, + "flos": 19391224020480.0, + "grad_norm": 1.8942748596777075, + "language_loss": 0.70133412, + "learning_rate": 3.878641354978662e-08, + "loss": 0.7228024, + "num_input_tokens_seen": 336832210, + "step": 15619, + "time_per_iteration": 2.5149004459381104 + }, + { + "auxiliary_loss_clip": 0.01083274, + "auxiliary_loss_mlp": 0.01031368, + "balance_loss_clip": 1.03280878, + "balance_loss_mlp": 1.01836836, + "epoch": 0.939125206673681, + "flos": 24681511946880.0, + "grad_norm": 1.6109808579498737, + "language_loss": 0.7760632, + "learning_rate": 3.8710121891737834e-08, + "loss": 0.79720962, + "num_input_tokens_seen": 336851380, + "step": 15620, + "time_per_iteration": 2.6531193256378174 + }, + { + "auxiliary_loss_clip": 0.01092968, + "auxiliary_loss_mlp": 0.01027438, + "balance_loss_clip": 1.03448093, + "balance_loss_mlp": 1.01568961, + "epoch": 0.9391853299263491, + "flos": 16325961096960.0, + "grad_norm": 3.857357976396781, + "language_loss": 0.73641354, + "learning_rate": 3.8633904605998025e-08, + "loss": 0.75761759, + "num_input_tokens_seen": 336868525, + "step": 15621, + "time_per_iteration": 2.5519356727600098 + }, + { + "auxiliary_loss_clip": 0.01077862, + "auxiliary_loss_mlp": 0.01033031, + "balance_loss_clip": 1.03406405, + "balance_loss_mlp": 1.01961446, + "epoch": 0.939245453179017, + "flos": 11655778590720.0, + "grad_norm": 2.005738336602588, + "language_loss": 0.66011965, + "learning_rate": 3.855776169545688e-08, + "loss": 0.68122858, + "num_input_tokens_seen": 336886200, + "step": 15622, + "time_per_iteration": 2.649592876434326 + }, + { + "auxiliary_loss_clip": 0.01080227, + "auxiliary_loss_mlp": 0.01039822, + "balance_loss_clip": 1.03199553, + "balance_loss_mlp": 1.02594018, + "epoch": 0.939305576431685, + "flos": 23148700917120.0, + "grad_norm": 1.5853407957277033, + "language_loss": 0.71721888, + "learning_rate": 3.848169316300209e-08, + "loss": 0.73841941, + "num_input_tokens_seen": 336905815, + "step": 15623, + "time_per_iteration": 2.6309492588043213 + }, + { + "auxiliary_loss_clip": 0.01101847, + "auxiliary_loss_mlp": 0.01031834, + "balance_loss_clip": 1.03930306, + "balance_loss_mlp": 1.01934707, + "epoch": 0.9393656996843529, + "flos": 33287790706560.0, + "grad_norm": 1.923924688159949, + "language_loss": 0.72363102, + "learning_rate": 3.84056990115178e-08, + "loss": 0.74496788, + "num_input_tokens_seen": 336928460, + "step": 15624, + "time_per_iteration": 2.7837047576904297 + }, + { + "auxiliary_loss_clip": 0.01071928, + "auxiliary_loss_mlp": 0.01033178, + "balance_loss_clip": 1.03422618, + "balance_loss_mlp": 1.02049983, + "epoch": 0.9394258229370209, + "flos": 21689434984320.0, + "grad_norm": 2.3403915430461333, + "language_loss": 0.89429879, + "learning_rate": 3.832977924388614e-08, + "loss": 0.91534984, + "num_input_tokens_seen": 336948320, + "step": 15625, + "time_per_iteration": 2.7144711017608643 + }, + { + "auxiliary_loss_clip": 0.01096935, + "auxiliary_loss_mlp": 0.01031047, + "balance_loss_clip": 1.03645694, + "balance_loss_mlp": 1.01787996, + "epoch": 0.9394859461896888, + "flos": 23874203819520.0, + "grad_norm": 2.0332287450304074, + "language_loss": 0.83621097, + "learning_rate": 3.825393386298592e-08, + "loss": 0.85749084, + "num_input_tokens_seen": 336967670, + "step": 15626, + "time_per_iteration": 2.71279239654541 + }, + { + "auxiliary_loss_clip": 0.01012548, + "auxiliary_loss_mlp": 0.01006796, + "balance_loss_clip": 1.00825083, + "balance_loss_mlp": 1.00575864, + "epoch": 0.9395460694423569, + "flos": 61566116993280.0, + "grad_norm": 0.7779274792928904, + "language_loss": 0.56076801, + "learning_rate": 3.8178162871693284e-08, + "loss": 0.58096135, + "num_input_tokens_seen": 337028395, + "step": 15627, + "time_per_iteration": 3.1956591606140137 + }, + { + "auxiliary_loss_clip": 0.01058297, + "auxiliary_loss_mlp": 0.01041812, + "balance_loss_clip": 1.0335449, + "balance_loss_mlp": 1.02838874, + "epoch": 0.9396061926950248, + "flos": 20995712640000.0, + "grad_norm": 1.8076515347951383, + "language_loss": 0.70110631, + "learning_rate": 3.810246627288105e-08, + "loss": 0.72210741, + "num_input_tokens_seen": 337048150, + "step": 15628, + "time_per_iteration": 2.6945135593414307 + }, + { + "auxiliary_loss_clip": 0.01096653, + "auxiliary_loss_mlp": 0.01028484, + "balance_loss_clip": 1.03629029, + "balance_loss_mlp": 1.01632452, + "epoch": 0.9396663159476928, + "flos": 27487786832640.0, + "grad_norm": 1.4683164605088868, + "language_loss": 0.75408161, + "learning_rate": 3.8026844069420025e-08, + "loss": 0.77533293, + "num_input_tokens_seen": 337069315, + "step": 15629, + "time_per_iteration": 2.697967052459717 + }, + { + "auxiliary_loss_clip": 0.01044306, + "auxiliary_loss_mlp": 0.01039724, + "balance_loss_clip": 1.030352, + "balance_loss_mlp": 1.02693844, + "epoch": 0.9397264392003607, + "flos": 19427457864960.0, + "grad_norm": 1.8515111751581672, + "language_loss": 0.74173099, + "learning_rate": 3.795129626417748e-08, + "loss": 0.76257128, + "num_input_tokens_seen": 337087765, + "step": 15630, + "time_per_iteration": 2.7693710327148438 + }, + { + "auxiliary_loss_clip": 0.01073693, + "auxiliary_loss_mlp": 0.01035386, + "balance_loss_clip": 1.03482318, + "balance_loss_mlp": 1.02306604, + "epoch": 0.9397865624530287, + "flos": 18004820826240.0, + "grad_norm": 2.3868141185330485, + "language_loss": 0.69397956, + "learning_rate": 3.787582286001845e-08, + "loss": 0.71507031, + "num_input_tokens_seen": 337106265, + "step": 15631, + "time_per_iteration": 2.7210657596588135 + }, + { + "auxiliary_loss_clip": 0.01057041, + "auxiliary_loss_mlp": 0.01038236, + "balance_loss_clip": 1.0333792, + "balance_loss_mlp": 1.02626777, + "epoch": 0.9398466857056966, + "flos": 22564613859840.0, + "grad_norm": 1.5129301375877884, + "language_loss": 0.75246739, + "learning_rate": 3.7800423859805086e-08, + "loss": 0.77342016, + "num_input_tokens_seen": 337126090, + "step": 15632, + "time_per_iteration": 2.7409205436706543 + }, + { + "auxiliary_loss_clip": 0.01103425, + "auxiliary_loss_mlp": 0.01036148, + "balance_loss_clip": 1.03828955, + "balance_loss_mlp": 1.02260005, + "epoch": 0.9399068089583646, + "flos": 24535678728960.0, + "grad_norm": 1.6016514710570828, + "language_loss": 0.74265265, + "learning_rate": 3.772509926639622e-08, + "loss": 0.76404846, + "num_input_tokens_seen": 337145655, + "step": 15633, + "time_per_iteration": 2.5950539112091064 + }, + { + "auxiliary_loss_clip": 0.01110088, + "auxiliary_loss_mlp": 0.01034464, + "balance_loss_clip": 1.03653955, + "balance_loss_mlp": 1.0211246, + "epoch": 0.9399669322110327, + "flos": 25630343660160.0, + "grad_norm": 1.9203445491908095, + "language_loss": 0.72707498, + "learning_rate": 3.764984908264823e-08, + "loss": 0.74852049, + "num_input_tokens_seen": 337164805, + "step": 15634, + "time_per_iteration": 2.5872409343719482 + }, + { + "auxiliary_loss_clip": 0.01098967, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.03486001, + "balance_loss_mlp": 1.01823497, + "epoch": 0.9400270554637006, + "flos": 17089385783040.0, + "grad_norm": 2.514594285895435, + "language_loss": 0.68870479, + "learning_rate": 3.75746733114144e-08, + "loss": 0.71000671, + "num_input_tokens_seen": 337182280, + "step": 15635, + "time_per_iteration": 2.600447654724121 + }, + { + "auxiliary_loss_clip": 0.01056848, + "auxiliary_loss_mlp": 0.01029105, + "balance_loss_clip": 1.03640127, + "balance_loss_mlp": 1.01715422, + "epoch": 0.9400871787163686, + "flos": 22055113393920.0, + "grad_norm": 1.5676691824914186, + "language_loss": 0.74045342, + "learning_rate": 3.7499571955545985e-08, + "loss": 0.76131296, + "num_input_tokens_seen": 337203495, + "step": 15636, + "time_per_iteration": 2.6919074058532715 + }, + { + "auxiliary_loss_clip": 0.01099321, + "auxiliary_loss_mlp": 0.01032818, + "balance_loss_clip": 1.03794205, + "balance_loss_mlp": 1.02044368, + "epoch": 0.9401473019690365, + "flos": 16982767238400.0, + "grad_norm": 2.177328379740788, + "language_loss": 0.82646513, + "learning_rate": 3.7424545017890054e-08, + "loss": 0.84778643, + "num_input_tokens_seen": 337220435, + "step": 15637, + "time_per_iteration": 4.119058132171631 + }, + { + "auxiliary_loss_clip": 0.01065361, + "auxiliary_loss_mlp": 0.01033669, + "balance_loss_clip": 1.03724432, + "balance_loss_mlp": 1.02082443, + "epoch": 0.9402074252217045, + "flos": 19681956702720.0, + "grad_norm": 2.151104061404543, + "language_loss": 0.6892854, + "learning_rate": 3.7349592501292325e-08, + "loss": 0.71027565, + "num_input_tokens_seen": 337238095, + "step": 15638, + "time_per_iteration": 2.720820426940918 + }, + { + "auxiliary_loss_clip": 0.01093316, + "auxiliary_loss_mlp": 0.01038281, + "balance_loss_clip": 1.03545761, + "balance_loss_mlp": 1.02702212, + "epoch": 0.9402675484743724, + "flos": 24754302858240.0, + "grad_norm": 1.6914013320453911, + "language_loss": 0.84974968, + "learning_rate": 3.727471440859498e-08, + "loss": 0.87106568, + "num_input_tokens_seen": 337256645, + "step": 15639, + "time_per_iteration": 5.851804733276367 + }, + { + "auxiliary_loss_clip": 0.01083189, + "auxiliary_loss_mlp": 0.00770067, + "balance_loss_clip": 1.03247952, + "balance_loss_mlp": 1.00016117, + "epoch": 0.9403276717270405, + "flos": 25558630156800.0, + "grad_norm": 1.7768265457850463, + "language_loss": 0.78339088, + "learning_rate": 3.719991074263662e-08, + "loss": 0.80192345, + "num_input_tokens_seen": 337278360, + "step": 15640, + "time_per_iteration": 2.7363038063049316 + }, + { + "auxiliary_loss_clip": 0.01100045, + "auxiliary_loss_mlp": 0.0103256, + "balance_loss_clip": 1.03647268, + "balance_loss_mlp": 1.0201323, + "epoch": 0.9403877949797084, + "flos": 26689852154880.0, + "grad_norm": 1.8593795246940288, + "language_loss": 0.74102533, + "learning_rate": 3.7125181506254544e-08, + "loss": 0.76235145, + "num_input_tokens_seen": 337302480, + "step": 15641, + "time_per_iteration": 2.7518787384033203 + }, + { + "auxiliary_loss_clip": 0.0110061, + "auxiliary_loss_mlp": 0.01034479, + "balance_loss_clip": 1.03686595, + "balance_loss_mlp": 1.01987553, + "epoch": 0.9404479182323764, + "flos": 15011666455680.0, + "grad_norm": 2.217224537042475, + "language_loss": 0.8267206, + "learning_rate": 3.7050526702282256e-08, + "loss": 0.84807152, + "num_input_tokens_seen": 337316600, + "step": 15642, + "time_per_iteration": 4.0844972133636475 + }, + { + "auxiliary_loss_clip": 0.01090346, + "auxiliary_loss_mlp": 0.01030611, + "balance_loss_clip": 1.03500628, + "balance_loss_mlp": 1.01894593, + "epoch": 0.9405080414850443, + "flos": 24973573432320.0, + "grad_norm": 1.8062008321344256, + "language_loss": 0.68693364, + "learning_rate": 3.697594633355084e-08, + "loss": 0.70814323, + "num_input_tokens_seen": 337336895, + "step": 15643, + "time_per_iteration": 2.57680344581604 + }, + { + "auxiliary_loss_clip": 0.01098869, + "auxiliary_loss_mlp": 0.01036038, + "balance_loss_clip": 1.03659177, + "balance_loss_mlp": 1.02258563, + "epoch": 0.9405681647377123, + "flos": 20844743777280.0, + "grad_norm": 1.933266647542814, + "language_loss": 0.76611924, + "learning_rate": 3.6901440402888226e-08, + "loss": 0.78746843, + "num_input_tokens_seen": 337355105, + "step": 15644, + "time_per_iteration": 2.573357343673706 + }, + { + "auxiliary_loss_clip": 0.01090012, + "auxiliary_loss_mlp": 0.0103297, + "balance_loss_clip": 1.03489494, + "balance_loss_mlp": 1.02147841, + "epoch": 0.9406282879903802, + "flos": 23805578885760.0, + "grad_norm": 1.6178233820471488, + "language_loss": 0.67622656, + "learning_rate": 3.682700891311974e-08, + "loss": 0.69745636, + "num_input_tokens_seen": 337374905, + "step": 15645, + "time_per_iteration": 2.615952730178833 + }, + { + "auxiliary_loss_clip": 0.01077394, + "auxiliary_loss_mlp": 0.00769887, + "balance_loss_clip": 1.03552616, + "balance_loss_mlp": 1.00019598, + "epoch": 0.9406884112430483, + "flos": 27674953626240.0, + "grad_norm": 1.3954115728125809, + "language_loss": 0.70446187, + "learning_rate": 3.6752651867067774e-08, + "loss": 0.72293472, + "num_input_tokens_seen": 337397130, + "step": 15646, + "time_per_iteration": 2.6904642581939697 + }, + { + "auxiliary_loss_clip": 0.01090467, + "auxiliary_loss_mlp": 0.01031509, + "balance_loss_clip": 1.03259134, + "balance_loss_mlp": 1.0194633, + "epoch": 0.9407485344957163, + "flos": 23075048079360.0, + "grad_norm": 1.4996623163528855, + "language_loss": 0.74028134, + "learning_rate": 3.667836926755208e-08, + "loss": 0.76150107, + "num_input_tokens_seen": 337418660, + "step": 15647, + "time_per_iteration": 2.6018729209899902 + }, + { + "auxiliary_loss_clip": 0.01010109, + "auxiliary_loss_mlp": 0.01000406, + "balance_loss_clip": 1.00723958, + "balance_loss_mlp": 0.99945861, + "epoch": 0.9408086577483842, + "flos": 71014034304000.0, + "grad_norm": 0.8881598455052471, + "language_loss": 0.63527632, + "learning_rate": 3.660416111738907e-08, + "loss": 0.65538144, + "num_input_tokens_seen": 337478055, + "step": 15648, + "time_per_iteration": 3.2934350967407227 + }, + { + "auxiliary_loss_clip": 0.01104482, + "auxiliary_loss_mlp": 0.01034208, + "balance_loss_clip": 1.03579104, + "balance_loss_mlp": 1.02340806, + "epoch": 0.9408687810010522, + "flos": 23730956380800.0, + "grad_norm": 1.5536213392749576, + "language_loss": 0.66520309, + "learning_rate": 3.653002741939337e-08, + "loss": 0.68659002, + "num_input_tokens_seen": 337499405, + "step": 15649, + "time_per_iteration": 2.553529739379883 + }, + { + "auxiliary_loss_clip": 0.01075375, + "auxiliary_loss_mlp": 0.01026505, + "balance_loss_clip": 1.0331924, + "balance_loss_mlp": 1.01497114, + "epoch": 0.9409289042537201, + "flos": 18369314087040.0, + "grad_norm": 4.521362372265656, + "language_loss": 0.77431417, + "learning_rate": 3.645596817637586e-08, + "loss": 0.79533303, + "num_input_tokens_seen": 337517195, + "step": 15650, + "time_per_iteration": 2.665523052215576 + }, + { + "auxiliary_loss_clip": 0.01064771, + "auxiliary_loss_mlp": 0.01032458, + "balance_loss_clip": 1.03697872, + "balance_loss_mlp": 1.0203402, + "epoch": 0.9409890275063881, + "flos": 23878333883520.0, + "grad_norm": 2.2066131550931942, + "language_loss": 0.74314982, + "learning_rate": 3.638198339114451e-08, + "loss": 0.76412213, + "num_input_tokens_seen": 337535245, + "step": 15651, + "time_per_iteration": 2.790637969970703 + }, + { + "auxiliary_loss_clip": 0.0110668, + "auxiliary_loss_mlp": 0.01032091, + "balance_loss_clip": 1.03554559, + "balance_loss_mlp": 1.01934731, + "epoch": 0.941049150759056, + "flos": 16545088016640.0, + "grad_norm": 1.7324074128675258, + "language_loss": 0.72721291, + "learning_rate": 3.630807306650507e-08, + "loss": 0.74860054, + "num_input_tokens_seen": 337553040, + "step": 15652, + "time_per_iteration": 2.5541346073150635 + }, + { + "auxiliary_loss_clip": 0.01073797, + "auxiliary_loss_mlp": 0.01037686, + "balance_loss_clip": 1.03517735, + "balance_loss_mlp": 1.02407205, + "epoch": 0.9411092740117241, + "flos": 25118401069440.0, + "grad_norm": 1.8143834266468624, + "language_loss": 0.66641271, + "learning_rate": 3.6234237205260645e-08, + "loss": 0.68752754, + "num_input_tokens_seen": 337574580, + "step": 15653, + "time_per_iteration": 2.7330899238586426 + }, + { + "auxiliary_loss_clip": 0.01109084, + "auxiliary_loss_mlp": 0.01035177, + "balance_loss_clip": 1.03644657, + "balance_loss_mlp": 1.02239227, + "epoch": 0.941169397264392, + "flos": 21142264129920.0, + "grad_norm": 1.9183010885495058, + "language_loss": 0.77979028, + "learning_rate": 3.6160475810210536e-08, + "loss": 0.80123287, + "num_input_tokens_seen": 337593010, + "step": 15654, + "time_per_iteration": 2.5508615970611572 + }, + { + "auxiliary_loss_clip": 0.01104499, + "auxiliary_loss_mlp": 0.01029373, + "balance_loss_clip": 1.03763437, + "balance_loss_mlp": 1.01693344, + "epoch": 0.94122952051706, + "flos": 38508914995200.0, + "grad_norm": 1.5713777366197268, + "language_loss": 0.69984704, + "learning_rate": 3.6086788884152065e-08, + "loss": 0.7211858, + "num_input_tokens_seen": 337616170, + "step": 15655, + "time_per_iteration": 2.7416152954101562 + }, + { + "auxiliary_loss_clip": 0.01107647, + "auxiliary_loss_mlp": 0.01036833, + "balance_loss_clip": 1.03607106, + "balance_loss_mlp": 1.02323759, + "epoch": 0.9412896437697279, + "flos": 18369206346240.0, + "grad_norm": 5.7482907771024045, + "language_loss": 0.72394556, + "learning_rate": 3.601317642987944e-08, + "loss": 0.74539036, + "num_input_tokens_seen": 337635215, + "step": 15656, + "time_per_iteration": 2.569613456726074 + }, + { + "auxiliary_loss_clip": 0.01074485, + "auxiliary_loss_mlp": 0.0102962, + "balance_loss_clip": 1.03419089, + "balance_loss_mlp": 1.01772296, + "epoch": 0.9413497670223959, + "flos": 25884950238720.0, + "grad_norm": 1.8612272314279366, + "language_loss": 0.78241754, + "learning_rate": 3.593963845018377e-08, + "loss": 0.80345851, + "num_input_tokens_seen": 337654195, + "step": 15657, + "time_per_iteration": 2.6432650089263916 + }, + { + "auxiliary_loss_clip": 0.01072209, + "auxiliary_loss_mlp": 0.01029018, + "balance_loss_clip": 1.03471482, + "balance_loss_mlp": 1.01653671, + "epoch": 0.9414098902750638, + "flos": 16618309891200.0, + "grad_norm": 2.5566622926725193, + "language_loss": 0.84468395, + "learning_rate": 3.586617494785371e-08, + "loss": 0.86569619, + "num_input_tokens_seen": 337671810, + "step": 15658, + "time_per_iteration": 2.6943564414978027 + }, + { + "auxiliary_loss_clip": 0.01112714, + "auxiliary_loss_mlp": 0.01033078, + "balance_loss_clip": 1.03760839, + "balance_loss_mlp": 1.01849866, + "epoch": 0.9414700135277319, + "flos": 18625033987200.0, + "grad_norm": 2.5090872722582627, + "language_loss": 0.70395422, + "learning_rate": 3.5792785925675254e-08, + "loss": 0.72541213, + "num_input_tokens_seen": 337689410, + "step": 15659, + "time_per_iteration": 2.537353038787842 + }, + { + "auxiliary_loss_clip": 0.01079214, + "auxiliary_loss_mlp": 0.01039877, + "balance_loss_clip": 1.03404224, + "balance_loss_mlp": 1.02849793, + "epoch": 0.9415301367803999, + "flos": 26280146649600.0, + "grad_norm": 1.7562040343891887, + "language_loss": 0.79511106, + "learning_rate": 3.571947138643172e-08, + "loss": 0.81630188, + "num_input_tokens_seen": 337709950, + "step": 15660, + "time_per_iteration": 2.7002146244049072 + }, + { + "auxiliary_loss_clip": 0.01071861, + "auxiliary_loss_mlp": 0.01028763, + "balance_loss_clip": 1.03252554, + "balance_loss_mlp": 1.0167948, + "epoch": 0.9415902600330678, + "flos": 23261388860160.0, + "grad_norm": 1.4022153462876712, + "language_loss": 0.67921788, + "learning_rate": 3.564623133290201e-08, + "loss": 0.70022404, + "num_input_tokens_seen": 337731320, + "step": 15661, + "time_per_iteration": 2.755877733230591 + }, + { + "auxiliary_loss_clip": 0.01092284, + "auxiliary_loss_mlp": 0.01031361, + "balance_loss_clip": 1.03276324, + "balance_loss_mlp": 1.01883173, + "epoch": 0.9416503832857358, + "flos": 14719138093440.0, + "grad_norm": 2.2599485934603725, + "language_loss": 0.66300029, + "learning_rate": 3.557306576786434e-08, + "loss": 0.68423676, + "num_input_tokens_seen": 337747720, + "step": 15662, + "time_per_iteration": 2.5741324424743652 + }, + { + "auxiliary_loss_clip": 0.01009662, + "auxiliary_loss_mlp": 0.01000042, + "balance_loss_clip": 1.00710607, + "balance_loss_mlp": 0.99910659, + "epoch": 0.9417105065384037, + "flos": 70312698276480.0, + "grad_norm": 0.7619674820211261, + "language_loss": 0.59235966, + "learning_rate": 3.5499974694092935e-08, + "loss": 0.61245668, + "num_input_tokens_seen": 337806930, + "step": 15663, + "time_per_iteration": 3.3059024810791016 + }, + { + "auxiliary_loss_clip": 0.01103713, + "auxiliary_loss_mlp": 0.01035518, + "balance_loss_clip": 1.03747571, + "balance_loss_mlp": 1.02217829, + "epoch": 0.9417706297910717, + "flos": 34057895322240.0, + "grad_norm": 4.1621354717950885, + "language_loss": 0.66886747, + "learning_rate": 3.542695811435914e-08, + "loss": 0.69025975, + "num_input_tokens_seen": 337828100, + "step": 15664, + "time_per_iteration": 2.7219324111938477 + }, + { + "auxiliary_loss_clip": 0.01083442, + "auxiliary_loss_mlp": 0.01030443, + "balance_loss_clip": 1.03674304, + "balance_loss_mlp": 1.01874244, + "epoch": 0.9418307530437396, + "flos": 16471614746880.0, + "grad_norm": 2.258809019140803, + "language_loss": 0.73858142, + "learning_rate": 3.535401603143207e-08, + "loss": 0.75972033, + "num_input_tokens_seen": 337844805, + "step": 15665, + "time_per_iteration": 2.6257636547088623 + }, + { + "auxiliary_loss_clip": 0.01105775, + "auxiliary_loss_mlp": 0.01032956, + "balance_loss_clip": 1.03694832, + "balance_loss_mlp": 1.02096939, + "epoch": 0.9418908762964077, + "flos": 11253543114240.0, + "grad_norm": 3.0049644569052907, + "language_loss": 0.63581872, + "learning_rate": 3.528114844807773e-08, + "loss": 0.65720612, + "num_input_tokens_seen": 337860490, + "step": 15666, + "time_per_iteration": 2.5537686347961426 + }, + { + "auxiliary_loss_clip": 0.01072039, + "auxiliary_loss_mlp": 0.01029378, + "balance_loss_clip": 1.0352211, + "balance_loss_mlp": 1.01712298, + "epoch": 0.9419509995490756, + "flos": 18438836860800.0, + "grad_norm": 1.6687010232077268, + "language_loss": 0.78841943, + "learning_rate": 3.520835536705902e-08, + "loss": 0.80943358, + "num_input_tokens_seen": 337878360, + "step": 15667, + "time_per_iteration": 2.66939377784729 + }, + { + "auxiliary_loss_clip": 0.01105116, + "auxiliary_loss_mlp": 0.01027413, + "balance_loss_clip": 1.03544164, + "balance_loss_mlp": 1.01629639, + "epoch": 0.9420111228017436, + "flos": 20737945664640.0, + "grad_norm": 1.8566898819332656, + "language_loss": 0.75282031, + "learning_rate": 3.5135636791136404e-08, + "loss": 0.7741456, + "num_input_tokens_seen": 337895635, + "step": 15668, + "time_per_iteration": 2.5508882999420166 + }, + { + "auxiliary_loss_clip": 0.0105425, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.03423977, + "balance_loss_mlp": 1.01744199, + "epoch": 0.9420712460544115, + "flos": 21141940907520.0, + "grad_norm": 2.159724886055292, + "language_loss": 0.59023595, + "learning_rate": 3.506299272306723e-08, + "loss": 0.61107475, + "num_input_tokens_seen": 337913940, + "step": 15669, + "time_per_iteration": 2.73180890083313 + }, + { + "auxiliary_loss_clip": 0.01067029, + "auxiliary_loss_mlp": 0.01027243, + "balance_loss_clip": 1.03234708, + "balance_loss_mlp": 1.01523852, + "epoch": 0.9421313693070795, + "flos": 15851760721920.0, + "grad_norm": 1.528198079062627, + "language_loss": 0.77025855, + "learning_rate": 3.4990423165606406e-08, + "loss": 0.79120123, + "num_input_tokens_seen": 337932015, + "step": 15670, + "time_per_iteration": 2.69807767868042 + }, + { + "auxiliary_loss_clip": 0.01109553, + "auxiliary_loss_mlp": 0.01036081, + "balance_loss_clip": 1.03725696, + "balance_loss_mlp": 1.02321219, + "epoch": 0.9421914925597474, + "flos": 32415915882240.0, + "grad_norm": 1.8154793470935222, + "language_loss": 0.65174937, + "learning_rate": 3.491792812150574e-08, + "loss": 0.67320567, + "num_input_tokens_seen": 337953345, + "step": 15671, + "time_per_iteration": 2.7444138526916504 + }, + { + "auxiliary_loss_clip": 0.01082811, + "auxiliary_loss_mlp": 0.01033924, + "balance_loss_clip": 1.03383374, + "balance_loss_mlp": 1.02096009, + "epoch": 0.9422516158124155, + "flos": 19718513769600.0, + "grad_norm": 1.558684648583432, + "language_loss": 0.79916745, + "learning_rate": 3.48455075935139e-08, + "loss": 0.82033479, + "num_input_tokens_seen": 337973685, + "step": 15672, + "time_per_iteration": 2.803809881210327 + }, + { + "auxiliary_loss_clip": 0.01075344, + "auxiliary_loss_mlp": 0.01036959, + "balance_loss_clip": 1.03470707, + "balance_loss_mlp": 1.02285063, + "epoch": 0.9423117390650835, + "flos": 16253277926400.0, + "grad_norm": 1.9824694157705243, + "language_loss": 0.73236197, + "learning_rate": 3.47731615843776e-08, + "loss": 0.75348502, + "num_input_tokens_seen": 337989175, + "step": 15673, + "time_per_iteration": 2.755509614944458 + }, + { + "auxiliary_loss_clip": 0.01091118, + "auxiliary_loss_mlp": 0.01030642, + "balance_loss_clip": 1.03414345, + "balance_loss_mlp": 1.01794672, + "epoch": 0.9423718623177514, + "flos": 31796564647680.0, + "grad_norm": 1.4558092155999423, + "language_loss": 0.70178533, + "learning_rate": 3.470089009683974e-08, + "loss": 0.72300291, + "num_input_tokens_seen": 338011800, + "step": 15674, + "time_per_iteration": 2.695003032684326 + }, + { + "auxiliary_loss_clip": 0.01107385, + "auxiliary_loss_mlp": 0.01025942, + "balance_loss_clip": 1.03574955, + "balance_loss_mlp": 1.01402664, + "epoch": 0.9424319855704194, + "flos": 23331809473920.0, + "grad_norm": 1.9582607226770616, + "language_loss": 0.81163412, + "learning_rate": 3.462869313364125e-08, + "loss": 0.8329674, + "num_input_tokens_seen": 338032120, + "step": 15675, + "time_per_iteration": 2.6292521953582764 + }, + { + "auxiliary_loss_clip": 0.01081718, + "auxiliary_loss_mlp": 0.01031642, + "balance_loss_clip": 1.03442502, + "balance_loss_mlp": 1.01966715, + "epoch": 0.9424921088230873, + "flos": 20777627214720.0, + "grad_norm": 1.7260765463945456, + "language_loss": 0.62643492, + "learning_rate": 3.4556570697519494e-08, + "loss": 0.64756858, + "num_input_tokens_seen": 338051880, + "step": 15676, + "time_per_iteration": 4.179499387741089 + }, + { + "auxiliary_loss_clip": 0.01092941, + "auxiliary_loss_mlp": 0.01038222, + "balance_loss_clip": 1.03998232, + "balance_loss_mlp": 1.02615166, + "epoch": 0.9425522320757553, + "flos": 19026658932480.0, + "grad_norm": 1.777162834544334, + "language_loss": 0.67122662, + "learning_rate": 3.448452279120984e-08, + "loss": 0.69253826, + "num_input_tokens_seen": 338069665, + "step": 15677, + "time_per_iteration": 2.6239006519317627 + }, + { + "auxiliary_loss_clip": 0.01072255, + "auxiliary_loss_mlp": 0.01035548, + "balance_loss_clip": 1.03186798, + "balance_loss_mlp": 1.02190459, + "epoch": 0.9426123553284232, + "flos": 25155353185920.0, + "grad_norm": 2.176683290780186, + "language_loss": 0.641137, + "learning_rate": 3.441254941744387e-08, + "loss": 0.66221505, + "num_input_tokens_seen": 338090490, + "step": 15678, + "time_per_iteration": 4.263075113296509 + }, + { + "auxiliary_loss_clip": 0.01082508, + "auxiliary_loss_mlp": 0.01028954, + "balance_loss_clip": 1.04040313, + "balance_loss_mlp": 1.01706934, + "epoch": 0.9426724785810913, + "flos": 21179359900800.0, + "grad_norm": 1.4630832933179898, + "language_loss": 0.74250793, + "learning_rate": 3.434065057895097e-08, + "loss": 0.76362252, + "num_input_tokens_seen": 338109825, + "step": 15679, + "time_per_iteration": 4.329301357269287 + }, + { + "auxiliary_loss_clip": 0.01089711, + "auxiliary_loss_mlp": 0.01034379, + "balance_loss_clip": 1.0365119, + "balance_loss_mlp": 1.02231526, + "epoch": 0.9427326018337592, + "flos": 14756916222720.0, + "grad_norm": 3.717209623940925, + "language_loss": 0.77565658, + "learning_rate": 3.426882627845762e-08, + "loss": 0.79689747, + "num_input_tokens_seen": 338125790, + "step": 15680, + "time_per_iteration": 2.6704599857330322 + }, + { + "auxiliary_loss_clip": 0.01097961, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.0371449, + "balance_loss_mlp": 1.02055609, + "epoch": 0.9427927250864272, + "flos": 20923640000640.0, + "grad_norm": 2.190384071517057, + "language_loss": 0.75626266, + "learning_rate": 3.419707651868742e-08, + "loss": 0.77757394, + "num_input_tokens_seen": 338145610, + "step": 15681, + "time_per_iteration": 2.6899359226226807 + }, + { + "auxiliary_loss_clip": 0.01082824, + "auxiliary_loss_mlp": 0.0103551, + "balance_loss_clip": 1.03667855, + "balance_loss_mlp": 1.02248073, + "epoch": 0.9428528483390951, + "flos": 19752520970880.0, + "grad_norm": 2.3199120236961144, + "language_loss": 0.65754902, + "learning_rate": 3.412540130236086e-08, + "loss": 0.6787324, + "num_input_tokens_seen": 338165960, + "step": 15682, + "time_per_iteration": 4.124305963516235 + }, + { + "auxiliary_loss_clip": 0.01071222, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.03226089, + "balance_loss_mlp": 1.01655436, + "epoch": 0.9429129715917631, + "flos": 24534996370560.0, + "grad_norm": 3.097159468574502, + "language_loss": 0.76566684, + "learning_rate": 3.405380063219665e-08, + "loss": 0.78666705, + "num_input_tokens_seen": 338187215, + "step": 15683, + "time_per_iteration": 2.71305775642395 + }, + { + "auxiliary_loss_clip": 0.01100547, + "auxiliary_loss_mlp": 0.01040046, + "balance_loss_clip": 1.03645873, + "balance_loss_mlp": 1.02684927, + "epoch": 0.942973094844431, + "flos": 17959824063360.0, + "grad_norm": 2.6265213886695826, + "language_loss": 0.75407404, + "learning_rate": 3.398227451090885e-08, + "loss": 0.77547991, + "num_input_tokens_seen": 338201825, + "step": 15684, + "time_per_iteration": 2.6331522464752197 + }, + { + "auxiliary_loss_clip": 0.01104685, + "auxiliary_loss_mlp": 0.01027432, + "balance_loss_clip": 1.03484631, + "balance_loss_mlp": 1.01599407, + "epoch": 0.9430332180970991, + "flos": 26137689310080.0, + "grad_norm": 1.6361176573488942, + "language_loss": 0.77129638, + "learning_rate": 3.391082294121017e-08, + "loss": 0.79261756, + "num_input_tokens_seen": 338220865, + "step": 15685, + "time_per_iteration": 2.7566094398498535 + }, + { + "auxiliary_loss_clip": 0.01092602, + "auxiliary_loss_mlp": 0.01030988, + "balance_loss_clip": 1.03414559, + "balance_loss_mlp": 1.01951969, + "epoch": 0.943093341349767, + "flos": 23951376190080.0, + "grad_norm": 2.1272798688132775, + "language_loss": 0.75766367, + "learning_rate": 3.383944592581023e-08, + "loss": 0.77889955, + "num_input_tokens_seen": 338240160, + "step": 15686, + "time_per_iteration": 2.6965436935424805 + }, + { + "auxiliary_loss_clip": 0.01097717, + "auxiliary_loss_mlp": 0.0103222, + "balance_loss_clip": 1.03482318, + "balance_loss_mlp": 1.01981652, + "epoch": 0.943153464602435, + "flos": 17968407413760.0, + "grad_norm": 1.7020578922272096, + "language_loss": 0.80628002, + "learning_rate": 3.376814346741575e-08, + "loss": 0.82757938, + "num_input_tokens_seen": 338259305, + "step": 15687, + "time_per_iteration": 2.5866737365722656 + }, + { + "auxiliary_loss_clip": 0.01089927, + "auxiliary_loss_mlp": 0.01034435, + "balance_loss_clip": 1.03616667, + "balance_loss_mlp": 1.02021337, + "epoch": 0.943213587855103, + "flos": 14501519544960.0, + "grad_norm": 2.167672264682041, + "language_loss": 0.75638962, + "learning_rate": 3.369691556873011e-08, + "loss": 0.77763325, + "num_input_tokens_seen": 338274950, + "step": 15688, + "time_per_iteration": 2.6230926513671875 + }, + { + "auxiliary_loss_clip": 0.01078255, + "auxiliary_loss_mlp": 0.01026704, + "balance_loss_clip": 1.03318596, + "balance_loss_mlp": 1.01392508, + "epoch": 0.9432737111077709, + "flos": 28986411093120.0, + "grad_norm": 1.671527451823547, + "language_loss": 0.68622327, + "learning_rate": 3.3625762232454504e-08, + "loss": 0.70727283, + "num_input_tokens_seen": 338295585, + "step": 15689, + "time_per_iteration": 2.7073707580566406 + }, + { + "auxiliary_loss_clip": 0.01094693, + "auxiliary_loss_mlp": 0.0103362, + "balance_loss_clip": 1.03498852, + "balance_loss_mlp": 1.0225811, + "epoch": 0.9433338343604389, + "flos": 21609066303360.0, + "grad_norm": 1.766005404007046, + "language_loss": 0.80373913, + "learning_rate": 3.35546834612872e-08, + "loss": 0.82502228, + "num_input_tokens_seen": 338314555, + "step": 15690, + "time_per_iteration": 2.5873029232025146 + }, + { + "auxiliary_loss_clip": 0.0109645, + "auxiliary_loss_mlp": 0.01031556, + "balance_loss_clip": 1.03644657, + "balance_loss_mlp": 1.0193367, + "epoch": 0.9433939576131068, + "flos": 33182285483520.0, + "grad_norm": 2.148376299443603, + "language_loss": 0.59993267, + "learning_rate": 3.348367925792317e-08, + "loss": 0.62121278, + "num_input_tokens_seen": 338336260, + "step": 15691, + "time_per_iteration": 2.7108116149902344 + }, + { + "auxiliary_loss_clip": 0.01070974, + "auxiliary_loss_mlp": 0.01032195, + "balance_loss_clip": 1.03521907, + "balance_loss_mlp": 1.01911151, + "epoch": 0.9434540808657749, + "flos": 20486391742080.0, + "grad_norm": 2.2447371927481545, + "language_loss": 0.66576785, + "learning_rate": 3.341274962505514e-08, + "loss": 0.68679953, + "num_input_tokens_seen": 338354680, + "step": 15692, + "time_per_iteration": 2.6925716400146484 + }, + { + "auxiliary_loss_clip": 0.01093305, + "auxiliary_loss_mlp": 0.01032018, + "balance_loss_clip": 1.03605986, + "balance_loss_mlp": 1.01980531, + "epoch": 0.9435142041184428, + "flos": 21542955321600.0, + "grad_norm": 2.467286667437946, + "language_loss": 0.74455351, + "learning_rate": 3.334189456537251e-08, + "loss": 0.76580673, + "num_input_tokens_seen": 338372490, + "step": 15693, + "time_per_iteration": 2.6023404598236084 + }, + { + "auxiliary_loss_clip": 0.01074066, + "auxiliary_loss_mlp": 0.01035078, + "balance_loss_clip": 1.03401875, + "balance_loss_mlp": 1.0216012, + "epoch": 0.9435743273711108, + "flos": 25009089004800.0, + "grad_norm": 3.213380675885908, + "language_loss": 0.73401213, + "learning_rate": 3.327111408156291e-08, + "loss": 0.75510359, + "num_input_tokens_seen": 338390870, + "step": 15694, + "time_per_iteration": 2.695995569229126 + }, + { + "auxiliary_loss_clip": 0.00992652, + "auxiliary_loss_mlp": 0.01001259, + "balance_loss_clip": 1.00752378, + "balance_loss_mlp": 1.00031126, + "epoch": 0.9436344506237787, + "flos": 60158707320960.0, + "grad_norm": 0.6938858298712827, + "language_loss": 0.50570488, + "learning_rate": 3.3200408176309316e-08, + "loss": 0.52564394, + "num_input_tokens_seen": 338453075, + "step": 15695, + "time_per_iteration": 3.2824831008911133 + }, + { + "auxiliary_loss_clip": 0.01078605, + "auxiliary_loss_mlp": 0.01034023, + "balance_loss_clip": 1.03206229, + "balance_loss_mlp": 1.02225113, + "epoch": 0.9436945738764467, + "flos": 22237252283520.0, + "grad_norm": 1.8024183486894638, + "language_loss": 0.65296769, + "learning_rate": 3.312977685229335e-08, + "loss": 0.67409396, + "num_input_tokens_seen": 338471770, + "step": 15696, + "time_per_iteration": 2.7027387619018555 + }, + { + "auxiliary_loss_clip": 0.01097587, + "auxiliary_loss_mlp": 0.01027656, + "balance_loss_clip": 1.03637338, + "balance_loss_mlp": 1.01574719, + "epoch": 0.9437546971291146, + "flos": 25045179194880.0, + "grad_norm": 1.5868519209040974, + "language_loss": 0.65894949, + "learning_rate": 3.305922011219353e-08, + "loss": 0.68020189, + "num_input_tokens_seen": 338492190, + "step": 15697, + "time_per_iteration": 2.696575880050659 + }, + { + "auxiliary_loss_clip": 0.00999497, + "auxiliary_loss_mlp": 0.01001147, + "balance_loss_clip": 1.00481725, + "balance_loss_mlp": 1.0002768, + "epoch": 0.9438148203817827, + "flos": 56790788400000.0, + "grad_norm": 0.8460520685296222, + "language_loss": 0.63194656, + "learning_rate": 3.298873795868506e-08, + "loss": 0.6519531, + "num_input_tokens_seen": 338552560, + "step": 15698, + "time_per_iteration": 3.1992437839508057 + }, + { + "auxiliary_loss_clip": 0.01088557, + "auxiliary_loss_mlp": 0.01040163, + "balance_loss_clip": 1.03655159, + "balance_loss_mlp": 1.02691269, + "epoch": 0.9438749436344506, + "flos": 22346384780160.0, + "grad_norm": 1.744031032402157, + "language_loss": 0.69575948, + "learning_rate": 3.291833039444092e-08, + "loss": 0.71704668, + "num_input_tokens_seen": 338571770, + "step": 15699, + "time_per_iteration": 2.71105694770813 + }, + { + "auxiliary_loss_clip": 0.01069184, + "auxiliary_loss_mlp": 0.01031098, + "balance_loss_clip": 1.03235722, + "balance_loss_mlp": 1.01913548, + "epoch": 0.9439350668871186, + "flos": 13370800337280.0, + "grad_norm": 2.0518256803371444, + "language_loss": 0.74715513, + "learning_rate": 3.2847997422130734e-08, + "loss": 0.76815796, + "num_input_tokens_seen": 338587310, + "step": 15700, + "time_per_iteration": 2.7928857803344727 + }, + { + "auxiliary_loss_clip": 0.01031212, + "auxiliary_loss_mlp": 0.01031676, + "balance_loss_clip": 1.02990246, + "balance_loss_mlp": 1.02022016, + "epoch": 0.9439951901397866, + "flos": 17785334770560.0, + "grad_norm": 1.7345550747234506, + "language_loss": 0.70444047, + "learning_rate": 3.2777739044421495e-08, + "loss": 0.7250694, + "num_input_tokens_seen": 338606235, + "step": 15701, + "time_per_iteration": 2.956749200820923 + }, + { + "auxiliary_loss_clip": 0.01067175, + "auxiliary_loss_mlp": 0.01028974, + "balance_loss_clip": 1.03305924, + "balance_loss_mlp": 1.01637959, + "epoch": 0.9440553133924545, + "flos": 18879568738560.0, + "grad_norm": 6.919178162697029, + "language_loss": 0.77767622, + "learning_rate": 3.2707555263977505e-08, + "loss": 0.79863775, + "num_input_tokens_seen": 338624090, + "step": 15702, + "time_per_iteration": 3.149764060974121 + }, + { + "auxiliary_loss_clip": 0.01093668, + "auxiliary_loss_mlp": 0.01043554, + "balance_loss_clip": 1.03391433, + "balance_loss_mlp": 1.03058994, + "epoch": 0.9441154366451225, + "flos": 19572967860480.0, + "grad_norm": 2.1610357777231397, + "language_loss": 0.66376126, + "learning_rate": 3.2637446083460194e-08, + "loss": 0.68513346, + "num_input_tokens_seen": 338643695, + "step": 15703, + "time_per_iteration": 2.5990066528320312 + }, + { + "auxiliary_loss_clip": 0.01099113, + "auxiliary_loss_mlp": 0.01029886, + "balance_loss_clip": 1.03849339, + "balance_loss_mlp": 1.01657009, + "epoch": 0.9441755598977905, + "flos": 30294995472000.0, + "grad_norm": 6.247002123537392, + "language_loss": 0.73099834, + "learning_rate": 3.256741150552833e-08, + "loss": 0.75228834, + "num_input_tokens_seen": 338664725, + "step": 15704, + "time_per_iteration": 2.649864673614502 + }, + { + "auxiliary_loss_clip": 0.01094284, + "auxiliary_loss_mlp": 0.0103236, + "balance_loss_clip": 1.03578568, + "balance_loss_mlp": 1.01978898, + "epoch": 0.9442356831504585, + "flos": 20667884186880.0, + "grad_norm": 1.839574518559296, + "language_loss": 0.74311668, + "learning_rate": 3.2497451532837336e-08, + "loss": 0.76438308, + "num_input_tokens_seen": 338683990, + "step": 15705, + "time_per_iteration": 2.611238956451416 + }, + { + "auxiliary_loss_clip": 0.01087617, + "auxiliary_loss_mlp": 0.01034408, + "balance_loss_clip": 1.03792405, + "balance_loss_mlp": 1.02303529, + "epoch": 0.9442958064031264, + "flos": 16107265140480.0, + "grad_norm": 1.8860922128318132, + "language_loss": 0.76915097, + "learning_rate": 3.2427566168039986e-08, + "loss": 0.79037118, + "num_input_tokens_seen": 338702025, + "step": 15706, + "time_per_iteration": 2.651951313018799 + }, + { + "auxiliary_loss_clip": 0.01091977, + "auxiliary_loss_mlp": 0.01029566, + "balance_loss_clip": 1.03399932, + "balance_loss_mlp": 1.01796126, + "epoch": 0.9443559296557944, + "flos": 20447392550400.0, + "grad_norm": 1.4620649428574009, + "language_loss": 0.69324106, + "learning_rate": 3.23577554137866e-08, + "loss": 0.7144565, + "num_input_tokens_seen": 338720920, + "step": 15707, + "time_per_iteration": 2.674379825592041 + }, + { + "auxiliary_loss_clip": 0.0110044, + "auxiliary_loss_mlp": 0.01027284, + "balance_loss_clip": 1.0323143, + "balance_loss_mlp": 1.01660287, + "epoch": 0.9444160529084623, + "flos": 21610897896960.0, + "grad_norm": 1.6031633884107506, + "language_loss": 0.69253683, + "learning_rate": 3.22880192727244e-08, + "loss": 0.71381414, + "num_input_tokens_seen": 338739590, + "step": 15708, + "time_per_iteration": 2.6171586513519287 + }, + { + "auxiliary_loss_clip": 0.01096213, + "auxiliary_loss_mlp": 0.01030391, + "balance_loss_clip": 1.03588486, + "balance_loss_mlp": 1.01868427, + "epoch": 0.9444761761611303, + "flos": 18441781776000.0, + "grad_norm": 2.3501834209242305, + "language_loss": 0.70614785, + "learning_rate": 3.221835774749748e-08, + "loss": 0.72741389, + "num_input_tokens_seen": 338757240, + "step": 15709, + "time_per_iteration": 2.5730903148651123 + }, + { + "auxiliary_loss_clip": 0.01067094, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.03753853, + "balance_loss_mlp": 1.01969028, + "epoch": 0.9445362994137982, + "flos": 20957144411520.0, + "grad_norm": 2.0328452779578208, + "language_loss": 0.84886342, + "learning_rate": 3.214877084074774e-08, + "loss": 0.86985362, + "num_input_tokens_seen": 338773750, + "step": 15710, + "time_per_iteration": 2.803764581680298 + }, + { + "auxiliary_loss_clip": 0.01086062, + "auxiliary_loss_mlp": 0.01033102, + "balance_loss_clip": 1.03906393, + "balance_loss_mlp": 1.02019763, + "epoch": 0.9445964226664663, + "flos": 20303283185280.0, + "grad_norm": 1.710819363130834, + "language_loss": 0.71519732, + "learning_rate": 3.2079258555113956e-08, + "loss": 0.73638898, + "num_input_tokens_seen": 338792115, + "step": 15711, + "time_per_iteration": 2.786268711090088 + }, + { + "auxiliary_loss_clip": 0.01097144, + "auxiliary_loss_mlp": 0.01032234, + "balance_loss_clip": 1.03653646, + "balance_loss_mlp": 1.01912701, + "epoch": 0.9446565459191342, + "flos": 26396030903040.0, + "grad_norm": 1.6870746080253851, + "language_loss": 0.69105422, + "learning_rate": 3.200982089323179e-08, + "loss": 0.71234798, + "num_input_tokens_seen": 338812480, + "step": 15712, + "time_per_iteration": 2.7278430461883545 + }, + { + "auxiliary_loss_clip": 0.01102036, + "auxiliary_loss_mlp": 0.01036765, + "balance_loss_clip": 1.03873301, + "balance_loss_mlp": 1.02347302, + "epoch": 0.9447166691718022, + "flos": 16544764794240.0, + "grad_norm": 2.4121732835994405, + "language_loss": 0.70365906, + "learning_rate": 3.1940457857734246e-08, + "loss": 0.72504705, + "num_input_tokens_seen": 338829105, + "step": 15713, + "time_per_iteration": 2.6644036769866943 + }, + { + "auxiliary_loss_clip": 0.01083151, + "auxiliary_loss_mlp": 0.0103271, + "balance_loss_clip": 1.03377235, + "balance_loss_mlp": 1.01964462, + "epoch": 0.9447767924244702, + "flos": 29164635400320.0, + "grad_norm": 1.5448448829872168, + "language_loss": 0.7672528, + "learning_rate": 3.187116945125212e-08, + "loss": 0.7884115, + "num_input_tokens_seen": 338850670, + "step": 15714, + "time_per_iteration": 2.713848114013672 + }, + { + "auxiliary_loss_clip": 0.01083406, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.03877974, + "balance_loss_mlp": 1.01808619, + "epoch": 0.9448369156771381, + "flos": 19274908803840.0, + "grad_norm": 3.6710113259545456, + "language_loss": 0.67744088, + "learning_rate": 3.1801955676412194e-08, + "loss": 0.69858289, + "num_input_tokens_seen": 338867795, + "step": 15715, + "time_per_iteration": 4.1955413818359375 + }, + { + "auxiliary_loss_clip": 0.01076435, + "auxiliary_loss_mlp": 0.01033314, + "balance_loss_clip": 1.03516388, + "balance_loss_mlp": 1.02042139, + "epoch": 0.9448970389298061, + "flos": 23841166285440.0, + "grad_norm": 1.6653982996808796, + "language_loss": 0.74771553, + "learning_rate": 3.173281653583948e-08, + "loss": 0.76881307, + "num_input_tokens_seen": 338887205, + "step": 15716, + "time_per_iteration": 2.7072696685791016 + }, + { + "auxiliary_loss_clip": 0.01092174, + "auxiliary_loss_mlp": 0.01031218, + "balance_loss_clip": 1.03965962, + "balance_loss_mlp": 1.01850486, + "epoch": 0.944957162182474, + "flos": 22382259488640.0, + "grad_norm": 2.4365311852184797, + "language_loss": 0.62516659, + "learning_rate": 3.166375203215565e-08, + "loss": 0.64640057, + "num_input_tokens_seen": 338906130, + "step": 15717, + "time_per_iteration": 4.276852369308472 + }, + { + "auxiliary_loss_clip": 0.01094123, + "auxiliary_loss_mlp": 0.0103479, + "balance_loss_clip": 1.03850865, + "balance_loss_mlp": 1.02269626, + "epoch": 0.9450172854351421, + "flos": 17383889393280.0, + "grad_norm": 1.77862512223437, + "language_loss": 0.79134482, + "learning_rate": 3.1594762167979514e-08, + "loss": 0.81263399, + "num_input_tokens_seen": 338923045, + "step": 15718, + "time_per_iteration": 4.204078674316406 + }, + { + "auxiliary_loss_clip": 0.01018497, + "auxiliary_loss_mlp": 0.00999465, + "balance_loss_clip": 1.00589895, + "balance_loss_mlp": 0.99857122, + "epoch": 0.94507740868781, + "flos": 68466352406400.0, + "grad_norm": 0.6985194200865079, + "language_loss": 0.57825208, + "learning_rate": 3.152584694592719e-08, + "loss": 0.59843159, + "num_input_tokens_seen": 338987545, + "step": 15719, + "time_per_iteration": 3.1477670669555664 + }, + { + "auxiliary_loss_clip": 0.0106753, + "auxiliary_loss_mlp": 0.00770827, + "balance_loss_clip": 1.03413999, + "balance_loss_mlp": 1.0002296, + "epoch": 0.945137531940478, + "flos": 21142479611520.0, + "grad_norm": 1.6417560155484736, + "language_loss": 0.75850344, + "learning_rate": 3.145700636861193e-08, + "loss": 0.77688694, + "num_input_tokens_seen": 339007830, + "step": 15720, + "time_per_iteration": 2.7489445209503174 + }, + { + "auxiliary_loss_clip": 0.01092778, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.0348984, + "balance_loss_mlp": 1.01603997, + "epoch": 0.9451976551931459, + "flos": 24533918962560.0, + "grad_norm": 1.6214864220397953, + "language_loss": 0.72730792, + "learning_rate": 3.138824043864452e-08, + "loss": 0.74850857, + "num_input_tokens_seen": 339028980, + "step": 15721, + "time_per_iteration": 4.25614595413208 + }, + { + "auxiliary_loss_clip": 0.01062633, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.03165364, + "balance_loss_mlp": 1.01968312, + "epoch": 0.9452577784458139, + "flos": 23440582834560.0, + "grad_norm": 2.1250289522384933, + "language_loss": 0.85435033, + "learning_rate": 3.131954915863244e-08, + "loss": 0.87530422, + "num_input_tokens_seen": 339047950, + "step": 15722, + "time_per_iteration": 2.7739651203155518 + }, + { + "auxiliary_loss_clip": 0.01008256, + "auxiliary_loss_mlp": 0.00999124, + "balance_loss_clip": 1.00595665, + "balance_loss_mlp": 0.99822962, + "epoch": 0.9453179016984818, + "flos": 52017686449920.0, + "grad_norm": 0.8877194495304748, + "language_loss": 0.64485419, + "learning_rate": 3.125093253118005e-08, + "loss": 0.66492796, + "num_input_tokens_seen": 339104535, + "step": 15723, + "time_per_iteration": 3.120633363723755 + }, + { + "auxiliary_loss_clip": 0.01069344, + "auxiliary_loss_mlp": 0.01031555, + "balance_loss_clip": 1.03639483, + "balance_loss_mlp": 1.01878786, + "epoch": 0.9453780249511499, + "flos": 13473001509120.0, + "grad_norm": 2.137858204283182, + "language_loss": 0.73015231, + "learning_rate": 3.1182390558889715e-08, + "loss": 0.75116134, + "num_input_tokens_seen": 339122050, + "step": 15724, + "time_per_iteration": 2.7390730381011963 + }, + { + "auxiliary_loss_clip": 0.01075665, + "auxiliary_loss_mlp": 0.01027448, + "balance_loss_clip": 1.03523171, + "balance_loss_mlp": 1.01556301, + "epoch": 0.9454381482038178, + "flos": 23258515772160.0, + "grad_norm": 2.0036441460727676, + "language_loss": 0.84524632, + "learning_rate": 3.111392324436024e-08, + "loss": 0.8662774, + "num_input_tokens_seen": 339138940, + "step": 15725, + "time_per_iteration": 2.7032201290130615 + }, + { + "auxiliary_loss_clip": 0.01092034, + "auxiliary_loss_mlp": 0.01027805, + "balance_loss_clip": 1.03934574, + "balance_loss_mlp": 1.01518655, + "epoch": 0.9454982714564858, + "flos": 19496621502720.0, + "grad_norm": 1.7112621255845237, + "language_loss": 0.71131301, + "learning_rate": 3.104553059018822e-08, + "loss": 0.7325114, + "num_input_tokens_seen": 339158245, + "step": 15726, + "time_per_iteration": 2.633211135864258 + }, + { + "auxiliary_loss_clip": 0.01083425, + "auxiliary_loss_mlp": 0.0103136, + "balance_loss_clip": 1.03504848, + "balance_loss_mlp": 1.01800275, + "epoch": 0.9455583947091538, + "flos": 23258120722560.0, + "grad_norm": 1.770426483467669, + "language_loss": 0.60957873, + "learning_rate": 3.097721259896735e-08, + "loss": 0.63072664, + "num_input_tokens_seen": 339178200, + "step": 15727, + "time_per_iteration": 2.66964054107666 + }, + { + "auxiliary_loss_clip": 0.01093477, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.03381276, + "balance_loss_mlp": 1.02250743, + "epoch": 0.9456185179618217, + "flos": 17673041877120.0, + "grad_norm": 1.6947492443167869, + "language_loss": 0.81717706, + "learning_rate": 3.0908969273287566e-08, + "loss": 0.83844954, + "num_input_tokens_seen": 339193950, + "step": 15728, + "time_per_iteration": 2.6493005752563477 + }, + { + "auxiliary_loss_clip": 0.00982318, + "auxiliary_loss_mlp": 0.01006045, + "balance_loss_clip": 1.00669122, + "balance_loss_mlp": 1.00475144, + "epoch": 0.9456786412144897, + "flos": 61415040389760.0, + "grad_norm": 0.7309632127975088, + "language_loss": 0.59005105, + "learning_rate": 3.08408006157368e-08, + "loss": 0.60993469, + "num_input_tokens_seen": 339252330, + "step": 15729, + "time_per_iteration": 3.3251638412475586 + }, + { + "auxiliary_loss_clip": 0.01106055, + "auxiliary_loss_mlp": 0.01026459, + "balance_loss_clip": 1.03561211, + "balance_loss_mlp": 1.01384068, + "epoch": 0.9457387644671577, + "flos": 18588369179520.0, + "grad_norm": 1.8376283241487172, + "language_loss": 0.76239592, + "learning_rate": 3.077270662890052e-08, + "loss": 0.78372103, + "num_input_tokens_seen": 339270325, + "step": 15730, + "time_per_iteration": 2.822908401489258 + }, + { + "auxiliary_loss_clip": 0.01086637, + "auxiliary_loss_mlp": 0.01031322, + "balance_loss_clip": 1.03977942, + "balance_loss_mlp": 1.01842904, + "epoch": 0.9457988877198257, + "flos": 21108544237440.0, + "grad_norm": 1.4741875108902043, + "language_loss": 0.6241951, + "learning_rate": 3.070468731536047e-08, + "loss": 0.64537472, + "num_input_tokens_seen": 339291980, + "step": 15731, + "time_per_iteration": 2.780259370803833 + }, + { + "auxiliary_loss_clip": 0.01098616, + "auxiliary_loss_mlp": 0.01027483, + "balance_loss_clip": 1.03492093, + "balance_loss_mlp": 1.01464427, + "epoch": 0.9458590109724936, + "flos": 26688379697280.0, + "grad_norm": 1.8389995497060174, + "language_loss": 0.63829595, + "learning_rate": 3.063674267769589e-08, + "loss": 0.65955698, + "num_input_tokens_seen": 339311795, + "step": 15732, + "time_per_iteration": 2.6928884983062744 + }, + { + "auxiliary_loss_clip": 0.01097602, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.03819144, + "balance_loss_mlp": 1.01691008, + "epoch": 0.9459191342251616, + "flos": 18661591054080.0, + "grad_norm": 2.312355275604837, + "language_loss": 0.83734918, + "learning_rate": 3.056887271848363e-08, + "loss": 0.85862809, + "num_input_tokens_seen": 339327745, + "step": 15733, + "time_per_iteration": 2.573761463165283 + }, + { + "auxiliary_loss_clip": 0.01093698, + "auxiliary_loss_mlp": 0.01029334, + "balance_loss_clip": 1.03431845, + "balance_loss_mlp": 1.01787257, + "epoch": 0.9459792574778295, + "flos": 23398459159680.0, + "grad_norm": 2.0402128352906135, + "language_loss": 0.7230435, + "learning_rate": 3.0501077440297173e-08, + "loss": 0.74427378, + "num_input_tokens_seen": 339346445, + "step": 15734, + "time_per_iteration": 2.6132256984710693 + }, + { + "auxiliary_loss_clip": 0.01092017, + "auxiliary_loss_mlp": 0.01030532, + "balance_loss_clip": 1.03411341, + "balance_loss_mlp": 1.01973712, + "epoch": 0.9460393807304975, + "flos": 24392969994240.0, + "grad_norm": 1.706482735493318, + "language_loss": 0.86788249, + "learning_rate": 3.043335684570692e-08, + "loss": 0.88910794, + "num_input_tokens_seen": 339367945, + "step": 15735, + "time_per_iteration": 2.6257829666137695 + }, + { + "auxiliary_loss_clip": 0.01088315, + "auxiliary_loss_mlp": 0.01028057, + "balance_loss_clip": 1.0354389, + "balance_loss_mlp": 1.0162971, + "epoch": 0.9460995039831654, + "flos": 21939408708480.0, + "grad_norm": 2.0029981426507026, + "language_loss": 0.6727972, + "learning_rate": 3.036571093728102e-08, + "loss": 0.69396096, + "num_input_tokens_seen": 339386060, + "step": 15736, + "time_per_iteration": 2.6414105892181396 + }, + { + "auxiliary_loss_clip": 0.00990794, + "auxiliary_loss_mlp": 0.01001581, + "balance_loss_clip": 1.01297307, + "balance_loss_mlp": 1.00051391, + "epoch": 0.9461596272358335, + "flos": 70322466775680.0, + "grad_norm": 0.8681687595231652, + "language_loss": 0.65302682, + "learning_rate": 3.029813971758499e-08, + "loss": 0.67295063, + "num_input_tokens_seen": 339446695, + "step": 15737, + "time_per_iteration": 3.2643556594848633 + }, + { + "auxiliary_loss_clip": 0.01016522, + "auxiliary_loss_mlp": 0.01001018, + "balance_loss_clip": 1.00645328, + "balance_loss_mlp": 0.99996263, + "epoch": 0.9462197504885014, + "flos": 58591242645120.0, + "grad_norm": 0.8027511027658571, + "language_loss": 0.58797008, + "learning_rate": 3.0230643189181225e-08, + "loss": 0.60814548, + "num_input_tokens_seen": 339510080, + "step": 15738, + "time_per_iteration": 3.1644718647003174 + }, + { + "auxiliary_loss_clip": 0.01093604, + "auxiliary_loss_mlp": 0.0103372, + "balance_loss_clip": 1.03396416, + "balance_loss_mlp": 1.02237749, + "epoch": 0.9462798737411694, + "flos": 23433759250560.0, + "grad_norm": 1.7606066776130818, + "language_loss": 0.71841681, + "learning_rate": 3.016322135462834e-08, + "loss": 0.73969007, + "num_input_tokens_seen": 339529335, + "step": 15739, + "time_per_iteration": 2.6679999828338623 + }, + { + "auxiliary_loss_clip": 0.01093944, + "auxiliary_loss_mlp": 0.01027852, + "balance_loss_clip": 1.03307033, + "balance_loss_mlp": 1.01536524, + "epoch": 0.9463399969938374, + "flos": 25046077034880.0, + "grad_norm": 2.442002689709471, + "language_loss": 0.65025353, + "learning_rate": 3.009587421648363e-08, + "loss": 0.67147148, + "num_input_tokens_seen": 339548820, + "step": 15740, + "time_per_iteration": 2.703686237335205 + }, + { + "auxiliary_loss_clip": 0.01082274, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.0356245, + "balance_loss_mlp": 1.01749706, + "epoch": 0.9464001202465053, + "flos": 24352606085760.0, + "grad_norm": 1.7330701520859766, + "language_loss": 0.66210133, + "learning_rate": 3.0028601777301045e-08, + "loss": 0.68321818, + "num_input_tokens_seen": 339566775, + "step": 15741, + "time_per_iteration": 2.664438009262085 + }, + { + "auxiliary_loss_clip": 0.01097651, + "auxiliary_loss_mlp": 0.01026578, + "balance_loss_clip": 1.03633511, + "balance_loss_mlp": 1.01461589, + "epoch": 0.9464602434991733, + "flos": 17165444832000.0, + "grad_norm": 1.8465493774504513, + "language_loss": 0.76130718, + "learning_rate": 2.9961404039630987e-08, + "loss": 0.7825495, + "num_input_tokens_seen": 339581905, + "step": 15742, + "time_per_iteration": 2.5938029289245605 + }, + { + "auxiliary_loss_clip": 0.01092873, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.0342133, + "balance_loss_mlp": 1.01929605, + "epoch": 0.9465203667518413, + "flos": 19938107566080.0, + "grad_norm": 1.7769383818229454, + "language_loss": 0.72399694, + "learning_rate": 2.989428100602187e-08, + "loss": 0.74523461, + "num_input_tokens_seen": 339599870, + "step": 15743, + "time_per_iteration": 2.678401470184326 + }, + { + "auxiliary_loss_clip": 0.01073999, + "auxiliary_loss_mlp": 0.01031411, + "balance_loss_clip": 1.03740954, + "balance_loss_mlp": 1.01843548, + "epoch": 0.9465804900045093, + "flos": 20120318282880.0, + "grad_norm": 1.6693013333008395, + "language_loss": 0.79701877, + "learning_rate": 2.982723267901943e-08, + "loss": 0.81807292, + "num_input_tokens_seen": 339620250, + "step": 15744, + "time_per_iteration": 2.7061126232147217 + }, + { + "auxiliary_loss_clip": 0.01086196, + "auxiliary_loss_mlp": 0.01038507, + "balance_loss_clip": 1.03539801, + "balance_loss_mlp": 1.02565646, + "epoch": 0.9466406132571772, + "flos": 23911622812800.0, + "grad_norm": 1.6306815093715024, + "language_loss": 0.77796626, + "learning_rate": 2.9760259061165417e-08, + "loss": 0.79921329, + "num_input_tokens_seen": 339639900, + "step": 15745, + "time_per_iteration": 2.667794704437256 + }, + { + "auxiliary_loss_clip": 0.01082416, + "auxiliary_loss_mlp": 0.01035804, + "balance_loss_clip": 1.03260911, + "balance_loss_mlp": 1.02258968, + "epoch": 0.9467007365098452, + "flos": 19933223316480.0, + "grad_norm": 1.9540472383542953, + "language_loss": 0.70444429, + "learning_rate": 2.9693360155000014e-08, + "loss": 0.72562647, + "num_input_tokens_seen": 339658970, + "step": 15746, + "time_per_iteration": 2.671787738800049 + }, + { + "auxiliary_loss_clip": 0.01083981, + "auxiliary_loss_mlp": 0.01028625, + "balance_loss_clip": 1.03787088, + "balance_loss_mlp": 1.01583958, + "epoch": 0.9467608597625131, + "flos": 19310496203520.0, + "grad_norm": 2.086321056520881, + "language_loss": 0.55439335, + "learning_rate": 2.962653596305964e-08, + "loss": 0.57551944, + "num_input_tokens_seen": 339675600, + "step": 15747, + "time_per_iteration": 2.6125731468200684 + }, + { + "auxiliary_loss_clip": 0.00971726, + "auxiliary_loss_mlp": 0.0100543, + "balance_loss_clip": 1.00657761, + "balance_loss_mlp": 1.00431538, + "epoch": 0.9468209830151811, + "flos": 69630252802560.0, + "grad_norm": 0.6607022799380584, + "language_loss": 0.53227079, + "learning_rate": 2.955978648787871e-08, + "loss": 0.55204231, + "num_input_tokens_seen": 339744505, + "step": 15748, + "time_per_iteration": 3.6713624000549316 + }, + { + "auxiliary_loss_clip": 0.01087901, + "auxiliary_loss_mlp": 0.01036855, + "balance_loss_clip": 1.03579473, + "balance_loss_mlp": 1.02451098, + "epoch": 0.946881106267849, + "flos": 27016639113600.0, + "grad_norm": 1.6437037457333494, + "language_loss": 0.6632542, + "learning_rate": 2.9493111731988096e-08, + "loss": 0.68450171, + "num_input_tokens_seen": 339765810, + "step": 15749, + "time_per_iteration": 4.178863286972046 + }, + { + "auxiliary_loss_clip": 0.0107672, + "auxiliary_loss_mlp": 0.01030414, + "balance_loss_clip": 1.03375602, + "balance_loss_mlp": 1.01670504, + "epoch": 0.9469412295205171, + "flos": 20190092451840.0, + "grad_norm": 3.088565110530085, + "language_loss": 0.75976688, + "learning_rate": 2.942651169791621e-08, + "loss": 0.78083825, + "num_input_tokens_seen": 339784125, + "step": 15750, + "time_per_iteration": 2.7121167182922363 + }, + { + "auxiliary_loss_clip": 0.01096615, + "auxiliary_loss_mlp": 0.01028166, + "balance_loss_clip": 1.0368098, + "balance_loss_mlp": 1.01631653, + "epoch": 0.947001352773185, + "flos": 21324905809920.0, + "grad_norm": 1.5838311482694045, + "language_loss": 0.67727458, + "learning_rate": 2.9359986388188372e-08, + "loss": 0.69852245, + "num_input_tokens_seen": 339803450, + "step": 15751, + "time_per_iteration": 2.709989070892334 + }, + { + "auxiliary_loss_clip": 0.01076359, + "auxiliary_loss_mlp": 0.01030459, + "balance_loss_clip": 1.03434587, + "balance_loss_mlp": 1.01857424, + "epoch": 0.947061476025853, + "flos": 21944041562880.0, + "grad_norm": 1.6112027169393213, + "language_loss": 0.65785074, + "learning_rate": 2.929353580532723e-08, + "loss": 0.6789189, + "num_input_tokens_seen": 339823215, + "step": 15752, + "time_per_iteration": 2.731290102005005 + }, + { + "auxiliary_loss_clip": 0.01092841, + "auxiliary_loss_mlp": 0.01035403, + "balance_loss_clip": 1.03419137, + "balance_loss_mlp": 1.0214498, + "epoch": 0.947121599278521, + "flos": 21394715892480.0, + "grad_norm": 1.9387183547098805, + "language_loss": 0.71516705, + "learning_rate": 2.9227159951852764e-08, + "loss": 0.73644954, + "num_input_tokens_seen": 339842230, + "step": 15753, + "time_per_iteration": 2.6081583499908447 + }, + { + "auxiliary_loss_clip": 0.01109554, + "auxiliary_loss_mlp": 0.01032964, + "balance_loss_clip": 1.03532398, + "balance_loss_mlp": 1.01883757, + "epoch": 0.9471817225311889, + "flos": 23075730437760.0, + "grad_norm": 1.76483043172341, + "language_loss": 0.70370275, + "learning_rate": 2.9160858830281855e-08, + "loss": 0.72512788, + "num_input_tokens_seen": 339861640, + "step": 15754, + "time_per_iteration": 2.580967426300049 + }, + { + "auxiliary_loss_clip": 0.0110967, + "auxiliary_loss_mlp": 0.01032839, + "balance_loss_clip": 1.03552377, + "balance_loss_mlp": 1.02043486, + "epoch": 0.947241845783857, + "flos": 11910744305280.0, + "grad_norm": 3.2827981258328207, + "language_loss": 0.78840715, + "learning_rate": 2.9094632443129153e-08, + "loss": 0.80983222, + "num_input_tokens_seen": 339878210, + "step": 15755, + "time_per_iteration": 4.16628360748291 + }, + { + "auxiliary_loss_clip": 0.01070124, + "auxiliary_loss_mlp": 0.0103368, + "balance_loss_clip": 1.03388035, + "balance_loss_mlp": 1.01844525, + "epoch": 0.9473019690365249, + "flos": 20740675098240.0, + "grad_norm": 2.275356608148397, + "language_loss": 0.75449395, + "learning_rate": 2.9028480792904876e-08, + "loss": 0.77553201, + "num_input_tokens_seen": 339894255, + "step": 15756, + "time_per_iteration": 4.229847431182861 + }, + { + "auxiliary_loss_clip": 0.01083084, + "auxiliary_loss_mlp": 0.01030263, + "balance_loss_clip": 1.03162217, + "balance_loss_mlp": 1.01807952, + "epoch": 0.9473620922891929, + "flos": 17639896602240.0, + "grad_norm": 2.263017805746966, + "language_loss": 0.74833083, + "learning_rate": 2.8962403882118347e-08, + "loss": 0.76946425, + "num_input_tokens_seen": 339912425, + "step": 15757, + "time_per_iteration": 2.64909291267395 + }, + { + "auxiliary_loss_clip": 0.01089898, + "auxiliary_loss_mlp": 0.0103155, + "balance_loss_clip": 1.03554904, + "balance_loss_mlp": 1.01819229, + "epoch": 0.9474222155418608, + "flos": 23550002640000.0, + "grad_norm": 1.9625522630071812, + "language_loss": 0.79462659, + "learning_rate": 2.889640171327512e-08, + "loss": 0.81584108, + "num_input_tokens_seen": 339929635, + "step": 15758, + "time_per_iteration": 4.308099031448364 + }, + { + "auxiliary_loss_clip": 0.01077085, + "auxiliary_loss_mlp": 0.00769854, + "balance_loss_clip": 1.03425276, + "balance_loss_mlp": 1.00017619, + "epoch": 0.9474823387945288, + "flos": 27089753247360.0, + "grad_norm": 1.3762409201655417, + "language_loss": 0.71830899, + "learning_rate": 2.8830474288877638e-08, + "loss": 0.73677838, + "num_input_tokens_seen": 339951200, + "step": 15759, + "time_per_iteration": 2.7510428428649902 + }, + { + "auxiliary_loss_clip": 0.01091647, + "auxiliary_loss_mlp": 0.01028463, + "balance_loss_clip": 1.0367434, + "balance_loss_mlp": 1.01805592, + "epoch": 0.9475424620471967, + "flos": 22966526113920.0, + "grad_norm": 1.5226325808492376, + "language_loss": 0.75499642, + "learning_rate": 2.8764621611426344e-08, + "loss": 0.77619755, + "num_input_tokens_seen": 339971820, + "step": 15760, + "time_per_iteration": 4.22639799118042 + }, + { + "auxiliary_loss_clip": 0.01107661, + "auxiliary_loss_mlp": 0.00769288, + "balance_loss_clip": 1.0366137, + "balance_loss_mlp": 1.00024486, + "epoch": 0.9476025852998647, + "flos": 20047671025920.0, + "grad_norm": 1.877401984510813, + "language_loss": 0.7275269, + "learning_rate": 2.8698843683418128e-08, + "loss": 0.74629641, + "num_input_tokens_seen": 339989420, + "step": 15761, + "time_per_iteration": 2.6196117401123047 + }, + { + "auxiliary_loss_clip": 0.01086146, + "auxiliary_loss_mlp": 0.01036281, + "balance_loss_clip": 1.03789508, + "balance_loss_mlp": 1.02441955, + "epoch": 0.9476627085525327, + "flos": 14975468524800.0, + "grad_norm": 2.729446835084705, + "language_loss": 0.71608138, + "learning_rate": 2.863314050734722e-08, + "loss": 0.73730564, + "num_input_tokens_seen": 340006690, + "step": 15762, + "time_per_iteration": 2.579223155975342 + }, + { + "auxiliary_loss_clip": 0.01111512, + "auxiliary_loss_mlp": 0.01036338, + "balance_loss_clip": 1.03547406, + "balance_loss_mlp": 1.02280796, + "epoch": 0.9477228318052007, + "flos": 18697788984960.0, + "grad_norm": 2.07051809457296, + "language_loss": 0.66850615, + "learning_rate": 2.856751208570518e-08, + "loss": 0.68998462, + "num_input_tokens_seen": 340025480, + "step": 15763, + "time_per_iteration": 2.5970752239227295 + }, + { + "auxiliary_loss_clip": 0.01107023, + "auxiliary_loss_mlp": 0.01036664, + "balance_loss_clip": 1.03498352, + "balance_loss_mlp": 1.02424252, + "epoch": 0.9477829550578686, + "flos": 23875065745920.0, + "grad_norm": 1.7866390242550823, + "language_loss": 0.69783157, + "learning_rate": 2.8501958420980466e-08, + "loss": 0.71926844, + "num_input_tokens_seen": 340043785, + "step": 15764, + "time_per_iteration": 2.5758285522460938 + }, + { + "auxiliary_loss_clip": 0.01095569, + "auxiliary_loss_mlp": 0.00768781, + "balance_loss_clip": 1.03836465, + "balance_loss_mlp": 1.00017273, + "epoch": 0.9478430783105366, + "flos": 22562890007040.0, + "grad_norm": 1.6268430916699592, + "language_loss": 0.71237898, + "learning_rate": 2.8436479515659306e-08, + "loss": 0.73102248, + "num_input_tokens_seen": 340064360, + "step": 15765, + "time_per_iteration": 2.7962822914123535 + }, + { + "auxiliary_loss_clip": 0.01008188, + "auxiliary_loss_mlp": 0.01003115, + "balance_loss_clip": 1.00526595, + "balance_loss_mlp": 1.00220859, + "epoch": 0.9479032015632046, + "flos": 60857885554560.0, + "grad_norm": 0.805215239265766, + "language_loss": 0.59051013, + "learning_rate": 2.8371075372224384e-08, + "loss": 0.61062312, + "num_input_tokens_seen": 340114425, + "step": 15766, + "time_per_iteration": 2.9193778038024902 + }, + { + "auxiliary_loss_clip": 0.01055212, + "auxiliary_loss_mlp": 0.0104033, + "balance_loss_clip": 1.03303111, + "balance_loss_mlp": 1.02758038, + "epoch": 0.9479633248158725, + "flos": 14683873916160.0, + "grad_norm": 1.7099233652082526, + "language_loss": 0.74133235, + "learning_rate": 2.8305745993155938e-08, + "loss": 0.7622878, + "num_input_tokens_seen": 340132200, + "step": 15767, + "time_per_iteration": 2.805891990661621 + }, + { + "auxiliary_loss_clip": 0.0108313, + "auxiliary_loss_mlp": 0.01032242, + "balance_loss_clip": 1.03779268, + "balance_loss_mlp": 1.0191226, + "epoch": 0.9480234480685406, + "flos": 20333878594560.0, + "grad_norm": 2.313278201082517, + "language_loss": 0.73025, + "learning_rate": 2.8240491380931096e-08, + "loss": 0.75140369, + "num_input_tokens_seen": 340149175, + "step": 15768, + "time_per_iteration": 2.6399149894714355 + }, + { + "auxiliary_loss_clip": 0.00990186, + "auxiliary_loss_mlp": 0.01003636, + "balance_loss_clip": 1.00721884, + "balance_loss_mlp": 1.0025754, + "epoch": 0.9480835713212085, + "flos": 70293092428800.0, + "grad_norm": 0.7345736556606803, + "language_loss": 0.55274725, + "learning_rate": 2.8175311538024326e-08, + "loss": 0.57268548, + "num_input_tokens_seen": 340208155, + "step": 15769, + "time_per_iteration": 3.346592664718628 + }, + { + "auxiliary_loss_clip": 0.01060494, + "auxiliary_loss_mlp": 0.01031034, + "balance_loss_clip": 1.03297341, + "balance_loss_mlp": 1.01898789, + "epoch": 0.9481436945738765, + "flos": 25449749055360.0, + "grad_norm": 1.3356689895855771, + "language_loss": 0.77657175, + "learning_rate": 2.8110206466907428e-08, + "loss": 0.79748702, + "num_input_tokens_seen": 340229275, + "step": 15770, + "time_per_iteration": 2.967400074005127 + }, + { + "auxiliary_loss_clip": 0.01090847, + "auxiliary_loss_mlp": 0.0103804, + "balance_loss_clip": 1.0389564, + "balance_loss_mlp": 1.02452743, + "epoch": 0.9482038178265444, + "flos": 26979902478720.0, + "grad_norm": 3.3844723552272304, + "language_loss": 0.79788053, + "learning_rate": 2.8045176170049313e-08, + "loss": 0.81916934, + "num_input_tokens_seen": 340248920, + "step": 15771, + "time_per_iteration": 2.6720709800720215 + }, + { + "auxiliary_loss_clip": 0.0107385, + "auxiliary_loss_mlp": 0.01029719, + "balance_loss_clip": 1.03290439, + "balance_loss_mlp": 1.01731515, + "epoch": 0.9482639410792124, + "flos": 17785442511360.0, + "grad_norm": 1.7305869022137186, + "language_loss": 0.69742543, + "learning_rate": 2.7980220649915566e-08, + "loss": 0.71846116, + "num_input_tokens_seen": 340266775, + "step": 15772, + "time_per_iteration": 2.7055277824401855 + }, + { + "auxiliary_loss_clip": 0.01091743, + "auxiliary_loss_mlp": 0.0103086, + "balance_loss_clip": 1.03463781, + "balance_loss_mlp": 1.01827109, + "epoch": 0.9483240643318803, + "flos": 20996682307200.0, + "grad_norm": 1.5174213608604383, + "language_loss": 0.73862821, + "learning_rate": 2.7915339908969327e-08, + "loss": 0.7598542, + "num_input_tokens_seen": 340285295, + "step": 15773, + "time_per_iteration": 2.594517469406128 + }, + { + "auxiliary_loss_clip": 0.01075154, + "auxiliary_loss_mlp": 0.01037032, + "balance_loss_clip": 1.03320599, + "balance_loss_mlp": 1.02424073, + "epoch": 0.9483841875845483, + "flos": 20083294339200.0, + "grad_norm": 2.198949028085397, + "language_loss": 0.62984806, + "learning_rate": 2.7850533949671072e-08, + "loss": 0.65096992, + "num_input_tokens_seen": 340304265, + "step": 15774, + "time_per_iteration": 2.6656346321105957 + }, + { + "auxiliary_loss_clip": 0.01108855, + "auxiliary_loss_mlp": 0.01032808, + "balance_loss_clip": 1.03615785, + "balance_loss_mlp": 1.01968336, + "epoch": 0.9484443108372163, + "flos": 20813645577600.0, + "grad_norm": 1.8448156686123751, + "language_loss": 0.59319341, + "learning_rate": 2.7785802774478396e-08, + "loss": 0.61461002, + "num_input_tokens_seen": 340323690, + "step": 15775, + "time_per_iteration": 2.6134490966796875 + }, + { + "auxiliary_loss_clip": 0.0108818, + "auxiliary_loss_mlp": 0.01028388, + "balance_loss_clip": 1.03665876, + "balance_loss_mlp": 1.01553738, + "epoch": 0.9485044340898843, + "flos": 36429184506240.0, + "grad_norm": 1.5672743954307715, + "language_loss": 0.61733031, + "learning_rate": 2.772114638584555e-08, + "loss": 0.63849604, + "num_input_tokens_seen": 340345830, + "step": 15776, + "time_per_iteration": 2.759727954864502 + }, + { + "auxiliary_loss_clip": 0.01079507, + "auxiliary_loss_mlp": 0.01031725, + "balance_loss_clip": 1.03297567, + "balance_loss_mlp": 1.01894581, + "epoch": 0.9485645573425522, + "flos": 22602535643520.0, + "grad_norm": 1.5939795755888917, + "language_loss": 0.73614502, + "learning_rate": 2.765656478622458e-08, + "loss": 0.75725728, + "num_input_tokens_seen": 340365910, + "step": 15777, + "time_per_iteration": 2.6045753955841064 + }, + { + "auxiliary_loss_clip": 0.01108311, + "auxiliary_loss_mlp": 0.01035184, + "balance_loss_clip": 1.03904653, + "balance_loss_mlp": 1.0216893, + "epoch": 0.9486246805952202, + "flos": 22017766227840.0, + "grad_norm": 2.932173295404769, + "language_loss": 0.7171486, + "learning_rate": 2.759205797806441e-08, + "loss": 0.73858356, + "num_input_tokens_seen": 340383935, + "step": 15778, + "time_per_iteration": 2.5818030834198 + }, + { + "auxiliary_loss_clip": 0.0109326, + "auxiliary_loss_mlp": 0.00769105, + "balance_loss_clip": 1.03678966, + "balance_loss_mlp": 1.00016212, + "epoch": 0.9486848038478882, + "flos": 16508674604160.0, + "grad_norm": 1.785656818158453, + "language_loss": 0.70001411, + "learning_rate": 2.7527625963810865e-08, + "loss": 0.7186377, + "num_input_tokens_seen": 340402760, + "step": 15779, + "time_per_iteration": 2.5735414028167725 + }, + { + "auxiliary_loss_clip": 0.01109892, + "auxiliary_loss_mlp": 0.01032242, + "balance_loss_clip": 1.03769064, + "balance_loss_mlp": 1.01942062, + "epoch": 0.9487449271005561, + "flos": 19244385221760.0, + "grad_norm": 2.1485694494900045, + "language_loss": 0.78390372, + "learning_rate": 2.7463268745907542e-08, + "loss": 0.80532503, + "num_input_tokens_seen": 340422105, + "step": 15780, + "time_per_iteration": 2.571122169494629 + }, + { + "auxiliary_loss_clip": 0.0108342, + "auxiliary_loss_mlp": 0.00770056, + "balance_loss_clip": 1.03853536, + "balance_loss_mlp": 1.00020254, + "epoch": 0.9488050503532242, + "flos": 21762692772480.0, + "grad_norm": 1.7364814662461427, + "language_loss": 0.66234344, + "learning_rate": 2.7398986326794494e-08, + "loss": 0.68087816, + "num_input_tokens_seen": 340441160, + "step": 15781, + "time_per_iteration": 2.6827192306518555 + }, + { + "auxiliary_loss_clip": 0.01107411, + "auxiliary_loss_mlp": 0.01034117, + "balance_loss_clip": 1.03690338, + "balance_loss_mlp": 1.02159953, + "epoch": 0.9488651736058921, + "flos": 18368919037440.0, + "grad_norm": 9.76335675616754, + "language_loss": 0.79928899, + "learning_rate": 2.733477870890999e-08, + "loss": 0.82070434, + "num_input_tokens_seen": 340458200, + "step": 15782, + "time_per_iteration": 2.567207098007202 + }, + { + "auxiliary_loss_clip": 0.010185, + "auxiliary_loss_mlp": 0.01003019, + "balance_loss_clip": 1.00588965, + "balance_loss_mlp": 1.00194001, + "epoch": 0.9489252968585601, + "flos": 70084057230720.0, + "grad_norm": 0.7221824593756564, + "language_loss": 0.59740299, + "learning_rate": 2.7270645894688082e-08, + "loss": 0.6176182, + "num_input_tokens_seen": 340526420, + "step": 15783, + "time_per_iteration": 3.296163558959961 + }, + { + "auxiliary_loss_clip": 0.01096688, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.0347774, + "balance_loss_mlp": 1.02289176, + "epoch": 0.948985420111228, + "flos": 27855440490240.0, + "grad_norm": 1.6602222433603364, + "language_loss": 0.73771024, + "learning_rate": 2.720658788656105e-08, + "loss": 0.75903332, + "num_input_tokens_seen": 340546325, + "step": 15784, + "time_per_iteration": 2.671168804168701 + }, + { + "auxiliary_loss_clip": 0.01060019, + "auxiliary_loss_mlp": 0.01031532, + "balance_loss_clip": 1.03550255, + "balance_loss_mlp": 1.01762056, + "epoch": 0.949045543363896, + "flos": 24316049018880.0, + "grad_norm": 1.7690180821758892, + "language_loss": 0.69829547, + "learning_rate": 2.714260468695806e-08, + "loss": 0.71921104, + "num_input_tokens_seen": 340565145, + "step": 15785, + "time_per_iteration": 2.718092203140259 + }, + { + "auxiliary_loss_clip": 0.01108856, + "auxiliary_loss_mlp": 0.01028867, + "balance_loss_clip": 1.03556883, + "balance_loss_mlp": 1.01650548, + "epoch": 0.9491056666165639, + "flos": 24241677909120.0, + "grad_norm": 1.499623149824644, + "language_loss": 0.75997609, + "learning_rate": 2.707869629830495e-08, + "loss": 0.78135335, + "num_input_tokens_seen": 340585465, + "step": 15786, + "time_per_iteration": 2.5866501331329346 + }, + { + "auxiliary_loss_clip": 0.01075928, + "auxiliary_loss_mlp": 0.01032218, + "balance_loss_clip": 1.03659058, + "balance_loss_mlp": 1.02088714, + "epoch": 0.949165789869232, + "flos": 24531261356160.0, + "grad_norm": 1.9121797564334724, + "language_loss": 0.78743112, + "learning_rate": 2.7014862723025335e-08, + "loss": 0.80851257, + "num_input_tokens_seen": 340606010, + "step": 15787, + "time_per_iteration": 2.6785271167755127 + }, + { + "auxiliary_loss_clip": 0.0109935, + "auxiliary_loss_mlp": 0.01029051, + "balance_loss_clip": 1.03999209, + "balance_loss_mlp": 1.01711869, + "epoch": 0.9492259131218999, + "flos": 22235348862720.0, + "grad_norm": 1.5253176882486765, + "language_loss": 0.76644206, + "learning_rate": 2.6951103963540388e-08, + "loss": 0.78772604, + "num_input_tokens_seen": 340626135, + "step": 15788, + "time_per_iteration": 2.7900092601776123 + }, + { + "auxiliary_loss_clip": 0.01098885, + "auxiliary_loss_mlp": 0.01032975, + "balance_loss_clip": 1.03593767, + "balance_loss_mlp": 1.019647, + "epoch": 0.9492860363745679, + "flos": 22966310632320.0, + "grad_norm": 1.7589364420140376, + "language_loss": 0.71141213, + "learning_rate": 2.6887420022266848e-08, + "loss": 0.73273069, + "num_input_tokens_seen": 340644870, + "step": 15789, + "time_per_iteration": 2.6160874366760254 + }, + { + "auxiliary_loss_clip": 0.01059097, + "auxiliary_loss_mlp": 0.01031485, + "balance_loss_clip": 1.03295982, + "balance_loss_mlp": 1.01794267, + "epoch": 0.9493461596272358, + "flos": 18370283754240.0, + "grad_norm": 2.83542221151725, + "language_loss": 0.73137754, + "learning_rate": 2.682381090161989e-08, + "loss": 0.75228333, + "num_input_tokens_seen": 340663695, + "step": 15790, + "time_per_iteration": 2.6108055114746094 + }, + { + "auxiliary_loss_clip": 0.01073497, + "auxiliary_loss_mlp": 0.01035956, + "balance_loss_clip": 1.03374732, + "balance_loss_mlp": 1.02253366, + "epoch": 0.9494062828799038, + "flos": 20011724490240.0, + "grad_norm": 1.9002383614444849, + "language_loss": 0.77333057, + "learning_rate": 2.6760276604012033e-08, + "loss": 0.79442513, + "num_input_tokens_seen": 340682970, + "step": 15791, + "time_per_iteration": 2.688148260116577 + }, + { + "auxiliary_loss_clip": 0.01102402, + "auxiliary_loss_mlp": 0.01034148, + "balance_loss_clip": 1.03735161, + "balance_loss_mlp": 1.02070129, + "epoch": 0.9494664061325718, + "flos": 27228583313280.0, + "grad_norm": 1.7874682888186109, + "language_loss": 0.73599547, + "learning_rate": 2.6696817131852234e-08, + "loss": 0.75736099, + "num_input_tokens_seen": 340702275, + "step": 15792, + "time_per_iteration": 2.643265962600708 + }, + { + "auxiliary_loss_clip": 0.01095336, + "auxiliary_loss_mlp": 0.01034889, + "balance_loss_clip": 1.03347254, + "balance_loss_mlp": 1.02266979, + "epoch": 0.9495265293852397, + "flos": 18369816877440.0, + "grad_norm": 1.8451462038230002, + "language_loss": 0.78138769, + "learning_rate": 2.663343248754679e-08, + "loss": 0.80268991, + "num_input_tokens_seen": 340719060, + "step": 15793, + "time_per_iteration": 2.5426347255706787 + }, + { + "auxiliary_loss_clip": 0.01081824, + "auxiliary_loss_mlp": 0.01030161, + "balance_loss_clip": 1.03453922, + "balance_loss_mlp": 1.0182879, + "epoch": 0.9495866526379078, + "flos": 23075766351360.0, + "grad_norm": 1.686462964828876, + "language_loss": 0.77439916, + "learning_rate": 2.6570122673499562e-08, + "loss": 0.79551899, + "num_input_tokens_seen": 340737815, + "step": 15794, + "time_per_iteration": 4.211062669754028 + }, + { + "auxiliary_loss_clip": 0.01078065, + "auxiliary_loss_mlp": 0.00770639, + "balance_loss_clip": 1.03476417, + "balance_loss_mlp": 1.00026226, + "epoch": 0.9496467758905757, + "flos": 17529902179200.0, + "grad_norm": 1.8326530487226782, + "language_loss": 0.61200684, + "learning_rate": 2.650688769211107e-08, + "loss": 0.63049388, + "num_input_tokens_seen": 340756035, + "step": 15795, + "time_per_iteration": 4.150991201400757 + }, + { + "auxiliary_loss_clip": 0.01096105, + "auxiliary_loss_mlp": 0.01034842, + "balance_loss_clip": 1.03731775, + "balance_loss_mlp": 1.02214646, + "epoch": 0.9497068991432437, + "flos": 24133910129280.0, + "grad_norm": 1.6119216372134806, + "language_loss": 0.79217291, + "learning_rate": 2.644372754577895e-08, + "loss": 0.81348234, + "num_input_tokens_seen": 340775620, + "step": 15796, + "time_per_iteration": 2.6128690242767334 + }, + { + "auxiliary_loss_clip": 0.01097993, + "auxiliary_loss_mlp": 0.01029872, + "balance_loss_clip": 1.03628421, + "balance_loss_mlp": 1.01681852, + "epoch": 0.9497670223959116, + "flos": 20303319098880.0, + "grad_norm": 1.8328846097658669, + "language_loss": 0.75668991, + "learning_rate": 2.6380642236898398e-08, + "loss": 0.77796859, + "num_input_tokens_seen": 340794510, + "step": 15797, + "time_per_iteration": 4.209908723831177 + }, + { + "auxiliary_loss_clip": 0.01076014, + "auxiliary_loss_mlp": 0.00770873, + "balance_loss_clip": 1.03560376, + "balance_loss_mlp": 1.00026107, + "epoch": 0.9498271456485796, + "flos": 13698916099200.0, + "grad_norm": 2.550624917313258, + "language_loss": 0.6578297, + "learning_rate": 2.6317631767861727e-08, + "loss": 0.67629862, + "num_input_tokens_seen": 340812955, + "step": 15798, + "time_per_iteration": 2.6348631381988525 + }, + { + "auxiliary_loss_clip": 0.011003, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.03773224, + "balance_loss_mlp": 1.02130818, + "epoch": 0.9498872689012475, + "flos": 20814004713600.0, + "grad_norm": 1.909884412198324, + "language_loss": 0.77439278, + "learning_rate": 2.6254696141058575e-08, + "loss": 0.79573023, + "num_input_tokens_seen": 340829200, + "step": 15799, + "time_per_iteration": 2.6085915565490723 + }, + { + "auxiliary_loss_clip": 0.01091765, + "auxiliary_loss_mlp": 0.01034579, + "balance_loss_clip": 1.03405094, + "balance_loss_mlp": 1.02236009, + "epoch": 0.9499473921539155, + "flos": 21032700670080.0, + "grad_norm": 1.7422846362169004, + "language_loss": 0.71096122, + "learning_rate": 2.6191835358874814e-08, + "loss": 0.7322247, + "num_input_tokens_seen": 340848035, + "step": 15800, + "time_per_iteration": 4.11196756362915 + }, + { + "auxiliary_loss_clip": 0.01081003, + "auxiliary_loss_mlp": 0.01027785, + "balance_loss_clip": 1.03178167, + "balance_loss_mlp": 1.01504791, + "epoch": 0.9500075154065835, + "flos": 20998693468800.0, + "grad_norm": 1.6797265038283544, + "language_loss": 0.7196418, + "learning_rate": 2.6129049423694315e-08, + "loss": 0.74072969, + "num_input_tokens_seen": 340870025, + "step": 15801, + "time_per_iteration": 2.7228105068206787 + }, + { + "auxiliary_loss_clip": 0.01098003, + "auxiliary_loss_mlp": 0.01032345, + "balance_loss_clip": 1.03618026, + "balance_loss_mlp": 1.02023363, + "epoch": 0.9500676386592515, + "flos": 25121956515840.0, + "grad_norm": 1.5618247598543729, + "language_loss": 0.80991805, + "learning_rate": 2.6066338337898508e-08, + "loss": 0.83122152, + "num_input_tokens_seen": 340892290, + "step": 15802, + "time_per_iteration": 2.6597704887390137 + }, + { + "auxiliary_loss_clip": 0.01111718, + "auxiliary_loss_mlp": 0.01031196, + "balance_loss_clip": 1.03881001, + "balance_loss_mlp": 1.01899517, + "epoch": 0.9501277619119194, + "flos": 27523625627520.0, + "grad_norm": 1.6749081287524619, + "language_loss": 0.67810452, + "learning_rate": 2.60037021038646e-08, + "loss": 0.69953358, + "num_input_tokens_seen": 340912260, + "step": 15803, + "time_per_iteration": 2.6744706630706787 + }, + { + "auxiliary_loss_clip": 0.01082837, + "auxiliary_loss_mlp": 0.01036561, + "balance_loss_clip": 1.03429604, + "balance_loss_mlp": 1.02377534, + "epoch": 0.9501878851645874, + "flos": 20813968800000.0, + "grad_norm": 6.246974750170738, + "language_loss": 0.76370931, + "learning_rate": 2.5941140723968247e-08, + "loss": 0.78490329, + "num_input_tokens_seen": 340928930, + "step": 15804, + "time_per_iteration": 2.721076726913452 + }, + { + "auxiliary_loss_clip": 0.01096211, + "auxiliary_loss_mlp": 0.01035081, + "balance_loss_clip": 1.03763199, + "balance_loss_mlp": 1.0223074, + "epoch": 0.9502480084172553, + "flos": 18369385914240.0, + "grad_norm": 1.716451063779602, + "language_loss": 0.73370028, + "learning_rate": 2.5878654200581775e-08, + "loss": 0.75501317, + "num_input_tokens_seen": 340946615, + "step": 15805, + "time_per_iteration": 2.573842763900757 + }, + { + "auxiliary_loss_clip": 0.01084759, + "auxiliary_loss_mlp": 0.01033637, + "balance_loss_clip": 1.03832221, + "balance_loss_mlp": 1.02066755, + "epoch": 0.9503081316699233, + "flos": 23549607590400.0, + "grad_norm": 1.446963145068923, + "language_loss": 0.80407286, + "learning_rate": 2.5816242536074618e-08, + "loss": 0.82525682, + "num_input_tokens_seen": 340967545, + "step": 15806, + "time_per_iteration": 2.7522966861724854 + }, + { + "auxiliary_loss_clip": 0.0107262, + "auxiliary_loss_mlp": 0.010333, + "balance_loss_clip": 1.03583097, + "balance_loss_mlp": 1.0209856, + "epoch": 0.9503682549225914, + "flos": 18040444139520.0, + "grad_norm": 2.3524275166414688, + "language_loss": 0.82226515, + "learning_rate": 2.5753905732813108e-08, + "loss": 0.8433243, + "num_input_tokens_seen": 340984955, + "step": 15807, + "time_per_iteration": 2.6490519046783447 + }, + { + "auxiliary_loss_clip": 0.01089448, + "auxiliary_loss_mlp": 0.01031628, + "balance_loss_clip": 1.03269625, + "balance_loss_mlp": 1.01936102, + "epoch": 0.9504283781752593, + "flos": 25886135387520.0, + "grad_norm": 9.284971191525596, + "language_loss": 0.71851462, + "learning_rate": 2.5691643793161355e-08, + "loss": 0.73972535, + "num_input_tokens_seen": 341007300, + "step": 15808, + "time_per_iteration": 2.6571197509765625 + }, + { + "auxiliary_loss_clip": 0.01097791, + "auxiliary_loss_mlp": 0.01030365, + "balance_loss_clip": 1.03632784, + "balance_loss_mlp": 1.01830709, + "epoch": 0.9504885014279273, + "flos": 22124025636480.0, + "grad_norm": 1.4241274902229573, + "language_loss": 0.69725883, + "learning_rate": 2.562945671948058e-08, + "loss": 0.71854043, + "num_input_tokens_seen": 341026695, + "step": 15809, + "time_per_iteration": 2.602086067199707 + }, + { + "auxiliary_loss_clip": 0.0108373, + "auxiliary_loss_mlp": 0.01027915, + "balance_loss_clip": 1.03374672, + "balance_loss_mlp": 1.01552939, + "epoch": 0.9505486246805952, + "flos": 21615961714560.0, + "grad_norm": 1.5381287986116137, + "language_loss": 0.75574476, + "learning_rate": 2.5567344514128452e-08, + "loss": 0.77686119, + "num_input_tokens_seen": 341047080, + "step": 15810, + "time_per_iteration": 2.7851271629333496 + }, + { + "auxiliary_loss_clip": 0.01074163, + "auxiliary_loss_mlp": 0.01043017, + "balance_loss_clip": 1.03387725, + "balance_loss_mlp": 1.03033352, + "epoch": 0.9506087479332632, + "flos": 22528236360960.0, + "grad_norm": 1.4680738031168652, + "language_loss": 0.79985034, + "learning_rate": 2.5505307179460643e-08, + "loss": 0.82102215, + "num_input_tokens_seen": 341067310, + "step": 15811, + "time_per_iteration": 2.716155767440796 + }, + { + "auxiliary_loss_clip": 0.01082329, + "auxiliary_loss_mlp": 0.01038784, + "balance_loss_clip": 1.03409791, + "balance_loss_mlp": 1.02606487, + "epoch": 0.9506688711859311, + "flos": 27527360641920.0, + "grad_norm": 2.1864110496701823, + "language_loss": 0.69794703, + "learning_rate": 2.5443344717829495e-08, + "loss": 0.71915817, + "num_input_tokens_seen": 341085110, + "step": 15812, + "time_per_iteration": 2.7080633640289307 + }, + { + "auxiliary_loss_clip": 0.01071236, + "auxiliary_loss_mlp": 0.01036435, + "balance_loss_clip": 1.03449655, + "balance_loss_mlp": 1.0230056, + "epoch": 0.9507289944385992, + "flos": 19865783531520.0, + "grad_norm": 1.621391502442825, + "language_loss": 0.65664506, + "learning_rate": 2.538145713158446e-08, + "loss": 0.67772174, + "num_input_tokens_seen": 341103190, + "step": 15813, + "time_per_iteration": 2.6422770023345947 + }, + { + "auxiliary_loss_clip": 0.01099547, + "auxiliary_loss_mlp": 0.01037026, + "balance_loss_clip": 1.03611267, + "balance_loss_mlp": 1.02409816, + "epoch": 0.9507891176912671, + "flos": 25193274969600.0, + "grad_norm": 1.4581482978793308, + "language_loss": 0.70320028, + "learning_rate": 2.5319644423072327e-08, + "loss": 0.72456604, + "num_input_tokens_seen": 341125695, + "step": 15814, + "time_per_iteration": 2.658942699432373 + }, + { + "auxiliary_loss_clip": 0.01097344, + "auxiliary_loss_mlp": 0.01028268, + "balance_loss_clip": 1.03695726, + "balance_loss_mlp": 1.01665115, + "epoch": 0.9508492409439351, + "flos": 24899561458560.0, + "grad_norm": 1.8950917263769373, + "language_loss": 0.63310945, + "learning_rate": 2.5257906594637445e-08, + "loss": 0.65436554, + "num_input_tokens_seen": 341143930, + "step": 15815, + "time_per_iteration": 2.633420944213867 + }, + { + "auxiliary_loss_clip": 0.01084007, + "auxiliary_loss_mlp": 0.01027739, + "balance_loss_clip": 1.03480506, + "balance_loss_mlp": 1.01581824, + "epoch": 0.950909364196603, + "flos": 29784094375680.0, + "grad_norm": 1.8730237235822342, + "language_loss": 0.58833039, + "learning_rate": 2.519624364862061e-08, + "loss": 0.60944784, + "num_input_tokens_seen": 341164280, + "step": 15816, + "time_per_iteration": 2.7500061988830566 + }, + { + "auxiliary_loss_clip": 0.0110715, + "auxiliary_loss_mlp": 0.01039761, + "balance_loss_clip": 1.03585255, + "balance_loss_mlp": 1.02707124, + "epoch": 0.950969487449271, + "flos": 24717781704960.0, + "grad_norm": 1.491116548169098, + "language_loss": 0.73515993, + "learning_rate": 2.513465558735994e-08, + "loss": 0.75662911, + "num_input_tokens_seen": 341183670, + "step": 15817, + "time_per_iteration": 2.6232523918151855 + }, + { + "auxiliary_loss_clip": 0.01089005, + "auxiliary_loss_mlp": 0.01034701, + "balance_loss_clip": 1.03726554, + "balance_loss_mlp": 1.02046156, + "epoch": 0.9510296107019389, + "flos": 13699167494400.0, + "grad_norm": 1.5602178845218895, + "language_loss": 0.60236609, + "learning_rate": 2.5073142413190918e-08, + "loss": 0.62360317, + "num_input_tokens_seen": 341201900, + "step": 15818, + "time_per_iteration": 2.6116764545440674 + }, + { + "auxiliary_loss_clip": 0.01109081, + "auxiliary_loss_mlp": 0.01034014, + "balance_loss_clip": 1.0376842, + "balance_loss_mlp": 1.02124643, + "epoch": 0.9510897339546069, + "flos": 17311852667520.0, + "grad_norm": 2.0566483218675438, + "language_loss": 0.6942215, + "learning_rate": 2.5011704128446552e-08, + "loss": 0.71565247, + "num_input_tokens_seen": 341218340, + "step": 15819, + "time_per_iteration": 2.560081958770752 + }, + { + "auxiliary_loss_clip": 0.0107016, + "auxiliary_loss_mlp": 0.01028125, + "balance_loss_clip": 1.0372014, + "balance_loss_mlp": 1.0156858, + "epoch": 0.951149857207275, + "flos": 14793940166400.0, + "grad_norm": 1.7393168966248527, + "language_loss": 0.73959541, + "learning_rate": 2.49503407354561e-08, + "loss": 0.76057822, + "num_input_tokens_seen": 341235885, + "step": 15820, + "time_per_iteration": 2.797940969467163 + }, + { + "auxiliary_loss_clip": 0.01089647, + "auxiliary_loss_mlp": 0.01033566, + "balance_loss_clip": 1.03681791, + "balance_loss_mlp": 1.02076864, + "epoch": 0.9512099804599429, + "flos": 19391152193280.0, + "grad_norm": 1.9037531735176971, + "language_loss": 0.78643155, + "learning_rate": 2.4889052236546804e-08, + "loss": 0.80766368, + "num_input_tokens_seen": 341255280, + "step": 15821, + "time_per_iteration": 2.6202476024627686 + }, + { + "auxiliary_loss_clip": 0.01068626, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.03432822, + "balance_loss_mlp": 1.01609325, + "epoch": 0.9512701037126109, + "flos": 36757874885760.0, + "grad_norm": 1.5233600677568924, + "language_loss": 0.71154618, + "learning_rate": 2.4827838634042586e-08, + "loss": 0.73252249, + "num_input_tokens_seen": 341279055, + "step": 15822, + "time_per_iteration": 2.8137216567993164 + }, + { + "auxiliary_loss_clip": 0.01094806, + "auxiliary_loss_mlp": 0.01037452, + "balance_loss_clip": 1.03667974, + "balance_loss_mlp": 1.02534676, + "epoch": 0.9513302269652788, + "flos": 22638266697600.0, + "grad_norm": 1.6180069901826792, + "language_loss": 0.65828168, + "learning_rate": 2.47666999302647e-08, + "loss": 0.67960423, + "num_input_tokens_seen": 341298560, + "step": 15823, + "time_per_iteration": 2.616811513900757 + }, + { + "auxiliary_loss_clip": 0.01090848, + "auxiliary_loss_mlp": 0.01031876, + "balance_loss_clip": 1.03517807, + "balance_loss_mlp": 1.01999104, + "epoch": 0.9513903502179468, + "flos": 22893232412160.0, + "grad_norm": 1.8863282557920107, + "language_loss": 0.77391565, + "learning_rate": 2.4705636127531292e-08, + "loss": 0.79514301, + "num_input_tokens_seen": 341316650, + "step": 15824, + "time_per_iteration": 2.5897138118743896 + }, + { + "auxiliary_loss_clip": 0.01110536, + "auxiliary_loss_mlp": 0.01031394, + "balance_loss_clip": 1.0360415, + "balance_loss_mlp": 1.01814985, + "epoch": 0.9514504734706147, + "flos": 27928626451200.0, + "grad_norm": 1.8804632984111238, + "language_loss": 0.73739725, + "learning_rate": 2.4644647228158065e-08, + "loss": 0.75881654, + "num_input_tokens_seen": 341336185, + "step": 15825, + "time_per_iteration": 2.59452223777771 + }, + { + "auxiliary_loss_clip": 0.0101482, + "auxiliary_loss_mlp": 0.00999967, + "balance_loss_clip": 1.00606704, + "balance_loss_mlp": 0.99895328, + "epoch": 0.9515105967232828, + "flos": 67366767312000.0, + "grad_norm": 0.8525119381835639, + "language_loss": 0.53459394, + "learning_rate": 2.458373323445806e-08, + "loss": 0.55474186, + "num_input_tokens_seen": 341395795, + "step": 15826, + "time_per_iteration": 3.0530049800872803 + }, + { + "auxiliary_loss_clip": 0.01084306, + "auxiliary_loss_mlp": 0.01035155, + "balance_loss_clip": 1.03550363, + "balance_loss_mlp": 1.02248907, + "epoch": 0.9515707199759507, + "flos": 25846525664640.0, + "grad_norm": 2.1311197223836458, + "language_loss": 0.72489649, + "learning_rate": 2.452289414874076e-08, + "loss": 0.74609113, + "num_input_tokens_seen": 341415675, + "step": 15827, + "time_per_iteration": 2.67301869392395 + }, + { + "auxiliary_loss_clip": 0.01086796, + "auxiliary_loss_mlp": 0.01030812, + "balance_loss_clip": 1.03601933, + "balance_loss_mlp": 1.01807427, + "epoch": 0.9516308432286187, + "flos": 21828983322240.0, + "grad_norm": 1.785994656352798, + "language_loss": 0.7409234, + "learning_rate": 2.4462129973313207e-08, + "loss": 0.7620995, + "num_input_tokens_seen": 341432990, + "step": 15828, + "time_per_iteration": 2.6235291957855225 + }, + { + "auxiliary_loss_clip": 0.0106734, + "auxiliary_loss_mlp": 0.01033444, + "balance_loss_clip": 1.03639388, + "balance_loss_mlp": 1.02239347, + "epoch": 0.9516909664812866, + "flos": 27269593666560.0, + "grad_norm": 1.6745966727407804, + "language_loss": 0.72937709, + "learning_rate": 2.440144071047978e-08, + "loss": 0.75038493, + "num_input_tokens_seen": 341454100, + "step": 15829, + "time_per_iteration": 2.831969738006592 + }, + { + "auxiliary_loss_clip": 0.01093583, + "auxiliary_loss_mlp": 0.01033735, + "balance_loss_clip": 1.03440034, + "balance_loss_mlp": 1.02001393, + "epoch": 0.9517510897339546, + "flos": 21215342350080.0, + "grad_norm": 2.2807166636074863, + "language_loss": 0.61247396, + "learning_rate": 2.4340826362541533e-08, + "loss": 0.6337471, + "num_input_tokens_seen": 341472955, + "step": 15830, + "time_per_iteration": 2.57916522026062 + }, + { + "auxiliary_loss_clip": 0.01095854, + "auxiliary_loss_mlp": 0.01031396, + "balance_loss_clip": 1.03783762, + "balance_loss_mlp": 1.01720452, + "epoch": 0.9518112129866225, + "flos": 18733986915840.0, + "grad_norm": 2.414229225065315, + "language_loss": 0.72597665, + "learning_rate": 2.428028693179729e-08, + "loss": 0.74724913, + "num_input_tokens_seen": 341490165, + "step": 15831, + "time_per_iteration": 2.590857982635498 + }, + { + "auxiliary_loss_clip": 0.01054785, + "auxiliary_loss_mlp": 0.01024066, + "balance_loss_clip": 1.03245831, + "balance_loss_mlp": 1.01274085, + "epoch": 0.9518713362392905, + "flos": 16763676232320.0, + "grad_norm": 1.6809065599907524, + "language_loss": 0.65303266, + "learning_rate": 2.4219822420542545e-08, + "loss": 0.67382115, + "num_input_tokens_seen": 341508055, + "step": 15832, + "time_per_iteration": 2.7475693225860596 + }, + { + "auxiliary_loss_clip": 0.01093001, + "auxiliary_loss_mlp": 0.01036123, + "balance_loss_clip": 1.03763044, + "balance_loss_mlp": 1.02308214, + "epoch": 0.9519314594919586, + "flos": 15230649720960.0, + "grad_norm": 1.7187781750552136, + "language_loss": 0.77851391, + "learning_rate": 2.4159432831070135e-08, + "loss": 0.79980505, + "num_input_tokens_seen": 341526155, + "step": 15833, + "time_per_iteration": 2.5683181285858154 + }, + { + "auxiliary_loss_clip": 0.01069974, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.03459656, + "balance_loss_mlp": 1.01919472, + "epoch": 0.9519915827446265, + "flos": 19352943100800.0, + "grad_norm": 1.9155025813330617, + "language_loss": 0.75245464, + "learning_rate": 2.4099118165670007e-08, + "loss": 0.77346802, + "num_input_tokens_seen": 341540450, + "step": 15834, + "time_per_iteration": 5.729520559310913 + }, + { + "auxiliary_loss_clip": 0.01098407, + "auxiliary_loss_mlp": 0.01035182, + "balance_loss_clip": 1.038692, + "balance_loss_mlp": 1.02169371, + "epoch": 0.9520517059972945, + "flos": 22266303408000.0, + "grad_norm": 2.297648558034633, + "language_loss": 0.7629987, + "learning_rate": 2.4038878426629216e-08, + "loss": 0.78433454, + "num_input_tokens_seen": 341557865, + "step": 15835, + "time_per_iteration": 2.570033073425293 + }, + { + "auxiliary_loss_clip": 0.01086302, + "auxiliary_loss_mlp": 0.01031867, + "balance_loss_clip": 1.03379786, + "balance_loss_mlp": 1.01873004, + "epoch": 0.9521118292499624, + "flos": 14862313704960.0, + "grad_norm": 1.9369044520517964, + "language_loss": 0.6651296, + "learning_rate": 2.397871361623238e-08, + "loss": 0.68631124, + "num_input_tokens_seen": 341573890, + "step": 15836, + "time_per_iteration": 4.36873197555542 + }, + { + "auxiliary_loss_clip": 0.01072203, + "auxiliary_loss_mlp": 0.01027392, + "balance_loss_clip": 1.03464746, + "balance_loss_mlp": 1.01531649, + "epoch": 0.9521719525026304, + "flos": 23508812718720.0, + "grad_norm": 1.945512889089952, + "language_loss": 0.70333862, + "learning_rate": 2.391862373676057e-08, + "loss": 0.72433454, + "num_input_tokens_seen": 341593770, + "step": 15837, + "time_per_iteration": 2.705793619155884 + }, + { + "auxiliary_loss_clip": 0.01110794, + "auxiliary_loss_mlp": 0.01033236, + "balance_loss_clip": 1.03770208, + "balance_loss_mlp": 1.01952064, + "epoch": 0.9522320757552983, + "flos": 19714922409600.0, + "grad_norm": 2.0871537703897673, + "language_loss": 0.73548734, + "learning_rate": 2.3858608790492617e-08, + "loss": 0.75692767, + "num_input_tokens_seen": 341612065, + "step": 15838, + "time_per_iteration": 2.626145362854004 + }, + { + "auxiliary_loss_clip": 0.01076517, + "auxiliary_loss_mlp": 0.01027932, + "balance_loss_clip": 1.03396976, + "balance_loss_mlp": 1.0152297, + "epoch": 0.9522921990079664, + "flos": 25921291824000.0, + "grad_norm": 4.0429942363631275, + "language_loss": 0.78156877, + "learning_rate": 2.379866877970449e-08, + "loss": 0.80261326, + "num_input_tokens_seen": 341631365, + "step": 15839, + "time_per_iteration": 4.274654865264893 + }, + { + "auxiliary_loss_clip": 0.01085718, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.04032528, + "balance_loss_mlp": 1.02013206, + "epoch": 0.9523523222606343, + "flos": 19208115463680.0, + "grad_norm": 1.5000947489157939, + "language_loss": 0.80272675, + "learning_rate": 2.3738803706668585e-08, + "loss": 0.82390767, + "num_input_tokens_seen": 341650300, + "step": 15840, + "time_per_iteration": 2.7204654216766357 + }, + { + "auxiliary_loss_clip": 0.01078473, + "auxiliary_loss_mlp": 0.01028715, + "balance_loss_clip": 1.0350554, + "balance_loss_mlp": 1.0179565, + "epoch": 0.9524124455133023, + "flos": 20921269703040.0, + "grad_norm": 2.113087759766638, + "language_loss": 0.73338723, + "learning_rate": 2.3679013573655314e-08, + "loss": 0.75445914, + "num_input_tokens_seen": 341667680, + "step": 15841, + "time_per_iteration": 2.6518993377685547 + }, + { + "auxiliary_loss_clip": 0.01080022, + "auxiliary_loss_mlp": 0.01026928, + "balance_loss_clip": 1.03612816, + "balance_loss_mlp": 1.01526952, + "epoch": 0.9524725687659702, + "flos": 18843550375680.0, + "grad_norm": 1.7318468009780055, + "language_loss": 0.79018557, + "learning_rate": 2.3619298382931972e-08, + "loss": 0.8112551, + "num_input_tokens_seen": 341685760, + "step": 15842, + "time_per_iteration": 2.620762825012207 + }, + { + "auxiliary_loss_clip": 0.01085992, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.03697205, + "balance_loss_mlp": 1.01970291, + "epoch": 0.9525326920186382, + "flos": 22674680110080.0, + "grad_norm": 2.120857377915384, + "language_loss": 0.72623742, + "learning_rate": 2.3559658136762973e-08, + "loss": 0.74742007, + "num_input_tokens_seen": 341705300, + "step": 15843, + "time_per_iteration": 2.643082618713379 + }, + { + "auxiliary_loss_clip": 0.01080268, + "auxiliary_loss_mlp": 0.00770279, + "balance_loss_clip": 1.03644204, + "balance_loss_mlp": 1.00023687, + "epoch": 0.9525928152713061, + "flos": 22086642556800.0, + "grad_norm": 1.7610421238919713, + "language_loss": 0.78494173, + "learning_rate": 2.3500092837409612e-08, + "loss": 0.80344719, + "num_input_tokens_seen": 341724565, + "step": 15844, + "time_per_iteration": 2.672140121459961 + }, + { + "auxiliary_loss_clip": 0.0107313, + "auxiliary_loss_mlp": 0.01034185, + "balance_loss_clip": 1.03377759, + "balance_loss_mlp": 1.0192126, + "epoch": 0.9526529385239741, + "flos": 20704728562560.0, + "grad_norm": 2.3272849000884133, + "language_loss": 0.70132804, + "learning_rate": 2.3440602487130977e-08, + "loss": 0.7224012, + "num_input_tokens_seen": 341743605, + "step": 15845, + "time_per_iteration": 2.6669421195983887 + }, + { + "auxiliary_loss_clip": 0.01073757, + "auxiliary_loss_mlp": 0.01035061, + "balance_loss_clip": 1.03685403, + "balance_loss_mlp": 1.02289605, + "epoch": 0.9527130617766422, + "flos": 23368043318400.0, + "grad_norm": 1.600943165785114, + "language_loss": 0.75702989, + "learning_rate": 2.338118708818282e-08, + "loss": 0.77811807, + "num_input_tokens_seen": 341763475, + "step": 15846, + "time_per_iteration": 2.7024073600769043 + }, + { + "auxiliary_loss_clip": 0.01078418, + "auxiliary_loss_mlp": 0.01025955, + "balance_loss_clip": 1.03588128, + "balance_loss_mlp": 1.01366425, + "epoch": 0.9527731850293101, + "flos": 18985935888000.0, + "grad_norm": 1.8711646240490332, + "language_loss": 0.78105325, + "learning_rate": 2.3321846642817998e-08, + "loss": 0.80209702, + "num_input_tokens_seen": 341781265, + "step": 15847, + "time_per_iteration": 2.780184507369995 + }, + { + "auxiliary_loss_clip": 0.01066366, + "auxiliary_loss_mlp": 0.01035518, + "balance_loss_clip": 1.03419328, + "balance_loss_mlp": 1.0241214, + "epoch": 0.9528333082819781, + "flos": 19318038059520.0, + "grad_norm": 1.9530188537907924, + "language_loss": 0.7798357, + "learning_rate": 2.326258115328672e-08, + "loss": 0.80085456, + "num_input_tokens_seen": 341798825, + "step": 15848, + "time_per_iteration": 2.7238736152648926 + }, + { + "auxiliary_loss_clip": 0.01089796, + "auxiliary_loss_mlp": 0.01042438, + "balance_loss_clip": 1.03580141, + "balance_loss_mlp": 1.02845478, + "epoch": 0.952893431534646, + "flos": 23951340276480.0, + "grad_norm": 1.8605077163556365, + "language_loss": 0.72040188, + "learning_rate": 2.320339062183674e-08, + "loss": 0.74172425, + "num_input_tokens_seen": 341819480, + "step": 15849, + "time_per_iteration": 2.682178258895874 + }, + { + "auxiliary_loss_clip": 0.01105363, + "auxiliary_loss_mlp": 0.01038455, + "balance_loss_clip": 1.04046464, + "balance_loss_mlp": 1.02527022, + "epoch": 0.952953554787314, + "flos": 21030545854080.0, + "grad_norm": 1.819487619719596, + "language_loss": 0.75498259, + "learning_rate": 2.314427505071226e-08, + "loss": 0.77642077, + "num_input_tokens_seen": 341838035, + "step": 15850, + "time_per_iteration": 2.6890413761138916 + }, + { + "auxiliary_loss_clip": 0.01080509, + "auxiliary_loss_mlp": 0.01034554, + "balance_loss_clip": 1.03441119, + "balance_loss_mlp": 1.02248454, + "epoch": 0.9530136780399819, + "flos": 22382870019840.0, + "grad_norm": 2.121651511514479, + "language_loss": 0.72852147, + "learning_rate": 2.308523444215482e-08, + "loss": 0.74967206, + "num_input_tokens_seen": 341855895, + "step": 15851, + "time_per_iteration": 2.681929111480713 + }, + { + "auxiliary_loss_clip": 0.01082039, + "auxiliary_loss_mlp": 0.01026961, + "balance_loss_clip": 1.03587413, + "balance_loss_mlp": 1.01521945, + "epoch": 0.95307380129265, + "flos": 22159613036160.0, + "grad_norm": 1.7583423782489798, + "language_loss": 0.79609531, + "learning_rate": 2.3026268798403525e-08, + "loss": 0.8171854, + "num_input_tokens_seen": 341875240, + "step": 15852, + "time_per_iteration": 2.6543726921081543 + }, + { + "auxiliary_loss_clip": 0.01097888, + "auxiliary_loss_mlp": 0.01036154, + "balance_loss_clip": 1.03511071, + "balance_loss_mlp": 1.02323198, + "epoch": 0.9531339245453179, + "flos": 44022747214080.0, + "grad_norm": 1.5582381447981437, + "language_loss": 0.59615147, + "learning_rate": 2.2967378121694138e-08, + "loss": 0.61749196, + "num_input_tokens_seen": 341901020, + "step": 15853, + "time_per_iteration": 2.7729127407073975 + }, + { + "auxiliary_loss_clip": 0.01084188, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.0343461, + "balance_loss_mlp": 1.02051926, + "epoch": 0.9531940477979859, + "flos": 20266690204800.0, + "grad_norm": 1.8458954546465922, + "language_loss": 0.72333086, + "learning_rate": 2.290856241425998e-08, + "loss": 0.74449503, + "num_input_tokens_seen": 341919365, + "step": 15854, + "time_per_iteration": 2.667217254638672 + }, + { + "auxiliary_loss_clip": 0.01081866, + "auxiliary_loss_mlp": 0.01031192, + "balance_loss_clip": 1.03433609, + "balance_loss_mlp": 1.01909232, + "epoch": 0.9532541710506538, + "flos": 25335732309120.0, + "grad_norm": 2.1969630613589057, + "language_loss": 0.67196018, + "learning_rate": 2.284982167833127e-08, + "loss": 0.69309074, + "num_input_tokens_seen": 341939985, + "step": 15855, + "time_per_iteration": 2.6534695625305176 + }, + { + "auxiliary_loss_clip": 0.01109271, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.0367763, + "balance_loss_mlp": 1.01885402, + "epoch": 0.9533142943033218, + "flos": 26469288691200.0, + "grad_norm": 1.5274275727026758, + "language_loss": 0.76655555, + "learning_rate": 2.279115591613556e-08, + "loss": 0.78795809, + "num_input_tokens_seen": 341959255, + "step": 15856, + "time_per_iteration": 2.6008455753326416 + }, + { + "auxiliary_loss_clip": 0.01080944, + "auxiliary_loss_mlp": 0.0103369, + "balance_loss_clip": 1.03132057, + "balance_loss_mlp": 1.02190578, + "epoch": 0.9533744175559897, + "flos": 23656944407040.0, + "grad_norm": 1.7148081146844736, + "language_loss": 0.77968013, + "learning_rate": 2.2732565129897075e-08, + "loss": 0.80082643, + "num_input_tokens_seen": 341977205, + "step": 15857, + "time_per_iteration": 2.6481335163116455 + }, + { + "auxiliary_loss_clip": 0.01019391, + "auxiliary_loss_mlp": 0.01003272, + "balance_loss_clip": 1.00663853, + "balance_loss_mlp": 1.00225866, + "epoch": 0.9534345408086577, + "flos": 61052055500160.0, + "grad_norm": 0.7153193040155459, + "language_loss": 0.6259079, + "learning_rate": 2.267404932183803e-08, + "loss": 0.6461345, + "num_input_tokens_seen": 342038545, + "step": 15858, + "time_per_iteration": 3.112011671066284 + }, + { + "auxiliary_loss_clip": 0.01057029, + "auxiliary_loss_mlp": 0.01028806, + "balance_loss_clip": 1.03275323, + "balance_loss_mlp": 1.01722491, + "epoch": 0.9534946640613258, + "flos": 18951677291520.0, + "grad_norm": 1.5293384678689539, + "language_loss": 0.56808496, + "learning_rate": 2.2615608494177097e-08, + "loss": 0.58894336, + "num_input_tokens_seen": 342058195, + "step": 15859, + "time_per_iteration": 2.699678897857666 + }, + { + "auxiliary_loss_clip": 0.01104207, + "auxiliary_loss_mlp": 0.01030176, + "balance_loss_clip": 1.03593767, + "balance_loss_mlp": 1.01922047, + "epoch": 0.9535547873139937, + "flos": 16654292340480.0, + "grad_norm": 2.076157934676356, + "language_loss": 0.81695747, + "learning_rate": 2.2557242649130504e-08, + "loss": 0.8383013, + "num_input_tokens_seen": 342075025, + "step": 15860, + "time_per_iteration": 2.5248684883117676 + }, + { + "auxiliary_loss_clip": 0.01057722, + "auxiliary_loss_mlp": 0.0076914, + "balance_loss_clip": 1.03329587, + "balance_loss_mlp": 1.00022173, + "epoch": 0.9536149105666617, + "flos": 20667776446080.0, + "grad_norm": 2.0339237108527195, + "language_loss": 0.66784334, + "learning_rate": 2.249895178891159e-08, + "loss": 0.68611193, + "num_input_tokens_seen": 342094595, + "step": 15861, + "time_per_iteration": 2.764711856842041 + }, + { + "auxiliary_loss_clip": 0.01097732, + "auxiliary_loss_mlp": 0.01036637, + "balance_loss_clip": 1.03534853, + "balance_loss_mlp": 1.02341676, + "epoch": 0.9536750338193296, + "flos": 30700499086080.0, + "grad_norm": 1.722759616430161, + "language_loss": 0.65783358, + "learning_rate": 2.244073591573037e-08, + "loss": 0.67917728, + "num_input_tokens_seen": 342115970, + "step": 15862, + "time_per_iteration": 2.8370909690856934 + }, + { + "auxiliary_loss_clip": 0.01067937, + "auxiliary_loss_mlp": 0.01033375, + "balance_loss_clip": 1.03313565, + "balance_loss_mlp": 1.02116823, + "epoch": 0.9537351570719976, + "flos": 20405484357120.0, + "grad_norm": 1.5180821577389316, + "language_loss": 0.67942423, + "learning_rate": 2.238259503179485e-08, + "loss": 0.70043731, + "num_input_tokens_seen": 342134080, + "step": 15863, + "time_per_iteration": 2.85260272026062 + }, + { + "auxiliary_loss_clip": 0.01087422, + "auxiliary_loss_mlp": 0.01028951, + "balance_loss_clip": 1.03365183, + "balance_loss_mlp": 1.01648188, + "epoch": 0.9537952803246655, + "flos": 29929245235200.0, + "grad_norm": 2.093402061794127, + "language_loss": 0.78434008, + "learning_rate": 2.2324529139309267e-08, + "loss": 0.80550379, + "num_input_tokens_seen": 342154725, + "step": 15864, + "time_per_iteration": 2.7751903533935547 + }, + { + "auxiliary_loss_clip": 0.01077785, + "auxiliary_loss_mlp": 0.01026692, + "balance_loss_clip": 1.03687298, + "balance_loss_mlp": 1.01424658, + "epoch": 0.9538554035773336, + "flos": 20521404524160.0, + "grad_norm": 2.5427902857740463, + "language_loss": 0.60073441, + "learning_rate": 2.226653824047586e-08, + "loss": 0.6217792, + "num_input_tokens_seen": 342172275, + "step": 15865, + "time_per_iteration": 2.668893337249756 + }, + { + "auxiliary_loss_clip": 0.01066094, + "auxiliary_loss_mlp": 0.01038068, + "balance_loss_clip": 1.03419495, + "balance_loss_mlp": 1.02391815, + "epoch": 0.9539155268300015, + "flos": 18406517598720.0, + "grad_norm": 1.825358390609407, + "language_loss": 0.70074368, + "learning_rate": 2.2208622337493765e-08, + "loss": 0.72178537, + "num_input_tokens_seen": 342190880, + "step": 15866, + "time_per_iteration": 2.6656248569488525 + }, + { + "auxiliary_loss_clip": 0.01083648, + "auxiliary_loss_mlp": 0.0103856, + "balance_loss_clip": 1.03381348, + "balance_loss_mlp": 1.02469635, + "epoch": 0.9539756500826695, + "flos": 26213281482240.0, + "grad_norm": 7.66097760902825, + "language_loss": 0.84885997, + "learning_rate": 2.215078143255855e-08, + "loss": 0.87008202, + "num_input_tokens_seen": 342208165, + "step": 15867, + "time_per_iteration": 2.7268359661102295 + }, + { + "auxiliary_loss_clip": 0.01016664, + "auxiliary_loss_mlp": 0.0100223, + "balance_loss_clip": 1.0065484, + "balance_loss_mlp": 1.00118721, + "epoch": 0.9540357733353374, + "flos": 68289097766400.0, + "grad_norm": 0.8394455572132413, + "language_loss": 0.61809933, + "learning_rate": 2.2093015527864024e-08, + "loss": 0.63828826, + "num_input_tokens_seen": 342277110, + "step": 15868, + "time_per_iteration": 3.1767897605895996 + }, + { + "auxiliary_loss_clip": 0.01070741, + "auxiliary_loss_mlp": 0.0102867, + "balance_loss_clip": 1.03546822, + "balance_loss_mlp": 1.01636136, + "epoch": 0.9540958965880054, + "flos": 21288276915840.0, + "grad_norm": 1.9166883744985537, + "language_loss": 0.60024238, + "learning_rate": 2.2035324625600425e-08, + "loss": 0.62123656, + "num_input_tokens_seen": 342294695, + "step": 15869, + "time_per_iteration": 2.825597047805786 + }, + { + "auxiliary_loss_clip": 0.01069204, + "auxiliary_loss_mlp": 0.00772179, + "balance_loss_clip": 1.032269, + "balance_loss_mlp": 1.00027442, + "epoch": 0.9541560198406733, + "flos": 19751407649280.0, + "grad_norm": 1.8610543982135193, + "language_loss": 0.71071583, + "learning_rate": 2.197770872795579e-08, + "loss": 0.72912961, + "num_input_tokens_seen": 342314970, + "step": 15870, + "time_per_iteration": 2.7531421184539795 + }, + { + "auxiliary_loss_clip": 0.01067012, + "auxiliary_loss_mlp": 0.01028174, + "balance_loss_clip": 1.03300095, + "balance_loss_mlp": 1.01587772, + "epoch": 0.9542161430933414, + "flos": 24715626888960.0, + "grad_norm": 1.7781564124647944, + "language_loss": 0.76756346, + "learning_rate": 2.1920167837114368e-08, + "loss": 0.78851533, + "num_input_tokens_seen": 342334255, + "step": 15871, + "time_per_iteration": 2.724163770675659 + }, + { + "auxiliary_loss_clip": 0.01096753, + "auxiliary_loss_mlp": 0.01035234, + "balance_loss_clip": 1.0351069, + "balance_loss_mlp": 1.02213871, + "epoch": 0.9542762663460094, + "flos": 31065818359680.0, + "grad_norm": 1.9388679259393415, + "language_loss": 0.58526534, + "learning_rate": 2.1862701955258634e-08, + "loss": 0.60658514, + "num_input_tokens_seen": 342354730, + "step": 15872, + "time_per_iteration": 2.7208635807037354 + }, + { + "auxiliary_loss_clip": 0.01085098, + "auxiliary_loss_mlp": 0.01034184, + "balance_loss_clip": 1.03341579, + "balance_loss_mlp": 1.02033806, + "epoch": 0.9543363895986773, + "flos": 20776729374720.0, + "grad_norm": 2.397894266058994, + "language_loss": 0.74827802, + "learning_rate": 2.1805311084567514e-08, + "loss": 0.76947081, + "num_input_tokens_seen": 342374565, + "step": 15873, + "time_per_iteration": 4.379558086395264 + }, + { + "auxiliary_loss_clip": 0.01111454, + "auxiliary_loss_mlp": 0.01032817, + "balance_loss_clip": 1.0387311, + "balance_loss_mlp": 1.01963258, + "epoch": 0.9543965128513453, + "flos": 24462744163200.0, + "grad_norm": 1.9355802772017296, + "language_loss": 0.62851435, + "learning_rate": 2.1747995227217265e-08, + "loss": 0.64995706, + "num_input_tokens_seen": 342394590, + "step": 15874, + "time_per_iteration": 2.5884764194488525 + }, + { + "auxiliary_loss_clip": 0.01084158, + "auxiliary_loss_mlp": 0.01034983, + "balance_loss_clip": 1.03476036, + "balance_loss_mlp": 1.02254963, + "epoch": 0.9544566361040132, + "flos": 15261532439040.0, + "grad_norm": 1.953622113172995, + "language_loss": 0.89690936, + "learning_rate": 2.169075438538104e-08, + "loss": 0.91810071, + "num_input_tokens_seen": 342410445, + "step": 15875, + "time_per_iteration": 4.317510604858398 + }, + { + "auxiliary_loss_clip": 0.01112734, + "auxiliary_loss_mlp": 0.01033431, + "balance_loss_clip": 1.03820455, + "balance_loss_mlp": 1.02027059, + "epoch": 0.9545167593566812, + "flos": 25918777872000.0, + "grad_norm": 1.5906794520251055, + "language_loss": 0.67873561, + "learning_rate": 2.1633588561229765e-08, + "loss": 0.70019734, + "num_input_tokens_seen": 342430970, + "step": 15876, + "time_per_iteration": 2.599390983581543 + }, + { + "auxiliary_loss_clip": 0.01097415, + "auxiliary_loss_mlp": 0.01036316, + "balance_loss_clip": 1.03623271, + "balance_loss_mlp": 1.02300572, + "epoch": 0.9545768826093491, + "flos": 25628188844160.0, + "grad_norm": 1.8099744313437123, + "language_loss": 0.69018167, + "learning_rate": 2.1576497756931267e-08, + "loss": 0.711519, + "num_input_tokens_seen": 342449505, + "step": 15877, + "time_per_iteration": 2.621135711669922 + }, + { + "auxiliary_loss_clip": 0.01068154, + "auxiliary_loss_mlp": 0.01036314, + "balance_loss_clip": 1.03443968, + "balance_loss_mlp": 1.02281344, + "epoch": 0.9546370058620172, + "flos": 22491499726080.0, + "grad_norm": 1.864175160647524, + "language_loss": 0.71021724, + "learning_rate": 2.1519481974650035e-08, + "loss": 0.73126197, + "num_input_tokens_seen": 342470390, + "step": 15878, + "time_per_iteration": 4.388718843460083 + }, + { + "auxiliary_loss_clip": 0.01104891, + "auxiliary_loss_mlp": 0.01030745, + "balance_loss_clip": 1.03498161, + "balance_loss_mlp": 1.01849008, + "epoch": 0.9546971291146851, + "flos": 24609582961920.0, + "grad_norm": 1.3169717238469103, + "language_loss": 0.67999732, + "learning_rate": 2.1462541216548335e-08, + "loss": 0.70135367, + "num_input_tokens_seen": 342492560, + "step": 15879, + "time_per_iteration": 2.634164571762085 + }, + { + "auxiliary_loss_clip": 0.01071861, + "auxiliary_loss_mlp": 0.0076975, + "balance_loss_clip": 1.03325868, + "balance_loss_mlp": 1.00017679, + "epoch": 0.9547572523673531, + "flos": 28657756627200.0, + "grad_norm": 1.892359769973216, + "language_loss": 0.84921825, + "learning_rate": 2.1405675484785334e-08, + "loss": 0.86763442, + "num_input_tokens_seen": 342512315, + "step": 15880, + "time_per_iteration": 2.7207343578338623 + }, + { + "auxiliary_loss_clip": 0.01043217, + "auxiliary_loss_mlp": 0.01034447, + "balance_loss_clip": 1.03212571, + "balance_loss_mlp": 1.02152514, + "epoch": 0.954817375620021, + "flos": 33802606385280.0, + "grad_norm": 1.832275853665566, + "language_loss": 0.7208662, + "learning_rate": 2.134888478151753e-08, + "loss": 0.74164283, + "num_input_tokens_seen": 342533060, + "step": 15881, + "time_per_iteration": 3.034219980239868 + }, + { + "auxiliary_loss_clip": 0.01097589, + "auxiliary_loss_mlp": 0.01035091, + "balance_loss_clip": 1.03802884, + "balance_loss_mlp": 1.02235389, + "epoch": 0.954877498872689, + "flos": 14428225843200.0, + "grad_norm": 1.8582437523117474, + "language_loss": 0.71399862, + "learning_rate": 2.1292169108898083e-08, + "loss": 0.73532546, + "num_input_tokens_seen": 342550830, + "step": 15882, + "time_per_iteration": 2.5682435035705566 + }, + { + "auxiliary_loss_clip": 0.0108781, + "auxiliary_loss_mlp": 0.0103464, + "balance_loss_clip": 1.0364579, + "balance_loss_mlp": 1.02317178, + "epoch": 0.9549376221253569, + "flos": 59269447336320.0, + "grad_norm": 1.5893740552045255, + "language_loss": 0.65766758, + "learning_rate": 2.1235528469078168e-08, + "loss": 0.67889214, + "num_input_tokens_seen": 342575070, + "step": 15883, + "time_per_iteration": 3.0329291820526123 + }, + { + "auxiliary_loss_clip": 0.01099334, + "auxiliary_loss_mlp": 0.0103149, + "balance_loss_clip": 1.03810024, + "balance_loss_mlp": 1.01847863, + "epoch": 0.954997745378025, + "flos": 17274397760640.0, + "grad_norm": 2.161620424639411, + "language_loss": 0.78009343, + "learning_rate": 2.1178962864205175e-08, + "loss": 0.80140173, + "num_input_tokens_seen": 342592215, + "step": 15884, + "time_per_iteration": 2.62176513671875 + }, + { + "auxiliary_loss_clip": 0.01109312, + "auxiliary_loss_mlp": 0.01029181, + "balance_loss_clip": 1.03558671, + "balance_loss_mlp": 1.01618683, + "epoch": 0.955057868630693, + "flos": 13006378903680.0, + "grad_norm": 1.803007960356649, + "language_loss": 0.77870518, + "learning_rate": 2.1122472296424054e-08, + "loss": 0.80009007, + "num_input_tokens_seen": 342610030, + "step": 15885, + "time_per_iteration": 2.5647974014282227 + }, + { + "auxiliary_loss_clip": 0.01108326, + "auxiliary_loss_mlp": 0.01033941, + "balance_loss_clip": 1.03567576, + "balance_loss_mlp": 1.02171004, + "epoch": 0.9551179918833609, + "flos": 22637692080000.0, + "grad_norm": 1.6846495820783678, + "language_loss": 0.69959128, + "learning_rate": 2.1066056767877317e-08, + "loss": 0.7210139, + "num_input_tokens_seen": 342626475, + "step": 15886, + "time_per_iteration": 2.6008517742156982 + }, + { + "auxiliary_loss_clip": 0.01074503, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.03510761, + "balance_loss_mlp": 1.02006149, + "epoch": 0.9551781151360289, + "flos": 21542811667200.0, + "grad_norm": 1.590980896681407, + "language_loss": 0.72832477, + "learning_rate": 2.1009716280703916e-08, + "loss": 0.74940896, + "num_input_tokens_seen": 342646645, + "step": 15887, + "time_per_iteration": 2.6831438541412354 + }, + { + "auxiliary_loss_clip": 0.01084236, + "auxiliary_loss_mlp": 0.01031385, + "balance_loss_clip": 1.03418398, + "balance_loss_mlp": 1.01973784, + "epoch": 0.9552382383886968, + "flos": 20702250524160.0, + "grad_norm": 1.933274372299018, + "language_loss": 0.5720163, + "learning_rate": 2.0953450837040364e-08, + "loss": 0.59317255, + "num_input_tokens_seen": 342663615, + "step": 15888, + "time_per_iteration": 2.630725860595703 + }, + { + "auxiliary_loss_clip": 0.01019029, + "auxiliary_loss_mlp": 0.01004801, + "balance_loss_clip": 1.00631261, + "balance_loss_mlp": 1.0038352, + "epoch": 0.9552983616413648, + "flos": 67769792887680.0, + "grad_norm": 0.7050649141864813, + "language_loss": 0.57804728, + "learning_rate": 2.0897260439020514e-08, + "loss": 0.59828568, + "num_input_tokens_seen": 342728275, + "step": 15889, + "time_per_iteration": 3.214216470718384 + }, + { + "auxiliary_loss_clip": 0.01108889, + "auxiliary_loss_mlp": 0.01030731, + "balance_loss_clip": 1.03501582, + "balance_loss_mlp": 1.01774263, + "epoch": 0.9553584848940327, + "flos": 21579979265280.0, + "grad_norm": 1.4933948635050138, + "language_loss": 0.6719625, + "learning_rate": 2.084114508877466e-08, + "loss": 0.69335872, + "num_input_tokens_seen": 342748860, + "step": 15890, + "time_per_iteration": 2.600853443145752 + }, + { + "auxiliary_loss_clip": 0.01108529, + "auxiliary_loss_mlp": 0.01026392, + "balance_loss_clip": 1.03781927, + "balance_loss_mlp": 1.01449537, + "epoch": 0.9554186081467008, + "flos": 24208173498240.0, + "grad_norm": 1.434726031550495, + "language_loss": 0.74308884, + "learning_rate": 2.0785104788430874e-08, + "loss": 0.76443803, + "num_input_tokens_seen": 342769705, + "step": 15891, + "time_per_iteration": 2.604349374771118 + }, + { + "auxiliary_loss_clip": 0.01069647, + "auxiliary_loss_mlp": 0.01028677, + "balance_loss_clip": 1.03273499, + "balance_loss_mlp": 1.01785886, + "epoch": 0.9554787313993687, + "flos": 16251554073600.0, + "grad_norm": 1.905721456172026, + "language_loss": 0.77943361, + "learning_rate": 2.072913954011435e-08, + "loss": 0.80041689, + "num_input_tokens_seen": 342787000, + "step": 15892, + "time_per_iteration": 2.6338727474212646 + }, + { + "auxiliary_loss_clip": 0.0110724, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.03596187, + "balance_loss_mlp": 1.01690435, + "epoch": 0.9555388546520367, + "flos": 23404133508480.0, + "grad_norm": 1.3333935754800896, + "language_loss": 0.6973961, + "learning_rate": 2.0673249345947386e-08, + "loss": 0.71876323, + "num_input_tokens_seen": 342807795, + "step": 15893, + "time_per_iteration": 2.64900803565979 + }, + { + "auxiliary_loss_clip": 0.01089703, + "auxiliary_loss_mlp": 0.00770181, + "balance_loss_clip": 1.03906655, + "balance_loss_mlp": 1.00022793, + "epoch": 0.9555989779047046, + "flos": 14794047907200.0, + "grad_norm": 1.8780898151887826, + "language_loss": 0.65497565, + "learning_rate": 2.0617434208048955e-08, + "loss": 0.67357445, + "num_input_tokens_seen": 342825490, + "step": 15894, + "time_per_iteration": 2.640239953994751 + }, + { + "auxiliary_loss_clip": 0.01098184, + "auxiliary_loss_mlp": 0.01032823, + "balance_loss_clip": 1.03628969, + "balance_loss_mlp": 1.01997232, + "epoch": 0.9556591011573726, + "flos": 22236749493120.0, + "grad_norm": 1.917235716935355, + "language_loss": 0.82155561, + "learning_rate": 2.056169412853581e-08, + "loss": 0.84286571, + "num_input_tokens_seen": 342844965, + "step": 15895, + "time_per_iteration": 2.605703592300415 + }, + { + "auxiliary_loss_clip": 0.01083186, + "auxiliary_loss_mlp": 0.01029801, + "balance_loss_clip": 1.0364809, + "balance_loss_mlp": 1.01701021, + "epoch": 0.9557192244100405, + "flos": 27855296835840.0, + "grad_norm": 1.5092096829888868, + "language_loss": 0.72777927, + "learning_rate": 2.0506029109521593e-08, + "loss": 0.74890918, + "num_input_tokens_seen": 342865915, + "step": 15896, + "time_per_iteration": 2.800420045852661 + }, + { + "auxiliary_loss_clip": 0.01105404, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.03542614, + "balance_loss_mlp": 1.01926875, + "epoch": 0.9557793476627086, + "flos": 17602800831360.0, + "grad_norm": 1.8673042529516892, + "language_loss": 0.79697645, + "learning_rate": 2.045043915311706e-08, + "loss": 0.81834352, + "num_input_tokens_seen": 342884000, + "step": 15897, + "time_per_iteration": 2.58010196685791 + }, + { + "auxiliary_loss_clip": 0.01081754, + "auxiliary_loss_mlp": 0.0103489, + "balance_loss_clip": 1.03217518, + "balance_loss_mlp": 1.02133036, + "epoch": 0.9558394709153766, + "flos": 23875496709120.0, + "grad_norm": 1.7392744855605553, + "language_loss": 0.7268827, + "learning_rate": 2.03949242614303e-08, + "loss": 0.74804914, + "num_input_tokens_seen": 342903095, + "step": 15898, + "time_per_iteration": 2.675769567489624 + }, + { + "auxiliary_loss_clip": 0.010026, + "auxiliary_loss_mlp": 0.01003805, + "balance_loss_clip": 1.0089612, + "balance_loss_mlp": 1.00289333, + "epoch": 0.9558995941680445, + "flos": 53682001171200.0, + "grad_norm": 0.8518438349506685, + "language_loss": 0.52328175, + "learning_rate": 2.033948443656652e-08, + "loss": 0.54334575, + "num_input_tokens_seen": 342958155, + "step": 15899, + "time_per_iteration": 3.1892175674438477 + }, + { + "auxiliary_loss_clip": 0.01101857, + "auxiliary_loss_mlp": 0.010327, + "balance_loss_clip": 1.03783405, + "balance_loss_mlp": 1.01899076, + "epoch": 0.9559597174207125, + "flos": 13764488376960.0, + "grad_norm": 2.0360903333402183, + "language_loss": 0.68228984, + "learning_rate": 2.028411968062782e-08, + "loss": 0.70363533, + "num_input_tokens_seen": 342972500, + "step": 15900, + "time_per_iteration": 2.5987586975097656 + }, + { + "auxiliary_loss_clip": 0.0109791, + "auxiliary_loss_mlp": 0.0077014, + "balance_loss_clip": 1.03479004, + "balance_loss_mlp": 1.00019574, + "epoch": 0.9560198406733804, + "flos": 19936347799680.0, + "grad_norm": 2.313544780745396, + "language_loss": 0.83186281, + "learning_rate": 2.0228829995713627e-08, + "loss": 0.85054326, + "num_input_tokens_seen": 342989035, + "step": 15901, + "time_per_iteration": 2.593118667602539 + }, + { + "auxiliary_loss_clip": 0.00997227, + "auxiliary_loss_mlp": 0.01005499, + "balance_loss_clip": 1.00669014, + "balance_loss_mlp": 1.00429535, + "epoch": 0.9560799639260484, + "flos": 57289550699520.0, + "grad_norm": 0.70780862037499, + "language_loss": 0.54323339, + "learning_rate": 2.0173615383920485e-08, + "loss": 0.56326067, + "num_input_tokens_seen": 343051675, + "step": 15902, + "time_per_iteration": 3.3085649013519287 + }, + { + "auxiliary_loss_clip": 0.01086623, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.03854203, + "balance_loss_mlp": 1.02167904, + "epoch": 0.9561400871787163, + "flos": 18917167299840.0, + "grad_norm": 1.7381730365709078, + "language_loss": 0.8538748, + "learning_rate": 2.01184758473425e-08, + "loss": 0.87506413, + "num_input_tokens_seen": 343068895, + "step": 15903, + "time_per_iteration": 2.6358137130737305 + }, + { + "auxiliary_loss_clip": 0.01082056, + "auxiliary_loss_mlp": 0.00772044, + "balance_loss_clip": 1.03525889, + "balance_loss_mlp": 1.00020206, + "epoch": 0.9562002104313844, + "flos": 18038576632320.0, + "grad_norm": 1.7984010377221487, + "language_loss": 0.80295885, + "learning_rate": 2.0063411388070217e-08, + "loss": 0.82149988, + "num_input_tokens_seen": 343087115, + "step": 15904, + "time_per_iteration": 2.7303686141967773 + }, + { + "auxiliary_loss_clip": 0.01098663, + "auxiliary_loss_mlp": 0.01031681, + "balance_loss_clip": 1.03495884, + "balance_loss_mlp": 1.01843715, + "epoch": 0.9562603336840523, + "flos": 24717673964160.0, + "grad_norm": 4.96352662326913, + "language_loss": 0.60007298, + "learning_rate": 2.0008422008191972e-08, + "loss": 0.6213764, + "num_input_tokens_seen": 343105575, + "step": 15905, + "time_per_iteration": 2.655217170715332 + }, + { + "auxiliary_loss_clip": 0.01096188, + "auxiliary_loss_mlp": 0.01028565, + "balance_loss_clip": 1.03525162, + "balance_loss_mlp": 1.01632822, + "epoch": 0.9563204569367203, + "flos": 21177205084800.0, + "grad_norm": 1.9515891856264378, + "language_loss": 0.70387208, + "learning_rate": 1.995350770979254e-08, + "loss": 0.72511959, + "num_input_tokens_seen": 343123025, + "step": 15906, + "time_per_iteration": 2.6145029067993164 + }, + { + "auxiliary_loss_clip": 0.01055579, + "auxiliary_loss_mlp": 0.01030633, + "balance_loss_clip": 1.03260493, + "balance_loss_mlp": 1.01775253, + "epoch": 0.9563805801893882, + "flos": 20229738088320.0, + "grad_norm": 1.7038523385332285, + "language_loss": 0.70973694, + "learning_rate": 1.9898668494954473e-08, + "loss": 0.73059911, + "num_input_tokens_seen": 343141625, + "step": 15907, + "time_per_iteration": 2.831192970275879 + }, + { + "auxiliary_loss_clip": 0.0106678, + "auxiliary_loss_mlp": 0.01031766, + "balance_loss_clip": 1.03346992, + "balance_loss_mlp": 1.01988053, + "epoch": 0.9564407034420562, + "flos": 25411001258880.0, + "grad_norm": 1.938524110619909, + "language_loss": 0.70548427, + "learning_rate": 1.9843904365757447e-08, + "loss": 0.72646976, + "num_input_tokens_seen": 343161300, + "step": 15908, + "time_per_iteration": 2.704686164855957 + }, + { + "auxiliary_loss_clip": 0.01085855, + "auxiliary_loss_mlp": 0.00770126, + "balance_loss_clip": 1.03650773, + "balance_loss_mlp": 1.00021219, + "epoch": 0.9565008266947241, + "flos": 18623884752000.0, + "grad_norm": 1.9260740881631984, + "language_loss": 0.83019876, + "learning_rate": 1.978921532427802e-08, + "loss": 0.84875852, + "num_input_tokens_seen": 343177815, + "step": 15909, + "time_per_iteration": 2.6200265884399414 + }, + { + "auxiliary_loss_clip": 0.01096482, + "auxiliary_loss_mlp": 0.01033356, + "balance_loss_clip": 1.03509748, + "balance_loss_mlp": 1.02116704, + "epoch": 0.9565609499473922, + "flos": 24862142465280.0, + "grad_norm": 2.1314572111323717, + "language_loss": 0.67602086, + "learning_rate": 1.9734601372590086e-08, + "loss": 0.69731927, + "num_input_tokens_seen": 343198140, + "step": 15910, + "time_per_iteration": 2.6983892917633057 + }, + { + "auxiliary_loss_clip": 0.01101245, + "auxiliary_loss_mlp": 0.01034074, + "balance_loss_clip": 1.03767276, + "balance_loss_mlp": 1.02156329, + "epoch": 0.9566210732000601, + "flos": 21798459740160.0, + "grad_norm": 1.6976880535824044, + "language_loss": 0.74343169, + "learning_rate": 1.968006251276444e-08, + "loss": 0.76478493, + "num_input_tokens_seen": 343218280, + "step": 15911, + "time_per_iteration": 2.6060009002685547 + }, + { + "auxiliary_loss_clip": 0.01096979, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.03550327, + "balance_loss_mlp": 1.01809359, + "epoch": 0.9566811964527281, + "flos": 18697609416960.0, + "grad_norm": 2.080677167597926, + "language_loss": 0.69605064, + "learning_rate": 1.9625598746869198e-08, + "loss": 0.71732366, + "num_input_tokens_seen": 343236850, + "step": 15912, + "time_per_iteration": 4.122835874557495 + }, + { + "auxiliary_loss_clip": 0.01086077, + "auxiliary_loss_mlp": 0.01036299, + "balance_loss_clip": 1.03359342, + "balance_loss_mlp": 1.02379942, + "epoch": 0.9567413197053961, + "flos": 13000632727680.0, + "grad_norm": 2.5288406213063466, + "language_loss": 0.72268087, + "learning_rate": 1.95712100769696e-08, + "loss": 0.74390459, + "num_input_tokens_seen": 343253065, + "step": 15913, + "time_per_iteration": 4.12858247756958 + }, + { + "auxiliary_loss_clip": 0.01026666, + "auxiliary_loss_mlp": 0.01032498, + "balance_loss_clip": 1.03391707, + "balance_loss_mlp": 1.02069664, + "epoch": 0.956801442958064, + "flos": 19719267955200.0, + "grad_norm": 20.323205931222148, + "language_loss": 0.73863947, + "learning_rate": 1.9516896505128444e-08, + "loss": 0.75923109, + "num_input_tokens_seen": 343270330, + "step": 15914, + "time_per_iteration": 2.7809512615203857 + }, + { + "auxiliary_loss_clip": 0.01107977, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.0365274, + "balance_loss_mlp": 1.01769543, + "epoch": 0.956861566210732, + "flos": 18222834424320.0, + "grad_norm": 1.3806320366510194, + "language_loss": 0.67305696, + "learning_rate": 1.9462658033404965e-08, + "loss": 0.69443822, + "num_input_tokens_seen": 343289625, + "step": 15915, + "time_per_iteration": 4.22941780090332 + }, + { + "auxiliary_loss_clip": 0.01092649, + "auxiliary_loss_mlp": 0.01028226, + "balance_loss_clip": 1.03482556, + "balance_loss_mlp": 1.01616824, + "epoch": 0.9569216894634, + "flos": 22196960202240.0, + "grad_norm": 1.7476602306443554, + "language_loss": 0.64463937, + "learning_rate": 1.9408494663855967e-08, + "loss": 0.66584814, + "num_input_tokens_seen": 343309200, + "step": 15916, + "time_per_iteration": 2.5847983360290527 + }, + { + "auxiliary_loss_clip": 0.0110232, + "auxiliary_loss_mlp": 0.01028805, + "balance_loss_clip": 1.03600883, + "balance_loss_mlp": 1.01722336, + "epoch": 0.956981812716068, + "flos": 21689291329920.0, + "grad_norm": 1.8359549722537702, + "language_loss": 0.80332065, + "learning_rate": 1.935440639853536e-08, + "loss": 0.82463187, + "num_input_tokens_seen": 343326270, + "step": 15917, + "time_per_iteration": 2.5821292400360107 + }, + { + "auxiliary_loss_clip": 0.01077457, + "auxiliary_loss_mlp": 0.01034662, + "balance_loss_clip": 1.0340178, + "balance_loss_mlp": 1.02204955, + "epoch": 0.9570419359687359, + "flos": 13990905757440.0, + "grad_norm": 1.923592863089018, + "language_loss": 0.73075807, + "learning_rate": 1.9300393239494172e-08, + "loss": 0.75187922, + "num_input_tokens_seen": 343344430, + "step": 15918, + "time_per_iteration": 4.2131946086883545 + }, + { + "auxiliary_loss_clip": 0.01002537, + "auxiliary_loss_mlp": 0.0100177, + "balance_loss_clip": 1.0084734, + "balance_loss_mlp": 1.00083399, + "epoch": 0.9571020592214039, + "flos": 65196938534400.0, + "grad_norm": 0.6358192020761055, + "language_loss": 0.53063756, + "learning_rate": 1.924645518878032e-08, + "loss": 0.5506807, + "num_input_tokens_seen": 343416155, + "step": 15919, + "time_per_iteration": 3.3149333000183105 + }, + { + "auxiliary_loss_clip": 0.01106277, + "auxiliary_loss_mlp": 0.01037331, + "balance_loss_clip": 1.04041994, + "balance_loss_mlp": 1.02374721, + "epoch": 0.9571621824740718, + "flos": 17384068961280.0, + "grad_norm": 50.750331888616735, + "language_loss": 0.74972582, + "learning_rate": 1.919259224843972e-08, + "loss": 0.77116191, + "num_input_tokens_seen": 343431715, + "step": 15920, + "time_per_iteration": 2.6216814517974854 + }, + { + "auxiliary_loss_clip": 0.01074302, + "auxiliary_loss_mlp": 0.01033346, + "balance_loss_clip": 1.03634095, + "balance_loss_mlp": 1.02012527, + "epoch": 0.9572223057267398, + "flos": 14538184352640.0, + "grad_norm": 1.6187560061674033, + "language_loss": 0.7876358, + "learning_rate": 1.9138804420514298e-08, + "loss": 0.80871224, + "num_input_tokens_seen": 343450425, + "step": 15921, + "time_per_iteration": 2.6776888370513916 + }, + { + "auxiliary_loss_clip": 0.01102004, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.03531837, + "balance_loss_mlp": 1.01865196, + "epoch": 0.9572824289794077, + "flos": 33947793158400.0, + "grad_norm": 2.2537365266474776, + "language_loss": 0.51078975, + "learning_rate": 1.9085091707044197e-08, + "loss": 0.53213173, + "num_input_tokens_seen": 343470445, + "step": 15922, + "time_per_iteration": 2.7087700366973877 + }, + { + "auxiliary_loss_clip": 0.01055425, + "auxiliary_loss_mlp": 0.01043646, + "balance_loss_clip": 1.0309701, + "balance_loss_mlp": 1.02935278, + "epoch": 0.9573425522320758, + "flos": 18694915896960.0, + "grad_norm": 1.958282285271952, + "language_loss": 0.84238583, + "learning_rate": 1.903145411006557e-08, + "loss": 0.86337662, + "num_input_tokens_seen": 343485200, + "step": 15923, + "time_per_iteration": 2.6815152168273926 + }, + { + "auxiliary_loss_clip": 0.010812, + "auxiliary_loss_mlp": 0.01036534, + "balance_loss_clip": 1.03293872, + "balance_loss_mlp": 1.02475667, + "epoch": 0.9574026754847437, + "flos": 28510307297280.0, + "grad_norm": 1.538843441694693, + "language_loss": 0.75172049, + "learning_rate": 1.8977891631613008e-08, + "loss": 0.77289784, + "num_input_tokens_seen": 343505080, + "step": 15924, + "time_per_iteration": 2.7213785648345947 + }, + { + "auxiliary_loss_clip": 0.01087824, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.03699768, + "balance_loss_mlp": 1.01958311, + "epoch": 0.9574627987374117, + "flos": 24352390604160.0, + "grad_norm": 2.227622693008034, + "language_loss": 0.86090326, + "learning_rate": 1.892440427371711e-08, + "loss": 0.88210118, + "num_input_tokens_seen": 343523995, + "step": 15925, + "time_per_iteration": 2.8542959690093994 + }, + { + "auxiliary_loss_clip": 0.01079041, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.03561556, + "balance_loss_mlp": 1.02103734, + "epoch": 0.9575229219900797, + "flos": 23510680225920.0, + "grad_norm": 2.011468601980382, + "language_loss": 0.75676179, + "learning_rate": 1.8870992038406474e-08, + "loss": 0.77789414, + "num_input_tokens_seen": 343542015, + "step": 15926, + "time_per_iteration": 2.7330782413482666 + }, + { + "auxiliary_loss_clip": 0.0108326, + "auxiliary_loss_mlp": 0.01031405, + "balance_loss_clip": 1.03742075, + "balance_loss_mlp": 1.0197587, + "epoch": 0.9575830452427476, + "flos": 22674823764480.0, + "grad_norm": 2.0888079382528333, + "language_loss": 0.77707171, + "learning_rate": 1.8817654927706373e-08, + "loss": 0.79821837, + "num_input_tokens_seen": 343561680, + "step": 15927, + "time_per_iteration": 2.704115390777588 + }, + { + "auxiliary_loss_clip": 0.01063185, + "auxiliary_loss_mlp": 0.01031462, + "balance_loss_clip": 1.03502405, + "balance_loss_mlp": 1.0171268, + "epoch": 0.9576431684954156, + "flos": 30485250835200.0, + "grad_norm": 1.8055478063953943, + "language_loss": 0.68572605, + "learning_rate": 1.8764392943639183e-08, + "loss": 0.70667255, + "num_input_tokens_seen": 343585290, + "step": 15928, + "time_per_iteration": 2.8810582160949707 + }, + { + "auxiliary_loss_clip": 0.01089186, + "auxiliary_loss_mlp": 0.01031202, + "balance_loss_clip": 1.03828859, + "balance_loss_mlp": 1.018381, + "epoch": 0.9577032917480836, + "flos": 21687387909120.0, + "grad_norm": 1.5717055472294992, + "language_loss": 0.822155, + "learning_rate": 1.871120608822485e-08, + "loss": 0.84335887, + "num_input_tokens_seen": 343604045, + "step": 15929, + "time_per_iteration": 2.6657960414886475 + }, + { + "auxiliary_loss_clip": 0.01077088, + "auxiliary_loss_mlp": 0.01046119, + "balance_loss_clip": 1.03563619, + "balance_loss_mlp": 1.03236794, + "epoch": 0.9577634150007516, + "flos": 29023147728000.0, + "grad_norm": 1.8215200487797032, + "language_loss": 0.72274351, + "learning_rate": 1.8658094363480202e-08, + "loss": 0.74397558, + "num_input_tokens_seen": 343626595, + "step": 15930, + "time_per_iteration": 2.795675277709961 + }, + { + "auxiliary_loss_clip": 0.0103609, + "auxiliary_loss_mlp": 0.01032785, + "balance_loss_clip": 1.03149176, + "balance_loss_mlp": 1.01960611, + "epoch": 0.9578235382534195, + "flos": 19282235178240.0, + "grad_norm": 1.4066251693487615, + "language_loss": 0.62494546, + "learning_rate": 1.8605057771419185e-08, + "loss": 0.64563417, + "num_input_tokens_seen": 343646195, + "step": 15931, + "time_per_iteration": 2.7418274879455566 + }, + { + "auxiliary_loss_clip": 0.01106716, + "auxiliary_loss_mlp": 0.01030993, + "balance_loss_clip": 1.03693795, + "balance_loss_mlp": 1.01945996, + "epoch": 0.9578836615060875, + "flos": 13699275235200.0, + "grad_norm": 2.1628834321357746, + "language_loss": 0.69288397, + "learning_rate": 1.8552096314052633e-08, + "loss": 0.71426117, + "num_input_tokens_seen": 343663665, + "step": 15932, + "time_per_iteration": 2.6367006301879883 + }, + { + "auxiliary_loss_clip": 0.01080267, + "auxiliary_loss_mlp": 0.01036892, + "balance_loss_clip": 1.0359807, + "balance_loss_mlp": 1.02269387, + "epoch": 0.9579437847587554, + "flos": 17054516655360.0, + "grad_norm": 1.883637531567824, + "language_loss": 0.75359249, + "learning_rate": 1.849920999338961e-08, + "loss": 0.77476406, + "num_input_tokens_seen": 343682145, + "step": 15933, + "time_per_iteration": 2.692196846008301 + }, + { + "auxiliary_loss_clip": 0.00998155, + "auxiliary_loss_mlp": 0.00999946, + "balance_loss_clip": 1.01311505, + "balance_loss_mlp": 0.99865836, + "epoch": 0.9580039080114234, + "flos": 60570887886720.0, + "grad_norm": 0.7032232478309851, + "language_loss": 0.57280135, + "learning_rate": 1.8446398811434948e-08, + "loss": 0.59278238, + "num_input_tokens_seen": 343744685, + "step": 15934, + "time_per_iteration": 3.389955997467041 + }, + { + "auxiliary_loss_clip": 0.01027383, + "auxiliary_loss_mlp": 0.00751072, + "balance_loss_clip": 1.00506508, + "balance_loss_mlp": 0.99959391, + "epoch": 0.9580640312640913, + "flos": 66235365745920.0, + "grad_norm": 0.9122482390110158, + "language_loss": 0.65885007, + "learning_rate": 1.8393662770191277e-08, + "loss": 0.67663455, + "num_input_tokens_seen": 343801835, + "step": 15935, + "time_per_iteration": 3.0524590015411377 + }, + { + "auxiliary_loss_clip": 0.01007227, + "auxiliary_loss_mlp": 0.01002986, + "balance_loss_clip": 1.00997615, + "balance_loss_mlp": 1.00185907, + "epoch": 0.9581241545167594, + "flos": 62218002971520.0, + "grad_norm": 0.7845197827637053, + "language_loss": 0.57026505, + "learning_rate": 1.8341001871658546e-08, + "loss": 0.5903672, + "num_input_tokens_seen": 343861515, + "step": 15936, + "time_per_iteration": 3.161888837814331 + }, + { + "auxiliary_loss_clip": 0.01048485, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.03310895, + "balance_loss_mlp": 1.01798666, + "epoch": 0.9581842777694273, + "flos": 23768088065280.0, + "grad_norm": 1.4930030330503186, + "language_loss": 0.78472948, + "learning_rate": 1.8288416117833825e-08, + "loss": 0.80551857, + "num_input_tokens_seen": 343881240, + "step": 15937, + "time_per_iteration": 2.777000665664673 + }, + { + "auxiliary_loss_clip": 0.01096104, + "auxiliary_loss_mlp": 0.01032818, + "balance_loss_clip": 1.03548694, + "balance_loss_mlp": 1.01956844, + "epoch": 0.9582444010220953, + "flos": 21213079793280.0, + "grad_norm": 1.6538903091453836, + "language_loss": 0.6840139, + "learning_rate": 1.8235905510710636e-08, + "loss": 0.70530319, + "num_input_tokens_seen": 343900885, + "step": 15938, + "time_per_iteration": 2.638640880584717 + }, + { + "auxiliary_loss_clip": 0.01076145, + "auxiliary_loss_mlp": 0.01029859, + "balance_loss_clip": 1.0352782, + "balance_loss_mlp": 1.01772964, + "epoch": 0.9583045242747633, + "flos": 23805147922560.0, + "grad_norm": 3.037057978485483, + "language_loss": 0.6558556, + "learning_rate": 1.8183470052280712e-08, + "loss": 0.67691565, + "num_input_tokens_seen": 343918460, + "step": 15939, + "time_per_iteration": 2.8998749256134033 + }, + { + "auxiliary_loss_clip": 0.01079284, + "auxiliary_loss_mlp": 0.01037887, + "balance_loss_clip": 1.03366089, + "balance_loss_mlp": 1.02551913, + "epoch": 0.9583646475274312, + "flos": 24131468004480.0, + "grad_norm": 1.828143592735246, + "language_loss": 0.73795086, + "learning_rate": 1.8131109744532025e-08, + "loss": 0.75912249, + "num_input_tokens_seen": 343938030, + "step": 15940, + "time_per_iteration": 2.8199172019958496 + }, + { + "auxiliary_loss_clip": 0.01109084, + "auxiliary_loss_mlp": 0.01033619, + "balance_loss_clip": 1.03673601, + "balance_loss_mlp": 1.02029765, + "epoch": 0.9584247707800992, + "flos": 20886651970560.0, + "grad_norm": 2.719526095639371, + "language_loss": 0.72758561, + "learning_rate": 1.8078824589450535e-08, + "loss": 0.74901259, + "num_input_tokens_seen": 343956635, + "step": 15941, + "time_per_iteration": 2.580655097961426 + }, + { + "auxiliary_loss_clip": 0.01087013, + "auxiliary_loss_mlp": 0.01036207, + "balance_loss_clip": 1.03728151, + "balance_loss_mlp": 1.02411318, + "epoch": 0.9584848940327672, + "flos": 26067591918720.0, + "grad_norm": 2.197358569248491, + "language_loss": 0.7112202, + "learning_rate": 1.8026614589018442e-08, + "loss": 0.73245239, + "num_input_tokens_seen": 343976625, + "step": 15942, + "time_per_iteration": 2.6756019592285156 + }, + { + "auxiliary_loss_clip": 0.0110919, + "auxiliary_loss_mlp": 0.01033733, + "balance_loss_clip": 1.03630304, + "balance_loss_mlp": 1.02057219, + "epoch": 0.9585450172854352, + "flos": 34492988764800.0, + "grad_norm": 1.5723180530156076, + "language_loss": 0.72362167, + "learning_rate": 1.797447974521571e-08, + "loss": 0.74505079, + "num_input_tokens_seen": 343997790, + "step": 15943, + "time_per_iteration": 2.6411077976226807 + }, + { + "auxiliary_loss_clip": 0.0110037, + "auxiliary_loss_mlp": 0.01037182, + "balance_loss_clip": 1.03692496, + "balance_loss_mlp": 1.02418852, + "epoch": 0.9586051405381031, + "flos": 23110743219840.0, + "grad_norm": 2.3152366176868036, + "language_loss": 0.68444526, + "learning_rate": 1.792242006001965e-08, + "loss": 0.7058208, + "num_input_tokens_seen": 344016935, + "step": 15944, + "time_per_iteration": 2.608394145965576 + }, + { + "auxiliary_loss_clip": 0.0110797, + "auxiliary_loss_mlp": 0.01034536, + "balance_loss_clip": 1.03546226, + "balance_loss_mlp": 1.02167356, + "epoch": 0.9586652637907711, + "flos": 19603994232960.0, + "grad_norm": 1.6133021726163184, + "language_loss": 0.66145849, + "learning_rate": 1.7870435535403795e-08, + "loss": 0.6828835, + "num_input_tokens_seen": 344035590, + "step": 15945, + "time_per_iteration": 2.590332508087158 + }, + { + "auxiliary_loss_clip": 0.00971603, + "auxiliary_loss_mlp": 0.01001306, + "balance_loss_clip": 1.01690745, + "balance_loss_mlp": 1.00031078, + "epoch": 0.958725387043439, + "flos": 72073327317120.0, + "grad_norm": 0.7794478770145054, + "language_loss": 0.61829185, + "learning_rate": 1.7818526173339678e-08, + "loss": 0.63802093, + "num_input_tokens_seen": 344100845, + "step": 15946, + "time_per_iteration": 3.602818489074707 + }, + { + "auxiliary_loss_clip": 0.0110601, + "auxiliary_loss_mlp": 0.01029414, + "balance_loss_clip": 1.03621078, + "balance_loss_mlp": 1.01780248, + "epoch": 0.958785510296107, + "flos": 28911932242560.0, + "grad_norm": 2.157451201161463, + "language_loss": 0.7515372, + "learning_rate": 1.7766691975795723e-08, + "loss": 0.7728914, + "num_input_tokens_seen": 344121780, + "step": 15947, + "time_per_iteration": 2.7516565322875977 + }, + { + "auxiliary_loss_clip": 0.01080438, + "auxiliary_loss_mlp": 0.01027644, + "balance_loss_clip": 1.03239012, + "balance_loss_mlp": 1.01584816, + "epoch": 0.958845633548775, + "flos": 18477189607680.0, + "grad_norm": 2.209516368331818, + "language_loss": 0.69477844, + "learning_rate": 1.771493294473747e-08, + "loss": 0.71585929, + "num_input_tokens_seen": 344140150, + "step": 15948, + "time_per_iteration": 2.6244988441467285 + }, + { + "auxiliary_loss_clip": 0.01057363, + "auxiliary_loss_mlp": 0.01032491, + "balance_loss_clip": 1.03523755, + "balance_loss_mlp": 1.02053475, + "epoch": 0.958905756801443, + "flos": 24206916522240.0, + "grad_norm": 2.7387902232592944, + "language_loss": 0.78748626, + "learning_rate": 1.7663249082127574e-08, + "loss": 0.80838478, + "num_input_tokens_seen": 344158200, + "step": 15949, + "time_per_iteration": 2.7260756492614746 + }, + { + "auxiliary_loss_clip": 0.01111297, + "auxiliary_loss_mlp": 0.01033396, + "balance_loss_clip": 1.03865027, + "balance_loss_mlp": 1.0205152, + "epoch": 0.9589658800541109, + "flos": 25007939769600.0, + "grad_norm": 2.2665397116809634, + "language_loss": 0.68637884, + "learning_rate": 1.761164038992602e-08, + "loss": 0.70782578, + "num_input_tokens_seen": 344174720, + "step": 15950, + "time_per_iteration": 2.5775585174560547 + }, + { + "auxiliary_loss_clip": 0.01089548, + "auxiliary_loss_mlp": 0.0103189, + "balance_loss_clip": 1.03689277, + "balance_loss_mlp": 1.02061236, + "epoch": 0.9590260033067789, + "flos": 23514558894720.0, + "grad_norm": 1.7626457055742824, + "language_loss": 0.8612389, + "learning_rate": 1.7560106870089687e-08, + "loss": 0.88245326, + "num_input_tokens_seen": 344192580, + "step": 15951, + "time_per_iteration": 2.691873550415039 + }, + { + "auxiliary_loss_clip": 0.01085942, + "auxiliary_loss_mlp": 0.01037676, + "balance_loss_clip": 1.03608108, + "balance_loss_mlp": 1.02455699, + "epoch": 0.9590861265594469, + "flos": 25520349237120.0, + "grad_norm": 2.7020454087453434, + "language_loss": 0.79673147, + "learning_rate": 1.7508648524572568e-08, + "loss": 0.81796771, + "num_input_tokens_seen": 344210345, + "step": 15952, + "time_per_iteration": 5.9034318923950195 + }, + { + "auxiliary_loss_clip": 0.01098084, + "auxiliary_loss_mlp": 0.01032288, + "balance_loss_clip": 1.03800857, + "balance_loss_mlp": 1.01903152, + "epoch": 0.9591462498121148, + "flos": 21179323987200.0, + "grad_norm": 1.6067487690898035, + "language_loss": 0.69543386, + "learning_rate": 1.7457265355326434e-08, + "loss": 0.71673763, + "num_input_tokens_seen": 344229540, + "step": 15953, + "time_per_iteration": 2.684041976928711 + }, + { + "auxiliary_loss_clip": 0.0104366, + "auxiliary_loss_mlp": 0.01036069, + "balance_loss_clip": 1.03161478, + "balance_loss_mlp": 1.02161503, + "epoch": 0.9592063730647828, + "flos": 21723047136000.0, + "grad_norm": 2.5164695030249096, + "language_loss": 0.58295131, + "learning_rate": 1.7405957364299285e-08, + "loss": 0.60374862, + "num_input_tokens_seen": 344247830, + "step": 15954, + "time_per_iteration": 4.495413064956665 + }, + { + "auxiliary_loss_clip": 0.01098901, + "auxiliary_loss_mlp": 0.01035688, + "balance_loss_clip": 1.03649294, + "balance_loss_mlp": 1.02196693, + "epoch": 0.9592664963174508, + "flos": 29891395278720.0, + "grad_norm": 2.3199338562306027, + "language_loss": 0.74007273, + "learning_rate": 1.7354724553437117e-08, + "loss": 0.76141858, + "num_input_tokens_seen": 344267760, + "step": 15955, + "time_per_iteration": 2.659421443939209 + }, + { + "auxiliary_loss_clip": 0.0108768, + "auxiliary_loss_mlp": 0.0103626, + "balance_loss_clip": 1.03656662, + "balance_loss_mlp": 1.02313542, + "epoch": 0.9593266195701188, + "flos": 17999613354240.0, + "grad_norm": 1.8573128231358735, + "language_loss": 0.62227011, + "learning_rate": 1.7303566924682378e-08, + "loss": 0.64350951, + "num_input_tokens_seen": 344284905, + "step": 15956, + "time_per_iteration": 2.6006531715393066 + }, + { + "auxiliary_loss_clip": 0.01071121, + "auxiliary_loss_mlp": 0.01030387, + "balance_loss_clip": 1.03647816, + "balance_loss_mlp": 1.01758385, + "epoch": 0.9593867428227867, + "flos": 18838271076480.0, + "grad_norm": 1.7774918774932997, + "language_loss": 0.59834391, + "learning_rate": 1.725248447997507e-08, + "loss": 0.61935902, + "num_input_tokens_seen": 344302025, + "step": 15957, + "time_per_iteration": 4.193215847015381 + }, + { + "auxiliary_loss_clip": 0.0107309, + "auxiliary_loss_mlp": 0.01038609, + "balance_loss_clip": 1.03605032, + "balance_loss_mlp": 1.02551365, + "epoch": 0.9594468660754547, + "flos": 29567050444800.0, + "grad_norm": 1.9567714575730066, + "language_loss": 0.74019581, + "learning_rate": 1.7201477221252314e-08, + "loss": 0.76131284, + "num_input_tokens_seen": 344321935, + "step": 15958, + "time_per_iteration": 2.7699391841888428 + }, + { + "auxiliary_loss_clip": 0.01084783, + "auxiliary_loss_mlp": 0.00770183, + "balance_loss_clip": 1.03334033, + "balance_loss_mlp": 1.00015187, + "epoch": 0.9595069893281226, + "flos": 20703256104960.0, + "grad_norm": 1.5869581385449567, + "language_loss": 0.74366057, + "learning_rate": 1.7150545150448116e-08, + "loss": 0.76221019, + "num_input_tokens_seen": 344340405, + "step": 15959, + "time_per_iteration": 2.6944100856781006 + }, + { + "auxiliary_loss_clip": 0.01095064, + "auxiliary_loss_mlp": 0.01030122, + "balance_loss_clip": 1.03618193, + "balance_loss_mlp": 1.01764679, + "epoch": 0.9595671125807906, + "flos": 22453613856000.0, + "grad_norm": 2.137039778819024, + "language_loss": 0.65102017, + "learning_rate": 1.7099688269493816e-08, + "loss": 0.67227197, + "num_input_tokens_seen": 344359925, + "step": 15960, + "time_per_iteration": 2.6418590545654297 + }, + { + "auxiliary_loss_clip": 0.01105547, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.03602636, + "balance_loss_mlp": 1.02172852, + "epoch": 0.9596272358334585, + "flos": 23915214172800.0, + "grad_norm": 1.6573536017589419, + "language_loss": 0.78154403, + "learning_rate": 1.7048906580318544e-08, + "loss": 0.80294573, + "num_input_tokens_seen": 344379100, + "step": 15961, + "time_per_iteration": 2.5798726081848145 + }, + { + "auxiliary_loss_clip": 0.01064092, + "auxiliary_loss_mlp": 0.01028572, + "balance_loss_clip": 1.03795755, + "balance_loss_mlp": 1.0165441, + "epoch": 0.9596873590861266, + "flos": 17672539086720.0, + "grad_norm": 1.9079571548212244, + "language_loss": 0.75957453, + "learning_rate": 1.699820008484698e-08, + "loss": 0.78050113, + "num_input_tokens_seen": 344396895, + "step": 15962, + "time_per_iteration": 2.6588690280914307 + }, + { + "auxiliary_loss_clip": 0.01089965, + "auxiliary_loss_mlp": 0.01033348, + "balance_loss_clip": 1.03721845, + "balance_loss_mlp": 1.02023554, + "epoch": 0.9597474823387945, + "flos": 25808532053760.0, + "grad_norm": 2.2256779220037965, + "language_loss": 0.71570283, + "learning_rate": 1.6947568785002698e-08, + "loss": 0.73693591, + "num_input_tokens_seen": 344415115, + "step": 15963, + "time_per_iteration": 2.6878271102905273 + }, + { + "auxiliary_loss_clip": 0.01079235, + "auxiliary_loss_mlp": 0.01033914, + "balance_loss_clip": 1.03706479, + "balance_loss_mlp": 1.02217138, + "epoch": 0.9598076055914625, + "flos": 23768519028480.0, + "grad_norm": 1.5220973192302523, + "language_loss": 0.74199623, + "learning_rate": 1.689701268270527e-08, + "loss": 0.76312768, + "num_input_tokens_seen": 344435185, + "step": 15964, + "time_per_iteration": 2.6809606552124023 + }, + { + "auxiliary_loss_clip": 0.00990624, + "auxiliary_loss_mlp": 0.01004877, + "balance_loss_clip": 1.00604916, + "balance_loss_mlp": 1.00392365, + "epoch": 0.9598677288441305, + "flos": 56515962464640.0, + "grad_norm": 0.8796440193210406, + "language_loss": 0.57517397, + "learning_rate": 1.684653177987161e-08, + "loss": 0.59512901, + "num_input_tokens_seen": 344488950, + "step": 15965, + "time_per_iteration": 3.202644109725952 + }, + { + "auxiliary_loss_clip": 0.0110834, + "auxiliary_loss_mlp": 0.01031239, + "balance_loss_clip": 1.03589642, + "balance_loss_mlp": 1.01969969, + "epoch": 0.9599278520967984, + "flos": 22997480659200.0, + "grad_norm": 1.6089991513926745, + "language_loss": 0.79091173, + "learning_rate": 1.6796126078416627e-08, + "loss": 0.81230754, + "num_input_tokens_seen": 344506740, + "step": 15966, + "time_per_iteration": 2.545722723007202 + }, + { + "auxiliary_loss_clip": 0.01080372, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.03162789, + "balance_loss_mlp": 1.01910806, + "epoch": 0.9599879753494664, + "flos": 23039676161280.0, + "grad_norm": 1.6133407382349438, + "language_loss": 0.79225981, + "learning_rate": 1.674579558025102e-08, + "loss": 0.81337535, + "num_input_tokens_seen": 344526670, + "step": 15967, + "time_per_iteration": 2.7037363052368164 + }, + { + "auxiliary_loss_clip": 0.01052446, + "auxiliary_loss_mlp": 0.01030727, + "balance_loss_clip": 1.03173876, + "balance_loss_mlp": 1.01654053, + "epoch": 0.9600480986021344, + "flos": 16392287560320.0, + "grad_norm": 4.094831568098365, + "language_loss": 0.80607283, + "learning_rate": 1.669554028728348e-08, + "loss": 0.82690465, + "num_input_tokens_seen": 344541995, + "step": 15968, + "time_per_iteration": 2.6827492713928223 + }, + { + "auxiliary_loss_clip": 0.01061685, + "auxiliary_loss_mlp": 0.01040969, + "balance_loss_clip": 1.0351243, + "balance_loss_mlp": 1.02595484, + "epoch": 0.9601082218548024, + "flos": 24276439296000.0, + "grad_norm": 2.307236682413655, + "language_loss": 0.6711151, + "learning_rate": 1.6645360201420044e-08, + "loss": 0.69214165, + "num_input_tokens_seen": 344559980, + "step": 15969, + "time_per_iteration": 2.709578037261963 + }, + { + "auxiliary_loss_clip": 0.01097154, + "auxiliary_loss_mlp": 0.01041579, + "balance_loss_clip": 1.03613186, + "balance_loss_mlp": 1.02947915, + "epoch": 0.9601683451074703, + "flos": 19609991804160.0, + "grad_norm": 4.544697653030829, + "language_loss": 0.79086411, + "learning_rate": 1.6595255324563186e-08, + "loss": 0.81225151, + "num_input_tokens_seen": 344577765, + "step": 15970, + "time_per_iteration": 2.7411954402923584 + }, + { + "auxiliary_loss_clip": 0.01094881, + "auxiliary_loss_mlp": 0.01030457, + "balance_loss_clip": 1.03728533, + "balance_loss_mlp": 1.01805329, + "epoch": 0.9602284683601383, + "flos": 26651104358400.0, + "grad_norm": 1.536259054605733, + "language_loss": 0.7747072, + "learning_rate": 1.654522565861316e-08, + "loss": 0.79596055, + "num_input_tokens_seen": 344597650, + "step": 15971, + "time_per_iteration": 2.7451272010803223 + }, + { + "auxiliary_loss_clip": 0.01091946, + "auxiliary_loss_mlp": 0.01027817, + "balance_loss_clip": 1.0364567, + "balance_loss_mlp": 1.01459122, + "epoch": 0.9602885916128062, + "flos": 15554096714880.0, + "grad_norm": 1.7996222753948563, + "language_loss": 0.67346907, + "learning_rate": 1.64952712054669e-08, + "loss": 0.69466674, + "num_input_tokens_seen": 344613580, + "step": 15972, + "time_per_iteration": 2.6623332500457764 + }, + { + "auxiliary_loss_clip": 0.01094982, + "auxiliary_loss_mlp": 0.00769511, + "balance_loss_clip": 1.03505695, + "balance_loss_mlp": 1.00020421, + "epoch": 0.9603487148654742, + "flos": 16502353810560.0, + "grad_norm": 2.1539804803600555, + "language_loss": 0.76114738, + "learning_rate": 1.644539196701844e-08, + "loss": 0.77979231, + "num_input_tokens_seen": 344626910, + "step": 15973, + "time_per_iteration": 2.6319777965545654 + }, + { + "auxiliary_loss_clip": 0.01068013, + "auxiliary_loss_mlp": 0.01045452, + "balance_loss_clip": 1.03971171, + "balance_loss_mlp": 1.03173113, + "epoch": 0.9604088381181421, + "flos": 20845354308480.0, + "grad_norm": 1.5935467286542793, + "language_loss": 0.68907356, + "learning_rate": 1.639558794515983e-08, + "loss": 0.71020818, + "num_input_tokens_seen": 344644330, + "step": 15974, + "time_per_iteration": 2.722294569015503 + }, + { + "auxiliary_loss_clip": 0.01097463, + "auxiliary_loss_mlp": 0.01029415, + "balance_loss_clip": 1.03470838, + "balance_loss_mlp": 1.01666546, + "epoch": 0.9604689613708102, + "flos": 19683105937920.0, + "grad_norm": 1.5822354896144846, + "language_loss": 0.67808646, + "learning_rate": 1.6345859141779105e-08, + "loss": 0.69935524, + "num_input_tokens_seen": 344663910, + "step": 15975, + "time_per_iteration": 2.5872485637664795 + }, + { + "auxiliary_loss_clip": 0.01105768, + "auxiliary_loss_mlp": 0.0102977, + "balance_loss_clip": 1.03735399, + "balance_loss_mlp": 1.01801634, + "epoch": 0.9605290846234781, + "flos": 24097568544000.0, + "grad_norm": 2.0352421643496554, + "language_loss": 0.55362296, + "learning_rate": 1.6296205558762322e-08, + "loss": 0.57497835, + "num_input_tokens_seen": 344682320, + "step": 15976, + "time_per_iteration": 2.5711615085601807 + }, + { + "auxiliary_loss_clip": 0.01079409, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.03170323, + "balance_loss_mlp": 1.01589346, + "epoch": 0.9605892078761461, + "flos": 27122575299840.0, + "grad_norm": 1.7385849927083583, + "language_loss": 0.68164247, + "learning_rate": 1.624662719799219e-08, + "loss": 0.7027179, + "num_input_tokens_seen": 344701355, + "step": 15977, + "time_per_iteration": 2.671110153198242 + }, + { + "auxiliary_loss_clip": 0.01096711, + "auxiliary_loss_mlp": 0.01039725, + "balance_loss_clip": 1.0339942, + "balance_loss_mlp": 1.02705932, + "epoch": 0.9606493311288141, + "flos": 14136918543360.0, + "grad_norm": 1.9552633904927965, + "language_loss": 0.81768823, + "learning_rate": 1.6197124061348766e-08, + "loss": 0.83905256, + "num_input_tokens_seen": 344717980, + "step": 15978, + "time_per_iteration": 2.555152177810669 + }, + { + "auxiliary_loss_clip": 0.01100379, + "auxiliary_loss_mlp": 0.010331, + "balance_loss_clip": 1.03526115, + "balance_loss_mlp": 1.02026129, + "epoch": 0.960709454381482, + "flos": 15813336147840.0, + "grad_norm": 2.5958973786310664, + "language_loss": 0.83387029, + "learning_rate": 1.614769615070921e-08, + "loss": 0.85520506, + "num_input_tokens_seen": 344733480, + "step": 15979, + "time_per_iteration": 2.5497281551361084 + }, + { + "auxiliary_loss_clip": 0.0110855, + "auxiliary_loss_mlp": 0.0103807, + "balance_loss_clip": 1.03590512, + "balance_loss_mlp": 1.02634561, + "epoch": 0.96076957763415, + "flos": 22565403959040.0, + "grad_norm": 1.5387344792405315, + "language_loss": 0.79981411, + "learning_rate": 1.6098343467947805e-08, + "loss": 0.82128036, + "num_input_tokens_seen": 344752130, + "step": 15980, + "time_per_iteration": 2.5905473232269287 + }, + { + "auxiliary_loss_clip": 0.0109877, + "auxiliary_loss_mlp": 0.01028852, + "balance_loss_clip": 1.03493583, + "balance_loss_mlp": 1.01669884, + "epoch": 0.960829700886818, + "flos": 24681260551680.0, + "grad_norm": 2.1133956076478664, + "language_loss": 0.68550336, + "learning_rate": 1.6049066014935942e-08, + "loss": 0.7067796, + "num_input_tokens_seen": 344771195, + "step": 15981, + "time_per_iteration": 2.612859010696411 + }, + { + "auxiliary_loss_clip": 0.01093593, + "auxiliary_loss_mlp": 0.00769185, + "balance_loss_clip": 1.03381348, + "balance_loss_mlp": 1.00022793, + "epoch": 0.960889824139486, + "flos": 26542223256960.0, + "grad_norm": 1.7427488082202907, + "language_loss": 0.69655585, + "learning_rate": 1.5999863793542344e-08, + "loss": 0.71518368, + "num_input_tokens_seen": 344793150, + "step": 15982, + "time_per_iteration": 2.5976712703704834 + }, + { + "auxiliary_loss_clip": 0.00999386, + "auxiliary_loss_mlp": 0.00999842, + "balance_loss_clip": 1.00873065, + "balance_loss_mlp": 0.99883503, + "epoch": 0.9609499473921539, + "flos": 71114942586240.0, + "grad_norm": 0.6662874466097782, + "language_loss": 0.53221011, + "learning_rate": 1.595073680563286e-08, + "loss": 0.5522024, + "num_input_tokens_seen": 344852855, + "step": 15983, + "time_per_iteration": 3.3874897956848145 + }, + { + "auxiliary_loss_clip": 0.01107834, + "auxiliary_loss_mlp": 0.01034719, + "balance_loss_clip": 1.03694439, + "balance_loss_mlp": 1.02233326, + "epoch": 0.9610100706448219, + "flos": 20552466810240.0, + "grad_norm": 2.121316600078336, + "language_loss": 0.67938662, + "learning_rate": 1.5901685053070212e-08, + "loss": 0.70081216, + "num_input_tokens_seen": 344869830, + "step": 15984, + "time_per_iteration": 2.5932650566101074 + }, + { + "auxiliary_loss_clip": 0.01074236, + "auxiliary_loss_mlp": 0.01033486, + "balance_loss_clip": 1.03595209, + "balance_loss_mlp": 1.02153563, + "epoch": 0.9610701938974898, + "flos": 14064199459200.0, + "grad_norm": 1.5705983940163633, + "language_loss": 0.67496943, + "learning_rate": 1.5852708537714477e-08, + "loss": 0.69604665, + "num_input_tokens_seen": 344888905, + "step": 15985, + "time_per_iteration": 2.726486921310425 + }, + { + "auxiliary_loss_clip": 0.01108849, + "auxiliary_loss_mlp": 0.01032566, + "balance_loss_clip": 1.03674269, + "balance_loss_mlp": 1.02043021, + "epoch": 0.9611303171501578, + "flos": 20229989483520.0, + "grad_norm": 2.8268896406333237, + "language_loss": 0.78626662, + "learning_rate": 1.580380726142283e-08, + "loss": 0.80768073, + "num_input_tokens_seen": 344907160, + "step": 15986, + "time_per_iteration": 2.585028886795044 + }, + { + "auxiliary_loss_clip": 0.01059902, + "auxiliary_loss_mlp": 0.01031304, + "balance_loss_clip": 1.03704977, + "balance_loss_mlp": 1.01792264, + "epoch": 0.9611904404028258, + "flos": 20951075013120.0, + "grad_norm": 3.983829786169989, + "language_loss": 0.64043385, + "learning_rate": 1.5754981226049792e-08, + "loss": 0.6613459, + "num_input_tokens_seen": 344922400, + "step": 15987, + "time_per_iteration": 2.6663405895233154 + }, + { + "auxiliary_loss_clip": 0.01105457, + "auxiliary_loss_mlp": 0.01030427, + "balance_loss_clip": 1.03672767, + "balance_loss_mlp": 1.01882839, + "epoch": 0.9612505636554938, + "flos": 24827740214400.0, + "grad_norm": 1.6472459038973077, + "language_loss": 0.66917932, + "learning_rate": 1.5706230433446544e-08, + "loss": 0.69053823, + "num_input_tokens_seen": 344941910, + "step": 15988, + "time_per_iteration": 2.6424877643585205 + }, + { + "auxiliary_loss_clip": 0.01096712, + "auxiliary_loss_mlp": 0.01043698, + "balance_loss_clip": 1.03608358, + "balance_loss_mlp": 1.03205132, + "epoch": 0.9613106869081617, + "flos": 17164977955200.0, + "grad_norm": 2.1053842108044876, + "language_loss": 0.74786007, + "learning_rate": 1.5657554885462055e-08, + "loss": 0.76926422, + "num_input_tokens_seen": 344960020, + "step": 15989, + "time_per_iteration": 2.5956602096557617 + }, + { + "auxiliary_loss_clip": 0.01009811, + "auxiliary_loss_mlp": 0.01009546, + "balance_loss_clip": 1.00601673, + "balance_loss_mlp": 1.00818145, + "epoch": 0.9613708101608297, + "flos": 61563818522880.0, + "grad_norm": 0.8478358014550572, + "language_loss": 0.63107759, + "learning_rate": 1.5608954583941737e-08, + "loss": 0.65127116, + "num_input_tokens_seen": 345018290, + "step": 15990, + "time_per_iteration": 3.152273178100586 + }, + { + "auxiliary_loss_clip": 0.01096035, + "auxiliary_loss_mlp": 0.01034605, + "balance_loss_clip": 1.03537118, + "balance_loss_mlp": 1.02268958, + "epoch": 0.9614309334134977, + "flos": 27417904922880.0, + "grad_norm": 1.9574002604644196, + "language_loss": 0.77676558, + "learning_rate": 1.5560429530729003e-08, + "loss": 0.79807198, + "num_input_tokens_seen": 345040235, + "step": 15991, + "time_per_iteration": 4.212686538696289 + }, + { + "auxiliary_loss_clip": 0.01114207, + "auxiliary_loss_mlp": 0.01034295, + "balance_loss_clip": 1.0376842, + "balance_loss_mlp": 1.02099133, + "epoch": 0.9614910566661656, + "flos": 22819148611200.0, + "grad_norm": 2.7719678193079247, + "language_loss": 0.84980291, + "learning_rate": 1.5511979727663493e-08, + "loss": 0.87128794, + "num_input_tokens_seen": 345054540, + "step": 15992, + "time_per_iteration": 2.5331294536590576 + }, + { + "auxiliary_loss_clip": 0.01084656, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.0333364, + "balance_loss_mlp": 1.01737833, + "epoch": 0.9615511799188337, + "flos": 20667812359680.0, + "grad_norm": 5.393855831063401, + "language_loss": 0.7277714, + "learning_rate": 1.5463605176582406e-08, + "loss": 0.74892598, + "num_input_tokens_seen": 345074035, + "step": 15993, + "time_per_iteration": 4.279495000839233 + }, + { + "auxiliary_loss_clip": 0.01071095, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.03617215, + "balance_loss_mlp": 1.02064323, + "epoch": 0.9616113031715016, + "flos": 33149212035840.0, + "grad_norm": 1.4893605821515894, + "language_loss": 0.68342292, + "learning_rate": 1.5415305879320716e-08, + "loss": 0.70446742, + "num_input_tokens_seen": 345099270, + "step": 15994, + "time_per_iteration": 2.7772884368896484 + }, + { + "auxiliary_loss_clip": 0.01072149, + "auxiliary_loss_mlp": 0.01034975, + "balance_loss_clip": 1.03700161, + "balance_loss_mlp": 1.02212477, + "epoch": 0.9616714264241696, + "flos": 25009807276800.0, + "grad_norm": 1.9283329127731719, + "language_loss": 0.84783322, + "learning_rate": 1.5367081837709183e-08, + "loss": 0.86890447, + "num_input_tokens_seen": 345116975, + "step": 15995, + "time_per_iteration": 2.7321677207946777 + }, + { + "auxiliary_loss_clip": 0.0110129, + "auxiliary_loss_mlp": 0.01035588, + "balance_loss_clip": 1.03743207, + "balance_loss_mlp": 1.02226043, + "epoch": 0.9617315496768375, + "flos": 13547480359680.0, + "grad_norm": 2.3727684194410865, + "language_loss": 0.75815755, + "learning_rate": 1.5318933053576788e-08, + "loss": 0.77952629, + "num_input_tokens_seen": 345133645, + "step": 15996, + "time_per_iteration": 2.5802130699157715 + }, + { + "auxiliary_loss_clip": 0.01082505, + "auxiliary_loss_mlp": 0.0103401, + "balance_loss_clip": 1.03420186, + "balance_loss_mlp": 1.02108765, + "epoch": 0.9617916729295055, + "flos": 11254512781440.0, + "grad_norm": 3.2502659425610156, + "language_loss": 0.76369971, + "learning_rate": 1.52708595287494e-08, + "loss": 0.7848649, + "num_input_tokens_seen": 345150740, + "step": 15997, + "time_per_iteration": 4.12961745262146 + }, + { + "auxiliary_loss_clip": 0.01103332, + "auxiliary_loss_mlp": 0.00769549, + "balance_loss_clip": 1.03523898, + "balance_loss_mlp": 1.0002147, + "epoch": 0.9618517961821734, + "flos": 22819723228800.0, + "grad_norm": 1.6933641489070883, + "language_loss": 0.67267382, + "learning_rate": 1.522286126505001e-08, + "loss": 0.69140267, + "num_input_tokens_seen": 345170365, + "step": 15998, + "time_per_iteration": 2.5632731914520264 + }, + { + "auxiliary_loss_clip": 0.01079044, + "auxiliary_loss_mlp": 0.01030668, + "balance_loss_clip": 1.03057599, + "balance_loss_mlp": 1.01695287, + "epoch": 0.9619119194348414, + "flos": 16617340224000.0, + "grad_norm": 1.6277881889337782, + "language_loss": 0.7250607, + "learning_rate": 1.5174938264298498e-08, + "loss": 0.74615777, + "num_input_tokens_seen": 345188930, + "step": 15999, + "time_per_iteration": 2.5826117992401123 + }, + { + "auxiliary_loss_clip": 0.01079594, + "auxiliary_loss_mlp": 0.01023964, + "balance_loss_clip": 1.03278232, + "balance_loss_mlp": 1.01237655, + "epoch": 0.9619720426875094, + "flos": 24535140024960.0, + "grad_norm": 1.9376682350372685, + "language_loss": 0.65341753, + "learning_rate": 1.5127090528312514e-08, + "loss": 0.67445314, + "num_input_tokens_seen": 345209615, + "step": 16000, + "time_per_iteration": 2.6649346351623535 + }, + { + "auxiliary_loss_clip": 0.01074444, + "auxiliary_loss_mlp": 0.01027888, + "balance_loss_clip": 1.034127, + "balance_loss_mlp": 1.0147984, + "epoch": 0.9620321659401774, + "flos": 20632224960000.0, + "grad_norm": 1.8942735733189127, + "language_loss": 0.75229144, + "learning_rate": 1.5079318058905723e-08, + "loss": 0.77331471, + "num_input_tokens_seen": 345229175, + "step": 16001, + "time_per_iteration": 2.690169095993042 + }, + { + "auxiliary_loss_clip": 0.01093786, + "auxiliary_loss_mlp": 0.0103193, + "balance_loss_clip": 1.03392005, + "balance_loss_mlp": 1.01907945, + "epoch": 0.9620922891928453, + "flos": 18515290959360.0, + "grad_norm": 1.9242649128413576, + "language_loss": 0.68372071, + "learning_rate": 1.5031620857890447e-08, + "loss": 0.70497787, + "num_input_tokens_seen": 345247815, + "step": 16002, + "time_per_iteration": 2.609285831451416 + }, + { + "auxiliary_loss_clip": 0.01096986, + "auxiliary_loss_mlp": 0.01032704, + "balance_loss_clip": 1.03659725, + "balance_loss_mlp": 1.0204016, + "epoch": 0.9621524124455133, + "flos": 28767391914240.0, + "grad_norm": 1.3322402005995133, + "language_loss": 0.64338034, + "learning_rate": 1.4983998927074804e-08, + "loss": 0.66467726, + "num_input_tokens_seen": 345269935, + "step": 16003, + "time_per_iteration": 2.64509654045105 + }, + { + "auxiliary_loss_clip": 0.01056283, + "auxiliary_loss_mlp": 0.0103757, + "balance_loss_clip": 1.03516269, + "balance_loss_mlp": 1.02617371, + "epoch": 0.9622125356981813, + "flos": 19098875226240.0, + "grad_norm": 1.8799726685356375, + "language_loss": 0.75980008, + "learning_rate": 1.493645226826512e-08, + "loss": 0.78073859, + "num_input_tokens_seen": 345288310, + "step": 16004, + "time_per_iteration": 2.746777057647705 + }, + { + "auxiliary_loss_clip": 0.01096501, + "auxiliary_loss_mlp": 0.01030981, + "balance_loss_clip": 1.03659928, + "balance_loss_mlp": 1.01776099, + "epoch": 0.9622726589508492, + "flos": 20302816308480.0, + "grad_norm": 1.8988665709450379, + "language_loss": 0.79441619, + "learning_rate": 1.4888980883263958e-08, + "loss": 0.81569099, + "num_input_tokens_seen": 345306615, + "step": 16005, + "time_per_iteration": 2.6173338890075684 + }, + { + "auxiliary_loss_clip": 0.01093237, + "auxiliary_loss_mlp": 0.01030093, + "balance_loss_clip": 1.0344584, + "balance_loss_mlp": 1.01876855, + "epoch": 0.9623327822035173, + "flos": 54929750889600.0, + "grad_norm": 30.35501867161595, + "language_loss": 0.67897928, + "learning_rate": 1.4841584773871652e-08, + "loss": 0.7002126, + "num_input_tokens_seen": 345331935, + "step": 16006, + "time_per_iteration": 2.912827730178833 + }, + { + "auxiliary_loss_clip": 0.0107661, + "auxiliary_loss_mlp": 0.01037957, + "balance_loss_clip": 1.03514838, + "balance_loss_mlp": 1.02623272, + "epoch": 0.9623929054561852, + "flos": 21759029585280.0, + "grad_norm": 1.8340702205023383, + "language_loss": 0.77994108, + "learning_rate": 1.479426394188521e-08, + "loss": 0.80108666, + "num_input_tokens_seen": 345351510, + "step": 16007, + "time_per_iteration": 2.6248257160186768 + }, + { + "auxiliary_loss_clip": 0.0111027, + "auxiliary_loss_mlp": 0.010323, + "balance_loss_clip": 1.03747129, + "balance_loss_mlp": 1.01994443, + "epoch": 0.9624530287088532, + "flos": 17931563038080.0, + "grad_norm": 2.1097968556783244, + "language_loss": 0.67964327, + "learning_rate": 1.4747018389099198e-08, + "loss": 0.701069, + "num_input_tokens_seen": 345367750, + "step": 16008, + "time_per_iteration": 2.537191867828369 + }, + { + "auxiliary_loss_clip": 0.01085992, + "auxiliary_loss_mlp": 0.01032638, + "balance_loss_clip": 1.03743672, + "balance_loss_mlp": 1.01911998, + "epoch": 0.9625131519615211, + "flos": 23253739263360.0, + "grad_norm": 2.1552022323644846, + "language_loss": 0.72934628, + "learning_rate": 1.469984811730529e-08, + "loss": 0.75053251, + "num_input_tokens_seen": 345384790, + "step": 16009, + "time_per_iteration": 2.6170432567596436 + }, + { + "auxiliary_loss_clip": 0.01094692, + "auxiliary_loss_mlp": 0.01032269, + "balance_loss_clip": 1.03521502, + "balance_loss_mlp": 1.02035999, + "epoch": 0.9625732752141891, + "flos": 18916628595840.0, + "grad_norm": 2.236210012080847, + "language_loss": 0.75740463, + "learning_rate": 1.4652753128292061e-08, + "loss": 0.77867424, + "num_input_tokens_seen": 345403390, + "step": 16010, + "time_per_iteration": 2.6094565391540527 + }, + { + "auxiliary_loss_clip": 0.0110126, + "auxiliary_loss_mlp": 0.01033383, + "balance_loss_clip": 1.03804505, + "balance_loss_mlp": 1.01812458, + "epoch": 0.962633398466857, + "flos": 16252918790400.0, + "grad_norm": 1.8499312675955801, + "language_loss": 0.69607782, + "learning_rate": 1.4605733423845635e-08, + "loss": 0.71742427, + "num_input_tokens_seen": 345418685, + "step": 16011, + "time_per_iteration": 2.5665814876556396 + }, + { + "auxiliary_loss_clip": 0.01096422, + "auxiliary_loss_mlp": 0.01034569, + "balance_loss_clip": 1.03724504, + "balance_loss_mlp": 1.02317858, + "epoch": 0.962693521719525, + "flos": 54197424403200.0, + "grad_norm": 2.090107239169434, + "language_loss": 0.68528754, + "learning_rate": 1.4558789005748585e-08, + "loss": 0.70659745, + "num_input_tokens_seen": 345442380, + "step": 16012, + "time_per_iteration": 2.8673768043518066 + }, + { + "auxiliary_loss_clip": 0.0109098, + "auxiliary_loss_mlp": 0.01034537, + "balance_loss_clip": 1.03467774, + "balance_loss_mlp": 1.02032721, + "epoch": 0.962753644972193, + "flos": 33105795471360.0, + "grad_norm": 1.7699818597155268, + "language_loss": 0.72427005, + "learning_rate": 1.4511919875781264e-08, + "loss": 0.74552524, + "num_input_tokens_seen": 345463815, + "step": 16013, + "time_per_iteration": 2.75661301612854 + }, + { + "auxiliary_loss_clip": 0.01075741, + "auxiliary_loss_mlp": 0.01033286, + "balance_loss_clip": 1.03560877, + "balance_loss_mlp": 1.02013755, + "epoch": 0.962813768224861, + "flos": 42230660837760.0, + "grad_norm": 2.2049191715413996, + "language_loss": 0.63640058, + "learning_rate": 1.4465126035720698e-08, + "loss": 0.65749085, + "num_input_tokens_seen": 345484525, + "step": 16014, + "time_per_iteration": 2.801541328430176 + }, + { + "auxiliary_loss_clip": 0.01084087, + "auxiliary_loss_mlp": 0.01031024, + "balance_loss_clip": 1.03718603, + "balance_loss_mlp": 1.02020597, + "epoch": 0.9628738914775289, + "flos": 43944677003520.0, + "grad_norm": 1.6444594137562585, + "language_loss": 0.71679461, + "learning_rate": 1.4418407487341688e-08, + "loss": 0.73794574, + "num_input_tokens_seen": 345508295, + "step": 16015, + "time_per_iteration": 2.8245065212249756 + }, + { + "auxiliary_loss_clip": 0.01070924, + "auxiliary_loss_mlp": 0.01031789, + "balance_loss_clip": 1.03087783, + "balance_loss_mlp": 1.01914668, + "epoch": 0.9629340147301969, + "flos": 15596184476160.0, + "grad_norm": 1.8324403843710784, + "language_loss": 0.77434921, + "learning_rate": 1.4371764232415707e-08, + "loss": 0.79537642, + "num_input_tokens_seen": 345525155, + "step": 16016, + "time_per_iteration": 2.7069830894470215 + }, + { + "auxiliary_loss_clip": 0.01027071, + "auxiliary_loss_mlp": 0.01000442, + "balance_loss_clip": 1.0047729, + "balance_loss_mlp": 0.99953043, + "epoch": 0.9629941379828649, + "flos": 62951011816320.0, + "grad_norm": 0.808956080436883, + "language_loss": 0.63018364, + "learning_rate": 1.4325196272711337e-08, + "loss": 0.65045875, + "num_input_tokens_seen": 345578905, + "step": 16017, + "time_per_iteration": 3.0989813804626465 + }, + { + "auxiliary_loss_clip": 0.01093389, + "auxiliary_loss_mlp": 0.0102702, + "balance_loss_clip": 1.03960085, + "balance_loss_mlp": 1.01511717, + "epoch": 0.9630542612355328, + "flos": 29899116702720.0, + "grad_norm": 1.8256798404845316, + "language_loss": 0.66259742, + "learning_rate": 1.4278703609994502e-08, + "loss": 0.68380153, + "num_input_tokens_seen": 345598965, + "step": 16018, + "time_per_iteration": 2.7493810653686523 + }, + { + "auxiliary_loss_clip": 0.01059951, + "auxiliary_loss_mlp": 0.01036182, + "balance_loss_clip": 1.03763199, + "balance_loss_mlp": 1.02381968, + "epoch": 0.9631143844882009, + "flos": 17894575008000.0, + "grad_norm": 1.944806621091631, + "language_loss": 0.79563761, + "learning_rate": 1.4232286246028457e-08, + "loss": 0.81659889, + "num_input_tokens_seen": 345617945, + "step": 16019, + "time_per_iteration": 2.6809628009796143 + }, + { + "auxiliary_loss_clip": 0.01070109, + "auxiliary_loss_mlp": 0.01029701, + "balance_loss_clip": 1.03343022, + "balance_loss_mlp": 1.01866817, + "epoch": 0.9631745077408688, + "flos": 26139161767680.0, + "grad_norm": 1.782612322393727, + "language_loss": 0.71960497, + "learning_rate": 1.4185944182572907e-08, + "loss": 0.74060309, + "num_input_tokens_seen": 345637920, + "step": 16020, + "time_per_iteration": 2.724942684173584 + }, + { + "auxiliary_loss_clip": 0.01084456, + "auxiliary_loss_mlp": 0.01026989, + "balance_loss_clip": 1.03556895, + "balance_loss_mlp": 1.01536036, + "epoch": 0.9632346309935368, + "flos": 24973645259520.0, + "grad_norm": 2.276745158926346, + "language_loss": 0.77092677, + "learning_rate": 1.4139677421385331e-08, + "loss": 0.79204124, + "num_input_tokens_seen": 345656195, + "step": 16021, + "time_per_iteration": 2.6800484657287598 + }, + { + "auxiliary_loss_clip": 0.01074317, + "auxiliary_loss_mlp": 0.01030211, + "balance_loss_clip": 1.03503883, + "balance_loss_mlp": 1.0156492, + "epoch": 0.9632947542462047, + "flos": 23617226943360.0, + "grad_norm": 2.2141346213360498, + "language_loss": 0.6477133, + "learning_rate": 1.4093485964220331e-08, + "loss": 0.66875851, + "num_input_tokens_seen": 345676700, + "step": 16022, + "time_per_iteration": 2.6913392543792725 + }, + { + "auxiliary_loss_clip": 0.01079957, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.03176844, + "balance_loss_mlp": 1.02543032, + "epoch": 0.9633548774988727, + "flos": 26395599939840.0, + "grad_norm": 1.8655459266575873, + "language_loss": 0.73232532, + "learning_rate": 1.4047369812829168e-08, + "loss": 0.75349891, + "num_input_tokens_seen": 345696725, + "step": 16023, + "time_per_iteration": 2.8063480854034424 + }, + { + "auxiliary_loss_clip": 0.01092328, + "auxiliary_loss_mlp": 0.01033534, + "balance_loss_clip": 1.03424549, + "balance_loss_mlp": 1.02042127, + "epoch": 0.9634150007515406, + "flos": 23767728929280.0, + "grad_norm": 1.4474458645948844, + "language_loss": 0.81416321, + "learning_rate": 1.4001328968960891e-08, + "loss": 0.8354218, + "num_input_tokens_seen": 345716245, + "step": 16024, + "time_per_iteration": 2.6448142528533936 + }, + { + "auxiliary_loss_clip": 0.01102103, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.03745365, + "balance_loss_mlp": 1.01969528, + "epoch": 0.9634751240042086, + "flos": 24135346673280.0, + "grad_norm": 1.5219305560935168, + "language_loss": 0.81457579, + "learning_rate": 1.3955363434361212e-08, + "loss": 0.83592141, + "num_input_tokens_seen": 345739060, + "step": 16025, + "time_per_iteration": 2.6108663082122803 + }, + { + "auxiliary_loss_clip": 0.0110069, + "auxiliary_loss_mlp": 0.01030354, + "balance_loss_clip": 1.03579926, + "balance_loss_mlp": 1.017694, + "epoch": 0.9635352472568766, + "flos": 24349086552960.0, + "grad_norm": 2.10441973449587, + "language_loss": 0.75937688, + "learning_rate": 1.3909473210773181e-08, + "loss": 0.78068733, + "num_input_tokens_seen": 345758325, + "step": 16026, + "time_per_iteration": 2.6266496181488037 + }, + { + "auxiliary_loss_clip": 0.01073067, + "auxiliary_loss_mlp": 0.00772375, + "balance_loss_clip": 1.03285146, + "balance_loss_mlp": 1.00015044, + "epoch": 0.9635953705095446, + "flos": 23984772860160.0, + "grad_norm": 1.7472949763500514, + "language_loss": 0.632388, + "learning_rate": 1.3863658299936965e-08, + "loss": 0.65084237, + "num_input_tokens_seen": 345778530, + "step": 16027, + "time_per_iteration": 2.7170257568359375 + }, + { + "auxiliary_loss_clip": 0.01099141, + "auxiliary_loss_mlp": 0.01031793, + "balance_loss_clip": 1.0367496, + "balance_loss_mlp": 1.01860225, + "epoch": 0.9636554937622125, + "flos": 19828436365440.0, + "grad_norm": 2.4648071032004997, + "language_loss": 0.87019849, + "learning_rate": 1.3817918703589837e-08, + "loss": 0.89150786, + "num_input_tokens_seen": 345796535, + "step": 16028, + "time_per_iteration": 2.6614620685577393 + }, + { + "auxiliary_loss_clip": 0.00989988, + "auxiliary_loss_mlp": 0.009984, + "balance_loss_clip": 1.01412296, + "balance_loss_mlp": 0.99733889, + "epoch": 0.9637156170148805, + "flos": 67435499986560.0, + "grad_norm": 0.7245646375690661, + "language_loss": 0.53189749, + "learning_rate": 1.3772254423466412e-08, + "loss": 0.55178136, + "num_input_tokens_seen": 345859700, + "step": 16029, + "time_per_iteration": 3.479651927947998 + }, + { + "auxiliary_loss_clip": 0.01110359, + "auxiliary_loss_mlp": 0.01030913, + "balance_loss_clip": 1.0374155, + "balance_loss_mlp": 1.01844335, + "epoch": 0.9637757402675484, + "flos": 20300912887680.0, + "grad_norm": 1.5210490896959066, + "language_loss": 0.7357558, + "learning_rate": 1.372666546129797e-08, + "loss": 0.75716853, + "num_input_tokens_seen": 345878760, + "step": 16030, + "time_per_iteration": 4.589270353317261 + }, + { + "auxiliary_loss_clip": 0.01082803, + "auxiliary_loss_mlp": 0.01030981, + "balance_loss_clip": 1.03516376, + "balance_loss_mlp": 1.01882792, + "epoch": 0.9638358635202164, + "flos": 27234544970880.0, + "grad_norm": 2.0480859370229485, + "language_loss": 0.66053402, + "learning_rate": 1.3681151818813575e-08, + "loss": 0.68167186, + "num_input_tokens_seen": 345900445, + "step": 16031, + "time_per_iteration": 4.3295276165008545 + }, + { + "auxiliary_loss_clip": 0.0101801, + "auxiliary_loss_mlp": 0.00751055, + "balance_loss_clip": 1.00562906, + "balance_loss_mlp": 0.99969733, + "epoch": 0.9638959867728845, + "flos": 70288998278400.0, + "grad_norm": 0.8510769072154526, + "language_loss": 0.60678655, + "learning_rate": 1.3635713497738955e-08, + "loss": 0.62447721, + "num_input_tokens_seen": 345961020, + "step": 16032, + "time_per_iteration": 4.807501554489136 + }, + { + "auxiliary_loss_clip": 0.01087947, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.03354275, + "balance_loss_mlp": 1.01818657, + "epoch": 0.9639561100255524, + "flos": 25407517639680.0, + "grad_norm": 1.683266113413322, + "language_loss": 0.66466224, + "learning_rate": 1.3590350499796954e-08, + "loss": 0.68583459, + "num_input_tokens_seen": 345980210, + "step": 16033, + "time_per_iteration": 2.6166305541992188 + }, + { + "auxiliary_loss_clip": 0.01049582, + "auxiliary_loss_mlp": 0.01033058, + "balance_loss_clip": 1.03215432, + "balance_loss_mlp": 1.02048707, + "epoch": 0.9640162332782204, + "flos": 18113881495680.0, + "grad_norm": 1.6681343659776384, + "language_loss": 0.65576452, + "learning_rate": 1.3545062826707976e-08, + "loss": 0.67659092, + "num_input_tokens_seen": 345998280, + "step": 16034, + "time_per_iteration": 2.727808713912964 + }, + { + "auxiliary_loss_clip": 0.01064646, + "auxiliary_loss_mlp": 0.01034078, + "balance_loss_clip": 1.03320181, + "balance_loss_mlp": 1.02124476, + "epoch": 0.9640763565308883, + "flos": 23440295525760.0, + "grad_norm": 4.072427623407378, + "language_loss": 0.74320328, + "learning_rate": 1.3499850480189313e-08, + "loss": 0.7641905, + "num_input_tokens_seen": 346015545, + "step": 16035, + "time_per_iteration": 2.690566301345825 + }, + { + "auxiliary_loss_clip": 0.01111339, + "auxiliary_loss_mlp": 0.0102927, + "balance_loss_clip": 1.03984404, + "balance_loss_mlp": 1.01689649, + "epoch": 0.9641364797835563, + "flos": 22419355259520.0, + "grad_norm": 1.9463375206505085, + "language_loss": 0.81678671, + "learning_rate": 1.3454713461955591e-08, + "loss": 0.83819282, + "num_input_tokens_seen": 346034055, + "step": 16036, + "time_per_iteration": 4.158876180648804 + }, + { + "auxiliary_loss_clip": 0.0107928, + "auxiliary_loss_mlp": 0.0103524, + "balance_loss_clip": 1.03454709, + "balance_loss_mlp": 1.0221982, + "epoch": 0.9641966030362242, + "flos": 30622357048320.0, + "grad_norm": 1.959482947327249, + "language_loss": 0.69556695, + "learning_rate": 1.340965177371789e-08, + "loss": 0.71671212, + "num_input_tokens_seen": 346054130, + "step": 16037, + "time_per_iteration": 2.7688260078430176 + }, + { + "auxiliary_loss_clip": 0.01107935, + "auxiliary_loss_mlp": 0.01027132, + "balance_loss_clip": 1.03539455, + "balance_loss_mlp": 1.014907, + "epoch": 0.9642567262888923, + "flos": 20953122088320.0, + "grad_norm": 1.9894338477603324, + "language_loss": 0.63357198, + "learning_rate": 1.3364665417185506e-08, + "loss": 0.65492266, + "num_input_tokens_seen": 346072990, + "step": 16038, + "time_per_iteration": 2.5850584506988525 + }, + { + "auxiliary_loss_clip": 0.01074768, + "auxiliary_loss_mlp": 0.00773215, + "balance_loss_clip": 1.03389633, + "balance_loss_mlp": 1.00020552, + "epoch": 0.9643168495415602, + "flos": 22639415932800.0, + "grad_norm": 1.7788742009808307, + "language_loss": 0.71187615, + "learning_rate": 1.3319754394064187e-08, + "loss": 0.73035598, + "num_input_tokens_seen": 346093745, + "step": 16039, + "time_per_iteration": 2.845629930496216 + }, + { + "auxiliary_loss_clip": 0.01065131, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.03418183, + "balance_loss_mlp": 1.02005541, + "epoch": 0.9643769727942282, + "flos": 20266259241600.0, + "grad_norm": 2.1186364424609376, + "language_loss": 0.73193431, + "learning_rate": 1.327491870605657e-08, + "loss": 0.7529155, + "num_input_tokens_seen": 346110115, + "step": 16040, + "time_per_iteration": 2.786925792694092 + }, + { + "auxiliary_loss_clip": 0.01098258, + "auxiliary_loss_mlp": 0.01030326, + "balance_loss_clip": 1.03442872, + "balance_loss_mlp": 1.0174036, + "epoch": 0.9644370960468961, + "flos": 13881845088000.0, + "grad_norm": 2.252747259214268, + "language_loss": 0.72871804, + "learning_rate": 1.3230158354863296e-08, + "loss": 0.75000393, + "num_input_tokens_seen": 346127165, + "step": 16041, + "time_per_iteration": 2.6087379455566406 + }, + { + "auxiliary_loss_clip": 0.01079942, + "auxiliary_loss_mlp": 0.01032192, + "balance_loss_clip": 1.03319049, + "balance_loss_mlp": 1.0204258, + "epoch": 0.9644972192995641, + "flos": 17238199829760.0, + "grad_norm": 2.3450259817434675, + "language_loss": 0.7170828, + "learning_rate": 1.3185473342181674e-08, + "loss": 0.73820412, + "num_input_tokens_seen": 346145950, + "step": 16042, + "time_per_iteration": 2.630866765975952 + }, + { + "auxiliary_loss_clip": 0.01071379, + "auxiliary_loss_mlp": 0.0103485, + "balance_loss_clip": 1.03418255, + "balance_loss_mlp": 1.02246428, + "epoch": 0.964557342552232, + "flos": 23840340272640.0, + "grad_norm": 2.7842681771990954, + "language_loss": 0.80969441, + "learning_rate": 1.3140863669705683e-08, + "loss": 0.83075678, + "num_input_tokens_seen": 346165005, + "step": 16043, + "time_per_iteration": 2.7390518188476562 + }, + { + "auxiliary_loss_clip": 0.01080445, + "auxiliary_loss_mlp": 0.01033983, + "balance_loss_clip": 1.03533363, + "balance_loss_mlp": 1.0219785, + "epoch": 0.9646174658049, + "flos": 21653129312640.0, + "grad_norm": 1.6601766857412785, + "language_loss": 0.71968645, + "learning_rate": 1.3096329339127522e-08, + "loss": 0.74083078, + "num_input_tokens_seen": 346185095, + "step": 16044, + "time_per_iteration": 2.7201802730560303 + }, + { + "auxiliary_loss_clip": 0.01082368, + "auxiliary_loss_mlp": 0.01030048, + "balance_loss_clip": 1.03337395, + "balance_loss_mlp": 1.01734579, + "epoch": 0.9646775890575681, + "flos": 17129570123520.0, + "grad_norm": 1.8680563800690775, + "language_loss": 0.70015121, + "learning_rate": 1.3051870352135397e-08, + "loss": 0.72127533, + "num_input_tokens_seen": 346202580, + "step": 16045, + "time_per_iteration": 2.612548589706421 + }, + { + "auxiliary_loss_clip": 0.01038509, + "auxiliary_loss_mlp": 0.01033861, + "balance_loss_clip": 1.03134286, + "balance_loss_mlp": 1.02050328, + "epoch": 0.964737712310236, + "flos": 13005732458880.0, + "grad_norm": 1.8349369977772942, + "language_loss": 0.74999833, + "learning_rate": 1.3007486710415737e-08, + "loss": 0.77072203, + "num_input_tokens_seen": 346219395, + "step": 16046, + "time_per_iteration": 2.7376601696014404 + }, + { + "auxiliary_loss_clip": 0.0110139, + "auxiliary_loss_mlp": 0.01036417, + "balance_loss_clip": 1.03724825, + "balance_loss_mlp": 1.0229938, + "epoch": 0.964797835562904, + "flos": 24279240556800.0, + "grad_norm": 2.6746075842728643, + "language_loss": 0.62799901, + "learning_rate": 1.2963178415651199e-08, + "loss": 0.64937705, + "num_input_tokens_seen": 346239715, + "step": 16047, + "time_per_iteration": 2.6332709789276123 + }, + { + "auxiliary_loss_clip": 0.01088739, + "auxiliary_loss_mlp": 0.01036737, + "balance_loss_clip": 1.0394733, + "balance_loss_mlp": 1.02437496, + "epoch": 0.9648579588155719, + "flos": 20522697413760.0, + "grad_norm": 3.1779887131346722, + "language_loss": 0.68779409, + "learning_rate": 1.2918945469521992e-08, + "loss": 0.70904881, + "num_input_tokens_seen": 346258500, + "step": 16048, + "time_per_iteration": 2.6534385681152344 + }, + { + "auxiliary_loss_clip": 0.01099634, + "auxiliary_loss_mlp": 0.01033136, + "balance_loss_clip": 1.03766835, + "balance_loss_mlp": 1.02040398, + "epoch": 0.9649180820682399, + "flos": 32154844855680.0, + "grad_norm": 1.6641327759979738, + "language_loss": 0.63842821, + "learning_rate": 1.2874787873705662e-08, + "loss": 0.65975595, + "num_input_tokens_seen": 346279110, + "step": 16049, + "time_per_iteration": 2.7865707874298096 + }, + { + "auxiliary_loss_clip": 0.0110081, + "auxiliary_loss_mlp": 0.01031714, + "balance_loss_clip": 1.03886986, + "balance_loss_mlp": 1.01909614, + "epoch": 0.9649782053209078, + "flos": 20522589672960.0, + "grad_norm": 1.7000541737371648, + "language_loss": 0.70881176, + "learning_rate": 1.2830705629876427e-08, + "loss": 0.73013705, + "num_input_tokens_seen": 346297860, + "step": 16050, + "time_per_iteration": 2.6416265964508057 + }, + { + "auxiliary_loss_clip": 0.01097319, + "auxiliary_loss_mlp": 0.01036578, + "balance_loss_clip": 1.03291678, + "balance_loss_mlp": 1.02254736, + "epoch": 0.9650383285735759, + "flos": 43067953843200.0, + "grad_norm": 1.8954759239301664, + "language_loss": 0.70080233, + "learning_rate": 1.278669873970606e-08, + "loss": 0.72214133, + "num_input_tokens_seen": 346319860, + "step": 16051, + "time_per_iteration": 2.8770833015441895 + }, + { + "auxiliary_loss_clip": 0.0101809, + "auxiliary_loss_mlp": 0.01006389, + "balance_loss_clip": 1.0055362, + "balance_loss_mlp": 1.00536346, + "epoch": 0.9650984518262438, + "flos": 61748255882880.0, + "grad_norm": 0.8414397743745523, + "language_loss": 0.59155834, + "learning_rate": 1.2742767204863004e-08, + "loss": 0.61180305, + "num_input_tokens_seen": 346379025, + "step": 16052, + "time_per_iteration": 3.190720796585083 + }, + { + "auxiliary_loss_clip": 0.01103599, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_clip": 1.03430974, + "balance_loss_mlp": 1.01511848, + "epoch": 0.9651585750789118, + "flos": 29789337761280.0, + "grad_norm": 1.6456088019089208, + "language_loss": 0.74250531, + "learning_rate": 1.2698911027013482e-08, + "loss": 0.76381516, + "num_input_tokens_seen": 346402250, + "step": 16053, + "time_per_iteration": 2.707024335861206 + }, + { + "auxiliary_loss_clip": 0.01083745, + "auxiliary_loss_mlp": 0.01030902, + "balance_loss_clip": 1.03504825, + "balance_loss_mlp": 1.01819479, + "epoch": 0.9652186983315797, + "flos": 16873060124160.0, + "grad_norm": 2.539273604923119, + "language_loss": 0.68519378, + "learning_rate": 1.2655130207820386e-08, + "loss": 0.70634031, + "num_input_tokens_seen": 346419555, + "step": 16054, + "time_per_iteration": 2.650216817855835 + }, + { + "auxiliary_loss_clip": 0.01091665, + "auxiliary_loss_mlp": 0.00769869, + "balance_loss_clip": 1.03783798, + "balance_loss_mlp": 1.00018322, + "epoch": 0.9652788215842477, + "flos": 31649761762560.0, + "grad_norm": 1.504057282029753, + "language_loss": 0.6170547, + "learning_rate": 1.2611424748943944e-08, + "loss": 0.63567007, + "num_input_tokens_seen": 346441245, + "step": 16055, + "time_per_iteration": 2.708653450012207 + }, + { + "auxiliary_loss_clip": 0.01069001, + "auxiliary_loss_mlp": 0.01032832, + "balance_loss_clip": 1.03551924, + "balance_loss_mlp": 1.02045822, + "epoch": 0.9653389448369156, + "flos": 24754266944640.0, + "grad_norm": 1.8401015391403723, + "language_loss": 0.77219534, + "learning_rate": 1.2567794652041719e-08, + "loss": 0.79321373, + "num_input_tokens_seen": 346460065, + "step": 16056, + "time_per_iteration": 2.860055446624756 + }, + { + "auxiliary_loss_clip": 0.01081129, + "auxiliary_loss_mlp": 0.01031879, + "balance_loss_clip": 1.03277361, + "balance_loss_mlp": 1.01884317, + "epoch": 0.9653990680895836, + "flos": 20297249700480.0, + "grad_norm": 1.5426450454186225, + "language_loss": 0.71504593, + "learning_rate": 1.2524239918767498e-08, + "loss": 0.73617601, + "num_input_tokens_seen": 346478005, + "step": 16057, + "time_per_iteration": 2.6402721405029297 + }, + { + "auxiliary_loss_clip": 0.01104126, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.03448784, + "balance_loss_mlp": 1.02154279, + "epoch": 0.9654591913422517, + "flos": 22528775064960.0, + "grad_norm": 2.1611646514701786, + "language_loss": 0.71808469, + "learning_rate": 1.2480760550773295e-08, + "loss": 0.73945796, + "num_input_tokens_seen": 346497575, + "step": 16058, + "time_per_iteration": 2.5751798152923584 + }, + { + "auxiliary_loss_clip": 0.01095378, + "auxiliary_loss_mlp": 0.01032935, + "balance_loss_clip": 1.0353595, + "balance_loss_mlp": 1.02075791, + "epoch": 0.9655193145949196, + "flos": 26763002202240.0, + "grad_norm": 1.6714085825517457, + "language_loss": 0.74098462, + "learning_rate": 1.2437356549708011e-08, + "loss": 0.76226771, + "num_input_tokens_seen": 346520000, + "step": 16059, + "time_per_iteration": 2.7003426551818848 + }, + { + "auxiliary_loss_clip": 0.01090004, + "auxiliary_loss_mlp": 0.01034435, + "balance_loss_clip": 1.03552127, + "balance_loss_mlp": 1.02234113, + "epoch": 0.9655794378475876, + "flos": 41970703132800.0, + "grad_norm": 1.9389350053805974, + "language_loss": 0.73612213, + "learning_rate": 1.239402791721722e-08, + "loss": 0.75736654, + "num_input_tokens_seen": 346541605, + "step": 16060, + "time_per_iteration": 2.784961462020874 + }, + { + "auxiliary_loss_clip": 0.01084764, + "auxiliary_loss_mlp": 0.01031011, + "balance_loss_clip": 1.03691041, + "balance_loss_mlp": 1.019889, + "epoch": 0.9656395611002555, + "flos": 27709427704320.0, + "grad_norm": 2.3988386788091502, + "language_loss": 0.76481092, + "learning_rate": 1.2350774654944273e-08, + "loss": 0.78596866, + "num_input_tokens_seen": 346560955, + "step": 16061, + "time_per_iteration": 2.7270572185516357 + }, + { + "auxiliary_loss_clip": 0.01012338, + "auxiliary_loss_mlp": 0.01000976, + "balance_loss_clip": 1.00929773, + "balance_loss_mlp": 1.00001049, + "epoch": 0.9656996843529235, + "flos": 68968562411520.0, + "grad_norm": 0.7235401443187384, + "language_loss": 0.64154565, + "learning_rate": 1.2307596764528749e-08, + "loss": 0.66167879, + "num_input_tokens_seen": 346621615, + "step": 16062, + "time_per_iteration": 3.263425827026367 + }, + { + "auxiliary_loss_clip": 0.01055166, + "auxiliary_loss_mlp": 0.01028261, + "balance_loss_clip": 1.02995956, + "balance_loss_mlp": 1.01672757, + "epoch": 0.9657598076055914, + "flos": 20631327120000.0, + "grad_norm": 1.9907973555494325, + "language_loss": 0.92924762, + "learning_rate": 1.226449424760867e-08, + "loss": 0.95008188, + "num_input_tokens_seen": 346637460, + "step": 16063, + "time_per_iteration": 2.728024959564209 + }, + { + "auxiliary_loss_clip": 0.01099068, + "auxiliary_loss_mlp": 0.01033776, + "balance_loss_clip": 1.03742814, + "balance_loss_mlp": 1.02153897, + "epoch": 0.9658199308582595, + "flos": 20448577699200.0, + "grad_norm": 1.7715018792062844, + "language_loss": 0.82029349, + "learning_rate": 1.2221467105818062e-08, + "loss": 0.84162194, + "num_input_tokens_seen": 346655625, + "step": 16064, + "time_per_iteration": 2.633328914642334 + }, + { + "auxiliary_loss_clip": 0.01095042, + "auxiliary_loss_mlp": 0.00770428, + "balance_loss_clip": 1.03698933, + "balance_loss_mlp": 1.0001657, + "epoch": 0.9658800541109274, + "flos": 24718033100160.0, + "grad_norm": 1.5465951740979789, + "language_loss": 0.84208536, + "learning_rate": 1.2178515340788731e-08, + "loss": 0.86074007, + "num_input_tokens_seen": 346675220, + "step": 16065, + "time_per_iteration": 2.6656553745269775 + }, + { + "auxiliary_loss_clip": 0.01083456, + "auxiliary_loss_mlp": 0.01029904, + "balance_loss_clip": 1.03509152, + "balance_loss_mlp": 1.01748872, + "epoch": 0.9659401773635954, + "flos": 21610035970560.0, + "grad_norm": 1.7587516083331964, + "language_loss": 0.67517728, + "learning_rate": 1.2135638954149151e-08, + "loss": 0.69631088, + "num_input_tokens_seen": 346694710, + "step": 16066, + "time_per_iteration": 2.6471195220947266 + }, + { + "auxiliary_loss_clip": 0.01107434, + "auxiliary_loss_mlp": 0.0102656, + "balance_loss_clip": 1.03636479, + "balance_loss_mlp": 1.01466918, + "epoch": 0.9660003006162633, + "flos": 20301200196480.0, + "grad_norm": 1.8111019231916714, + "language_loss": 0.82353568, + "learning_rate": 1.209283794752558e-08, + "loss": 0.84487563, + "num_input_tokens_seen": 346712645, + "step": 16067, + "time_per_iteration": 2.605968952178955 + }, + { + "auxiliary_loss_clip": 0.01087949, + "auxiliary_loss_mlp": 0.01029805, + "balance_loss_clip": 1.03769147, + "balance_loss_mlp": 1.01721048, + "epoch": 0.9660604238689313, + "flos": 24461954064000.0, + "grad_norm": 2.0465290050813496, + "language_loss": 0.69553685, + "learning_rate": 1.2050112322540496e-08, + "loss": 0.71671438, + "num_input_tokens_seen": 346732375, + "step": 16068, + "time_per_iteration": 2.7153985500335693 + }, + { + "auxiliary_loss_clip": 0.01085915, + "auxiliary_loss_mlp": 0.01031292, + "balance_loss_clip": 1.0330863, + "balance_loss_mlp": 1.02038455, + "epoch": 0.9661205471215992, + "flos": 19864023765120.0, + "grad_norm": 1.6826807111904172, + "language_loss": 0.68126762, + "learning_rate": 1.20074620808146e-08, + "loss": 0.70243973, + "num_input_tokens_seen": 346750430, + "step": 16069, + "time_per_iteration": 2.576427936553955 + }, + { + "auxiliary_loss_clip": 0.01089339, + "auxiliary_loss_mlp": 0.01028006, + "balance_loss_clip": 1.03860068, + "balance_loss_mlp": 1.01594257, + "epoch": 0.9661806703742672, + "flos": 20557889763840.0, + "grad_norm": 1.979804846920071, + "language_loss": 0.8906877, + "learning_rate": 1.1964887223964826e-08, + "loss": 0.91186118, + "num_input_tokens_seen": 346768455, + "step": 16070, + "time_per_iteration": 5.773402214050293 + }, + { + "auxiliary_loss_clip": 0.01111791, + "auxiliary_loss_mlp": 0.01038495, + "balance_loss_clip": 1.03955567, + "balance_loss_mlp": 1.02573359, + "epoch": 0.9662407936269353, + "flos": 21430949736960.0, + "grad_norm": 2.2069490271978327, + "language_loss": 0.77111554, + "learning_rate": 1.1922387753605878e-08, + "loss": 0.79261839, + "num_input_tokens_seen": 346786530, + "step": 16071, + "time_per_iteration": 4.432383060455322 + }, + { + "auxiliary_loss_clip": 0.01083604, + "auxiliary_loss_mlp": 0.01031428, + "balance_loss_clip": 1.03396428, + "balance_loss_mlp": 1.01729596, + "epoch": 0.9663009168796032, + "flos": 14902893095040.0, + "grad_norm": 1.7077316996855652, + "language_loss": 0.65930271, + "learning_rate": 1.1879963671349137e-08, + "loss": 0.680453, + "num_input_tokens_seen": 346804635, + "step": 16072, + "time_per_iteration": 2.6231675148010254 + }, + { + "auxiliary_loss_clip": 0.01101171, + "auxiliary_loss_mlp": 0.01031913, + "balance_loss_clip": 1.03714108, + "balance_loss_mlp": 1.02001643, + "epoch": 0.9663610401322712, + "flos": 24310877460480.0, + "grad_norm": 1.7479386785661417, + "language_loss": 0.77363575, + "learning_rate": 1.1837614978803534e-08, + "loss": 0.7949667, + "num_input_tokens_seen": 346823070, + "step": 16073, + "time_per_iteration": 2.6588406562805176 + }, + { + "auxiliary_loss_clip": 0.01113364, + "auxiliary_loss_mlp": 0.01036179, + "balance_loss_clip": 1.03833628, + "balance_loss_mlp": 1.02317297, + "epoch": 0.9664211633849391, + "flos": 17637849527040.0, + "grad_norm": 4.0139714248409515, + "language_loss": 0.7596699, + "learning_rate": 1.1795341677574677e-08, + "loss": 0.78116536, + "num_input_tokens_seen": 346841180, + "step": 16074, + "time_per_iteration": 2.5176475048065186 + }, + { + "auxiliary_loss_clip": 0.01085316, + "auxiliary_loss_mlp": 0.01031408, + "balance_loss_clip": 1.03595638, + "balance_loss_mlp": 1.01841474, + "epoch": 0.9664812866376071, + "flos": 29789409588480.0, + "grad_norm": 1.5863798083052476, + "language_loss": 0.75684714, + "learning_rate": 1.1753143769265728e-08, + "loss": 0.77801442, + "num_input_tokens_seen": 346864250, + "step": 16075, + "time_per_iteration": 2.740597724914551 + }, + { + "auxiliary_loss_clip": 0.01078205, + "auxiliary_loss_mlp": 0.01035752, + "balance_loss_clip": 1.03695774, + "balance_loss_mlp": 1.02323484, + "epoch": 0.966541409890275, + "flos": 14282320798080.0, + "grad_norm": 1.8962598568271254, + "language_loss": 0.78820133, + "learning_rate": 1.171102125547696e-08, + "loss": 0.80934089, + "num_input_tokens_seen": 346881955, + "step": 16076, + "time_per_iteration": 4.21985650062561 + }, + { + "auxiliary_loss_clip": 0.01089256, + "auxiliary_loss_mlp": 0.01043191, + "balance_loss_clip": 1.03779173, + "balance_loss_mlp": 1.02938676, + "epoch": 0.9666015331429431, + "flos": 19860432405120.0, + "grad_norm": 1.7349669135653192, + "language_loss": 0.7190969, + "learning_rate": 1.166897413780532e-08, + "loss": 0.74042135, + "num_input_tokens_seen": 346900445, + "step": 16077, + "time_per_iteration": 2.626159191131592 + }, + { + "auxiliary_loss_clip": 0.01093266, + "auxiliary_loss_mlp": 0.01032581, + "balance_loss_clip": 1.03399146, + "balance_loss_mlp": 1.01980758, + "epoch": 0.966661656395611, + "flos": 27125951178240.0, + "grad_norm": 1.9266659878552612, + "language_loss": 0.593472, + "learning_rate": 1.1627002417845533e-08, + "loss": 0.61473054, + "num_input_tokens_seen": 346920135, + "step": 16078, + "time_per_iteration": 2.6967270374298096 + }, + { + "auxiliary_loss_clip": 0.01101009, + "auxiliary_loss_mlp": 0.0103522, + "balance_loss_clip": 1.03683424, + "balance_loss_mlp": 1.02165985, + "epoch": 0.966721779648279, + "flos": 21508229848320.0, + "grad_norm": 1.883589824979691, + "language_loss": 0.72105432, + "learning_rate": 1.158510609718899e-08, + "loss": 0.74241656, + "num_input_tokens_seen": 346940450, + "step": 16079, + "time_per_iteration": 2.63110089302063 + }, + { + "auxiliary_loss_clip": 0.01094454, + "auxiliary_loss_mlp": 0.01027425, + "balance_loss_clip": 1.03552699, + "balance_loss_mlp": 1.01592135, + "epoch": 0.9667819029009469, + "flos": 23878118401920.0, + "grad_norm": 1.528864037931963, + "language_loss": 0.71972895, + "learning_rate": 1.1543285177424644e-08, + "loss": 0.74094772, + "num_input_tokens_seen": 346960935, + "step": 16080, + "time_per_iteration": 2.6290252208709717 + }, + { + "auxiliary_loss_clip": 0.01075746, + "auxiliary_loss_mlp": 0.0103416, + "balance_loss_clip": 1.03217447, + "balance_loss_mlp": 1.02045643, + "epoch": 0.9668420261536149, + "flos": 21507224267520.0, + "grad_norm": 1.6987481016197885, + "language_loss": 0.73362374, + "learning_rate": 1.1501539660138115e-08, + "loss": 0.75472283, + "num_input_tokens_seen": 346980100, + "step": 16081, + "time_per_iteration": 2.6839892864227295 + }, + { + "auxiliary_loss_clip": 0.01080983, + "auxiliary_loss_mlp": 0.0102965, + "balance_loss_clip": 1.03344238, + "balance_loss_mlp": 1.01646531, + "epoch": 0.9669021494062828, + "flos": 26687266375680.0, + "grad_norm": 1.771131179159937, + "language_loss": 0.67452699, + "learning_rate": 1.145986954691236e-08, + "loss": 0.69563329, + "num_input_tokens_seen": 347001250, + "step": 16082, + "time_per_iteration": 2.7003889083862305 + }, + { + "auxiliary_loss_clip": 0.01065498, + "auxiliary_loss_mlp": 0.01042485, + "balance_loss_clip": 1.0319593, + "balance_loss_mlp": 1.02886534, + "epoch": 0.9669622726589508, + "flos": 29825032901760.0, + "grad_norm": 1.870561288505191, + "language_loss": 0.76813722, + "learning_rate": 1.141827483932789e-08, + "loss": 0.78921711, + "num_input_tokens_seen": 347022975, + "step": 16083, + "time_per_iteration": 2.736612558364868 + }, + { + "auxiliary_loss_clip": 0.01061787, + "auxiliary_loss_mlp": 0.01033012, + "balance_loss_clip": 1.03434837, + "balance_loss_mlp": 1.0202508, + "epoch": 0.9670223959116189, + "flos": 22922499018240.0, + "grad_norm": 2.1852037151642203, + "language_loss": 0.79155672, + "learning_rate": 1.1376755538961669e-08, + "loss": 0.81250471, + "num_input_tokens_seen": 347038780, + "step": 16084, + "time_per_iteration": 2.7562255859375 + }, + { + "auxiliary_loss_clip": 0.01101094, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.03601408, + "balance_loss_mlp": 1.01673627, + "epoch": 0.9670825191642868, + "flos": 18624495283200.0, + "grad_norm": 2.1888422828676655, + "language_loss": 0.6779865, + "learning_rate": 1.1335311647387991e-08, + "loss": 0.69929618, + "num_input_tokens_seen": 347056705, + "step": 16085, + "time_per_iteration": 2.5915327072143555 + }, + { + "auxiliary_loss_clip": 0.01089717, + "auxiliary_loss_mlp": 0.01031896, + "balance_loss_clip": 1.03720474, + "balance_loss_mlp": 1.01825213, + "epoch": 0.9671426424169548, + "flos": 24497936513280.0, + "grad_norm": 2.1661579345086097, + "language_loss": 0.69027126, + "learning_rate": 1.1293943166178709e-08, + "loss": 0.71148735, + "num_input_tokens_seen": 347075710, + "step": 16086, + "time_per_iteration": 2.6948018074035645 + }, + { + "auxiliary_loss_clip": 0.01095229, + "auxiliary_loss_mlp": 0.01033969, + "balance_loss_clip": 1.03497195, + "balance_loss_mlp": 1.02086806, + "epoch": 0.9672027656696227, + "flos": 20371189847040.0, + "grad_norm": 1.6309967969700185, + "language_loss": 0.78254652, + "learning_rate": 1.125265009690235e-08, + "loss": 0.80383849, + "num_input_tokens_seen": 347092325, + "step": 16087, + "time_per_iteration": 2.638317346572876 + }, + { + "auxiliary_loss_clip": 0.0107816, + "auxiliary_loss_mlp": 0.01029378, + "balance_loss_clip": 1.0323391, + "balance_loss_mlp": 1.0173974, + "epoch": 0.9672628889222907, + "flos": 18880179269760.0, + "grad_norm": 1.8693958793677588, + "language_loss": 0.71220851, + "learning_rate": 1.1211432441124769e-08, + "loss": 0.73328388, + "num_input_tokens_seen": 347110595, + "step": 16088, + "time_per_iteration": 2.7119359970092773 + }, + { + "auxiliary_loss_clip": 0.01105883, + "auxiliary_loss_mlp": 0.00770003, + "balance_loss_clip": 1.03694296, + "balance_loss_mlp": 1.00009024, + "epoch": 0.9673230121749586, + "flos": 28695247447680.0, + "grad_norm": 1.6915080049875915, + "language_loss": 0.70655894, + "learning_rate": 1.117029020040916e-08, + "loss": 0.72531772, + "num_input_tokens_seen": 347131625, + "step": 16089, + "time_per_iteration": 2.5807154178619385 + }, + { + "auxiliary_loss_clip": 0.01110035, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.03722262, + "balance_loss_mlp": 1.02004623, + "epoch": 0.9673831354276267, + "flos": 20484452407680.0, + "grad_norm": 2.217046221899868, + "language_loss": 0.7484473, + "learning_rate": 1.1129223376315167e-08, + "loss": 0.76987088, + "num_input_tokens_seen": 347147910, + "step": 16090, + "time_per_iteration": 2.5390427112579346 + }, + { + "auxiliary_loss_clip": 0.01087487, + "auxiliary_loss_mlp": 0.0102933, + "balance_loss_clip": 1.03507531, + "balance_loss_mlp": 1.01677692, + "epoch": 0.9674432586802946, + "flos": 26797548107520.0, + "grad_norm": 2.2040190235185158, + "language_loss": 0.69111538, + "learning_rate": 1.1088231970400653e-08, + "loss": 0.71228355, + "num_input_tokens_seen": 347168805, + "step": 16091, + "time_per_iteration": 2.672116279602051 + }, + { + "auxiliary_loss_clip": 0.01106741, + "auxiliary_loss_mlp": 0.01031458, + "balance_loss_clip": 1.03664362, + "balance_loss_mlp": 1.01816666, + "epoch": 0.9675033819329626, + "flos": 22310941034880.0, + "grad_norm": 1.7246952798155581, + "language_loss": 0.76974177, + "learning_rate": 1.1047315984219484e-08, + "loss": 0.79112375, + "num_input_tokens_seen": 347189455, + "step": 16092, + "time_per_iteration": 2.562080144882202 + }, + { + "auxiliary_loss_clip": 0.01107911, + "auxiliary_loss_mlp": 0.01030137, + "balance_loss_clip": 1.0373435, + "balance_loss_mlp": 1.01862192, + "epoch": 0.9675635051856305, + "flos": 12675713276160.0, + "grad_norm": 1.9074028879734577, + "language_loss": 0.76118815, + "learning_rate": 1.1006475419323313e-08, + "loss": 0.78256863, + "num_input_tokens_seen": 347206030, + "step": 16093, + "time_per_iteration": 2.5782711505889893 + }, + { + "auxiliary_loss_clip": 0.01083204, + "auxiliary_loss_mlp": 0.01028136, + "balance_loss_clip": 1.03609204, + "balance_loss_mlp": 1.01477861, + "epoch": 0.9676236284382985, + "flos": 24608469640320.0, + "grad_norm": 1.8185482437095273, + "language_loss": 0.68996257, + "learning_rate": 1.096571027726112e-08, + "loss": 0.71107602, + "num_input_tokens_seen": 347226250, + "step": 16094, + "time_per_iteration": 2.642312526702881 + }, + { + "auxiliary_loss_clip": 0.01099843, + "auxiliary_loss_mlp": 0.01031519, + "balance_loss_clip": 1.03670728, + "balance_loss_mlp": 1.01940703, + "epoch": 0.9676837516909664, + "flos": 23367145478400.0, + "grad_norm": 3.1538628444307912, + "language_loss": 0.7587145, + "learning_rate": 1.0925020559578557e-08, + "loss": 0.7800281, + "num_input_tokens_seen": 347247350, + "step": 16095, + "time_per_iteration": 2.6397533416748047 + }, + { + "auxiliary_loss_clip": 0.01114159, + "auxiliary_loss_mlp": 0.01035616, + "balance_loss_clip": 1.03943849, + "balance_loss_mlp": 1.0225327, + "epoch": 0.9677438749436345, + "flos": 20486894532480.0, + "grad_norm": 3.4142773987990513, + "language_loss": 0.70483637, + "learning_rate": 1.0884406267818392e-08, + "loss": 0.7263341, + "num_input_tokens_seen": 347266870, + "step": 16096, + "time_per_iteration": 2.571568727493286 + }, + { + "auxiliary_loss_clip": 0.01086881, + "auxiliary_loss_mlp": 0.01026382, + "balance_loss_clip": 1.03495574, + "balance_loss_mlp": 1.01391292, + "epoch": 0.9678039981963025, + "flos": 47555889719040.0, + "grad_norm": 1.7358126863243992, + "language_loss": 0.7179426, + "learning_rate": 1.0843867403520946e-08, + "loss": 0.73907518, + "num_input_tokens_seen": 347290120, + "step": 16097, + "time_per_iteration": 2.8643288612365723 + }, + { + "auxiliary_loss_clip": 0.01107467, + "auxiliary_loss_mlp": 0.01035068, + "balance_loss_clip": 1.03668487, + "balance_loss_mlp": 1.02286124, + "epoch": 0.9678641214489704, + "flos": 25040474513280.0, + "grad_norm": 1.9803265483631816, + "language_loss": 0.78437316, + "learning_rate": 1.0803403968223434e-08, + "loss": 0.80579853, + "num_input_tokens_seen": 347308785, + "step": 16098, + "time_per_iteration": 2.5864875316619873 + }, + { + "auxiliary_loss_clip": 0.0107379, + "auxiliary_loss_mlp": 0.01027915, + "balance_loss_clip": 1.03619742, + "balance_loss_mlp": 1.01629841, + "epoch": 0.9679242447016384, + "flos": 19240937516160.0, + "grad_norm": 2.361712995723687, + "language_loss": 0.90639651, + "learning_rate": 1.0763015963459965e-08, + "loss": 0.92741358, + "num_input_tokens_seen": 347326375, + "step": 16099, + "time_per_iteration": 2.786999464035034 + }, + { + "auxiliary_loss_clip": 0.01100177, + "auxiliary_loss_mlp": 0.01034182, + "balance_loss_clip": 1.03700566, + "balance_loss_mlp": 1.02131963, + "epoch": 0.9679843679543063, + "flos": 33254681345280.0, + "grad_norm": 1.5241755755242299, + "language_loss": 0.66061008, + "learning_rate": 1.0722703390762643e-08, + "loss": 0.68195367, + "num_input_tokens_seen": 347348250, + "step": 16100, + "time_per_iteration": 2.6941099166870117 + }, + { + "auxiliary_loss_clip": 0.01069319, + "auxiliary_loss_mlp": 0.01035758, + "balance_loss_clip": 1.03754771, + "balance_loss_mlp": 1.02276969, + "epoch": 0.9680444912069743, + "flos": 22783633038720.0, + "grad_norm": 1.6628994278317477, + "language_loss": 0.73592603, + "learning_rate": 1.0682466251659584e-08, + "loss": 0.75697684, + "num_input_tokens_seen": 347367400, + "step": 16101, + "time_per_iteration": 2.6911606788635254 + }, + { + "auxiliary_loss_clip": 0.01085079, + "auxiliary_loss_mlp": 0.01031615, + "balance_loss_clip": 1.03593111, + "balance_loss_mlp": 1.01842427, + "epoch": 0.9681046144596422, + "flos": 24024095274240.0, + "grad_norm": 1.6107351715516067, + "language_loss": 0.73375201, + "learning_rate": 1.0642304547676672e-08, + "loss": 0.75491893, + "num_input_tokens_seen": 347387600, + "step": 16102, + "time_per_iteration": 2.6399521827697754 + }, + { + "auxiliary_loss_clip": 0.01076768, + "auxiliary_loss_mlp": 0.01035627, + "balance_loss_clip": 1.04000163, + "balance_loss_mlp": 1.02195942, + "epoch": 0.9681647377123103, + "flos": 23441013797760.0, + "grad_norm": 1.9528459851096875, + "language_loss": 0.77444363, + "learning_rate": 1.0602218280337139e-08, + "loss": 0.79556757, + "num_input_tokens_seen": 347406915, + "step": 16103, + "time_per_iteration": 2.7056915760040283 + }, + { + "auxiliary_loss_clip": 0.01086653, + "auxiliary_loss_mlp": 0.0103195, + "balance_loss_clip": 1.03456104, + "balance_loss_mlp": 1.0200057, + "epoch": 0.9682248609649782, + "flos": 22675075159680.0, + "grad_norm": 1.639893337105475, + "language_loss": 0.80586064, + "learning_rate": 1.0562207451160655e-08, + "loss": 0.82704663, + "num_input_tokens_seen": 347425140, + "step": 16104, + "time_per_iteration": 2.648461103439331 + }, + { + "auxiliary_loss_clip": 0.01088229, + "auxiliary_loss_mlp": 0.01035055, + "balance_loss_clip": 1.03242385, + "balance_loss_mlp": 1.02403986, + "epoch": 0.9682849842176462, + "flos": 24428413739520.0, + "grad_norm": 1.4906398802277745, + "language_loss": 0.77576089, + "learning_rate": 1.0522272061664672e-08, + "loss": 0.79699373, + "num_input_tokens_seen": 347446350, + "step": 16105, + "time_per_iteration": 2.6988043785095215 + }, + { + "auxiliary_loss_clip": 0.01000224, + "auxiliary_loss_mlp": 0.01003466, + "balance_loss_clip": 1.00602651, + "balance_loss_mlp": 1.00240505, + "epoch": 0.9683451074703141, + "flos": 59995132784640.0, + "grad_norm": 0.8146584091458852, + "language_loss": 0.56716478, + "learning_rate": 1.0482412113363536e-08, + "loss": 0.58720171, + "num_input_tokens_seen": 347510135, + "step": 16106, + "time_per_iteration": 3.2270421981811523 + }, + { + "auxiliary_loss_clip": 0.01008919, + "auxiliary_loss_mlp": 0.0100775, + "balance_loss_clip": 1.01353073, + "balance_loss_mlp": 1.00654626, + "epoch": 0.9684052307229821, + "flos": 52696145514240.0, + "grad_norm": 0.9121301732264848, + "language_loss": 0.61534059, + "learning_rate": 1.0442627607768707e-08, + "loss": 0.63550723, + "num_input_tokens_seen": 347562505, + "step": 16107, + "time_per_iteration": 3.1101765632629395 + }, + { + "auxiliary_loss_clip": 0.01098789, + "auxiliary_loss_mlp": 0.01035497, + "balance_loss_clip": 1.03623629, + "balance_loss_mlp": 1.02143073, + "epoch": 0.96846535397565, + "flos": 22783848520320.0, + "grad_norm": 2.462094722606181, + "language_loss": 0.74264908, + "learning_rate": 1.040291854638875e-08, + "loss": 0.76399195, + "num_input_tokens_seen": 347579150, + "step": 16108, + "time_per_iteration": 2.743326187133789 + }, + { + "auxiliary_loss_clip": 0.01093024, + "auxiliary_loss_mlp": 0.01027661, + "balance_loss_clip": 1.03480208, + "balance_loss_mlp": 1.01471508, + "epoch": 0.968525477228318, + "flos": 23323980309120.0, + "grad_norm": 2.296731755168933, + "language_loss": 0.56901729, + "learning_rate": 1.0363284930729576e-08, + "loss": 0.59022415, + "num_input_tokens_seen": 347596705, + "step": 16109, + "time_per_iteration": 5.880841255187988 + }, + { + "auxiliary_loss_clip": 0.01018006, + "auxiliary_loss_mlp": 0.01003432, + "balance_loss_clip": 1.0045774, + "balance_loss_mlp": 1.00251389, + "epoch": 0.9685856004809861, + "flos": 67882947707520.0, + "grad_norm": 0.6721206293032698, + "language_loss": 0.54183471, + "learning_rate": 1.0323726762294205e-08, + "loss": 0.56204915, + "num_input_tokens_seen": 347661870, + "step": 16110, + "time_per_iteration": 3.1392929553985596 + }, + { + "auxiliary_loss_clip": 0.01040803, + "auxiliary_loss_mlp": 0.01042377, + "balance_loss_clip": 1.03240716, + "balance_loss_mlp": 1.02792311, + "epoch": 0.968645723733654, + "flos": 33947900899200.0, + "grad_norm": 1.336975675669519, + "language_loss": 0.62198687, + "learning_rate": 1.0284244042582325e-08, + "loss": 0.64281869, + "num_input_tokens_seen": 347684295, + "step": 16111, + "time_per_iteration": 4.477367401123047 + }, + { + "auxiliary_loss_clip": 0.01084355, + "auxiliary_loss_mlp": 0.01029493, + "balance_loss_clip": 1.0346024, + "balance_loss_mlp": 1.01831102, + "epoch": 0.968705846986322, + "flos": 18551488890240.0, + "grad_norm": 1.8865919253091008, + "language_loss": 0.74626237, + "learning_rate": 1.024483677309118e-08, + "loss": 0.76740086, + "num_input_tokens_seen": 347702585, + "step": 16112, + "time_per_iteration": 2.6802995204925537 + }, + { + "auxiliary_loss_clip": 0.01096094, + "auxiliary_loss_mlp": 0.01029113, + "balance_loss_clip": 1.03605258, + "balance_loss_mlp": 1.01711464, + "epoch": 0.9687659702389899, + "flos": 17420913336960.0, + "grad_norm": 3.1829431624893, + "language_loss": 0.66342431, + "learning_rate": 1.020550495531558e-08, + "loss": 0.68467641, + "num_input_tokens_seen": 347721810, + "step": 16113, + "time_per_iteration": 2.6158058643341064 + }, + { + "auxiliary_loss_clip": 0.01016205, + "auxiliary_loss_mlp": 0.01002001, + "balance_loss_clip": 1.00489581, + "balance_loss_mlp": 1.00113058, + "epoch": 0.9688260934916579, + "flos": 62047176865920.0, + "grad_norm": 0.7821411345284778, + "language_loss": 0.56506634, + "learning_rate": 1.0166248590746329e-08, + "loss": 0.58524841, + "num_input_tokens_seen": 347782330, + "step": 16114, + "time_per_iteration": 3.127088785171509 + }, + { + "auxiliary_loss_clip": 0.01081645, + "auxiliary_loss_mlp": 0.01038103, + "balance_loss_clip": 1.03432035, + "balance_loss_mlp": 1.02492452, + "epoch": 0.9688862167443258, + "flos": 15076520461440.0, + "grad_norm": 1.9569215626202732, + "language_loss": 0.82965726, + "learning_rate": 1.0127067680872458e-08, + "loss": 0.85085475, + "num_input_tokens_seen": 347794835, + "step": 16115, + "time_per_iteration": 4.220075607299805 + }, + { + "auxiliary_loss_clip": 0.01092985, + "auxiliary_loss_mlp": 0.0102895, + "balance_loss_clip": 1.03631961, + "balance_loss_mlp": 1.01743448, + "epoch": 0.9689463399969939, + "flos": 19938215306880.0, + "grad_norm": 1.7648967305379544, + "language_loss": 0.72280598, + "learning_rate": 1.0087962227179448e-08, + "loss": 0.74402535, + "num_input_tokens_seen": 347814320, + "step": 16116, + "time_per_iteration": 2.603519916534424 + }, + { + "auxiliary_loss_clip": 0.01068294, + "auxiliary_loss_mlp": 0.01034519, + "balance_loss_clip": 1.03542447, + "balance_loss_mlp": 1.02141237, + "epoch": 0.9690064632496618, + "flos": 19573039687680.0, + "grad_norm": 2.086312122853078, + "language_loss": 0.75657129, + "learning_rate": 1.0048932231150553e-08, + "loss": 0.77759945, + "num_input_tokens_seen": 347832125, + "step": 16117, + "time_per_iteration": 2.6519157886505127 + }, + { + "auxiliary_loss_clip": 0.01109753, + "auxiliary_loss_mlp": 0.01030569, + "balance_loss_clip": 1.03619337, + "balance_loss_mlp": 1.01758695, + "epoch": 0.9690665865023298, + "flos": 21872292145920.0, + "grad_norm": 2.3448073541677275, + "language_loss": 0.77482766, + "learning_rate": 1.000997769426548e-08, + "loss": 0.79623091, + "num_input_tokens_seen": 347850765, + "step": 16118, + "time_per_iteration": 2.5268216133117676 + }, + { + "auxiliary_loss_clip": 0.0108528, + "auxiliary_loss_mlp": 0.00771043, + "balance_loss_clip": 1.03405607, + "balance_loss_mlp": 1.00030315, + "epoch": 0.9691267097549977, + "flos": 20994491577600.0, + "grad_norm": 1.8097325369712165, + "language_loss": 0.78219616, + "learning_rate": 9.971098618001272e-09, + "loss": 0.80075938, + "num_input_tokens_seen": 347870125, + "step": 16119, + "time_per_iteration": 2.629453659057617 + }, + { + "auxiliary_loss_clip": 0.01056904, + "auxiliary_loss_mlp": 0.01034502, + "balance_loss_clip": 1.03209758, + "balance_loss_mlp": 1.0223546, + "epoch": 0.9691868330076657, + "flos": 24279132816000.0, + "grad_norm": 1.885470946971698, + "language_loss": 0.75497305, + "learning_rate": 9.932295003832747e-09, + "loss": 0.77588713, + "num_input_tokens_seen": 347890615, + "step": 16120, + "time_per_iteration": 2.746344566345215 + }, + { + "auxiliary_loss_clip": 0.01097943, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.03581011, + "balance_loss_mlp": 1.02084804, + "epoch": 0.9692469562603336, + "flos": 17675699483520.0, + "grad_norm": 1.8693618447103497, + "language_loss": 0.70098805, + "learning_rate": 9.89356685323095e-09, + "loss": 0.72229403, + "num_input_tokens_seen": 347908685, + "step": 16121, + "time_per_iteration": 2.5618736743927 + }, + { + "auxiliary_loss_clip": 0.01094421, + "auxiliary_loss_mlp": 0.01032945, + "balance_loss_clip": 1.03476155, + "balance_loss_mlp": 1.02091098, + "epoch": 0.9693070795130017, + "flos": 26834392483200.0, + "grad_norm": 1.8372756092604514, + "language_loss": 0.69241065, + "learning_rate": 9.854914167664486e-09, + "loss": 0.71368432, + "num_input_tokens_seen": 347926385, + "step": 16122, + "time_per_iteration": 2.56386661529541 + }, + { + "auxiliary_loss_clip": 0.01066781, + "auxiliary_loss_mlp": 0.0103272, + "balance_loss_clip": 1.0308547, + "balance_loss_mlp": 1.02011967, + "epoch": 0.9693672027656697, + "flos": 18077288515200.0, + "grad_norm": 2.0146395561935058, + "language_loss": 0.7544961, + "learning_rate": 9.81633694859907e-09, + "loss": 0.77549112, + "num_input_tokens_seen": 347945290, + "step": 16123, + "time_per_iteration": 2.6407599449157715 + }, + { + "auxiliary_loss_clip": 0.01072153, + "auxiliary_loss_mlp": 0.01038605, + "balance_loss_clip": 1.03460908, + "balance_loss_mlp": 1.02459204, + "epoch": 0.9694273260183376, + "flos": 21763015994880.0, + "grad_norm": 1.5149029001764542, + "language_loss": 0.74644059, + "learning_rate": 9.777835197497753e-09, + "loss": 0.7675482, + "num_input_tokens_seen": 347966330, + "step": 16124, + "time_per_iteration": 2.671185255050659 + }, + { + "auxiliary_loss_clip": 0.01098188, + "auxiliary_loss_mlp": 0.01035267, + "balance_loss_clip": 1.0364728, + "balance_loss_mlp": 1.02335227, + "epoch": 0.9694874492710056, + "flos": 24426115269120.0, + "grad_norm": 2.520792760553443, + "language_loss": 0.74161977, + "learning_rate": 9.739408915820258e-09, + "loss": 0.76295435, + "num_input_tokens_seen": 347982590, + "step": 16125, + "time_per_iteration": 2.6353843212127686 + }, + { + "auxiliary_loss_clip": 0.01019443, + "auxiliary_loss_mlp": 0.01000194, + "balance_loss_clip": 1.00674295, + "balance_loss_mlp": 0.99920446, + "epoch": 0.9695475725236735, + "flos": 67650748237440.0, + "grad_norm": 0.8991349506597905, + "language_loss": 0.61446786, + "learning_rate": 9.70105810502364e-09, + "loss": 0.63466424, + "num_input_tokens_seen": 348043310, + "step": 16126, + "time_per_iteration": 3.190199851989746 + }, + { + "auxiliary_loss_clip": 0.01097272, + "auxiliary_loss_mlp": 0.01035813, + "balance_loss_clip": 1.03880358, + "balance_loss_mlp": 1.02390397, + "epoch": 0.9696076957763415, + "flos": 19129326981120.0, + "grad_norm": 1.964418438296789, + "language_loss": 0.75083786, + "learning_rate": 9.662782766562738e-09, + "loss": 0.77216876, + "num_input_tokens_seen": 348062200, + "step": 16127, + "time_per_iteration": 2.6186792850494385 + }, + { + "auxiliary_loss_clip": 0.01063108, + "auxiliary_loss_mlp": 0.01033166, + "balance_loss_clip": 1.03249013, + "balance_loss_mlp": 1.02036893, + "epoch": 0.9696678190290094, + "flos": 15486836497920.0, + "grad_norm": 1.6000021312142574, + "language_loss": 0.69262868, + "learning_rate": 9.62458290188839e-09, + "loss": 0.71359146, + "num_input_tokens_seen": 348080685, + "step": 16128, + "time_per_iteration": 2.6917450428009033 + }, + { + "auxiliary_loss_clip": 0.01076173, + "auxiliary_loss_mlp": 0.0103545, + "balance_loss_clip": 1.03701282, + "balance_loss_mlp": 1.02326083, + "epoch": 0.9697279422816775, + "flos": 36208692869760.0, + "grad_norm": 1.6481386717416904, + "language_loss": 0.65212297, + "learning_rate": 9.586458512449213e-09, + "loss": 0.67323917, + "num_input_tokens_seen": 348102500, + "step": 16129, + "time_per_iteration": 2.761218309402466 + }, + { + "auxiliary_loss_clip": 0.01076577, + "auxiliary_loss_mlp": 0.01032504, + "balance_loss_clip": 1.03635514, + "balance_loss_mlp": 1.01933169, + "epoch": 0.9697880655343454, + "flos": 25484007651840.0, + "grad_norm": 2.2154494130728852, + "language_loss": 0.6313777, + "learning_rate": 9.548409599691166e-09, + "loss": 0.6524685, + "num_input_tokens_seen": 348122515, + "step": 16130, + "time_per_iteration": 2.6841318607330322 + }, + { + "auxiliary_loss_clip": 0.01098965, + "auxiliary_loss_mlp": 0.01031057, + "balance_loss_clip": 1.0350318, + "balance_loss_mlp": 1.01859963, + "epoch": 0.9698481887870134, + "flos": 15333533251200.0, + "grad_norm": 2.2812543570754005, + "language_loss": 0.69271004, + "learning_rate": 9.510436165056867e-09, + "loss": 0.7140103, + "num_input_tokens_seen": 348138775, + "step": 16131, + "time_per_iteration": 2.5763492584228516 + }, + { + "auxiliary_loss_clip": 0.0110919, + "auxiliary_loss_mlp": 0.00770076, + "balance_loss_clip": 1.03628075, + "balance_loss_mlp": 1.00023901, + "epoch": 0.9699083120396813, + "flos": 21982250655360.0, + "grad_norm": 1.8419150080244562, + "language_loss": 0.76590043, + "learning_rate": 9.472538209986058e-09, + "loss": 0.78469312, + "num_input_tokens_seen": 348157115, + "step": 16132, + "time_per_iteration": 2.563215732574463 + }, + { + "auxiliary_loss_clip": 0.01075956, + "auxiliary_loss_mlp": 0.01038228, + "balance_loss_clip": 1.03480387, + "balance_loss_mlp": 1.02540684, + "epoch": 0.9699684352923493, + "flos": 15664055224320.0, + "grad_norm": 2.851724008499009, + "language_loss": 0.79010421, + "learning_rate": 9.434715735916477e-09, + "loss": 0.81124604, + "num_input_tokens_seen": 348173035, + "step": 16133, + "time_per_iteration": 2.623619794845581 + }, + { + "auxiliary_loss_clip": 0.01078402, + "auxiliary_loss_mlp": 0.01028089, + "balance_loss_clip": 1.03522897, + "balance_loss_mlp": 1.01685965, + "epoch": 0.9700285585450172, + "flos": 21908382336000.0, + "grad_norm": 2.3483627644840444, + "language_loss": 0.64470112, + "learning_rate": 9.396968744281863e-09, + "loss": 0.66576606, + "num_input_tokens_seen": 348192960, + "step": 16134, + "time_per_iteration": 2.6657004356384277 + }, + { + "auxiliary_loss_clip": 0.01083734, + "auxiliary_loss_mlp": 0.01032972, + "balance_loss_clip": 1.03266311, + "balance_loss_mlp": 1.01973999, + "epoch": 0.9700886817976853, + "flos": 23914890950400.0, + "grad_norm": 1.8798527935954052, + "language_loss": 0.80912268, + "learning_rate": 9.359297236513519e-09, + "loss": 0.83028972, + "num_input_tokens_seen": 348212805, + "step": 16135, + "time_per_iteration": 2.744619131088257 + }, + { + "auxiliary_loss_clip": 0.01099551, + "auxiliary_loss_mlp": 0.01032267, + "balance_loss_clip": 1.03586113, + "balance_loss_mlp": 1.01880264, + "epoch": 0.9701488050503532, + "flos": 25447845634560.0, + "grad_norm": 1.8400261223826226, + "language_loss": 0.7311669, + "learning_rate": 9.321701214040079e-09, + "loss": 0.7524851, + "num_input_tokens_seen": 348232900, + "step": 16136, + "time_per_iteration": 2.6270158290863037 + }, + { + "auxiliary_loss_clip": 0.01106517, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.03631723, + "balance_loss_mlp": 1.02158737, + "epoch": 0.9702089283030212, + "flos": 20590855470720.0, + "grad_norm": 1.723008357652219, + "language_loss": 0.7604568, + "learning_rate": 9.28418067828729e-09, + "loss": 0.78184789, + "num_input_tokens_seen": 348253065, + "step": 16137, + "time_per_iteration": 2.611590623855591 + }, + { + "auxiliary_loss_clip": 0.0099169, + "auxiliary_loss_mlp": 0.01002259, + "balance_loss_clip": 1.01290679, + "balance_loss_mlp": 1.00113201, + "epoch": 0.9702690515556892, + "flos": 70651516291200.0, + "grad_norm": 0.7712451352581947, + "language_loss": 0.54897171, + "learning_rate": 9.246735630678015e-09, + "loss": 0.56891119, + "num_input_tokens_seen": 348316075, + "step": 16138, + "time_per_iteration": 3.3687798976898193 + }, + { + "auxiliary_loss_clip": 0.01087536, + "auxiliary_loss_mlp": 0.01031679, + "balance_loss_clip": 1.03544235, + "balance_loss_mlp": 1.02002001, + "epoch": 0.9703291748083571, + "flos": 35881439034240.0, + "grad_norm": 1.941978950715942, + "language_loss": 0.7094661, + "learning_rate": 9.209366072632007e-09, + "loss": 0.73065829, + "num_input_tokens_seen": 348337605, + "step": 16139, + "time_per_iteration": 2.725593328475952 + }, + { + "auxiliary_loss_clip": 0.01100195, + "auxiliary_loss_mlp": 0.01032609, + "balance_loss_clip": 1.03781474, + "balance_loss_mlp": 1.01973999, + "epoch": 0.9703892980610251, + "flos": 24316479982080.0, + "grad_norm": 1.5269759850750149, + "language_loss": 0.72774076, + "learning_rate": 9.172072005566134e-09, + "loss": 0.7490688, + "num_input_tokens_seen": 348359430, + "step": 16140, + "time_per_iteration": 2.6335747241973877 + }, + { + "auxiliary_loss_clip": 0.01102225, + "auxiliary_loss_mlp": 0.00771179, + "balance_loss_clip": 1.03837323, + "balance_loss_mlp": 1.00030136, + "epoch": 0.970449421313693, + "flos": 18003743418240.0, + "grad_norm": 2.2771543266487586, + "language_loss": 0.67710316, + "learning_rate": 9.13485343089504e-09, + "loss": 0.6958372, + "num_input_tokens_seen": 348377890, + "step": 16141, + "time_per_iteration": 2.588693141937256 + }, + { + "auxiliary_loss_clip": 0.01093094, + "auxiliary_loss_mlp": 0.01033621, + "balance_loss_clip": 1.03493285, + "balance_loss_mlp": 1.02134275, + "epoch": 0.9705095445663611, + "flos": 25337994865920.0, + "grad_norm": 2.0049530138805856, + "language_loss": 0.69002879, + "learning_rate": 9.097710350029597e-09, + "loss": 0.71129596, + "num_input_tokens_seen": 348396550, + "step": 16142, + "time_per_iteration": 2.727897882461548 + }, + { + "auxiliary_loss_clip": 0.01052884, + "auxiliary_loss_mlp": 0.01032081, + "balance_loss_clip": 1.03308058, + "balance_loss_mlp": 1.01940298, + "epoch": 0.970569667819029, + "flos": 26833602384000.0, + "grad_norm": 1.764320667349442, + "language_loss": 0.55796802, + "learning_rate": 9.060642764378457e-09, + "loss": 0.57881761, + "num_input_tokens_seen": 348417120, + "step": 16143, + "time_per_iteration": 2.7790820598602295 + }, + { + "auxiliary_loss_clip": 0.01097025, + "auxiliary_loss_mlp": 0.01033348, + "balance_loss_clip": 1.0362649, + "balance_loss_mlp": 1.02201712, + "epoch": 0.970629791071697, + "flos": 25848644567040.0, + "grad_norm": 2.1311740509263157, + "language_loss": 0.67920631, + "learning_rate": 9.023650675347382e-09, + "loss": 0.70051003, + "num_input_tokens_seen": 348437750, + "step": 16144, + "time_per_iteration": 2.6120004653930664 + }, + { + "auxiliary_loss_clip": 0.01096108, + "auxiliary_loss_mlp": 0.0103709, + "balance_loss_clip": 1.03683603, + "balance_loss_mlp": 1.0254854, + "epoch": 0.9706899143243649, + "flos": 36540184510080.0, + "grad_norm": 1.6337482713348195, + "language_loss": 0.71880758, + "learning_rate": 8.986734084339253e-09, + "loss": 0.74013954, + "num_input_tokens_seen": 348460935, + "step": 16145, + "time_per_iteration": 2.7305266857147217 + }, + { + "auxiliary_loss_clip": 0.0108585, + "auxiliary_loss_mlp": 0.0102977, + "balance_loss_clip": 1.03421783, + "balance_loss_mlp": 1.01635957, + "epoch": 0.9707500375770329, + "flos": 12268234414080.0, + "grad_norm": 5.028438763995754, + "language_loss": 0.80458283, + "learning_rate": 8.949892992753395e-09, + "loss": 0.82573903, + "num_input_tokens_seen": 348474480, + "step": 16146, + "time_per_iteration": 2.6035280227661133 + }, + { + "auxiliary_loss_clip": 0.00997894, + "auxiliary_loss_mlp": 0.0100175, + "balance_loss_clip": 1.00757813, + "balance_loss_mlp": 1.00062394, + "epoch": 0.9708101608297008, + "flos": 60853040196480.0, + "grad_norm": 0.7531682380125572, + "language_loss": 0.54495502, + "learning_rate": 8.91312740198713e-09, + "loss": 0.56495154, + "num_input_tokens_seen": 348541220, + "step": 16147, + "time_per_iteration": 3.225588798522949 + }, + { + "auxiliary_loss_clip": 0.01073097, + "auxiliary_loss_mlp": 0.00771677, + "balance_loss_clip": 1.03335106, + "balance_loss_mlp": 1.00021338, + "epoch": 0.9708702840823689, + "flos": 27124766029440.0, + "grad_norm": 3.20403684561858, + "language_loss": 0.61148691, + "learning_rate": 8.876437313434682e-09, + "loss": 0.62993467, + "num_input_tokens_seen": 348559230, + "step": 16148, + "time_per_iteration": 4.195791482925415 + }, + { + "auxiliary_loss_clip": 0.01070921, + "auxiliary_loss_mlp": 0.01038429, + "balance_loss_clip": 1.03563893, + "balance_loss_mlp": 1.02597761, + "epoch": 0.9709304073350368, + "flos": 20777699041920.0, + "grad_norm": 1.6574498866467469, + "language_loss": 0.73563087, + "learning_rate": 8.839822728487155e-09, + "loss": 0.75672436, + "num_input_tokens_seen": 348577850, + "step": 16149, + "time_per_iteration": 4.327805519104004 + }, + { + "auxiliary_loss_clip": 0.01096097, + "auxiliary_loss_mlp": 0.01036533, + "balance_loss_clip": 1.03510022, + "balance_loss_mlp": 1.02391517, + "epoch": 0.9709905305877048, + "flos": 41934541115520.0, + "grad_norm": 2.151336781292665, + "language_loss": 0.75191128, + "learning_rate": 8.803283648533222e-09, + "loss": 0.77323759, + "num_input_tokens_seen": 348598345, + "step": 16150, + "time_per_iteration": 4.396034479141235 + }, + { + "auxiliary_loss_clip": 0.0109299, + "auxiliary_loss_mlp": 0.01030493, + "balance_loss_clip": 1.03820729, + "balance_loss_mlp": 1.01590753, + "epoch": 0.9710506538403728, + "flos": 17165588486400.0, + "grad_norm": 1.9672912428051808, + "language_loss": 0.73628724, + "learning_rate": 8.766820074958214e-09, + "loss": 0.75752205, + "num_input_tokens_seen": 348616300, + "step": 16151, + "time_per_iteration": 2.6692330837249756 + }, + { + "auxiliary_loss_clip": 0.0109559, + "auxiliary_loss_mlp": 0.01028623, + "balance_loss_clip": 1.03646886, + "balance_loss_mlp": 1.01655281, + "epoch": 0.9711107770930407, + "flos": 21173470070400.0, + "grad_norm": 2.2868567232439787, + "language_loss": 0.74524468, + "learning_rate": 8.730432009145027e-09, + "loss": 0.76648676, + "num_input_tokens_seen": 348633845, + "step": 16152, + "time_per_iteration": 2.639920473098755 + }, + { + "auxiliary_loss_clip": 0.0107224, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.03668654, + "balance_loss_mlp": 1.02151465, + "epoch": 0.9711709003457087, + "flos": 22237072715520.0, + "grad_norm": 1.850919590804903, + "language_loss": 0.67173874, + "learning_rate": 8.694119452473448e-09, + "loss": 0.69279528, + "num_input_tokens_seen": 348653070, + "step": 16153, + "time_per_iteration": 2.69380521774292 + }, + { + "auxiliary_loss_clip": 0.01048504, + "auxiliary_loss_mlp": 0.01029289, + "balance_loss_clip": 1.03318441, + "balance_loss_mlp": 1.01809549, + "epoch": 0.9712310235983767, + "flos": 26213856099840.0, + "grad_norm": 13.061634148388642, + "language_loss": 0.70930749, + "learning_rate": 8.65788240632037e-09, + "loss": 0.73008543, + "num_input_tokens_seen": 348672145, + "step": 16154, + "time_per_iteration": 4.310068607330322 + }, + { + "auxiliary_loss_clip": 0.01063978, + "auxiliary_loss_mlp": 0.01032173, + "balance_loss_clip": 1.04066324, + "balance_loss_mlp": 1.01833844, + "epoch": 0.9712911468510447, + "flos": 20668171495680.0, + "grad_norm": 1.6587681231692977, + "language_loss": 0.80700165, + "learning_rate": 8.621720872059812e-09, + "loss": 0.82796311, + "num_input_tokens_seen": 348690615, + "step": 16155, + "time_per_iteration": 2.7987523078918457 + }, + { + "auxiliary_loss_clip": 0.01098298, + "auxiliary_loss_mlp": 0.00771693, + "balance_loss_clip": 1.03783345, + "balance_loss_mlp": 1.00030363, + "epoch": 0.9713512701037126, + "flos": 13552903313280.0, + "grad_norm": 1.945476752267927, + "language_loss": 0.6769433, + "learning_rate": 8.58563485106334e-09, + "loss": 0.69564319, + "num_input_tokens_seen": 348708665, + "step": 16156, + "time_per_iteration": 2.679084062576294 + }, + { + "auxiliary_loss_clip": 0.01098233, + "auxiliary_loss_mlp": 0.01031321, + "balance_loss_clip": 1.03533268, + "balance_loss_mlp": 1.01955533, + "epoch": 0.9714113933563806, + "flos": 25848752307840.0, + "grad_norm": 2.586712346196416, + "language_loss": 0.9075287, + "learning_rate": 8.54962434469919e-09, + "loss": 0.92882419, + "num_input_tokens_seen": 348726105, + "step": 16157, + "time_per_iteration": 2.6537325382232666 + }, + { + "auxiliary_loss_clip": 0.01071902, + "auxiliary_loss_mlp": 0.00770052, + "balance_loss_clip": 1.03688991, + "balance_loss_mlp": 1.00026488, + "epoch": 0.9714715166090485, + "flos": 12743081233920.0, + "grad_norm": 1.749407686127521, + "language_loss": 0.72465503, + "learning_rate": 8.513689354332721e-09, + "loss": 0.74307454, + "num_input_tokens_seen": 348743360, + "step": 16158, + "time_per_iteration": 2.7036380767822266 + }, + { + "auxiliary_loss_clip": 0.01059022, + "auxiliary_loss_mlp": 0.01037853, + "balance_loss_clip": 1.03384304, + "balance_loss_mlp": 1.02509737, + "epoch": 0.9715316398617165, + "flos": 18405547931520.0, + "grad_norm": 2.013888583799996, + "language_loss": 0.60360491, + "learning_rate": 8.477829881326836e-09, + "loss": 0.62457371, + "num_input_tokens_seen": 348759045, + "step": 16159, + "time_per_iteration": 2.6209466457366943 + }, + { + "auxiliary_loss_clip": 0.01103648, + "auxiliary_loss_mlp": 0.01025108, + "balance_loss_clip": 1.03575277, + "balance_loss_mlp": 1.01424837, + "epoch": 0.9715917631143844, + "flos": 28913799749760.0, + "grad_norm": 1.651339792325088, + "language_loss": 0.78989285, + "learning_rate": 8.44204592704112e-09, + "loss": 0.81118041, + "num_input_tokens_seen": 348779910, + "step": 16160, + "time_per_iteration": 2.5234336853027344 + }, + { + "auxiliary_loss_clip": 0.01027371, + "auxiliary_loss_mlp": 0.01000477, + "balance_loss_clip": 1.00497746, + "balance_loss_mlp": 0.99951786, + "epoch": 0.9716518863670525, + "flos": 65939712900480.0, + "grad_norm": 0.7683763573739155, + "language_loss": 0.54203629, + "learning_rate": 8.406337492832704e-09, + "loss": 0.56231475, + "num_input_tokens_seen": 348838995, + "step": 16161, + "time_per_iteration": 3.0858347415924072 + }, + { + "auxiliary_loss_clip": 0.01094745, + "auxiliary_loss_mlp": 0.00769904, + "balance_loss_clip": 1.03734314, + "balance_loss_mlp": 1.00019956, + "epoch": 0.9717120096197204, + "flos": 17712759340800.0, + "grad_norm": 1.8388776753499438, + "language_loss": 0.72078347, + "learning_rate": 8.3707045800554e-09, + "loss": 0.73942995, + "num_input_tokens_seen": 348858090, + "step": 16162, + "time_per_iteration": 2.4713857173919678 + }, + { + "auxiliary_loss_clip": 0.01070522, + "auxiliary_loss_mlp": 0.010289, + "balance_loss_clip": 1.03172445, + "balance_loss_mlp": 1.01611447, + "epoch": 0.9717721328723884, + "flos": 24463426521600.0, + "grad_norm": 1.6638325085203318, + "language_loss": 0.78620613, + "learning_rate": 8.335147190060787e-09, + "loss": 0.80720031, + "num_input_tokens_seen": 348877885, + "step": 16163, + "time_per_iteration": 2.6069257259368896 + }, + { + "auxiliary_loss_clip": 0.01083213, + "auxiliary_loss_mlp": 0.01027578, + "balance_loss_clip": 1.03707957, + "balance_loss_mlp": 1.01624179, + "epoch": 0.9718322561250564, + "flos": 20776477979520.0, + "grad_norm": 2.364704456697354, + "language_loss": 0.72864258, + "learning_rate": 8.299665324196903e-09, + "loss": 0.74975049, + "num_input_tokens_seen": 348897720, + "step": 16164, + "time_per_iteration": 2.6400234699249268 + }, + { + "auxiliary_loss_clip": 0.01045604, + "auxiliary_loss_mlp": 0.01044632, + "balance_loss_clip": 1.03097391, + "balance_loss_mlp": 1.029773, + "epoch": 0.9718923793777243, + "flos": 19025904746880.0, + "grad_norm": 1.8541776614197814, + "language_loss": 0.83818543, + "learning_rate": 8.264258983809114e-09, + "loss": 0.85908771, + "num_input_tokens_seen": 348915410, + "step": 16165, + "time_per_iteration": 2.729191303253174 + }, + { + "auxiliary_loss_clip": 0.01071333, + "auxiliary_loss_mlp": 0.01027443, + "balance_loss_clip": 1.03399253, + "balance_loss_mlp": 1.01615393, + "epoch": 0.9719525026303923, + "flos": 21871717528320.0, + "grad_norm": 2.4684136710713664, + "language_loss": 0.79201269, + "learning_rate": 8.228928170240345e-09, + "loss": 0.81300044, + "num_input_tokens_seen": 348934335, + "step": 16166, + "time_per_iteration": 2.6733477115631104 + }, + { + "auxiliary_loss_clip": 0.01084172, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.03812957, + "balance_loss_mlp": 1.01548481, + "epoch": 0.9720126258830603, + "flos": 14429303251200.0, + "grad_norm": 1.7663595445124196, + "language_loss": 0.70758253, + "learning_rate": 8.193672884830195e-09, + "loss": 0.72869724, + "num_input_tokens_seen": 348952405, + "step": 16167, + "time_per_iteration": 2.7085564136505127 + }, + { + "auxiliary_loss_clip": 0.01079731, + "auxiliary_loss_mlp": 0.01035805, + "balance_loss_clip": 1.03778422, + "balance_loss_mlp": 1.02379441, + "epoch": 0.9720727491357283, + "flos": 26251167352320.0, + "grad_norm": 1.8138771680519867, + "language_loss": 0.75927782, + "learning_rate": 8.158493128915812e-09, + "loss": 0.78043312, + "num_input_tokens_seen": 348973580, + "step": 16168, + "time_per_iteration": 2.67354154586792 + }, + { + "auxiliary_loss_clip": 0.01049039, + "auxiliary_loss_mlp": 0.01050689, + "balance_loss_clip": 1.03055644, + "balance_loss_mlp": 1.03582323, + "epoch": 0.9721328723883962, + "flos": 22674105492480.0, + "grad_norm": 2.5093639466048896, + "language_loss": 0.72537249, + "learning_rate": 8.123388903830797e-09, + "loss": 0.74636978, + "num_input_tokens_seen": 348992035, + "step": 16169, + "time_per_iteration": 2.7542500495910645 + }, + { + "auxiliary_loss_clip": 0.01073449, + "auxiliary_loss_mlp": 0.01038056, + "balance_loss_clip": 1.03180361, + "balance_loss_mlp": 1.02368569, + "epoch": 0.9721929956410642, + "flos": 28074172360320.0, + "grad_norm": 1.7172146559968202, + "language_loss": 0.57560009, + "learning_rate": 8.088360210906309e-09, + "loss": 0.59671509, + "num_input_tokens_seen": 349013160, + "step": 16170, + "time_per_iteration": 2.784191370010376 + }, + { + "auxiliary_loss_clip": 0.01075999, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.03437006, + "balance_loss_mlp": 1.01930237, + "epoch": 0.9722531188937321, + "flos": 20996251344000.0, + "grad_norm": 1.991276787299532, + "language_loss": 0.71702683, + "learning_rate": 8.053407051471062e-09, + "loss": 0.7381115, + "num_input_tokens_seen": 349033485, + "step": 16171, + "time_per_iteration": 2.7290470600128174 + }, + { + "auxiliary_loss_clip": 0.01074193, + "auxiliary_loss_mlp": 0.01036428, + "balance_loss_clip": 1.03374755, + "balance_loss_mlp": 1.02373838, + "epoch": 0.9723132421464001, + "flos": 16070600332800.0, + "grad_norm": 3.7050893371500973, + "language_loss": 0.68799138, + "learning_rate": 8.018529426850218e-09, + "loss": 0.70909762, + "num_input_tokens_seen": 349051705, + "step": 16172, + "time_per_iteration": 2.7984087467193604 + }, + { + "auxiliary_loss_clip": 0.01092548, + "auxiliary_loss_mlp": 0.01030417, + "balance_loss_clip": 1.03369451, + "balance_loss_mlp": 1.01790619, + "epoch": 0.972373365399068, + "flos": 27745769289600.0, + "grad_norm": 2.273393003122684, + "language_loss": 0.85909021, + "learning_rate": 7.983727338366274e-09, + "loss": 0.88031983, + "num_input_tokens_seen": 349070825, + "step": 16173, + "time_per_iteration": 2.637646198272705 + }, + { + "auxiliary_loss_clip": 0.01058492, + "auxiliary_loss_mlp": 0.01037401, + "balance_loss_clip": 1.03226995, + "balance_loss_mlp": 1.02290511, + "epoch": 0.9724334886517361, + "flos": 23002939526400.0, + "grad_norm": 2.532344740288213, + "language_loss": 0.64345253, + "learning_rate": 7.949000787339289e-09, + "loss": 0.66441143, + "num_input_tokens_seen": 349089730, + "step": 16174, + "time_per_iteration": 2.6890182495117188 + }, + { + "auxiliary_loss_clip": 0.0109623, + "auxiliary_loss_mlp": 0.01029844, + "balance_loss_clip": 1.03573728, + "balance_loss_mlp": 1.01808977, + "epoch": 0.972493611904404, + "flos": 25447055535360.0, + "grad_norm": 1.5574440695217635, + "language_loss": 0.78149283, + "learning_rate": 7.914349775085538e-09, + "loss": 0.80275363, + "num_input_tokens_seen": 349111315, + "step": 16175, + "time_per_iteration": 2.65380597114563 + }, + { + "auxiliary_loss_clip": 0.01098527, + "auxiliary_loss_mlp": 0.0103633, + "balance_loss_clip": 1.03696692, + "balance_loss_mlp": 1.02305567, + "epoch": 0.972553735157072, + "flos": 16983054547200.0, + "grad_norm": 2.4406961253744637, + "language_loss": 0.56965649, + "learning_rate": 7.879774302919307e-09, + "loss": 0.59100509, + "num_input_tokens_seen": 349129495, + "step": 16176, + "time_per_iteration": 2.564636707305908 + }, + { + "auxiliary_loss_clip": 0.01088801, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.03812397, + "balance_loss_mlp": 1.02081394, + "epoch": 0.97261385840974, + "flos": 26104651776000.0, + "grad_norm": 2.4918025895156557, + "language_loss": 0.72519267, + "learning_rate": 7.845274372151545e-09, + "loss": 0.74640346, + "num_input_tokens_seen": 349148850, + "step": 16177, + "time_per_iteration": 2.677704334259033 + }, + { + "auxiliary_loss_clip": 0.01087782, + "auxiliary_loss_mlp": 0.01029248, + "balance_loss_clip": 1.03436661, + "balance_loss_mlp": 1.01660562, + "epoch": 0.9726739816624079, + "flos": 25447881548160.0, + "grad_norm": 1.6303663037965777, + "language_loss": 0.68360388, + "learning_rate": 7.810849984090984e-09, + "loss": 0.70477414, + "num_input_tokens_seen": 349167620, + "step": 16178, + "time_per_iteration": 2.6498606204986572 + }, + { + "auxiliary_loss_clip": 0.01054589, + "auxiliary_loss_mlp": 0.01032086, + "balance_loss_clip": 1.03061843, + "balance_loss_mlp": 1.01890159, + "epoch": 0.972734104915076, + "flos": 29014923513600.0, + "grad_norm": 1.7151954479923888, + "language_loss": 0.66904575, + "learning_rate": 7.776501140042358e-09, + "loss": 0.68991244, + "num_input_tokens_seen": 349185845, + "step": 16179, + "time_per_iteration": 2.9617762565612793 + }, + { + "auxiliary_loss_clip": 0.01083826, + "auxiliary_loss_mlp": 0.00768898, + "balance_loss_clip": 1.03630555, + "balance_loss_mlp": 1.0001514, + "epoch": 0.9727942281677439, + "flos": 23437637919360.0, + "grad_norm": 2.630214780518977, + "language_loss": 0.77113461, + "learning_rate": 7.742227841308624e-09, + "loss": 0.78966182, + "num_input_tokens_seen": 349204525, + "step": 16180, + "time_per_iteration": 2.6464152336120605 + }, + { + "auxiliary_loss_clip": 0.01098634, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.03539276, + "balance_loss_mlp": 1.01826119, + "epoch": 0.9728543514204119, + "flos": 31724599749120.0, + "grad_norm": 2.262434982216008, + "language_loss": 0.76220429, + "learning_rate": 7.708030089189188e-09, + "loss": 0.78349876, + "num_input_tokens_seen": 349228075, + "step": 16181, + "time_per_iteration": 2.677198648452759 + }, + { + "auxiliary_loss_clip": 0.01106677, + "auxiliary_loss_mlp": 0.01035405, + "balance_loss_clip": 1.03586745, + "balance_loss_mlp": 1.02323365, + "epoch": 0.9729144746730798, + "flos": 16289368116480.0, + "grad_norm": 1.5196924475010567, + "language_loss": 0.63252479, + "learning_rate": 7.67390788498079e-09, + "loss": 0.65394562, + "num_input_tokens_seen": 349246990, + "step": 16182, + "time_per_iteration": 2.554809093475342 + }, + { + "auxiliary_loss_clip": 0.01041817, + "auxiliary_loss_mlp": 0.01042152, + "balance_loss_clip": 1.04146159, + "balance_loss_mlp": 1.0289433, + "epoch": 0.9729745979257478, + "flos": 25041408266880.0, + "grad_norm": 1.789194263856678, + "language_loss": 0.62447584, + "learning_rate": 7.639861229977507e-09, + "loss": 0.64531553, + "num_input_tokens_seen": 349265890, + "step": 16183, + "time_per_iteration": 3.175109624862671 + }, + { + "auxiliary_loss_clip": 0.01085962, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.03510141, + "balance_loss_mlp": 1.02473438, + "epoch": 0.9730347211784157, + "flos": 22638733574400.0, + "grad_norm": 1.6456930738589919, + "language_loss": 0.78234679, + "learning_rate": 7.605890125470527e-09, + "loss": 0.80358338, + "num_input_tokens_seen": 349285275, + "step": 16184, + "time_per_iteration": 2.9018943309783936 + }, + { + "auxiliary_loss_clip": 0.01068538, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.03069115, + "balance_loss_mlp": 1.02024758, + "epoch": 0.9730948444310837, + "flos": 10998613313280.0, + "grad_norm": 2.161376757576218, + "language_loss": 0.79345584, + "learning_rate": 7.571994572747709e-09, + "loss": 0.8144781, + "num_input_tokens_seen": 349301515, + "step": 16185, + "time_per_iteration": 2.641317129135132 + }, + { + "auxiliary_loss_clip": 0.01077077, + "auxiliary_loss_mlp": 0.01028307, + "balance_loss_clip": 1.03456235, + "balance_loss_mlp": 1.01660085, + "epoch": 0.9731549676837516, + "flos": 16799479113600.0, + "grad_norm": 2.015706111725158, + "language_loss": 0.77789813, + "learning_rate": 7.538174573094469e-09, + "loss": 0.79895198, + "num_input_tokens_seen": 349319590, + "step": 16186, + "time_per_iteration": 2.698368787765503 + }, + { + "auxiliary_loss_clip": 0.01084734, + "auxiliary_loss_mlp": 0.01029196, + "balance_loss_clip": 1.0357089, + "balance_loss_mlp": 1.01675642, + "epoch": 0.9732150909364197, + "flos": 21141761339520.0, + "grad_norm": 1.7572799383983544, + "language_loss": 0.65494901, + "learning_rate": 7.504430127793337e-09, + "loss": 0.67608833, + "num_input_tokens_seen": 349339230, + "step": 16187, + "time_per_iteration": 4.130638122558594 + }, + { + "auxiliary_loss_clip": 0.01079645, + "auxiliary_loss_mlp": 0.01038619, + "balance_loss_clip": 1.03164029, + "balance_loss_mlp": 1.02523208, + "epoch": 0.9732752141890876, + "flos": 33727337435520.0, + "grad_norm": 1.8431543667714356, + "language_loss": 0.80137229, + "learning_rate": 7.47076123812418e-09, + "loss": 0.82255495, + "num_input_tokens_seen": 349361155, + "step": 16188, + "time_per_iteration": 4.257014989852905 + }, + { + "auxiliary_loss_clip": 0.01072207, + "auxiliary_loss_mlp": 0.0103015, + "balance_loss_clip": 1.03375018, + "balance_loss_mlp": 1.01883137, + "epoch": 0.9733353374417556, + "flos": 23404384903680.0, + "grad_norm": 1.7664281085479938, + "language_loss": 0.78316271, + "learning_rate": 7.437167905363084e-09, + "loss": 0.80418628, + "num_input_tokens_seen": 349379335, + "step": 16189, + "time_per_iteration": 2.675529718399048 + }, + { + "auxiliary_loss_clip": 0.01092046, + "auxiliary_loss_mlp": 0.0102785, + "balance_loss_clip": 1.03294408, + "balance_loss_mlp": 1.01514196, + "epoch": 0.9733954606944236, + "flos": 39165792963840.0, + "grad_norm": 1.7197781596757225, + "language_loss": 0.51230407, + "learning_rate": 7.403650130784367e-09, + "loss": 0.533503, + "num_input_tokens_seen": 349401575, + "step": 16190, + "time_per_iteration": 4.908695459365845 + }, + { + "auxiliary_loss_clip": 0.01098154, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.03689873, + "balance_loss_mlp": 1.01865995, + "epoch": 0.9734555839470915, + "flos": 21981819692160.0, + "grad_norm": 1.7152390443101855, + "language_loss": 0.80948341, + "learning_rate": 7.3702079156590105e-09, + "loss": 0.83077252, + "num_input_tokens_seen": 349420650, + "step": 16191, + "time_per_iteration": 2.6668500900268555 + }, + { + "auxiliary_loss_clip": 0.01091143, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.03202808, + "balance_loss_mlp": 1.01971912, + "epoch": 0.9735157071997596, + "flos": 16575539771520.0, + "grad_norm": 1.6910464048805458, + "language_loss": 0.8259176, + "learning_rate": 7.336841261255111e-09, + "loss": 0.84714502, + "num_input_tokens_seen": 349436830, + "step": 16192, + "time_per_iteration": 2.569251537322998 + }, + { + "auxiliary_loss_clip": 0.01046721, + "auxiliary_loss_mlp": 0.01039813, + "balance_loss_clip": 1.03544569, + "balance_loss_mlp": 1.02665234, + "epoch": 0.9735758304524275, + "flos": 20223237726720.0, + "grad_norm": 1.8106504266161225, + "language_loss": 0.74773109, + "learning_rate": 7.303550168837658e-09, + "loss": 0.76859641, + "num_input_tokens_seen": 349454325, + "step": 16193, + "time_per_iteration": 4.564434051513672 + }, + { + "auxiliary_loss_clip": 0.01079567, + "auxiliary_loss_mlp": 0.01031423, + "balance_loss_clip": 1.03505838, + "balance_loss_mlp": 1.020015, + "epoch": 0.9736359537050955, + "flos": 23653353047040.0, + "grad_norm": 1.8191654334710798, + "language_loss": 0.85254693, + "learning_rate": 7.270334639669417e-09, + "loss": 0.87365687, + "num_input_tokens_seen": 349470230, + "step": 16194, + "time_per_iteration": 2.687668561935425 + }, + { + "auxiliary_loss_clip": 0.01070428, + "auxiliary_loss_mlp": 0.01037259, + "balance_loss_clip": 1.03369021, + "balance_loss_mlp": 1.02468801, + "epoch": 0.9736960769577634, + "flos": 15560202026880.0, + "grad_norm": 1.6441349965800172, + "language_loss": 0.75818932, + "learning_rate": 7.237194675009828e-09, + "loss": 0.77926624, + "num_input_tokens_seen": 349486250, + "step": 16195, + "time_per_iteration": 2.6451404094696045 + }, + { + "auxiliary_loss_clip": 0.01004847, + "auxiliary_loss_mlp": 0.01000872, + "balance_loss_clip": 1.00990903, + "balance_loss_mlp": 0.99979365, + "epoch": 0.9737562002104314, + "flos": 65351783088000.0, + "grad_norm": 0.708245154030494, + "language_loss": 0.52467954, + "learning_rate": 7.204130276115439e-09, + "loss": 0.54473674, + "num_input_tokens_seen": 349545865, + "step": 16196, + "time_per_iteration": 3.186091184616089 + }, + { + "auxiliary_loss_clip": 0.01084909, + "auxiliary_loss_mlp": 0.0103142, + "balance_loss_clip": 1.03660226, + "balance_loss_mlp": 1.01945114, + "epoch": 0.9738163234630993, + "flos": 27196730928000.0, + "grad_norm": 2.030454883195646, + "language_loss": 0.7627387, + "learning_rate": 7.171141444240136e-09, + "loss": 0.78390199, + "num_input_tokens_seen": 349566080, + "step": 16197, + "time_per_iteration": 2.8780059814453125 + }, + { + "auxiliary_loss_clip": 0.0111131, + "auxiliary_loss_mlp": 0.01028167, + "balance_loss_clip": 1.03635693, + "balance_loss_mlp": 1.01535797, + "epoch": 0.9738764467157673, + "flos": 21069365477760.0, + "grad_norm": 1.7142052132721648, + "language_loss": 0.67503351, + "learning_rate": 7.13822818063492e-09, + "loss": 0.69642824, + "num_input_tokens_seen": 349585665, + "step": 16198, + "time_per_iteration": 2.689474582672119 + }, + { + "auxiliary_loss_clip": 0.01107297, + "auxiliary_loss_mlp": 0.01031978, + "balance_loss_clip": 1.03572273, + "balance_loss_mlp": 1.01887083, + "epoch": 0.9739365699684353, + "flos": 21361211481600.0, + "grad_norm": 1.916549844614904, + "language_loss": 0.78117663, + "learning_rate": 7.10539048654768e-09, + "loss": 0.80256933, + "num_input_tokens_seen": 349605125, + "step": 16199, + "time_per_iteration": 2.5536978244781494 + }, + { + "auxiliary_loss_clip": 0.0108445, + "auxiliary_loss_mlp": 0.01035036, + "balance_loss_clip": 1.03713942, + "balance_loss_mlp": 1.02260256, + "epoch": 0.9739966932211033, + "flos": 21902061542400.0, + "grad_norm": 1.9705409422067974, + "language_loss": 0.79409683, + "learning_rate": 7.072628363223865e-09, + "loss": 0.81529176, + "num_input_tokens_seen": 349623360, + "step": 16200, + "time_per_iteration": 2.6256768703460693 + }, + { + "auxiliary_loss_clip": 0.01058782, + "auxiliary_loss_mlp": 0.01035774, + "balance_loss_clip": 1.03694201, + "balance_loss_mlp": 1.02227926, + "epoch": 0.9740568164737712, + "flos": 24827345164800.0, + "grad_norm": 2.0331349042288878, + "language_loss": 0.68434143, + "learning_rate": 7.039941811905592e-09, + "loss": 0.70528698, + "num_input_tokens_seen": 349644390, + "step": 16201, + "time_per_iteration": 2.8037257194519043 + }, + { + "auxiliary_loss_clip": 0.01075577, + "auxiliary_loss_mlp": 0.01033677, + "balance_loss_clip": 1.03323948, + "balance_loss_mlp": 1.02163649, + "epoch": 0.9741169397264392, + "flos": 23623583650560.0, + "grad_norm": 1.5025292618741064, + "language_loss": 0.72862577, + "learning_rate": 7.0073308338325364e-09, + "loss": 0.74971825, + "num_input_tokens_seen": 349663200, + "step": 16202, + "time_per_iteration": 2.662804126739502 + }, + { + "auxiliary_loss_clip": 0.0108729, + "auxiliary_loss_mlp": 0.01034456, + "balance_loss_clip": 1.03576303, + "balance_loss_mlp": 1.02150416, + "epoch": 0.9741770629791072, + "flos": 18841144164480.0, + "grad_norm": 2.6824456959299052, + "language_loss": 0.72871369, + "learning_rate": 6.974795430241265e-09, + "loss": 0.74993122, + "num_input_tokens_seen": 349681975, + "step": 16203, + "time_per_iteration": 2.5910871028900146 + }, + { + "auxiliary_loss_clip": 0.01109424, + "auxiliary_loss_mlp": 0.01033432, + "balance_loss_clip": 1.03729725, + "balance_loss_mlp": 1.02117181, + "epoch": 0.9742371862317751, + "flos": 22346241125760.0, + "grad_norm": 1.9882435140281416, + "language_loss": 0.77292311, + "learning_rate": 6.942335602365235e-09, + "loss": 0.7943517, + "num_input_tokens_seen": 349701185, + "step": 16204, + "time_per_iteration": 2.599534273147583 + }, + { + "auxiliary_loss_clip": 0.01091233, + "auxiliary_loss_mlp": 0.01034218, + "balance_loss_clip": 1.03830349, + "balance_loss_mlp": 1.02127194, + "epoch": 0.9742973094844432, + "flos": 21762764599680.0, + "grad_norm": 2.04301514318933, + "language_loss": 0.79557073, + "learning_rate": 6.909951351435905e-09, + "loss": 0.81682527, + "num_input_tokens_seen": 349720360, + "step": 16205, + "time_per_iteration": 2.611509323120117 + }, + { + "auxiliary_loss_clip": 0.01106984, + "auxiliary_loss_mlp": 0.01033558, + "balance_loss_clip": 1.03618968, + "balance_loss_mlp": 1.02133942, + "epoch": 0.9743574327371111, + "flos": 26248725227520.0, + "grad_norm": 1.7129263404714312, + "language_loss": 0.74342418, + "learning_rate": 6.87764267868074e-09, + "loss": 0.76482964, + "num_input_tokens_seen": 349741040, + "step": 16206, + "time_per_iteration": 2.5808560848236084 + }, + { + "auxiliary_loss_clip": 0.01055158, + "auxiliary_loss_mlp": 0.01032161, + "balance_loss_clip": 1.03472948, + "balance_loss_mlp": 1.019382, + "epoch": 0.9744175559897791, + "flos": 12349321367040.0, + "grad_norm": 2.3020742472105375, + "language_loss": 0.83948338, + "learning_rate": 6.8454095853252015e-09, + "loss": 0.86035663, + "num_input_tokens_seen": 349758895, + "step": 16207, + "time_per_iteration": 3.118260622024536 + }, + { + "auxiliary_loss_clip": 0.01096985, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.03668702, + "balance_loss_mlp": 1.0217663, + "epoch": 0.974477679242447, + "flos": 28397834835840.0, + "grad_norm": 1.7142608213779496, + "language_loss": 0.71005446, + "learning_rate": 6.813252072591425e-09, + "loss": 0.73136348, + "num_input_tokens_seen": 349779740, + "step": 16208, + "time_per_iteration": 2.631779909133911 + }, + { + "auxiliary_loss_clip": 0.01068659, + "auxiliary_loss_mlp": 0.01025995, + "balance_loss_clip": 1.03373158, + "balance_loss_mlp": 1.01523638, + "epoch": 0.974537802495115, + "flos": 17785370684160.0, + "grad_norm": 1.6324180098602632, + "language_loss": 0.77270913, + "learning_rate": 6.781170141698878e-09, + "loss": 0.79365563, + "num_input_tokens_seen": 349796820, + "step": 16209, + "time_per_iteration": 2.648383617401123 + }, + { + "auxiliary_loss_clip": 0.01070166, + "auxiliary_loss_mlp": 0.0077274, + "balance_loss_clip": 1.03177297, + "balance_loss_mlp": 1.0000906, + "epoch": 0.9745979257477829, + "flos": 23842315520640.0, + "grad_norm": 1.7959688952091581, + "language_loss": 0.79134548, + "learning_rate": 6.749163793864144e-09, + "loss": 0.80977452, + "num_input_tokens_seen": 349816550, + "step": 16210, + "time_per_iteration": 2.693124294281006 + }, + { + "auxiliary_loss_clip": 0.01082394, + "auxiliary_loss_mlp": 0.01035247, + "balance_loss_clip": 1.03368235, + "balance_loss_mlp": 1.02293837, + "epoch": 0.9746580490004509, + "flos": 27016172236800.0, + "grad_norm": 2.111674380643122, + "language_loss": 0.7811175, + "learning_rate": 6.7172330303009176e-09, + "loss": 0.80229396, + "num_input_tokens_seen": 349834350, + "step": 16211, + "time_per_iteration": 2.7423813343048096 + }, + { + "auxiliary_loss_clip": 0.01074533, + "auxiliary_loss_mlp": 0.0103461, + "balance_loss_clip": 1.03468013, + "balance_loss_mlp": 1.02106786, + "epoch": 0.9747181722531189, + "flos": 19792022952960.0, + "grad_norm": 2.2346090535911345, + "language_loss": 0.78309953, + "learning_rate": 6.685377852219787e-09, + "loss": 0.80419093, + "num_input_tokens_seen": 349853460, + "step": 16212, + "time_per_iteration": 2.7550909519195557 + }, + { + "auxiliary_loss_clip": 0.01076477, + "auxiliary_loss_mlp": 0.01032217, + "balance_loss_clip": 1.03523839, + "balance_loss_mlp": 1.02030253, + "epoch": 0.9747782955057869, + "flos": 31430598929280.0, + "grad_norm": 1.4958012465208934, + "language_loss": 0.79993176, + "learning_rate": 6.653598260829118e-09, + "loss": 0.8210187, + "num_input_tokens_seen": 349874830, + "step": 16213, + "time_per_iteration": 2.8637707233428955 + }, + { + "auxiliary_loss_clip": 0.01062528, + "auxiliary_loss_mlp": 0.01026041, + "balance_loss_clip": 1.03252554, + "balance_loss_mlp": 1.01400709, + "epoch": 0.9748384187584548, + "flos": 15961288268160.0, + "grad_norm": 1.9405763574770338, + "language_loss": 0.66294038, + "learning_rate": 6.6218942573335044e-09, + "loss": 0.68382609, + "num_input_tokens_seen": 349893690, + "step": 16214, + "time_per_iteration": 2.699460029602051 + }, + { + "auxiliary_loss_clip": 0.01095715, + "auxiliary_loss_mlp": 0.01030057, + "balance_loss_clip": 1.04145873, + "balance_loss_mlp": 1.01690817, + "epoch": 0.9748985420111228, + "flos": 20558715776640.0, + "grad_norm": 1.7124616404956563, + "language_loss": 0.73894978, + "learning_rate": 6.5902658429355386e-09, + "loss": 0.76020747, + "num_input_tokens_seen": 349912480, + "step": 16215, + "time_per_iteration": 2.6812703609466553 + }, + { + "auxiliary_loss_clip": 0.01057347, + "auxiliary_loss_mlp": 0.01034106, + "balance_loss_clip": 1.03506923, + "balance_loss_mlp": 1.02194071, + "epoch": 0.9749586652637908, + "flos": 36721605127680.0, + "grad_norm": 2.1409618688352583, + "language_loss": 0.6697464, + "learning_rate": 6.558713018834483e-09, + "loss": 0.69066095, + "num_input_tokens_seen": 349932470, + "step": 16216, + "time_per_iteration": 2.8369500637054443 + }, + { + "auxiliary_loss_clip": 0.01053374, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.03023767, + "balance_loss_mlp": 1.01911426, + "epoch": 0.9750187885164587, + "flos": 10999223844480.0, + "grad_norm": 2.017970280416665, + "language_loss": 0.71706629, + "learning_rate": 6.527235786226937e-09, + "loss": 0.73791993, + "num_input_tokens_seen": 349949060, + "step": 16217, + "time_per_iteration": 2.7381694316864014 + }, + { + "auxiliary_loss_clip": 0.01074463, + "auxiliary_loss_mlp": 0.01028184, + "balance_loss_clip": 1.03641343, + "balance_loss_mlp": 1.01587594, + "epoch": 0.9750789117691268, + "flos": 25739512070400.0, + "grad_norm": 1.6438610736190364, + "language_loss": 0.78195894, + "learning_rate": 6.495834146306167e-09, + "loss": 0.80298543, + "num_input_tokens_seen": 349968010, + "step": 16218, + "time_per_iteration": 2.7634236812591553 + }, + { + "auxiliary_loss_clip": 0.01079204, + "auxiliary_loss_mlp": 0.01029255, + "balance_loss_clip": 1.03390265, + "balance_loss_mlp": 1.01689315, + "epoch": 0.9751390350217947, + "flos": 13333955961600.0, + "grad_norm": 2.591457126969395, + "language_loss": 0.77337241, + "learning_rate": 6.464508100263222e-09, + "loss": 0.79445708, + "num_input_tokens_seen": 349985270, + "step": 16219, + "time_per_iteration": 2.7380733489990234 + }, + { + "auxiliary_loss_clip": 0.01087952, + "auxiliary_loss_mlp": 0.01032908, + "balance_loss_clip": 1.03535342, + "balance_loss_mlp": 1.02096331, + "epoch": 0.9751991582744627, + "flos": 22820621068800.0, + "grad_norm": 1.7048563405563817, + "language_loss": 0.81480777, + "learning_rate": 6.433257649285817e-09, + "loss": 0.83601636, + "num_input_tokens_seen": 350003935, + "step": 16220, + "time_per_iteration": 2.6495344638824463 + }, + { + "auxiliary_loss_clip": 0.01106693, + "auxiliary_loss_mlp": 0.01032081, + "balance_loss_clip": 1.03613138, + "balance_loss_mlp": 1.02025533, + "epoch": 0.9752592815271306, + "flos": 19646189735040.0, + "grad_norm": 1.7107968412659516, + "language_loss": 0.75237, + "learning_rate": 6.402082794559227e-09, + "loss": 0.77375782, + "num_input_tokens_seen": 350023595, + "step": 16221, + "time_per_iteration": 2.5049870014190674 + }, + { + "auxiliary_loss_clip": 0.01072645, + "auxiliary_loss_mlp": 0.01031441, + "balance_loss_clip": 1.0333364, + "balance_loss_mlp": 1.01963329, + "epoch": 0.9753194047797986, + "flos": 26690462686080.0, + "grad_norm": 1.478633421454376, + "language_loss": 0.66371262, + "learning_rate": 6.370983537265395e-09, + "loss": 0.68475342, + "num_input_tokens_seen": 350045920, + "step": 16222, + "time_per_iteration": 2.7511966228485107 + }, + { + "auxiliary_loss_clip": 0.0109569, + "auxiliary_loss_mlp": 0.01029193, + "balance_loss_clip": 1.03627598, + "balance_loss_mlp": 1.01753998, + "epoch": 0.9753795280324665, + "flos": 23221779137280.0, + "grad_norm": 1.9485164555129428, + "language_loss": 0.8856619, + "learning_rate": 6.3399598785836004e-09, + "loss": 0.90691066, + "num_input_tokens_seen": 350063925, + "step": 16223, + "time_per_iteration": 2.864657163619995 + }, + { + "auxiliary_loss_clip": 0.01045431, + "auxiliary_loss_mlp": 0.01035052, + "balance_loss_clip": 1.03191626, + "balance_loss_mlp": 1.02308941, + "epoch": 0.9754396512851345, + "flos": 19463835363840.0, + "grad_norm": 1.8028212337000589, + "language_loss": 0.74985182, + "learning_rate": 6.309011819690457e-09, + "loss": 0.7706567, + "num_input_tokens_seen": 350080900, + "step": 16224, + "time_per_iteration": 2.7734134197235107 + }, + { + "auxiliary_loss_clip": 0.01010696, + "auxiliary_loss_mlp": 0.0100273, + "balance_loss_clip": 1.00753188, + "balance_loss_mlp": 1.00170505, + "epoch": 0.9754997745378025, + "flos": 68459313340800.0, + "grad_norm": 0.8348178291134782, + "language_loss": 0.5909391, + "learning_rate": 6.278139361759249e-09, + "loss": 0.61107337, + "num_input_tokens_seen": 350144550, + "step": 16225, + "time_per_iteration": 3.203700065612793 + }, + { + "auxiliary_loss_clip": 0.01075593, + "auxiliary_loss_mlp": 0.00770137, + "balance_loss_clip": 1.03655672, + "balance_loss_mlp": 1.00027668, + "epoch": 0.9755598977904705, + "flos": 26395168976640.0, + "grad_norm": 2.1280321736121364, + "language_loss": 0.68929291, + "learning_rate": 6.247342505960818e-09, + "loss": 0.7077502, + "num_input_tokens_seen": 350164050, + "step": 16226, + "time_per_iteration": 2.7182259559631348 + }, + { + "auxiliary_loss_clip": 0.01094266, + "auxiliary_loss_mlp": 0.01042676, + "balance_loss_clip": 1.03407538, + "balance_loss_mlp": 1.02954507, + "epoch": 0.9756200210431384, + "flos": 16617663446400.0, + "grad_norm": 1.92516234582211, + "language_loss": 0.82812244, + "learning_rate": 6.216621253462894e-09, + "loss": 0.84949183, + "num_input_tokens_seen": 350181350, + "step": 16227, + "time_per_iteration": 4.278809547424316 + }, + { + "auxiliary_loss_clip": 0.01106745, + "auxiliary_loss_mlp": 0.01029012, + "balance_loss_clip": 1.03587723, + "balance_loss_mlp": 1.01710916, + "epoch": 0.9756801442958064, + "flos": 23623044946560.0, + "grad_norm": 1.6986847521997988, + "language_loss": 0.77753866, + "learning_rate": 6.185975605430549e-09, + "loss": 0.79889619, + "num_input_tokens_seen": 350199765, + "step": 16228, + "time_per_iteration": 4.098712205886841 + }, + { + "auxiliary_loss_clip": 0.01018838, + "auxiliary_loss_mlp": 0.01000083, + "balance_loss_clip": 1.00571454, + "balance_loss_mlp": 0.99909353, + "epoch": 0.9757402675484744, + "flos": 61625799440640.0, + "grad_norm": 0.84298125837055, + "language_loss": 0.55775201, + "learning_rate": 6.155405563025962e-09, + "loss": 0.57794118, + "num_input_tokens_seen": 350256420, + "step": 16229, + "time_per_iteration": 4.671915292739868 + }, + { + "auxiliary_loss_clip": 0.01097026, + "auxiliary_loss_mlp": 0.01031698, + "balance_loss_clip": 1.03510642, + "balance_loss_mlp": 1.01906228, + "epoch": 0.9758003908011423, + "flos": 24058964401920.0, + "grad_norm": 1.6630448372353723, + "language_loss": 0.74857068, + "learning_rate": 6.124911127407984e-09, + "loss": 0.76985788, + "num_input_tokens_seen": 350276270, + "step": 16230, + "time_per_iteration": 2.637298822402954 + }, + { + "auxiliary_loss_clip": 0.01080882, + "auxiliary_loss_mlp": 0.01029958, + "balance_loss_clip": 1.03464866, + "balance_loss_mlp": 1.01841259, + "epoch": 0.9758605140538104, + "flos": 17493093717120.0, + "grad_norm": 2.3627285859767992, + "language_loss": 0.72050405, + "learning_rate": 6.094492299733245e-09, + "loss": 0.74161243, + "num_input_tokens_seen": 350295000, + "step": 16231, + "time_per_iteration": 2.606243133544922 + }, + { + "auxiliary_loss_clip": 0.01087789, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.03723931, + "balance_loss_mlp": 1.019225, + "epoch": 0.9759206373064783, + "flos": 24826950115200.0, + "grad_norm": 1.897185559618263, + "language_loss": 0.76273429, + "learning_rate": 6.064149081155267e-09, + "loss": 0.78393269, + "num_input_tokens_seen": 350314980, + "step": 16232, + "time_per_iteration": 4.806816816329956 + }, + { + "auxiliary_loss_clip": 0.01007054, + "auxiliary_loss_mlp": 0.00999094, + "balance_loss_clip": 1.0077014, + "balance_loss_mlp": 0.99789584, + "epoch": 0.9759807605591463, + "flos": 68161182456960.0, + "grad_norm": 0.7408233152349849, + "language_loss": 0.53817546, + "learning_rate": 6.033881472824465e-09, + "loss": 0.55823696, + "num_input_tokens_seen": 350371985, + "step": 16233, + "time_per_iteration": 3.143988847732544 + }, + { + "auxiliary_loss_clip": 0.01108543, + "auxiliary_loss_mlp": 0.0103539, + "balance_loss_clip": 1.03642726, + "balance_loss_mlp": 1.02313495, + "epoch": 0.9760408838118142, + "flos": 18989239939200.0, + "grad_norm": 1.8846866025891749, + "language_loss": 0.71843183, + "learning_rate": 6.003689475888807e-09, + "loss": 0.7398712, + "num_input_tokens_seen": 350390590, + "step": 16234, + "time_per_iteration": 2.5556411743164062 + }, + { + "auxiliary_loss_clip": 0.01098185, + "auxiliary_loss_mlp": 0.0103115, + "balance_loss_clip": 1.03558266, + "balance_loss_mlp": 1.01847827, + "epoch": 0.9761010070644822, + "flos": 17125978763520.0, + "grad_norm": 2.9772233129130825, + "language_loss": 0.79668027, + "learning_rate": 5.973573091493156e-09, + "loss": 0.81797361, + "num_input_tokens_seen": 350403770, + "step": 16235, + "time_per_iteration": 2.5155222415924072 + }, + { + "auxiliary_loss_clip": 0.0109002, + "auxiliary_loss_mlp": 0.01034289, + "balance_loss_clip": 1.03545177, + "balance_loss_mlp": 1.02070475, + "epoch": 0.9761611303171501, + "flos": 22052599441920.0, + "grad_norm": 2.420578950017542, + "language_loss": 0.7674818, + "learning_rate": 5.943532320779265e-09, + "loss": 0.7887249, + "num_input_tokens_seen": 350421870, + "step": 16236, + "time_per_iteration": 2.641690731048584 + }, + { + "auxiliary_loss_clip": 0.01096794, + "auxiliary_loss_mlp": 0.01027707, + "balance_loss_clip": 1.03507769, + "balance_loss_mlp": 1.01571465, + "epoch": 0.9762212535698181, + "flos": 21757521214080.0, + "grad_norm": 3.537228150180641, + "language_loss": 0.75424302, + "learning_rate": 5.913567164886446e-09, + "loss": 0.77548802, + "num_input_tokens_seen": 350440025, + "step": 16237, + "time_per_iteration": 2.5821526050567627 + }, + { + "auxiliary_loss_clip": 0.01061626, + "auxiliary_loss_mlp": 0.01037494, + "balance_loss_clip": 1.03076112, + "balance_loss_mlp": 1.02306354, + "epoch": 0.9762813768224861, + "flos": 25921615046400.0, + "grad_norm": 1.5766064307721592, + "language_loss": 0.72649348, + "learning_rate": 5.8836776249509e-09, + "loss": 0.74748468, + "num_input_tokens_seen": 350459435, + "step": 16238, + "time_per_iteration": 2.716170072555542 + }, + { + "auxiliary_loss_clip": 0.01090292, + "auxiliary_loss_mlp": 0.00771217, + "balance_loss_clip": 1.03843439, + "balance_loss_mlp": 1.0002383, + "epoch": 0.9763415000751541, + "flos": 24051853509120.0, + "grad_norm": 2.1577792438101646, + "language_loss": 0.83911026, + "learning_rate": 5.8538637021063875e-09, + "loss": 0.85772538, + "num_input_tokens_seen": 350472655, + "step": 16239, + "time_per_iteration": 2.7628393173217773 + }, + { + "auxiliary_loss_clip": 0.01067831, + "auxiliary_loss_mlp": 0.01043243, + "balance_loss_clip": 1.03342855, + "balance_loss_mlp": 1.02861595, + "epoch": 0.976401623327822, + "flos": 17018677860480.0, + "grad_norm": 3.0421443760450266, + "language_loss": 0.60336649, + "learning_rate": 5.824125397483115e-09, + "loss": 0.62447721, + "num_input_tokens_seen": 350488160, + "step": 16240, + "time_per_iteration": 2.6417906284332275 + }, + { + "auxiliary_loss_clip": 0.01069406, + "auxiliary_loss_mlp": 0.01029004, + "balance_loss_clip": 1.0350244, + "balance_loss_mlp": 1.01704097, + "epoch": 0.97646174658049, + "flos": 16106941918080.0, + "grad_norm": 1.952892588808636, + "language_loss": 0.82362419, + "learning_rate": 5.7944627122088474e-09, + "loss": 0.84460825, + "num_input_tokens_seen": 350506065, + "step": 16241, + "time_per_iteration": 2.6529223918914795 + }, + { + "auxiliary_loss_clip": 0.01069965, + "auxiliary_loss_mlp": 0.01037931, + "balance_loss_clip": 1.03480554, + "balance_loss_mlp": 1.02566481, + "epoch": 0.9765218698331579, + "flos": 21252725429760.0, + "grad_norm": 1.9374011472646437, + "language_loss": 0.83271652, + "learning_rate": 5.764875647408463e-09, + "loss": 0.85379553, + "num_input_tokens_seen": 350524495, + "step": 16242, + "time_per_iteration": 2.7075135707855225 + }, + { + "auxiliary_loss_clip": 0.01097999, + "auxiliary_loss_mlp": 0.01027861, + "balance_loss_clip": 1.03740764, + "balance_loss_mlp": 1.01545691, + "epoch": 0.9765819930858259, + "flos": 18588045957120.0, + "grad_norm": 1.5885372986539104, + "language_loss": 0.75616562, + "learning_rate": 5.7353642042037294e-09, + "loss": 0.77742422, + "num_input_tokens_seen": 350544185, + "step": 16243, + "time_per_iteration": 2.8476173877716064 + }, + { + "auxiliary_loss_clip": 0.01096151, + "auxiliary_loss_mlp": 0.0103737, + "balance_loss_clip": 1.03450549, + "balance_loss_mlp": 1.02472222, + "epoch": 0.976642116338494, + "flos": 20266833859200.0, + "grad_norm": 1.629420195076715, + "language_loss": 0.70183492, + "learning_rate": 5.705928383713754e-09, + "loss": 0.72317016, + "num_input_tokens_seen": 350562675, + "step": 16244, + "time_per_iteration": 2.648705244064331 + }, + { + "auxiliary_loss_clip": 0.01090661, + "auxiliary_loss_mlp": 0.01030766, + "balance_loss_clip": 1.03870106, + "balance_loss_mlp": 1.01780796, + "epoch": 0.9767022395911619, + "flos": 25550477769600.0, + "grad_norm": 1.816908720128117, + "language_loss": 0.83598977, + "learning_rate": 5.676568187055197e-09, + "loss": 0.85720408, + "num_input_tokens_seen": 350581535, + "step": 16245, + "time_per_iteration": 2.7069408893585205 + }, + { + "auxiliary_loss_clip": 0.01056812, + "auxiliary_loss_mlp": 0.01028217, + "balance_loss_clip": 1.03245211, + "balance_loss_mlp": 1.0164988, + "epoch": 0.9767623628438299, + "flos": 21762656858880.0, + "grad_norm": 1.6507047411461764, + "language_loss": 0.78559917, + "learning_rate": 5.647283615340726e-09, + "loss": 0.80644941, + "num_input_tokens_seen": 350601615, + "step": 16246, + "time_per_iteration": 2.766493558883667 + }, + { + "auxiliary_loss_clip": 0.01101377, + "auxiliary_loss_mlp": 0.01033211, + "balance_loss_clip": 1.03545284, + "balance_loss_mlp": 1.02206457, + "epoch": 0.9768224860964978, + "flos": 15851114277120.0, + "grad_norm": 1.4053965502785082, + "language_loss": 0.74026012, + "learning_rate": 5.6180746696812275e-09, + "loss": 0.76160598, + "num_input_tokens_seen": 350619580, + "step": 16247, + "time_per_iteration": 2.56381893157959 + }, + { + "auxiliary_loss_clip": 0.01053333, + "auxiliary_loss_mlp": 0.01033648, + "balance_loss_clip": 1.03346825, + "balance_loss_mlp": 1.02078581, + "epoch": 0.9768826093491658, + "flos": 25151151294720.0, + "grad_norm": 4.399397738721356, + "language_loss": 0.79704082, + "learning_rate": 5.58894135118404e-09, + "loss": 0.81791055, + "num_input_tokens_seen": 350640015, + "step": 16248, + "time_per_iteration": 2.8011584281921387 + }, + { + "auxiliary_loss_clip": 0.01049095, + "auxiliary_loss_mlp": 0.01046497, + "balance_loss_clip": 1.03563344, + "balance_loss_mlp": 1.03080893, + "epoch": 0.9769427326018337, + "flos": 22967028904320.0, + "grad_norm": 1.8090514517823602, + "language_loss": 0.79385042, + "learning_rate": 5.559883660954278e-09, + "loss": 0.81480634, + "num_input_tokens_seen": 350659155, + "step": 16249, + "time_per_iteration": 2.7398455142974854 + }, + { + "auxiliary_loss_clip": 0.01092723, + "auxiliary_loss_mlp": 0.01035301, + "balance_loss_clip": 1.03559923, + "balance_loss_mlp": 1.02318323, + "epoch": 0.9770028558545018, + "flos": 15264297786240.0, + "grad_norm": 1.9233029914398667, + "language_loss": 0.66280472, + "learning_rate": 5.530901600093507e-09, + "loss": 0.68408501, + "num_input_tokens_seen": 350676615, + "step": 16250, + "time_per_iteration": 2.556757688522339 + }, + { + "auxiliary_loss_clip": 0.01027067, + "auxiliary_loss_mlp": 0.01001957, + "balance_loss_clip": 1.00477159, + "balance_loss_mlp": 1.0009917, + "epoch": 0.9770629791071697, + "flos": 71450348808960.0, + "grad_norm": 0.7726336256949028, + "language_loss": 0.59797513, + "learning_rate": 5.501995169700846e-09, + "loss": 0.61826539, + "num_input_tokens_seen": 350736805, + "step": 16251, + "time_per_iteration": 3.1876869201660156 + }, + { + "auxiliary_loss_clip": 0.01093869, + "auxiliary_loss_mlp": 0.01031057, + "balance_loss_clip": 1.03425992, + "balance_loss_mlp": 1.01817012, + "epoch": 0.9771231023598377, + "flos": 22412854897920.0, + "grad_norm": 1.7259349246741458, + "language_loss": 0.78470027, + "learning_rate": 5.473164370872307e-09, + "loss": 0.80594945, + "num_input_tokens_seen": 350753600, + "step": 16252, + "time_per_iteration": 2.606030225753784 + }, + { + "auxiliary_loss_clip": 0.01090281, + "auxiliary_loss_mlp": 0.01034389, + "balance_loss_clip": 1.0339278, + "balance_loss_mlp": 1.02142549, + "epoch": 0.9771832256125056, + "flos": 19025940660480.0, + "grad_norm": 2.220084536547545, + "language_loss": 0.64542538, + "learning_rate": 5.444409204701461e-09, + "loss": 0.66667211, + "num_input_tokens_seen": 350771225, + "step": 16253, + "time_per_iteration": 2.5694305896759033 + }, + { + "auxiliary_loss_clip": 0.01101639, + "auxiliary_loss_mlp": 0.0103353, + "balance_loss_clip": 1.03819561, + "balance_loss_mlp": 1.01936197, + "epoch": 0.9772433488651736, + "flos": 17822143232640.0, + "grad_norm": 2.129791137286544, + "language_loss": 0.7626065, + "learning_rate": 5.415729672278324e-09, + "loss": 0.7839582, + "num_input_tokens_seen": 350789100, + "step": 16254, + "time_per_iteration": 2.6212127208709717 + }, + { + "auxiliary_loss_clip": 0.0110148, + "auxiliary_loss_mlp": 0.01031853, + "balance_loss_clip": 1.03698289, + "balance_loss_mlp": 1.0193603, + "epoch": 0.9773034721178415, + "flos": 37629785623680.0, + "grad_norm": 1.8907694352431208, + "language_loss": 0.63917691, + "learning_rate": 5.387125774690471e-09, + "loss": 0.66051024, + "num_input_tokens_seen": 350811085, + "step": 16255, + "time_per_iteration": 2.7545289993286133 + }, + { + "auxiliary_loss_clip": 0.01080709, + "auxiliary_loss_mlp": 0.00771506, + "balance_loss_clip": 1.03611016, + "balance_loss_mlp": 1.0002296, + "epoch": 0.9773635953705095, + "flos": 20302457172480.0, + "grad_norm": 1.5349410458335684, + "language_loss": 0.75715804, + "learning_rate": 5.358597513023033e-09, + "loss": 0.77568018, + "num_input_tokens_seen": 350831065, + "step": 16256, + "time_per_iteration": 2.718520164489746 + }, + { + "auxiliary_loss_clip": 0.01107482, + "auxiliary_loss_mlp": 0.010355, + "balance_loss_clip": 1.0382638, + "balance_loss_mlp": 1.02249467, + "epoch": 0.9774237186231776, + "flos": 22309253095680.0, + "grad_norm": 4.923302241947984, + "language_loss": 0.77929807, + "learning_rate": 5.330144888357369e-09, + "loss": 0.80072796, + "num_input_tokens_seen": 350849675, + "step": 16257, + "time_per_iteration": 2.578667163848877 + }, + { + "auxiliary_loss_clip": 0.01092876, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.03653014, + "balance_loss_mlp": 1.01965332, + "epoch": 0.9774838418758455, + "flos": 24204905360640.0, + "grad_norm": 1.5736578002879344, + "language_loss": 0.75143224, + "learning_rate": 5.301767901772391e-09, + "loss": 0.77268535, + "num_input_tokens_seen": 350868955, + "step": 16258, + "time_per_iteration": 2.679143190383911 + }, + { + "auxiliary_loss_clip": 0.01019519, + "auxiliary_loss_mlp": 0.01001235, + "balance_loss_clip": 1.00671029, + "balance_loss_mlp": 1.00025165, + "epoch": 0.9775439651285135, + "flos": 66357139829760.0, + "grad_norm": 0.6768337597392673, + "language_loss": 0.59785736, + "learning_rate": 5.273466554344353e-09, + "loss": 0.61806488, + "num_input_tokens_seen": 350935110, + "step": 16259, + "time_per_iteration": 3.1992921829223633 + }, + { + "auxiliary_loss_clip": 0.01093161, + "auxiliary_loss_mlp": 0.0103161, + "balance_loss_clip": 1.03717732, + "balance_loss_mlp": 1.01865828, + "epoch": 0.9776040883811814, + "flos": 22601565976320.0, + "grad_norm": 1.55851171401808, + "language_loss": 0.73553669, + "learning_rate": 5.2452408471461705e-09, + "loss": 0.75678444, + "num_input_tokens_seen": 350953220, + "step": 16260, + "time_per_iteration": 2.639127731323242 + }, + { + "auxiliary_loss_clip": 0.01098909, + "auxiliary_loss_mlp": 0.01032118, + "balance_loss_clip": 1.03642654, + "balance_loss_mlp": 1.01898706, + "epoch": 0.9776642116338494, + "flos": 18442176825600.0, + "grad_norm": 1.9328284113468908, + "language_loss": 0.79923123, + "learning_rate": 5.2170907812485456e-09, + "loss": 0.8205415, + "num_input_tokens_seen": 350971915, + "step": 16261, + "time_per_iteration": 2.5895767211914062 + }, + { + "auxiliary_loss_clip": 0.01099762, + "auxiliary_loss_mlp": 0.01026055, + "balance_loss_clip": 1.03615069, + "balance_loss_mlp": 1.01340127, + "epoch": 0.9777243348865173, + "flos": 22638446265600.0, + "grad_norm": 2.216385126324637, + "language_loss": 0.74283129, + "learning_rate": 5.189016357718845e-09, + "loss": 0.76408947, + "num_input_tokens_seen": 350990470, + "step": 16262, + "time_per_iteration": 2.5935211181640625 + }, + { + "auxiliary_loss_clip": 0.01098991, + "auxiliary_loss_mlp": 0.01033569, + "balance_loss_clip": 1.03628135, + "balance_loss_mlp": 1.01945508, + "epoch": 0.9777844581391854, + "flos": 31321394605440.0, + "grad_norm": 2.37269608442218, + "language_loss": 0.70012951, + "learning_rate": 5.16101757762133e-09, + "loss": 0.7214551, + "num_input_tokens_seen": 351010755, + "step": 16263, + "time_per_iteration": 2.8126862049102783 + }, + { + "auxiliary_loss_clip": 0.01098892, + "auxiliary_loss_mlp": 0.01029577, + "balance_loss_clip": 1.03766048, + "balance_loss_mlp": 1.01819265, + "epoch": 0.9778445813918533, + "flos": 23039101543680.0, + "grad_norm": 2.465472735999823, + "language_loss": 0.66363978, + "learning_rate": 5.133094442018038e-09, + "loss": 0.68492448, + "num_input_tokens_seen": 351029965, + "step": 16264, + "time_per_iteration": 2.6721160411834717 + }, + { + "auxiliary_loss_clip": 0.01063654, + "auxiliary_loss_mlp": 0.01031661, + "balance_loss_clip": 1.03675711, + "balance_loss_mlp": 1.01770782, + "epoch": 0.9779047046445213, + "flos": 17566351505280.0, + "grad_norm": 1.8950171968116294, + "language_loss": 0.73092592, + "learning_rate": 5.105246951967679e-09, + "loss": 0.7518791, + "num_input_tokens_seen": 351046205, + "step": 16265, + "time_per_iteration": 2.7303049564361572 + }, + { + "auxiliary_loss_clip": 0.01095694, + "auxiliary_loss_mlp": 0.01031211, + "balance_loss_clip": 1.03564298, + "balance_loss_mlp": 1.01908779, + "epoch": 0.9779648278971892, + "flos": 20741141975040.0, + "grad_norm": 1.771422811009788, + "language_loss": 0.68976378, + "learning_rate": 5.077475108526297e-09, + "loss": 0.71103287, + "num_input_tokens_seen": 351065390, + "step": 16266, + "time_per_iteration": 4.168168306350708 + }, + { + "auxiliary_loss_clip": 0.01058776, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.03172088, + "balance_loss_mlp": 1.01640654, + "epoch": 0.9780249511498572, + "flos": 21026954494080.0, + "grad_norm": 1.5799613960571957, + "language_loss": 0.86905551, + "learning_rate": 5.049778912747049e-09, + "loss": 0.88992071, + "num_input_tokens_seen": 351084355, + "step": 16267, + "time_per_iteration": 4.231276512145996 + }, + { + "auxiliary_loss_clip": 0.01043069, + "auxiliary_loss_mlp": 0.01029392, + "balance_loss_clip": 1.03164184, + "balance_loss_mlp": 1.01611769, + "epoch": 0.9780850744025251, + "flos": 30774223751040.0, + "grad_norm": 1.9539809119147387, + "language_loss": 0.70374393, + "learning_rate": 5.022158365679985e-09, + "loss": 0.72446853, + "num_input_tokens_seen": 351105870, + "step": 16268, + "time_per_iteration": 4.722951412200928 + }, + { + "auxiliary_loss_clip": 0.01087833, + "auxiliary_loss_mlp": 0.01024552, + "balance_loss_clip": 1.0350287, + "balance_loss_mlp": 1.01256526, + "epoch": 0.9781451976551931, + "flos": 20302995876480.0, + "grad_norm": 1.5865109872612446, + "language_loss": 0.7393145, + "learning_rate": 4.994613468372711e-09, + "loss": 0.76043838, + "num_input_tokens_seen": 351124760, + "step": 16269, + "time_per_iteration": 3.1291208267211914 + }, + { + "auxiliary_loss_clip": 0.01085029, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.03650665, + "balance_loss_mlp": 1.02071679, + "epoch": 0.9782053209078612, + "flos": 24316479982080.0, + "grad_norm": 2.0431692702838613, + "language_loss": 0.70405006, + "learning_rate": 4.967144221869501e-09, + "loss": 0.72524822, + "num_input_tokens_seen": 351142820, + "step": 16270, + "time_per_iteration": 2.6683366298675537 + }, + { + "auxiliary_loss_clip": 0.01110841, + "auxiliary_loss_mlp": 0.01034856, + "balance_loss_clip": 1.03801334, + "balance_loss_mlp": 1.02240467, + "epoch": 0.9782654441605291, + "flos": 32489425065600.0, + "grad_norm": 1.7714650926331987, + "language_loss": 0.63896102, + "learning_rate": 4.939750627212191e-09, + "loss": 0.66041803, + "num_input_tokens_seen": 351164805, + "step": 16271, + "time_per_iteration": 4.501726388931274 + }, + { + "auxiliary_loss_clip": 0.01082074, + "auxiliary_loss_mlp": 0.01033993, + "balance_loss_clip": 1.03665876, + "balance_loss_mlp": 1.02143383, + "epoch": 0.9783255674131971, + "flos": 26979076465920.0, + "grad_norm": 1.43784561644357, + "language_loss": 0.70358956, + "learning_rate": 4.912432685439505e-09, + "loss": 0.72475022, + "num_input_tokens_seen": 351187005, + "step": 16272, + "time_per_iteration": 2.727437734603882 + }, + { + "auxiliary_loss_clip": 0.0105355, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.03778529, + "balance_loss_mlp": 1.02120066, + "epoch": 0.978385690665865, + "flos": 23112251591040.0, + "grad_norm": 1.7381276500973775, + "language_loss": 0.66595173, + "learning_rate": 4.88519039758728e-09, + "loss": 0.68682981, + "num_input_tokens_seen": 351208450, + "step": 16273, + "time_per_iteration": 2.929959774017334 + }, + { + "auxiliary_loss_clip": 0.01075306, + "auxiliary_loss_mlp": 0.01023021, + "balance_loss_clip": 1.03366828, + "balance_loss_mlp": 1.01021206, + "epoch": 0.978445813918533, + "flos": 25409672455680.0, + "grad_norm": 1.7200499959996831, + "language_loss": 0.7406745, + "learning_rate": 4.85802376468869e-09, + "loss": 0.76165771, + "num_input_tokens_seen": 351229585, + "step": 16274, + "time_per_iteration": 2.6932880878448486 + }, + { + "auxiliary_loss_clip": 0.01084441, + "auxiliary_loss_mlp": 0.01029809, + "balance_loss_clip": 1.03532362, + "balance_loss_mlp": 1.01775074, + "epoch": 0.9785059371712009, + "flos": 23550218121600.0, + "grad_norm": 1.6821946772712648, + "language_loss": 0.77833498, + "learning_rate": 4.830932787773579e-09, + "loss": 0.79947746, + "num_input_tokens_seen": 351249525, + "step": 16275, + "time_per_iteration": 2.6410744190216064 + }, + { + "auxiliary_loss_clip": 0.01037951, + "auxiliary_loss_mlp": 0.0103008, + "balance_loss_clip": 1.03559256, + "balance_loss_mlp": 1.01765788, + "epoch": 0.978566060423869, + "flos": 34351177870080.0, + "grad_norm": 2.314426015292287, + "language_loss": 0.71095657, + "learning_rate": 4.803917467869567e-09, + "loss": 0.73163688, + "num_input_tokens_seen": 351272530, + "step": 16276, + "time_per_iteration": 2.91654109954834 + }, + { + "auxiliary_loss_clip": 0.01077494, + "auxiliary_loss_mlp": 0.01032563, + "balance_loss_clip": 1.033566, + "balance_loss_mlp": 1.02052927, + "epoch": 0.9786261836765369, + "flos": 11618862387840.0, + "grad_norm": 2.1039915765233674, + "language_loss": 0.85744458, + "learning_rate": 4.776977806000726e-09, + "loss": 0.87854517, + "num_input_tokens_seen": 351288530, + "step": 16277, + "time_per_iteration": 2.6657748222351074 + }, + { + "auxiliary_loss_clip": 0.01090092, + "auxiliary_loss_mlp": 0.01031629, + "balance_loss_clip": 1.03429365, + "balance_loss_mlp": 1.01809239, + "epoch": 0.9786863069292049, + "flos": 17420949250560.0, + "grad_norm": 1.719720636117993, + "language_loss": 0.70917892, + "learning_rate": 4.7501138031891264e-09, + "loss": 0.73039615, + "num_input_tokens_seen": 351305890, + "step": 16278, + "time_per_iteration": 2.5898592472076416 + }, + { + "auxiliary_loss_clip": 0.01087893, + "auxiliary_loss_mlp": 0.01034892, + "balance_loss_clip": 1.03455925, + "balance_loss_mlp": 1.02192235, + "epoch": 0.9787464301818728, + "flos": 20844923345280.0, + "grad_norm": 1.8454549463354188, + "language_loss": 0.84530413, + "learning_rate": 4.723325460453065e-09, + "loss": 0.86653197, + "num_input_tokens_seen": 351325010, + "step": 16279, + "time_per_iteration": 2.6659061908721924 + }, + { + "auxiliary_loss_clip": 0.01096633, + "auxiliary_loss_mlp": 0.01032409, + "balance_loss_clip": 1.0337534, + "balance_loss_mlp": 1.01924253, + "epoch": 0.9788065534345408, + "flos": 18222942165120.0, + "grad_norm": 2.8453433920494753, + "language_loss": 0.79117471, + "learning_rate": 4.696612778808395e-09, + "loss": 0.81246513, + "num_input_tokens_seen": 351343060, + "step": 16280, + "time_per_iteration": 2.636876106262207 + }, + { + "auxiliary_loss_clip": 0.01064547, + "auxiliary_loss_mlp": 0.01034092, + "balance_loss_clip": 1.03490829, + "balance_loss_mlp": 1.02280319, + "epoch": 0.9788666766872087, + "flos": 21578219498880.0, + "grad_norm": 2.843907217465553, + "language_loss": 0.79550928, + "learning_rate": 4.669975759268085e-09, + "loss": 0.81649566, + "num_input_tokens_seen": 351363260, + "step": 16281, + "time_per_iteration": 2.710759162902832 + }, + { + "auxiliary_loss_clip": 0.01096946, + "auxiliary_loss_mlp": 0.01032797, + "balance_loss_clip": 1.03685427, + "balance_loss_mlp": 1.01961863, + "epoch": 0.9789267999398767, + "flos": 24900495212160.0, + "grad_norm": 1.5976604846892302, + "language_loss": 0.80062044, + "learning_rate": 4.643414402842216e-09, + "loss": 0.82191795, + "num_input_tokens_seen": 351382610, + "step": 16282, + "time_per_iteration": 2.6593406200408936 + }, + { + "auxiliary_loss_clip": 0.0108946, + "auxiliary_loss_mlp": 0.01043417, + "balance_loss_clip": 1.03729296, + "balance_loss_mlp": 1.03109717, + "epoch": 0.9789869231925448, + "flos": 19573111514880.0, + "grad_norm": 1.9818178483973194, + "language_loss": 0.82860035, + "learning_rate": 4.616928710538204e-09, + "loss": 0.84992909, + "num_input_tokens_seen": 351401075, + "step": 16283, + "time_per_iteration": 2.696199655532837 + }, + { + "auxiliary_loss_clip": 0.01092588, + "auxiliary_loss_mlp": 0.01034978, + "balance_loss_clip": 1.0365355, + "balance_loss_mlp": 1.02216268, + "epoch": 0.9790470464452127, + "flos": 16796641939200.0, + "grad_norm": 1.8242453499893954, + "language_loss": 0.71959805, + "learning_rate": 4.590518683360134e-09, + "loss": 0.74087369, + "num_input_tokens_seen": 351419275, + "step": 16284, + "time_per_iteration": 2.651407241821289 + }, + { + "auxiliary_loss_clip": 0.01094663, + "auxiliary_loss_mlp": 0.01033179, + "balance_loss_clip": 1.03582513, + "balance_loss_mlp": 1.02136493, + "epoch": 0.9791071696978807, + "flos": 18369350000640.0, + "grad_norm": 1.8718782161226852, + "language_loss": 0.64339489, + "learning_rate": 4.56418432230965e-09, + "loss": 0.66467333, + "num_input_tokens_seen": 351437375, + "step": 16285, + "time_per_iteration": 2.651705026626587 + }, + { + "auxiliary_loss_clip": 0.01084456, + "auxiliary_loss_mlp": 0.01031691, + "balance_loss_clip": 1.0361805, + "balance_loss_mlp": 1.01931071, + "epoch": 0.9791672929505486, + "flos": 24170323541760.0, + "grad_norm": 1.7019695394425336, + "language_loss": 0.70606256, + "learning_rate": 4.537925628385286e-09, + "loss": 0.72722405, + "num_input_tokens_seen": 351457810, + "step": 16286, + "time_per_iteration": 2.6652472019195557 + }, + { + "auxiliary_loss_clip": 0.01091075, + "auxiliary_loss_mlp": 0.01032716, + "balance_loss_clip": 1.03533125, + "balance_loss_mlp": 1.02069998, + "epoch": 0.9792274162032166, + "flos": 24354114456960.0, + "grad_norm": 2.0320936196191526, + "language_loss": 0.58058381, + "learning_rate": 4.511742602582691e-09, + "loss": 0.60182172, + "num_input_tokens_seen": 351478825, + "step": 16287, + "time_per_iteration": 2.617825746536255 + }, + { + "auxiliary_loss_clip": 0.01096267, + "auxiliary_loss_mlp": 0.01035937, + "balance_loss_clip": 1.0363822, + "balance_loss_mlp": 1.02341986, + "epoch": 0.9792875394558845, + "flos": 26395779507840.0, + "grad_norm": 1.7210061810048285, + "language_loss": 0.81500298, + "learning_rate": 4.485635245894626e-09, + "loss": 0.83632499, + "num_input_tokens_seen": 351498785, + "step": 16288, + "time_per_iteration": 2.657498359680176 + }, + { + "auxiliary_loss_clip": 0.01082554, + "auxiliary_loss_mlp": 0.00771248, + "balance_loss_clip": 1.03415895, + "balance_loss_mlp": 1.00014818, + "epoch": 0.9793476627085526, + "flos": 28148004766080.0, + "grad_norm": 1.396084239179073, + "language_loss": 0.71853596, + "learning_rate": 4.459603559311631e-09, + "loss": 0.73707396, + "num_input_tokens_seen": 351520235, + "step": 16289, + "time_per_iteration": 2.8403937816619873 + }, + { + "auxiliary_loss_clip": 0.01073083, + "auxiliary_loss_mlp": 0.01036624, + "balance_loss_clip": 1.03831482, + "balance_loss_mlp": 1.02417815, + "epoch": 0.9794077859612205, + "flos": 16763927627520.0, + "grad_norm": 2.8328336335773523, + "language_loss": 0.75429696, + "learning_rate": 4.43364754382003e-09, + "loss": 0.77539402, + "num_input_tokens_seen": 351538900, + "step": 16290, + "time_per_iteration": 2.6202123165130615 + }, + { + "auxiliary_loss_clip": 0.01099176, + "auxiliary_loss_mlp": 0.01032346, + "balance_loss_clip": 1.03652453, + "balance_loss_mlp": 1.01942921, + "epoch": 0.9794679092138885, + "flos": 19280834547840.0, + "grad_norm": 1.5711405452036733, + "language_loss": 0.6725769, + "learning_rate": 4.4077672004048105e-09, + "loss": 0.69389218, + "num_input_tokens_seen": 351558715, + "step": 16291, + "time_per_iteration": 2.5787715911865234 + }, + { + "auxiliary_loss_clip": 0.01111756, + "auxiliary_loss_mlp": 0.00770961, + "balance_loss_clip": 1.03711116, + "balance_loss_mlp": 1.00023437, + "epoch": 0.9795280324665564, + "flos": 32156640535680.0, + "grad_norm": 1.7868335154862072, + "language_loss": 0.63048244, + "learning_rate": 4.3819625300467456e-09, + "loss": 0.64930964, + "num_input_tokens_seen": 351578450, + "step": 16292, + "time_per_iteration": 2.6509621143341064 + }, + { + "auxiliary_loss_clip": 0.01072425, + "auxiliary_loss_mlp": 0.01030776, + "balance_loss_clip": 1.03524005, + "balance_loss_mlp": 1.01897478, + "epoch": 0.9795881557192244, + "flos": 19060953442560.0, + "grad_norm": 1.8984825692794804, + "language_loss": 0.73462898, + "learning_rate": 4.356233533724829e-09, + "loss": 0.75566101, + "num_input_tokens_seen": 351597195, + "step": 16293, + "time_per_iteration": 2.64638614654541 + }, + { + "auxiliary_loss_clip": 0.01100837, + "auxiliary_loss_mlp": 0.01030263, + "balance_loss_clip": 1.03659904, + "balance_loss_mlp": 1.01774597, + "epoch": 0.9796482789718923, + "flos": 28329928174080.0, + "grad_norm": 1.6810560431936798, + "language_loss": 0.84062809, + "learning_rate": 4.330580212414503e-09, + "loss": 0.86193907, + "num_input_tokens_seen": 351617460, + "step": 16294, + "time_per_iteration": 2.614396095275879 + }, + { + "auxiliary_loss_clip": 0.01071284, + "auxiliary_loss_mlp": 0.01032864, + "balance_loss_clip": 1.03327644, + "balance_loss_mlp": 1.02134895, + "epoch": 0.9797084022245603, + "flos": 17967976450560.0, + "grad_norm": 2.3230891850787168, + "language_loss": 0.71972656, + "learning_rate": 4.305002567088767e-09, + "loss": 0.74076802, + "num_input_tokens_seen": 351635900, + "step": 16295, + "time_per_iteration": 2.6593565940856934 + }, + { + "auxiliary_loss_clip": 0.01103524, + "auxiliary_loss_mlp": 0.01035987, + "balance_loss_clip": 1.0375762, + "balance_loss_mlp": 1.02305257, + "epoch": 0.9797685254772284, + "flos": 20266726118400.0, + "grad_norm": 1.590993337993389, + "language_loss": 0.80806482, + "learning_rate": 4.2795005987170674e-09, + "loss": 0.82946002, + "num_input_tokens_seen": 351655400, + "step": 16296, + "time_per_iteration": 2.571876287460327 + }, + { + "auxiliary_loss_clip": 0.01079454, + "auxiliary_loss_mlp": 0.01033969, + "balance_loss_clip": 1.03264189, + "balance_loss_mlp": 1.02170789, + "epoch": 0.9798286487298963, + "flos": 26907147480960.0, + "grad_norm": 2.4272229698670986, + "language_loss": 0.75518107, + "learning_rate": 4.254074308266853e-09, + "loss": 0.77631521, + "num_input_tokens_seen": 351675505, + "step": 16297, + "time_per_iteration": 2.737135410308838 + }, + { + "auxiliary_loss_clip": 0.01097573, + "auxiliary_loss_mlp": 0.01036214, + "balance_loss_clip": 1.03408372, + "balance_loss_mlp": 1.02367926, + "epoch": 0.9798887719825643, + "flos": 27161071701120.0, + "grad_norm": 2.8449878357962, + "language_loss": 0.78244084, + "learning_rate": 4.228723696702019e-09, + "loss": 0.80377865, + "num_input_tokens_seen": 351697920, + "step": 16298, + "time_per_iteration": 2.662205457687378 + }, + { + "auxiliary_loss_clip": 0.01092637, + "auxiliary_loss_mlp": 0.01026617, + "balance_loss_clip": 1.03448844, + "balance_loss_mlp": 1.01479197, + "epoch": 0.9799488952352322, + "flos": 20668422890880.0, + "grad_norm": 1.5107423407180305, + "language_loss": 0.72837794, + "learning_rate": 4.203448764984019e-09, + "loss": 0.74957049, + "num_input_tokens_seen": 351717615, + "step": 16299, + "time_per_iteration": 2.6172263622283936 + }, + { + "auxiliary_loss_clip": 0.01084816, + "auxiliary_loss_mlp": 0.01028705, + "balance_loss_clip": 1.0337168, + "balance_loss_mlp": 1.01527619, + "epoch": 0.9800090184879002, + "flos": 21981209160960.0, + "grad_norm": 2.196732565554413, + "language_loss": 0.89433563, + "learning_rate": 4.178249514071419e-09, + "loss": 0.91547084, + "num_input_tokens_seen": 351735260, + "step": 16300, + "time_per_iteration": 2.665531873703003 + }, + { + "auxiliary_loss_clip": 0.01099488, + "auxiliary_loss_mlp": 0.01029393, + "balance_loss_clip": 1.03554893, + "balance_loss_mlp": 1.01669717, + "epoch": 0.9800691417405681, + "flos": 21288420570240.0, + "grad_norm": 3.318186041205299, + "language_loss": 0.7811656, + "learning_rate": 4.1531259449194555e-09, + "loss": 0.80245435, + "num_input_tokens_seen": 351755800, + "step": 16301, + "time_per_iteration": 2.6590991020202637 + }, + { + "auxiliary_loss_clip": 0.01085984, + "auxiliary_loss_mlp": 0.01035207, + "balance_loss_clip": 1.03470898, + "balance_loss_mlp": 1.022452, + "epoch": 0.9801292649932362, + "flos": 18439878355200.0, + "grad_norm": 2.4753221911438525, + "language_loss": 0.75696325, + "learning_rate": 4.128078058480921e-09, + "loss": 0.77817523, + "num_input_tokens_seen": 351774790, + "step": 16302, + "time_per_iteration": 2.5974133014678955 + }, + { + "auxiliary_loss_clip": 0.01080371, + "auxiliary_loss_mlp": 0.01032591, + "balance_loss_clip": 1.03640592, + "balance_loss_mlp": 1.01979423, + "epoch": 0.9801893882459041, + "flos": 25046364343680.0, + "grad_norm": 1.7046850739781914, + "language_loss": 0.79628474, + "learning_rate": 4.103105855705724e-09, + "loss": 0.8174144, + "num_input_tokens_seen": 351792855, + "step": 16303, + "time_per_iteration": 2.6679980754852295 + }, + { + "auxiliary_loss_clip": 0.01066992, + "auxiliary_loss_mlp": 0.0103858, + "balance_loss_clip": 1.03263092, + "balance_loss_mlp": 1.02466226, + "epoch": 0.9802495114985721, + "flos": 18511484117760.0, + "grad_norm": 2.0636991947077696, + "language_loss": 0.83625126, + "learning_rate": 4.078209337540883e-09, + "loss": 0.85730696, + "num_input_tokens_seen": 351811450, + "step": 16304, + "time_per_iteration": 2.6905360221862793 + }, + { + "auxiliary_loss_clip": 0.01070996, + "auxiliary_loss_mlp": 0.01026297, + "balance_loss_clip": 1.03549314, + "balance_loss_mlp": 1.01519823, + "epoch": 0.98030963475124, + "flos": 21469841187840.0, + "grad_norm": 1.8378321137403202, + "language_loss": 0.70343494, + "learning_rate": 4.053388504930089e-09, + "loss": 0.72440791, + "num_input_tokens_seen": 351831960, + "step": 16305, + "time_per_iteration": 2.745544910430908 + }, + { + "auxiliary_loss_clip": 0.0107968, + "auxiliary_loss_mlp": 0.01040728, + "balance_loss_clip": 1.03601217, + "balance_loss_mlp": 1.0259459, + "epoch": 0.980369758003908, + "flos": 20412272027520.0, + "grad_norm": 2.7789217747629182, + "language_loss": 0.71784663, + "learning_rate": 4.028643358815032e-09, + "loss": 0.73905075, + "num_input_tokens_seen": 351851585, + "step": 16306, + "time_per_iteration": 4.391748905181885 + }, + { + "auxiliary_loss_clip": 0.01080084, + "auxiliary_loss_mlp": 0.01032092, + "balance_loss_clip": 1.03247881, + "balance_loss_mlp": 1.02032626, + "epoch": 0.9804298812565759, + "flos": 23399177431680.0, + "grad_norm": 1.5354293339216485, + "language_loss": 0.73557943, + "learning_rate": 4.00397390013385e-09, + "loss": 0.75670117, + "num_input_tokens_seen": 351871085, + "step": 16307, + "time_per_iteration": 4.338375091552734 + }, + { + "auxiliary_loss_clip": 0.01076228, + "auxiliary_loss_mlp": 0.01030733, + "balance_loss_clip": 1.03920865, + "balance_loss_mlp": 1.01993847, + "epoch": 0.980490004509244, + "flos": 23292666627840.0, + "grad_norm": 1.5555541089180664, + "language_loss": 0.74765921, + "learning_rate": 3.979380129822018e-09, + "loss": 0.76872879, + "num_input_tokens_seen": 351891775, + "step": 16308, + "time_per_iteration": 2.79581618309021 + }, + { + "auxiliary_loss_clip": 0.01007996, + "auxiliary_loss_mlp": 0.0100217, + "balance_loss_clip": 1.0048188, + "balance_loss_mlp": 1.00120437, + "epoch": 0.980550127761912, + "flos": 56051027798400.0, + "grad_norm": 0.7557884098405707, + "language_loss": 0.57835835, + "learning_rate": 3.954862048811902e-09, + "loss": 0.59845996, + "num_input_tokens_seen": 351946770, + "step": 16309, + "time_per_iteration": 3.0556421279907227 + }, + { + "auxiliary_loss_clip": 0.01065215, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.03367877, + "balance_loss_mlp": 1.02015948, + "epoch": 0.9806102510145799, + "flos": 25333290184320.0, + "grad_norm": 1.8451853001469216, + "language_loss": 0.66008574, + "learning_rate": 3.930419658033646e-09, + "loss": 0.68106461, + "num_input_tokens_seen": 351966155, + "step": 16310, + "time_per_iteration": 2.729114055633545 + }, + { + "auxiliary_loss_clip": 0.01008303, + "auxiliary_loss_mlp": 0.01000216, + "balance_loss_clip": 1.00770998, + "balance_loss_mlp": 0.99920315, + "epoch": 0.9806703742672479, + "flos": 67274837429760.0, + "grad_norm": 1.0882913527970195, + "language_loss": 0.54503131, + "learning_rate": 3.906052958413841e-09, + "loss": 0.56511647, + "num_input_tokens_seen": 352031655, + "step": 16311, + "time_per_iteration": 4.943628311157227 + }, + { + "auxiliary_loss_clip": 0.01095664, + "auxiliary_loss_mlp": 0.01027322, + "balance_loss_clip": 1.0345304, + "balance_loss_mlp": 1.01559234, + "epoch": 0.9807304975199158, + "flos": 25228970110080.0, + "grad_norm": 2.5422868238543836, + "language_loss": 0.79856956, + "learning_rate": 3.881761950876638e-09, + "loss": 0.81979948, + "num_input_tokens_seen": 352051920, + "step": 16312, + "time_per_iteration": 2.635751247406006 + }, + { + "auxiliary_loss_clip": 0.0108546, + "auxiliary_loss_mlp": 0.01029669, + "balance_loss_clip": 1.03607917, + "balance_loss_mlp": 1.01784945, + "epoch": 0.9807906207725838, + "flos": 17456392995840.0, + "grad_norm": 1.855062658283189, + "language_loss": 0.6311661, + "learning_rate": 3.8575466363430785e-09, + "loss": 0.65231735, + "num_input_tokens_seen": 352069315, + "step": 16313, + "time_per_iteration": 2.71441650390625 + }, + { + "auxiliary_loss_clip": 0.01098236, + "auxiliary_loss_mlp": 0.01030326, + "balance_loss_clip": 1.03771138, + "balance_loss_mlp": 1.01765394, + "epoch": 0.9808507440252517, + "flos": 21032413361280.0, + "grad_norm": 2.0090087344647496, + "language_loss": 0.72602594, + "learning_rate": 3.833407015731316e-09, + "loss": 0.74731159, + "num_input_tokens_seen": 352089480, + "step": 16314, + "time_per_iteration": 2.789362907409668 + }, + { + "auxiliary_loss_clip": 0.01003668, + "auxiliary_loss_mlp": 0.01002, + "balance_loss_clip": 1.01054919, + "balance_loss_mlp": 1.00098097, + "epoch": 0.9809108672779198, + "flos": 64044491598720.0, + "grad_norm": 0.6894102027306396, + "language_loss": 0.51673484, + "learning_rate": 3.80934308995684e-09, + "loss": 0.53679156, + "num_input_tokens_seen": 352150000, + "step": 16315, + "time_per_iteration": 3.215070962905884 + }, + { + "auxiliary_loss_clip": 0.01097221, + "auxiliary_loss_mlp": 0.01032423, + "balance_loss_clip": 1.03522766, + "balance_loss_mlp": 1.02035928, + "epoch": 0.9809709905305877, + "flos": 22780616296320.0, + "grad_norm": 1.4165501611522262, + "language_loss": 0.69878519, + "learning_rate": 3.785354859932033e-09, + "loss": 0.72008169, + "num_input_tokens_seen": 352170990, + "step": 16316, + "time_per_iteration": 2.677259683609009 + }, + { + "auxiliary_loss_clip": 0.0111046, + "auxiliary_loss_mlp": 0.01032289, + "balance_loss_clip": 1.03727913, + "balance_loss_mlp": 1.02019501, + "epoch": 0.9810311137832557, + "flos": 37013415217920.0, + "grad_norm": 2.664112062764968, + "language_loss": 0.55067998, + "learning_rate": 3.76144232656661e-09, + "loss": 0.57210749, + "num_input_tokens_seen": 352195335, + "step": 16317, + "time_per_iteration": 2.7027530670166016 + }, + { + "auxiliary_loss_clip": 0.01052915, + "auxiliary_loss_mlp": 0.01035996, + "balance_loss_clip": 1.02941895, + "balance_loss_mlp": 1.02321708, + "epoch": 0.9810912370359236, + "flos": 18916305373440.0, + "grad_norm": 1.7269155229815298, + "language_loss": 0.73437709, + "learning_rate": 3.737605490767404e-09, + "loss": 0.75526619, + "num_input_tokens_seen": 352214170, + "step": 16318, + "time_per_iteration": 2.7383875846862793 + }, + { + "auxiliary_loss_clip": 0.01082811, + "auxiliary_loss_mlp": 0.01027164, + "balance_loss_clip": 1.0344367, + "balance_loss_mlp": 1.01589835, + "epoch": 0.9811513602885916, + "flos": 18441602208000.0, + "grad_norm": 2.1831479646107597, + "language_loss": 0.82135093, + "learning_rate": 3.7138443534383555e-09, + "loss": 0.84245068, + "num_input_tokens_seen": 352231470, + "step": 16319, + "time_per_iteration": 2.6357314586639404 + }, + { + "auxiliary_loss_clip": 0.01018205, + "auxiliary_loss_mlp": 0.01008734, + "balance_loss_clip": 1.00481987, + "balance_loss_mlp": 1.00751829, + "epoch": 0.9812114835412595, + "flos": 68058945371520.0, + "grad_norm": 0.723170548219491, + "language_loss": 0.5353533, + "learning_rate": 3.6901589154803014e-09, + "loss": 0.55562276, + "num_input_tokens_seen": 352291770, + "step": 16320, + "time_per_iteration": 3.0364413261413574 + }, + { + "auxiliary_loss_clip": 0.01057502, + "auxiliary_loss_mlp": 0.01036848, + "balance_loss_clip": 1.03194261, + "balance_loss_mlp": 1.02422416, + "epoch": 0.9812716067939276, + "flos": 25373007648000.0, + "grad_norm": 2.190666128056564, + "language_loss": 0.73492098, + "learning_rate": 3.6665491777914116e-09, + "loss": 0.7558645, + "num_input_tokens_seen": 352310735, + "step": 16321, + "time_per_iteration": 2.7965734004974365 + }, + { + "auxiliary_loss_clip": 0.0108786, + "auxiliary_loss_mlp": 0.0103188, + "balance_loss_clip": 1.03798234, + "balance_loss_mlp": 1.01972055, + "epoch": 0.9813317300465956, + "flos": 22856818999680.0, + "grad_norm": 1.5299966395206919, + "language_loss": 0.78483856, + "learning_rate": 3.6430151412669698e-09, + "loss": 0.806036, + "num_input_tokens_seen": 352329545, + "step": 16322, + "time_per_iteration": 2.762363910675049 + }, + { + "auxiliary_loss_clip": 0.0109714, + "auxiliary_loss_mlp": 0.01034816, + "balance_loss_clip": 1.03686166, + "balance_loss_mlp": 1.02228689, + "epoch": 0.9813918532992635, + "flos": 23586954756480.0, + "grad_norm": 1.7335029741380326, + "language_loss": 0.81064153, + "learning_rate": 3.619556806799595e-09, + "loss": 0.8319611, + "num_input_tokens_seen": 352352080, + "step": 16323, + "time_per_iteration": 2.674591541290283 + }, + { + "auxiliary_loss_clip": 0.01110489, + "auxiliary_loss_mlp": 0.01030491, + "balance_loss_clip": 1.03752804, + "balance_loss_mlp": 1.01852298, + "epoch": 0.9814519765519315, + "flos": 19606328616960.0, + "grad_norm": 2.350364849870627, + "language_loss": 0.84632325, + "learning_rate": 3.596174175278799e-09, + "loss": 0.86773306, + "num_input_tokens_seen": 352366455, + "step": 16324, + "time_per_iteration": 2.5407159328460693 + }, + { + "auxiliary_loss_clip": 0.01086747, + "auxiliary_loss_mlp": 0.01033784, + "balance_loss_clip": 1.03741539, + "balance_loss_mlp": 1.02086782, + "epoch": 0.9815120998045994, + "flos": 33946284787200.0, + "grad_norm": 1.4316902818633324, + "language_loss": 0.74605346, + "learning_rate": 3.5728672475909827e-09, + "loss": 0.76725876, + "num_input_tokens_seen": 352386090, + "step": 16325, + "time_per_iteration": 2.817761182785034 + }, + { + "auxiliary_loss_clip": 0.01056448, + "auxiliary_loss_mlp": 0.01032925, + "balance_loss_clip": 1.03592491, + "balance_loss_mlp": 1.02158785, + "epoch": 0.9815722230572674, + "flos": 20850023076480.0, + "grad_norm": 1.5890667781038148, + "language_loss": 0.7638427, + "learning_rate": 3.5496360246201063e-09, + "loss": 0.78473639, + "num_input_tokens_seen": 352404000, + "step": 16326, + "time_per_iteration": 2.804213523864746 + }, + { + "auxiliary_loss_clip": 0.01075422, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.03580999, + "balance_loss_mlp": 1.01923585, + "epoch": 0.9816323463099353, + "flos": 22894525301760.0, + "grad_norm": 2.465136585192098, + "language_loss": 0.67442954, + "learning_rate": 3.5264805072470205e-09, + "loss": 0.69550526, + "num_input_tokens_seen": 352423540, + "step": 16327, + "time_per_iteration": 2.725055694580078 + }, + { + "auxiliary_loss_clip": 0.01102074, + "auxiliary_loss_mlp": 0.01036968, + "balance_loss_clip": 1.03595459, + "balance_loss_mlp": 1.0239681, + "epoch": 0.9816924695626034, + "flos": 31539444117120.0, + "grad_norm": 1.5972210745113198, + "language_loss": 0.73710746, + "learning_rate": 3.5034006963501337e-09, + "loss": 0.75849789, + "num_input_tokens_seen": 352445530, + "step": 16328, + "time_per_iteration": 2.739084243774414 + }, + { + "auxiliary_loss_clip": 0.01091132, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.03452396, + "balance_loss_mlp": 1.0219171, + "epoch": 0.9817525928152713, + "flos": 21506901045120.0, + "grad_norm": 1.7593287132982667, + "language_loss": 0.8105092, + "learning_rate": 3.4803965928040802e-09, + "loss": 0.83177972, + "num_input_tokens_seen": 352466325, + "step": 16329, + "time_per_iteration": 2.6751110553741455 + }, + { + "auxiliary_loss_clip": 0.0111119, + "auxiliary_loss_mlp": 0.01031516, + "balance_loss_clip": 1.03624225, + "balance_loss_mlp": 1.01837909, + "epoch": 0.9818127160679393, + "flos": 25550513683200.0, + "grad_norm": 3.221253947949931, + "language_loss": 0.75986403, + "learning_rate": 3.4574681974817168e-09, + "loss": 0.78129113, + "num_input_tokens_seen": 352485505, + "step": 16330, + "time_per_iteration": 2.6681814193725586 + }, + { + "auxiliary_loss_clip": 0.01117551, + "auxiliary_loss_mlp": 0.01032164, + "balance_loss_clip": 1.03826082, + "balance_loss_mlp": 1.01716757, + "epoch": 0.9818728393206072, + "flos": 28803661672320.0, + "grad_norm": 2.4256142149996562, + "language_loss": 0.66364849, + "learning_rate": 3.434615511252126e-09, + "loss": 0.68514568, + "num_input_tokens_seen": 352505360, + "step": 16331, + "time_per_iteration": 2.703917980194092 + }, + { + "auxiliary_loss_clip": 0.01095043, + "auxiliary_loss_mlp": 0.0102903, + "balance_loss_clip": 1.03584874, + "balance_loss_mlp": 1.01704907, + "epoch": 0.9819329625732752, + "flos": 23222246014080.0, + "grad_norm": 1.857287483122114, + "language_loss": 0.73337162, + "learning_rate": 3.411838534981948e-09, + "loss": 0.75461233, + "num_input_tokens_seen": 352524035, + "step": 16332, + "time_per_iteration": 2.650766611099243 + }, + { + "auxiliary_loss_clip": 0.01097564, + "auxiliary_loss_mlp": 0.01029848, + "balance_loss_clip": 1.03841019, + "balance_loss_mlp": 1.01876128, + "epoch": 0.9819930858259431, + "flos": 17530440883200.0, + "grad_norm": 1.7088986460158127, + "language_loss": 0.76663387, + "learning_rate": 3.389137269534936e-09, + "loss": 0.78790796, + "num_input_tokens_seen": 352543210, + "step": 16333, + "time_per_iteration": 2.6083765029907227 + }, + { + "auxiliary_loss_clip": 0.01091914, + "auxiliary_loss_mlp": 0.00769838, + "balance_loss_clip": 1.03712809, + "balance_loss_mlp": 1.00018179, + "epoch": 0.9820532090786112, + "flos": 12529915971840.0, + "grad_norm": 2.124926384042051, + "language_loss": 0.72888857, + "learning_rate": 3.366511715771958e-09, + "loss": 0.74750608, + "num_input_tokens_seen": 352559770, + "step": 16334, + "time_per_iteration": 2.641460657119751 + }, + { + "auxiliary_loss_clip": 0.01059033, + "auxiliary_loss_mlp": 0.01035338, + "balance_loss_clip": 1.03467429, + "balance_loss_mlp": 1.02285099, + "epoch": 0.9821133323312792, + "flos": 18840174497280.0, + "grad_norm": 2.150602428971571, + "language_loss": 0.78196549, + "learning_rate": 3.3439618745509934e-09, + "loss": 0.8029092, + "num_input_tokens_seen": 352577690, + "step": 16335, + "time_per_iteration": 2.813981056213379 + }, + { + "auxiliary_loss_clip": 0.01084888, + "auxiliary_loss_mlp": 0.01042166, + "balance_loss_clip": 1.03453565, + "balance_loss_mlp": 1.02693129, + "epoch": 0.9821734555839471, + "flos": 34824013528320.0, + "grad_norm": 1.9795504478924333, + "language_loss": 0.63792658, + "learning_rate": 3.3214877467271362e-09, + "loss": 0.65919709, + "num_input_tokens_seen": 352598850, + "step": 16336, + "time_per_iteration": 2.8951098918914795 + }, + { + "auxiliary_loss_clip": 0.01077961, + "auxiliary_loss_mlp": 0.0103778, + "balance_loss_clip": 1.03655946, + "balance_loss_mlp": 1.02337968, + "epoch": 0.9822335788366151, + "flos": 17128169493120.0, + "grad_norm": 2.0134726146913517, + "language_loss": 0.73876464, + "learning_rate": 3.299089333152372e-09, + "loss": 0.75992203, + "num_input_tokens_seen": 352616130, + "step": 16337, + "time_per_iteration": 2.7202372550964355 + }, + { + "auxiliary_loss_clip": 0.0109231, + "auxiliary_loss_mlp": 0.01031669, + "balance_loss_clip": 1.03548503, + "balance_loss_mlp": 1.01803732, + "epoch": 0.982293702089283, + "flos": 20813250528000.0, + "grad_norm": 1.6907700121861502, + "language_loss": 0.72918296, + "learning_rate": 3.2767666346764645e-09, + "loss": 0.75042278, + "num_input_tokens_seen": 352636885, + "step": 16338, + "time_per_iteration": 2.5943961143493652 + }, + { + "auxiliary_loss_clip": 0.0104046, + "auxiliary_loss_mlp": 0.01032977, + "balance_loss_clip": 1.03025174, + "balance_loss_mlp": 1.02005458, + "epoch": 0.982353825341951, + "flos": 24680829588480.0, + "grad_norm": 1.7984966178479147, + "language_loss": 0.81313229, + "learning_rate": 3.2545196521454045e-09, + "loss": 0.83386666, + "num_input_tokens_seen": 352657905, + "step": 16339, + "time_per_iteration": 2.8950557708740234 + }, + { + "auxiliary_loss_clip": 0.01054842, + "auxiliary_loss_mlp": 0.01039766, + "balance_loss_clip": 1.02929127, + "balance_loss_mlp": 1.02653337, + "epoch": 0.982413948594619, + "flos": 20850489953280.0, + "grad_norm": 1.809625302780829, + "language_loss": 0.62418073, + "learning_rate": 3.232348386403405e-09, + "loss": 0.64512682, + "num_input_tokens_seen": 352676320, + "step": 16340, + "time_per_iteration": 2.8046703338623047 + }, + { + "auxiliary_loss_clip": 0.01112791, + "auxiliary_loss_mlp": 0.01031705, + "balance_loss_clip": 1.03891397, + "balance_loss_mlp": 1.01859832, + "epoch": 0.982474071847287, + "flos": 15377380778880.0, + "grad_norm": 2.356487189204491, + "language_loss": 0.86053795, + "learning_rate": 3.2102528382904613e-09, + "loss": 0.88198292, + "num_input_tokens_seen": 352692665, + "step": 16341, + "time_per_iteration": 2.60111403465271 + }, + { + "auxiliary_loss_clip": 0.0108126, + "auxiliary_loss_mlp": 0.01031684, + "balance_loss_clip": 1.03337705, + "balance_loss_mlp": 1.01934528, + "epoch": 0.9825341950999549, + "flos": 23774732081280.0, + "grad_norm": 1.4139542605019915, + "language_loss": 0.66917169, + "learning_rate": 3.188233008645014e-09, + "loss": 0.69030112, + "num_input_tokens_seen": 352716130, + "step": 16342, + "time_per_iteration": 3.006946325302124 + }, + { + "auxiliary_loss_clip": 0.01109167, + "auxiliary_loss_mlp": 0.01027299, + "balance_loss_clip": 1.03658962, + "balance_loss_mlp": 1.0151639, + "epoch": 0.9825943183526229, + "flos": 22746285872640.0, + "grad_norm": 1.5649008890047298, + "language_loss": 0.77261454, + "learning_rate": 3.16628889830195e-09, + "loss": 0.79397917, + "num_input_tokens_seen": 352734705, + "step": 16343, + "time_per_iteration": 2.623782157897949 + }, + { + "auxiliary_loss_clip": 0.01074162, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.03596878, + "balance_loss_mlp": 1.01949716, + "epoch": 0.9826544416052908, + "flos": 27709966408320.0, + "grad_norm": 1.540067239801228, + "language_loss": 0.75307328, + "learning_rate": 3.1444205080932707e-09, + "loss": 0.77411795, + "num_input_tokens_seen": 352756225, + "step": 16344, + "time_per_iteration": 2.747864007949829 + }, + { + "auxiliary_loss_clip": 0.0108221, + "auxiliary_loss_mlp": 0.01029576, + "balance_loss_clip": 1.03211427, + "balance_loss_mlp": 1.01698792, + "epoch": 0.9827145648579588, + "flos": 26941657472640.0, + "grad_norm": 3.214033329820644, + "language_loss": 0.66152173, + "learning_rate": 3.122627838848313e-09, + "loss": 0.6826396, + "num_input_tokens_seen": 352776210, + "step": 16345, + "time_per_iteration": 4.445494651794434 + }, + { + "auxiliary_loss_clip": 0.01092474, + "auxiliary_loss_mlp": 0.01026144, + "balance_loss_clip": 1.03578293, + "balance_loss_mlp": 1.0152061, + "epoch": 0.9827746881106267, + "flos": 21866545969920.0, + "grad_norm": 1.4391085603801235, + "language_loss": 0.79666579, + "learning_rate": 3.1009108913933045e-09, + "loss": 0.81785202, + "num_input_tokens_seen": 352795455, + "step": 16346, + "time_per_iteration": 4.2288713455200195 + }, + { + "auxiliary_loss_clip": 0.01098997, + "auxiliary_loss_mlp": 0.01037578, + "balance_loss_clip": 1.03740525, + "balance_loss_mlp": 1.02411294, + "epoch": 0.9828348113632948, + "flos": 20850777262080.0, + "grad_norm": 2.0938671424216024, + "language_loss": 0.75089842, + "learning_rate": 3.079269666552031e-09, + "loss": 0.77226412, + "num_input_tokens_seen": 352812895, + "step": 16347, + "time_per_iteration": 2.571201801300049 + }, + { + "auxiliary_loss_clip": 0.01033873, + "auxiliary_loss_mlp": 0.01036396, + "balance_loss_clip": 1.02937937, + "balance_loss_mlp": 1.02430809, + "epoch": 0.9828949346159628, + "flos": 34569227381760.0, + "grad_norm": 1.7026010770508515, + "language_loss": 0.66808671, + "learning_rate": 3.0577041651449474e-09, + "loss": 0.68878937, + "num_input_tokens_seen": 352835470, + "step": 16348, + "time_per_iteration": 2.9019980430603027 + }, + { + "auxiliary_loss_clip": 0.01087559, + "auxiliary_loss_mlp": 0.01032268, + "balance_loss_clip": 1.03562045, + "balance_loss_mlp": 1.0198462, + "epoch": 0.9829550578686307, + "flos": 24457464864000.0, + "grad_norm": 1.7338187903548066, + "language_loss": 0.69069308, + "learning_rate": 3.0362143879898437e-09, + "loss": 0.71189135, + "num_input_tokens_seen": 352854295, + "step": 16349, + "time_per_iteration": 2.680927038192749 + }, + { + "auxiliary_loss_clip": 0.01075988, + "auxiliary_loss_mlp": 0.01029319, + "balance_loss_clip": 1.03591371, + "balance_loss_mlp": 1.01804733, + "epoch": 0.9830151811212987, + "flos": 16910084067840.0, + "grad_norm": 2.350613893884081, + "language_loss": 0.75915736, + "learning_rate": 3.0148003359014018e-09, + "loss": 0.78021044, + "num_input_tokens_seen": 352869695, + "step": 16350, + "time_per_iteration": 4.1306681632995605 + }, + { + "auxiliary_loss_clip": 0.01078562, + "auxiliary_loss_mlp": 0.01032593, + "balance_loss_clip": 1.03499365, + "balance_loss_mlp": 1.01986754, + "epoch": 0.9830753043739666, + "flos": 21288312829440.0, + "grad_norm": 2.112400068998379, + "language_loss": 0.84269607, + "learning_rate": 2.9934620096920826e-09, + "loss": 0.86380762, + "num_input_tokens_seen": 352887430, + "step": 16351, + "time_per_iteration": 2.6960017681121826 + }, + { + "auxiliary_loss_clip": 0.0107955, + "auxiliary_loss_mlp": 0.0102559, + "balance_loss_clip": 1.03737783, + "balance_loss_mlp": 1.0136925, + "epoch": 0.9831354276266346, + "flos": 31723522341120.0, + "grad_norm": 1.6146338638201096, + "language_loss": 0.68907672, + "learning_rate": 2.972199410170795e-09, + "loss": 0.71012813, + "num_input_tokens_seen": 352907555, + "step": 16352, + "time_per_iteration": 2.7532811164855957 + }, + { + "auxiliary_loss_clip": 0.01088475, + "auxiliary_loss_mlp": 0.00769371, + "balance_loss_clip": 1.03576922, + "balance_loss_mlp": 1.00027871, + "epoch": 0.9831955508793025, + "flos": 21619050284160.0, + "grad_norm": 1.4138760656880254, + "language_loss": 0.66266984, + "learning_rate": 2.951012538143782e-09, + "loss": 0.68124831, + "num_input_tokens_seen": 352928670, + "step": 16353, + "time_per_iteration": 2.6439483165740967 + }, + { + "auxiliary_loss_clip": 0.01082262, + "auxiliary_loss_mlp": 0.01030151, + "balance_loss_clip": 1.03453684, + "balance_loss_mlp": 1.01872444, + "epoch": 0.9832556741319706, + "flos": 22968214053120.0, + "grad_norm": 1.5813502969627034, + "language_loss": 0.74711162, + "learning_rate": 2.9299013944144025e-09, + "loss": 0.76823574, + "num_input_tokens_seen": 352948345, + "step": 16354, + "time_per_iteration": 2.6886255741119385 + }, + { + "auxiliary_loss_clip": 0.01098034, + "auxiliary_loss_mlp": 0.010272, + "balance_loss_clip": 1.03713632, + "balance_loss_mlp": 1.01496959, + "epoch": 0.9833157973846385, + "flos": 21323900229120.0, + "grad_norm": 2.034749936082402, + "language_loss": 0.77509081, + "learning_rate": 2.9088659797835702e-09, + "loss": 0.79634321, + "num_input_tokens_seen": 352967250, + "step": 16355, + "time_per_iteration": 2.655395269393921 + }, + { + "auxiliary_loss_clip": 0.01094864, + "auxiliary_loss_mlp": 0.01028209, + "balance_loss_clip": 1.03562486, + "balance_loss_mlp": 1.01627064, + "epoch": 0.9833759206373065, + "flos": 21068719032960.0, + "grad_norm": 2.2520856858074594, + "language_loss": 0.73119497, + "learning_rate": 2.8879062950484256e-09, + "loss": 0.75242567, + "num_input_tokens_seen": 352984725, + "step": 16356, + "time_per_iteration": 2.604156017303467 + }, + { + "auxiliary_loss_clip": 0.01082002, + "auxiliary_loss_mlp": 0.01032823, + "balance_loss_clip": 1.03355122, + "balance_loss_mlp": 1.02010965, + "epoch": 0.9834360438899744, + "flos": 18697322108160.0, + "grad_norm": 1.536085672752046, + "language_loss": 0.75979388, + "learning_rate": 2.8670223410041104e-09, + "loss": 0.7809422, + "num_input_tokens_seen": 353003480, + "step": 16357, + "time_per_iteration": 2.685453176498413 + }, + { + "auxiliary_loss_clip": 0.01086973, + "auxiliary_loss_mlp": 0.01028039, + "balance_loss_clip": 1.03633261, + "balance_loss_mlp": 1.01561737, + "epoch": 0.9834961671426424, + "flos": 21105240186240.0, + "grad_norm": 2.377018060234898, + "language_loss": 0.80362308, + "learning_rate": 2.846214118442436e-09, + "loss": 0.82477319, + "num_input_tokens_seen": 353021425, + "step": 16358, + "time_per_iteration": 2.672687292098999 + }, + { + "auxiliary_loss_clip": 0.01095168, + "auxiliary_loss_mlp": 0.01025846, + "balance_loss_clip": 1.03404856, + "balance_loss_mlp": 1.01396728, + "epoch": 0.9835562903953103, + "flos": 26687625511680.0, + "grad_norm": 2.5788189442251848, + "language_loss": 0.67699122, + "learning_rate": 2.8254816281523263e-09, + "loss": 0.69820142, + "num_input_tokens_seen": 353039870, + "step": 16359, + "time_per_iteration": 2.603217601776123 + }, + { + "auxiliary_loss_clip": 0.0110407, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_clip": 1.03442788, + "balance_loss_mlp": 1.01891446, + "epoch": 0.9836164136479784, + "flos": 22090162089600.0, + "grad_norm": 1.6643082336304196, + "language_loss": 0.69579446, + "learning_rate": 2.804824870920264e-09, + "loss": 0.71713769, + "num_input_tokens_seen": 353059750, + "step": 16360, + "time_per_iteration": 2.590282440185547 + }, + { + "auxiliary_loss_clip": 0.01097129, + "auxiliary_loss_mlp": 0.01035655, + "balance_loss_clip": 1.03531575, + "balance_loss_mlp": 1.02293587, + "epoch": 0.9836765369006463, + "flos": 23878405710720.0, + "grad_norm": 1.804692326953609, + "language_loss": 0.8430177, + "learning_rate": 2.7842438475293996e-09, + "loss": 0.86434555, + "num_input_tokens_seen": 353079940, + "step": 16361, + "time_per_iteration": 2.667570114135742 + }, + { + "auxiliary_loss_clip": 0.01107883, + "auxiliary_loss_mlp": 0.01027313, + "balance_loss_clip": 1.03631568, + "balance_loss_mlp": 1.01540446, + "epoch": 0.9837366601533143, + "flos": 25845017293440.0, + "grad_norm": 1.7879750486860067, + "language_loss": 0.75830048, + "learning_rate": 2.76373855876022e-09, + "loss": 0.77965236, + "num_input_tokens_seen": 353099990, + "step": 16362, + "time_per_iteration": 2.5723037719726562 + }, + { + "auxiliary_loss_clip": 0.01109574, + "auxiliary_loss_mlp": 0.01035763, + "balance_loss_clip": 1.03702784, + "balance_loss_mlp": 1.023103, + "epoch": 0.9837967834059823, + "flos": 21358015171200.0, + "grad_norm": 1.8659793314659867, + "language_loss": 0.71063733, + "learning_rate": 2.7433090053901043e-09, + "loss": 0.73209071, + "num_input_tokens_seen": 353118710, + "step": 16363, + "time_per_iteration": 2.580556631088257 + }, + { + "auxiliary_loss_clip": 0.01083391, + "auxiliary_loss_mlp": 0.01030702, + "balance_loss_clip": 1.03492188, + "balance_loss_mlp": 1.01919198, + "epoch": 0.9838569066586502, + "flos": 18515793749760.0, + "grad_norm": 2.149367136223202, + "language_loss": 0.63062841, + "learning_rate": 2.7229551881937653e-09, + "loss": 0.65176934, + "num_input_tokens_seen": 353136415, + "step": 16364, + "time_per_iteration": 2.6873748302459717 + }, + { + "auxiliary_loss_clip": 0.01071986, + "auxiliary_loss_mlp": 0.01031043, + "balance_loss_clip": 1.04158378, + "balance_loss_mlp": 1.01967084, + "epoch": 0.9839170299113182, + "flos": 22452392793600.0, + "grad_norm": 1.5415718965225467, + "language_loss": 0.75180268, + "learning_rate": 2.702677107943252e-09, + "loss": 0.77283293, + "num_input_tokens_seen": 353154650, + "step": 16365, + "time_per_iteration": 2.7838945388793945 + }, + { + "auxiliary_loss_clip": 0.0106364, + "auxiliary_loss_mlp": 0.01028118, + "balance_loss_clip": 1.03559554, + "balance_loss_mlp": 1.01572597, + "epoch": 0.9839771531639862, + "flos": 27892320779520.0, + "grad_norm": 2.0418627891356365, + "language_loss": 0.76325071, + "learning_rate": 2.6824747654072832e-09, + "loss": 0.78416824, + "num_input_tokens_seen": 353174065, + "step": 16366, + "time_per_iteration": 2.723862886428833 + }, + { + "auxiliary_loss_clip": 0.01105139, + "auxiliary_loss_mlp": 0.01026883, + "balance_loss_clip": 1.03549993, + "balance_loss_mlp": 1.01568365, + "epoch": 0.9840372764166542, + "flos": 28214510797440.0, + "grad_norm": 1.5805895259506346, + "language_loss": 0.77362347, + "learning_rate": 2.662348161352357e-09, + "loss": 0.79494369, + "num_input_tokens_seen": 353193560, + "step": 16367, + "time_per_iteration": 2.6186344623565674 + }, + { + "auxiliary_loss_clip": 0.01085162, + "auxiliary_loss_mlp": 0.01036358, + "balance_loss_clip": 1.0372721, + "balance_loss_mlp": 1.02363229, + "epoch": 0.9840973996693221, + "flos": 23403989854080.0, + "grad_norm": 1.6315530107439746, + "language_loss": 0.6176089, + "learning_rate": 2.642297296540974e-09, + "loss": 0.63882411, + "num_input_tokens_seen": 353213525, + "step": 16368, + "time_per_iteration": 2.7051217555999756 + }, + { + "auxiliary_loss_clip": 0.01093129, + "auxiliary_loss_mlp": 0.0103668, + "balance_loss_clip": 1.03431225, + "balance_loss_mlp": 1.02538538, + "epoch": 0.9841575229219901, + "flos": 21395865127680.0, + "grad_norm": 1.4838055886631645, + "language_loss": 0.65539753, + "learning_rate": 2.6223221717340816e-09, + "loss": 0.67669564, + "num_input_tokens_seen": 353234000, + "step": 16369, + "time_per_iteration": 2.684190273284912 + }, + { + "auxiliary_loss_clip": 0.01098619, + "auxiliary_loss_mlp": 0.00771023, + "balance_loss_clip": 1.03682351, + "balance_loss_mlp": 1.00028467, + "epoch": 0.984217646174658, + "flos": 24464072966400.0, + "grad_norm": 2.1848510788789053, + "language_loss": 0.68529809, + "learning_rate": 2.6024227876886295e-09, + "loss": 0.70399457, + "num_input_tokens_seen": 353254940, + "step": 16370, + "time_per_iteration": 2.690066337585449 + }, + { + "auxiliary_loss_clip": 0.01109517, + "auxiliary_loss_mlp": 0.01035832, + "balance_loss_clip": 1.03602791, + "balance_loss_mlp": 1.02231407, + "epoch": 0.984277769427326, + "flos": 16435057680000.0, + "grad_norm": 1.7624959425131688, + "language_loss": 0.73149407, + "learning_rate": 2.582599145159792e-09, + "loss": 0.75294757, + "num_input_tokens_seen": 353272590, + "step": 16371, + "time_per_iteration": 2.647754669189453 + }, + { + "auxiliary_loss_clip": 0.01019499, + "auxiliary_loss_mlp": 0.01000721, + "balance_loss_clip": 1.00614071, + "balance_loss_mlp": 0.99977916, + "epoch": 0.9843378926799939, + "flos": 64530615288960.0, + "grad_norm": 0.8676443160581451, + "language_loss": 0.65173286, + "learning_rate": 2.562851244898745e-09, + "loss": 0.67193508, + "num_input_tokens_seen": 353334380, + "step": 16372, + "time_per_iteration": 3.1656858921051025 + }, + { + "auxiliary_loss_clip": 0.01095097, + "auxiliary_loss_mlp": 0.01034077, + "balance_loss_clip": 1.03569186, + "balance_loss_mlp": 1.02207279, + "epoch": 0.984398015932662, + "flos": 17382811985280.0, + "grad_norm": 1.8275470136153955, + "language_loss": 0.70694923, + "learning_rate": 2.5431790876544456e-09, + "loss": 0.72824109, + "num_input_tokens_seen": 353351640, + "step": 16373, + "time_per_iteration": 2.658825635910034 + }, + { + "auxiliary_loss_clip": 0.0110683, + "auxiliary_loss_mlp": 0.01030818, + "balance_loss_clip": 1.03669751, + "balance_loss_mlp": 1.01893306, + "epoch": 0.9844581391853299, + "flos": 23879088069120.0, + "grad_norm": 1.8485344334805096, + "language_loss": 0.81536216, + "learning_rate": 2.523582674173186e-09, + "loss": 0.83673871, + "num_input_tokens_seen": 353372555, + "step": 16374, + "time_per_iteration": 2.6585822105407715 + }, + { + "auxiliary_loss_clip": 0.01064423, + "auxiliary_loss_mlp": 0.01034079, + "balance_loss_clip": 1.03823948, + "balance_loss_mlp": 1.02220547, + "epoch": 0.9845182624379979, + "flos": 19865352568320.0, + "grad_norm": 1.693148116934704, + "language_loss": 0.69581914, + "learning_rate": 2.504062005197927e-09, + "loss": 0.71680415, + "num_input_tokens_seen": 353391385, + "step": 16375, + "time_per_iteration": 2.7366557121276855 + }, + { + "auxiliary_loss_clip": 0.01083548, + "auxiliary_loss_mlp": 0.01043522, + "balance_loss_clip": 1.03258562, + "balance_loss_mlp": 1.02908564, + "epoch": 0.9845783856906659, + "flos": 28254659224320.0, + "grad_norm": 2.704312105533632, + "language_loss": 0.81189835, + "learning_rate": 2.484617081468521e-09, + "loss": 0.83316898, + "num_input_tokens_seen": 353411630, + "step": 16376, + "time_per_iteration": 2.695854663848877 + }, + { + "auxiliary_loss_clip": 0.01105113, + "auxiliary_loss_mlp": 0.01036471, + "balance_loss_clip": 1.03517056, + "balance_loss_mlp": 1.02441287, + "epoch": 0.9846385089433338, + "flos": 28328383889280.0, + "grad_norm": 1.6882577755341805, + "language_loss": 0.62188119, + "learning_rate": 2.4652479037228224e-09, + "loss": 0.64329708, + "num_input_tokens_seen": 353432895, + "step": 16377, + "time_per_iteration": 2.655351161956787 + }, + { + "auxiliary_loss_clip": 0.01079528, + "auxiliary_loss_mlp": 0.01034135, + "balance_loss_clip": 1.03616428, + "balance_loss_mlp": 1.02145696, + "epoch": 0.9846986321960018, + "flos": 24316767290880.0, + "grad_norm": 1.743655266311487, + "language_loss": 0.72909814, + "learning_rate": 2.445954472695133e-09, + "loss": 0.75023472, + "num_input_tokens_seen": 353454195, + "step": 16378, + "time_per_iteration": 2.7620902061462402 + }, + { + "auxiliary_loss_clip": 0.01107968, + "auxiliary_loss_mlp": 0.0103452, + "balance_loss_clip": 1.0362848, + "balance_loss_mlp": 1.02246761, + "epoch": 0.9847587554486698, + "flos": 27271999877760.0, + "grad_norm": 1.9591429215255713, + "language_loss": 0.71231186, + "learning_rate": 2.426736789116868e-09, + "loss": 0.73373675, + "num_input_tokens_seen": 353475125, + "step": 16379, + "time_per_iteration": 2.6217269897460938 + }, + { + "auxiliary_loss_clip": 0.01076838, + "auxiliary_loss_mlp": 0.01033584, + "balance_loss_clip": 1.03647435, + "balance_loss_mlp": 1.02120376, + "epoch": 0.9848188787013378, + "flos": 16542717719040.0, + "grad_norm": 1.9414180351359185, + "language_loss": 0.68380785, + "learning_rate": 2.407594853716999e-09, + "loss": 0.70491207, + "num_input_tokens_seen": 353493265, + "step": 16380, + "time_per_iteration": 2.6951489448547363 + }, + { + "auxiliary_loss_clip": 0.01078007, + "auxiliary_loss_mlp": 0.01037173, + "balance_loss_clip": 1.0345068, + "balance_loss_mlp": 1.02463818, + "epoch": 0.9848790019540057, + "flos": 20193647898240.0, + "grad_norm": 2.8812935007679146, + "language_loss": 0.78948879, + "learning_rate": 2.38852866722139e-09, + "loss": 0.81064057, + "num_input_tokens_seen": 353511650, + "step": 16381, + "time_per_iteration": 2.733790159225464 + }, + { + "auxiliary_loss_clip": 0.01095102, + "auxiliary_loss_mlp": 0.01029949, + "balance_loss_clip": 1.03512669, + "balance_loss_mlp": 1.01729441, + "epoch": 0.9849391252066737, + "flos": 28259723041920.0, + "grad_norm": 1.4147052567064669, + "language_loss": 0.82457852, + "learning_rate": 2.3695382303527965e-09, + "loss": 0.84582901, + "num_input_tokens_seen": 353534035, + "step": 16382, + "time_per_iteration": 2.738605499267578 + }, + { + "auxiliary_loss_clip": 0.01081484, + "auxiliary_loss_mlp": 0.01033799, + "balance_loss_clip": 1.03230476, + "balance_loss_mlp": 1.02016735, + "epoch": 0.9849992484593416, + "flos": 22454942659200.0, + "grad_norm": 1.7473709928633554, + "language_loss": 0.74585968, + "learning_rate": 2.3506235438315316e-09, + "loss": 0.76701248, + "num_input_tokens_seen": 353549950, + "step": 16383, + "time_per_iteration": 2.754387378692627 + }, + { + "auxiliary_loss_clip": 0.01064953, + "auxiliary_loss_mlp": 0.01031021, + "balance_loss_clip": 1.03860319, + "balance_loss_mlp": 1.01868236, + "epoch": 0.9850593717120096, + "flos": 34497190656000.0, + "grad_norm": 1.8075355031260896, + "language_loss": 0.66479164, + "learning_rate": 2.3317846083750203e-09, + "loss": 0.68575138, + "num_input_tokens_seen": 353573745, + "step": 16384, + "time_per_iteration": 2.9240455627441406 + }, + { + "auxiliary_loss_clip": 0.01090885, + "auxiliary_loss_mlp": 0.01035183, + "balance_loss_clip": 1.03831267, + "balance_loss_mlp": 1.02080083, + "epoch": 0.9851194949646775, + "flos": 38837282152320.0, + "grad_norm": 1.832467391931212, + "language_loss": 0.70495671, + "learning_rate": 2.313021424697359e-09, + "loss": 0.72621739, + "num_input_tokens_seen": 353595335, + "step": 16385, + "time_per_iteration": 6.049696922302246 + }, + { + "auxiliary_loss_clip": 0.01090368, + "auxiliary_loss_mlp": 0.01031869, + "balance_loss_clip": 1.03864157, + "balance_loss_mlp": 1.01980531, + "epoch": 0.9851796182173456, + "flos": 17712436118400.0, + "grad_norm": 2.4314123145549336, + "language_loss": 0.81251216, + "learning_rate": 2.294333993509978e-09, + "loss": 0.83373451, + "num_input_tokens_seen": 353614270, + "step": 16386, + "time_per_iteration": 2.663780689239502 + }, + { + "auxiliary_loss_clip": 0.01079909, + "auxiliary_loss_mlp": 0.01031709, + "balance_loss_clip": 1.03440416, + "balance_loss_mlp": 1.01883996, + "epoch": 0.9852397414700135, + "flos": 27454318335360.0, + "grad_norm": 1.9863892677340445, + "language_loss": 0.67923307, + "learning_rate": 2.2757223155216442e-09, + "loss": 0.70034921, + "num_input_tokens_seen": 353634900, + "step": 16387, + "time_per_iteration": 2.7573816776275635 + }, + { + "auxiliary_loss_clip": 0.01089839, + "auxiliary_loss_mlp": 0.00769242, + "balance_loss_clip": 1.03422558, + "balance_loss_mlp": 1.00012159, + "epoch": 0.9852998647226815, + "flos": 18296702743680.0, + "grad_norm": 1.7527242127962226, + "language_loss": 0.74020231, + "learning_rate": 2.257186391438237e-09, + "loss": 0.75879306, + "num_input_tokens_seen": 353652890, + "step": 16388, + "time_per_iteration": 2.6196138858795166 + }, + { + "auxiliary_loss_clip": 0.01089517, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.03372729, + "balance_loss_mlp": 1.02051258, + "epoch": 0.9853599879753495, + "flos": 19642562461440.0, + "grad_norm": 1.885399673475778, + "language_loss": 0.82288045, + "learning_rate": 2.238726221962528e-09, + "loss": 0.8441, + "num_input_tokens_seen": 353671295, + "step": 16389, + "time_per_iteration": 4.203902959823608 + }, + { + "auxiliary_loss_clip": 0.01086383, + "auxiliary_loss_mlp": 0.00770398, + "balance_loss_clip": 1.03422093, + "balance_loss_mlp": 1.00023174, + "epoch": 0.9854201112280174, + "flos": 23841956384640.0, + "grad_norm": 2.246145821478881, + "language_loss": 0.67169315, + "learning_rate": 2.2203418077946234e-09, + "loss": 0.69026095, + "num_input_tokens_seen": 353690560, + "step": 16390, + "time_per_iteration": 2.7021732330322266 + }, + { + "auxiliary_loss_clip": 0.01070253, + "auxiliary_loss_mlp": 0.01034626, + "balance_loss_clip": 1.03694236, + "balance_loss_mlp": 1.02117944, + "epoch": 0.9854802344806854, + "flos": 30080573233920.0, + "grad_norm": 1.5706472092274895, + "language_loss": 0.77193004, + "learning_rate": 2.2020331496312994e-09, + "loss": 0.79297888, + "num_input_tokens_seen": 353710660, + "step": 16391, + "time_per_iteration": 2.763343572616577 + }, + { + "auxiliary_loss_clip": 0.01066236, + "auxiliary_loss_mlp": 0.00769461, + "balance_loss_clip": 1.03303838, + "balance_loss_mlp": 1.00014699, + "epoch": 0.9855403577333534, + "flos": 21907412668800.0, + "grad_norm": 2.034127349616756, + "language_loss": 0.6821295, + "learning_rate": 2.1838002481673333e-09, + "loss": 0.70048642, + "num_input_tokens_seen": 353730440, + "step": 16392, + "time_per_iteration": 2.741312026977539 + }, + { + "auxiliary_loss_clip": 0.01076854, + "auxiliary_loss_mlp": 0.01030267, + "balance_loss_clip": 1.0342617, + "balance_loss_mlp": 1.0166111, + "epoch": 0.9856004809860214, + "flos": 15413794191360.0, + "grad_norm": 2.041115164847186, + "language_loss": 0.55706286, + "learning_rate": 2.1656431040937286e-09, + "loss": 0.57813406, + "num_input_tokens_seen": 353748360, + "step": 16393, + "time_per_iteration": 2.6840660572052 + }, + { + "auxiliary_loss_clip": 0.01074406, + "auxiliary_loss_mlp": 0.01031603, + "balance_loss_clip": 1.0325073, + "balance_loss_mlp": 1.01706505, + "epoch": 0.9856606042386893, + "flos": 13653201064320.0, + "grad_norm": 2.576410490354787, + "language_loss": 0.79111844, + "learning_rate": 2.1475617180990444e-09, + "loss": 0.81217849, + "num_input_tokens_seen": 353760880, + "step": 16394, + "time_per_iteration": 2.683983087539673 + }, + { + "auxiliary_loss_clip": 0.0109509, + "auxiliary_loss_mlp": 0.01033335, + "balance_loss_clip": 1.03339911, + "balance_loss_mlp": 1.02005494, + "epoch": 0.9857207274913573, + "flos": 23479151063040.0, + "grad_norm": 1.5070932402692028, + "language_loss": 0.76305884, + "learning_rate": 2.129556090869178e-09, + "loss": 0.78434312, + "num_input_tokens_seen": 353782255, + "step": 16395, + "time_per_iteration": 2.694324254989624 + }, + { + "auxiliary_loss_clip": 0.01094719, + "auxiliary_loss_mlp": 0.01031611, + "balance_loss_clip": 1.03447509, + "balance_loss_mlp": 1.01911831, + "epoch": 0.9857808507440252, + "flos": 21065486808960.0, + "grad_norm": 1.9132501064588425, + "language_loss": 0.7550149, + "learning_rate": 2.1116262230866933e-09, + "loss": 0.77627826, + "num_input_tokens_seen": 353803580, + "step": 16396, + "time_per_iteration": 2.6768436431884766 + }, + { + "auxiliary_loss_clip": 0.01070405, + "auxiliary_loss_mlp": 0.01026163, + "balance_loss_clip": 1.03497076, + "balance_loss_mlp": 1.01392639, + "epoch": 0.9858409739966932, + "flos": 25301365971840.0, + "grad_norm": 1.5225711689164605, + "language_loss": 0.7122134, + "learning_rate": 2.0937721154317133e-09, + "loss": 0.73317909, + "num_input_tokens_seen": 353824200, + "step": 16397, + "time_per_iteration": 2.7246475219726562 + }, + { + "auxiliary_loss_clip": 0.01081841, + "auxiliary_loss_mlp": 0.01028049, + "balance_loss_clip": 1.0351944, + "balance_loss_mlp": 1.01624179, + "epoch": 0.9859010972493611, + "flos": 20558751690240.0, + "grad_norm": 1.7750069543692049, + "language_loss": 0.7137388, + "learning_rate": 2.0759937685810304e-09, + "loss": 0.73483771, + "num_input_tokens_seen": 353843350, + "step": 16398, + "time_per_iteration": 2.6708950996398926 + }, + { + "auxiliary_loss_clip": 0.0106975, + "auxiliary_loss_mlp": 0.01026745, + "balance_loss_clip": 1.03256011, + "balance_loss_mlp": 1.01534224, + "epoch": 0.9859612205020292, + "flos": 24754985216640.0, + "grad_norm": 2.8400971198256215, + "language_loss": 0.73956269, + "learning_rate": 2.058291183208771e-09, + "loss": 0.76052767, + "num_input_tokens_seen": 353864520, + "step": 16399, + "time_per_iteration": 2.7505059242248535 + }, + { + "auxiliary_loss_clip": 0.01107815, + "auxiliary_loss_mlp": 0.01030039, + "balance_loss_clip": 1.03546059, + "balance_loss_mlp": 1.01738548, + "epoch": 0.9860213437546971, + "flos": 21105850717440.0, + "grad_norm": 2.280806532195227, + "language_loss": 0.57755244, + "learning_rate": 2.0406643599863993e-09, + "loss": 0.59893095, + "num_input_tokens_seen": 353882240, + "step": 16400, + "time_per_iteration": 2.5837459564208984 + }, + { + "auxiliary_loss_clip": 0.01087543, + "auxiliary_loss_mlp": 0.01029777, + "balance_loss_clip": 1.03587925, + "balance_loss_mlp": 1.01624036, + "epoch": 0.9860814670073651, + "flos": 19136078737920.0, + "grad_norm": 7.9161501077476775, + "language_loss": 0.80533803, + "learning_rate": 2.023113299582491e-09, + "loss": 0.82651126, + "num_input_tokens_seen": 353901590, + "step": 16401, + "time_per_iteration": 2.676846742630005 + }, + { + "auxiliary_loss_clip": 0.01095929, + "auxiliary_loss_mlp": 0.01033178, + "balance_loss_clip": 1.03656411, + "balance_loss_mlp": 1.02002931, + "epoch": 0.9861415902600331, + "flos": 17237050594560.0, + "grad_norm": 1.9620055100104796, + "language_loss": 0.77909809, + "learning_rate": 2.005638002662069e-09, + "loss": 0.80038917, + "num_input_tokens_seen": 353918785, + "step": 16402, + "time_per_iteration": 2.580324172973633 + }, + { + "auxiliary_loss_clip": 0.01099134, + "auxiliary_loss_mlp": 0.01035077, + "balance_loss_clip": 1.03702092, + "balance_loss_mlp": 1.02319241, + "epoch": 0.986201713512701, + "flos": 27782577751680.0, + "grad_norm": 1.8385726831624305, + "language_loss": 0.69819796, + "learning_rate": 1.9882384698881596e-09, + "loss": 0.71954, + "num_input_tokens_seen": 353940390, + "step": 16403, + "time_per_iteration": 2.6051719188690186 + }, + { + "auxiliary_loss_clip": 0.01092549, + "auxiliary_loss_mlp": 0.0102806, + "balance_loss_clip": 1.03301835, + "balance_loss_mlp": 1.01602554, + "epoch": 0.986261836765369, + "flos": 28730403884160.0, + "grad_norm": 2.0540712691142, + "language_loss": 0.74826646, + "learning_rate": 1.9709147019204566e-09, + "loss": 0.76947248, + "num_input_tokens_seen": 353962180, + "step": 16404, + "time_per_iteration": 2.6757051944732666 + }, + { + "auxiliary_loss_clip": 0.01096235, + "auxiliary_loss_mlp": 0.00769718, + "balance_loss_clip": 1.03480124, + "balance_loss_mlp": 1.00010228, + "epoch": 0.986321960018037, + "flos": 34313471568000.0, + "grad_norm": 1.7889327818353045, + "language_loss": 0.69631529, + "learning_rate": 1.953666699415768e-09, + "loss": 0.71497488, + "num_input_tokens_seen": 353984305, + "step": 16405, + "time_per_iteration": 2.7109172344207764 + }, + { + "auxiliary_loss_clip": 0.01085878, + "auxiliary_loss_mlp": 0.01034898, + "balance_loss_clip": 1.03745413, + "balance_loss_mlp": 1.02344775, + "epoch": 0.986382083270705, + "flos": 25189755436800.0, + "grad_norm": 1.6951529246514412, + "language_loss": 0.69703031, + "learning_rate": 1.93649446302846e-09, + "loss": 0.718238, + "num_input_tokens_seen": 354004495, + "step": 16406, + "time_per_iteration": 2.725384473800659 + }, + { + "auxiliary_loss_clip": 0.01049811, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.03564012, + "balance_loss_mlp": 1.02081645, + "epoch": 0.9864422065233729, + "flos": 11025904671360.0, + "grad_norm": 3.370275127153346, + "language_loss": 0.74895245, + "learning_rate": 1.9193979934095663e-09, + "loss": 0.76977789, + "num_input_tokens_seen": 354015985, + "step": 16407, + "time_per_iteration": 2.711702585220337 + }, + { + "auxiliary_loss_clip": 0.01083953, + "auxiliary_loss_mlp": 0.01030883, + "balance_loss_clip": 1.03477526, + "balance_loss_mlp": 1.01853251, + "epoch": 0.9865023297760409, + "flos": 16545590807040.0, + "grad_norm": 2.111055087475785, + "language_loss": 0.77460712, + "learning_rate": 1.9023772912072357e-09, + "loss": 0.79575551, + "num_input_tokens_seen": 354033260, + "step": 16408, + "time_per_iteration": 2.593550443649292 + }, + { + "auxiliary_loss_clip": 0.01101693, + "auxiliary_loss_mlp": 0.01032331, + "balance_loss_clip": 1.0380075, + "balance_loss_mlp": 1.01906323, + "epoch": 0.9865624530287088, + "flos": 18880179269760.0, + "grad_norm": 1.9451476197970003, + "language_loss": 0.68269604, + "learning_rate": 1.8854323570669515e-09, + "loss": 0.70403636, + "num_input_tokens_seen": 354052825, + "step": 16409, + "time_per_iteration": 2.587090492248535 + }, + { + "auxiliary_loss_clip": 0.01011915, + "auxiliary_loss_mlp": 0.01002193, + "balance_loss_clip": 1.00871253, + "balance_loss_mlp": 1.00125718, + "epoch": 0.9866225762813768, + "flos": 68887798680960.0, + "grad_norm": 0.806349802754998, + "language_loss": 0.61002564, + "learning_rate": 1.8685631916313118e-09, + "loss": 0.63016677, + "num_input_tokens_seen": 354113920, + "step": 16410, + "time_per_iteration": 3.278089761734009 + }, + { + "auxiliary_loss_clip": 0.0109769, + "auxiliary_loss_mlp": 0.01032935, + "balance_loss_clip": 1.03702283, + "balance_loss_mlp": 1.02077615, + "epoch": 0.9866826995340447, + "flos": 29023111814400.0, + "grad_norm": 3.0120361411647005, + "language_loss": 0.65963012, + "learning_rate": 1.8517697955400258e-09, + "loss": 0.68093634, + "num_input_tokens_seen": 354134210, + "step": 16411, + "time_per_iteration": 2.632351875305176 + }, + { + "auxiliary_loss_clip": 0.01027186, + "auxiliary_loss_mlp": 0.0100133, + "balance_loss_clip": 1.00486875, + "balance_loss_mlp": 1.00040567, + "epoch": 0.9867428227867128, + "flos": 65376814867200.0, + "grad_norm": 0.7224745052479478, + "language_loss": 0.56279814, + "learning_rate": 1.8350521694299182e-09, + "loss": 0.58308327, + "num_input_tokens_seen": 354198010, + "step": 16412, + "time_per_iteration": 3.1897354125976562 + }, + { + "auxiliary_loss_clip": 0.01079312, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.0352391, + "balance_loss_mlp": 1.01961815, + "epoch": 0.9868029460393807, + "flos": 26506312634880.0, + "grad_norm": 2.0935942074241685, + "language_loss": 0.72890359, + "learning_rate": 1.818410313934926e-09, + "loss": 0.75002241, + "num_input_tokens_seen": 354220000, + "step": 16413, + "time_per_iteration": 2.710663080215454 + }, + { + "auxiliary_loss_clip": 0.01060652, + "auxiliary_loss_mlp": 0.01030308, + "balance_loss_clip": 1.03323412, + "balance_loss_mlp": 1.01750505, + "epoch": 0.9868630692920487, + "flos": 22967280299520.0, + "grad_norm": 2.0312404595944664, + "language_loss": 0.71431053, + "learning_rate": 1.8018442296858782e-09, + "loss": 0.73522013, + "num_input_tokens_seen": 354240910, + "step": 16414, + "time_per_iteration": 2.7031588554382324 + }, + { + "auxiliary_loss_clip": 0.01089485, + "auxiliary_loss_mlp": 0.01036435, + "balance_loss_clip": 1.03525519, + "balance_loss_mlp": 1.02446055, + "epoch": 0.9869231925447167, + "flos": 19828687760640.0, + "grad_norm": 1.5435516461575216, + "language_loss": 0.7039904, + "learning_rate": 1.7853539173111608e-09, + "loss": 0.72524959, + "num_input_tokens_seen": 354259430, + "step": 16415, + "time_per_iteration": 2.702089309692383 + }, + { + "auxiliary_loss_clip": 0.01066346, + "auxiliary_loss_mlp": 0.01032014, + "balance_loss_clip": 1.0336014, + "balance_loss_mlp": 1.02089763, + "epoch": 0.9869833157973846, + "flos": 20195228096640.0, + "grad_norm": 2.9079123838637604, + "language_loss": 0.75465488, + "learning_rate": 1.7689393774362737e-09, + "loss": 0.77563846, + "num_input_tokens_seen": 354279490, + "step": 16416, + "time_per_iteration": 2.703504800796509 + }, + { + "auxiliary_loss_clip": 0.0108217, + "auxiliary_loss_mlp": 0.01030102, + "balance_loss_clip": 1.03591037, + "balance_loss_mlp": 1.01787734, + "epoch": 0.9870434390500527, + "flos": 16099507802880.0, + "grad_norm": 2.1800846259576216, + "language_loss": 0.70927489, + "learning_rate": 1.7526006106833858e-09, + "loss": 0.7303977, + "num_input_tokens_seen": 354295080, + "step": 16417, + "time_per_iteration": 2.694063663482666 + }, + { + "auxiliary_loss_clip": 0.01087544, + "auxiliary_loss_mlp": 0.0103492, + "balance_loss_clip": 1.03795171, + "balance_loss_mlp": 1.02209926, + "epoch": 0.9871035623027206, + "flos": 21760753438080.0, + "grad_norm": 1.6696226113868622, + "language_loss": 0.70512295, + "learning_rate": 1.7363376176720013e-09, + "loss": 0.72634757, + "num_input_tokens_seen": 354314610, + "step": 16418, + "time_per_iteration": 2.7078118324279785 + }, + { + "auxiliary_loss_clip": 0.01027164, + "auxiliary_loss_mlp": 0.01000807, + "balance_loss_clip": 1.00479984, + "balance_loss_mlp": 0.99982989, + "epoch": 0.9871636855553886, + "flos": 70219583245440.0, + "grad_norm": 0.658515497705567, + "language_loss": 0.53645599, + "learning_rate": 1.7201503990189603e-09, + "loss": 0.55673575, + "num_input_tokens_seen": 354383115, + "step": 16419, + "time_per_iteration": 3.2428295612335205 + }, + { + "auxiliary_loss_clip": 0.01087155, + "auxiliary_loss_mlp": 0.0104011, + "balance_loss_clip": 1.03351521, + "balance_loss_mlp": 1.0263052, + "epoch": 0.9872238088080565, + "flos": 25045825639680.0, + "grad_norm": 2.1175189547094457, + "language_loss": 0.77917069, + "learning_rate": 1.7040389553382162e-09, + "loss": 0.80044335, + "num_input_tokens_seen": 354403115, + "step": 16420, + "time_per_iteration": 2.6854612827301025 + }, + { + "auxiliary_loss_clip": 0.01071773, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.03993893, + "balance_loss_mlp": 1.01702881, + "epoch": 0.9872839320607245, + "flos": 19465846525440.0, + "grad_norm": 2.3612194130787505, + "language_loss": 0.70805871, + "learning_rate": 1.6880032872403916e-09, + "loss": 0.72907424, + "num_input_tokens_seen": 354424520, + "step": 16421, + "time_per_iteration": 2.7082440853118896 + }, + { + "auxiliary_loss_clip": 0.01100703, + "auxiliary_loss_mlp": 0.01035927, + "balance_loss_clip": 1.03684855, + "balance_loss_mlp": 1.02248001, + "epoch": 0.9873440553133924, + "flos": 26942914448640.0, + "grad_norm": 2.428011594135223, + "language_loss": 0.82735991, + "learning_rate": 1.6720433953338886e-09, + "loss": 0.84872615, + "num_input_tokens_seen": 354444800, + "step": 16422, + "time_per_iteration": 2.6437931060791016 + }, + { + "auxiliary_loss_clip": 0.0107317, + "auxiliary_loss_mlp": 0.01028669, + "balance_loss_clip": 1.03409743, + "balance_loss_mlp": 1.0163486, + "epoch": 0.9874041785660604, + "flos": 19062210418560.0, + "grad_norm": 1.6808127811152613, + "language_loss": 0.86108935, + "learning_rate": 1.656159280223779e-09, + "loss": 0.88210779, + "num_input_tokens_seen": 354464590, + "step": 16423, + "time_per_iteration": 2.7554445266723633 + }, + { + "auxiliary_loss_clip": 0.01100362, + "auxiliary_loss_mlp": 0.01026203, + "balance_loss_clip": 1.03655839, + "balance_loss_mlp": 1.01384747, + "epoch": 0.9874643018187284, + "flos": 21105814803840.0, + "grad_norm": 2.086841049232087, + "language_loss": 0.70854056, + "learning_rate": 1.6403509425122475e-09, + "loss": 0.72980618, + "num_input_tokens_seen": 354484145, + "step": 16424, + "time_per_iteration": 7.414201736450195 + }, + { + "auxiliary_loss_clip": 0.01097827, + "auxiliary_loss_mlp": 0.00769696, + "balance_loss_clip": 1.03443944, + "balance_loss_mlp": 1.00012803, + "epoch": 0.9875244250713964, + "flos": 24426043441920.0, + "grad_norm": 2.386486368838744, + "language_loss": 0.80787611, + "learning_rate": 1.6246183827990366e-09, + "loss": 0.82655132, + "num_input_tokens_seen": 354502475, + "step": 16425, + "time_per_iteration": 2.6806702613830566 + }, + { + "auxiliary_loss_clip": 0.0105599, + "auxiliary_loss_mlp": 0.01033218, + "balance_loss_clip": 1.03098166, + "balance_loss_mlp": 1.01937222, + "epoch": 0.9875845483240643, + "flos": 25117610970240.0, + "grad_norm": 1.8226222464901614, + "language_loss": 0.79747486, + "learning_rate": 1.6089616016803364e-09, + "loss": 0.81836694, + "num_input_tokens_seen": 354521855, + "step": 16426, + "time_per_iteration": 2.931814432144165 + }, + { + "auxiliary_loss_clip": 0.01099233, + "auxiliary_loss_mlp": 0.01035808, + "balance_loss_clip": 1.03837609, + "balance_loss_mlp": 1.02355909, + "epoch": 0.9876446715767323, + "flos": 16581788737920.0, + "grad_norm": 1.762658511590331, + "language_loss": 0.84837222, + "learning_rate": 1.593380599750338e-09, + "loss": 0.8697226, + "num_input_tokens_seen": 354539535, + "step": 16427, + "time_per_iteration": 2.615586042404175 + }, + { + "auxiliary_loss_clip": 0.01107577, + "auxiliary_loss_mlp": 0.01031141, + "balance_loss_clip": 1.03742325, + "balance_loss_mlp": 1.01907742, + "epoch": 0.9877047948294003, + "flos": 21616141282560.0, + "grad_norm": 1.7238113053204014, + "language_loss": 0.70466417, + "learning_rate": 1.577875377599458e-09, + "loss": 0.72605133, + "num_input_tokens_seen": 354557430, + "step": 16428, + "time_per_iteration": 2.5831527709960938 + }, + { + "auxiliary_loss_clip": 0.01068786, + "auxiliary_loss_mlp": 0.01032334, + "balance_loss_clip": 1.03269923, + "balance_loss_mlp": 1.02058625, + "epoch": 0.9877649180820682, + "flos": 21178497974400.0, + "grad_norm": 3.804683550860071, + "language_loss": 0.79990548, + "learning_rate": 1.5624459358158926e-09, + "loss": 0.82091671, + "num_input_tokens_seen": 354574735, + "step": 16429, + "time_per_iteration": 4.215754270553589 + }, + { + "auxiliary_loss_clip": 0.01106379, + "auxiliary_loss_mlp": 0.01030944, + "balance_loss_clip": 1.03527224, + "balance_loss_mlp": 1.01933873, + "epoch": 0.9878250413347363, + "flos": 39749233576320.0, + "grad_norm": 1.5905003981287011, + "language_loss": 0.6204477, + "learning_rate": 1.5470922749845073e-09, + "loss": 0.64182091, + "num_input_tokens_seen": 354597050, + "step": 16430, + "time_per_iteration": 2.7417891025543213 + }, + { + "auxiliary_loss_clip": 0.01109651, + "auxiliary_loss_mlp": 0.01032938, + "balance_loss_clip": 1.03770876, + "balance_loss_mlp": 1.02093995, + "epoch": 0.9878851645874042, + "flos": 29425634599680.0, + "grad_norm": 2.4386034001427848, + "language_loss": 0.73058724, + "learning_rate": 1.531814395687725e-09, + "loss": 0.75201309, + "num_input_tokens_seen": 354619095, + "step": 16431, + "time_per_iteration": 2.6387763023376465 + }, + { + "auxiliary_loss_clip": 0.01109492, + "auxiliary_loss_mlp": 0.01033474, + "balance_loss_clip": 1.03863847, + "balance_loss_mlp": 1.02136791, + "epoch": 0.9879452878400722, + "flos": 15806261168640.0, + "grad_norm": 2.1754704610847115, + "language_loss": 0.804088, + "learning_rate": 1.5166122985048602e-09, + "loss": 0.82551765, + "num_input_tokens_seen": 354633790, + "step": 16432, + "time_per_iteration": 2.59206485748291 + }, + { + "auxiliary_loss_clip": 0.01092115, + "auxiliary_loss_mlp": 0.01029277, + "balance_loss_clip": 1.0342344, + "balance_loss_mlp": 1.01833928, + "epoch": 0.9880054110927401, + "flos": 22233912318720.0, + "grad_norm": 1.600766850259046, + "language_loss": 0.80293298, + "learning_rate": 1.5014859840123405e-09, + "loss": 0.82414687, + "num_input_tokens_seen": 354653180, + "step": 16433, + "time_per_iteration": 2.705249071121216 + }, + { + "auxiliary_loss_clip": 0.01105179, + "auxiliary_loss_mlp": 0.01033969, + "balance_loss_clip": 1.03657746, + "balance_loss_mlp": 1.02180386, + "epoch": 0.9880655343454081, + "flos": 28763836467840.0, + "grad_norm": 2.2504734279341543, + "language_loss": 0.6503619, + "learning_rate": 1.4864354527837075e-09, + "loss": 0.67175341, + "num_input_tokens_seen": 354669900, + "step": 16434, + "time_per_iteration": 2.5459141731262207 + }, + { + "auxiliary_loss_clip": 0.01097534, + "auxiliary_loss_mlp": 0.01033458, + "balance_loss_clip": 1.03373504, + "balance_loss_mlp": 1.0204587, + "epoch": 0.988125657598076, + "flos": 32853379622400.0, + "grad_norm": 1.6032981258064614, + "language_loss": 0.69355786, + "learning_rate": 1.4714607053896154e-09, + "loss": 0.71486771, + "num_input_tokens_seen": 354693165, + "step": 16435, + "time_per_iteration": 2.691948652267456 + }, + { + "auxiliary_loss_clip": 0.01051732, + "auxiliary_loss_mlp": 0.01037505, + "balance_loss_clip": 1.03534651, + "balance_loss_mlp": 1.02496445, + "epoch": 0.988185780850744, + "flos": 19390685316480.0, + "grad_norm": 1.6101555042177864, + "language_loss": 0.75285351, + "learning_rate": 1.4565617423980548e-09, + "loss": 0.77374589, + "num_input_tokens_seen": 354711915, + "step": 16436, + "time_per_iteration": 2.687253475189209 + }, + { + "auxiliary_loss_clip": 0.01078926, + "auxiliary_loss_mlp": 0.01034051, + "balance_loss_clip": 1.03449368, + "balance_loss_mlp": 1.02073479, + "epoch": 0.988245904103412, + "flos": 22528415928960.0, + "grad_norm": 2.1049247557685486, + "language_loss": 0.7397666, + "learning_rate": 1.4417385643741286e-09, + "loss": 0.76089633, + "num_input_tokens_seen": 354729135, + "step": 16437, + "time_per_iteration": 2.6133415699005127 + }, + { + "auxiliary_loss_clip": 0.01070653, + "auxiliary_loss_mlp": 0.01032545, + "balance_loss_clip": 1.03327727, + "balance_loss_mlp": 1.02036154, + "epoch": 0.98830602735608, + "flos": 28659193171200.0, + "grad_norm": 1.7371031624510076, + "language_loss": 0.60138786, + "learning_rate": 1.4269911718796103e-09, + "loss": 0.62241983, + "num_input_tokens_seen": 354752530, + "step": 16438, + "time_per_iteration": 2.747478485107422 + }, + { + "auxiliary_loss_clip": 0.01082521, + "auxiliary_loss_mlp": 0.01033624, + "balance_loss_clip": 1.03334987, + "balance_loss_mlp": 1.02030826, + "epoch": 0.9883661506087479, + "flos": 20996035862400.0, + "grad_norm": 1.7884546303638278, + "language_loss": 0.71630102, + "learning_rate": 1.4123195654738295e-09, + "loss": 0.7374624, + "num_input_tokens_seen": 354771135, + "step": 16439, + "time_per_iteration": 2.64829158782959 + }, + { + "auxiliary_loss_clip": 0.01094806, + "auxiliary_loss_mlp": 0.01032761, + "balance_loss_clip": 1.03649998, + "balance_loss_mlp": 1.02029228, + "epoch": 0.9884262738614159, + "flos": 32706109860480.0, + "grad_norm": 1.9552284330659928, + "language_loss": 0.60129845, + "learning_rate": 1.3977237457134528e-09, + "loss": 0.62257409, + "num_input_tokens_seen": 354791800, + "step": 16440, + "time_per_iteration": 2.709625482559204 + }, + { + "auxiliary_loss_clip": 0.01109217, + "auxiliary_loss_mlp": 0.0103132, + "balance_loss_clip": 1.03572154, + "balance_loss_mlp": 1.01920807, + "epoch": 0.9884863971140839, + "flos": 17564699479680.0, + "grad_norm": 2.3996756667882346, + "language_loss": 0.76234657, + "learning_rate": 1.3832037131513707e-09, + "loss": 0.78375196, + "num_input_tokens_seen": 354809200, + "step": 16441, + "time_per_iteration": 2.5174717903137207 + }, + { + "auxiliary_loss_clip": 0.01084665, + "auxiliary_loss_mlp": 0.0102841, + "balance_loss_clip": 1.03476977, + "balance_loss_mlp": 1.0158329, + "epoch": 0.9885465203667518, + "flos": 40552519380480.0, + "grad_norm": 1.8936887176516917, + "language_loss": 0.67978179, + "learning_rate": 1.3687594683386982e-09, + "loss": 0.70091248, + "num_input_tokens_seen": 354829945, + "step": 16442, + "time_per_iteration": 2.780667781829834 + }, + { + "auxiliary_loss_clip": 0.01094262, + "auxiliary_loss_mlp": 0.01030255, + "balance_loss_clip": 1.03508973, + "balance_loss_mlp": 1.01828051, + "epoch": 0.9886066436194199, + "flos": 13807976768640.0, + "grad_norm": 2.546287070023655, + "language_loss": 0.74541289, + "learning_rate": 1.3543910118227753e-09, + "loss": 0.76665807, + "num_input_tokens_seen": 354845055, + "step": 16443, + "time_per_iteration": 2.5256857872009277 + }, + { + "auxiliary_loss_clip": 0.01085844, + "auxiliary_loss_mlp": 0.01029451, + "balance_loss_clip": 1.03409505, + "balance_loss_mlp": 1.01652241, + "epoch": 0.9886667668720878, + "flos": 23325129544320.0, + "grad_norm": 6.700934436882059, + "language_loss": 0.73816478, + "learning_rate": 1.3400983441487213e-09, + "loss": 0.75931776, + "num_input_tokens_seen": 354864680, + "step": 16444, + "time_per_iteration": 2.6347739696502686 + }, + { + "auxiliary_loss_clip": 0.01058824, + "auxiliary_loss_mlp": 0.0103504, + "balance_loss_clip": 1.03483725, + "balance_loss_mlp": 1.02182567, + "epoch": 0.9887268901247558, + "flos": 22706029704960.0, + "grad_norm": 2.0399337200236483, + "language_loss": 0.69289607, + "learning_rate": 1.325881465858547e-09, + "loss": 0.7138347, + "num_input_tokens_seen": 354885685, + "step": 16445, + "time_per_iteration": 2.7339391708374023 + }, + { + "auxiliary_loss_clip": 0.01101302, + "auxiliary_loss_mlp": 0.01026048, + "balance_loss_clip": 1.03817463, + "balance_loss_mlp": 1.01369166, + "epoch": 0.9887870133774237, + "flos": 13041283944960.0, + "grad_norm": 2.484106533550889, + "language_loss": 0.60372651, + "learning_rate": 1.311740377491155e-09, + "loss": 0.625, + "num_input_tokens_seen": 354901505, + "step": 16446, + "time_per_iteration": 2.571403980255127 + }, + { + "auxiliary_loss_clip": 0.01080619, + "auxiliary_loss_mlp": 0.01032334, + "balance_loss_clip": 1.03539968, + "balance_loss_mlp": 1.02072275, + "epoch": 0.9888471366300917, + "flos": 15158864390400.0, + "grad_norm": 2.54961121966749, + "language_loss": 0.71147966, + "learning_rate": 1.297675079582783e-09, + "loss": 0.73260915, + "num_input_tokens_seen": 354920060, + "step": 16447, + "time_per_iteration": 2.6204898357391357 + }, + { + "auxiliary_loss_clip": 0.01106743, + "auxiliary_loss_mlp": 0.00769349, + "balance_loss_clip": 1.03625035, + "balance_loss_mlp": 1.00023174, + "epoch": 0.9889072598827596, + "flos": 25118796119040.0, + "grad_norm": 2.22311895255621, + "language_loss": 0.83816832, + "learning_rate": 1.2836855726667818e-09, + "loss": 0.85692918, + "num_input_tokens_seen": 354938690, + "step": 16448, + "time_per_iteration": 2.615037679672241 + }, + { + "auxiliary_loss_clip": 0.01093774, + "auxiliary_loss_mlp": 0.01028295, + "balance_loss_clip": 1.03621387, + "balance_loss_mlp": 1.0171665, + "epoch": 0.9889673831354276, + "flos": 16728663450240.0, + "grad_norm": 1.5811514661156387, + "language_loss": 0.69698024, + "learning_rate": 1.26977185727406e-09, + "loss": 0.71820092, + "num_input_tokens_seen": 354956955, + "step": 16449, + "time_per_iteration": 2.5541889667510986 + }, + { + "auxiliary_loss_clip": 0.0109972, + "auxiliary_loss_mlp": 0.0102696, + "balance_loss_clip": 1.03743207, + "balance_loss_mlp": 1.01456869, + "epoch": 0.9890275063880956, + "flos": 35585175657600.0, + "grad_norm": 2.2330985869575106, + "language_loss": 0.7364139, + "learning_rate": 1.25593393393153e-09, + "loss": 0.75768065, + "num_input_tokens_seen": 354976800, + "step": 16450, + "time_per_iteration": 2.722463846206665 + }, + { + "auxiliary_loss_clip": 0.01108427, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.0343194, + "balance_loss_mlp": 1.01945782, + "epoch": 0.9890876296407636, + "flos": 18952359649920.0, + "grad_norm": 1.7405814084688636, + "language_loss": 0.79538721, + "learning_rate": 1.242171803164549e-09, + "loss": 0.81679177, + "num_input_tokens_seen": 354996625, + "step": 16451, + "time_per_iteration": 2.5799307823181152 + }, + { + "auxiliary_loss_clip": 0.01072025, + "auxiliary_loss_mlp": 0.01037826, + "balance_loss_clip": 1.03292084, + "balance_loss_mlp": 1.02433717, + "epoch": 0.9891477528934315, + "flos": 23769309127680.0, + "grad_norm": 2.076913559177625, + "language_loss": 0.70177102, + "learning_rate": 1.2284854654946996e-09, + "loss": 0.72286958, + "num_input_tokens_seen": 355014535, + "step": 16452, + "time_per_iteration": 2.6568350791931152 + }, + { + "auxiliary_loss_clip": 0.01106285, + "auxiliary_loss_mlp": 0.010265, + "balance_loss_clip": 1.03735638, + "balance_loss_mlp": 1.01531219, + "epoch": 0.9892078761460995, + "flos": 20772922533120.0, + "grad_norm": 1.7039408259240933, + "language_loss": 0.73759556, + "learning_rate": 1.2148749214409004e-09, + "loss": 0.75892341, + "num_input_tokens_seen": 355033280, + "step": 16453, + "time_per_iteration": 2.526846170425415 + }, + { + "auxiliary_loss_clip": 0.01068886, + "auxiliary_loss_mlp": 0.0103825, + "balance_loss_clip": 1.03598034, + "balance_loss_mlp": 1.02607906, + "epoch": 0.9892679993987675, + "flos": 23367827836800.0, + "grad_norm": 2.0358391498117765, + "language_loss": 0.69925886, + "learning_rate": 1.2013401715191828e-09, + "loss": 0.72033024, + "num_input_tokens_seen": 355053320, + "step": 16454, + "time_per_iteration": 2.7736165523529053 + }, + { + "auxiliary_loss_clip": 0.01077684, + "auxiliary_loss_mlp": 0.01031622, + "balance_loss_clip": 1.03315997, + "balance_loss_mlp": 1.01950455, + "epoch": 0.9893281226514354, + "flos": 22705419173760.0, + "grad_norm": 1.754815441534383, + "language_loss": 0.75814426, + "learning_rate": 1.1878812162433583e-09, + "loss": 0.77923727, + "num_input_tokens_seen": 355070230, + "step": 16455, + "time_per_iteration": 2.626431941986084 + }, + { + "auxiliary_loss_clip": 0.0107961, + "auxiliary_loss_mlp": 0.01026151, + "balance_loss_clip": 1.03627825, + "balance_loss_mlp": 1.01435518, + "epoch": 0.9893882459041035, + "flos": 21796664060160.0, + "grad_norm": 2.3755436774026037, + "language_loss": 0.6552164, + "learning_rate": 1.1744980561230188e-09, + "loss": 0.676274, + "num_input_tokens_seen": 355090125, + "step": 16456, + "time_per_iteration": 2.6569387912750244 + }, + { + "auxiliary_loss_clip": 0.01100413, + "auxiliary_loss_mlp": 0.01031079, + "balance_loss_clip": 1.03849792, + "balance_loss_mlp": 1.01922965, + "epoch": 0.9894483691567714, + "flos": 18113773754880.0, + "grad_norm": 2.3001936476450484, + "language_loss": 0.73839563, + "learning_rate": 1.161190691666203e-09, + "loss": 0.75971055, + "num_input_tokens_seen": 355107890, + "step": 16457, + "time_per_iteration": 2.674736738204956 + }, + { + "auxiliary_loss_clip": 0.01108737, + "auxiliary_loss_mlp": 0.01029092, + "balance_loss_clip": 1.03762496, + "balance_loss_mlp": 1.01680112, + "epoch": 0.9895084924094394, + "flos": 31211615664000.0, + "grad_norm": 2.2264264445995474, + "language_loss": 0.6879859, + "learning_rate": 1.1479591233773954e-09, + "loss": 0.70936424, + "num_input_tokens_seen": 355126340, + "step": 16458, + "time_per_iteration": 2.615215301513672 + }, + { + "auxiliary_loss_clip": 0.01093615, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.0354172, + "balance_loss_mlp": 1.01881158, + "epoch": 0.9895686156621073, + "flos": 19678042120320.0, + "grad_norm": 1.6836703680245058, + "language_loss": 0.79359543, + "learning_rate": 1.1348033517581956e-09, + "loss": 0.81483769, + "num_input_tokens_seen": 355144025, + "step": 16459, + "time_per_iteration": 2.5571677684783936 + }, + { + "auxiliary_loss_clip": 0.01083172, + "auxiliary_loss_mlp": 0.01034843, + "balance_loss_clip": 1.03401232, + "balance_loss_mlp": 1.02273118, + "epoch": 0.9896287389147753, + "flos": 23581675457280.0, + "grad_norm": 1.883745911652252, + "language_loss": 0.7132234, + "learning_rate": 1.1217233773075373e-09, + "loss": 0.73440349, + "num_input_tokens_seen": 355163125, + "step": 16460, + "time_per_iteration": 2.626668691635132 + }, + { + "auxiliary_loss_clip": 0.01086508, + "auxiliary_loss_mlp": 0.01026002, + "balance_loss_clip": 1.03445435, + "balance_loss_mlp": 1.01346099, + "epoch": 0.9896888621674432, + "flos": 29605331364480.0, + "grad_norm": 1.5613662208047323, + "language_loss": 0.87661237, + "learning_rate": 1.1087192005214685e-09, + "loss": 0.8977375, + "num_input_tokens_seen": 355184060, + "step": 16461, + "time_per_iteration": 2.7060861587524414 + }, + { + "auxiliary_loss_clip": 0.01095459, + "auxiliary_loss_mlp": 0.01032824, + "balance_loss_clip": 1.03561902, + "balance_loss_mlp": 1.01949632, + "epoch": 0.9897489854201112, + "flos": 23695045758720.0, + "grad_norm": 1.7346501147556415, + "language_loss": 0.62446827, + "learning_rate": 1.09579082189315e-09, + "loss": 0.64575106, + "num_input_tokens_seen": 355204505, + "step": 16462, + "time_per_iteration": 2.64906907081604 + }, + { + "auxiliary_loss_clip": 0.01100978, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.03905725, + "balance_loss_mlp": 1.02028179, + "epoch": 0.9898091086727792, + "flos": 13225146687360.0, + "grad_norm": 1.8196712786211515, + "language_loss": 0.72961009, + "learning_rate": 1.0829382419126343e-09, + "loss": 0.75093973, + "num_input_tokens_seen": 355223055, + "step": 16463, + "time_per_iteration": 5.664719343185425 + }, + { + "auxiliary_loss_clip": 0.01097369, + "auxiliary_loss_mlp": 0.01031261, + "balance_loss_clip": 1.03589058, + "balance_loss_mlp": 1.01759946, + "epoch": 0.9898692319254472, + "flos": 22930400010240.0, + "grad_norm": 1.8790074246381347, + "language_loss": 0.69955069, + "learning_rate": 1.0701614610675314e-09, + "loss": 0.720837, + "num_input_tokens_seen": 355242000, + "step": 16464, + "time_per_iteration": 4.500953197479248 + }, + { + "auxiliary_loss_clip": 0.01079876, + "auxiliary_loss_mlp": 0.0102935, + "balance_loss_clip": 1.03554177, + "balance_loss_mlp": 1.01688099, + "epoch": 0.9899293551781151, + "flos": 12458346122880.0, + "grad_norm": 2.0237880256001635, + "language_loss": 0.73348618, + "learning_rate": 1.0574604798421204e-09, + "loss": 0.75457835, + "num_input_tokens_seen": 355260175, + "step": 16465, + "time_per_iteration": 2.6900930404663086 + }, + { + "auxiliary_loss_clip": 0.01104028, + "auxiliary_loss_mlp": 0.010347, + "balance_loss_clip": 1.03416681, + "balance_loss_mlp": 1.02323794, + "epoch": 0.9899894784307831, + "flos": 26871129118080.0, + "grad_norm": 1.754568063294171, + "language_loss": 0.86540592, + "learning_rate": 1.0448352987182386e-09, + "loss": 0.88679326, + "num_input_tokens_seen": 355281930, + "step": 16466, + "time_per_iteration": 2.5950276851654053 + }, + { + "auxiliary_loss_clip": 0.01071496, + "auxiliary_loss_mlp": 0.01024722, + "balance_loss_clip": 1.03584099, + "balance_loss_mlp": 1.01242614, + "epoch": 0.990049601683451, + "flos": 21542093395200.0, + "grad_norm": 1.7230422201542275, + "language_loss": 0.71486777, + "learning_rate": 1.0322859181743915e-09, + "loss": 0.73583001, + "num_input_tokens_seen": 355301555, + "step": 16467, + "time_per_iteration": 2.7708022594451904 + }, + { + "auxiliary_loss_clip": 0.0108033, + "auxiliary_loss_mlp": 0.0104001, + "balance_loss_clip": 1.03324151, + "balance_loss_mlp": 1.02659893, + "epoch": 0.990109724936119, + "flos": 28771809287040.0, + "grad_norm": 1.3753584839252895, + "language_loss": 0.65033233, + "learning_rate": 1.019812338686643e-09, + "loss": 0.67153573, + "num_input_tokens_seen": 355324925, + "step": 16468, + "time_per_iteration": 4.24141263961792 + }, + { + "auxiliary_loss_clip": 0.01079098, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.03625393, + "balance_loss_mlp": 1.01935673, + "epoch": 0.9901698481887871, + "flos": 29274270687360.0, + "grad_norm": 2.0452120517655943, + "language_loss": 0.62340331, + "learning_rate": 1.0074145607281704e-09, + "loss": 0.64450687, + "num_input_tokens_seen": 355343875, + "step": 16469, + "time_per_iteration": 2.7885043621063232 + }, + { + "auxiliary_loss_clip": 0.01073759, + "auxiliary_loss_mlp": 0.01031927, + "balance_loss_clip": 1.03337479, + "balance_loss_mlp": 1.01906407, + "epoch": 0.990229971441455, + "flos": 15959025711360.0, + "grad_norm": 2.562370039861896, + "language_loss": 0.70241368, + "learning_rate": 9.950925847685976e-10, + "loss": 0.72347051, + "num_input_tokens_seen": 355358835, + "step": 16470, + "time_per_iteration": 2.6540679931640625 + }, + { + "auxiliary_loss_clip": 0.01019159, + "auxiliary_loss_mlp": 0.01000231, + "balance_loss_clip": 1.00684953, + "balance_loss_mlp": 0.99926519, + "epoch": 0.990290094694123, + "flos": 69780287911680.0, + "grad_norm": 0.6776686516780072, + "language_loss": 0.55451435, + "learning_rate": 9.828464112755509e-10, + "loss": 0.57470822, + "num_input_tokens_seen": 355431225, + "step": 16471, + "time_per_iteration": 3.345576047897339 + }, + { + "auxiliary_loss_clip": 0.01088522, + "auxiliary_loss_mlp": 0.01034754, + "balance_loss_clip": 1.03816175, + "balance_loss_mlp": 1.02205849, + "epoch": 0.9903502179467909, + "flos": 16252451913600.0, + "grad_norm": 2.029976016877621, + "language_loss": 0.83828497, + "learning_rate": 9.706760407131032e-10, + "loss": 0.85951781, + "num_input_tokens_seen": 355448250, + "step": 16472, + "time_per_iteration": 2.7130064964294434 + }, + { + "auxiliary_loss_clip": 0.01095822, + "auxiliary_loss_mlp": 0.01026203, + "balance_loss_clip": 1.03632557, + "balance_loss_mlp": 1.01452053, + "epoch": 0.9904103411994589, + "flos": 21688393489920.0, + "grad_norm": 1.9314835507092933, + "language_loss": 0.8592447, + "learning_rate": 9.585814735431075e-10, + "loss": 0.88046497, + "num_input_tokens_seen": 355467040, + "step": 16473, + "time_per_iteration": 2.6023082733154297 + }, + { + "auxiliary_loss_clip": 0.01105804, + "auxiliary_loss_mlp": 0.01029675, + "balance_loss_clip": 1.03511405, + "balance_loss_mlp": 1.01830196, + "epoch": 0.9904704644521268, + "flos": 25739440243200.0, + "grad_norm": 1.8812560615029836, + "language_loss": 0.84657192, + "learning_rate": 9.465627102240859e-10, + "loss": 0.86792672, + "num_input_tokens_seen": 355487825, + "step": 16474, + "time_per_iteration": 2.6265671253204346 + }, + { + "auxiliary_loss_clip": 0.01079812, + "auxiliary_loss_mlp": 0.01035461, + "balance_loss_clip": 1.03155684, + "balance_loss_mlp": 1.0240823, + "epoch": 0.9905305877047949, + "flos": 21908346422400.0, + "grad_norm": 1.8096895142828726, + "language_loss": 0.76610988, + "learning_rate": 9.346197512116738e-10, + "loss": 0.78726262, + "num_input_tokens_seen": 355507445, + "step": 16475, + "time_per_iteration": 2.642179012298584 + }, + { + "auxiliary_loss_clip": 0.0106673, + "auxiliary_loss_mlp": 0.01035079, + "balance_loss_clip": 1.03210354, + "balance_loss_mlp": 1.02151895, + "epoch": 0.9905907109574628, + "flos": 21392417422080.0, + "grad_norm": 1.7909726122896426, + "language_loss": 0.76034641, + "learning_rate": 9.227525969588423e-10, + "loss": 0.78136444, + "num_input_tokens_seen": 355527205, + "step": 16476, + "time_per_iteration": 2.6616551876068115 + }, + { + "auxiliary_loss_clip": 0.01101675, + "auxiliary_loss_mlp": 0.00771329, + "balance_loss_clip": 1.03651643, + "balance_loss_mlp": 1.00030255, + "epoch": 0.9906508342101308, + "flos": 20521620005760.0, + "grad_norm": 2.1261117563309884, + "language_loss": 0.6759547, + "learning_rate": 9.109612479154538e-10, + "loss": 0.69468474, + "num_input_tokens_seen": 355544740, + "step": 16477, + "time_per_iteration": 2.5836856365203857 + }, + { + "auxiliary_loss_clip": 0.0109369, + "auxiliary_loss_mlp": 0.01034434, + "balance_loss_clip": 1.03875303, + "balance_loss_mlp": 1.02113652, + "epoch": 0.9907109574627987, + "flos": 21361211481600.0, + "grad_norm": 2.3791528799950283, + "language_loss": 0.71740925, + "learning_rate": 8.992457045289282e-10, + "loss": 0.7386905, + "num_input_tokens_seen": 355564385, + "step": 16478, + "time_per_iteration": 2.6684231758117676 + }, + { + "auxiliary_loss_clip": 0.0110905, + "auxiliary_loss_mlp": 0.01040049, + "balance_loss_clip": 1.03671718, + "balance_loss_mlp": 1.02660859, + "epoch": 0.9907710807154667, + "flos": 17338605321600.0, + "grad_norm": 2.44296615516407, + "language_loss": 0.80982149, + "learning_rate": 8.876059672433545e-10, + "loss": 0.83131254, + "num_input_tokens_seen": 355579260, + "step": 16479, + "time_per_iteration": 2.536628484725952 + }, + { + "auxiliary_loss_clip": 0.01099491, + "auxiliary_loss_mlp": 0.01033699, + "balance_loss_clip": 1.03593194, + "balance_loss_mlp": 1.02183723, + "epoch": 0.9908312039681346, + "flos": 28621881918720.0, + "grad_norm": 1.8095138235680064, + "language_loss": 0.66404873, + "learning_rate": 8.760420364999355e-10, + "loss": 0.68538064, + "num_input_tokens_seen": 355599790, + "step": 16480, + "time_per_iteration": 2.675546884536743 + }, + { + "auxiliary_loss_clip": 0.0109416, + "auxiliary_loss_mlp": 0.01032967, + "balance_loss_clip": 1.03466868, + "balance_loss_mlp": 1.02073646, + "epoch": 0.9908913272208026, + "flos": 35770654512000.0, + "grad_norm": 1.7378127875185636, + "language_loss": 0.72355247, + "learning_rate": 8.645539127374313e-10, + "loss": 0.74482375, + "num_input_tokens_seen": 355620925, + "step": 16481, + "time_per_iteration": 2.702287197113037 + }, + { + "auxiliary_loss_clip": 0.01095367, + "auxiliary_loss_mlp": 0.01023626, + "balance_loss_clip": 1.03528941, + "balance_loss_mlp": 1.01157379, + "epoch": 0.9909514504734707, + "flos": 19902196944000.0, + "grad_norm": 1.99195789913312, + "language_loss": 0.77529383, + "learning_rate": 8.531415963912713e-10, + "loss": 0.79648376, + "num_input_tokens_seen": 355639165, + "step": 16482, + "time_per_iteration": 2.623577117919922 + }, + { + "auxiliary_loss_clip": 0.01099605, + "auxiliary_loss_mlp": 0.01030033, + "balance_loss_clip": 1.03522539, + "balance_loss_mlp": 1.01804006, + "epoch": 0.9910115737261386, + "flos": 20004793165440.0, + "grad_norm": 1.7513452456570024, + "language_loss": 0.75167656, + "learning_rate": 8.418050878944427e-10, + "loss": 0.772973, + "num_input_tokens_seen": 355657320, + "step": 16483, + "time_per_iteration": 2.6707489490509033 + }, + { + "auxiliary_loss_clip": 0.01018817, + "auxiliary_loss_mlp": 0.01002356, + "balance_loss_clip": 1.00542712, + "balance_loss_mlp": 1.0013783, + "epoch": 0.9910716969788066, + "flos": 70688432494080.0, + "grad_norm": 0.6739717924016945, + "language_loss": 0.53652573, + "learning_rate": 8.305443876768237e-10, + "loss": 0.55673742, + "num_input_tokens_seen": 355726370, + "step": 16484, + "time_per_iteration": 3.2860820293426514 + }, + { + "auxiliary_loss_clip": 0.01103552, + "auxiliary_loss_mlp": 0.0103248, + "balance_loss_clip": 1.03575015, + "balance_loss_mlp": 1.0208987, + "epoch": 0.9911318202314745, + "flos": 21434038306560.0, + "grad_norm": 1.844066586555626, + "language_loss": 0.82151747, + "learning_rate": 8.19359496165184e-10, + "loss": 0.84287775, + "num_input_tokens_seen": 355745840, + "step": 16485, + "time_per_iteration": 2.572507619857788 + }, + { + "auxiliary_loss_clip": 0.0106644, + "auxiliary_loss_mlp": 0.01039998, + "balance_loss_clip": 1.03260577, + "balance_loss_mlp": 1.02652156, + "epoch": 0.9911919434841425, + "flos": 19826820253440.0, + "grad_norm": 1.5753889689051752, + "language_loss": 0.81565136, + "learning_rate": 8.082504137836288e-10, + "loss": 0.83671576, + "num_input_tokens_seen": 355763385, + "step": 16486, + "time_per_iteration": 2.6582581996917725 + }, + { + "auxiliary_loss_clip": 0.01099209, + "auxiliary_loss_mlp": 0.01034412, + "balance_loss_clip": 1.03707922, + "balance_loss_mlp": 1.02275944, + "epoch": 0.9912520667368104, + "flos": 41719364691840.0, + "grad_norm": 1.4744278341882329, + "language_loss": 0.66241539, + "learning_rate": 7.972171409538209e-10, + "loss": 0.68375158, + "num_input_tokens_seen": 355786075, + "step": 16487, + "time_per_iteration": 2.90215802192688 + }, + { + "auxiliary_loss_clip": 0.01094686, + "auxiliary_loss_mlp": 0.00769572, + "balance_loss_clip": 1.03547466, + "balance_loss_mlp": 1.00026965, + "epoch": 0.9913121899894785, + "flos": 23769668263680.0, + "grad_norm": 1.5978817773669494, + "language_loss": 0.76796007, + "learning_rate": 7.862596780936481e-10, + "loss": 0.78660262, + "num_input_tokens_seen": 355806295, + "step": 16488, + "time_per_iteration": 2.771479368209839 + }, + { + "auxiliary_loss_clip": 0.01080089, + "auxiliary_loss_mlp": 0.01030538, + "balance_loss_clip": 1.03599024, + "balance_loss_mlp": 1.01780689, + "epoch": 0.9913723132421464, + "flos": 23769668263680.0, + "grad_norm": 9.931415679730078, + "language_loss": 0.68562698, + "learning_rate": 7.753780256190001e-10, + "loss": 0.70673329, + "num_input_tokens_seen": 355825730, + "step": 16489, + "time_per_iteration": 2.8262085914611816 + }, + { + "auxiliary_loss_clip": 0.00990057, + "auxiliary_loss_mlp": 0.01006045, + "balance_loss_clip": 1.00620961, + "balance_loss_mlp": 1.00509155, + "epoch": 0.9914324364948144, + "flos": 71267419820160.0, + "grad_norm": 0.6117813667004609, + "language_loss": 0.52562964, + "learning_rate": 7.645721839424357e-10, + "loss": 0.54559064, + "num_input_tokens_seen": 355891545, + "step": 16490, + "time_per_iteration": 3.339395523071289 + }, + { + "auxiliary_loss_clip": 0.01081829, + "auxiliary_loss_mlp": 0.01038351, + "balance_loss_clip": 1.03499651, + "balance_loss_mlp": 1.02465963, + "epoch": 0.9914925597474823, + "flos": 23695440808320.0, + "grad_norm": 1.555410578963239, + "language_loss": 0.75512695, + "learning_rate": 7.538421534734052e-10, + "loss": 0.7763288, + "num_input_tokens_seen": 355909920, + "step": 16491, + "time_per_iteration": 2.7577908039093018 + }, + { + "auxiliary_loss_clip": 0.01068183, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.03941417, + "balance_loss_mlp": 1.02027285, + "epoch": 0.9915526830001503, + "flos": 13433822749440.0, + "grad_norm": 2.4260837732983656, + "language_loss": 0.70534217, + "learning_rate": 7.431879346191383e-10, + "loss": 0.72635806, + "num_input_tokens_seen": 355923130, + "step": 16492, + "time_per_iteration": 2.717663288116455 + }, + { + "auxiliary_loss_clip": 0.01072141, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.03324449, + "balance_loss_mlp": 1.02005327, + "epoch": 0.9916128062528182, + "flos": 20740962407040.0, + "grad_norm": 1.9482484238506383, + "language_loss": 0.6859107, + "learning_rate": 7.326095277837563e-10, + "loss": 0.7069636, + "num_input_tokens_seen": 355941960, + "step": 16493, + "time_per_iteration": 2.626917839050293 + }, + { + "auxiliary_loss_clip": 0.01084989, + "auxiliary_loss_mlp": 0.01034596, + "balance_loss_clip": 1.03742933, + "balance_loss_mlp": 1.02203131, + "epoch": 0.9916729295054862, + "flos": 22487082353280.0, + "grad_norm": 1.8545300883783822, + "language_loss": 0.7110008, + "learning_rate": 7.221069333678276e-10, + "loss": 0.73219669, + "num_input_tokens_seen": 355961640, + "step": 16494, + "time_per_iteration": 2.6934683322906494 + }, + { + "auxiliary_loss_clip": 0.01098932, + "auxiliary_loss_mlp": 0.0103153, + "balance_loss_clip": 1.0362227, + "balance_loss_mlp": 1.01829231, + "epoch": 0.9917330527581543, + "flos": 14792467708800.0, + "grad_norm": 3.5378309901622007, + "language_loss": 0.68413657, + "learning_rate": 7.116801517701443e-10, + "loss": 0.70544124, + "num_input_tokens_seen": 355977980, + "step": 16495, + "time_per_iteration": 2.6093251705169678 + }, + { + "auxiliary_loss_clip": 0.0101026, + "auxiliary_loss_mlp": 0.01002792, + "balance_loss_clip": 1.00642037, + "balance_loss_mlp": 1.00182664, + "epoch": 0.9917931760108222, + "flos": 59191595585280.0, + "grad_norm": 0.7170668056568608, + "language_loss": 0.53470147, + "learning_rate": 7.013291833859458e-10, + "loss": 0.55483198, + "num_input_tokens_seen": 356042900, + "step": 16496, + "time_per_iteration": 3.3146417140960693 + }, + { + "auxiliary_loss_clip": 0.01085309, + "auxiliary_loss_mlp": 0.00774025, + "balance_loss_clip": 1.03571773, + "balance_loss_mlp": 1.0002538, + "epoch": 0.9918532992634902, + "flos": 26761637485440.0, + "grad_norm": 1.4656425983930446, + "language_loss": 0.71419513, + "learning_rate": 6.91054028607585e-10, + "loss": 0.73278844, + "num_input_tokens_seen": 356063000, + "step": 16497, + "time_per_iteration": 2.7043471336364746 + }, + { + "auxiliary_loss_clip": 0.01081862, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.03701067, + "balance_loss_mlp": 1.02047634, + "epoch": 0.9919134225161581, + "flos": 14975719920000.0, + "grad_norm": 2.009806913835038, + "language_loss": 0.82173979, + "learning_rate": 6.808546878249721e-10, + "loss": 0.84289509, + "num_input_tokens_seen": 356078130, + "step": 16498, + "time_per_iteration": 2.7074508666992188 + }, + { + "auxiliary_loss_clip": 0.01075485, + "auxiliary_loss_mlp": 0.01035966, + "balance_loss_clip": 1.03759611, + "balance_loss_mlp": 1.02313316, + "epoch": 0.9919735457688261, + "flos": 27818201064960.0, + "grad_norm": 1.7332426459484291, + "language_loss": 0.68403494, + "learning_rate": 6.707311614246869e-10, + "loss": 0.70514941, + "num_input_tokens_seen": 356101655, + "step": 16499, + "time_per_iteration": 2.7545838356018066 + }, + { + "auxiliary_loss_clip": 0.01111074, + "auxiliary_loss_mlp": 0.01029883, + "balance_loss_clip": 1.03827095, + "balance_loss_mlp": 1.01769972, + "epoch": 0.992033669021494, + "flos": 22562782266240.0, + "grad_norm": 1.7667057026294906, + "language_loss": 0.8223446, + "learning_rate": 6.606834497904223e-10, + "loss": 0.84375417, + "num_input_tokens_seen": 356121425, + "step": 16500, + "time_per_iteration": 2.587153911590576 + }, + { + "auxiliary_loss_clip": 0.01080633, + "auxiliary_loss_mlp": 0.01032066, + "balance_loss_clip": 1.03477311, + "balance_loss_mlp": 1.01933479, + "epoch": 0.9920937922741621, + "flos": 25374587846400.0, + "grad_norm": 1.774651095771433, + "language_loss": 0.81949353, + "learning_rate": 6.507115533036511e-10, + "loss": 0.84062058, + "num_input_tokens_seen": 356140710, + "step": 16501, + "time_per_iteration": 2.769408702850342 + }, + { + "auxiliary_loss_clip": 0.01098639, + "auxiliary_loss_mlp": 0.01029969, + "balance_loss_clip": 1.03593433, + "balance_loss_mlp": 1.01757097, + "epoch": 0.99215391552683, + "flos": 22054466949120.0, + "grad_norm": 2.025781816358009, + "language_loss": 0.76823413, + "learning_rate": 6.408154723420711e-10, + "loss": 0.78952026, + "num_input_tokens_seen": 356159835, + "step": 16502, + "time_per_iteration": 4.115024566650391 + }, + { + "auxiliary_loss_clip": 0.01083856, + "auxiliary_loss_mlp": 0.01031996, + "balance_loss_clip": 1.03553808, + "balance_loss_mlp": 1.01820326, + "epoch": 0.992214038779498, + "flos": 15413937845760.0, + "grad_norm": 3.0841815262581127, + "language_loss": 0.71393132, + "learning_rate": 6.309952072811597e-10, + "loss": 0.7350899, + "num_input_tokens_seen": 356177555, + "step": 16503, + "time_per_iteration": 4.208997964859009 + }, + { + "auxiliary_loss_clip": 0.0101848, + "auxiliary_loss_mlp": 0.01003931, + "balance_loss_clip": 1.00507569, + "balance_loss_mlp": 1.00273323, + "epoch": 0.9922741620321659, + "flos": 62014498467840.0, + "grad_norm": 0.631144225573371, + "language_loss": 0.55076844, + "learning_rate": 6.212507584932858e-10, + "loss": 0.57099259, + "num_input_tokens_seen": 356244975, + "step": 16504, + "time_per_iteration": 4.945772647857666 + }, + { + "auxiliary_loss_clip": 0.01075926, + "auxiliary_loss_mlp": 0.01024279, + "balance_loss_clip": 1.0352273, + "balance_loss_mlp": 1.01286459, + "epoch": 0.9923342852848339, + "flos": 17165480745600.0, + "grad_norm": 1.970652781818717, + "language_loss": 0.6979568, + "learning_rate": 6.115821263481536e-10, + "loss": 0.71895891, + "num_input_tokens_seen": 356262605, + "step": 16505, + "time_per_iteration": 2.655355453491211 + }, + { + "auxiliary_loss_clip": 0.01074237, + "auxiliary_loss_mlp": 0.01032467, + "balance_loss_clip": 1.0348562, + "balance_loss_mlp": 1.01904368, + "epoch": 0.9923944085375018, + "flos": 23183210908800.0, + "grad_norm": 1.9797286096997044, + "language_loss": 0.65255636, + "learning_rate": 6.019893112119146e-10, + "loss": 0.67362338, + "num_input_tokens_seen": 356278935, + "step": 16506, + "time_per_iteration": 2.8993325233459473 + }, + { + "auxiliary_loss_clip": 0.01044661, + "auxiliary_loss_mlp": 0.01028638, + "balance_loss_clip": 1.03355384, + "balance_loss_mlp": 1.01587033, + "epoch": 0.9924545317901698, + "flos": 20813861059200.0, + "grad_norm": 3.222870025511436, + "language_loss": 0.62715226, + "learning_rate": 5.924723134487219e-10, + "loss": 0.64788526, + "num_input_tokens_seen": 356295675, + "step": 16507, + "time_per_iteration": 4.278958559036255 + }, + { + "auxiliary_loss_clip": 0.01108709, + "auxiliary_loss_mlp": 0.01033793, + "balance_loss_clip": 1.03649449, + "balance_loss_mlp": 1.02098358, + "epoch": 0.9925146550428379, + "flos": 20083437993600.0, + "grad_norm": 2.2915394393265567, + "language_loss": 0.73150027, + "learning_rate": 5.830311334193983e-10, + "loss": 0.75292528, + "num_input_tokens_seen": 356312885, + "step": 16508, + "time_per_iteration": 2.5459229946136475 + }, + { + "auxiliary_loss_clip": 0.01107576, + "auxiliary_loss_mlp": 0.0102854, + "balance_loss_clip": 1.03538799, + "balance_loss_mlp": 1.01548636, + "epoch": 0.9925747782955058, + "flos": 24973717086720.0, + "grad_norm": 1.713644660775738, + "language_loss": 0.70212501, + "learning_rate": 5.736657714818793e-10, + "loss": 0.72348613, + "num_input_tokens_seen": 356334070, + "step": 16509, + "time_per_iteration": 2.731260299682617 + }, + { + "auxiliary_loss_clip": 0.01096747, + "auxiliary_loss_mlp": 0.01036856, + "balance_loss_clip": 1.03462338, + "balance_loss_mlp": 1.02401757, + "epoch": 0.9926349015481738, + "flos": 60472526492160.0, + "grad_norm": 1.6611558247345184, + "language_loss": 0.68540096, + "learning_rate": 5.643762279912146e-10, + "loss": 0.70673692, + "num_input_tokens_seen": 356359410, + "step": 16510, + "time_per_iteration": 3.000253438949585 + }, + { + "auxiliary_loss_clip": 0.01074893, + "auxiliary_loss_mlp": 0.01037014, + "balance_loss_clip": 1.03464723, + "balance_loss_mlp": 1.02426445, + "epoch": 0.9926950248008417, + "flos": 20741716592640.0, + "grad_norm": 2.1741536524362544, + "language_loss": 0.81332445, + "learning_rate": 5.551625032997886e-10, + "loss": 0.83444357, + "num_input_tokens_seen": 356378345, + "step": 16511, + "time_per_iteration": 2.708442211151123 + }, + { + "auxiliary_loss_clip": 0.01064556, + "auxiliary_loss_mlp": 0.01032872, + "balance_loss_clip": 1.03382301, + "balance_loss_mlp": 1.02089787, + "epoch": 0.9927551480535097, + "flos": 24352965221760.0, + "grad_norm": 1.9230315347792497, + "language_loss": 0.91452694, + "learning_rate": 5.460245977570998e-10, + "loss": 0.93550122, + "num_input_tokens_seen": 356397345, + "step": 16512, + "time_per_iteration": 2.6810131072998047 + }, + { + "auxiliary_loss_clip": 0.00999495, + "auxiliary_loss_mlp": 0.01002603, + "balance_loss_clip": 1.00600088, + "balance_loss_mlp": 1.00150681, + "epoch": 0.9928152713061776, + "flos": 71275572207360.0, + "grad_norm": 0.7168790027045711, + "language_loss": 0.55182147, + "learning_rate": 5.369625117095378e-10, + "loss": 0.57184243, + "num_input_tokens_seen": 356459160, + "step": 16513, + "time_per_iteration": 3.3187079429626465 + }, + { + "auxiliary_loss_clip": 0.01081239, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.03556442, + "balance_loss_mlp": 1.01862729, + "epoch": 0.9928753945588457, + "flos": 57809499045120.0, + "grad_norm": 1.3394977995740782, + "language_loss": 0.65011883, + "learning_rate": 5.279762455006054e-10, + "loss": 0.67124498, + "num_input_tokens_seen": 356486405, + "step": 16514, + "time_per_iteration": 2.9586453437805176 + }, + { + "auxiliary_loss_clip": 0.01077404, + "auxiliary_loss_mlp": 0.01029066, + "balance_loss_clip": 1.03275108, + "balance_loss_mlp": 1.01589894, + "epoch": 0.9929355178115136, + "flos": 19568981450880.0, + "grad_norm": 1.9082841618912534, + "language_loss": 0.73075938, + "learning_rate": 5.190657994713632e-10, + "loss": 0.75182408, + "num_input_tokens_seen": 356502905, + "step": 16515, + "time_per_iteration": 2.7386841773986816 + }, + { + "auxiliary_loss_clip": 0.01065642, + "auxiliary_loss_mlp": 0.0104261, + "balance_loss_clip": 1.03322613, + "balance_loss_mlp": 1.02893686, + "epoch": 0.9929956410641816, + "flos": 22964658606720.0, + "grad_norm": 1.635364336808878, + "language_loss": 0.77238375, + "learning_rate": 5.102311739593191e-10, + "loss": 0.79346621, + "num_input_tokens_seen": 356523830, + "step": 16516, + "time_per_iteration": 2.7601654529571533 + }, + { + "auxiliary_loss_clip": 0.01077729, + "auxiliary_loss_mlp": 0.01026442, + "balance_loss_clip": 1.0354166, + "balance_loss_mlp": 1.01530802, + "epoch": 0.9930557643168495, + "flos": 22566409539840.0, + "grad_norm": 1.7024793755229197, + "language_loss": 0.78187561, + "learning_rate": 5.014723692997602e-10, + "loss": 0.8029173, + "num_input_tokens_seen": 356543965, + "step": 16517, + "time_per_iteration": 2.891570568084717 + }, + { + "auxiliary_loss_clip": 0.01097555, + "auxiliary_loss_mlp": 0.01037224, + "balance_loss_clip": 1.03813481, + "balance_loss_mlp": 1.02333033, + "epoch": 0.9931158875695175, + "flos": 17201032231680.0, + "grad_norm": 2.4652288610488604, + "language_loss": 0.67716908, + "learning_rate": 4.927893858248655e-10, + "loss": 0.69851696, + "num_input_tokens_seen": 356561530, + "step": 16518, + "time_per_iteration": 2.646632432937622 + }, + { + "auxiliary_loss_clip": 0.01008101, + "auxiliary_loss_mlp": 0.01002102, + "balance_loss_clip": 1.00879121, + "balance_loss_mlp": 1.00086808, + "epoch": 0.9931760108221854, + "flos": 63711204278400.0, + "grad_norm": 0.7468001018305941, + "language_loss": 0.53340489, + "learning_rate": 4.84182223863483e-10, + "loss": 0.55350691, + "num_input_tokens_seen": 356616845, + "step": 16519, + "time_per_iteration": 3.0809152126312256 + }, + { + "auxiliary_loss_clip": 0.01065697, + "auxiliary_loss_mlp": 0.01041583, + "balance_loss_clip": 1.03317142, + "balance_loss_mlp": 1.02780831, + "epoch": 0.9932361340748534, + "flos": 15304805349120.0, + "grad_norm": 1.6932132720943704, + "language_loss": 0.6033656, + "learning_rate": 4.756508837426842e-10, + "loss": 0.62443841, + "num_input_tokens_seen": 356633560, + "step": 16520, + "time_per_iteration": 2.7310233116149902 + }, + { + "auxiliary_loss_clip": 0.01078536, + "auxiliary_loss_mlp": 0.01034414, + "balance_loss_clip": 1.03465533, + "balance_loss_mlp": 1.021927, + "epoch": 0.9932962573275215, + "flos": 36064906727040.0, + "grad_norm": 1.7244534802172446, + "language_loss": 0.62099916, + "learning_rate": 4.671953657853223e-10, + "loss": 0.64212871, + "num_input_tokens_seen": 356657600, + "step": 16521, + "time_per_iteration": 2.883345603942871 + }, + { + "auxiliary_loss_clip": 0.01087845, + "auxiliary_loss_mlp": 0.01036694, + "balance_loss_clip": 1.03922451, + "balance_loss_mlp": 1.02330661, + "epoch": 0.9933563805801894, + "flos": 21470523546240.0, + "grad_norm": 4.353373904102065, + "language_loss": 0.74153936, + "learning_rate": 4.5881567031225145e-10, + "loss": 0.76278472, + "num_input_tokens_seen": 356675880, + "step": 16522, + "time_per_iteration": 2.718522071838379 + }, + { + "auxiliary_loss_clip": 0.0107243, + "auxiliary_loss_mlp": 0.01030252, + "balance_loss_clip": 1.03389478, + "balance_loss_mlp": 1.01866508, + "epoch": 0.9934165038328574, + "flos": 23986532626560.0, + "grad_norm": 1.5310659871247791, + "language_loss": 0.73152745, + "learning_rate": 4.5051179764143964e-10, + "loss": 0.75255424, + "num_input_tokens_seen": 356696000, + "step": 16523, + "time_per_iteration": 2.7667906284332275 + }, + { + "auxiliary_loss_clip": 0.0108301, + "auxiliary_loss_mlp": 0.00769257, + "balance_loss_clip": 1.03243899, + "balance_loss_mlp": 1.00031519, + "epoch": 0.9934766270855253, + "flos": 21907807718400.0, + "grad_norm": 1.6911603974446854, + "language_loss": 0.71271038, + "learning_rate": 4.422837480875241e-10, + "loss": 0.73123306, + "num_input_tokens_seen": 356716845, + "step": 16524, + "time_per_iteration": 2.716357707977295 + }, + { + "auxiliary_loss_clip": 0.0107654, + "auxiliary_loss_mlp": 0.01030893, + "balance_loss_clip": 1.0359261, + "balance_loss_mlp": 1.01835251, + "epoch": 0.9935367503381933, + "flos": 17129139160320.0, + "grad_norm": 2.2457086362374863, + "language_loss": 0.79875743, + "learning_rate": 4.341315219624775e-10, + "loss": 0.81983173, + "num_input_tokens_seen": 356732100, + "step": 16525, + "time_per_iteration": 2.7416329383850098 + }, + { + "auxiliary_loss_clip": 0.0106301, + "auxiliary_loss_mlp": 0.01026465, + "balance_loss_clip": 1.03451014, + "balance_loss_mlp": 1.01388836, + "epoch": 0.9935968735908612, + "flos": 22346241125760.0, + "grad_norm": 2.0057410904081165, + "language_loss": 0.75025058, + "learning_rate": 4.2605511957582995e-10, + "loss": 0.77114534, + "num_input_tokens_seen": 356751480, + "step": 16526, + "time_per_iteration": 2.772752046585083 + }, + { + "auxiliary_loss_clip": 0.01103657, + "auxiliary_loss_mlp": 0.00769996, + "balance_loss_clip": 1.03466129, + "balance_loss_mlp": 1.0002234, + "epoch": 0.9936569968435293, + "flos": 29460539640960.0, + "grad_norm": 2.362336998601464, + "language_loss": 0.72371536, + "learning_rate": 4.180545412333369e-10, + "loss": 0.74245191, + "num_input_tokens_seen": 356772650, + "step": 16527, + "time_per_iteration": 2.622760057449341 + }, + { + "auxiliary_loss_clip": 0.01088795, + "auxiliary_loss_mlp": 0.01031074, + "balance_loss_clip": 1.03722143, + "balance_loss_mlp": 1.01860452, + "epoch": 0.9937171200961972, + "flos": 16544046522240.0, + "grad_norm": 2.2246858984054185, + "language_loss": 0.75991976, + "learning_rate": 4.1012978723875547e-10, + "loss": 0.78111851, + "num_input_tokens_seen": 356788510, + "step": 16528, + "time_per_iteration": 2.6447527408599854 + }, + { + "auxiliary_loss_clip": 0.01089717, + "auxiliary_loss_mlp": 0.01029048, + "balance_loss_clip": 1.03432751, + "balance_loss_mlp": 1.01511216, + "epoch": 0.9937772433488652, + "flos": 24390276474240.0, + "grad_norm": 2.1876724581944504, + "language_loss": 0.6753338, + "learning_rate": 4.022808578922898e-10, + "loss": 0.6965214, + "num_input_tokens_seen": 356809115, + "step": 16529, + "time_per_iteration": 2.7714054584503174 + }, + { + "auxiliary_loss_clip": 0.01103653, + "auxiliary_loss_mlp": 0.01036146, + "balance_loss_clip": 1.03892541, + "balance_loss_mlp": 1.02186477, + "epoch": 0.9938373666015331, + "flos": 15669909141120.0, + "grad_norm": 2.3036099926169116, + "language_loss": 0.65350854, + "learning_rate": 3.9450775349170186e-10, + "loss": 0.67490655, + "num_input_tokens_seen": 356826410, + "step": 16530, + "time_per_iteration": 2.6250078678131104 + }, + { + "auxiliary_loss_clip": 0.01093807, + "auxiliary_loss_mlp": 0.01032892, + "balance_loss_clip": 1.03568208, + "balance_loss_mlp": 1.02088773, + "epoch": 0.9938974898542011, + "flos": 19496190539520.0, + "grad_norm": 3.0743920406722274, + "language_loss": 0.71364164, + "learning_rate": 3.8681047433186676e-10, + "loss": 0.7349087, + "num_input_tokens_seen": 356844990, + "step": 16531, + "time_per_iteration": 2.574047803878784 + }, + { + "auxiliary_loss_clip": 0.01094022, + "auxiliary_loss_mlp": 0.01034706, + "balance_loss_clip": 1.03513575, + "balance_loss_mlp": 1.02152801, + "epoch": 0.993957613106869, + "flos": 26906896085760.0, + "grad_norm": 1.3526587285658505, + "language_loss": 0.74083483, + "learning_rate": 3.791890207045512e-10, + "loss": 0.76212215, + "num_input_tokens_seen": 356866530, + "step": 16532, + "time_per_iteration": 2.6634178161621094 + }, + { + "auxiliary_loss_clip": 0.01051179, + "auxiliary_loss_mlp": 0.01032159, + "balance_loss_clip": 1.03274739, + "balance_loss_mlp": 1.02109611, + "epoch": 0.994017736359537, + "flos": 14939593816320.0, + "grad_norm": 1.6154702582280394, + "language_loss": 0.70493329, + "learning_rate": 3.7164339289885717e-10, + "loss": 0.7257666, + "num_input_tokens_seen": 356884660, + "step": 16533, + "time_per_iteration": 2.721863031387329 + }, + { + "auxiliary_loss_clip": 0.01097407, + "auxiliary_loss_mlp": 0.01029682, + "balance_loss_clip": 1.03669178, + "balance_loss_mlp": 1.01622939, + "epoch": 0.9940778596122051, + "flos": 15377883569280.0, + "grad_norm": 3.6789979959756676, + "language_loss": 0.84027219, + "learning_rate": 3.641735912007782e-10, + "loss": 0.86154306, + "num_input_tokens_seen": 356900895, + "step": 16534, + "time_per_iteration": 2.619920492172241 + }, + { + "auxiliary_loss_clip": 0.01067064, + "auxiliary_loss_mlp": 0.01026895, + "balance_loss_clip": 1.03194451, + "balance_loss_mlp": 1.01531422, + "epoch": 0.994137982864873, + "flos": 25228108183680.0, + "grad_norm": 1.8397563877980199, + "language_loss": 0.65771168, + "learning_rate": 3.567796158934211e-10, + "loss": 0.67865133, + "num_input_tokens_seen": 356920985, + "step": 16535, + "time_per_iteration": 2.744962692260742 + }, + { + "auxiliary_loss_clip": 0.01070974, + "auxiliary_loss_mlp": 0.01028223, + "balance_loss_clip": 1.03729725, + "balance_loss_mlp": 1.01723814, + "epoch": 0.994198106117541, + "flos": 18442140912000.0, + "grad_norm": 2.0070166211015517, + "language_loss": 0.64754289, + "learning_rate": 3.4946146725767235e-10, + "loss": 0.66853487, + "num_input_tokens_seen": 356939800, + "step": 16536, + "time_per_iteration": 2.706944465637207 + }, + { + "auxiliary_loss_clip": 0.01060285, + "auxiliary_loss_mlp": 0.01035363, + "balance_loss_clip": 1.03116417, + "balance_loss_mlp": 1.02181518, + "epoch": 0.9942582293702089, + "flos": 16654112772480.0, + "grad_norm": 1.840427715417003, + "language_loss": 0.78430796, + "learning_rate": 3.4221914557064357e-10, + "loss": 0.80526441, + "num_input_tokens_seen": 356957780, + "step": 16537, + "time_per_iteration": 2.7647006511688232 + }, + { + "auxiliary_loss_clip": 0.01105131, + "auxiliary_loss_mlp": 0.01034471, + "balance_loss_clip": 1.03843594, + "balance_loss_mlp": 1.02052915, + "epoch": 0.9943183526228769, + "flos": 21944580266880.0, + "grad_norm": 1.5935823863373109, + "language_loss": 0.68781149, + "learning_rate": 3.35052651107004e-10, + "loss": 0.70920742, + "num_input_tokens_seen": 356979185, + "step": 16538, + "time_per_iteration": 2.7235569953918457 + }, + { + "auxiliary_loss_clip": 0.01063974, + "auxiliary_loss_mlp": 0.01035944, + "balance_loss_clip": 1.02961493, + "balance_loss_mlp": 1.02304578, + "epoch": 0.9943784758755448, + "flos": 23842566915840.0, + "grad_norm": 1.8915805101103145, + "language_loss": 0.75187773, + "learning_rate": 3.2796198413853614e-10, + "loss": 0.77287686, + "num_input_tokens_seen": 356997735, + "step": 16539, + "time_per_iteration": 2.8062071800231934 + }, + { + "auxiliary_loss_clip": 0.01060765, + "auxiliary_loss_mlp": 0.01033233, + "balance_loss_clip": 1.03562832, + "balance_loss_mlp": 1.02050102, + "epoch": 0.9944385991282129, + "flos": 21469984842240.0, + "grad_norm": 2.030639619740989, + "language_loss": 0.70239884, + "learning_rate": 3.209471449341361e-10, + "loss": 0.72333884, + "num_input_tokens_seen": 357015660, + "step": 16540, + "time_per_iteration": 2.8070261478424072 + }, + { + "auxiliary_loss_clip": 0.01093159, + "auxiliary_loss_mlp": 0.01027717, + "balance_loss_clip": 1.03431797, + "balance_loss_mlp": 1.01676154, + "epoch": 0.9944987223808808, + "flos": 22927024131840.0, + "grad_norm": 1.9807950756120538, + "language_loss": 0.75202429, + "learning_rate": 3.140081337600353e-10, + "loss": 0.77323306, + "num_input_tokens_seen": 357034800, + "step": 16541, + "time_per_iteration": 5.754985570907593 + }, + { + "auxiliary_loss_clip": 0.01080974, + "auxiliary_loss_mlp": 0.01036156, + "balance_loss_clip": 1.0349412, + "balance_loss_mlp": 1.02397323, + "epoch": 0.9945588456335488, + "flos": 22383013674240.0, + "grad_norm": 1.7422746830873381, + "language_loss": 0.76555264, + "learning_rate": 3.0714495087891255e-10, + "loss": 0.78672391, + "num_input_tokens_seen": 357053785, + "step": 16542, + "time_per_iteration": 2.708519458770752 + }, + { + "auxiliary_loss_clip": 0.01099205, + "auxiliary_loss_mlp": 0.01030947, + "balance_loss_clip": 1.03627014, + "balance_loss_mlp": 1.01776206, + "epoch": 0.9946189688862167, + "flos": 21397517153280.0, + "grad_norm": 2.4054715264061435, + "language_loss": 0.74274677, + "learning_rate": 3.0035759655122615e-10, + "loss": 0.76404828, + "num_input_tokens_seen": 357072025, + "step": 16543, + "time_per_iteration": 5.371897459030151 + }, + { + "auxiliary_loss_clip": 0.01094886, + "auxiliary_loss_mlp": 0.01036182, + "balance_loss_clip": 1.03557873, + "balance_loss_mlp": 1.02270508, + "epoch": 0.9946790921388847, + "flos": 12416545670400.0, + "grad_norm": 6.4447794910959235, + "language_loss": 0.82093954, + "learning_rate": 2.9364607103454785e-10, + "loss": 0.84225017, + "num_input_tokens_seen": 357086960, + "step": 16544, + "time_per_iteration": 2.648569107055664 + }, + { + "auxiliary_loss_clip": 0.0110726, + "auxiliary_loss_mlp": 0.01027758, + "balance_loss_clip": 1.0360719, + "balance_loss_mlp": 1.01605737, + "epoch": 0.9947392153915526, + "flos": 19058295836160.0, + "grad_norm": 1.8960221821622298, + "language_loss": 0.78761363, + "learning_rate": 2.870103745831187e-10, + "loss": 0.80896378, + "num_input_tokens_seen": 357105095, + "step": 16545, + "time_per_iteration": 2.6322624683380127 + }, + { + "auxiliary_loss_clip": 0.01078686, + "auxiliary_loss_mlp": 0.01030665, + "balance_loss_clip": 1.03411245, + "balance_loss_mlp": 1.01840401, + "epoch": 0.9947993386442207, + "flos": 27308808339840.0, + "grad_norm": 1.8256767545594197, + "language_loss": 0.72650462, + "learning_rate": 2.8045050744873733e-10, + "loss": 0.74759817, + "num_input_tokens_seen": 357125065, + "step": 16546, + "time_per_iteration": 4.327521562576294 + }, + { + "auxiliary_loss_clip": 0.0109393, + "auxiliary_loss_mlp": 0.01034353, + "balance_loss_clip": 1.03408468, + "balance_loss_mlp": 1.02212179, + "epoch": 0.9948594618968887, + "flos": 20806498771200.0, + "grad_norm": 2.038631565735454, + "language_loss": 0.77378041, + "learning_rate": 2.739664698798716e-10, + "loss": 0.79506326, + "num_input_tokens_seen": 357141600, + "step": 16547, + "time_per_iteration": 2.6839520931243896 + }, + { + "auxiliary_loss_clip": 0.01085655, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.03360105, + "balance_loss_mlp": 1.01823926, + "epoch": 0.9949195851495566, + "flos": 23292953936640.0, + "grad_norm": 2.7868363629317097, + "language_loss": 0.70053595, + "learning_rate": 2.67558262122769e-10, + "loss": 0.72169393, + "num_input_tokens_seen": 357157880, + "step": 16548, + "time_per_iteration": 2.6629064083099365 + }, + { + "auxiliary_loss_clip": 0.0109367, + "auxiliary_loss_mlp": 0.01034955, + "balance_loss_clip": 1.03438258, + "balance_loss_mlp": 1.02264059, + "epoch": 0.9949797084022246, + "flos": 18515470527360.0, + "grad_norm": 1.8066834079511649, + "language_loss": 0.75463164, + "learning_rate": 2.6122588442012427e-10, + "loss": 0.77591789, + "num_input_tokens_seen": 357176705, + "step": 16549, + "time_per_iteration": 2.6749610900878906 + }, + { + "auxiliary_loss_clip": 0.01080946, + "auxiliary_loss_mlp": 0.01034852, + "balance_loss_clip": 1.03683913, + "balance_loss_mlp": 1.02162528, + "epoch": 0.9950398316548925, + "flos": 30407719328640.0, + "grad_norm": 1.591136058001085, + "language_loss": 0.74426466, + "learning_rate": 2.5496933701241177e-10, + "loss": 0.7654227, + "num_input_tokens_seen": 357197630, + "step": 16550, + "time_per_iteration": 2.8009731769561768 + }, + { + "auxiliary_loss_clip": 0.01058637, + "auxiliary_loss_mlp": 0.00770717, + "balance_loss_clip": 1.03213239, + "balance_loss_mlp": 1.00024307, + "epoch": 0.9950999549075605, + "flos": 19900868140800.0, + "grad_norm": 1.804349453887292, + "language_loss": 0.78024846, + "learning_rate": 2.4878862013655297e-10, + "loss": 0.79854202, + "num_input_tokens_seen": 357215445, + "step": 16551, + "time_per_iteration": 2.7871713638305664 + }, + { + "auxiliary_loss_clip": 0.01090386, + "auxiliary_loss_mlp": 0.01032903, + "balance_loss_clip": 1.03510332, + "balance_loss_mlp": 1.02215683, + "epoch": 0.9951600781602284, + "flos": 17603555016960.0, + "grad_norm": 1.3788671688466283, + "language_loss": 0.66577691, + "learning_rate": 2.426837340270271e-10, + "loss": 0.68700981, + "num_input_tokens_seen": 357234285, + "step": 16552, + "time_per_iteration": 2.7981386184692383 + }, + { + "auxiliary_loss_clip": 0.01108432, + "auxiliary_loss_mlp": 0.010277, + "balance_loss_clip": 1.03545749, + "balance_loss_mlp": 1.01527882, + "epoch": 0.9952202014128965, + "flos": 28950715952640.0, + "grad_norm": 1.414763440540152, + "language_loss": 0.81504261, + "learning_rate": 2.3665467891520465e-10, + "loss": 0.83640391, + "num_input_tokens_seen": 357257565, + "step": 16553, + "time_per_iteration": 2.7514050006866455 + }, + { + "auxiliary_loss_clip": 0.01016193, + "auxiliary_loss_mlp": 0.01001561, + "balance_loss_clip": 1.00488806, + "balance_loss_mlp": 1.00064945, + "epoch": 0.9952803246655644, + "flos": 70810386145920.0, + "grad_norm": 0.7163424076202736, + "language_loss": 0.57331538, + "learning_rate": 2.3070145503001348e-10, + "loss": 0.59349293, + "num_input_tokens_seen": 357320205, + "step": 16554, + "time_per_iteration": 3.343486785888672 + }, + { + "auxiliary_loss_clip": 0.01092483, + "auxiliary_loss_mlp": 0.01037848, + "balance_loss_clip": 1.03661346, + "balance_loss_mlp": 1.02556348, + "epoch": 0.9953404479182324, + "flos": 21799070271360.0, + "grad_norm": 1.572030875373758, + "language_loss": 0.77075458, + "learning_rate": 2.24824062597051e-10, + "loss": 0.79205793, + "num_input_tokens_seen": 357340695, + "step": 16555, + "time_per_iteration": 2.6856164932250977 + }, + { + "auxiliary_loss_clip": 0.01077447, + "auxiliary_loss_mlp": 0.01031566, + "balance_loss_clip": 1.03370774, + "balance_loss_mlp": 1.01910233, + "epoch": 0.9954005711709003, + "flos": 21937397546880.0, + "grad_norm": 2.812435043549039, + "language_loss": 0.86056131, + "learning_rate": 2.1902250183902793e-10, + "loss": 0.8816514, + "num_input_tokens_seen": 357357505, + "step": 16556, + "time_per_iteration": 2.8494834899902344 + }, + { + "auxiliary_loss_clip": 0.01062031, + "auxiliary_loss_mlp": 0.01032977, + "balance_loss_clip": 1.03475928, + "balance_loss_mlp": 1.02018583, + "epoch": 0.9954606944235683, + "flos": 19354559212800.0, + "grad_norm": 1.7341010844964493, + "language_loss": 0.73350233, + "learning_rate": 2.132967729762125e-10, + "loss": 0.75445241, + "num_input_tokens_seen": 357375395, + "step": 16557, + "time_per_iteration": 2.776954412460327 + }, + { + "auxiliary_loss_clip": 0.01096785, + "auxiliary_loss_mlp": 0.01035737, + "balance_loss_clip": 1.03676844, + "balance_loss_mlp": 1.02380407, + "epoch": 0.9955208176762362, + "flos": 30518611591680.0, + "grad_norm": 1.8889126824734808, + "language_loss": 0.76071554, + "learning_rate": 2.0764687622554233e-10, + "loss": 0.78204083, + "num_input_tokens_seen": 357397375, + "step": 16558, + "time_per_iteration": 2.725471258163452 + }, + { + "auxiliary_loss_clip": 0.01082875, + "auxiliary_loss_mlp": 0.0103135, + "balance_loss_clip": 1.0333569, + "balance_loss_mlp": 1.01868999, + "epoch": 0.9955809409289043, + "flos": 30008249199360.0, + "grad_norm": 1.8788857895854807, + "language_loss": 0.6342541, + "learning_rate": 2.0207281180129044e-10, + "loss": 0.65539634, + "num_input_tokens_seen": 357418880, + "step": 16559, + "time_per_iteration": 2.754697322845459 + }, + { + "auxiliary_loss_clip": 0.01094664, + "auxiliary_loss_mlp": 0.01027311, + "balance_loss_clip": 1.03535438, + "balance_loss_mlp": 1.01506233, + "epoch": 0.9956410641815723, + "flos": 21543278544000.0, + "grad_norm": 2.2357683381447044, + "language_loss": 0.74527764, + "learning_rate": 1.965745799148433e-10, + "loss": 0.76649737, + "num_input_tokens_seen": 357438310, + "step": 16560, + "time_per_iteration": 2.675863265991211 + }, + { + "auxiliary_loss_clip": 0.01050704, + "auxiliary_loss_mlp": 0.01026972, + "balance_loss_clip": 1.03353262, + "balance_loss_mlp": 1.0149498, + "epoch": 0.9957011874342402, + "flos": 21689470897920.0, + "grad_norm": 1.7541279105695071, + "language_loss": 0.7902168, + "learning_rate": 1.9115218077470073e-10, + "loss": 0.81099355, + "num_input_tokens_seen": 357457155, + "step": 16561, + "time_per_iteration": 2.800518751144409 + }, + { + "auxiliary_loss_clip": 0.01105362, + "auxiliary_loss_mlp": 0.01030097, + "balance_loss_clip": 1.03662086, + "balance_loss_mlp": 1.01839638, + "epoch": 0.9957613106869082, + "flos": 17702667619200.0, + "grad_norm": 2.712205364252532, + "language_loss": 0.65797567, + "learning_rate": 1.8580561458647614e-10, + "loss": 0.67933023, + "num_input_tokens_seen": 357468060, + "step": 16562, + "time_per_iteration": 2.6822054386138916 + }, + { + "auxiliary_loss_clip": 0.01086196, + "auxiliary_loss_mlp": 0.00770624, + "balance_loss_clip": 1.03828645, + "balance_loss_mlp": 1.00018549, + "epoch": 0.9958214339395761, + "flos": 30555994671360.0, + "grad_norm": 3.146501927176202, + "language_loss": 0.6437341, + "learning_rate": 1.805348815528962e-10, + "loss": 0.66230226, + "num_input_tokens_seen": 357489665, + "step": 16563, + "time_per_iteration": 2.7867605686187744 + }, + { + "auxiliary_loss_clip": 0.01085894, + "auxiliary_loss_mlp": 0.01032654, + "balance_loss_clip": 1.03683317, + "balance_loss_mlp": 1.01987505, + "epoch": 0.9958815571922441, + "flos": 24169174306560.0, + "grad_norm": 2.104507608134006, + "language_loss": 0.64749634, + "learning_rate": 1.7533998187380105e-10, + "loss": 0.66868186, + "num_input_tokens_seen": 357511975, + "step": 16564, + "time_per_iteration": 2.7374000549316406 + }, + { + "auxiliary_loss_clip": 0.010846, + "auxiliary_loss_mlp": 0.00769579, + "balance_loss_clip": 1.03644037, + "balance_loss_mlp": 1.00024891, + "epoch": 0.995941680444912, + "flos": 15487016065920.0, + "grad_norm": 2.0341967172049857, + "language_loss": 0.7462337, + "learning_rate": 1.7022091574636633e-10, + "loss": 0.76477551, + "num_input_tokens_seen": 357529345, + "step": 16565, + "time_per_iteration": 2.6312754154205322 + }, + { + "auxiliary_loss_clip": 0.01087362, + "auxiliary_loss_mlp": 0.01027615, + "balance_loss_clip": 1.03376865, + "balance_loss_mlp": 1.0157181, + "epoch": 0.9960018036975801, + "flos": 18621227145600.0, + "grad_norm": 1.7027522514634321, + "language_loss": 0.79018843, + "learning_rate": 1.6517768336443694e-10, + "loss": 0.81133819, + "num_input_tokens_seen": 357547615, + "step": 16566, + "time_per_iteration": 2.6870059967041016 + }, + { + "auxiliary_loss_clip": 0.01056958, + "auxiliary_loss_mlp": 0.00769517, + "balance_loss_clip": 1.03390598, + "balance_loss_mlp": 1.0001384, + "epoch": 0.996061926950248, + "flos": 20084120352000.0, + "grad_norm": 1.7390992367091276, + "language_loss": 0.70729011, + "learning_rate": 1.6021028491941535e-10, + "loss": 0.72555488, + "num_input_tokens_seen": 357567380, + "step": 16567, + "time_per_iteration": 2.7366580963134766 + }, + { + "auxiliary_loss_clip": 0.01097619, + "auxiliary_loss_mlp": 0.01032972, + "balance_loss_clip": 1.03566027, + "balance_loss_mlp": 1.01965046, + "epoch": 0.996122050202916, + "flos": 24347829576960.0, + "grad_norm": 2.9373159802346076, + "language_loss": 0.79025483, + "learning_rate": 1.5531872059959538e-10, + "loss": 0.81156075, + "num_input_tokens_seen": 357586435, + "step": 16568, + "time_per_iteration": 2.6557395458221436 + }, + { + "auxiliary_loss_clip": 0.01093714, + "auxiliary_loss_mlp": 0.01028241, + "balance_loss_clip": 1.03664947, + "balance_loss_mlp": 1.0173099, + "epoch": 0.9961821734555839, + "flos": 24199302839040.0, + "grad_norm": 1.7081910845577881, + "language_loss": 0.81825495, + "learning_rate": 1.5050299059060634e-10, + "loss": 0.83947456, + "num_input_tokens_seen": 357604720, + "step": 16569, + "time_per_iteration": 2.750368118286133 + }, + { + "auxiliary_loss_clip": 0.0106979, + "auxiliary_loss_mlp": 0.00770531, + "balance_loss_clip": 1.03594494, + "balance_loss_mlp": 1.00018477, + "epoch": 0.9962422967082519, + "flos": 22633741584000.0, + "grad_norm": 1.812159782234162, + "language_loss": 0.7033971, + "learning_rate": 1.457630950747468e-10, + "loss": 0.72180027, + "num_input_tokens_seen": 357622345, + "step": 16570, + "time_per_iteration": 2.6845390796661377 + }, + { + "auxiliary_loss_clip": 0.01079783, + "auxiliary_loss_mlp": 0.01026272, + "balance_loss_clip": 1.03678036, + "balance_loss_mlp": 1.01413023, + "epoch": 0.9963024199609198, + "flos": 26396030903040.0, + "grad_norm": 1.5778392939659474, + "language_loss": 0.75031984, + "learning_rate": 1.4109903423209502e-10, + "loss": 0.77138042, + "num_input_tokens_seen": 357642710, + "step": 16571, + "time_per_iteration": 2.6998531818389893 + }, + { + "auxiliary_loss_clip": 0.01085876, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.03418159, + "balance_loss_mlp": 1.01976252, + "epoch": 0.9963625432135879, + "flos": 16581537342720.0, + "grad_norm": 2.056267788643989, + "language_loss": 0.79312503, + "learning_rate": 1.3651080823939843e-10, + "loss": 0.81430686, + "num_input_tokens_seen": 357659870, + "step": 16572, + "time_per_iteration": 2.6602699756622314 + }, + { + "auxiliary_loss_clip": 0.01083413, + "auxiliary_loss_mlp": 0.01031239, + "balance_loss_clip": 1.0354054, + "balance_loss_mlp": 1.01907969, + "epoch": 0.9964226664662559, + "flos": 26468534505600.0, + "grad_norm": 1.9072175246303182, + "language_loss": 0.7072866, + "learning_rate": 1.3199841727074e-10, + "loss": 0.72843313, + "num_input_tokens_seen": 357677075, + "step": 16573, + "time_per_iteration": 2.7399983406066895 + }, + { + "auxiliary_loss_clip": 0.01085736, + "auxiliary_loss_mlp": 0.01031859, + "balance_loss_clip": 1.03562653, + "balance_loss_mlp": 1.01902056, + "epoch": 0.9964827897189238, + "flos": 27448320764160.0, + "grad_norm": 17.706098733972073, + "language_loss": 0.63426065, + "learning_rate": 1.275618614968721e-10, + "loss": 0.65543658, + "num_input_tokens_seen": 357696715, + "step": 16574, + "time_per_iteration": 2.7760708332061768 + }, + { + "auxiliary_loss_clip": 0.01079151, + "auxiliary_loss_mlp": 0.01032112, + "balance_loss_clip": 1.03830504, + "balance_loss_mlp": 1.01859987, + "epoch": 0.9965429129715918, + "flos": 11721566350080.0, + "grad_norm": 2.269343954820431, + "language_loss": 0.76514804, + "learning_rate": 1.2320114108654856e-10, + "loss": 0.78626072, + "num_input_tokens_seen": 357712345, + "step": 16575, + "time_per_iteration": 2.670433759689331 + }, + { + "auxiliary_loss_clip": 0.01086412, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.03638375, + "balance_loss_mlp": 1.01757431, + "epoch": 0.9966030362242597, + "flos": 19756004590080.0, + "grad_norm": 1.890239065955032, + "language_loss": 0.70341682, + "learning_rate": 1.1891625620474855e-10, + "loss": 0.72458476, + "num_input_tokens_seen": 357731815, + "step": 16576, + "time_per_iteration": 2.7393879890441895 + }, + { + "auxiliary_loss_clip": 0.0109524, + "auxiliary_loss_mlp": 0.01024289, + "balance_loss_clip": 1.03612185, + "balance_loss_mlp": 1.01186752, + "epoch": 0.9966631594769277, + "flos": 23915178259200.0, + "grad_norm": 1.5127574576312723, + "language_loss": 0.71783984, + "learning_rate": 1.1470720701400871e-10, + "loss": 0.73903513, + "num_input_tokens_seen": 357751640, + "step": 16577, + "time_per_iteration": 2.6747822761535645 + }, + { + "auxiliary_loss_clip": 0.0108308, + "auxiliary_loss_mlp": 0.01034487, + "balance_loss_clip": 1.03563082, + "balance_loss_mlp": 1.02241135, + "epoch": 0.9967232827295956, + "flos": 15559591495680.0, + "grad_norm": 1.8839463168037793, + "language_loss": 0.78829128, + "learning_rate": 1.1057399367397912e-10, + "loss": 0.80946696, + "num_input_tokens_seen": 357769850, + "step": 16578, + "time_per_iteration": 2.6458945274353027 + }, + { + "auxiliary_loss_clip": 0.01069592, + "auxiliary_loss_mlp": 0.0076966, + "balance_loss_clip": 1.0383426, + "balance_loss_mlp": 1.00028622, + "epoch": 0.9967834059822637, + "flos": 20813035046400.0, + "grad_norm": 1.6210978789721697, + "language_loss": 0.76015878, + "learning_rate": 1.0651661634142328e-10, + "loss": 0.77855128, + "num_input_tokens_seen": 357789550, + "step": 16579, + "time_per_iteration": 2.7179081439971924 + }, + { + "auxiliary_loss_clip": 0.01085459, + "auxiliary_loss_mlp": 0.01037581, + "balance_loss_clip": 1.03625321, + "balance_loss_mlp": 1.02271509, + "epoch": 0.9968435292349316, + "flos": 36719234830080.0, + "grad_norm": 2.1621427705186513, + "language_loss": 0.69284117, + "learning_rate": 1.0253507516999604e-10, + "loss": 0.71407157, + "num_input_tokens_seen": 357809525, + "step": 16580, + "time_per_iteration": 4.3343565464019775 + }, + { + "auxiliary_loss_clip": 0.0105428, + "auxiliary_loss_mlp": 0.01032231, + "balance_loss_clip": 1.03120196, + "balance_loss_mlp": 1.02024424, + "epoch": 0.9969036524875996, + "flos": 26760919213440.0, + "grad_norm": 1.9697975439158977, + "language_loss": 0.79967076, + "learning_rate": 9.862937031113184e-11, + "loss": 0.8205359, + "num_input_tokens_seen": 357829795, + "step": 16581, + "time_per_iteration": 4.272336483001709 + }, + { + "auxiliary_loss_clip": 0.01078953, + "auxiliary_loss_mlp": 0.01027322, + "balance_loss_clip": 1.03649044, + "balance_loss_mlp": 1.01607418, + "epoch": 0.9969637757402675, + "flos": 24827237424000.0, + "grad_norm": 1.8090343516567968, + "language_loss": 0.80200183, + "learning_rate": 9.479950191249031e-11, + "loss": 0.82306457, + "num_input_tokens_seen": 357851655, + "step": 16582, + "time_per_iteration": 4.770942449569702 + }, + { + "auxiliary_loss_clip": 0.01092857, + "auxiliary_loss_mlp": 0.01033363, + "balance_loss_clip": 1.03387117, + "balance_loss_mlp": 1.02106702, + "epoch": 0.9970238989929355, + "flos": 23038742407680.0, + "grad_norm": 1.6176460264436903, + "language_loss": 0.60509884, + "learning_rate": 9.104547011951069e-11, + "loss": 0.62636101, + "num_input_tokens_seen": 357871205, + "step": 16583, + "time_per_iteration": 2.670657157897949 + }, + { + "auxiliary_loss_clip": 0.01088101, + "auxiliary_loss_mlp": 0.01037237, + "balance_loss_clip": 1.03633022, + "balance_loss_mlp": 1.0250237, + "epoch": 0.9970840222456034, + "flos": 25298816106240.0, + "grad_norm": 1.6127986377425965, + "language_loss": 0.77779889, + "learning_rate": 8.736727507452357e-11, + "loss": 0.79905224, + "num_input_tokens_seen": 357892145, + "step": 16584, + "time_per_iteration": 2.6968681812286377 + }, + { + "auxiliary_loss_clip": 0.01081813, + "auxiliary_loss_mlp": 0.01030965, + "balance_loss_clip": 1.03400755, + "balance_loss_mlp": 1.01991463, + "epoch": 0.9971441454982715, + "flos": 21615602578560.0, + "grad_norm": 1.5491233705844139, + "language_loss": 0.69406962, + "learning_rate": 8.376491691697297e-11, + "loss": 0.71519732, + "num_input_tokens_seen": 357911205, + "step": 16585, + "time_per_iteration": 4.212535381317139 + }, + { + "auxiliary_loss_clip": 0.0110602, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.03605747, + "balance_loss_mlp": 1.02094698, + "epoch": 0.9972042687509394, + "flos": 14975612179200.0, + "grad_norm": 2.688083222566017, + "language_loss": 0.82222629, + "learning_rate": 8.023839578363834e-11, + "loss": 0.84362036, + "num_input_tokens_seen": 357928190, + "step": 16586, + "time_per_iteration": 2.5343804359436035 + }, + { + "auxiliary_loss_clip": 0.01084137, + "auxiliary_loss_mlp": 0.01038457, + "balance_loss_clip": 1.03290653, + "balance_loss_mlp": 1.02660799, + "epoch": 0.9972643920036074, + "flos": 25806664546560.0, + "grad_norm": 2.102200677561442, + "language_loss": 0.7796334, + "learning_rate": 7.678771180796851e-11, + "loss": 0.80085933, + "num_input_tokens_seen": 357946985, + "step": 16587, + "time_per_iteration": 2.653956174850464 + }, + { + "auxiliary_loss_clip": 0.01083114, + "auxiliary_loss_mlp": 0.01036983, + "balance_loss_clip": 1.03732991, + "balance_loss_mlp": 1.02448964, + "epoch": 0.9973245152562754, + "flos": 23326242865920.0, + "grad_norm": 5.715123647273254, + "language_loss": 0.73174369, + "learning_rate": 7.341286512074773e-11, + "loss": 0.75294471, + "num_input_tokens_seen": 357966720, + "step": 16588, + "time_per_iteration": 2.5937352180480957 + }, + { + "auxiliary_loss_clip": 0.01112154, + "auxiliary_loss_mlp": 0.01029109, + "balance_loss_clip": 1.03663898, + "balance_loss_mlp": 1.01646113, + "epoch": 0.9973846385089433, + "flos": 12166212810240.0, + "grad_norm": 5.411177211250548, + "language_loss": 0.82386965, + "learning_rate": 7.011385585031781e-11, + "loss": 0.84528232, + "num_input_tokens_seen": 357981375, + "step": 16589, + "time_per_iteration": 2.5262768268585205 + }, + { + "auxiliary_loss_clip": 0.01100757, + "auxiliary_loss_mlp": 0.0103796, + "balance_loss_clip": 1.03564775, + "balance_loss_mlp": 1.02382755, + "epoch": 0.9974447617616113, + "flos": 20045157073920.0, + "grad_norm": 4.308142641596885, + "language_loss": 0.70464408, + "learning_rate": 6.689068412168986e-11, + "loss": 0.72603118, + "num_input_tokens_seen": 358000290, + "step": 16590, + "time_per_iteration": 2.5830941200256348 + }, + { + "auxiliary_loss_clip": 0.01086738, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.03551257, + "balance_loss_mlp": 1.01895654, + "epoch": 0.9975048850142793, + "flos": 32014614159360.0, + "grad_norm": 4.864987201646496, + "language_loss": 0.63802195, + "learning_rate": 6.374335005676634e-11, + "loss": 0.65920961, + "num_input_tokens_seen": 358022075, + "step": 16591, + "time_per_iteration": 2.68571400642395 + }, + { + "auxiliary_loss_clip": 0.0108584, + "auxiliary_loss_mlp": 0.01030286, + "balance_loss_clip": 1.03423333, + "balance_loss_mlp": 1.01809728, + "epoch": 0.9975650082669473, + "flos": 36933728895360.0, + "grad_norm": 2.6236190500257726, + "language_loss": 0.73096275, + "learning_rate": 6.067185377522933e-11, + "loss": 0.75212401, + "num_input_tokens_seen": 358043940, + "step": 16592, + "time_per_iteration": 2.7373883724212646 + }, + { + "auxiliary_loss_clip": 0.01087724, + "auxiliary_loss_mlp": 0.01032375, + "balance_loss_clip": 1.03737628, + "balance_loss_mlp": 1.01964951, + "epoch": 0.9976251315196152, + "flos": 16472117537280.0, + "grad_norm": 1.4821546433362938, + "language_loss": 0.85078406, + "learning_rate": 5.767619539343016e-11, + "loss": 0.87198508, + "num_input_tokens_seen": 358062720, + "step": 16593, + "time_per_iteration": 2.662369966506958 + }, + { + "auxiliary_loss_clip": 0.01104576, + "auxiliary_loss_mlp": 0.0076981, + "balance_loss_clip": 1.03564858, + "balance_loss_mlp": 1.00020099, + "epoch": 0.9976852547722832, + "flos": 19646836179840.0, + "grad_norm": 1.9769219864730705, + "language_loss": 0.6983223, + "learning_rate": 5.4756375024833656e-11, + "loss": 0.71706617, + "num_input_tokens_seen": 358081560, + "step": 16594, + "time_per_iteration": 2.5857043266296387 + }, + { + "auxiliary_loss_clip": 0.01069224, + "auxiliary_loss_mlp": 0.01026926, + "balance_loss_clip": 1.03892672, + "balance_loss_mlp": 1.01451635, + "epoch": 0.9977453780249511, + "flos": 20448434044800.0, + "grad_norm": 2.022789522013575, + "language_loss": 0.72606945, + "learning_rate": 5.1912392780462113e-11, + "loss": 0.74703097, + "num_input_tokens_seen": 358099065, + "step": 16595, + "time_per_iteration": 2.7689433097839355 + }, + { + "auxiliary_loss_clip": 0.01007096, + "auxiliary_loss_mlp": 0.01003373, + "balance_loss_clip": 1.00481117, + "balance_loss_mlp": 1.00250244, + "epoch": 0.9978055012776191, + "flos": 65455097581440.0, + "grad_norm": 0.7875629365454856, + "language_loss": 0.60383916, + "learning_rate": 4.9144248768007156e-11, + "loss": 0.62394392, + "num_input_tokens_seen": 358156095, + "step": 16596, + "time_per_iteration": 3.08450984954834 + }, + { + "auxiliary_loss_clip": 0.01096892, + "auxiliary_loss_mlp": 0.01029762, + "balance_loss_clip": 1.03594232, + "balance_loss_mlp": 1.01738787, + "epoch": 0.997865624530287, + "flos": 20631506688000.0, + "grad_norm": 2.0986961825985087, + "language_loss": 0.77297747, + "learning_rate": 4.645194309227385e-11, + "loss": 0.79424405, + "num_input_tokens_seen": 358175230, + "step": 16597, + "time_per_iteration": 2.6868622303009033 + }, + { + "auxiliary_loss_clip": 0.01097035, + "auxiliary_loss_mlp": 0.01030189, + "balance_loss_clip": 1.03486156, + "balance_loss_mlp": 1.01730847, + "epoch": 0.9979257477829551, + "flos": 29387102284800.0, + "grad_norm": 1.756861728101755, + "language_loss": 0.82217014, + "learning_rate": 4.383547585562475e-11, + "loss": 0.84344238, + "num_input_tokens_seen": 358197075, + "step": 16598, + "time_per_iteration": 2.7054567337036133 + }, + { + "auxiliary_loss_clip": 0.01081519, + "auxiliary_loss_mlp": 0.01044245, + "balance_loss_clip": 1.03558803, + "balance_loss_mlp": 1.03068531, + "epoch": 0.997985871035623, + "flos": 22635070387200.0, + "grad_norm": 2.4545106380847335, + "language_loss": 0.64762008, + "learning_rate": 4.129484715709175e-11, + "loss": 0.66887772, + "num_input_tokens_seen": 358215925, + "step": 16599, + "time_per_iteration": 2.6614456176757812 + }, + { + "auxiliary_loss_clip": 0.01010593, + "auxiliary_loss_mlp": 0.01000422, + "balance_loss_clip": 1.00784099, + "balance_loss_mlp": 0.9994688, + "epoch": 0.998045994288291, + "flos": 61806968663040.0, + "grad_norm": 0.9370474706148707, + "language_loss": 0.62274885, + "learning_rate": 3.8830057093264256e-11, + "loss": 0.64285898, + "num_input_tokens_seen": 358269035, + "step": 16600, + "time_per_iteration": 3.1614274978637695 + }, + { + "auxiliary_loss_clip": 0.01085288, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.03679216, + "balance_loss_mlp": 1.02095842, + "epoch": 0.998106117540959, + "flos": 19245534456960.0, + "grad_norm": 1.511083911813729, + "language_loss": 0.78393221, + "learning_rate": 3.644110575717896e-11, + "loss": 0.80510521, + "num_input_tokens_seen": 358287680, + "step": 16601, + "time_per_iteration": 2.772331953048706 + }, + { + "auxiliary_loss_clip": 0.01077732, + "auxiliary_loss_mlp": 0.01031133, + "balance_loss_clip": 1.03514004, + "balance_loss_mlp": 1.01892638, + "epoch": 0.9981662407936269, + "flos": 21106209853440.0, + "grad_norm": 2.513777021712519, + "language_loss": 0.82537293, + "learning_rate": 3.412799323987414e-11, + "loss": 0.84646153, + "num_input_tokens_seen": 358304080, + "step": 16602, + "time_per_iteration": 2.6796252727508545 + }, + { + "auxiliary_loss_clip": 0.01068281, + "auxiliary_loss_mlp": 0.01034598, + "balance_loss_clip": 1.03651309, + "balance_loss_mlp": 1.02249801, + "epoch": 0.998226364046295, + "flos": 24316839118080.0, + "grad_norm": 2.030453284598539, + "language_loss": 0.62777632, + "learning_rate": 3.189071962883538e-11, + "loss": 0.64880514, + "num_input_tokens_seen": 358323670, + "step": 16603, + "time_per_iteration": 2.693939447402954 + }, + { + "auxiliary_loss_clip": 0.01084524, + "auxiliary_loss_mlp": 0.01027967, + "balance_loss_clip": 1.03433418, + "balance_loss_mlp": 1.01537895, + "epoch": 0.9982864872989629, + "flos": 23836389776640.0, + "grad_norm": 1.7079397475017406, + "language_loss": 0.70913982, + "learning_rate": 2.972928500866168e-11, + "loss": 0.73026478, + "num_input_tokens_seen": 358341980, + "step": 16604, + "time_per_iteration": 2.8074941635131836 + }, + { + "auxiliary_loss_clip": 0.0110762, + "auxiliary_loss_mlp": 0.01027575, + "balance_loss_clip": 1.03609681, + "balance_loss_mlp": 1.01511717, + "epoch": 0.9983466105516309, + "flos": 18333116156160.0, + "grad_norm": 1.8321992225796084, + "language_loss": 0.64592469, + "learning_rate": 2.7643689461953613e-11, + "loss": 0.66727662, + "num_input_tokens_seen": 358360400, + "step": 16605, + "time_per_iteration": 2.559711456298828 + }, + { + "auxiliary_loss_clip": 0.01072745, + "auxiliary_loss_mlp": 0.0103157, + "balance_loss_clip": 1.03378582, + "balance_loss_mlp": 1.01944005, + "epoch": 0.9984067338042988, + "flos": 17236763285760.0, + "grad_norm": 1.7112583965701615, + "language_loss": 0.7144081, + "learning_rate": 2.5633933067092938e-11, + "loss": 0.73545122, + "num_input_tokens_seen": 358378990, + "step": 16606, + "time_per_iteration": 2.6522889137268066 + }, + { + "auxiliary_loss_clip": 0.0109612, + "auxiliary_loss_mlp": 0.00770001, + "balance_loss_clip": 1.03534591, + "balance_loss_mlp": 1.00014746, + "epoch": 0.9984668570569668, + "flos": 20667884186880.0, + "grad_norm": 1.989921171025738, + "language_loss": 0.82035434, + "learning_rate": 2.370001590090709e-11, + "loss": 0.8390156, + "num_input_tokens_seen": 358395970, + "step": 16607, + "time_per_iteration": 2.6804637908935547 + }, + { + "auxiliary_loss_clip": 0.0107541, + "auxiliary_loss_mlp": 0.01033615, + "balance_loss_clip": 1.03306758, + "balance_loss_mlp": 1.02051961, + "epoch": 0.9985269803096347, + "flos": 30262532555520.0, + "grad_norm": 1.6639542456977479, + "language_loss": 0.67119384, + "learning_rate": 2.184193803622669e-11, + "loss": 0.69228399, + "num_input_tokens_seen": 358417355, + "step": 16608, + "time_per_iteration": 2.906008005142212 + }, + { + "auxiliary_loss_clip": 0.01063208, + "auxiliary_loss_mlp": 0.0103334, + "balance_loss_clip": 1.03657353, + "balance_loss_mlp": 1.02062011, + "epoch": 0.9985871035623027, + "flos": 10560970005120.0, + "grad_norm": 1.8164676216631945, + "language_loss": 0.80704165, + "learning_rate": 2.0059699543883978e-11, + "loss": 0.82800716, + "num_input_tokens_seen": 358434345, + "step": 16609, + "time_per_iteration": 2.7321889400482178 + }, + { + "auxiliary_loss_clip": 0.01087746, + "auxiliary_loss_mlp": 0.01034825, + "balance_loss_clip": 1.03453326, + "balance_loss_mlp": 1.02246904, + "epoch": 0.9986472268149706, + "flos": 16873455173760.0, + "grad_norm": 1.4927424952025787, + "language_loss": 0.62772417, + "learning_rate": 1.8353300491158462e-11, + "loss": 0.64894992, + "num_input_tokens_seen": 358452870, + "step": 16610, + "time_per_iteration": 2.6517322063446045 + }, + { + "auxiliary_loss_clip": 0.01089605, + "auxiliary_loss_mlp": 0.01032987, + "balance_loss_clip": 1.03502405, + "balance_loss_mlp": 1.02091718, + "epoch": 0.9987073500676387, + "flos": 22054538776320.0, + "grad_norm": 2.237128509248557, + "language_loss": 0.67805243, + "learning_rate": 1.672274094288717e-11, + "loss": 0.69927835, + "num_input_tokens_seen": 358472210, + "step": 16611, + "time_per_iteration": 2.634993553161621 + }, + { + "auxiliary_loss_clip": 0.01066706, + "auxiliary_loss_mlp": 0.01038076, + "balance_loss_clip": 1.03627813, + "balance_loss_mlp": 1.02527332, + "epoch": 0.9987674733203066, + "flos": 30482880537600.0, + "grad_norm": 1.4582335875749615, + "language_loss": 0.69769359, + "learning_rate": 1.5168020961020544e-11, + "loss": 0.71874142, + "num_input_tokens_seen": 358493840, + "step": 16612, + "time_per_iteration": 2.8596408367156982 + }, + { + "auxiliary_loss_clip": 0.01083064, + "auxiliary_loss_mlp": 0.0103316, + "balance_loss_clip": 1.03643417, + "balance_loss_mlp": 1.02126336, + "epoch": 0.9988275965729746, + "flos": 27745230585600.0, + "grad_norm": 1.6439272991561156, + "language_loss": 0.73709273, + "learning_rate": 1.3689140604400407e-11, + "loss": 0.75825495, + "num_input_tokens_seen": 358515060, + "step": 16613, + "time_per_iteration": 2.7902584075927734 + }, + { + "auxiliary_loss_clip": 0.01071712, + "auxiliary_loss_mlp": 0.0077277, + "balance_loss_clip": 1.0345372, + "balance_loss_mlp": 1.00019884, + "epoch": 0.9988877198256426, + "flos": 17524191916800.0, + "grad_norm": 1.920035389313773, + "language_loss": 0.73619223, + "learning_rate": 1.2286099928981996e-11, + "loss": 0.754637, + "num_input_tokens_seen": 358528200, + "step": 16614, + "time_per_iteration": 2.6406190395355225 + }, + { + "auxiliary_loss_clip": 0.01094466, + "auxiliary_loss_mlp": 0.01032872, + "balance_loss_clip": 1.03571415, + "balance_loss_mlp": 1.02093267, + "epoch": 0.9989478430783105, + "flos": 20996502739200.0, + "grad_norm": 1.5665679331066227, + "language_loss": 0.722013, + "learning_rate": 1.0958898988278065e-11, + "loss": 0.74328637, + "num_input_tokens_seen": 358548360, + "step": 16615, + "time_per_iteration": 2.639946222305298 + }, + { + "auxiliary_loss_clip": 0.01112886, + "auxiliary_loss_mlp": 0.00770149, + "balance_loss_clip": 1.03912163, + "balance_loss_mlp": 1.00027168, + "epoch": 0.9990079663309785, + "flos": 13370620769280.0, + "grad_norm": 2.06456218997016, + "language_loss": 0.77498305, + "learning_rate": 9.70753783247069e-12, + "loss": 0.79381335, + "num_input_tokens_seen": 358566270, + "step": 16616, + "time_per_iteration": 2.703230619430542 + }, + { + "auxiliary_loss_clip": 0.01081698, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.03569651, + "balance_loss_mlp": 1.01701295, + "epoch": 0.9990680895836465, + "flos": 17310236555520.0, + "grad_norm": 2.2671180152095696, + "language_loss": 0.82647479, + "learning_rate": 8.532016508855378e-12, + "loss": 0.84758323, + "num_input_tokens_seen": 358584710, + "step": 16617, + "time_per_iteration": 2.6513431072235107 + }, + { + "auxiliary_loss_clip": 0.01086051, + "auxiliary_loss_mlp": 0.01027312, + "balance_loss_clip": 1.03479171, + "balance_loss_mlp": 1.01599932, + "epoch": 0.9991282128363145, + "flos": 24207993930240.0, + "grad_norm": 1.6210542380677302, + "language_loss": 0.78575474, + "learning_rate": 7.43233506206309e-12, + "loss": 0.8068884, + "num_input_tokens_seen": 358606750, + "step": 16618, + "time_per_iteration": 2.6931798458099365 + }, + { + "auxiliary_loss_clip": 0.01105935, + "auxiliary_loss_mlp": 0.01031572, + "balance_loss_clip": 1.03507876, + "balance_loss_mlp": 1.01963282, + "epoch": 0.9991883360889824, + "flos": 21175301664000.0, + "grad_norm": 1.7008832792892883, + "language_loss": 0.74742877, + "learning_rate": 6.408493534060255e-12, + "loss": 0.76880378, + "num_input_tokens_seen": 358624675, + "step": 16619, + "time_per_iteration": 4.155118942260742 + }, + { + "auxiliary_loss_clip": 0.01093229, + "auxiliary_loss_mlp": 0.01028121, + "balance_loss_clip": 1.03345323, + "balance_loss_mlp": 1.01702261, + "epoch": 0.9992484593416504, + "flos": 19901155449600.0, + "grad_norm": 2.899887344454389, + "language_loss": 0.8699075, + "learning_rate": 5.460491963260594e-12, + "loss": 0.89112103, + "num_input_tokens_seen": 358640715, + "step": 16620, + "time_per_iteration": 4.1041669845581055 + }, + { + "auxiliary_loss_clip": 0.01066897, + "auxiliary_loss_mlp": 0.01026513, + "balance_loss_clip": 1.02997065, + "balance_loss_mlp": 1.01463938, + "epoch": 0.9993085825943183, + "flos": 24857832833280.0, + "grad_norm": 2.059624292912411, + "language_loss": 0.72426653, + "learning_rate": 4.58833038607942e-12, + "loss": 0.74520063, + "num_input_tokens_seen": 358659630, + "step": 16621, + "time_per_iteration": 4.831484794616699 + }, + { + "auxiliary_loss_clip": 0.01000795, + "auxiliary_loss_mlp": 0.00999466, + "balance_loss_clip": 1.00817204, + "balance_loss_mlp": 0.99854225, + "epoch": 0.9993687058469863, + "flos": 71284478780160.0, + "grad_norm": 0.7355150485503724, + "language_loss": 0.56584859, + "learning_rate": 3.79200883515729e-12, + "loss": 0.58585119, + "num_input_tokens_seen": 358727840, + "step": 16622, + "time_per_iteration": 3.3878767490386963 + }, + { + "auxiliary_loss_clip": 0.0106847, + "auxiliary_loss_mlp": 0.01031465, + "balance_loss_clip": 1.0328269, + "balance_loss_mlp": 1.01847744, + "epoch": 0.9994288290996542, + "flos": 12199573566720.0, + "grad_norm": 2.3925047005917244, + "language_loss": 0.71642292, + "learning_rate": 3.071527340914315e-12, + "loss": 0.73742235, + "num_input_tokens_seen": 358744125, + "step": 16623, + "time_per_iteration": 2.725473642349243 + }, + { + "auxiliary_loss_clip": 0.01064784, + "auxiliary_loss_mlp": 0.0103218, + "balance_loss_clip": 1.0360446, + "balance_loss_mlp": 1.01946068, + "epoch": 0.9994889523523223, + "flos": 17889942153600.0, + "grad_norm": 1.8384385141171624, + "language_loss": 0.7497015, + "learning_rate": 2.4268859304399368e-12, + "loss": 0.77067113, + "num_input_tokens_seen": 358761420, + "step": 16624, + "time_per_iteration": 4.1755170822143555 + }, + { + "auxiliary_loss_clip": 0.010734, + "auxiliary_loss_mlp": 0.01030652, + "balance_loss_clip": 1.03599191, + "balance_loss_mlp": 1.01818919, + "epoch": 0.9995490756049902, + "flos": 26578888064640.0, + "grad_norm": 1.666585360491219, + "language_loss": 0.73861277, + "learning_rate": 1.8580846286031514e-12, + "loss": 0.75965327, + "num_input_tokens_seen": 358782600, + "step": 16625, + "time_per_iteration": 2.77114200592041 + }, + { + "auxiliary_loss_clip": 0.01094699, + "auxiliary_loss_mlp": 0.01032931, + "balance_loss_clip": 1.03575385, + "balance_loss_mlp": 1.02107549, + "epoch": 0.9996091988576582, + "flos": 22200048771840.0, + "grad_norm": 2.255734866069882, + "language_loss": 0.76902807, + "learning_rate": 1.3651234567202408e-12, + "loss": 0.7903043, + "num_input_tokens_seen": 358801220, + "step": 16626, + "time_per_iteration": 2.687950611114502 + }, + { + "auxiliary_loss_clip": 0.01107588, + "auxiliary_loss_mlp": 0.010339, + "balance_loss_clip": 1.03792691, + "balance_loss_mlp": 1.02201486, + "epoch": 0.9996693221103262, + "flos": 27373195468800.0, + "grad_norm": 1.6905180098527337, + "language_loss": 0.82313097, + "learning_rate": 9.480024334429515e-13, + "loss": 0.84454584, + "num_input_tokens_seen": 358819190, + "step": 16627, + "time_per_iteration": 2.609881639480591 + }, + { + "auxiliary_loss_clip": 0.01095764, + "auxiliary_loss_mlp": 0.01035323, + "balance_loss_clip": 1.0381639, + "balance_loss_mlp": 1.02206075, + "epoch": 0.9997294453629941, + "flos": 26870410846080.0, + "grad_norm": 1.8853432771890695, + "language_loss": 0.70178521, + "learning_rate": 6.067215747584952e-13, + "loss": 0.72309601, + "num_input_tokens_seen": 358839850, + "step": 16628, + "time_per_iteration": 2.7530713081359863 + }, + { + "auxiliary_loss_clip": 0.01097289, + "auxiliary_loss_mlp": 0.01026328, + "balance_loss_clip": 1.0342133, + "balance_loss_mlp": 1.01419258, + "epoch": 0.9997895686156621, + "flos": 23476996247040.0, + "grad_norm": 1.3278590412352376, + "language_loss": 0.75475144, + "learning_rate": 3.4128089332341456e-13, + "loss": 0.77598757, + "num_input_tokens_seen": 358859805, + "step": 16629, + "time_per_iteration": 2.7801589965820312 + }, + { + "auxiliary_loss_clip": 0.01089302, + "auxiliary_loss_mlp": 0.0103599, + "balance_loss_clip": 1.03652728, + "balance_loss_mlp": 1.02316952, + "epoch": 0.9998496918683301, + "flos": 20224961579520.0, + "grad_norm": 1.6419420095870436, + "language_loss": 0.60239536, + "learning_rate": 1.5168039935176126e-13, + "loss": 0.62364829, + "num_input_tokens_seen": 358877900, + "step": 16630, + "time_per_iteration": 2.6396772861480713 + }, + { + "auxiliary_loss_clip": 0.0106418, + "auxiliary_loss_mlp": 0.01027947, + "balance_loss_clip": 1.03312218, + "balance_loss_mlp": 1.01544785, + "epoch": 0.9999098151209981, + "flos": 21652913831040.0, + "grad_norm": 3.1563978222436964, + "language_loss": 0.6076231, + "learning_rate": 3.792010017100722e-14, + "loss": 0.62854433, + "num_input_tokens_seen": 358897285, + "step": 16631, + "time_per_iteration": 2.699958086013794 + }, + { + "auxiliary_loss_clip": 0.01046835, + "auxiliary_loss_mlp": 0.00770368, + "balance_loss_clip": 1.03351796, + "balance_loss_mlp": 1.00018144, + "epoch": 0.999969938373666, + "flos": 11544599018880.0, + "grad_norm": 13.911116352216522, + "language_loss": 0.7268914, + "learning_rate": 0.0, + "loss": 0.74506336, + "num_input_tokens_seen": 358911570, + "step": 16632, + "time_per_iteration": 2.6853044033050537 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 358911570, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3992169073237033e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +}